galois_amd64.s 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. //+build !noasm !appengine !gccgo
  2. // Copyright 2015, Klaus Post, see LICENSE for details.
  3. // Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
  4. // and http://jerasure.org/jerasure/gf-complete/tree/master
  // func galMulSSSE3Xor(low, high, in, out []byte)
  //
  // For every full 16-byte block i of the input:
  //   out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
  // low and high are each read as one 16-byte lookup table (only their first
  // 16 bytes are loaded). The file header links this table scheme to SIMD
  // Galois-field multiplication.
  //
  // Only min(len(in), len(out)) / 16 whole blocks are processed; trailing
  // bytes are left untouched and nothing beyond len(out) is written.
  // Clobbers AX, BX, DX, SI, R9, R10, X0-X8 and flags.
  TEXT ·galMulSSSE3Xor(SB), 7, $0-96
      MOVQ    low+0(FP), SI        // SI: &low[0]
      MOVQ    high+24(FP), DX      // DX: &high[0]
      MOVOU   (SI), X6             // X6: low-nibble table
      MOVOU   (DX), X7             // X7: high-nibble table
      MOVQ    $15, BX              // BX: nibble mask 0x0f
      MOVQ    BX, X8
      PXOR    X5, X5               // X5: all-zero shuffle control
      PSHUFB  X5, X8               // X8: byte 0 (0x0f) replicated to all 16 bytes
      MOVQ    in+48(FP), SI        // SI: &in[0]
      MOVQ    in_len+56(FP), R9    // R9: len(in)
      MOVQ    out+72(FP), DX       // DX: &out[0]
      MOVQ    out_len+80(FP), R10  // R10: len(out)
      CMPQ    R10, R9
      CMOVQCS R10, R9              // R9 = min(len(in), len(out)), unsigned
      SHRQ    $4, R9               // R9: number of whole 16-byte blocks
      TESTQ   R9, R9
      JZ      done_xor             // nothing to do
      MOVQ    SI, AX
      ORQ     DX, AX
      TESTQ   $15, AX              // are both pointers 16-byte aligned?
      JNZ     loopback_xor         // no: take the unaligned loop

  loopback_xor_aligned:
      MOVOA   (SI), X0             // X0: in block
      MOVOA   (DX), X4             // X4: existing out block
      MOVOA   X0, X1               // X1: copy of in block
      MOVOA   X6, X2               // X2: low table copy (PSHUFB overwrites its destination)
      MOVOA   X7, X3               // X3: high table copy
      PSRLQ   $4, X1               // shift high nibbles down (bits crossing bytes are masked below)
      PAND    X8, X0               // X0: low nibbles of in
      PAND    X8, X1               // X1: high nibbles of in
      PSHUFB  X0, X2               // X2: low[in & 15]
      PSHUFB  X1, X3               // X3: high[in >> 4]
      PXOR    X2, X3               // X3: table result
      PXOR    X4, X3               // X3: table result xor existing out
      MOVOA   X3, (DX)             // store
      ADDQ    $16, SI              // in  += 16
      ADDQ    $16, DX              // out += 16
      SUBQ    $1, R9
      JNZ     loopback_xor_aligned
      JMP     done_xor

  loopback_xor:
      MOVOU   (SI), X0             // X0: in block (unaligned load)
      MOVOU   (DX), X4             // X4: existing out block (unaligned load)
      MOVOU   X0, X1               // X1: copy of in block
      MOVOU   X6, X2               // X2: low table copy
      MOVOU   X7, X3               // X3: high table copy
      PSRLQ   $4, X1               // shift high nibbles down
      PAND    X8, X0               // X0: low nibbles of in
      PAND    X8, X1               // X1: high nibbles of in
      PSHUFB  X0, X2               // X2: low[in & 15]
      PSHUFB  X1, X3               // X3: high[in >> 4]
      PXOR    X2, X3               // X3: table result
      PXOR    X4, X3               // X3: table result xor existing out
      MOVOU   X3, (DX)             // store (unaligned)
      ADDQ    $16, SI              // in  += 16
      ADDQ    $16, DX              // out += 16
      SUBQ    $1, R9
      JNZ     loopback_xor

  done_xor:
      RET
  // func galMulSSSE3(low, high, in, out []byte)
  //
  // For every full 16-byte block i of the input:
  //   out[i] = low[in[i] & 15] ^ high[in[i] >> 4]
  // low and high are each read as one 16-byte lookup table (only their first
  // 16 bytes are loaded). Unlike the Xor variant, the previous contents of
  // out are overwritten, not combined.
  //
  // Only min(len(in), len(out)) / 16 whole blocks are processed; trailing
  // bytes are left untouched and nothing beyond len(out) is written.
  // Clobbers AX, BX, DX, SI, R9, R10, X0-X3, X5-X8 and flags.
  TEXT ·galMulSSSE3(SB), 7, $0-96
      MOVQ    low+0(FP), SI        // SI: &low[0]
      MOVQ    high+24(FP), DX      // DX: &high[0]
      MOVOU   (SI), X6             // X6: low-nibble table
      MOVOU   (DX), X7             // X7: high-nibble table
      MOVQ    $15, BX              // BX: nibble mask 0x0f
      MOVQ    BX, X8
      PXOR    X5, X5               // X5: all-zero shuffle control
      PSHUFB  X5, X8               // X8: byte 0 (0x0f) replicated to all 16 bytes
      MOVQ    in+48(FP), SI        // SI: &in[0]
      MOVQ    in_len+56(FP), R9    // R9: len(in)
      MOVQ    out+72(FP), DX       // DX: &out[0]
      MOVQ    out_len+80(FP), R10  // R10: len(out)
      CMPQ    R10, R9
      CMOVQCS R10, R9              // R9 = min(len(in), len(out)), unsigned
      SHRQ    $4, R9               // R9: number of whole 16-byte blocks
      TESTQ   R9, R9
      JZ      done                 // nothing to do
      MOVQ    SI, AX
      ORQ     DX, AX
      TESTQ   $15, AX              // are both pointers 16-byte aligned?
      JNZ     loopback             // no: take the unaligned loop

  loopback_aligned:
      MOVOA   (SI), X0             // X0: in block
      MOVOA   X0, X1               // X1: copy of in block
      MOVOA   X6, X2               // X2: low table copy (PSHUFB overwrites its destination)
      MOVOA   X7, X3               // X3: high table copy
      PSRLQ   $4, X1               // shift high nibbles down (bits crossing bytes are masked below)
      PAND    X8, X0               // X0: low nibbles of in
      PAND    X8, X1               // X1: high nibbles of in
      PSHUFB  X0, X2               // X2: low[in & 15]
      PSHUFB  X1, X3               // X3: high[in >> 4]
      PXOR    X2, X3               // X3: result
      MOVOA   X3, (DX)             // store
      ADDQ    $16, SI              // in  += 16
      ADDQ    $16, DX              // out += 16
      SUBQ    $1, R9
      JNZ     loopback_aligned
      JMP     done

  loopback:
      MOVOU   (SI), X0             // X0: in block (unaligned load)
      MOVOU   X0, X1               // X1: copy of in block
      MOVOA   X6, X2               // X2: low table copy
      MOVOA   X7, X3               // X3: high table copy
      PSRLQ   $4, X1               // shift high nibbles down
      PAND    X8, X0               // X0: low nibbles of in
      PAND    X8, X1               // X1: high nibbles of in
      PSHUFB  X0, X2               // X2: low[in & 15]
      PSHUFB  X1, X3               // X3: high[in >> 4]
      PXOR    X2, X3               // X3: result
      MOVOU   X3, (DX)             // store (unaligned)
      ADDQ    $16, SI              // in  += 16
      ADDQ    $16, DX              // out += 16
      SUBQ    $1, R9
      JNZ     loopback

  done:
      RET
  // func galMulAVX2Xor(low, high, in, out []byte)
  //
  // For every full 32-byte block i of the input:
  //   out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
  // low and high are each read as one 16-byte lookup table, which is
  // duplicated into both 128-bit halves of a YMM register.
  //
  // Only min(len(in), len(out)) / 32 whole blocks are processed; trailing
  // bytes are left untouched and nothing beyond len(out) is written.
  // Clobbers BX, DX, SI, R9, R10, X5, Y0-Y4, Y6-Y8 and flags.
  TEXT ·galMulAVX2Xor(SB), 7, $0-96
      MOVQ         low+0(FP), SI        // SI: &low[0]
      MOVQ         high+24(FP), DX      // DX: &high[0]
      MOVQ         $15, BX              // BX: nibble mask 0x0f
      MOVQ         BX, X5
      MOVOU        (SI), X6             // X6: low-nibble table
      MOVOU        (DX), X7             // X7: high-nibble table
      MOVQ         in_len+56(FP), R9    // R9: len(in)
      MOVQ         out_len+80(FP), R10  // R10: len(out)
      CMPQ         R10, R9
      CMOVQCS      R10, R9              // R9 = min(len(in), len(out)), unsigned
      VINSERTI128  $1, X6, Y6, Y6       // Y6: low table in both 128-bit lanes
      VINSERTI128  $1, X7, Y7, Y7       // Y7: high table in both 128-bit lanes
      VPBROADCASTB X5, Y8               // Y8: 0x0f in every byte
      SHRQ         $5, R9               // R9: number of whole 32-byte blocks
      MOVQ         out+72(FP), DX       // DX: &out[0]
      MOVQ         in+48(FP), SI        // SI: &in[0]
      TESTQ        R9, R9
      JZ           done_xor_avx2        // nothing to do

  loopback_xor_avx2:
      VMOVDQU      (SI), Y0             // Y0: in block
      VMOVDQU      (DX), Y4             // Y4: existing out block
      VPSRLQ       $4, Y0, Y1           // Y1: in shifted right by 4 (masked below)
      VPAND        Y8, Y0, Y0           // Y0: low nibbles of in
      VPAND        Y8, Y1, Y1           // Y1: high nibbles of in
      VPSHUFB      Y0, Y6, Y2           // Y2: low[in & 15]
      VPSHUFB      Y1, Y7, Y3           // Y3: high[in >> 4]
      VPXOR        Y3, Y2, Y3           // Y3: table result
      VPXOR        Y4, Y3, Y4           // Y4: table result xor existing out
      VMOVDQU      Y4, (DX)             // store
      ADDQ         $32, SI              // in  += 32
      ADDQ         $32, DX              // out += 32
      SUBQ         $1, R9
      JNZ          loopback_xor_avx2

  done_xor_avx2:
      VZEROUPPER                        // clear upper YMM halves before returning
      RET
  // func galMulAVX2(low, high, in, out []byte)
  //
  // For every full 32-byte block i of the input:
  //   out[i] = low[in[i] & 15] ^ high[in[i] >> 4]
  // low and high are each read as one 16-byte lookup table, which is
  // duplicated into both 128-bit halves of a YMM register. The previous
  // contents of out are overwritten.
  //
  // Only min(len(in), len(out)) / 32 whole blocks are processed; trailing
  // bytes are left untouched and nothing beyond len(out) is written.
  // Clobbers BX, DX, SI, R9, R10, X5, Y0-Y4, Y6-Y8 and flags.
  TEXT ·galMulAVX2(SB), 7, $0-96
      MOVQ         low+0(FP), SI        // SI: &low[0]
      MOVQ         high+24(FP), DX      // DX: &high[0]
      MOVQ         $15, BX              // BX: nibble mask 0x0f
      MOVQ         BX, X5
      MOVOU        (SI), X6             // X6: low-nibble table
      MOVOU        (DX), X7             // X7: high-nibble table
      MOVQ         in_len+56(FP), R9    // R9: len(in)
      MOVQ         out_len+80(FP), R10  // R10: len(out)
      CMPQ         R10, R9
      CMOVQCS      R10, R9              // R9 = min(len(in), len(out)), unsigned
      VINSERTI128  $1, X6, Y6, Y6       // Y6: low table in both 128-bit lanes
      VINSERTI128  $1, X7, Y7, Y7       // Y7: high table in both 128-bit lanes
      VPBROADCASTB X5, Y8               // Y8: 0x0f in every byte
      SHRQ         $5, R9               // R9: number of whole 32-byte blocks
      MOVQ         out+72(FP), DX       // DX: &out[0]
      MOVQ         in+48(FP), SI        // SI: &in[0]
      TESTQ        R9, R9
      JZ           done_avx2            // nothing to do

  loopback_avx2:
      VMOVDQU      (SI), Y0             // Y0: in block
      VPSRLQ       $4, Y0, Y1           // Y1: in shifted right by 4 (masked below)
      VPAND        Y8, Y0, Y0           // Y0: low nibbles of in
      VPAND        Y8, Y1, Y1           // Y1: high nibbles of in
      VPSHUFB      Y0, Y6, Y2           // Y2: low[in & 15]
      VPSHUFB      Y1, Y7, Y3           // Y3: high[in >> 4]
      VPXOR        Y3, Y2, Y4           // Y4: result
      VMOVDQU      Y4, (DX)             // store
      ADDQ         $32, SI              // in  += 32
      ADDQ         $32, DX              // out += 32
      SUBQ         $1, R9
      JNZ          loopback_avx2

  done_avx2:
      VZEROUPPER                        // clear upper YMM halves before returning
      RET
  // func sSE2XorSlice(in, out []byte)
  //
  // For every full 16-byte block i of the input: out[i] ^= in[i].
  //
  // Only min(len(in), len(out)) / 16 whole blocks are processed; trailing
  // bytes are left untouched and nothing beyond len(out) is written.
  // All loads and stores are unaligned (MOVOU), so no alignment is required.
  // Clobbers DX, SI, R9, R10, X0, X1 and flags.
  TEXT ·sSE2XorSlice(SB), 7, $0-48
      MOVQ    in+0(FP), SI         // SI: &in[0]
      MOVQ    in_len+8(FP), R9     // R9: len(in)
      MOVQ    out+24(FP), DX       // DX: &out[0]
      MOVQ    out_len+32(FP), R10  // R10: len(out)
      CMPQ    R10, R9
      CMOVQCS R10, R9              // R9 = min(len(in), len(out)), unsigned
      SHRQ    $4, R9               // R9: number of whole 16-byte blocks
      TESTQ   R9, R9
      JZ      done_xor_sse2        // nothing to do

  loopback_xor_sse2:
      MOVOU   (SI), X0             // X0: in block
      MOVOU   (DX), X1             // X1: existing out block
      PXOR    X0, X1               // X1: out xor in
      MOVOU   X1, (DX)             // store
      ADDQ    $16, SI              // in  += 16
      ADDQ    $16, DX              // out += 16
      SUBQ    $1, R9
      JNZ     loopback_xor_sse2

  done_xor_sse2:
      RET