galoisAvx512_amd64.s 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. //+build !noasm !appengine !gccgo
  2. // Copyright 2015, Klaus Post, see LICENSE for details.
  3. // Copyright 2019, Minio, Inc.
  4. //
  5. // Process 2 output rows in parallel from a total of 8 input rows
  6. //
  7. // func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
  //
  // Walks the rows in 64-byte blocks. For each block, every input row is split
  // into low and high nibbles, each nibble selects a byte from a 16-entry table
  // (VPSHUFB), the two lookups are XORed into a product, and the product is
  // XORed into the accumulator of each of the two output rows. Presumably this
  // is a GF(2^8) multiply-accumulate, going by the file and routine names.
  //
  // Arguments (offsets as used below):
  //   in     +0(FP)   slice of input rows; len(in) at +8(FP) is the row count
  //   out    +24(FP)  slice of output rows; out[0] and out[1] are written
  //   matrix +48(FP)  pointer to the lookup tables; 8 x 64 bytes are read
  //   addTo  +56(FP)  true: XOR into the existing output; false: overwrite it
  //
  // Table layout as consumed here: ZMM16..ZMM19 serve out[0], ZMM20..ZMM23 serve
  // out[1]. Within one register, 128-bit lanes 0/1 are the low/high-nibble
  // tables for an even-numbered input and lanes 2/3 those for the next input.
  //
  // Only len(in[0]) is consulted, and it is rounded down to a multiple of 64;
  // the remaining rows of in and both rows of out are accessed over that same
  // range without any check, and trailing bytes are left untouched.
  //
  // Row count handling: after input k the loop leaves if len(in) == k, for
  // k = 1..7. Any other count (0 or more than 8 included) falls through and
  // reads eight rows -- presumably callers always pass 1..8 rows.
  //
  // The AVX-512 instructions are emitted as raw bytes; the mnemonic in each
  // trailing comment is the intended decoding (Intel operand order).
  //
  // Registers written: AX, BX, CX, DX, SI, R8, R9, R11, K1, X5 and
  // ZMM0-ZMM2, ZMM4, ZMM5, ZMM12-ZMM23.
  //
  // NOTE(review): flag value 7 is presumably NOPROF|DUPOK|NOSPLIT from
  // textflag.h; $0 declares no local stack frame -- confirm.
  8. TEXT ·_galMulAVX512Parallel82(SB), 7, $0
  9. MOVQ in+0(FP), SI // SI: &in[0] (base of the row slice headers)
  10. MOVQ 8(SI), R9 // R9: len(in[0]), the row length in bytes
  11. SHRQ $6, R9 // R9: number of whole 64-byte blocks per row
  12. TESTQ R9, R9
  13. JZ done_avx512_parallel82 // rows shorter than 64 bytes: nothing to do
  14. MOVQ matrix+48(FP), SI // SI: lookup tables (SI is reloaded with &in[0] below)
  15. LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
  16. LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
  17. LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
  18. LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
  19. LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
  20. LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
  21. LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
  22. LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
  // ZMM2 = 0x0f in every byte: the nibble mask used by every VPANDQ below.
  23. MOVQ $15, BX
  24. MOVQ BX, X5
  25. LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
  // Turn addTo into the load mask K1 by negating it (multiply by -1).
  // NOTE(review): MOVB looks like it only sets the low byte of AX; the
  // eight-qword masked loads below should depend only on k1[7:0], which is
  // 0x00 or 0xff regardless of the upper bits of AX -- confirm.
  26. MOVB addTo+56(FP), AX // AL: addTo (0 or 1)
  27. LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
  28. WORD $0xf749; BYTE $0xe0 // mul r8 ; rax = -rax (low 64 bits), rdx is overwritten too
  29. LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax ; k1 gates the initial output loads
  30. MOVQ in+0(FP), SI // SI: &in[0] (row slice headers, 24 bytes apart)
  31. MOVQ in_len+8(FP), AX // AX: len(in), the number of input rows
  32. XORQ R11, R11 // R11: byte offset of the current block, shared by all input rows
  33. MOVQ out+24(FP), DX // DX: &out[0] (slice headers); set after mul, which overwrote DX
  34. MOVQ 24(DX), CX // CX: &out[1][0]
  35. MOVQ (DX), DX // DX: &out[0][0]
  36. loopback_avx512_parallel82: // one iteration per 64-byte block
  // Accumulators: ZMM4 for out[0], ZMM5 for out[1]. With zero-masking they
  // start as the current output block when addTo is set and as zero otherwise.
  37. LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
  38. LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
  // in[0]: tables in lanes 0/1 of ZMM16 (out[0]) and ZMM20 (out[1]).
  // VSHUFI64x2 with 0x00/0x55/0xaa/0xff replicates lane 0/1/2/3 across the
  // register so the per-lane VPSHUFB sees the same 16-byte table in each lane.
  39. MOVQ (SI), BX // BX: &in[0][0]
  40. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  41. LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
  42. LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
  43. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  44. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  45. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  46. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  47. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  48. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  49. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  50. LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
  51. LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
  52. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  53. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  54. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  55. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  56. CMPQ AX, $1 // only one input row?
  57. JE skip_avx512_parallel82
  // in[1]: lanes 2/3 of ZMM16 (out[0]) and ZMM20 (out[1]).
  58. MOVQ 24(SI), BX // BX: &in[1][0]
  59. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  60. LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
  61. LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
  62. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  63. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  64. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  65. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  66. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  67. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  68. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  69. LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
  70. LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
  71. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  72. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  73. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  74. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  75. CMPQ AX, $2 // only two input rows?
  76. JE skip_avx512_parallel82
  // in[2]: lanes 0/1 of ZMM17 (out[0]) and ZMM21 (out[1]).
  77. MOVQ 48(SI), BX // BX: &in[2][0]
  78. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  79. LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
  80. LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
  81. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  82. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  83. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  84. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  85. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  86. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  87. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  88. LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
  89. LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
  90. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  91. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  92. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  93. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  94. CMPQ AX, $3 // only three input rows?
  95. JE skip_avx512_parallel82
  // in[3]: lanes 2/3 of ZMM17 (out[0]) and ZMM21 (out[1]).
  96. MOVQ 72(SI), BX // BX: &in[3][0]
  97. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  98. LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
  99. LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
  100. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  101. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  102. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  103. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  104. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  105. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  106. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  107. LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
  108. LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
  109. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  110. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  111. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  112. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  113. CMPQ AX, $4 // only four input rows?
  114. JE skip_avx512_parallel82
  // in[4]: lanes 0/1 of ZMM18 (out[0]) and ZMM22 (out[1]).
  115. MOVQ 96(SI), BX // BX: &in[4][0]
  116. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  117. LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
  118. LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
  119. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  120. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  121. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  122. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  123. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  124. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  125. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  126. LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
  127. LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
  128. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  129. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  130. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  131. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  132. CMPQ AX, $5 // only five input rows?
  133. JE skip_avx512_parallel82
  // in[5]: lanes 2/3 of ZMM18 (out[0]) and ZMM22 (out[1]).
  134. MOVQ 120(SI), BX // BX: &in[5][0]
  135. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  136. LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
  137. LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
  138. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  139. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  140. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  141. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  142. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  143. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  144. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  145. LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
  146. LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
  147. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  148. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  149. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  150. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  151. CMPQ AX, $6 // only six input rows?
  152. JE skip_avx512_parallel82
  // in[6]: lanes 0/1 of ZMM19 (out[0]) and ZMM23 (out[1]).
  153. MOVQ 144(SI), BX // BX: &in[6][0]
  154. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  155. LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
  156. LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
  157. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  158. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  159. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  160. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  161. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  162. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  163. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  164. LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
  165. LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
  166. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  167. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  168. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  169. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  170. CMPQ AX, $7 // only seven input rows?
  171. JE skip_avx512_parallel82
  // in[7]: lanes 2/3 of ZMM19 (out[0]) and ZMM23 (out[1]).
  172. MOVQ 168(SI), BX // BX: &in[7][0]
  173. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  174. LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
  175. LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
  176. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  177. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  178. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  179. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  180. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  181. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  182. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  183. LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
  184. LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
  185. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  186. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  187. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  188. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  189. skip_avx512_parallel82: // all processed inputs are folded in; write back this block
  190. LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
  191. LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
  192. ADDQ $64, R11 // advance the shared input offset to the next block
  193. ADDQ $64, DX // out[0] pointer += 64
  194. ADDQ $64, CX // out[1] pointer += 64
  195. SUBQ $1, R9 // one block fewer to go
  196. JNZ loopback_avx512_parallel82
  197. done_avx512_parallel82:
  198. VZEROUPPER // clear the upper vector state before returning
  199. RET
  200. //
  201. // Process 4 output rows in parallel from a total of 8 input rows
  202. //
  203. // func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
  204. TEXT ·_galMulAVX512Parallel84(SB), 7, $0
  205. MOVQ in+0(FP), SI //
  206. MOVQ 8(SI), R9 // R9: len(in)
  207. SHRQ $6, R9 // len(in) / 64
  208. TESTQ R9, R9
  209. JZ done_avx512_parallel84
  210. MOVQ matrix+48(FP), SI
  211. LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
  212. LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
  213. LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
  214. LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
  215. LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
  216. LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
  217. LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
  218. LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
  219. LONG $0x48fe6162; WORD $0x466f; BYTE $0x08 // VMOVDQU64 ZMM24, 0x200[rsi]
  220. LONG $0x48fe6162; WORD $0x4e6f; BYTE $0x09 // VMOVDQU64 ZMM25, 0x240[rsi]
  221. LONG $0x48fe6162; WORD $0x566f; BYTE $0x0a // VMOVDQU64 ZMM26, 0x280[rsi]
  222. LONG $0x48fe6162; WORD $0x5e6f; BYTE $0x0b // VMOVDQU64 ZMM27, 0x2c0[rsi]
  223. LONG $0x48fe6162; WORD $0x666f; BYTE $0x0c // VMOVDQU64 ZMM28, 0x300[rsi]
  224. LONG $0x48fe6162; WORD $0x6e6f; BYTE $0x0d // VMOVDQU64 ZMM29, 0x340[rsi]
  225. LONG $0x48fe6162; WORD $0x766f; BYTE $0x0e // VMOVDQU64 ZMM30, 0x380[rsi]
  226. LONG $0x48fe6162; WORD $0x7e6f; BYTE $0x0f // VMOVDQU64 ZMM31, 0x3c0[rsi]
  227. MOVQ $15, BX
  228. MOVQ BX, X5
  229. LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
  230. MOVB addTo+56(FP), AX
  231. LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
  232. WORD $0xf749; BYTE $0xe0 // mul r8
  233. LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax
  234. MOVQ in+0(FP), SI // SI: &in
  235. MOVQ in_len+8(FP), AX // number of inputs
  236. XORQ R11, R11
  237. MOVQ out+24(FP), DX
  238. MOVQ 24(DX), CX // CX: &out[1][0]
  239. MOVQ 48(DX), R10 // R10: &out[2][0]
  240. MOVQ 72(DX), R12 // R12: &out[3][0]
  241. MOVQ (DX), DX // DX: &out[0][0]
  242. loopback_avx512_parallel84:
  243. LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
  244. LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
  245. LONG $0xc9fed162; WORD $0x326f // VMOVDQU64 ZMM6{k1}{z}, [r10]
  246. LONG $0xc9fed162; WORD $0x3c6f; BYTE $0x24 // VMOVDQU64 ZMM7{k1}{z}, [r12]
  247. MOVQ (SI), BX // BX: &in[0][0]
  248. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  249. LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
  250. LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
  251. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  252. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  253. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  254. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  255. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  256. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  257. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  258. LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
  259. LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
  260. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  261. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  262. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  263. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  264. LONG $0x40bd1362; WORD $0xd043; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0x00
  265. LONG $0x40bd1362; WORD $0xd843; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0x55
  266. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  267. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  268. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  269. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  270. LONG $0x409d1362; WORD $0xc443; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0x00
  271. LONG $0x409d1362; WORD $0xcc43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0x55
  272. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  273. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  274. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  275. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  276. CMPQ AX, $1
  277. JE skip_avx512_parallel84
  278. MOVQ 24(SI), BX // BX: &in[1][0]
  279. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  280. LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
  281. LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
  282. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  283. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  284. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  285. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  286. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  287. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  288. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  289. LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
  290. LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
  291. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  292. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  293. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  294. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  295. LONG $0x40bd1362; WORD $0xd043; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0xaa
  296. LONG $0x40bd1362; WORD $0xd843; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0xff
  297. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  298. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  299. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  300. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  301. LONG $0x409d1362; WORD $0xc443; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0xaa
  302. LONG $0x409d1362; WORD $0xcc43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0xff
  303. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  304. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  305. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  306. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  307. CMPQ AX, $2
  308. JE skip_avx512_parallel84
  309. MOVQ 48(SI), BX // BX: &in[2][0]
  310. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  311. LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
  312. LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
  313. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  314. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  315. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  316. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  317. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  318. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  319. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  320. LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
  321. LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
  322. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  323. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  324. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  325. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  326. LONG $0x40b51362; WORD $0xd143; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0x00
  327. LONG $0x40b51362; WORD $0xd943; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0x55
  328. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  329. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  330. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  331. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  332. LONG $0x40951362; WORD $0xc543; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0x00
  333. LONG $0x40951362; WORD $0xcd43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0x55
  334. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  335. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  336. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  337. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  338. CMPQ AX, $3
  339. JE skip_avx512_parallel84
  340. MOVQ 72(SI), BX // BX: &in[3][0]
  341. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  342. LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
  343. LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
  344. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  345. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  346. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  347. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  348. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  349. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  350. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  351. LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
  352. LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
  353. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  354. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  355. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  356. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  357. LONG $0x40b51362; WORD $0xd143; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0xaa
  358. LONG $0x40b51362; WORD $0xd943; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0xff
  359. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  360. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  361. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  362. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  363. LONG $0x40951362; WORD $0xc543; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0xaa
  364. LONG $0x40951362; WORD $0xcd43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0xff
  365. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  366. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  367. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  368. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  369. CMPQ AX, $4
  370. JE skip_avx512_parallel84
  371. MOVQ 96(SI), BX // BX: &in[4][0]
  372. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  373. LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
  374. LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
  375. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  376. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  377. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  378. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  379. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  380. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  381. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  382. LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
  383. LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
  384. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  385. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  386. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  387. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  388. LONG $0x40ad1362; WORD $0xd243; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0x00
  389. LONG $0x40ad1362; WORD $0xda43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0x55
  390. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  391. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  392. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  393. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  394. LONG $0x408d1362; WORD $0xc643; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0x00
  395. LONG $0x408d1362; WORD $0xce43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0x55
  396. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  397. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  398. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  399. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  400. CMPQ AX, $5
  401. JE skip_avx512_parallel84
  402. MOVQ 120(SI), BX // BX: &in[5][0]
  403. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  404. LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
  405. LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
  406. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  407. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  408. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  409. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  410. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  411. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  412. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  413. LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
  414. LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
  415. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  416. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  417. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  418. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  419. LONG $0x40ad1362; WORD $0xd243; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0xaa
  420. LONG $0x40ad1362; WORD $0xda43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0xff
  421. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  422. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  423. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  424. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  425. LONG $0x408d1362; WORD $0xc643; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0xaa
  426. LONG $0x408d1362; WORD $0xce43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0xff
  427. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  428. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  429. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  430. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  431. CMPQ AX, $6
  432. JE skip_avx512_parallel84
  433. MOVQ 144(SI), BX // BX: &in[6][0]
  434. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  435. LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
  436. LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
  437. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  438. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  439. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  440. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  441. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  442. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  443. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  444. LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
  445. LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
  446. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  447. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  448. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  449. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  450. LONG $0x40a51362; WORD $0xd343; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0x00
  451. LONG $0x40a51362; WORD $0xdb43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0x55
  452. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  453. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  454. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  455. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  456. LONG $0x40851362; WORD $0xc743; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0x00
  457. LONG $0x40851362; WORD $0xcf43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0x55
  458. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  459. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  460. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  461. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  462. CMPQ AX, $7
  463. JE skip_avx512_parallel84
  464. MOVQ 168(SI), BX // BX: &in[7][0]
  465. LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
  466. LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
  467. LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
  468. LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
  469. LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
  470. LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
  471. LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
  472. LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
  473. LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
  474. LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
  475. LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
  476. LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
  477. LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
  478. LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
  479. LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
  480. LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
  481. LONG $0x40a51362; WORD $0xd343; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0xaa
  482. LONG $0x40a51362; WORD $0xdb43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0xff
  483. LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
  484. LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
  485. LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
  486. LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
  487. LONG $0x40851362; WORD $0xc743; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0xaa
  488. LONG $0x40851362; WORD $0xcf43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0xff
  489. LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
  490. LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
  491. LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
  492. LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
  493. skip_avx512_parallel84:
  494. LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
  495. LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
  496. LONG $0x48fed162; WORD $0x327f // VMOVDQU64 [r10], ZMM6
  497. LONG $0x48fed162; WORD $0x3c7f; BYTE $0x24 // VMOVDQU64 [r12], ZMM7
  498. ADDQ $64, R11 // in4+=64
  499. ADDQ $64, DX // out+=64
  500. ADDQ $64, CX // out2+=64
  501. ADDQ $64, R10 // out3+=64
  502. ADDQ $64, R12 // out4+=64
  503. SUBQ $1, R9
  504. JNZ loopback_avx512_parallel84
  505. done_avx512_parallel84:
  506. VZEROUPPER
  507. RET