1
0

galoisAvx512_amd64.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. //+build !noasm
  2. //+build !appengine
  3. //+build !gccgo
  4. // Copyright 2015, Klaus Post, see LICENSE for details.
  5. // Copyright 2019, Minio, Inc.
  6. package reedsolomon
  7. //go:noescape
  8. func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
  9. //go:noescape
  10. func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
  11. const (
  12. dimIn = 8 // Number of input rows processed simultaneously
  13. dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
  14. dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
  15. matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
  16. matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
  17. )
  18. // Construct block of matrix coefficients for 2 outputs rows in parallel
  19. func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
  20. offset := 0
  21. for c := inputOffset; c < inputOffset+dimIn; c++ {
  22. for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
  23. if c < len(matrixRows[iRow]) {
  24. coeff := matrixRows[iRow][c]
  25. copy(matrix[offset*32:], mulTableLow[coeff][:])
  26. copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
  27. } else {
  28. // coefficients not used for this input shard (so null out)
  29. v := matrix[offset*32 : offset*32+32]
  30. for i := range v {
  31. v[i] = 0
  32. }
  33. }
  34. offset += dimIn
  35. if offset >= dimIn*dimOut82 {
  36. offset -= dimIn*dimOut82 - 1
  37. }
  38. }
  39. }
  40. }
  41. // Construct block of matrix coefficients for 4 outputs rows in parallel
  42. func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
  43. offset := 0
  44. for c := inputOffset; c < inputOffset+dimIn; c++ {
  45. for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
  46. if c < len(matrixRows[iRow]) {
  47. coeff := matrixRows[iRow][c]
  48. copy(matrix[offset*32:], mulTableLow[coeff][:])
  49. copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
  50. } else {
  51. // coefficients not used for this input shard (so null out)
  52. v := matrix[offset*32 : offset*32+32]
  53. for i := range v {
  54. v[i] = 0
  55. }
  56. }
  57. offset += dimIn
  58. if offset >= dimIn*dimOut84 {
  59. offset -= dimIn*dimOut84 - 1
  60. }
  61. }
  62. }
  63. }
  64. // Invoke AVX512 routine for 2 output rows in parallel
  65. func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
  66. done := len(in[0])
  67. if done == 0 {
  68. return
  69. }
  70. inputEnd := inputOffset + dimIn
  71. if inputEnd > len(in) {
  72. inputEnd = len(in)
  73. }
  74. outputEnd := outputOffset + dimOut82
  75. if outputEnd > len(out) {
  76. outputEnd = len(out)
  77. }
  78. matrix82 := [matrixSize82]byte{}
  79. setupMatrix82(matrixRows, inputOffset, outputOffset, &matrix82)
  80. addTo := inputOffset != 0 // Except for the first input column, add to previous results
  81. _galMulAVX512Parallel82(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix82, addTo)
  82. done = (done >> 6) << 6
  83. if len(in[0])-done == 0 {
  84. return
  85. }
  86. for c := inputOffset; c < inputOffset+dimIn; c++ {
  87. for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
  88. if c < len(matrixRows[iRow]) {
  89. mt := mulTable[matrixRows[iRow][c]][:256]
  90. for i := done; i < len(in[0]); i++ {
  91. if c == 0 { // only set value for first input column
  92. out[iRow][i] = mt[in[c][i]]
  93. } else { // and add for all others
  94. out[iRow][i] ^= mt[in[c][i]]
  95. }
  96. }
  97. }
  98. }
  99. }
  100. }
  101. // Invoke AVX512 routine for 4 output rows in parallel
  102. func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
  103. done := len(in[0])
  104. if done == 0 {
  105. return
  106. }
  107. inputEnd := inputOffset + dimIn
  108. if inputEnd > len(in) {
  109. inputEnd = len(in)
  110. }
  111. outputEnd := outputOffset + dimOut84
  112. if outputEnd > len(out) {
  113. outputEnd = len(out)
  114. }
  115. matrix84 := [matrixSize84]byte{}
  116. setupMatrix84(matrixRows, inputOffset, outputOffset, &matrix84)
  117. addTo := inputOffset != 0 // Except for the first input column, add to previous results
  118. _galMulAVX512Parallel84(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix84, addTo)
  119. done = (done >> 6) << 6
  120. if len(in[0])-done == 0 {
  121. return
  122. }
  123. for c := inputOffset; c < inputOffset+dimIn; c++ {
  124. for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
  125. if c < len(matrixRows[iRow]) {
  126. mt := mulTable[matrixRows[iRow][c]][:256]
  127. for i := done; i < len(in[0]); i++ {
  128. if c == 0 { // only set value for first input column
  129. out[iRow][i] = mt[in[c][i]]
  130. } else { // and add for all others
  131. out[iRow][i] ^= mt[in[c][i]]
  132. }
  133. }
  134. }
  135. }
  136. }
  137. }
  138. // Perform the same as codeSomeShards, but taking advantage of
  139. // AVX512 parallelism for up to 4x faster execution as compared to AVX2
  140. func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
  141. outputRow := 0
  142. // First process (multiple) batches of 4 output rows in parallel
  143. for ; outputRow+dimOut84 <= len(outputs); outputRow += dimOut84 {
  144. for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
  145. galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow)
  146. }
  147. }
  148. // Then process a (single) batch of 2 output rows in parallel
  149. if outputRow+dimOut82 <= len(outputs) {
  150. // fmt.Println(outputRow, len(outputs))
  151. for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
  152. galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow)
  153. }
  154. outputRow += dimOut82
  155. }
  156. // Lastly, we may have a single output row left (for uneven parity)
  157. if outputRow < len(outputs) {
  158. for c := 0; c < r.DataShards; c++ {
  159. if c == 0 {
  160. galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
  161. } else {
  162. galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
  163. }
  164. }
  165. }
  166. }