#ifndef BMSSE_UTIL__H__INCLUDED__
#define BMSSE_UTIL__H__INCLUDED__

// SSE2 utility functions operating on 16-byte aligned 128-bit blocks.
//
// Note: this header assumes BMFORCEINLINE, BMRESTRICT and bm::word_t
// are defined by the including headers (bmdef.h / bmconst.h in BitMagic).

#include <mmintrin.h>  // _mm_empty (MMX/x87 state reset)
#include <emmintrin.h> // SSE2 integer intrinsics

namespace bm
{
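
/*!
    @brief SSE empty-state guard.

    Calls _mm_empty() on construction and destruction, so that MMX
    register use inside the guarded scope cannot corrupt surrounding
    x87 floating-point code.
*/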
class sse_empty_guard
{
public:
    BMFORCEINLINE sse_empty_guard()
    {
        _mm_empty();
    }

    BMFORCEINLINE ~sse_empty_guard()
    {
        _mm_empty();
    }
};
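
/*!
    @brief XOR array elements against a scalar mask: dst = src ^ mask.
*/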
BMFORCEINLINE
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst,
                         const __m128i* BMRESTRICT src,
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
    __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
    do
    {
        __m128i xmm1 = _mm_load_si128(src);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;
    } while (src < src_end);
}
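
/*!
    @brief Invert array elements and apply a scalar mask:
    dst = (~src) & mask.
*/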
BMFORCEINLINE
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst,
                            const __m128i* BMRESTRICT src,
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
    __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
    do
    {
        __m128i xmm1 = _mm_load_si128(src);
        xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;
    } while (src < src_end);
}
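
/*!
    @brief AND blocks of 128-bit words: dst &= src.
    The loop is unrolled x4, so the word count between src and src_end
    is assumed to be a multiple of 4.
*/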
BMFORCEINLINE
void sse2_and_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
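
/*!
    @brief OR blocks of 128-bit words: dst |= src.
    The loop is unrolled x4, so the word count between src and src_end
    is assumed to be a multiple of 4.
*/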
BMFORCEINLINE
void sse2_or_arr(__m128i* BMRESTRICT dst,
                 const __m128i* BMRESTRICT src,
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
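
/*!
    @brief XOR blocks of 128-bit words: dst ^= src.
    The loop is unrolled x4, so the word count between src and src_end
    is assumed to be a multiple of 4.
*/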
BMFORCEINLINE
void sse2_xor_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
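
/*!
    @brief SUB (AND-NOT) blocks of 128-bit words: dst &= ~src.
    The loop is unrolled x4, so the word count between src and src_end
    is assumed to be a multiple of 4.
*/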
BMFORCEINLINE
void sse2_sub_arr(__m128i* BMRESTRICT dst,
                  const __m128i* BMRESTRICT src,
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2); // (~src) & dst
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}
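
/*!
    @brief Fill a block with a 32-bit value: every word of dst = value.
*/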
BMFORCEINLINE
void sse2_set_block(__m128i* BMRESTRICT dst,
                    __m128i* BMRESTRICT dst_end,
                    bm::word_t value)
{
    __m128i xmm0 = _mm_set_epi32(value, value, value, value);
    do
    {
        _mm_store_si128(dst, xmm0);
    } while (++dst < dst_end);

    _mm_sfence();
}
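
/*!
    @brief Copy a block of 128-bit words: dst = src.
    The loop is unrolled x8, so the word count between src and src_end
    is assumed to be a multiple of 8.
*/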
BMFORCEINLINE
void sse2_copy_block(__m128i* BMRESTRICT dst,
                     const __m128i* BMRESTRICT src,
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512, _MM_HINT_NTA);

        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);

        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);

        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);

        src += 8;
        dst += 8;

    } while (src < src_end);
}
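
/*!
    @brief Invert bits in an array of 32-bit words (the range must be
    16-byte aligned and a multiple of 4 words).
*/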
BMFORCEINLINE
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF,
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first; // expects 16-byte aligned input

    do
    {
        _mm_prefetch((const char*)(wrd_ptr)+512, _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1);
        _mm_store_si128(wrd_ptr, xmm0);
        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}
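
/*!
    @brief Element-wise 128-bit logical operations used by the block
    algorithms; sse2_sub computes the set difference a & ~b.
*/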
BMFORCEINLINE
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); // a & ~b (set difference)
}

} // namespace bm

#endif // BMSSE_UTIL__H__INCLUDED__