#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__

/*!
    @file bmsse2.h
    @brief SSE2 implementations of bit-block operations for the bm
    (BitMagic) library: bit counting and combined count-with-operation
    kernels.
*/

#include <mmintrin.h>
#include <emmintrin.h>

#include "bmdef.h"
#include "bmsse_util.h"


namespace bm
{

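/*!
    @brief SSE2 bit count (population count) of the 128-bit aligned
    block [block, block_end).

    Uses the classic parallel (SWAR) popcount on four 32-bit lanes at a
    time, accumulates per lane, and sums the four lanes at the end.
    @ingroup SSE2
*/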
inline
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR popcount masks, replicated into all four 32-bit lanes
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // count accumulator = 0

    __m128i tmp1, tmp2;
    do
    {
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + ((b >> 1) & 0x55555555)
        tmp1 = _mm_srli_epi32(b, 1);
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);
        b = _mm_add_epi32(tmp1, tmp2);

        // b = (b & 0x33333333) + ((b >> 2) & 0x33333333)
        tmp1 = _mm_srli_epi32(b, 2);
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);
        b = _mm_add_epi32(tmp1, tmp2);

        // b = (b + (b >> 4)) & 0x0F0F0F0F
        tmp1 = _mm_srli_epi32(b, 4);
        b = _mm_add_epi32(b, tmp1);
        b = _mm_and_si128(b, m3);

        // b = b + (b >> 8)
        tmp1 = _mm_srli_epi32 (b, 8);
        b = _mm_add_epi32(b, tmp1);

        // b = (b + (b >> 16)) & 0x0000003F
        tmp1 = _mm_srli_epi32 (b, 16);
        b = _mm_add_epi32(b, tmp1);
        b = _mm_and_si128(b, m4);

        mcnt = _mm_add_epi32(mcnt, b); // mcnt += b

    } while (block < block_end);

    // horizontal sum of the four 32-bit lane counters
    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
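
/*
    A minimal usage sketch (not part of the original header; the buffer and
    its size of 2048 words are hypothetical -- any 16-byte aligned array
    holding a multiple of four 32-bit words works):

        BM_ALIGN16 bm::word_t buf[2048] BM_ALIGN16ATTR = {0};
        buf[0] = 0xFF;   // 8 bits set
        bm::id_t cnt = bm::sse2_bit_count((__m128i*)buf,
                                          (__m128i*)(buf + 2048));
        // cnt == 8
*/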
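/*!
    @brief SSE2 bit count of the result of a binary operation.

    Applies sse2_func(block[i], mask_block[i]) to each pair of 128-bit
    words and popcounts the result on the fly, without materializing the
    operation's output.
    @ingroup SSE2
*/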
template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block,
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR popcount masks, replicated into all four 32-bit lanes
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // count accumulator = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);

        b = sse2_func(b, tmp1); // apply the logical operation first

        // b = (b & 0x55555555) + ((b >> 1) & 0x55555555)
        tmp1 = _mm_srli_epi32(b, 1);
        tmp1 = _mm_and_si128(tmp1, m1);
        tmp2 = _mm_and_si128(b, m1);
        b = _mm_add_epi32(tmp1, tmp2);

        // b = (b & 0x33333333) + ((b >> 2) & 0x33333333)
        tmp1 = _mm_srli_epi32(b, 2);
        tmp1 = _mm_and_si128(tmp1, m2);
        tmp2 = _mm_and_si128(b, m2);
        b = _mm_add_epi32(tmp1, tmp2);

        // b = (b + (b >> 4)) & 0x0F0F0F0F
        tmp1 = _mm_srli_epi32(b, 4);
        b = _mm_add_epi32(b, tmp1);
        b = _mm_and_si128(b, m3);

        // b = b + (b >> 8)
        tmp1 = _mm_srli_epi32 (b, 8);
        b = _mm_add_epi32(b, tmp1);

        // b = (b + (b >> 16)) & 0x0000003F
        tmp1 = _mm_srli_epi32 (b, 16);
        b = _mm_add_epi32(b, tmp1);
        b = _mm_and_si128(b, m4);

        mcnt = _mm_add_epi32(mcnt, b); // mcnt += b

    } while (block < block_end);

    // horizontal sum of the four 32-bit lane counters
    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
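
/*
    A minimal usage sketch (assumes the sse2_and operation from
    "bmsse_util.h", which the VECT_BITCOUNT_AND macro below also uses;
    the buffers are hypothetical): counts bits of (a AND b) without
    writing the intermediate result.

        BM_ALIGN16 bm::word_t a[2048] BM_ALIGN16ATTR;
        BM_ALIGN16 bm::word_t b[2048] BM_ALIGN16ATTR;
        // ... fill a and b ...
        bm::id_t cnt = bm::sse2_bit_count_op((__m128i*)a,
                                             (__m128i*)(a + 2048),
                                             (__m128i*)b,
                                             bm::sse2_and);
*/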


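// Vectorization bindings: map the library's generic VECT_* operation names
// onto the SSE2 kernels defined in this header and in "bmsse_util.h".
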
#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask) \
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask) \
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), (mask))

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last))

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr((first), (last))

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*)(dst), (__m128i*)(dst_end), (value))

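/*
    Usage sketch (hypothetical word buffers a, b): the portable algorithm
    layer invokes the kernels through these macros rather than by name, e.g.

        bm::id_t cnt = VECT_BITCOUNT_AND(a, a + 2048, b);

    which expands to a sse2_bit_count_op(..., sse2_and) call.
*/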


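/*!
    @brief Computes the bit count and the change (interval) count of a
    block in a single pass.

    A "change" is a position whose bit differs from its predecessor,
    detected per word via w ^ (w >> 1), with the previous word's most
    significant bit carried across word boundaries. The result is
    effectively the number of value intervals (runs) in the block, which
    the library can use to estimate a GAP representation length.

    @param block - start of the 128-bit aligned block
    @param block_end - end of the block
    @param bit_count - [out] total number of set bits
    @return interval (change) count of the block
    @ingroup SSE2
*/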
inline
bm::id_t sse2_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
                                          const __m128i* BMRESTRICT block_end,
                                          unsigned* BMRESTRICT bit_count)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // SWAR popcount masks, replicated into all four 32-bit lanes
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // bit count accumulator = 0

    __m128i tmp1, tmp2;

    int count = (block_end - block)*4; // number of 32-bit words in the block

    bm::word_t w, w0, w_prev;
    const int w_shift = sizeof(w) * 8 - 1; // index of the MSB
    bool first_word = true;

    // prime the loop with the first word's intra-word changes
    {
        const bm::word_t* blk = (const bm::word_t*) block;
        w = w0 = blk[0];
        w ^= (w >> 1);                       // 1s mark positions where the bit changes
        BM_INCWORD_BITCOUNT(count, w);
        count -= (w_prev = (w0 >> w_shift)); // carry the MSB into the next word
    }

    bm::id_t BM_ALIGN16 tcnt[4] BM_ALIGN16ATTR;

    do
    {
        // vector lane: popcount the 128-bit word and stash the per-word
        // change masks (b ^ (b >> 1)) into tcnt[] for the scalar pass
        {
            __m128i b = _mm_load_si128(block);

            tmp1 = _mm_srli_epi32(b, 1);
            tmp2 = _mm_xor_si128(b, tmp1);
            _mm_store_si128((__m128i*)tcnt, tmp2);

            // popcount: same SWAR sequence as sse2_bit_count()
            {
                // b = (b & 0x55555555) + ((b >> 1) & 0x55555555)
                tmp1 = _mm_and_si128(tmp1, m1);
                tmp2 = _mm_and_si128(b, m1);
                b = _mm_add_epi32(tmp1, tmp2);

                // b = (b & 0x33333333) + ((b >> 2) & 0x33333333)
                tmp1 = _mm_srli_epi32(b, 2);
                tmp1 = _mm_and_si128(tmp1, m2);
                tmp2 = _mm_and_si128(b, m2);
                b = _mm_add_epi32(tmp1, tmp2);

                // b = (b + (b >> 4)) & 0x0F0F0F0F
                tmp1 = _mm_srli_epi32(b, 4);
                b = _mm_add_epi32(b, tmp1);
                b = _mm_and_si128(b, m3);

                // b = b + (b >> 8)
                tmp1 = _mm_srli_epi32 (b, 8);
                b = _mm_add_epi32(b, tmp1);

                // b = (b + (b >> 16)) & 0x0000003F
                tmp1 = _mm_srli_epi32 (b, 16);
                b = _mm_add_epi32(b, tmp1);
                b = _mm_and_si128(b, m4);

                mcnt = _mm_add_epi32(mcnt, b); // mcnt += b
            }
        }

        // scalar lane: fold the four change masks into the running count,
        // stitching word boundaries via w_prev (MSB of the previous word)
        {
            const bm::word_t* BMRESTRICT blk = (const bm::word_t*) block;

            if (first_word)
            {
                first_word = false; // word 0 was handled before the loop
            }
            else
            {
                if ((w0=blk[0]))
                {
                    BM_INCWORD_BITCOUNT(count, tcnt[0]);
                    count -= !(w_prev ^ (w0 & 1)); // no change across the boundary
                    count -= w_prev = (w0 >> w_shift);
                }
                else
                {
                    count -= !w_prev; w_prev ^= w_prev; // zero word extends the run
                }
            }
            if ((w0=blk[1]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[1]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0=blk[2]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[2]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
            if ((w0=blk[3]))
            {
                BM_INCWORD_BITCOUNT(count, tcnt[3]);
                count -= !(w_prev ^ (w0 & 1));
                count -= w_prev = (w0 >> w_shift);
            }
            else
            {
                count -= !w_prev; w_prev ^= w_prev;
            }
        }
    } while (++block < block_end);

    // horizontal sum of the four popcount lanes -> total bit count
    _mm_store_si128((__m128i*)tcnt, mcnt);
    *bit_count = tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];

    return count;
}
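
/*
    A minimal usage sketch (buffer is hypothetical): computes both the bit
    count and the interval count of a block in one pass.

        BM_ALIGN16 bm::word_t buf[2048] BM_ALIGN16ATTR = {0};
        buf[0] = 0xF0F0F0F0;
        unsigned bc;
        bm::id_t intervals =
            bm::sse2_bit_block_calc_count_change((__m128i*)buf,
                                                 (__m128i*)(buf + 2048),
                                                 &bc);
        // bc == 16; 'intervals' estimates the GAP length of buf
*/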

} // namespace bm


#endif