00001 #ifndef BMSSE4__H__INCLUDED__
00002 #define BMSSE4__H__INCLUDED__
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #include<mmintrin.h>
00034 #include<emmintrin.h>
00035 #include<smmintrin.h>
00036
00037 #include "bmdef.h"
00038 #include "bmsse_util.h"
00039
00040 namespace bm
00041 {
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054 inline
00055 bm::id_t sse4_bit_count(const __m128i* block, const __m128i* block_end)
00056 {
00057 bm::id_t count = 0;
00058
00059 #ifdef BM64_SSE4
00060 do
00061 {
00062 __m128i tmp0 = _mm_load_si128(block);
00063 count += _mm_popcnt_u64(_mm_extract_epi64(tmp0, 0)) +
00064 _mm_popcnt_u64(_mm_extract_epi64(tmp0, 1));
00065 __m128i tmp1 = _mm_load_si128(block+1);
00066 count += _mm_popcnt_u64(_mm_extract_epi64(tmp1, 0)) +
00067 _mm_popcnt_u64(_mm_extract_epi64(tmp1, 1));
00068
00069 block +=2;
00070 } while (block < block_end);
00071
00072 #else
00073 do
00074 {
00075 const unsigned* b = (unsigned*) block;
00076 count += _mm_popcnt_u32(b[0]) +
00077 _mm_popcnt_u32(b[1]) +
00078 _mm_popcnt_u32(b[2]) +
00079 _mm_popcnt_u32(b[3]);
00080 } while (++block < block_end);
00081 #endif
00082 return count;
00083 }
00084
00085
00086
00087
00088 BMFORCEINLINE
00089 unsigned op_xor(unsigned a, unsigned b)
00090 {
00091 unsigned ret = (a ^ b);
00092 return ret;
00093 }
00094
00095
00096
00097
00098 BMFORCEINLINE
00099 unsigned op_or(unsigned a, unsigned b)
00100 {
00101 return (a | b);
00102 }
00103
00104
00105
00106
00107 BMFORCEINLINE
00108 unsigned op_and(unsigned a, unsigned b)
00109 {
00110 return (a & b);
00111 }
00112
00113
00114 template<class Func>
00115 bm::id_t sse4_bit_count_op(const __m128i* BMRESTRICT block,
00116 const __m128i* BMRESTRICT block_end,
00117 const __m128i* BMRESTRICT mask_block,
00118 Func sse2_func)
00119 {
00120 bm::id_t count = 0;
00121 #ifdef BM64_SSE4
00122 do
00123 {
00124 __m128i tmp0 = _mm_load_si128(block);
00125 __m128i tmp1 = _mm_load_si128(mask_block);
00126 __m128i b = sse2_func(tmp0, tmp1);
00127
00128 count += _mm_popcnt_u64(_mm_extract_epi64(b, 0));
00129 count += _mm_popcnt_u64(_mm_extract_epi64(b, 1));
00130
00131 ++block; ++mask_block;
00132 } while (block < block_end);
00133 #else
00134 do
00135 {
00136 __m128i tmp0 = _mm_load_si128(block);
00137 __m128i tmp1 = _mm_load_si128(mask_block);
00138 __m128i b = sse2_func(tmp0, tmp1);
00139
00140 count += _mm_popcnt_u32(_mm_extract_epi32(b, 0));
00141 count += _mm_popcnt_u32(_mm_extract_epi32(b, 1));
00142 count += _mm_popcnt_u32(_mm_extract_epi32(b, 2));
00143 count += _mm_popcnt_u32(_mm_extract_epi32(b, 3));
00144
00145 ++block; ++mask_block;
00146 } while (block < block_end);
00147 #endif
00148
00149 return count;
00150 }
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
// Algorithm dispatch macros: the core library calls generic VECT_* names;
// this header binds them to the SSE2/SSE4.2 implementations above and in
// bmsse_util.h. Arguments are raw word pointers, cast to __m128i* here.

#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

// Plain popcount of a range (POPCNT-accelerated, see sse4_bit_count)
#define VECT_BITCOUNT(first, last) \
sse4_bit_count((__m128i*) (first), (__m128i*) (last))

// Fused "combine then popcount" variants (no temporary result block)
#define VECT_BITCOUNT_AND(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and)

#define VECT_BITCOUNT_OR(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or)

#define VECT_BITCOUNT_XOR(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor)

#define VECT_BITCOUNT_SUB(first, last, mask) \
sse4_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub)

// NOTE(review): trailing ';' inside this macro is inconsistent with the
// others — harmless in statement context ("; ;") but worth confirming
// no caller uses it in an expression position.
#define VECT_INVERT_ARR(first, last) \
sse2_invert_arr(first, last);

#define VECT_AND_ARR(dst, src, src_end) \
sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247 inline
00248 bm::id_t sse4_bit_block_calc_count_change(const __m128i* BMRESTRICT block,
00249 const __m128i* BMRESTRICT block_end,
00250 unsigned* BMRESTRICT bit_count)
00251 {
00252
00253 register int count = (block_end - block)*4;
00254
00255 register bm::word_t w0, w_prev;
00256 const int w_shift = sizeof(w0) * 8 - 1;
00257 bool first_word = true;
00258 *bit_count = 0;
00259
00260
00261 {
00262 bm::word_t w;
00263 const bm::word_t* blk = (const bm::word_t*) block;
00264 w = w0 = blk[0];
00265 *bit_count += _mm_popcnt_u32(w);
00266 w ^= (w >> 1);
00267 count += _mm_popcnt_u32(w);
00268 count -= (w_prev = (w0 >> w_shift));
00269 }
00270
00271 do
00272 {
00273 __m128i b = _mm_load_si128(block);
00274 __m128i tmp2 = _mm_xor_si128(b, _mm_srli_epi32(b, 1));
00275 __m128i tmp3 = _mm_srli_epi32(b, w_shift);
00276
00277
00278
00279 {
00280 if (first_word)
00281 {
00282 first_word = false;
00283 }
00284 else
00285 {
00286 w0 = _mm_extract_epi32(b, 0);
00287 if (w0)
00288 {
00289 *bit_count += _mm_popcnt_u32(w0);
00290 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 0));
00291 count -= !(w_prev ^ (w0 & 1));
00292 count -= w_prev = _mm_extract_epi32(tmp3, 0);
00293 }
00294 else
00295 {
00296 count -= !w_prev; w_prev ^= w_prev;
00297 }
00298 }
00299 w0 = _mm_extract_epi32(b, 1);
00300 if (w0)
00301 {
00302 *bit_count += _mm_popcnt_u32(w0);
00303 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 1));
00304 count -= !(w_prev ^ (w0 & 1));
00305 count -= w_prev = _mm_extract_epi32(tmp3, 1);
00306 }
00307 else
00308 {
00309 count -= !w_prev; w_prev ^= w_prev;
00310 }
00311 w0 = _mm_extract_epi32(b, 2);
00312 if (w0)
00313 {
00314 *bit_count += _mm_popcnt_u32(w0);
00315 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 2));
00316 count -= !(w_prev ^ (w0 & 1));
00317 count -= w_prev = _mm_extract_epi32(tmp3, 2);
00318 }
00319 else
00320 {
00321 count -= !w_prev; w_prev ^= w_prev;
00322 }
00323 w0 = _mm_extract_epi32(b, 3);
00324 if (w0)
00325 {
00326 *bit_count += _mm_popcnt_u32(w0);
00327 count += _mm_popcnt_u32(_mm_extract_epi32(tmp2, 3));
00328 count -= !(w_prev ^ (w0 & 1));
00329 count -= w_prev = _mm_extract_epi32(tmp3, 3);
00330 }
00331 else
00332 {
00333 count -= !w_prev; w_prev ^= w_prev;
00334 }
00335 }
00336 } while (++block < block_end);
00337
00338 return count;
00339 }
00340
00341
00342
00343 }
00344
00345
00346
00347
00348 #endif