00001 #ifndef BMRANDOM__H__INCLUDED__
00002 #define BMRANDOM__H__INCLUDED__
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include "bm.h"
00030 #include "bmfunc.h"
00031 #include "bmdef.h"
00032
00033 #include <stdlib.h>
00034 #include <algorithm>
00035
00036
00037 namespace bm
00038 {
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052 template<class BV>
00053 class random_subset
00054 {
00055 public:
00056 random_subset();
00057 ~random_subset();
00058
00059
00060
00061
00062
00063
00064
00065 void sample(BV& bv_out, const BV& bv_in, unsigned count);
00066
00067
00068 private:
00069 typedef
00070 typename BV::blocks_manager_type blocks_manager_type;
00071
00072 private:
00073 void get_subset(BV& bv_out,
00074 const BV& bv_in,
00075 unsigned bv_in_count,
00076 unsigned count);
00077
00078 unsigned find_max_block();
00079
00080 void get_random_subset(bm::word_t* blk_out,
00081 const bm::word_t* blk_src,
00082 unsigned count);
00083 static
00084 unsigned process_word(bm::word_t* blk_out,
00085 const bm::word_t* blk_src,
00086 unsigned i,
00087 unsigned count);
00088
00089 static
00090 void get_random_array(bm::word_t* blk_out,
00091 bm::gap_word_t* bit_list,
00092 unsigned bit_list_size,
00093 unsigned count);
00094
00095
00096 private:
00097 random_subset(const random_subset&);
00098 random_subset& operator=(const random_subset&);
00099 private:
00100 unsigned* block_counts_;
00101 unsigned short* block_bits_take_;
00102 unsigned blocks_;
00103 bm::gap_word_t bit_list_[bm::gap_max_bits];
00104 unsigned* block_candidates_;
00105 unsigned candidates_count_;
00106 bm::word_t* sub_block_;
00107 };
00108
00109
00110
00111
00112
00113
00114 template<class BV>
00115 random_subset<BV>::random_subset()
00116 : block_counts_(new unsigned[bm::set_total_blocks]),
00117 block_bits_take_(new bm::gap_word_t[bm::set_total_blocks]),
00118 block_candidates_(new unsigned[bm::set_total_blocks]),
00119 candidates_count_(0),
00120 sub_block_(new bm::word_t[bm::set_block_size])
00121 {
00122 }
00123
00124 template<class BV>
00125 random_subset<BV>::~random_subset()
00126 {
00127 delete [] block_counts_;
00128 delete [] block_bits_take_;
00129 delete [] block_candidates_;
00130 delete [] sub_block_;
00131 }
00132
00133 template<class BV>
00134 void random_subset<BV>::sample(BV& bv_out,
00135 const BV& bv_in,
00136 unsigned count)
00137 {
00138 if (count == 0)
00139 {
00140 bv_out.clear(true);
00141 return;
00142 }
00143
00144 unsigned bcnt = bv_in.count();
00145 if (count >= bcnt)
00146 {
00147 bv_out = bv_in;
00148 return;
00149 }
00150 if (count > bcnt/2)
00151 {
00152
00153 BV tmp_bv;
00154 unsigned delta_count = bcnt - count;
00155
00156 get_subset(tmp_bv, bv_in, bcnt, delta_count);
00157 bv_out = bv_in;
00158 bv_out -= tmp_bv;
00159 bv_out.forget_count();
00160 return;
00161 }
00162
00163 get_subset(bv_out, bv_in, bcnt, count);
00164 bv_out.forget_count();
00165 }
00166
00167 template<class BV>
00168 void random_subset<BV>::get_subset(BV& bv_out,
00169 const BV& bv_in,
00170 unsigned bcnt,
00171 unsigned count)
00172 {
00173 bv_out.clear(true);
00174 bv_out.resize(bv_in.size());
00175
00176 const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
00177 blocks_manager_type& bman_out = bv_out.get_blocks_manager();
00178
00179 bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
00180 candidates_count_ = 0;
00181
00182
00183
00184
00185 unsigned block_count = blocks_ = bv_in.count_blocks(block_counts_);
00186 for (unsigned i = 0; i <= block_count; ++i)
00187 {
00188 if (block_counts_[i])
00189 {
00190 float block_percent =
00191 ((float)(block_counts_[i]+1)) / (float)bcnt;
00192 float bits_to_take = ((float)count) * block_percent;
00193 bits_to_take += 0.99f;
00194
00195 unsigned t = (unsigned)bits_to_take;
00196 block_bits_take_[i] = (bm::gap_word_t)t;
00197
00198 if (block_bits_take_[i] == 0)
00199 {
00200 block_bits_take_[i] = 1;
00201 }
00202 else
00203 if (block_bits_take_[i] > block_counts_[i])
00204 block_bits_take_[i] = (gap_word_t)block_counts_[i];
00205
00206 BM_ASSERT(bman_in.get_block(i));
00207 }
00208 else
00209 {
00210 block_bits_take_[i] = 0;
00211 }
00212 }
00213
00214 for (unsigned take_count = 0; count; count -= take_count)
00215 {
00216 unsigned i = find_max_block();
00217 take_count = block_bits_take_[i];
00218 if (take_count > count)
00219 take_count = count;
00220 if (take_count == 0)
00221 continue;
00222 const bm::word_t* blk_src = bman_in.get_block(i);
00223 BM_ASSERT(blk_src);
00224
00225
00226 bm::word_t* blk_out = bman_out.get_block(i);
00227 if (blk_out != 0)
00228 {
00229 blk_out = bman_out.deoptimize_block(i);
00230 }
00231 else
00232 {
00233 blk_out = bman_out.get_allocator().alloc_bit_block();
00234 bman_out.set_block(i, blk_out);
00235 }
00236 if (take_count == block_counts_[i])
00237 {
00238
00239 if (BM_IS_GAP(blk_src))
00240 {
00241 gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
00242 }
00243 else
00244 {
00245 bm::bit_block_copy(blk_out, blk_src);
00246 }
00247 block_bits_take_[i] = 0;
00248 continue;
00249 }
00250 bit_block_set(blk_out, 0);
00251
00252 if (block_counts_[i] < 4096)
00253 {
00254 unsigned arr_len;
00255
00256 if (BM_IS_GAP(blk_src))
00257 {
00258 arr_len = gap_convert_to_arr(bit_list_,
00259 BMGAP_PTR(blk_src),
00260 bm::gap_max_bits);
00261 }
00262 else
00263 {
00264 arr_len = bit_convert_to_arr(bit_list_,
00265 blk_src,
00266 bm::gap_max_bits,
00267 bm::gap_max_bits,
00268 0);
00269 }
00270 BM_ASSERT(arr_len);
00271 get_random_array(blk_out, bit_list_, arr_len, take_count);
00272 }
00273 else
00274 {
00275
00276 if (BM_IS_GAP(blk_src))
00277 {
00278 gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
00279 blk_src = tmp_block;
00280 }
00281
00282
00283 get_random_subset(blk_out, blk_src, take_count);
00284 }
00285
00286 block_bits_take_[i] = 0;
00287 }
00288 }
00289
00290 template<class BV>
00291 void random_subset<BV>::get_random_subset(bm::word_t* blk_out,
00292 const bm::word_t* blk_src,
00293 unsigned take_count)
00294 {
00295 for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
00296 {
00297
00298
00299 unsigned i = rand() % bm::set_block_size;
00300 unsigned new_count;
00301
00302 for (; i < bm::set_block_size && take_count; ++i)
00303 {
00304 if (blk_src[i] && (blk_out[i] != blk_src[i]))
00305 {
00306 take_count -= new_count =
00307 process_word(blk_out, blk_src, i, take_count);
00308 }
00309 }
00310
00311 }
00312
00313
00314 if (take_count)
00315 {
00316
00317 for (unsigned i = 0; i < bm::set_block_size; ++i)
00318 {
00319 sub_block_[i] = blk_src[i] & ~blk_out[i];
00320 }
00321
00322
00323 unsigned arr_len = bit_convert_to_arr(bit_list_,
00324 sub_block_,
00325 bm::gap_max_bits,
00326 bm::gap_max_bits,
00327 0);
00328 BM_ASSERT(arr_len);
00329 get_random_array(blk_out, bit_list_, arr_len, take_count);
00330 }
00331 }
00332
00333 template<class BV>
00334 unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
00335 const bm::word_t* blk_src,
00336 unsigned i,
00337 unsigned count)
00338 {
00339 unsigned new_bits, mask;
00340
00341 do
00342 {
00343 mask = rand();
00344 mask ^= mask << 16;
00345 } while (mask == 0);
00346
00347 unsigned src_rand = blk_src[i] & mask;
00348 new_bits = src_rand & ~blk_out[i];
00349
00350 if (new_bits)
00351 {
00352 unsigned new_count = bm::word_bitcount(new_bits);
00353
00354
00355 if (new_count > count)
00356 {
00357 BM_ASSERT(count);
00358
00359 unsigned char blist[64];
00360 unsigned arr_size = bm::bit_list_4(new_bits, blist);
00361 BM_ASSERT(arr_size == new_count);
00362 std::random_shuffle(blist, blist + arr_size);
00363 unsigned value = 0;
00364 for (unsigned j = 0; j < count; ++j)
00365 {
00366 value |= (1 << blist[j]);
00367 }
00368 new_bits = value;
00369 new_count = count;
00370
00371 BM_ASSERT(bm::word_bitcount(new_bits) == count);
00372 BM_ASSERT((new_bits & ~blk_src[i]) == 0);
00373 }
00374
00375 blk_out[i] |= new_bits;
00376 return new_count;
00377 }
00378
00379 return 0;
00380 }
00381
00382
00383 template<class BV>
00384 void random_subset<BV>::get_random_array(bm::word_t* blk_out,
00385 bm::gap_word_t* bit_list,
00386 unsigned bit_list_size,
00387 unsigned count)
00388 {
00389 std::random_shuffle(bit_list, bit_list + bit_list_size);
00390 for (unsigned i = 0; i < count; ++i)
00391 {
00392 bm::set_bit(blk_out, bit_list[i]);
00393 }
00394 }
00395
00396 template<class BV>
00397 unsigned random_subset<BV>::find_max_block()
00398 {
00399 if (candidates_count_)
00400 {
00401 return block_candidates_[--candidates_count_];
00402 }
00403
00404 unsigned candidate = 0;
00405 unsigned max_i = 0;
00406 for (unsigned i = 0; i <= blocks_; ++i)
00407 {
00408 if (block_bits_take_[i] == 0) continue;
00409 if (block_bits_take_[i] == candidate)
00410 {
00411 block_candidates_[candidates_count_++] = i;
00412 }
00413 else
00414 {
00415 unsigned diff = abs((int)block_bits_take_[i] - (int)candidate);
00416 double d = (double)diff / (double)candidate;
00417
00418 if (d < 0.20f)
00419 {
00420 block_candidates_[candidates_count_++] = i;
00421 }
00422 else
00423 if (block_bits_take_[i] > candidate)
00424 {
00425 candidate = block_bits_take_[i];
00426 max_i = i;
00427 candidates_count_ = 0;
00428 block_candidates_[candidates_count_++] = i;
00429 }
00430 }
00431 }
00432
00433 if (candidates_count_ > 1)
00434 {
00435 std::random_shuffle(block_candidates_, block_candidates_ + candidates_count_);
00436 return find_max_block();
00437 }
00438 return max_i;
00439 }
00440
00441
00442 }
00443
00444 #include "bmundef.h"
00445
00446 #endif