00001 #ifndef FAKEHEAP_HAEDER_
00002 #define FAKEHEAP_HEADER_
00003
00010 #include <vector>
00011 #include <algorithm>
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00058 template <class T> class FakeHeap : private std::vector<T>
00059 {
00060 private:
00061
00062 #ifdef FAKEHEAP_DEBUG
00063 mutable bool push_locked;
00064 #endif
00065
00066 public:
00067
00068 #ifdef FAKEHEAP_DEBUG
00069 inline FakeHeap() : std::vector<T>(), push_locked(false) { cout << "FakeHeap:: constructor..." << endl; };
00070 inline ~FakeHeap() { };
00071 #endif
00072
00073 #ifdef FAKEHEAP_DEBUG
00074 inline void clear()
00075 {
00076 cout << "FakeHeap:: CLEAR..." << endl;
00077 std::vector<T>::clear();
00078 push_locked=false;
00079 }
00080 #else
00081 inline void clear() { std::vector<T>::clear(); }
00082 #endif
00083
00084
00085 inline bool empty() const { return std::vector<T>::empty(); }
00086
00087 inline void sort(void)
00088 {
00089
00090 std::sort(this->begin(),this->end());
00091
00092 #ifdef FAKEHEAP_DEBUG
00093 push_locked=true;
00094 #endif
00095 }
00096 inline void push(const T &x)
00097 {
00098 #ifdef FAKEHEAP_DEBUG
00099 if (push_locked)
00100 {
00101 std::cerr << "FakeHeap:: Hey?!!! push is locked!!!" << endl;
00102 push_back(x);
00103 sort();
00104 return;
00105 }
00106 #endif
00107 push_back(x);
00108 }
00109 inline void pop(void)
00110 {
00111 #ifdef FAKEHEAP_DEBUG
00112 if (!push_locked)
00113 {
00114 std::cerr << "FakeHeap:: Hey?!!! pop is locked!!!" << endl;
00115
00116 }
00117 push_locked=true;
00118 #endif
00119 std::vector<T>::pop_back();
00120 }
00121 inline const T& top(void) const
00122 {
00123 #ifdef FAKEHEAP_DEBUG
00124 if (!push_locked)
00125 {
00126 std::cerr << "FakeHeap:: Hey?!!! top is locked!!!" << endl;
00127
00128 }
00129 push_locked=true;
00130 #endif
00131 return std::vector<T>::back();
00132 }
00133 };
00134
00135
00136 #if 1 && (defined (ASM_MMX) || defined(ASM_ATHLON) || defined(ASM_SSE) || defined(ASM_X86_64))
00137 #ifdef DEBUG
00138 #warning "new experimental FAKEHEAP specialization enabled"
00139 #endif
00140
00141
00142
00143 template<> class FakeHeap<TSieve_Delta>
00144 {
00145 private:
00146 static const int MORESIZE = 0x10000;
00147 TSieve_Delta* p;
00148 int size, capacity;
00149
00150 inline void swap_p (const long int i, const long int j)
00151 {
00152 asm volatile( \
00153 "movq (%[p],%[i],8),%%mm0 \n\t" \
00154 "movq (%[p],%[j],8),%%mm1 \n\t" \
00155 "movq %%mm0,(%[p],%[j],8) \n\t" \
00156 "movq %%mm1,(%[p],%[i],8)" \
00157 :
00158 : [p] "r" (p), [i] "r" (i), [j] "r" (j)
00159 : "memory", "mm0", "mm1");
00160 }
00161
00162 void quicksort(register long int l, register long int r)
00163 {
00164
00165 struct { int L,R; } stack[256];
00166 int stackpointer = 0;
00167
00168
00169 iteration_loop:
00170 if (r>l)
00171 {
00172 if (r-l<32)
00173 {
00174
00175 #if 0
00176 for (int i=l+1; i<=r; ++i)
00177 {
00178 TSieve_Delta v=p[i];
00179 int j=i;
00180 while (j>l && p[j-1].delta<v.delta) { p[j]=p[j-1]; --j; }
00181 p[j]=v;
00182 }
00183 #elif defined(ASM_ATHLON)
00184 asm volatile( \
00185 "movl %[l],%%edx \n\t" \
00186 "jmp 2f \n\t" \
00187 ".balign 16 \n\t" \
00188 "3: movl %%edx,%%ecx \n\t " \
00189 "decl %%ecx \n\t" \
00190 "movq (%[p],%%edx,8),%%mm1 \n\t" \
00191 "movl (%[p],%%edx,8),%%eax \n\t" \
00192 "0: cmpl (%[p],%%ecx,8),%%eax \n\t" \
00193 "jle 1f \n\t" \
00194 "movq (%[p],%%ecx,8),%%mm0 \n\t" \
00195 "movq %%mm0,8(%[p],%%ecx,8) \n\t" \
00196 "decl %%ecx \n\t" \
00197 "jns 0b \n\t" \
00198 "1: movq %%mm1,8(%[p],%%ecx,8) \n\t" \
00199 "2: incl %%edx \n\t" \
00200 "cmpl %[r],%%edx \n\t" \
00201 "jle 3b" \
00202 :
00203 : [p] "r" (p), [l] "g" (l), [r] "g" (r)
00204 : "cc", "memory", "eax", "ecx", "edx", "mm0", "mm1");
00205 #elif defined(ASM_X86_64)
00206 asm volatile( \
00207 "mov %[l],%%rdx \n\t" \
00208 "jmp 2f \n\t" \
00209 ".balign 16 \n\t" \
00210 "3: mov %%rdx,%%rcx \n\t " \
00211 "dec %%rcx \n\t" \
00212 "mov (%[p],%%rdx,8),%%rax \n\t" \
00213 "0: cmpl (%[p],%%rcx,8),%%eax \n\t" \
00214 "jle 1f \n\t" \
00215 "movq (%[p],%%rcx,8),%%mm0 \n\t" \
00216 "movq %%mm0,8(%[p],%%rcx,8) \n\t" \
00217 "dec %%rcx \n\t" \
00218 "jns 0b \n\t" \
00219 "1: mov %%rax,8(%[p],%%rcx,8) \n\t" \
00220 "2: inc %%rdx \n\t" \
00221 "cmp %[r],%%rdx \n\t" \
00222 "jle 3b" \
00223 :
00224 : [p] "r" (p), [l] "g" (l), [r] "g" (r)
00225 : "cc", "memory", "rax", "rcx", "rdx", "mm0");
00226 #else
00227 asm volatile( \
00228 "movl %[l],%%edx \n\t" \
00229 "jmp 2f \n\t" \
00230 "3: movl %%edx,%%ecx \n\t " \
00231 "sub $1,%%ecx \n\t" \
00232 "movq (%[p],%%edx,8),%%mm1 \n\t" \
00233 "movl (%[p],%%edx,8),%%eax \n\t" \
00234 "0: cmpl (%[p],%%ecx,8),%%eax \n\t" \
00235 "jle 1f \n\t" \
00236 "movq (%[p],%%ecx,8),%%mm0 \n\t" \
00237 "movq %%mm0,8(%[p],%%ecx,8) \n\t" \
00238 "sub $1,%%ecx \n\t" \
00239 "jns 0b \n\t" \
00240 "1: movq %%mm1,8(%[p],%%ecx,8) \n\t" \
00241 "2: add $1,%%edx \n\t" \
00242 "cmpl %[r],%%edx \n\t" \
00243 "jle 3b" \
00244 :
00245 : [p] "r" (p), [l] "g" (l), [r] "g" (r)
00246 : "cc", "memory", "eax", "ecx", "edx", "mm0", "mm1");
00247 #endif
00248 }
00249 else
00250 {
00251
00252
00253 register long int i = (l+r)>>1;
00254 if (p[l].delta<p[r].delta) swap_p(l,r);
00255 if (p[l].delta<p[i].delta) swap_p(l,i);
00256 if (p[r].delta<p[i].delta) swap_p(r,i);
00257 #if 0
00258
00259 const int pivot=p[r].delta;
00260 i=l;
00261 int j=r;
00262 while (true)
00263 {
00264 do ++i; while (p[i].delta>pivot);
00265 do --j; while (p[j].delta<pivot);
00266 if (i>=j) break;
00267 swap_p(i,j);
00268 }
00269 swap_p(i,r);
00270 #elif defined(ASM_ATHLON)
00271
00272 asm( \
00273 "movq (%[p],%[r],8),%%mm2 \n\t" \
00274 "movl (%[p],%[r],8),%%eax \n\t" \
00275 "movl %[l],%[i] \n\t " \
00276 "movl %[r],%%ecx \n\t" \
00277 "0: incl %[i] \n\t"
00278 "cmpl (%[p],%[i],8),%%eax \n\t" \
00279 "jl 0b \n\t" \
00280 "prefetchw 192(%[p],%[i],8) \n\t" \
00281 "1: decl %%ecx \n\t"
00282 "cmpl (%[p],%%ecx,8),%%eax \n\t" \
00283 "jg 1b \n\t" \
00284 "cmpl %%ecx,%[i] \n\t" \
00285 "movq (%[p],%[i],8),%%mm0 \n\t" \
00286 "jge 2f \n\t" \
00287 "movq (%[p],%%ecx,8),%%mm1 \n\t" \
00288 "movq %%mm0,(%[p],%%ecx,8) \n\t" \
00289 "movq %%mm1,(%[p],%[i],8) \n\t" \
00290 "prefetchw -192(%[p],%%ecx,8) \n\t" \
00291 "jmp 0b \n\t" \
00292 "2: movq %%mm2,(%[p],%[i],8) \n\t" \
00293 "movq %%mm0,(%[p],%[r],8)" \
00294 : [i] "=&r" (i)
00295 : [p] "r" (p), [l] "r" (l), [r] "r" (r)
00296 : "cc", "memory", "eax", "ecx", "mm0", "mm1", "mm2");
00297 #elif defined(ASM_X86_64)
00298
00299 asm( \
00300 "mov (%[p],%[r],8),%%rax \n\t" \
00301 "mov %[l],%[i] \n\t " \
00302 "mov %[r],%%rcx \n\t" \
00303 "0: inc %[i] \n\t"
00304 "cmpl (%[p],%[i],8),%%eax \n\t" \
00305 "jl 0b \n\t" \
00306 "#prefetchw 192(%[p],%[i],8) \n\t" \
00307 "1: dec %%ecx \n\t"
00308 "cmpl (%[p],%%rcx,8),%%eax \n\t" \
00309 "jg 1b \n\t" \
00310 "cmp %%rcx,%[i] \n\t" \
00311 "movq (%[p],%[i],8),%%mm0 \n\t" \
00312 "jge 2f \n\t" \
00313 "movq (%[p],%%rcx,8),%%mm1 \n\t" \
00314 "movq %%mm0,(%[p],%%rcx,8) \n\t" \
00315 "movq %%mm1,(%[p],%[i],8) \n\t" \
00316 "#prefetchw -192(%[p],%%rcx,8) \n\t" \
00317 "jmp 0b \n\t" \
00318 "2: mov %%rax,(%[p],%[i],8) \n\t" \
00319 "movq %%mm0,(%[p],%[r],8)" \
00320 : [i] "=&r" (i)
00321 : [p] "r" (p), [l] "r" (l), [r] "r" (r)
00322 : "cc", "memory", "rax", "rcx", "mm0", "mm1");
00323 #else
00324
00325 asm( \
00326 "lea (%[p],%[r],8),%%ecx \n\t" \
00327 "lea (%[p],%[l],8),%[i] \n\t " \
00328 "movl (%%ecx),%%eax \n\t" \
00329 "movq (%%ecx),%%mm2 \n\t" \
00330 "0: add $8,%[i] \n\t"
00331 "cmpl (%[i]),%%eax \n\t" \
00332 "jl 0b \n\t" \
00333 "movq (%[i]),%%mm0 \n\t" \
00334 "1: sub $8,%%ecx \n\t"
00335 "cmpl (%%ecx),%%eax \n\t" \
00336 "jg 1b \n\t" \
00337 "cmpl %%ecx,%[i] \n\t" \
00338 "jge 2f \n\t" \
00339 "movq (%%ecx),%%mm1 \n\t" \
00340 "movq %%mm0,(%%ecx) \n\t" \
00341 "movq %%mm1,(%[i]) \n\t" \
00342 "jmp 0b \n\t" \
00343 "2: movq %%mm2,(%[i]) \n\t" \
00344 "subl %[p],%[i] \n\t" \
00345 "shrl $3,%[i] \n\t" \
00346 "movq %%mm0,(%[p],%[r],8)" \
00347 : [i] "=&r" (i)
00348 : [p] "r" (p), [l] "r" (l), [r] "r" (r)
00349 : "cc", "memory", "eax", "ecx", "mm0", "mm1", "mm2");
00350 #endif
00351
00352
00353
00354 if (r-i<i-l)
00355 {
00356 stack[stackpointer].L=l; stack[stackpointer].R=i-1; l=i+1;
00357 }
00358 else
00359 {
00360 stack[stackpointer].L=i+1; stack[stackpointer].R=r; r=i-1;
00361 }
00362 ++stackpointer;
00363 goto iteration_loop;
00364 }
00365 }
00366 if (stackpointer)
00367 {
00368 --stackpointer;
00369 l=stack[stackpointer].L; r=stack[stackpointer].R;
00370 goto iteration_loop;
00371 }
00372 }
00373
00374 public:
00375
00376 inline FakeHeap() : p(new TSieve_Delta[MORESIZE]), size(0), capacity(MORESIZE) { }
00377 inline ~FakeHeap()
00378 {
00379 delete [] p;
00380 #ifdef DEBUG
00381 MARK; cout << "cleared " << capacity*8/1024 << " KB." << endl;
00382 #endif
00383 }
00384
00385 inline void clear() { size=0; }
00386 inline bool empty() const { return size==0; }
00387
00388 inline void sort(void)
00389 {
00390
00391 if (size>1)
00392 {
00393 quicksort(0,size-1);
00394 asm volatile ("emms");
00395 }
00396
00397 #if 0 || defined(DEBUG)
00398 for (int i=1; i<size; ++i) if (p[i-1].delta<p[i].delta) { std::cout << "not sorted " << i << ": " << p[i-1].delta << " " << p[i].delta << std::endl; MARK; exit(1); }
00399 #endif
00400
00401
00402 }
00403 inline void push(const TSieve_Delta &x)
00404 {
00405 if (size>=capacity)
00406 {
00407 TSieve_Delta* p_old = p;
00408 capacity+=MORESIZE;
00409 p = new TSieve_Delta[capacity];
00410 for (int i=0; i<size; ++i) p[i]=p_old[i];
00411 delete [] p_old;
00412 }
00413 p[size]=x; ++size;
00414 __builtin_prefetch (&p[size+24], 1, 1);
00415 }
00416 inline void pop(void)
00417 {
00418 --size;
00419 }
00420 inline const TSieve_Delta& top(void) const
00421 {
00422 __builtin_prefetch (&p[size-16], 0, 0);
00423 return p[size-1];
00424 }
00425 };
00426
00427 #endif
00428
00429 #endif