00001
00006 #include "Sieving.H"
00007 #include "qsieve.H"
00008 #include "modulo.H"
00009 using namespace numtheory;
00010
00011 #include "mpqsPolynom.H"
00012 extern CmpqsPolynom Polynom;
00013
00014
00015 int SieveOffset = -LogicalSieveSize;
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 class StaticRelations
00027 {
00028 public:
00029 static void insert(CRelation *GL, const bool do_multi_combine_init=true);
00030 };
00031
00032 TSieveElement SieveArray_[PhysicalSieveSize+64+8] __attribute__ ((aligned (64)));
00033
00034 TSieveElement log_Primzahl_of_PrimePowers[StaticFactorbase::max_additional_Powers];
00035
00036
00037 TSieveElement SieveControl::log_PrimeNumbers[StaticFactorbase::MaxSize] __attribute__ ((aligned (16)));
00038 TSieveElement SieveControl::log_PrimeNumbers_dirty[StaticFactorbase::MaxSize] __attribute__ ((aligned (16)));
00039 int SieveControl::myFBStartIndex = SieveControl::FBLowestStartIndex;
00040 int SieveControl::myFBStartIndexHint = SieveControl::FBLowestStartIndex;
00041 signed int SieveControl::DirtinessPegel = 0;
00042
00043
00044 int CDeltaComputations::Delta_of_PrimeNumbers[StaticFactorbase::MaxSize][2] __attribute__ ((aligned (16)));
00045 int CDeltaComputations::Delta_of_PrimePowers[StaticFactorbase::max_additional_Powers][2] __attribute__ ((aligned (16)));
00046
00047 int CDeltaComputations::D_mod_FBPrime[StaticFactorbase::MaxSize];
00048 signed int CDeltaComputations::D_difference_to_last = 0;
00049 CDeltaComputations::CmpqsDmodPUpdater CDeltaComputations::mpqsDmodPUpdater;
00050
00051 void CDeltaComputations::CmpqsDmodPUpdater::update()
00052 {
00053
00054 mpz_sub(act_D,Polynom.get_D(),act_D);
00055 if (mpz_fits_uint_p(act_D))
00056 {
00057 const int d = D_difference_to_last = mpz_get_ui(act_D);
00058 #ifdef VERBOSE
00059 MARK; cout<< "D update difference is " << d << ", #FB=" << StaticFactorbase::Size() << endl;
00060 #endif
00061
00062 int i;
00063 for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
00064 {
00065 D_mod_FBPrime[i+1]+=d;
00066 while (D_mod_FBPrime[i+1]>=D_mod_FBPrime[i]) D_mod_FBPrime[i+1]-=D_mod_FBPrime[i];
00067 }
00068 for (; i<StaticFactorbase::Size(); ++i)
00069 {
00070 D_mod_FBPrime[i]+=d;
00071 while (D_mod_FBPrime[i]>=PrimeNumbers[i]) D_mod_FBPrime[i]-=PrimeNumbers[i];
00072 }
00073 mpz_set(act_D,Polynom.get_D());
00074 }
00075 else
00076 {
00077 #ifdef VERBOSE
00078 MARK; cout<< "D update difference is " << act_D << ", #FB=" << StaticFactorbase::Size() << endl;
00079 #endif
00080 mpz_set(act_D,Polynom.get_D());
00081 int i;
00082 for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
00083 {
00084 D_mod_FBPrime[i]=PrimeNumbers[i]*PrimeNumbers[i+1];
00085 D_mod_FBPrime[i+1]=mpz_remainder_ui(act_D,D_mod_FBPrime[i]);
00086 }
00087 for (; i<StaticFactorbase::Size(); ++i) D_mod_FBPrime[i]=mpz_remainder_ui(act_D,PrimeNumbers[i]);
00088 }
00089
00090 #ifdef DEBUG
00091 int i;
00092 for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
00093 if (mpz_remainder_ui(Polynom.get_D(),D_mod_FBPrime[i]) != D_mod_FBPrime[i+1])
00094 {
00095 MARK; cerr << "sanity check failed for i=" << i << ": "
00096 << mpz_remainder_ui(Polynom.get_D(),D_mod_FBPrime[i])
00097 << " != " << D_mod_FBPrime[i+1] << endl;
00098 exit(1);
00099 }
00100 for (; i<StaticFactorbase::Size(); ++i)
00101 if (mpz_remainder_ui(Polynom.get_D(),PrimeNumbers[i]) != D_mod_FBPrime[i])
00102 {
00103 MARK; cerr << "sanity check failed for i=" << i << ": "
00104 << mpz_remainder_ui(Polynom.get_D(),PrimeNumbers[i])
00105 << " != " << D_mod_FBPrime[i] << endl;
00106 exit(1);
00107 }
00108 #endif
00109 }
00110
00111
00112 #if (TSIEVEELEMENTSIZE == 1)
00113 unsigned short int CSieveStaticFactors::LogValRuns[128];
00114 unsigned short int CSieveStaticFactors::PrimeNumberDiffs[CSieveStaticFactors::MaxSize];
00115 #endif
00116
00117 int* CSieveStaticFactors::DefaultHelperTable[2] = { NULL, NULL };
00118
00119 unsigned int SieveControl::PhysicalSieveIntervalIndex;
00120
00121
00122
00123 unsigned int SieveControl::BestPSI[2];
00124
00125
00126 TSieveElement SieveControl::PhysicalSieveIntervalThresholdCorrector[1024] = { 0 };
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136 void SieveControl::set_logVal_for_Primenumber(const int pos, const TSieveElement val, const TSieveElement dirtyval)
00137 {
00138 log_PrimeNumbers[pos]=val;
00139 log_PrimeNumbers_dirty[pos]=dirtyval;
00140 }
00141
00142 void SieveControl::set_logVal_for_Primepower(const int pos, const TSieveElement val)
00143 {
00144 log_Primzahl_of_PrimePowers[pos]=val;
00145 }
00146
00147 bool SieveControl::Hit_after_DirtyCorrection(const signed int SievePos, TSieveElement Value)
00148 {
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160 for (int i=FBStartIndex()-1; i>=FBLowestStartIndex; --i)
00161 {
00162 register int d = normalized_signed_mod(SievePos,StaticFactorbase::PrimeNumbers[i],StaticFactorbase::PrimeNumberReciprocals[i]);
00163
00164
00165
00166
00167 const bool Treffer = (d==CDeltaComputations::Delta_of_PrimeNumbers[i][0] || d==CDeltaComputations::Delta_of_PrimeNumbers[i][1]);
00168 if (Treffer)
00169 {
00170
00171 Value+=PrimzahlSieveWeight_dirty(i);
00172 Value-=PrimzahlSieveWeight(i);
00173 }
00174 else
00175 {
00176
00177
00178 Value+=PrimzahlSieveWeight_dirty(i);
00179
00180
00181 }
00182 }
00183
00184 if (Value>=0)
00185 {
00186 SieveControl::RequestLessDirtiness();
00187 return false;
00188 }
00189 return true;
00190 }
00191
00192
00193 TSieveElement SieveControl::log_Threshold = 0;
00194 TSieveElement SieveControl::raw_log_Threshold = 0;
00195 TSieveElement SieveControl::log_ThresholdDeltaHint = 0;
00196
00197 void SieveControl::compute_SieveThreshold(void)
00198 {
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209 mpz_t x; mpz_init(x);
00210 mpz_div_ui(x,kN,2); mpz_sqrt(x,kN); mpz_mul_ui(x,x,LogicalSieveSize);
00211 double d = log(fabs(mpz_get_d(x))) - (Factor_Threshold*log(StaticFactorbase::BiggestPrime()));
00212 mpz_clear(x);
00213 if (StaticFactorbase::PrimeNumbers[1]==2) d-=log(2);
00214 d*=log_SieveEntryMultiplier;
00215 #ifdef VERBOSE_INFO
00216 cout << "Threshold (initial sieve entries) for detecting relations set to " << d << endl;
00217 #endif
00218 raw_log_Threshold=log_Threshold=static_cast<TSieveElement>(floor(d));
00219 if (log_Threshold!=floor(d))
00220 {
00221 cerr << "With the compiled-in types and constants this" << endl;
00222 cerr << "factorization cannot be done, sorry." << endl;
00223 cerr << "TSieveElement (compile-time type) needs to be increased to fit the threshold!" << endl;
00224 cerr << "Please modify log_SieveEntryMultiplier and/or TSieveElement" << endl;
00225 cerr << "and compile the program again." << endl;
00226 exit(1);
00227 }
00228
00229 #if TSIEVEELEMENTSIZE > 0
00230
00231 if (sizeof(TSieveElement)!=TSIEVEELEMENTSIZE)
00232 {
00233 cerr << "Compiled types and values for " << endl
00234 << "TSieveElement (Size=" << sizeof(TSieveElement) << ")"
00235 << " and TSIEVEELEMENTSIZE (=" << TSIEVEELEMENTSIZE << ")" << endl
00236 << "do not match. Please correct them!" << endl;
00237 exit(1);
00238 }
00239 #endif
00240 }
00241
00242
00243
00244
00245 #if defined(ASM_386) || defined(ASM_X86_64)
00246 #define SIEBASM_386
00247
00248 unsigned int clobbered_int;
00249
00250 #if TSIEVEELEMENTSIZE == 1
00251 #if defined(ASM_SSE2) || defined(ASM_X86_64)
00252 #ifdef DEBUG
00253 #warning "Initialize Sieve: SSE2 Sieve initialization activated."
00254 #endif
00255 #define asm_sieb_init(logValue) \
00256 asm volatile(\
00257 "movd %%eax,%%xmm0 # -> (x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,a) \n\t" \
00258 "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,x,x,x,x,x,x,a,a) \n\t" \
00259 "movl %[count],%%eax \n\t" \
00260 "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,x,x,x,x,a,a,a,a) \n\t" \
00261 "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,a,a,a,a,a,a,a,a) \n\t" \
00262 "punpcklbw %%xmm0,%%xmm0 # (a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a) \n\t" \
00263 ".balign 16 \n\t" \
00264 "1: \n\t" \
00265 "movdqa %%xmm0,(%[Sieve]) \n\t" \
00266 "movdqa %%xmm0,0x10(%[Sieve]) \n\t" \
00267 "movdqa %%xmm0,0x20(%[Sieve]) \n\t" \
00268 "movdqa %%xmm0,0x30(%[Sieve]) \n\t" \
00269 "add $0x40,%[Sieve] \n\t" \
00270 "sub $0x40,%%eax \n\t" \
00271 "jnz 1b \n\t" \
00272 "emms \n" \
00273 : "=D" (clobbered_int), "=a" (clobbered_int) : [threshold] "a" (logValue), [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize) : "cc", "memory", "xmm0");
00274 #elif defined(ASM_MMX)
00275 #ifdef DEBUG
00276 #warning "Initialize Sieve: MMX Sieve initialization activated."
00277 #endif
00278 #define asm_sieb_init(logValue) \
00279 asm volatile(\
00280 "movd %%eax,%%mm0 # -> (x,x,x,x,x,x,x,a) \n\t" \
00281 "punpcklbw %%mm0,%%mm0 # (x,x,x,x,x,x,a,a) \n\t" \
00282 "movl %[count],%%eax \n\t" \
00283 "punpcklbw %%mm0,%%mm0 # (x,x,x,x,a,a,a,a) \n\t" \
00284 "punpcklbw %%mm0,%%mm0 # (a,a,a,a,a,a,a,a) \n\t" \
00285 ".balign 16 \n\t" \
00286 "1: \n\t" \
00287 "movq %%mm0,(%[Sieve]) \n\t" \
00288 "movq %%mm0,8(%[Sieve]) \n\t" \
00289 "movq %%mm0,16(%[Sieve]) \n\t" \
00290 "movq %%mm0,24(%[Sieve]) \n\t" \
00291 "movq %%mm0,32(%[Sieve]) \n\t" \
00292 "movq %%mm0,40(%[Sieve]) \n\t" \
00293 "movq %%mm0,48(%[Sieve]) \n\t" \
00294 "movq %%mm0,56(%[Sieve]) \n\t" \
00295 "add $64,%[Sieve] \n\t" \
00296 "sub $64,%%eax \n\t" \
00297 "jnz 1b \n\t" \
00298 "emms \n" \
00299 : "=D" (clobbered_int), "=a" (clobbered_int) : [threshold] "a" (logValue), [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize) : "cc", "memory", "mm0");
00300 #else
00301 #define asm_sieb_init(logValue) \
00302 asm volatile ( \
00303 "movb %%al,%%ah\n\t" \
00304 "pushw %%ax\n\t" \
00305 "shll $16,%%eax\n\t" \
00306 "popw %%ax\n\t" \
00307 "shrl $2,%%ecx\n\t" \
00308 "cld\n\t" \
00309 "rep\n\t" \
00310 "stosl" \
00311 : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "cc", "memory");
00312 #endif
00313
00314 #if defined(ASM_ATHLON) || defined(ASM_SSE) || defined(ASM_X86_64)
00315
00316 #ifdef DEBUG
00317 #warning "using IA-32/ATHLON version for macro: asm_search_sieb"
00318 #endif
00319 #define asm_search_sieb(offset) \
00320 asm volatile ( \
00321 "0: # calibrate alignment \n\t" \
00322 "test $7,%[pos] \n\t" \
00323 "jz 1f # is aligned \n\t" \
00324 "dec %[pos] \n\t" \
00325 "testb $0x80,(%[sieb],%[pos]) \n\t" \
00326 "jns 0b \n\t" \
00327 "js 4f # hit detected \n\t" \
00328 "1: # search for hit, (big steps) loop \n\t" \
00329 "prefetcht2 -256(%[sieb],%[pos]) # tuning cache!! \n\t" \
00330 "sub $64,%[pos] \n\t" \
00331 "movq (%[sieb],%[pos]),%%mm0 \n\t" \
00332 "movq 8(%[sieb],%[pos]),%%mm1 \n\t" \
00333 "por 16(%[sieb],%[pos]),%%mm0 \n\t" \
00334 "por 24(%[sieb],%[pos]),%%mm1 \n\t" \
00335 "por 32(%[sieb],%[pos]),%%mm0 \n\t" \
00336 "por 40(%[sieb],%[pos]),%%mm1 \n\t" \
00337 "por 48(%[sieb],%[pos]),%%mm0 \n\t" \
00338 "por 56(%[sieb],%[pos]),%%mm1 \n\t" \
00339 "por %%mm1,%%mm0 \n\t" \
00340 "pmovmskb %%mm0,%%eax # highest bits of packed bytes to %%ah \n\t" \
00341 "testl %%eax,%%eax \n\t" \
00342 "jz 1b \n\t" \
00343 "emms # clear MMX state \n\t" \
00344 "2: # search for hit, (little steps) init \n\t" \
00345 "add $64,%[pos] \n\t" \
00346 "3: # search for hit, (little steps) loop \n\t" \
00347 "sub $4,%[pos] \n\t" \
00348 "testl $0x80808080,(%[sieb],%[pos]) \n\t" \
00349 "jz 3b \n\t" \
00350 "movl (%[sieb],%[pos]),%%eax \n\t" \
00351 "add $3,%[pos] \n\t" \
00352 "testl $0x80000000,%%eax \n\t" \
00353 "jnz 4f \n\t" \
00354 "dec %[pos] \n\t" \
00355 "testl $0x00800000,%%eax \n\t" \
00356 "jnz 4f \n\t" \
00357 "dec %[pos] \n\t" \
00358 "testl $0x00008000,%%eax \n\t" \
00359 "jnz 4f \n\t" \
00360 "dec %[pos] \n\t" \
00361 "4: # hit detected" \
00362 : [pos] "=q" (offset) : "[pos]" (offset), [sieb] "r" (SieveArray) : "cc", "eax", "mm0", "mm1");
00363 #else
00364
00365 #define asm_search_sieb(offset) \
00366 asm volatile ( \
00367 "0: # calibrate alignment \n\t" \
00368 "testl $3,%[pos] \n\t" \
00369 "jz 1f # is aligned \n\t" \
00370 "decl %[pos] \n\t" \
00371 "testb $0x80,(%[sieb],%[pos]) \n\t" \
00372 "jns 0b \n\t" \
00373 "js 2f # hit detected \n\t" \
00374 "1: # start search \n\t" \
00375 "subl $4,%[pos] \n\t" \
00376 "testl $0x80808080,(%[sieb],%[pos]) \n\t" \
00377 "jz 1b \n\t" \
00378 "addl $3,%[pos] \n\t" \
00379 "testb $0x80,(%[sieb],%[pos]) \n\t" \
00380 "js 2f \n\t" \
00381 "decl %[pos] \n\t" \
00382 "testb $0x80,(%[sieb],%[pos]) \n\t" \
00383 "js 2f \n\t" \
00384 "decl %[pos] \n\t" \
00385 "testb $0x80,(%[sieb],%[pos]) \n\t" \
00386 "js 2f \n\t" \
00387 "decl %[pos] \n\t" \
00388 "2: # hit detected" \
00389 : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
00390 #endif
00391 #if defined(ASM_X86_64)
00392
00393 #ifdef DEBUG
00394 #warning "using CMOV X86_64 version for macro: asm_sieb"
00395 #endif
00396 #define asm_sieb(lp, d0, d1, P) \
00397 asm volatile ( \
00398 "cmp %[disp1],%[disp0]\n\t" \
00399 "cmova %[disp0],%%rax # conditional swap part1\n\t" \
00400 "cmova %[disp1],%[disp0] # part2 \n\t" \
00401 "cmova %%rax,%[disp1] # part3 \n\t" \
00402 "movl %[limit],%%eax\n\t" \
00403 "jmp 1f \n\t" \
00404 ".balign 16 \n\t" \
00405 "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00406 "add %[step],%[disp1]\n\t" \
00407 "subb %[val],(%[sieb],%[disp0])\n\t" \
00408 "add %[step],%[disp0]\n\t" \
00409 "1: cmp %%rax,%[disp1]\n\t" \
00410 "jb 0b\n\t" \
00411 "cmp %%rax,%[disp0] \n\t" \
00412 "cmovb %[disp0],%%rax \n\t" \
00413 "subb %[val],(%[sieb],%%rax)\n\t" \
00414 "xor %%rax,%%rax \n\t" \
00415 "cmp %[limit],%[disp0] \n\t" \
00416 "cmovb %[step],%%rax \n\t" \
00417 "add %%rax,%[disp0] \n" \
00418 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "rax");
00419
00420
00421 #define asm_sieb_small(lp, d0, d1, P) \
00422 asm volatile ( \
00423 "cmp %[disp1],%[disp0]\n\t" \
00424 "cmova %[disp0],%%rax # conditional swap part1\n\t" \
00425 "cmova %[disp1],%[disp0] # part2 \n\t" \
00426 "cmova %%rax,%[disp1] # part3 \n\t" \
00427 "mov %[limit],%%eax\n\t" \
00428 "jmp 0f \n\t" \
00429 ".balign 16 \n\t" \
00430 "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00431 "add %[step],%[disp1]\n\t" \
00432 "subb %[val],(%[sieb],%[disp0])\n\t" \
00433 "add %[step],%[disp0]\n\t" \
00434 "1: cmp %%rax,%[disp1]\n\t" \
00435 "jb 0b\n\t" \
00436 "cmp %%rax,%[disp0] \n\t" \
00437 "cmovb %[disp0],%%rax \n\t" \
00438 "subb %[val],(%[sieb],%%rax)\n\t" \
00439 "xor %%rax,%%rax \n\t" \
00440 "cmp %[limit],%[disp0] \n\t" \
00441 "cmovb %[step],%%rax \n\t" \
00442 "add %%rax,%[disp0] \n" \
00443 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "rax");
00444 #elif defined(ASM_CMOV)
00445
00446 #ifdef DEBUG
00447 #warning "using CMOV version for macro: asm_sieb"
00448 #endif
00449 #define asm_sieb(lp, d0, d1, P) \
00450 asm volatile ( \
00451 "cmpl %[disp1],%[disp0]\n\t" \
00452 "cmova %[disp0],%%eax # conditional swap part1\n\t" \
00453 "cmova %[disp1],%[disp0] # part2 \n\t" \
00454 "cmova %%eax,%[disp1] # part3 \n\t" \
00455 "movl %[limit],%%eax\n\t" \
00456 "jmp 1f \n\t" \
00457 ".balign 16 \n\t" \
00458 "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00459 "addl %[step],%[disp1]\n\t" \
00460 "subb %[val],(%[sieb],%[disp0])\n\t" \
00461 "addl %[step],%[disp0]\n\t" \
00462 "1: cmpl %%eax,%[disp1]\n\t" \
00463 "jb 0b\n\t" \
00464 "cmpl %%eax,%[disp0] \n\t" \
00465 "cmovb %[disp0],%%eax \n\t" \
00466 "subb %[val],(%[sieb],%%eax)\n\t" \
00467 "xorl %%eax,%%eax \n\t" \
00468 "cmpl %[limit],%[disp0] \n\t" \
00469 "cmovb %[step],%%eax \n\t" \
00470 "addl %%eax,%[disp0] \n" \
00471 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "eax");
00472
00473
00474 #define asm_sieb_small(lp, d0, d1, P) \
00475 asm volatile ( \
00476 "cmpl %[disp1],%[disp0]\n\t" \
00477 "cmova %[disp0],%%eax # conditional swap part1\n\t" \
00478 "cmova %[disp1],%[disp0] # part2 \n\t" \
00479 "cmova %%eax,%[disp1] # part3 \n\t" \
00480 "movl %[limit],%%eax\n\t" \
00481 "jmp 0f \n\t" \
00482 ".balign 16 \n\t" \
00483 "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00484 "addl %[step],%[disp1]\n\t" \
00485 "subb %[val],(%[sieb],%[disp0])\n\t" \
00486 "addl %[step],%[disp0]\n\t" \
00487 "1: cmpl %%eax,%[disp1]\n\t" \
00488 "jb 0b\n\t" \
00489 "cmpl %%eax,%[disp0] \n\t" \
00490 "cmovb %[disp0],%%eax \n\t" \
00491 "subb %[val],(%[sieb],%%eax)\n\t" \
00492 "xorl %%eax,%%eax \n\t" \
00493 "cmpl %[limit],%[disp0] \n\t" \
00494 "cmovb %[step],%%eax \n\t" \
00495 "addl %%eax,%[disp0] \n" \
00496 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "eax");
00497 #else
00498 #define asm_sieb(lp, d0, d1, P) \
00499 asm volatile ( \
00500 "cmp %[disp1],%[disp0]\n\t" \
00501 "jbe 1f \n\t" \
00502 "xchg %[disp0],%[disp1]\n\t" \
00503 "jmp 1f \n\t" \
00504 "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00505 "add %[step],%[disp1]\n\t" \
00506 "subb %[val],(%[sieb],%[disp0])\n\t" \
00507 "add %[step],%[disp0]\n\t" \
00508 "1: cmp %[limit],%[disp1]\n\t" \
00509 "jb 0b\n\t" \
00510 "cmp %[limit],%[disp0] \n\t" \
00511 "jae 2f \n\t \n\t" \
00512 "subb %[val],(%[sieb],%[disp0])\n\t" \
00513 "add %[step],%[disp0] \n\t" \
00514 "2: #done \n" \
00515 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00516
00517 #define asm_sieb_small(lp, d0, d1, P) \
00518 asm volatile ( \
00519 "cmp %[disp1],%[disp0]\n\t" \
00520 "jbe 0f \n\t" \
00521 "xchg %[disp0],%[disp1]\n\t" \
00522 "0: subb %[val],(%[sieb],%[disp1])\n\t" \
00523 "add %[step],%[disp1]\n\t" \
00524 "subb %[val],(%[sieb],%[disp0])\n\t" \
00525 "add %[step],%[disp0]\n\t" \
00526 "1: cmp %[limit],%[disp1]\n\t" \
00527 "jb 0b\n\t" \
00528 "cmp %[limit],%[disp0] \n\t" \
00529 "jae 2f \n\t \n\t" \
00530 "subb %[val],(%[sieb],%[disp0])\n\t" \
00531 "add %[step],%[disp0] \n\t" \
00532 "2: #done \n" \
00533 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00534 #endif
00535 #elif TSIEVEELEMENTSIZE == 2
00536 #define asm_sieb_init(logValue) \
00537 asm volatile ( \
00538 "pushw %%ax\n\t" \
00539 "shll $16,%%eax\n\t" \
00540 "popw %%ax\n\t" \
00541 "shrl $1,%%ecx\n\t" \
00542 "cld\n\t" \
00543 "rep\n\t" \
00544 "stosl" \
00545 : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "cc", "memory");
00546 #define asm_search_sieb(offset) \
00547 asm volatile ( \
00548 "0: # calibrate alignment \n\t" \
00549 "testl $1,%[pos] \n\t" \
00550 "jz 1f # is aligned \n\t" \
00551 "decl %[pos] \n\t" \
00552 "testw $0x8000,(%[sieb],%[pos],2) \n\t" \
00553 "js 2f # hit detected \n\t" \
00554 "1: # Treffer suchen \n\t" \
00555 "subl $2,%[pos] \n\t" \
00556 "testl $0x80008000,(%[sieb],%[pos],2) \n\t" \
00557 "jz 1b \n\t" \
00558 "incl %[pos] \n\t" \
00559 "testw $0x8000,(%[sieb],%[pos],2) \n\t" \
00560 "js 2f \n\t" \
00561 "decl %[pos] \n\t" \
00562 "2: # hit detected" \
00563 : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
00564 #define asm_sieb(lp, d0, d1, P) \
00565 asm volatile ( \
00566 "cmpl %[disp1],%[disp0]\n\t" \
00567 "jbe 1f \n\t" \
00568 "xchgl %[disp0],%[disp1]\n\t" \
00569 "jmp 1f \n\t" \
00570 "0: subw %[val],(%[sieb],%[disp1],2)\n\t" \
00571 "subw %[val],(%[sieb],%[disp0],2)\n\t" \
00572 "addl %[step],%[disp1]\n\t" \
00573 "addl %[step],%[disp0]\n\t" \
00574 "1: cmpl %[limit],%[disp1]\n\t" \
00575 "jb 0b\n\t" \
00576 "cmpl %[limit],%[disp0] \n\t" \
00577 "jae 2f \n\t \n\t" \
00578 "subw %[val],(%[sieb],%[disp0],2)\n\t" \
00579 "addl %[step],%[disp0] \n\t" \
00580 "2: \n" \
00581 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00582 #elif TSIEVEELEMENTSIZE == 4
00583 #define asm_sieb_init(logValue) \
00584 asm volatile ( \
00585 "cld\n\t" \
00586 "rep\n\t" \
00587 "stosl" \
00588 : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "memory");
00589 #define asm_search_sieb(offset) \
00590 asm volatile ( \
00591 "1: # start search \n\t" \
00592 "decl %[pos] \n\t" \
00593 "testl $0x80008000,(%[sieb],%[pos],4) \n\t" \
00594 "jz 1b \n\t" \
00595 "2: # hit detected" \
00596 : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
00597 #define asm_sieb(lp, d0, d1, P) \
00598 asm volatile ( \
00599 "cmpl %[disp1],%[disp0]\n\t" \
00600 "jbe 1f \n\t" \
00601 "xchgl %[disp0],%[disp1]\n\t" \
00602 "jmp 1f \n\t" \
00603 "0: subl %[val],(%[sieb],%[disp1],4)\n\t" \
00604 "subl %[val],(%[sieb],%[disp0],4)\n\t" \
00605 "addl %[step],%[disp1]\n\t" \
00606 "addl %[step],%[disp0]\n\t" \
00607 "1: cmpl %[limit],%[disp1]\n\t" \
00608 "jb 0b\n\t" \
00609 "cmpl %[limit],%[disp0] \n\t" \
00610 "jae 2f \n\t \n\t" \
00611 "subl %[val],(%[sieb],%[disp0],4)\n\t" \
00612 "addl %[step],%[disp0] \n\t" \
00613 "2: \n" \
00614 : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
00615 #else
00616 #undef SIEBASM_386
00617 #warning "No optimized assemblercode available for sieving with TSieveElement!"
00618 #endif
00619 #if !defined(asm_sieb_small)
00620 #warning "no optimized asm_sieb_small, therefore using asm_sieb"
00621 #define asm_sieb_small asm_sieb
00622 #endif
00623 #endif
00624
00625
00626 void SieveControl::RecalibrateThresholds()
00627 {
00628 #ifdef VERBOSE_NOTICE
00629 cout << "recalibrating MPQS thresholds" << endl;
00630 #endif
00631
00632 const double FB_Threshold = log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime());
00633 TSieveElement Min=std::numeric_limits<TSieveElement>::max(), Max=std::numeric_limits<TSieveElement>::min();
00634 unsigned int I=0;
00635 for (signed int mySieveOffset=-LogicalSieveSize; mySieveOffset<LogicalSieveSize; mySieveOffset+=PhysicalSieveSize)
00636 {
00637 double wMin = log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset);
00638 double wMax = wMin;
00639 double dAvg = 0.0; unsigned int samplecount = 0;
00640 for (int i=0; i<PhysicalSieveSize; i+=8)
00641 {
00642 const double w = log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset+i);
00643 if (w<wMin) wMin=w;
00644 if (w>wMax) wMax=w;
00645 dAvg+=w; ++samplecount;
00646 }
00647 dAvg /= samplecount;
00648
00649
00650
00651 register double d = dAvg;
00652
00653 d-=FB_Threshold;
00654 if (StaticFactorbaseSettings::StaticPrime(1)==2) d-=log_SieveEntryMultiplier*log(2);
00655 d-=GetRawLogThreshold();
00656 if ( d>std::numeric_limits<TSieveElement>::max() || d<std::numeric_limits<TSieveElement>::min() )
00657 {
00658
00659 MARK; cerr << "numeric_limits<TSieveElement> exceeded. Out of range." << endl;
00660 exit(1);
00661 }
00662 register const TSieveElement w = static_cast<TSieveElement>(d);
00663 PhysicalSieveIntervalThresholdCorrector[I++]=w;
00664
00665
00666 #if TSIEVEELEMENTSIZE == 1
00667 Min=MIN(Min,w); Max=MAX(Max,w);
00668 #endif
00669 }
00670
00671
00672 BestPSI[0]=0; BestPSI[1]=I-1;
00673 for (unsigned int i=1; i<I-i; ++i)
00674 {
00675
00676 if (PhysicalSieveIntervalThresholdCorrector[i]<PhysicalSieveIntervalThresholdCorrector[BestPSI[0]]) BestPSI[0]=i;
00677 if (PhysicalSieveIntervalThresholdCorrector[I-1-i]<PhysicalSieveIntervalThresholdCorrector[BestPSI[1]]) BestPSI[1]=I-1-i;
00678 }
00679 cout << "Best two physical sieve intervals are " << BestPSI[0] << " and " << BestPSI[1] << endl;
00680
00681 #if defined(VERBOSE_INFO)
00682 #if TSIEVEELEMENTSIZE == 1
00683
00684 cout << "MPQS polynomial thresholds over physical intervals:" << endl;
00685 cout << "+"; for (unsigned int j=0; j<I; ++j) cout << "-"; cout << "+" << endl;
00686 for (signed int i=Max; i>=Min; --i)
00687 {
00688 if (i) cout << "|"; else cout << "+";
00689 const char ch = i? ' ' : '-';
00690 for (unsigned int j=0; j<I; ++j)
00691 if (PhysicalSieveIntervalThresholdCorrector[j]==i) cout << "*"; else cout << ch;
00692 cout << "|" << endl;
00693 }
00694 cout << "+"; for (unsigned int j=0; j<I; ++j) cout << "-"; cout << "+" << endl;
00695 #endif
00696 #endif
00697 }
00698
00699 void initialize_Sieve()
00700 {
00701
00702
00703
00704
00705
00706
00707 SieveControl::RequestMoreDirtiness();
00708
00709 #if 1
00710
00711 register TSieveElement w = SieveControl::GetLogThreshold();
00712 #else
00713
00714
00715
00716
00717 const TSieveElement FB_Threshold
00718 = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
00719 register TSieveElement w = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(SieveOffset));
00720
00721 w-=FB_Threshold;
00722 if (StaticFactorbaseSettings::StaticPrime(1)==2) w-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
00723 w-=SieveControl::GetLogDirtyCorrection();
00724 #endif
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737 #ifdef SIEBASM_386
00738 asm_sieb_init(w);
00739 #else
00740 register int i=PhysicalSieveSize; do SieveArray[--i]=w; while (i);
00741 #endif
00742
00743
00744 #if 0
00745 {
00746 TSieveElement w_test = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(SieveOffset));
00747 TSieveElement FB_Threshold
00748 = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
00749 w_test-=FB_Threshold;
00750 if (StaticFactorbaseSettings::StaticPrime(1)==2) w_test-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
00751 w_test-=SieveControl::GetLogDirtyCorrection();
00752
00753 static unsigned long int overall_intervals = 0;
00754 static unsigned long int differing_intervals = 0;
00755
00756 ++overall_intervals;
00757 if (w_test!=w)
00758 {
00759 ++differing_intervals;
00760 cout << "Thresholds bad/all: " << differing_intervals << "/" << overall_intervals << " ";
00761 cout << "Offset: " << SieveOffset;
00762 #if TSIEVEELEMENTSIZE == 1
00763
00764 cout << " -> logval: " << static_cast<signed int>(w_test)
00765 << " <-> threshold: " << static_cast<signed int>(w) << endl;
00766 #else
00767 cout << " -> logval: " << w_test << " <-> threshold: " << w << endl;
00768 #endif
00769 }
00770 }
00771 #endif
00772 }
00773
00774
00775 void SieveControl::create_sieve_image(int* &Image, int mySieveOffset)
00776 {
00777 #ifdef VERBOSE
00778 cout << "creating MPQS physical sieve image for SieveOffset " << SieveOffset << endl;
00779 #endif
00780 if (Image==NULL) Image = new int[PhysicalSieveSize];
00781 const TSieveElement FB_Threshold
00782 = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
00783 for (int i=0; i<PhysicalSieveSize; ++i)
00784 {
00785 register TSieveElement w = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset+i));
00786 w-=FB_Threshold;
00787 if (StaticFactorbaseSettings::StaticPrime(1)==2) w-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
00788 w-=GetRawLogThreshold();
00789 Image[i]=w;
00790 }
00791 }
00792
00793
00794 void initialize_Sieve(const int Image[])
00795 {
00796
00797
00798
00799
00800
00801
00802 SieveControl::RequestMoreDirtiness();
00803
00804 #if (defined(ASM_MMX) || defined(ASM_X86_64)) && (TSIEVEELEMENTSIZE == 1)
00805 #ifdef DEBUG
00806 #warning "Initialize Sieve: MMX Sieve Image copy activated."
00807 #endif
00808
00809
00810 asm volatile(\
00811 "movd %%eax,%%mm0 # -> (x,x,x,x,x,x,x,a) \n\t" \
00812 "punpcklbw %%mm0,%%mm0 # (x,x,x,x,x,x,a,a) \n\t" \
00813 "movl %[count],%%eax \n\t" \
00814 "punpcklbw %%mm0,%%mm0 # (x,x,x,x,a,a,a,a) \n\t" \
00815 "punpcklbw %%mm0,%%mm0 # (a,a,a,a,a,a,a,a) \n\t" \
00816 ".balign 16 \n\t" \
00817 "1: \n\t" \
00818 "#prefetchnta 200(%[Image]) # this is an extension, not pure MMX \n\t" \
00819 "movq (%[Image]),%%mm1 \n\t" \
00820 "movq 8(%[Image]),%%mm2 \n\t" \
00821 "movq 16(%[Image]),%%mm3 \n\t" \
00822 "movq 24(%[Image]),%%mm4 \n\t" \
00823 "paddb %%mm0,%%mm1 \n\t" \
00824 "paddb %%mm0,%%mm2 \n\t" \
00825 "paddb %%mm0,%%mm3 \n\t" \
00826 "paddb %%mm0,%%mm4 \n\t" \
00827 "movq %%mm1,(%[Sieve]) \n\t" \
00828 "movq %%mm2,8(%[Sieve]) \n\t" \
00829 "movq %%mm3,16(%[Sieve]) \n\t" \
00830 "movq %%mm4,24(%[Sieve]) \n\t" \
00831 "movq 32(%[Image]),%%mm1 \n\t" \
00832 "movq 40(%[Image]),%%mm2 \n\t" \
00833 "movq 48(%[Image]),%%mm3 \n\t" \
00834 "movq 56(%[Image]),%%mm4 \n\t" \
00835 "paddb %%mm0,%%mm1 \n\t" \
00836 "paddb %%mm0,%%mm2 \n\t" \
00837 "paddb %%mm0,%%mm3 \n\t" \
00838 "paddb %%mm0,%%mm4 \n\t" \
00839 "movq %%mm1,32(%[Sieve]) \n\t" \
00840 "movq %%mm2,40(%[Sieve]) \n\t" \
00841 "movq %%mm3,48(%[Sieve]) \n\t" \
00842 "movq %%mm4,56(%[Sieve]) \n\t" \
00843 "add $64,%[Image] \n\t" \
00844 "add $64,%[Sieve] \n\t" \
00845 "sub $64,%%eax \n\t" \
00846 "jnz 1b \n\t" \
00847 "emms \n" \
00848 : "=D" (clobbered_int), "=S" (clobbered_int), "=a" (clobbered_int)
00849 : [threshold] "a" (SieveControl::log_Threshold), [Image] "S" (Image),
00850 [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize)
00851 : "cc", "memory", "mm0", "mm1", "mm2", "mm3", "mm4");
00852 #else
00853
00854 register TSieveElement w = SieveControl::log_Threshold;
00855 register int i=PhysicalSieveSize-1;
00856 do
00857 {
00858 register TSieveElement x=w+Image[i];
00859 SieveArray[i]=x;
00860 } while (--i>=0);
00861 #endif
00862 }
00863
00864
00865 int CDeltaComputations::LocalPhysInterval_Delta_of_PrimeNumbers[StaticFactorbase::MaxSize][2] __attribute__ ((aligned (16)));
00866 int CDeltaComputations::LocalPhysInterval_Delta_of_PrimePowers[StaticFactorbase::max_additional_Powers][2] __attribute__ ((aligned (16)));
00867
00868
00869 void CSieveStaticFactors::init_LocalDeltas()
00870 {
00871 SieveControl::RenewThresholds();
00872
00873 for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr)
00874 {
00875 register int Primzahl=PrimeNumbers[nr];
00876 register int d0,d1;
00877 {
00878 register int h=normalized_signed_mod(SieveOffset,Primzahl,PrimeNumberReciprocals[nr]);
00879 d0=Delta_of_PrimeNumbers[nr][0]-h;
00880 d1=Delta_of_PrimeNumbers[nr][1]-h;
00881 if (d1<0)
00882 d1+=Primzahl;
00883 if (d0<0)
00884 d0+=Primzahl;
00885 }
00886 LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
00887 LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
00888 }
00889
00890
00891 for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
00892 {
00893 register int PPotenz = PrimePowers[nr];
00894 register int d0,d1;
00895 {
00896 register int h=normalized_signed_mod(SieveOffset,PPotenz,PrimePowerReciprocals[nr]);
00897 d0=Delta_of_PrimePowers[nr][0]-h;
00898 d1=Delta_of_PrimePowers[nr][1]-h;
00899 if (d1<0)
00900 d1+=PPotenz;
00901 if (d0<0)
00902 d0+=PPotenz;
00903 }
00904 LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
00905 LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
00906 }
00907 }
00908
00909
00910 void CSieveStaticFactors::create_Tables_for_init_LocalDeltas(int* &Table1, int* &Table2, const int SieveOffset)
00911 {
00912 if (Table1==NULL) Table1 = new int[StaticFactorbase::Size()];
00913 if (Table2==NULL) Table2 = new int[StaticFactorbase::Size()];
00914 for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr)
00915 Table1[nr]=normalized_signed_mod(SieveOffset,PrimeNumbers[nr],PrimeNumberReciprocals[nr]);
00916
00917
00918 for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
00919 Table2[nr]=normalized_signed_mod(SieveOffset,PrimePowers[nr],PrimePowerReciprocals[nr]);
00920 }
00921
00922
00923 void CSieveStaticFactors::init_LocalDeltas(int const Table1[], int const Table2[])
00924 {
00925 SieveControl::RenewThresholds();
00926
00927 for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr)
00928 {
00929 register int Primzahl=PrimeNumbers[nr];
00930 register int d0,d1;
00931 {
00932 register int h=Table1[nr];
00933 d0=Delta_of_PrimeNumbers[nr][0]-h;
00934 d1=Delta_of_PrimeNumbers[nr][1]-h;
00935 if (d1<0)
00936 d1+=Primzahl;
00937 if (d0<0)
00938 d0+=Primzahl;
00939 }
00940 LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
00941 LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
00942 }
00943
00944
00945 for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
00946 {
00947 register int PPotenz = PrimePowers[nr];
00948 register int d0,d1;
00949 {
00950 register int h=Table2[nr];
00951 d0=Delta_of_PrimePowers[nr][0]-h;
00952 d1=Delta_of_PrimePowers[nr][1]-h;
00953 if (d1<0)
00954 d1+=PPotenz;
00955 if (d0<0)
00956 d0+=PPotenz;
00957 }
00958 LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
00959 LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
00960 }
00961 }
00962
00963
00964 void CSieveStaticFactors::do_sieving_StaticFactors()
00965 {
00966
00967
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979
00980
00981
00982
00983
00984 register long int nr = StaticFactorbase::Size()-1;
00985
00986 #if 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_X86_64)
00987
00988
00989
00990
00991
00992 #ifdef DEBUG
00993 #warning "experimental (cmov) static factor sieving code X86_64"
00994 #endif
00995 asm volatile ( \
00996 "movb (%[SieveWeight],%[nr]),%%al \n\t" \
00997 "movb $1,%%ah # position of next run length \n\t" \
00998 "bswap %%eax # eax holds 3 different values\n\t" \
00999 "movw (%[LogValRuns]),%%ax # first run-length \n\t" \
01000 "movl (%[PrimeNumbers],%[nr],4),%%edi \n\t" \
01001 "incw %%ax \n\t" \
01002 "jmp 2f \n\t" \
01003 ".balign 16 \n\t" \
01004 "1: movl (%[Deltas],%[nr],8),%%esi\n\t" \
01005 "prefetchw -256(%[Deltas],%[nr],8) # tuning cache!! \n\t" \
01006 "prefetch -128(%[PrimeDiffs],%[nr],2)# tuning cache!! \n\t" \
01007 "bswap %%eax \n\t" \
01008 "movl %%esi,%%edx\n\t" \
01009 "subl %%ecx,%%esi\n\t" \
01010 "cmovnsl %%ecx,%%edx\n\t" \
01011 "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01012 "movsx %%edx,%%rdx \n\t" \
01013 "cmovsl %%edi,%%ecx\n\t" \
01014 "subb %%al,(%[feld],%%rdx)\n\t" \
01015 "addl %%ecx,%%esi\n\t" \
01016 "movl %[PhSiSi],%%ecx\n\t" \
01017 "movl %%esi,(%[Deltas],%[nr],8)\n\t" \
01018 "movl 4(%[Deltas],%[nr],8),%%esi \n\t" \
01019 "movl %%esi,%%edx\n\t" \
01020 "subl %%ecx,%%esi\n\t" \
01021 "cmovnsl %%ecx,%%edx\n\t" \
01022 "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01023 "movsx %%edx,%%rdx \n\t" \
01024 "cmovsl %%edi,%%ecx\n\t" \
01025 "subb %%al,(%[feld],%%rdx)\n\t" \
01026 "addl %%ecx,%%esi\n\t" \
01027 "movl %%esi,4(%[Deltas],%[nr],8)\n\t" \
01028 "bswap %%eax \n\t" \
01029 "movzwl (%[PrimeDiffs],%[nr],2),%%edx\n\t" \
01030 "dec %[nr] \n\t" \
01031 "subl %%edx,%%edi \n\t" \
01032 "2: movl %[PhSiSi],%%ecx\n\t" \
01033 "decw %%ax \n\t " \
01034 "jnz 1b \n\t" \
01035 "bswap %%eax \n\t" \
01036 "movzx %%ah,%%edx \n\t" \
01037 "dec %%al # LogVal for run \n\t" \
01038 "inc %%ah # next run \n\t" \
01039 "movsx %%edx,%%rdx \n\t" \
01040 "bswap %%eax \n\t" \
01041 "movw (%[LogValRuns],%%rdx,2),%%ax # run length \n\t" \
01042 "test %%ax,%%ax \n\t" \
01043 "jnz 1b \n" \
01044 : [nr] "+r" (nr)
01045 : [SieveWeight] "r" (&SieveControl::log_PrimeNumbers[0]),
01046 [PrimeNumbers] "r" (&PrimeNumbers[0]),
01047 [PrimeDiffs] "r" (&PrimeNumberDiffs[0]),
01048 [Deltas] "r" (&LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01049 [feld] "r" (&SieveArray[0]),
01050 [LogValRuns] "r" (&LogValRuns[0]),
01051 [PhSiSi] "i" (PhysicalSieveSize)
01052 : "cc", "memory", "rax", "rcx", "rdx", "rsi", "rdi");
01053
01054 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_ATHLON)
01055
01056
01057
01058
01059
01060
01061
01062
01063
01064
01065
01066
01067
01068
01069
01070 #ifdef DEBUG
01071 #warning "experimental (cmov) static factor sieving code Athlon (and Athlon XP/Athlon-64 (32bit))"
01072 #endif
01073 asm volatile ( \
01074 "movb %[SieveWeight](%[nr]),%%al \n\t" \
01075 "movb $1,%%ah # position of next run length \n\t" \
01076 "bswap %%eax # eax holds 3 different values\n\t" \
01077 "movw %[LogValRuns],%%ax # first run-length \n\t" \
01078 "movl %[PrimeNumbers](,%[nr],4),%%edi \n\t" \
01079 "incw %%ax \n\t" \
01080 "jmp 2f \n\t" \
01081 ".balign 16 \n\t" \
01082 "1: movl %[Deltas](,%[nr],8),%%esi\n\t" \
01083 "prefetchw %[Deltas]-128(,%[nr],8) # tuning cache!! \n\t" \
01084 "prefetch %[PrimeDiffs]-64(,%[nr],2)# tuning cache!! \n\t" \
01085 "bswap %%eax \n\t" \
01086 "movl %%esi,%%edx\n\t" \
01087 "subl %%ecx,%%esi\n\t" \
01088 "cmovnsl %%ecx,%%edx\n\t" \
01089 "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01090 "cmovsl %%edi,%%ecx\n\t" \
01091 "subb %%al,%[feld](%%edx)\n\t" \
01092 "addl %%ecx,%%esi\n\t" \
01093 "movl %[PhSiSi],%%ecx\n\t" \
01094 "movl %%esi,%[Deltas](,%[nr],8)\n\t" \
01095 "movl %[Deltas]+4(,%[nr],8),%%esi \n\t" \
01096 "movl %%esi,%%edx\n\t" \
01097 "subl %%ecx,%%esi\n\t" \
01098 "cmovnsl %%ecx,%%edx\n\t" \
01099 "movzx %%cl,%%ecx # assuming cl=0\n\t" \
01100 "cmovsl %%edi,%%ecx\n\t" \
01101 "subb %%al,%[feld](%%edx)\n\t" \
01102 "addl %%ecx,%%esi\n\t" \
01103 "movl %%esi,%[Deltas]+4(,%[nr],8)\n\t" \
01104 "bswap %%eax \n\t" \
01105 "movzwl %[PrimeDiffs](,%[nr],2),%%edx\n\t" \
01106 "dec %[nr] \n\t" \
01107 "subl %%edx,%%edi \n\t" \
01108 "2: movl %[PhSiSi],%%ecx\n\t" \
01109 "decw %%ax \n\t " \
01110 "jnz 1b \n\t" \
01111 "bswap %%eax \n\t" \
01112 "movzx %%ah,%%edx \n\t" \
01113 "dec %%al # LogVal for run \n\t" \
01114 "inc %%ah # next run \n\t" \
01115 "bswap %%eax \n\t" \
01116 "movw %[LogValRuns](,%%edx,2),%%ax # run length \n\t" \
01117 "test %%ax,%%ax \n\t" \
01118 "jnz 1b \n" \
01119 : [nr] "+r" (nr)
01120 : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01121 [PrimeNumbers] "o" (PrimeNumbers[0]),
01122 [PrimeDiffs] "o" (PrimeNumberDiffs[0]),
01123 [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01124 [feld] "o" (SieveArray[0]),
01125 [LogValRuns] "o" (LogValRuns[0]),
01126 [PhSiSi] "i" (PhysicalSieveSize)
01127 : "cc", "memory", "eax", "ecx", "edx", "esi", "edi");
01128
01129 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && (defined(ASM_3DNOW) && !defined(ASM_ATHLON))
01130
01131
01132 #ifdef DEBUG
01133 #warning "experimental static factor sieving code (3DNow!) for K6 3D"
01134 #endif
01135 asm volatile ( \
01136 "movl %[PhSiSi],%%edx \n\t" \
01137 "movd %%edx,%%mm7 \n\t" \
01138 "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
01139 "jmp 2f \n\t" \
01140 ".balign 32,,8 \n\t" \
01141 "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
01142 "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01143 "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
01144 "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01145 "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
01146 "psubd %%mm7,%%mm0 \n\t" \
01147 "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
01148 "sub $2,%[nr] \n\t" \
01149 "movq %%mm2,%%mm3 \n\t" \
01150 "psubd %%mm7,%%mm1 \n\t" \
01151 "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
01152 "pcmpgtd %%mm0,%%mm4 \n\t" \
01153 "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
01154 "pcmpgtd %%mm1,%%mm5 \n\t" \
01155 "pand %%mm4,%%mm2 \n\t" \
01156 "pand %%mm5,%%mm3 \n\t" \
01157 "pand %%mm0,%%mm4 \n\t" \
01158 "pand %%mm1,%%mm5 \n\t" \
01159 "prefetchw %[Deltas]-128(,%[nr],8) # tuning cache!! \n\t" \
01160 "prefetch %[PrimeNumbers]-64(,%[nr],4)# tuning cache!! \n\t" \
01161 "movd %%mm4,%%esi \n\t" \
01162 "movd %%mm5,%%edi \n\t" \
01163 "paddd %%mm2,%%mm0 \n\t" \
01164 "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
01165 "subb %%al,%[feldEnd](%%esi)\n\t" \
01166 "paddd %%mm3,%%mm1 \n\t" \
01167 "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01168 "subb %%ah,%[feldEnd](%%edi)\n\t" \
01169 "movd %%mm4,%%esi \n\t" \
01170 "movd %%mm5,%%edi \n\t" \
01171 "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
01172 "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
01173 "subb %%al,%[feldEnd](%%esi)\n\t" \
01174 "subb %%ah,%[feldEnd](%%edi)\n\t" \
01175 "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
01176 "jge 1b \n\t" \
01177 "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
01178 "jl 3f \n\t" \
01179 "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01180 "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01181 "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
01182 "movb %[SieveWeight](%[nr]),%%al\n\t" \
01183 "psubd %%mm7,%%mm1 \n\t" \
01184 "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
01185 "pcmpgtd %%mm1,%%mm5 \n\t" \
01186 "pand %%mm5,%%mm3 \n\t" \
01187 "pand %%mm1,%%mm5 \n\t" \
01188 "movd %%mm5,%%esi \n\t" \
01189 "paddd %%mm3,%%mm1 \n\t" \
01190 "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01191 "movd %%mm5,%%edi \n\t" \
01192 "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
01193 "subb %%al,%[feldEnd](%%esi)\n\t" \
01194 "subb %%al,%[feldEnd](%%edi)\n\t" \
01195 "sub $1,%[nr] \n\t" \
01196 "3: femms \n" \
01197 : [nr] "+r" (nr)
01198 : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01199 [PrimeNumbers] "o" (PrimeNumbers[0]),
01200 [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01201 [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
01202 [PhSiSi] "i" (PhysicalSieveSize)
01203 : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
01204
01205 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && (defined(ASM_ATHLON) || defined(ASM_SSE))
01206
01207 #ifdef DEBUG
01208 #warning "experimental static factor sieving code for MMX (plus prefetching MMX-ext)"
01209 #endif
01210 asm volatile ( \
01211 "movl %[PhSiSi],%%edx \n\t" \
01212 "movd %%edx,%%mm7 \n\t" \
01213 "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
01214 "jmp 2f \n\t" \
01215 ".balign 16 \n\t" \
01216 "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
01217 "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01218 "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
01219 "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01220 "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
01221 "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
01222 "sub $2,%[nr] \n\t" \
01223 "movq %%mm2,%%mm3 \n\t" \
01224 "psubd %%mm7,%%mm0 \n\t" \
01225 "psubd %%mm7,%%mm1 \n\t" \
01226 "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
01227 "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
01228 "pcmpgtd %%mm0,%%mm4 \n\t" \
01229 "pcmpgtd %%mm1,%%mm5 \n\t" \
01230 "pand %%mm4,%%mm2 \n\t" \
01231 "pand %%mm5,%%mm3 \n\t" \
01232 "pand %%mm0,%%mm4 \n\t" \
01233 "pand %%mm1,%%mm5 \n\t" \
01234 "prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!! \n\t" \
01235 "prefetcht2 %[PrimeNumbers]-128(,%[nr],4)# tuning cache!! \n\t" \
01236 "movd %%mm4,%%esi \n\t" \
01237 "movd %%mm5,%%edi \n\t" \
01238 "paddd %%mm2,%%mm0 \n\t" \
01239 "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
01240 "paddd %%mm3,%%mm1 \n\t" \
01241 "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01242 "subb %%al,%[feldEnd](%%esi)\n\t" \
01243 "subb %%ah,%[feldEnd](%%edi)\n\t" \
01244 "movd %%mm4,%%esi \n\t" \
01245 "movd %%mm5,%%edi \n\t" \
01246 "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
01247 "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
01248 "subb %%al,%[feldEnd](%%esi)\n\t" \
01249 "subb %%ah,%[feldEnd](%%edi)\n\t" \
01250 "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
01251 "jge 1b \n\t" \
01252 "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
01253 "jl 3f \n\t" \
01254 "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01255 "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01256 "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
01257 "movb %[SieveWeight](%[nr]),%%al\n\t" \
01258 "psubd %%mm7,%%mm1 \n\t" \
01259 "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
01260 "pcmpgtd %%mm1,%%mm5 \n\t" \
01261 "pand %%mm5,%%mm3 \n\t" \
01262 "pand %%mm1,%%mm5 \n\t" \
01263 "movd %%mm5,%%esi \n\t" \
01264 "paddd %%mm3,%%mm1 \n\t" \
01265 "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01266 "movd %%mm5,%%edi \n\t" \
01267 "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
01268 "subb %%al,%[feldEnd](%%esi)\n\t" \
01269 "subb %%al,%[feldEnd](%%edi)\n\t" \
01270 "sub $1,%[nr] \n\t" \
01271 "3: emms \n" \
01272 : [nr] "+r" (nr)
01273 : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01274 [PrimeNumbers] "o" (PrimeNumbers[0]),
01275 [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01276 [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
01277 [PhSiSi] "i" (PhysicalSieveSize)
01278 : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
01279
01280 #elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_MMX)
01281
01282 #ifdef DEBUG
01283 #warning "experimental static factor sieving code for MMX compatible machines"
01284 #endif
01285 asm volatile ( \
01286 "movl %[PhSiSi],%%edx \n\t" \
01287 "movd %%edx,%%mm7 \n\t" \
01288 "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
01289 "jmp 2f \n\t" \
01290 "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
01291 "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01292 "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
01293 "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01294 "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
01295 "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
01296 "sub $2,%[nr] \n\t" \
01297 "movq %%mm2,%%mm3 \n\t" \
01298 "psubd %%mm7,%%mm0 \n\t" \
01299 "psubd %%mm7,%%mm1 \n\t" \
01300 "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
01301 "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
01302 "pcmpgtd %%mm0,%%mm4 \n\t" \
01303 "pcmpgtd %%mm1,%%mm5 \n\t" \
01304 "pand %%mm4,%%mm2 \n\t" \
01305 "pand %%mm5,%%mm3 \n\t" \
01306 "pand %%mm0,%%mm4 \n\t" \
01307 "pand %%mm1,%%mm5 \n\t" \
01308 "#prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!! \n\t" \
01309 "#prefetcht2 %[PrimeNumbers]-128(,%[nr],4)# tuning cache!! \n\t" \
01310 "movd %%mm4,%%esi \n\t" \
01311 "movd %%mm5,%%edi \n\t" \
01312 "paddd %%mm2,%%mm0 \n\t" \
01313 "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
01314 "paddd %%mm3,%%mm1 \n\t" \
01315 "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01316 "subb %%al,%[feldEnd](%%esi)\n\t" \
01317 "subb %%ah,%[feldEnd](%%edi)\n\t" \
01318 "movd %%mm4,%%esi \n\t" \
01319 "movd %%mm5,%%edi \n\t" \
01320 "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
01321 "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
01322 "subb %%al,%[feldEnd](%%esi)\n\t" \
01323 "subb %%ah,%[feldEnd](%%edi)\n\t" \
01324 "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
01325 "jge 1b \n\t" \
01326 "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
01327 "jl 3f \n\t" \
01328 "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
01329 "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
01330 "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
01331 "movb %[SieveWeight](%[nr]),%%al\n\t" \
01332 "psubd %%mm7,%%mm1 \n\t" \
01333 "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
01334 "pcmpgtd %%mm1,%%mm5 \n\t" \
01335 "pand %%mm5,%%mm3 \n\t" \
01336 "pand %%mm1,%%mm5 \n\t" \
01337 "movd %%mm5,%%esi \n\t" \
01338 "paddd %%mm3,%%mm1 \n\t" \
01339 "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
01340 "movd %%mm5,%%edi \n\t" \
01341 "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
01342 "subb %%al,%[feldEnd](%%esi)\n\t" \
01343 "subb %%al,%[feldEnd](%%edi)\n\t" \
01344 "sub $1,%[nr] \n\t" \
01345 "3: emms \n" \
01346 : [nr] "+r" (nr)
01347 : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
01348 [PrimeNumbers] "o" (PrimeNumbers[0]),
01349 [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01350 [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
01351 [PhSiSi] "i" (PhysicalSieveSize)
01352 : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
01353
01354 #else
01355
01356
01357 for (;; --nr)
01358 {
01359 register int Primzahl=PrimeNumbers[nr];
01360 if (Primzahl<PhysicalSieveSize) break;
01361
01362 register TSieveElement log_Primzahl = SieveControl::PrimzahlSieveWeight(nr);
01363
01364
01365
01366 register int d0,d1;
01367
01368 d0=LocalPhysInterval_Delta_of_PrimeNumbers[nr][0];
01369 d1=LocalPhysInterval_Delta_of_PrimeNumbers[nr][1];
01370
01371
01372
01373 SieveArray[d0<PhysicalSieveSize ? d0 : PhysicalSieveSize]-=log_Primzahl;
01374 SieveArray[d1<PhysicalSieveSize ? d1 : PhysicalSieveSize]-=log_Primzahl;
01375
01376 d0-=PhysicalSieveSize; if (d0<0) d0+=Primzahl;
01377 d1-=PhysicalSieveSize; if (d1<0) d1+=Primzahl;
01378
01379 LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
01380 LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
01381 }
01382 #endif
01383
01384
01385 const int NonDirtyIndex = SieveControl::FBStartIndex();
01386 for (; nr>=NonDirtyIndex; --nr)
01387 {
01388 register long int Primzahl=PrimeNumbers[nr];
01389 register TSieveElement log_Primzahl = SieveControl::PrimzahlSieveWeight(nr);
01390
01391
01392
01393 register long int d0,d1;
01394
01395 d0=LocalPhysInterval_Delta_of_PrimeNumbers[nr][0];
01396 d1=LocalPhysInterval_Delta_of_PrimeNumbers[nr][1];
01397
01398 #ifdef SIEBASM_386
01399 asm_sieb_small(log_Primzahl, d0, d1, Primzahl);
01400 #else
01401 if (d0>d1) { register int h=d0; d0=d1; d1=h; }
01402
01403 while (d1<PhysicalSieveSize)
01404 {
01405 SieveArray[d0]-=log_Primzahl; d0+=Primzahl;
01406 SieveArray[d1]-=log_Primzahl; d1+=Primzahl;
01407 }
01408 if (d0<PhysicalSieveSize)
01409 {
01410 SieveArray[d0]-=log_Primzahl; d0+=Primzahl;
01411 }
01412 #endif
01413 d0-=PhysicalSieveSize; if (d0<0) d0+=Primzahl;
01414 d1-=PhysicalSieveSize; if (d1<0) d1+=Primzahl;
01415
01416 LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
01417 LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
01418 }
01419
01420
01421 for (nr=0; nr<NumberOf_more_PrimePowers; nr++)
01422 {
01423 register long int PPotenz = PrimePowers[nr];
01424 register TSieveElement log_Primzahl = log_Primzahl_of_PrimePowers[nr];
01425
01426
01427
01428
01429
01430
01431
01432 register long int d0,d1;
01433
01434 d0=LocalPhysInterval_Delta_of_PrimePowers[nr][0];
01435 d1=LocalPhysInterval_Delta_of_PrimePowers[nr][1];
01436
01437 #ifdef SIEBASM_386
01438 asm_sieb(log_Primzahl, d0, d1, PPotenz);
01439 #else
01440 if (d0>d1) { register int h=d0; d0=d1; d1=h; }
01441
01442 while (d1<PhysicalSieveSize)
01443 {
01444 SieveArray[d0]-=log_Primzahl; d0+=PPotenz;
01445 SieveArray[d1]-=log_Primzahl; d1+=PPotenz;
01446 }
01447 if (d0<PhysicalSieveSize)
01448 {
01449 SieveArray[d0]-=log_Primzahl; d0+=PPotenz;
01450 }
01451 #endif
01452 d0-=PhysicalSieveSize; if (d0<0) d0+=PPotenz;
01453 d1-=PhysicalSieveSize; if (d1<0) d1+=PPotenz;
01454
01455 LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
01456 LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
01457 }
01458
01459 }
01460
01461
01462 void do_scanning_Sieve(void)
01463 {
01464
01465
01466
01467
01468
01469 register long int d=PhysicalSieveSize;
01470 do
01471 {
01472 #ifdef SIEBASM_386
01473 asm_search_sieb(d);
01474
01475
01476 #else
01477 while (SieveArray[--d]>=0) { }
01478 #endif
01479 if (d>=0)
01480 {
01481
01482 if (SieveControl::Hit_after_DirtyCorrection(SieveOffset+d,SieveArray[d]))
01483 StaticRelations::insert(new CRelation(SieveOffset+d));
01484 }
01485 } while (d>0);
01486
01487 }
01488
01489
01490 #if defined(ASM_X86_64)
01491
01492 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01493 {
01494
01495
01496
01497
01498 #if 1 //def DEBUG
01499 #warning "X86_64/SSE2 specific code to detect static factors"
01500 #endif
01501 typedef int v4sf __attribute__ ((vector_size (16)));
01502 register v4sf XMM6 asm("xmm6");
01503 register v4sf XMM7 asm("xmm7");
01504
01505
01506
01507
01508
01509
01510
01511
01512
01513 asm volatile ( \
01514 "emms # enter mmx\n\t" \
01515 "movd %[Sieboffs],%%mm0 # mm0=(0,Sieboffs)\n\t" \
01516 "xorps %[_xmm6],%[_xmm6] # L2AM clear mm6 \n\t" \
01517 "punpckldq %%mm0,%%mm0 # L2AM (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
01518 "cvtpi2ps %%mm0,%[_xmm7] # packed int to float \n\t" \
01519 "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
01520 "cmpnleps %[_xmm7],%[_xmm6] # L2AM set bits if signed, clear bits if unsigned" \
01521 : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");
01522
01523
01524
01525 {
01526 register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
01527 if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
01528 d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
01529 if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
01530 }
01531
01532 long int nr;
01533 for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4)
01534 {
01535 register unsigned int pflags;
01536 asm ( \
01537 "movaps %[_xmm7],%%xmm2 \n\t" \
01538 "cvtdq2ps (%[divisors],%[i],4),%%xmm1 # convert 4 packed int to float \n\t" \
01539 "mulps (%[FloatRecips],%[i],4),%%xmm2 \n\t" \
01540 "movaps %%xmm1,%%xmm3 \n\t" \
01541 "cvttps2dq %%xmm2,%%xmm0 \n\t" \
01542 "movaps %[_xmm7],%%xmm2 \n\t" \
01543 "cvtdq2ps %%xmm0,%%xmm0 \n\t" \
01544 "mulps %%xmm1,%%xmm0 \n\t" \
01545 "subps %%xmm0,%%xmm2 \n\t" \
01546 "andps %[_xmm6],%%xmm1 # correct sign (part 1) \n\t" \
01547 "addps %%xmm1,%%xmm2 # correct sign (part 2) \n\t" \
01548 "cmpneqps %%xmm2,%%xmm3 # assure modp=0 instead of modp=p \n\t" \
01549 "andps %%xmm3,%%xmm2 \n\t" \
01550 "cvttps2dq %%xmm2,%%xmm3 \n\t" \
01551 "pshufd $0x50,%%xmm3,%%xmm0 \n\t" \
01552 "pshufd $0xfa,%%xmm3,%%xmm1 \n\t" \
01553 "pcmpeqd (%[Deltas],%[i],8),%%xmm0 \n\t" \
01554 "pcmpeqd 16(%[Deltas],%[i],8),%%xmm1 \n\t" \
01555 "packssdw %%xmm1,%%xmm0 \n\t" \
01556 "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01557 : [pflags] "=r" (pflags)
01558 : [divisors] "r" (&PrimeNumbers[0]), [Deltas] "r" (&Delta_of_PrimeNumbers[0][0]),
01559 [FloatRecips] "r" (&PrimeNumberFloatReciprocals[0]),
01560 [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
01561 : "xmm0", "xmm1", "xmm2", "xmm3");
01562
01563
01564
01565 #if defined(DEBUG)
01566 {
01567
01568 int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
01569 if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01570 if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01571 d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
01572 if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01573 if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01574 d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
01575 if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01576 if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01577 d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
01578 if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01579 if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01580 }
01581 #endif
01582 if ( pflags & 0x000f ) Factors.push_back(nr);
01583 if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01584 if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01585 if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01586
01587 }
01588
01589 const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
01590 asm volatile ( \
01591 "movd %[mySievePos],%[_xmm7] # - mm0=(0,mySiebPos)\n\t" \
01592 "shufps $0x00,%[_xmm7],%[_xmm7] \n\t" \
01593 : [_xmm7] "=x" (XMM7) : [mySievePos] "r" (mySievePos));
01594
01595 for (; nr<StaticFactorbase::Size(); nr+=4)
01596 {
01597 register unsigned int pflags;
01598 asm ( \
01599 "movdqa (%[PrimeNumbers],%[i],4),%%xmm2 # L2AMS get y : x (as packed int)\n\t" \
01600 "paddd %[_xmm7],%%xmm2 # L2AM \n\t" \
01601 "movdqa 16(%[LocDeltas],%[i],8),%%xmm3 \n\t" \
01602 "pshufd $0x50,%%xmm2,%%xmm0 \n\t" \
01603 "pshufd $0xfa,%%xmm2,%%xmm1 \n\t" \
01604 "pcmpeqd (%[LocDeltas],%[i],8),%%xmm0 \n\t" \
01605 "pcmpeqd %%xmm3,%%xmm1 \n\t" \
01606 "packssdw %%xmm1,%%xmm0 \n\t" \
01607 "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01608 : [pflags] "=r" (pflags)
01609 : [PrimeNumbers] "r" (&PrimeNumbers[0]), [LocDeltas] "r" (&LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01610 [_xmm7] "x" (XMM7), [i] "r" (nr)
01611 : "xmm0", "xmm1", "xmm2", "xmm3");
01612
01613 #if 0 || defined(DEBUG)
01614 {
01615
01616 int d = mySievePos+PrimeNumbers[nr];
01617 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01618 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01619 d=mySievePos+PrimeNumbers[nr+1];
01620 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01621 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01622 d=mySievePos+PrimeNumbers[nr+2];
01623 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01624 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01625 d=mySievePos+PrimeNumbers[nr+3];
01626 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01627 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01628 }
01629 #endif
01630 if (pflags==0) continue;
01631 if ( pflags & 0x000f ) Factors.push_back(nr);
01632 if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01633 if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01634 if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01635 }
01636 asm volatile ("emms # exit mmx");
01637
01638 }
01639
01640
01641
01642 #elif defined(ASM_SSE2)
01643
01644 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01645 {
01646
01647
01648
01649
01650 #ifdef DEBUG
01651 #warning "Pentium4/SSE2 specific code to detect static factors"
01652 #endif
01653 typedef int v4sf __attribute__ ((vector_size (16)));
01654 register v4sf XMM6 asm("xmm6");
01655 register v4sf XMM7 asm("xmm7");
01656
01657
01658
01659
01660
01661
01662
01663
01664
01665 asm volatile ( \
01666 "emms # enter mmx\n\t" \
01667 "movd %[Sieboffs],%%mm0 # mm0=(0,Sieboffs)\n\t" \
01668 "xorps %[_xmm6],%[_xmm6] # L2AM clear mm6 \n\t" \
01669 "punpckldq %%mm0,%%mm0 # L2AM (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
01670 "cvtpi2ps %%mm0,%[_xmm7] # packed int to float \n\t" \
01671 "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
01672 "cmpnleps %[_xmm7],%[_xmm6] # L2AM set bits if signed, clear bits if unsigned" \
01673 : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");
01674
01675
01676
01677 {
01678 register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
01679 if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
01680 d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
01681 if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
01682 }
01683
01684 int nr;
01685 for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4)
01686 {
01687 register unsigned int pflags;
01688 asm ( \
01689 "movaps %[_xmm7],%%xmm2 \n\t" \
01690 "cvtdq2ps %[divisors](,%[i],4),%%xmm1 # convert 4 packed int to float \n\t" \
01691 "mulps %[FloatRecips](,%[i],4),%%xmm2 \n\t" \
01692 "movaps %%xmm1,%%xmm3 \n\t" \
01693 "cvttps2dq %%xmm2,%%xmm0 \n\t" \
01694 "movaps %[_xmm7],%%xmm2 \n\t" \
01695 "cvtdq2ps %%xmm0,%%xmm0 \n\t" \
01696 "mulps %%xmm1,%%xmm0 \n\t" \
01697 "subps %%xmm0,%%xmm2 \n\t" \
01698 "andps %[_xmm6],%%xmm1 # correct sign (part 1) \n\t" \
01699 "addps %%xmm1,%%xmm2 # correct sign (part 2) \n\t" \
01700 "cmpneqps %%xmm2,%%xmm3 # assure modp=0 instead of modp=p \n\t" \
01701 "andps %%xmm3,%%xmm2 \n\t" \
01702 "cvttps2dq %%xmm2,%%xmm3 \n\t" \
01703 "pshufd $0x50,%%xmm3,%%xmm0 \n\t" \
01704 "pshufd $0xfa,%%xmm3,%%xmm1 \n\t" \
01705 "pcmpeqd %[Deltas](,%[i],8),%%xmm0 \n\t" \
01706 "pcmpeqd 16+%[Deltas](,%[i],8),%%xmm1 \n\t" \
01707 "packssdw %%xmm1,%%xmm0 \n\t" \
01708 "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01709 : [pflags] "=r" (pflags)
01710 : [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
01711 [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
01712 [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
01713 : "xmm0", "xmm1", "xmm2", "xmm3");
01714
01715
01716
01717 #if defined(DEBUG)
01718 {
01719
01720 int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
01721 if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01722 if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01723 d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
01724 if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01725 if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01726 d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
01727 if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01728 if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01729 d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
01730 if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01731 if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01732 }
01733 #endif
01734 if ( pflags & 0x000f ) Factors.push_back(nr);
01735 if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01736 if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01737 if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01738
01739 }
01740
01741 const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
01742 asm volatile ( \
01743 "movd %[mySievePos],%[_xmm7] # - mm0=(0,mySiebPos)\n\t" \
01744 "shufps $0x00,%[_xmm7],%[_xmm7] \n\t" \
01745 : [_xmm7] "=x" (XMM7) : [mySievePos] "r" (mySievePos));
01746
01747 for (; nr<StaticFactorbase::Size(); nr+=4)
01748 {
01749 register unsigned int pflags;
01750 asm ( \
01751 "movdqa %[PrimeNumbers](,%[i],4),%%xmm2 # L2AMS get y : x (as packed int)\n\t" \
01752 "paddd %[_xmm7],%%xmm2 # L2AM \n\t" \
01753 "movdqa 16+%[LocDeltas](,%[i],8),%%xmm3 \n\t" \
01754 "pshufd $0x50,%%xmm2,%%xmm0 \n\t" \
01755 "pshufd $0xfa,%%xmm2,%%xmm1 \n\t" \
01756 "pcmpeqd %[LocDeltas](,%[i],8),%%xmm0 \n\t" \
01757 "pcmpeqd %%xmm3,%%xmm1 \n\t" \
01758 "packssdw %%xmm1,%%xmm0 \n\t" \
01759 "pmovmskb %%xmm0,%[pflags] # create bytemask" \
01760 : [pflags] "=r" (pflags)
01761 : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01762 [_xmm7] "x" (XMM7), [i] "r" (nr)
01763 : "xmm0", "xmm1", "xmm2", "xmm3");
01764
01765 #if 0 || defined(DEBUG)
01766 {
01767
01768 int d = mySievePos+PrimeNumbers[nr];
01769 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
01770 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
01771 d=mySievePos+PrimeNumbers[nr+1];
01772 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
01773 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
01774 d=mySievePos+PrimeNumbers[nr+2];
01775 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
01776 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
01777 d=mySievePos+PrimeNumbers[nr+3];
01778 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
01779 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
01780 }
01781 #endif
01782 if (pflags==0) continue;
01783 if ( pflags & 0x000f ) Factors.push_back(nr);
01784 if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
01785 if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
01786 if ( pflags & 0xf000 ) Factors.push_back(nr+3);
01787 }
01788 asm volatile ("emms # exit mmx");
01789
01790 }
01791
01792
01793
01794 #elif defined(ASM_SSE)
01795
01796 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01797 {
01798
01799
01800
01801
01802 #ifdef DEBUG
01803 #warning "Pentium3/SSE specific code to detect static factors"
01804 #endif
01805 typedef int v4sf __attribute__ ((vector_size (16)));
01806 register v4sf XMM6 asm("xmm6");
01807 register v4sf XMM7 asm("xmm7");
01808
01809
01810
01811
01812
01813
01814
01815
01816
01817 asm volatile ( \
01818 "emms # enter mmx\n\t" \
01819 "movd %[Sieboffs],%%mm0 # - mm0=(0,Sieboffs)\n\t" \
01820 "xorps %[_xmm6],%[_xmm6] # L2AM clear mm6 \n\t" \
01821 "punpckldq %%mm0,%%mm0 # L2AM (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
01822 "cvtpi2ps %%mm0,%[_xmm7] # packed int to float \n\t" \
01823 "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
01824 "cmpnleps %[_xmm7],%[_xmm6] # L2AM set bits if signed, clear bits if unsigned" \
01825 : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");
01826
01827
01828
01829 {
01830 register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
01831 if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
01832 d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
01833 if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
01834 }
01835
01836 int nr;
01837 for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4)
01838 {
01839 register unsigned int pflags, pflags2;
01840 asm ( \
01841 "movaps %[_xmm7],%%xmm2 \n\t" \
01842 "movq %[divisors](,%[i],4),%%mm4 # L2AMS get y : x (as packed int) \n\t" \
01843 "movq 8+%[divisors](,%[i],4),%%mm5 # L2AMS get y : x (as packed int) \n\t" \
01844 "cvtpi2ps %%mm4,%%xmm0 # convert packed int to float \n\t" \
01845 "cvtpi2ps %%mm5,%%xmm1 # convert packed int to float \n\t" \
01846 "mulps %[FloatRecips](,%[i],4),%%xmm2 \n\t" \
01847 "movlhps %%xmm1,%%xmm0 \n\t" \
01848 "movaps %%xmm0,%%xmm1 \n\t" \
01849 "movaps %%xmm0,%%xmm3 \n\t" \
01850 "movhlps %%xmm2,%%xmm4 \n\t" \
01851 "cvttps2pi %%xmm2,%%mm0 \n\t" \
01852 "cvttps2pi %%xmm4,%%mm1 \n\t" \
01853 "movaps %[_xmm7],%%xmm2 \n\t" \
01854 "cvtpi2ps %%mm0,%%xmm0 \n\t" \
01855 "cvtpi2ps %%mm1,%%xmm4 \n\t" \
01856 "movlhps %%xmm4,%%xmm0 \n\t" \
01857 "mulps %%xmm1,%%xmm0 \n\t" \
01858 "subps %%xmm0,%%xmm2 \n\t" \
01859 "andps %[_xmm6],%%xmm1 # correct sign (part 1) \n\t" \
01860 "addps %%xmm1,%%xmm2 # correct sign (part 2) \n\t" \
01861 "movaps %%xmm2,%%xmm0 \n\t" \
01862 "cmpneqps %%xmm3,%%xmm0 # assure modp=0 instead of modp=p \n\t" \
01863 "andps %%xmm0,%%xmm2 \n\t" \
01864 "movhlps %%xmm2,%%xmm4 \n\t" \
01865 "cvttps2pi %%xmm2,%%mm0 \n\t" \
01866 "pshufw $0x44,%%mm0,%%mm1 # L2AM \n\t" \
01867 "pshufw $0xee,%%mm0,%%mm2 # L2AM \n\t" \
01868 "pcmpeqd %[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
01869 "pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
01870 "packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \
01871 "pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask \n\t" \
01872 "cvttps2pi %%xmm4,%%mm0 \n\t" \
01873 "pshufw $0x44,%%mm0,%%mm1 # L2AM \n\t" \
01874 "pshufw $0xee,%%mm0,%%mm2 # L2AM \n\t" \
01875 "pcmpeqd 16+%[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
01876 "pcmpeqd 24+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
01877 "packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \
01878 "pmovmskb %%mm1,%[pflags2] # L6+ (VectorPath) into bytemask" \
01879 : [pflags] "=r" (pflags), [pflags2] "=r" (pflags2)
01880 : [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
01881 [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
01882 [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
01883 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
01884
01885
01886
01887 #if defined(DEBUG)
01888 {
01889
01890 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
01891 if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; cerr << nr << "!!" << (pflags&0x03) << endl; exit(7); }
01892 if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; cerr << nr << "!!" << (pflags&0x0c) << endl; exit(7); }
01893 }
01894 #endif
01895 if ( pflags & 0x0f ) Factors.push_back(nr);
01896
01897 #if defined(DEBUG)
01898 {
01899
01900 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
01901 if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x30) << endl; exit(7); }
01902 if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0xc0) << endl; exit(7); }
01903 }
01904 #endif
01905 if ( pflags & 0xf0 ) Factors.push_back(nr+1);
01906
01907 #if defined(DEBUG)
01908 {
01909
01910 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
01911 if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags2&0x03)==0x03) ) { MARK; cerr << nr+2 << "!!" << (pflags2&0x03) << endl; exit(7); }
01912 if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags2&0x0c)==0x0c) ) { MARK; cerr << nr+2 << "!!" << (pflags2&0x0c) << endl; exit(7); }
01913 }
01914 #endif
01915 if ( pflags2 & 0x0f ) Factors.push_back(nr+2);
01916
01917 #if defined(DEBUG)
01918 {
01919
01920 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
01921 if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags2&0x30)==0x30) ) { MARK; cerr << nr+3 << "!!" << (pflags2&0x30) << endl; exit(7); }
01922 if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags2&0xc0)==0xc0) ) { MARK; cerr << nr+3 << "!!" << (pflags2&0xc0) << endl; exit(7); }
01923 }
01924 #endif
01925 if ( pflags2 & 0xf0 ) Factors.push_back(nr+3);
01926 }
01927
01928 const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
01929 typedef int v2di __attribute__ ((vector_size (8)));
01930 register v2di MM7 asm("mm7");
01931
01932 asm volatile ( \
01933 "movd %[mySievePos],%%mm7 # - mm0=(0,mySiebPos)\n\t" \
01934 "punpckldq %%mm7,%%mm7 # L2AM (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
01935 : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));
01936
01937 for (; nr<StaticFactorbase::Size(); nr+=2)
01938 {
01939 register unsigned int pflags;
01940 asm ( \
01941 "movq %[PrimeNumbers](,%[i],4),%%mm0 # L2AMS get y : x (as packed int)\n\t" \
01942 "paddd %[_mm7],%%mm0 # L2AM \n\t" \
01943 "pshufw $0x44,%%mm0,%%mm1 # L2AM \n\t" \
01944 "pshufw $0xee,%%mm0,%%mm2 # L2AM \n\t" \
01945 "pcmpeqd %[LocDeltas](,%[i],8),%%mm1 # L2AM \n\t" \
01946 "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \
01947 "packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \
01948 "pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask" \
01949 : [pflags] "=r" (pflags)
01950 : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
01951 [_mm7] "y" (MM7), [i] "r" (nr)
01952 : "mm0", "mm1", "mm2");
01953
01954 #if 0 || defined(DEBUG)
01955 {
01956
01957 register const int d = mySievePos+PrimeNumbers[nr];
01958 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
01959 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
01960 }
01961 #endif
01962 if ( pflags & 0x0f ) Factors.push_back(nr);
01963
01964 #if 0 || defined(DEBUG)
01965 {
01966
01967 register const int d = mySievePos+PrimeNumbers[nr+1];
01968 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
01969 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
01970 }
01971 #endif
01972 if ( pflags & 0xf0 ) Factors.push_back(nr+1);
01973
01974 }
01975 asm volatile ("emms # exit mmx");
01976
01977 }
01978
01979
01980
01981 #elif defined(ASM_ATHLON)
01982
01983 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
01984 {
01985
01986
01987
01988
01989 #ifdef DEBUG
01990 #warning "AMD Athlon 3DNow! specific code to detect static factors"
01991 #endif
01992 typedef int v2sf __attribute__ ((vector_size (8)));
01993 register v2sf MM7 asm("mm7");
01994 register v2sf MM6 asm("mm6");
01995
01996
01997
01998
01999
02000
02001
02002
02003
02004 asm volatile ( \
02005 "femms # L2AMS fast enter mmx\n\t" \
02006 "movd %[Sieboffs],%[_mm7] # - MM7=(0,Sieboffs)\n\t" \
02007 "punpckldq %[_mm7],%[_mm7] # L2AM (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
02008 "movq %[_mm7],%%mm0 # L2AM \n\t" \
02009 "pxor %[_mm6],%[_mm6] # L2AM clear mm6 \n\t" \
02010 "pi2fd %[_mm7],%[_mm7] # L4A int to float \n\t" \
02011 "pcmpgtd %%mm0,%%mm6 # L2AM set bits if signed, clear bits if unsigned" \
02012 : [_mm7] "=y" (MM7), [_mm6] "=y" (MM6) : [Sieboffs] "rm" (SievePos) : "mm0");
02013
02014
02015 int nr;
02016 for (nr=2; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=2)
02017 {
02018 register unsigned int pflags;
02019 asm ( \
02020 "movq %[_mm7],%%mm2 # L2AMS \n\t" \
02021 "movq %[divisors](,%[i],4),%%mm4 # L2AMS get y : x (as packed int)\n\t" \
02022 "#prefetch %[FloatRecips]+64(,%[i],4) \n\t" \
02023 "pfmul %[FloatRecips](,%[i],4),%%mm2 # L4M z/y : w/x \n\t" \
02024 "pi2fd %%mm4,%%mm0 # L4A get y : x (as packed float)\n\t" \
02025 "movq %%mm4,%%mm5 # L2AMS \n\t" \
02026 "pf2id %%mm2,%%mm1 # L4A packed float to int \n\t" \
02027 "pand %%mm6,%%mm4 # L2AM correct sign (part 1) \n\t" \
02028 "pi2fd %%mm1,%%mm1 # L4A and back (->floor) \n\t" \
02029 "pfmul %%mm0,%%mm1 # L4M multiply \n\t" \
02030 "pfsubr %[_mm7],%%mm1 # L4A subtract \n\t" \
02031 "pf2id %%mm1,%%mm0 # L4A ->modulo \n\t" \
02032 "paddd %%mm4,%%mm0 # L2AM correct sign (part 2) \n\t" \
02033 "pcmpeqd %%mm0,%%mm5 # L2AM assure modp=0 instead of modp=p \n\t" \
02034 "pandn %%mm0,%%mm5 # L2AM \n\t" \
02035 "pshufw $0x44,%%mm5,%%mm1 # L2AM \n\t" \
02036 "pshufw $0xee,%%mm5,%%mm2 # L2AM \n\t" \
02037 "pcmpeqd %[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
02038 "pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
02039 "packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \
02040 "pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask" \
02041 : [pflags] "=r" (pflags)
02042 : [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
02043 [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
02044 [_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)
02045 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5");
02046
02047
02048
02049 #ifdef DEBUG
02050 {
02051
02052 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
02053 if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
02054 if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
02055 }
02056 #endif
02057 if ( pflags & 0x0f ) Factors.push_back(nr);
02058
02059 #ifdef DEBUG
02060 {
02061
02062 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
02063 if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
02064 if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
02065 }
02066 #endif
02067 if ( pflags & 0xf0 ) Factors.push_back(nr+1);
02068 }
02069
02070 const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
02071
02072 asm volatile ( \
02073 "movd %[mySievePos],%%mm7 # - mm0=(0,mySiebPos)\n\t" \
02074 "punpckldq %%mm7,%%mm7 # L2AM (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
02075 : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));
02076
02077 for (; nr<StaticFactorbase::Size(); nr+=2)
02078 {
02079 register unsigned int pflags;
02080 asm ( \
02081 "movq %[PrimeNumbers](,%[i],4),%%mm0 # L2AMS get y : x (as packed int)\n\t" \
02082 "paddd %[_mm7],%%mm0 # L2AM \n\t" \
02083 "pshufw $0x44,%%mm0,%%mm1 # L2AM \n\t" \
02084 "pshufw $0xee,%%mm0,%%mm2 # L2AM \n\t" \
02085 "pcmpeqd %[LocDeltas](,%[i],8),%%mm1 # L2AM \n\t" \
02086 "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \
02087 "packssdw %%mm2,%%mm1 # L2AM pack comparison results together \n\t" \
02088 "pmovmskb %%mm1,%[pflags] # L6+ (VectorPath) into bytemask" \
02089 : [pflags] "=r" (pflags)
02090 : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
02091 [_mm7] "y" (MM7), [i] "r" (nr)
02092 : "mm0", "mm1", "mm2");
02093
02094 #if 0 || defined(DEBUG)
02095 {
02096
02097 register const int d = mySievePos+PrimeNumbers[nr];
02098 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
02099 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
02100 }
02101 #endif
02102 if ( pflags & 0x0f ) Factors.push_back(nr);
02103
02104 #if 0 || defined(DEBUG)
02105 {
02106
02107 register const int d = mySievePos+PrimeNumbers[nr+1];
02108 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
02109 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
02110 }
02111 #endif
02112 if ( pflags & 0xf0 ) Factors.push_back(nr+1);
02113
02114 }
02115 asm volatile ("femms # L2AMS fast exit mmx");
02116
02117 }
02118
02119
02120 #elif defined(ASM_3DNOW)
02121
02122 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
02123 {
02124
02125
02126
02127
02128 #ifdef DEBUG
02129 #warning "AMD K6 3DNow! specific code to detect static factors"
02130 #endif
02131 typedef int v2sf __attribute__ ((vector_size (8)));
02132 register v2sf MM7 asm("mm7");
02133 register v2sf MM6 asm("mm6");
02134
02135
02136
02137
02138
02139
02140
02141
02142
02143 asm volatile ( \
02144 "femms # L2AMS fast enter mmx\n\t" \
02145 "movd %[Sieboffs],%[_mm7] # - MM7=(0,Sieboffs)\n\t" \
02146 "punpckldq %[_mm7],%[_mm7] # L2AM (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
02147 "movq %[_mm7],%%mm0 # L2AM \n\t" \
02148 "pxor %[_mm6],%[_mm6] # L2AM clear mm6 \n\t" \
02149 "pi2fd %[_mm7],%[_mm7] # L4A int to float \n\t" \
02150 "pcmpgtd %%mm0,%%mm6 # L2AM set bits if signed, clear bits if unsigned" \
02151 : [_mm7] "=y" (MM7), [_mm6] "=y" (MM6) : [Sieboffs] "rm" (SievePos) : "mm0");
02152
02153
02154 int nr;
02155 for (nr=2; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=2)
02156 {
02157 register unsigned int pflags;
02158 asm ( \
02159 "movq %[_mm7],%%mm2 # L2AMS \n\t" \
02160 "movq %[divisors](,%[i],4),%%mm4 # L2AMS get y : x (as packed int)\n\t" \
02161 "pfmul %[FloatRecips](,%[i],4),%%mm2 # L4M z/y : w/x \n\t" \
02162 "pi2fd %%mm4,%%mm0 # L4A get y : x (as packed float)\n\t" \
02163 "movq %%mm4,%%mm5 # L2AMS \n\t" \
02164 "pf2id %%mm2,%%mm1 # L4A packed float to int \n\t" \
02165 "pand %%mm6,%%mm4 # L2AM correct sign (part 1) \n\t" \
02166 "pi2fd %%mm1,%%mm1 # L4A and back (->floor) \n\t" \
02167 "pfmul %%mm0,%%mm1 # L4M multiply \n\t" \
02168 "pfsubr %[_mm7],%%mm1 # L4A subtract \n\t" \
02169 "pf2id %%mm1,%%mm0 # L4A ->modulo \n\t" \
02170 "prefetch %[FloatRecips]+96(,%[i],4) \n\t" \
02171 "paddd %%mm4,%%mm0 # L2AM correct sign (part 2) \n\t" \
02172 "pcmpeqd %%mm0,%%mm5 # L2AM assure modp=0 instead of modp=p \n\t" \
02173 "pandn %%mm0,%%mm5 # L2AM \n\t" \
02174 "movq %%mm5,%%mm1 \n\t" \
02175 "punpckldq %%mm5,%%mm5 \n\t" \
02176 "punpckhdq %%mm1,%%mm1 \n\t" \
02177 "pcmpeqd %[Deltas](,%[i],8),%%mm5 # L2AM \n\t" \
02178 "pcmpeqd 8+%[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
02179 "packssdw %%mm1,%%mm5 \n\t" \
02180 "packsswb %%mm0,%%mm5 # mm0 is not relevant\n\t" \
02181 "movd %%mm5,%[pflags] # L5+ (VectorPath)" \
02182 : [pflags] "=r" (pflags)
02183 : [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
02184 [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
02185 [_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)
02186 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5");
02187
02188
02189
02190 #if 0 || defined(DEBUG)
02191 {
02192
02193 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
02194 if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x000000ffU)==0x000000ffU) ) { MARK; exit(7); }
02195 if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0000ff00U)==0x0000ff00U) ) { MARK; exit(7); }
02196 }
02197 #endif
02198 if ( pflags & 0x0000ffffU ) Factors.push_back(nr);
02199
02200 #if 0 || defined(DEBUG)
02201 {
02202
02203 const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
02204 if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x00ff0000U)==0x00ff0000U) ) { MARK; exit(7); }
02205 if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xff000000U)==0xff000000U) ) { MARK; exit(7); }
02206 }
02207 #endif
02208 if ( pflags & 0xffff0000U ) Factors.push_back(nr+1);
02209 }
02210
02211 const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
02212
02213 asm volatile ( \
02214 "movd %[mySievePos],%%mm7 # - mm0=(0,mySiebPos)\n\t" \
02215 "punpckldq %%mm7,%%mm7 # L2AM (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
02216 : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));
02217
02218 for (; nr<StaticFactorbase::Size(); nr+=2)
02219 {
02220 register unsigned int pflags;
02221 asm ( \
02222 "movq %[PrimeNumbers](,%[i],4),%%mm0 # L2AMS get y : x (as packed int)\n\t" \
02223 "paddd %[_mm7],%%mm0 # L2AM \n\t" \
02224 "movq %%mm0,%%mm1 \n\t" \
02225 "punpckldq %%mm0,%%mm0 \n\t" \
02226 "punpckhdq %%mm1,%%mm1 \n\t" \
02227 "pcmpeqd %[LocDeltas](,%[i],8),%%mm0 # L2AM \n\t" \
02228 "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm1 # L2AM \n\t" \
02229 "packssdw %%mm1,%%mm0 \n\t" \
02230 "packsswb %%mm1,%%mm0 \n\t" \
02231 "movd %%mm0,%[pflags] # L5+ (VectorPath)" \
02232 : [pflags] "=r" (pflags)
02233 : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
02234 [_mm7] "y" (MM7), [i] "r" (nr)
02235 : "mm0", "mm1");
02236
02237 #if 0 || defined(DEBUG)
02238 {
02239
02240 register const int d = mySievePos+PrimeNumbers[nr];
02241 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x000000ffU)==0x000000ffU) ) { MARK; exit(7); }
02242 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0000ff00U)==0x0000ff00U) ) { MARK; exit(7); }
02243 }
02244 #endif
02245 if ( pflags & 0x0000ffffU ) Factors.push_back(nr);
02246
02247 #if 0 || defined(DEBUG)
02248 {
02249
02250 register const int d = mySievePos+PrimeNumbers[nr+1];
02251 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x00ff0000U)==0x00ff0000U) ) { MARK; exit(7); }
02252 if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xff000000U)==0xff000000U) ) { MARK; exit(7); }
02253 }
02254 #endif
02255 if ( pflags & 0xffff0000U ) Factors.push_back(nr+1);
02256
02257 }
02258 asm volatile ("femms # L2AMS fast exit mmx");
02259
02260 }
02261
02262
02263 #else
02264
02265 void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
02266 {
02267
02268
02269
02270 int i;
02271 for (i=2; i<StaticFactorbase::Size() && PrimeNumbers[i]<PhysicalSieveSize; ++i)
02272 {
02273
02274
02275
02276
02277
02278 register int d = normalized_signed_mod(SievePos,PrimeNumbers[i],PrimeNumberReciprocals[i]);
02279
02280
02281
02282
02283
02284
02285 if (d==Delta_of_PrimeNumbers[i][0] || d==Delta_of_PrimeNumbers[i][1])
02286 Factors.push_back(i);
02287 }
02288
02289 const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
02290 for (; i<StaticFactorbase::Size(); ++i)
02291 {
02292
02293
02294
02295
02296
02297
02298
02299
02300 register int d = mySievePos+PrimeNumbers[i];
02301 if (d==LocalPhysInterval_Delta_of_PrimeNumbers[i][0] || d==LocalPhysInterval_Delta_of_PrimeNumbers[i][1])
02302 Factors.push_back(i);
02303 }
02304
02305
02306 }
02307 #endif
02308
02309
02310 #include "Sieving-inc.cc"