00001 00007 /*-------------------------------------------------------------------- 00008 This file is placed in the public domain by its author, 00009 Jason Papadopoulos. You may use it for any purpose, free of charge, 00010 without having to notify anyone. I disclaim any responsibility for any 00011 errors. 00012 00013 Optionally, please be nice and tell me if you find this source to be 00014 useful. Again optionally, if you add to the functionality present here 00015 please consider making those additions public too, so that others may 00016 benefit from your work. 00017 --jasonp@boo.net 00018 --------------------------------------------------------------------*/ 00019 00020 00021 /*------------------------------------------------------------------------- 00022 This file was modified by Thorsten Reinecke to be compatible with Qsieve. 00023 It can be used as replacement for the other invmod functions in modulo.cc. 00024 00025 For generic support of 23bit invmod, this version is a very fast 00026 alternative. (For Athlon computers, the crafted cmov assembler version of 00027 montgom_invmod is faster than this one.) Many thanks to the author for 00028 providing me this file, so that I was able to do performance tests. 00029 00030 Except for this file (and except otherwise noted), all the other files in 00031 the Qsieve distribution are still copyrighted under the GPL! 00032 thre@thorstenreinecke.de 00033 -------------------------------------------------------------------------*/ 00034 00035 00036 /*--------------------------------------------------------------------*/ 00037 00038 /* When the factor base is really big, the MPQS 00039 initialization stage takes a big fraction of the 00040 runtime; timings show that ~90% of the time in 00041 the initialization stage is spent computing modular 00042 inverses. To that end, the following implements a 00043 custom modular inverse that avoids integer division. 00044 It's somewhat obscure, but far less obscure than 00045 implementing the self-initializing variant of MPQS. 00046 00047 This is *much* faster than using mp_expo_1, because 00048 it uses floating point and several tricks for fast 00049 rounding. */ 00050 00051 const double ieee_round = 6755399441055744.0; 00052 const double intel_round = 6755399441055744.0 * 2048.0; 00053 const double round_test = 2.7; 00054 const double round_correct = 3.0; 00055 volatile double round_constant[2]; 00056 00057 00058 void init_fast_invmod(void) { 00059 00060 /* Unfortunately, the fast rounding requires figuring 00061 out the size of a floating point register. The choices 00062 are the 53-bit IEEE size and the 64-bit Intel size; 00063 however, the OS may force the Intel size down to the 00064 IEEE size for compatibility. The following performs 00065 a runtime check to see which register size is in use, 00066 and selects the magic rounding constant accordingly. */ 00067 00068 double round_me; 00069 00070 round_constant[0] = intel_round; 00071 round_constant[1] = intel_round; 00072 round_me = round_test; 00073 round_me += round_constant[0]; 00074 round_me -= round_constant[1]; 00075 00076 00077 printf("initializing jason's fast_invmod\n"); 00078 if (round_me != round_correct) { 00079 round_constant[0] = ieee_round; 00080 round_constant[1] = ieee_round; 00081 round_me = round_test; 00082 round_me += round_constant[0]; 00083 round_me -= round_constant[1]; 00084 if( round_me != round_correct ) { 00085 printf("error: can't initialize fast rounding\n"); 00086 exit(-1); 00087 } 00088 } 00089 00090 /* Check that the compiler did not incorrectly 00091 optimize our custom modular inverse routine */ 00092 00093 if (fast_invmod(6, 17) != 3) { 00094 printf("error: modular inverse does not work\n"); 00095 exit(-1); 00096 } 00097 } 00098 00099 unsigned int fast_invmod(unsigned int a, unsigned int p) { 00100 00101 /* Compute mp_expo_1(a, p-2, p) using floating point. 00102 'a' and 'p' must be less than ~25 bits in size if 00103 using IEEE fast rounding, but can go up to ~31 bits 00104 if Intel fast rounding is in use. Both of these 00105 are far larger than MPQS will ever need. 00106 00107 The code avoids an integer remainder by multiplying 00108 by the floating point reciprocal and (fast-)rounding to 00109 the nearest integral floating point value. Within 00110 the while() loop, the accumulated result is always 00111 bounded by +-p, and is normalized afterwards. 00112 00113 The exponent is scanned right to left; at each step the 00114 modular squaring and multiply are both performed, even 00115 if the multiply result will be thrown away */ 00116 00117 unsigned int exponent = p - 2; 00118 int result; 00119 double dp = (double)(p); 00120 double dsquare = (double)(a); 00121 double recip = 1.0 / dp; 00122 double round0 = round_constant[0]; 00123 double round1 = round_constant[1]; 00124 double dresult = 1.0; 00125 00126 while (exponent) { 00127 double dsquare2 = dsquare * dsquare; 00128 double dresult2 = dresult * dsquare; 00129 00130 double drsquare = dsquare2 * recip + round0 - round1; 00131 double drresult = dresult2 * recip + round0 - round1; 00132 00133 dsquare2 -= dp * drsquare; 00134 dresult2 -= dp * drresult; 00135 00136 dsquare = dsquare2; 00137 if (exponent & 1) 00138 dresult = dresult2; 00139 00140 exponent >>= 1; 00141 } 00142 00143 result = (int)dresult; 00144 00145 if (result < 0) 00146 return (unsigned int)(result + p); 00147 else 00148 return (unsigned int)result; 00149 }