libbpm: bpmnr/nr_levmar.c Source File

00001 
00013 
00031 
00037 #include <bpm/bpm_messages.h>
00038 #include <bpm/bpm_nr.h>
00039 
00040 static int nr_lmLUinverse(double *A, double *B, int m);
00041 
00042 // -------------------------------------------------------------------
00043 
00044 /* blocked multiplication of the transpose of the nxm matrix a with itself (i.e. a^T a)
00045  * using a block size of bsize. The product is returned in b.
00046  * Since a^T a is symmetric, its computation can be speeded up by computing only its
00047  * upper triangular part and copying it to the lower part.
00048  *
00049  * More details on blocking can be found at 
00050  * http://www-2.cs.cmu.edu/afs/cs/academic/class/15213-f02/www/R07/section_a/Recitation07-SectionA.pdf
00051  */
00052 
00053 void nr_trans_mat_mat_mult(double *a, double *b, int n, int m ) {
00054 
00055   register int i, j, k, jj, kk;
00056   register double sum, *bim, *akm;
00057   const int bsize=__LM_BLOCKSZ__;
00058 
00059 #define __MIN__(x, y) (((x)<=(y))? (x) : (y))
00060 #define __MAX__(x, y) (((x)>=(y))? (x) : (y))
00061 
00062   /* compute upper triangular part using blocking */
00063   for(jj=0; jj<m; jj+=bsize){
00064     for(i=0; i<m; ++i){
00065       bim=b+i*m;
00066       for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j)
00067         bim[j]=0.0; //b[i*m+j]=0.0;
00068     }
00069     
00070     for(kk=0; kk<n; kk+=bsize){
00071       for(i=0; i<m; ++i){
00072         bim=b+i*m;
00073         for(j=__MAX__(jj, i); j<__MIN__(jj+bsize, m); ++j){
00074           sum=0.0;
00075           for(k=kk; k<__MIN__(kk+bsize, n); ++k){
00076             akm=a+k*m;
00077             sum+=akm[i]*akm[j]; //a[k*m+i]*a[k*m+j];
00078           }
00079           bim[j]+=sum; //b[i*m+j]+=sum;
00080         }
00081       }
00082     }
00083   }
00084 
00085   /* copy upper triangular part to the lower one */
00086   for(i=0; i<m; ++i)
00087     for(j=0; j<i; ++j)
00088       b[i*m+j]=b[j*m+i];
00089 
00090 #undef __MIN__
00091 #undef __MAX__
00092 
00093   return;
00094 }
00095 
00096 // ------------------------------------------
00097 
00098 /* forward finite difference approximation to the jacobian of func */
00099 void nr_fdif_forw_jac_approx(
00100   void (*func)(double *p, double *hx, int m, int n, void *adata),
00101   /* function to differentiate */
00102   double *p,              /* I: current parameter estimate, mx1 */
00103   double *hx,             /* I: func evaluated at p, i.e. hx=func(p), nx1 */
00104   double *hxx,            /* W/O: work array for evaluating func(p+delta), nx1 */
00105   double delta,           /* increment for computing the jacobian */
00106   double *jac,            /* O: array for storing approximated jacobian, nxm */
00107   int m,
00108   int n,
00109   void *adata ) 
00110 {
00111   register int i, j;
00112   double tmp;
00113   register double d;
00114 
00115   for(j=0; j<m; ++j){
00116     /* determine d=max(1E-04*|p[j]|, delta), see HZ */
00117     d=CNST(1E-04)*p[j]; // force evaluation
00118     d=FABS(d);
00119     if(d<delta)
00120       d=delta;
00121     
00122     tmp=p[j];
00123     p[j]+=d;
00124     
00125     (*func)(p, hxx, m, n, adata);
00126     
00127     p[j]=tmp; /* restore */
00128     
00129     d=CNST(1.0)/d; /* invert so that divisions can be carried out faster as multiplications */
00130     for(i=0; i<n; ++i){
00131       jac[i*m+j]=(hxx[i]-hx[i])*d;
00132     }
00133   }
00134 
00135   return;
00136 }
00137 
00138 // -------------------------------------------------------------------
00139 
00140 /* central finite difference approximation to the jacobian of func */
00141 void nr_fdif_cent_jac_approx(
00142   void (*func)(double *p, double *hx, int m, int n, void *adata),
00143   /* function to differentiate */
00144   double *p,              /* I: current parameter estimate, mx1 */
00145   double *hxm,            /* W/O: work array for evaluating func(p-delta), nx1 */
00146   double *hxp,            /* W/O: work array for evaluating func(p+delta), nx1 */
00147   double delta,           /* increment for computing the jacobian */
00148   double *jac,            /* O: array for storing approximated jacobian, nxm */
00149   int m,
00150   int n,
00151   void *adata )
00152 {
00153   register int i, j;
00154   double tmp;
00155   register double d;
00156   
00157   for(j=0; j<m; ++j){
00158     /* determine d=max(1E-04*|p[j]|, delta), see HZ */
00159     d=CNST(1E-04)*p[j]; // force evaluation
00160     d=FABS(d);
00161     if(d<delta)
00162       d=delta;
00163     
00164     tmp=p[j];
00165     p[j]-=d;
00166     (*func)(p, hxm, m, n, adata);
00167     
00168     p[j]=tmp+d;
00169     (*func)(p, hxp, m, n, adata);
00170     p[j]=tmp; /* restore */
00171     
00172     d=CNST(0.5)/d; /* invert so that divisions can be carried out faster as multiplications */
00173     for(i=0; i<n; ++i){
00174       jac[i*m+j]=(hxp[i]-hxm[i])*d;
00175     }
00176   }
00177 
00178   return;
00179 }
00180 
00181 // -------------------------------------------------------------------
00182 
00183 /* 
00184  * Check the jacobian of a n-valued nonlinear function in m variables
00185  * evaluated at a point p, for consistency with the function itself.
00186  *
00187  * Based on fortran77 subroutine CHKDER by
00188  * Burton S. Garbow, Kenneth E. Hillstrom, Jorge J. More
00189  * Argonne National Laboratory. MINPACK project. March 1980.
00190  *
00191  *
00192  * func points to a function from R^m --> R^n: Given a p in R^m it yields hx in R^n
00193  * jacf points to a function implementing the jacobian of func, whose correctness
00194  *     is to be tested. Given a p in R^m, jacf computes into the nxm matrix j the
00195  *     jacobian of func at p. Note that row i of j corresponds to the gradient of
00196  *     the i-th component of func, evaluated at p.
00197  * p is an input array of length m containing the point of evaluation.
00198  * m is the number of variables
00199  * n is the number of functions
00200  * adata points to possible additional data and is passed uninterpreted
00201  *     to func, jacf.
00202  * err is an array of length n. On output, err contains measures
00203  *     of correctness of the respective gradients. if there is
00204  *     no severe loss of significance, then if err[i] is 1.0 the
00205  *     i-th gradient is correct, while if err[i] is 0.0 the i-th
00206  *     gradient is incorrect. For values of err between 0.0 and 1.0,
00207  *     the categorization is less certain. In general, a value of
00208  *     err[i] greater than 0.5 indicates that the i-th gradient is
00209  *     probably correct, while a value of err[i] less than 0.5
00210  *     indicates that the i-th gradient is probably incorrect.
00211  *
00212  *
00213  * The function does not perform reliably if cancellation or
00214  * rounding errors cause a severe loss of significance in the
00215  * evaluation of a function. therefore, none of the components
00216  * of p should be unusually small (in particular, zero) or any
00217  * other value which may cause loss of significance.
00218  */
00219 
00220 void nr_lmchkjac(
00221   void (*func)(double *p, double *hx, int m, int n, void *adata),
00222   void (*jacf)(double *p, double *j, int m, int n, void *adata),
00223   double *p, int m, int n, void *adata, double *err )
00224 {
00225   double factor=CNST(100.0);
00226   double one=CNST(1.0);
00227   double zero=CNST(0.0);
00228   double *fvec, *fjac, *pp, *fvecp, *buf;
00229 
00230   register int i, j;
00231   double eps, epsf, temp, epsmch;
00232   double epslog;
00233   int fvec_sz=n, fjac_sz=n*m, pp_sz=m, fvecp_sz=n;
00234   
00235   epsmch=DBL_EPSILON;
00236   eps=(double)sqrt(epsmch);
00237 
00238   buf=(double *)malloc((fvec_sz + fjac_sz + pp_sz + fvecp_sz)*sizeof(double));
00239   if(!buf){
00240     bpm_error( "memory allocation request failed in nr_lmchkjac(...)", 
00241                __FILE__, __LINE__ );
00242     exit( 1 );
00243   }
00244   fvec=buf;
00245   fjac=fvec+fvec_sz;
00246   pp=fjac+fjac_sz;
00247   fvecp=pp+pp_sz;
00248 
00249   /* compute fvec=func(p) */
00250   (*func)(p, fvec, m, n, adata);
00251 
00252   /* compute the jacobian at p */
00253   (*jacf)(p, fjac, m, n, adata);
00254 
00255   /* compute pp */
00256   for(j=0; j<m; ++j){
00257     temp=eps*FABS(p[j]);
00258     if(temp==zero) temp=eps;
00259     pp[j]=p[j]+temp;
00260   }
00261 
00262   /* compute fvecp=func(pp) */
00263   (*func)(pp, fvecp, m, n, adata);
00264 
00265   epsf=factor*epsmch;
00266   epslog=(double)log10(eps);
00267 
00268   for(i=0; i<n; ++i)
00269     err[i]=zero;
00270 
00271   for(j=0; j<m; ++j){
00272     temp=FABS(p[j]);
00273     if(temp==zero) temp=one;
00274 
00275     for(i=0; i<n; ++i)
00276       err[i]+=temp*fjac[i*m+j];
00277   }
00278 
00279   for(i=0; i<n; ++i){
00280     temp=one;
00281     if(fvec[i]!=zero && fvecp[i]!=zero && FABS(fvecp[i]-fvec[i])>=epsf*FABS(fvec[i]))
00282       temp=eps*FABS((fvecp[i]-fvec[i])/eps - err[i])/(FABS(fvec[i])+FABS(fvecp[i]));
00283     err[i]=one;
00284     if(temp>epsmch && temp<eps)
00285       err[i]=((double)log10(temp) - epslog)/epslog;
00286     if(temp>=eps) err[i]=zero;
00287   }
00288   
00289   free(buf);
00290   
00291   return;
00292 }
00293 
00294 // -------------------------------------------------------------------
00295 
00296 /*
00297  * This function computes the inverse of A in B. A and B can coincide
00298  *
00299  * The function employs LAPACK-free LU decomposition of A to solve m linear
00300  * systems A*B_i=I_i, where B_i and I_i are the i-th columns of B and I.
00301  *
00302  * A and B are mxm
00303  *
00304  * The function returns 0 in case of error,
00305  * 1 if successfull
00306  *
00307  */
00308 static int nr_lmLUinverse(double *A, double *B, int m) {
00309   void *buf=NULL;
00310   int buf_sz=0;
00311   
00312   register int i, j, k, l;
00313   int *idx, maxi=-1, idx_sz, a_sz, x_sz, work_sz, tot_sz;
00314   double *a, *x, *work, max, sum, tmp;
00315 
00316   /* calculate required memory size */
00317   idx_sz=m;
00318   a_sz=m*m;
00319   x_sz=m;
00320   work_sz=m;
00321   tot_sz=idx_sz*sizeof(int) + (a_sz+x_sz+work_sz)*sizeof(double);
00322   
00323   buf_sz=tot_sz;
00324   buf=(void *)malloc(tot_sz);
00325   if(!buf){
00326     bpm_error( "memory allocation request failed in nr_lmLUinverse(...)",           
00327                __FILE__, __LINE__ );      
00328     exit(1);
00329   }
00330   
00331   idx=(int *)buf;
00332   a=(double *)(idx + idx_sz);
00333   x=a + a_sz;
00334   work=x + x_sz;
00335   
00336   /* avoid destroying A by copying it to a */
00337   for(i=0; i<a_sz; ++i) a[i]=A[i];
00338 
00339   /* compute the LU decomposition of a row permutation of matrix a;
00340      the permutation itself is saved in idx[] */
00341   for(i=0; i<m; ++i){
00342     max=0.0;
00343     for(j=0; j<m; ++j)
00344       if((tmp=FABS(a[i*m+j]))>max)
00345         max=tmp;
00346     if(max==0.0){
00347       bpm_error( "Singular matrix A in nr_lmLUinverse(...)",           
00348                  __FILE__, __LINE__ );
00349       free(buf);
00350       
00351       return 0;
00352     }
00353     work[i]=CNST(1.0)/max;
00354   }
00355   
00356   for(j=0; j<m; ++j){
00357     for(i=0; i<j; ++i){
00358       sum=a[i*m+j];
00359       for(k=0; k<i; ++k)
00360         sum-=a[i*m+k]*a[k*m+j];
00361       a[i*m+j]=sum;
00362     }
00363     max=0.0;
00364     for(i=j; i<m; ++i){
00365       sum=a[i*m+j];
00366       for(k=0; k<j; ++k)
00367         sum-=a[i*m+k]*a[k*m+j];
00368       a[i*m+j]=sum;
00369       if((tmp=work[i]*FABS(sum))>=max){
00370         max=tmp;
00371         maxi=i;
00372       }
00373     }
00374     if(j!=maxi){
00375       for(k=0; k<m; ++k){
00376         tmp=a[maxi*m+k];
00377         a[maxi*m+k]=a[j*m+k];
00378         a[j*m+k]=tmp;
00379       }
00380       work[maxi]=work[j];
00381     }
00382     idx[j]=maxi;
00383     if(a[j*m+j]==0.0)
00384       a[j*m+j]=DBL_EPSILON;
00385     if(j!=m-1){
00386       tmp=CNST(1.0)/(a[j*m+j]);
00387       for(i=j+1; i<m; ++i)
00388         a[i*m+j]*=tmp;
00389     }
00390   }
00391   
00392   /* The decomposition has now replaced a. Solve the m linear systems using
00393    * forward and back substitution
00394    */
00395   for(l=0; l<m; ++l){
00396     for(i=0; i<m; ++i) x[i]=0.0;
00397     x[l]=CNST(1.0);
00398     
00399     for(i=k=0; i<m; ++i){
00400       j=idx[i];
00401       sum=x[j];
00402       x[j]=x[i];
00403       if(k!=0)
00404         for(j=k-1; j<i; ++j)
00405           sum-=a[i*m+j]*x[j];
00406       else
00407         if(sum!=0.0)
00408           k=i+1;
00409       x[i]=sum;
00410     }
00411     
00412     for(i=m-1; i>=0; --i){
00413       sum=x[i];
00414       for(j=i+1; j<m; ++j)
00415         sum-=a[i*m+j]*x[j];
00416       x[i]=sum/a[i*m+i];
00417     }
00418     
00419     for(i=0; i<m; ++i)
00420       B[i*m+l]=x[i];
00421   }
00422   
00423   free(buf);
00424   
00425   return 1;
00426 }
00427 
00428 // -------------------------------------------------------------------
00429 
00430 /*
00431  * This function computes in C the covariance matrix corresponding to a least
00432  * squares fit. JtJ is the approximate Hessian at the solution (i.e. J^T*J, where
00433  * J is the jacobian at the solution), sumsq is the sum of squared residuals
00434  * (i.e. goodnes of fit) at the solution, m is the number of parameters (variables)
00435  * and n the number of observations. JtJ can coincide with C.
00436  * 
00437  * if JtJ is of full rank, C is computed as sumsq/(n-m)*(JtJ)^-1
00438  * otherwise and if LAPACK is available, C=sumsq/(n-r)*(JtJ)^+
00439  * where r is JtJ's rank and ^+ denotes the pseudoinverse
00440  * The diagonal of C is made up from the estimates of the variances
00441  * of the estimated regression coefficients.
00442  * See the documentation of routine E04YCF from the NAG fortran lib
00443  *
00444  * The function returns the rank of JtJ if successful, 0 on error
00445  *
00446  * A and C are mxm
00447  *
00448  */
00449 int nr_lmcovar(double *JtJ, double *C, double sumsq, int m, int n) {
00450   register int i;
00451   int rnk;
00452   double fact;
00453   
00454   rnk=nr_lmLUinverse(JtJ, C, m);
00455   if(!rnk) return 0;
00456   
00457   rnk=m; /* assume full rank */
00458 
00459   fact=sumsq/(double)(n-rnk);
00460   for(i=0; i<m*m; ++i)
00461     C[i]*=fact;
00462   
00463   return rnk;
00464 }
00465 
00466 // -------------------------------------------------------------------
00467 
00468 int nr_lmder(
00469   void (*func)(double *p, double *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
00470   void (*jacf)(double *p, double *j, int m, int n, void *adata),  /* function to evaluate the jacobian \part x / \part p */ 
00471   double *p,          /* I/O: initial parameter estimates. On output has the estimated solution */
00472   double *x,          /* I: measurement vector */
00473   int m,              /* I: parameter vector dimension (i.e. #unknowns) */
00474   int n,              /* I: measurement vector dimension */
00475   int itmax,          /* I: maximum number of iterations */
00476   double opts[4],     /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
00477                        * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used
00478                        */
00479   double info[LM_INFO_SZ],
00480                       /* O: information regarding the minimization. Set to NULL if don't care
00481                        * info[0]= ||e||_2 at initial p.
00482                        * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
00483                        * info[5]= # iterations,
00484                        * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
00485                        *                                 2 - stopped by small Dp
00486                        *                                 3 - stopped by itmax
00487                        *                                 4 - singular matrix. Restart from current p with increased mu 
00488                        *                                 5 - no further error reduction is possible. Restart with increased mu
00489                        *                                 6 - stopped by small ||e||_2
00490                        * info[7]= # function evaluations
00491                        * info[8]= # jacobian evaluations
00492                        */
00493   double *work,       /* working memory, allocate if NULL */
00494   double *covar,      /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
00495   void *adata)        /* pointer to possibly additional data, passed uninterpreted to func & jacf.
00496                       * Set to NULL if not needed
00497                       */
00498 {
00499   register int i, j, k, l;
00500   int worksz, freework=0, issolved;
00501 /* temp work arrays */
00502   double *e,          /* nx1 */
00503     *hx,         /* \hat{x}_i, nx1 */
00504     *jacTe,      /* J^T e_i mx1 */
00505     *jac,        /* nxm */
00506     *jacTjac,    /* mxm */
00507     *Dp,         /* mx1 */
00508     *diag_jacTjac,   /* diagonal of J^T J, mx1 */
00509     *pDp;        /* p + Dp, mx1 */
00510   
00511   register double mu,  /* damping constant */
00512     tmp; /* mainly used in matrix & vector multiplications */
00513   double p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */
00514   double p_L2, Dp_L2=DBL_MAX, dF, dL;
00515   double tau, eps1, eps2, eps2_sq, eps3;
00516   double init_p_eL2;
00517   int nu=2, nu2, stop, nfev, njev=0;
00518   const int nm=n*m;
00519 
00520   mu=jacTe_inf=0.0; /* -Wall */
00521   if(n<m){
00522     bpm_error( "Cannot solve a problem with fewer measurements than unknowns",
00523                __FILE__, __LINE__ );
00524     exit(1);
00525   }
00526 
00527   if(!jacf){
00528     bpm_error( "No function specified for computing the jacobian in",
00529                __FILE__, __LINE__ );
00530     exit(1);
00531   }
00532 
00533   if(opts){
00534     tau=opts[0];
00535     eps1=opts[1];
00536     eps2=opts[2];
00537     eps2_sq=opts[2]*opts[2];
00538     eps3=opts[3];
00539   }
00540   else{ // use default values
00541     tau=CNST(LM_INIT_MU);
00542     eps1=CNST(LM_STOP_THRESH);
00543     eps2=CNST(LM_STOP_THRESH);
00544     eps2_sq=CNST(LM_STOP_THRESH)*CNST(LM_STOP_THRESH);
00545     eps3=CNST(LM_STOP_THRESH);
00546   }
00547 
00548   if(!work){
00549     worksz=LM_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m;
00550     work=(double*)malloc(worksz*sizeof(double)); /* allocate a big chunk in one step */
00551     if(!work){
00552       bpm_error( "memory allocation request failed in nr_lmder(...)",           
00553                  __FILE__, __LINE__ );
00554       exit(1);
00555     }
00556     freework=1;
00557   }
00558 
00559   /* set up work arrays */
00560   e=work;
00561   hx=e + n;
00562   jacTe=hx + n;
00563   jac=jacTe + m;
00564   jacTjac=jac + nm;
00565   Dp=jacTjac + m*m;
00566   diag_jacTjac=Dp + m;
00567   pDp=diag_jacTjac + m;
00568   
00569   /* compute e=x - f(p) and its L2 norm */
00570   (*func)(p, hx, m, n, adata); nfev=1;
00571   for(i=0, p_eL2=0.0; i<n; ++i){
00572     e[i]=tmp=x[i]-hx[i];
00573     p_eL2+=tmp*tmp;
00574   }
00575   init_p_eL2=p_eL2;
00576   
00577   for(k=stop=0; k<itmax && !stop; ++k){
00578     /* Note that p and e have been updated at a previous iteration */
00579     
00580     if(p_eL2<=eps3){ /* error is small */
00581       stop=6;
00582       break;
00583     }
00584     
00585     /* Compute the jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
00586      * Since J^T J is symmetric, its computation can be speeded up by computing
00587      * only its upper triangular part and copying it to the lower part
00588      */
00589     
00590     (*jacf)(p, jac, m, n, adata); ++njev;
00591     
00592     /* J^T J, J^T e */
00593     if(nm<__LM_BLOCKSZ__SQ){ // this is a small problem
00594       /* This is the straightforward way to compute J^T J, J^T e. However, due to
00595        * its noncontinuous memory access pattern, it incures many cache misses when
00596        * applied to large minimization problems (i.e. problems involving a large
00597        * number of free variables and measurements), in which J is too large to
00598        * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
00599        * is preferable.
00600        *
00601        * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
00602        * performance problem.
00603        *
00604        * On the other hand, the straightforward algorithm is faster on small
00605        * problems since in this case it avoids the overheads of blocking. 
00606        */
00607       
00608       for(i=0; i<m; ++i){
00609         for(j=i; j<m; ++j){
00610           int lm;
00611           
00612           for(l=0, tmp=0.0; l<n; ++l){
00613             lm=l*m;
00614             tmp+=jac[lm+i]*jac[lm+j];
00615           }
00616 
00617                       /* store tmp in the corresponding upper and lower part elements */
00618           jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
00619         }
00620         
00621         /* J^T e */
00622         for(l=0, tmp=0.0; l<n; ++l)
00623           tmp+=jac[l*m+i]*e[l];
00624         jacTe[i]=tmp;
00625       }
00626     }
00627     else{ // this is a large problem
00628       /* Cache efficient computation of J^T J based on blocking
00629        */
00630       nr_trans_mat_mat_mult(jac, jacTjac, n, m);
00631 
00632       /* cache efficient computation of J^T e */
00633       for(i=0; i<m; ++i)
00634         jacTe[i]=0.0;
00635 
00636       for(i=0; i<n; ++i){
00637         register double *jacrow;
00638 
00639         for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l)
00640           jacTe[l]+=jacrow[l]*tmp;
00641       }
00642     }
00643 
00644     /* Compute ||J^T e||_inf and ||p||^2 */
00645     for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i){
00646       if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
00647       
00648       diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
00649       p_L2+=p[i]*p[i];
00650     }
00651     //p_L2=sqrt(p_L2);
00652 
00653 #if 0
00654     if(!(k%100)){
00655       printf("Current estimate: ");
00656       for(i=0; i<m; ++i)
00657         printf("%.9g ", p[i]);
00658       printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
00659     }
00660 #endif
00661 
00662     /* check for convergence */
00663     if((jacTe_inf <= eps1)){
00664       Dp_L2=0.0; /* no increment for p in this case */
00665       stop=1;
00666       break;
00667     }
00668 
00669    /* compute initial damping factor */
00670     if(k==0){
00671       for(i=0, tmp=DBL_MIN; i<m; ++i)
00672         if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
00673       mu=tau*tmp;
00674     }
00675 
00676     /* determine increment using adaptive damping */
00677     while(1){
00678       /* augment normal equations */
00679       for(i=0; i<m; ++i)
00680         jacTjac[i*m+i]+=mu;
00681 
00682 
00683       /* use the LU included with levmar */
00684       issolved = nr_ax_eq_b_LU(jacTjac, jacTe, Dp, m);
00685 
00686       
00687       if(issolved){
00688         /* compute p's new estimate and ||Dp||^2 */
00689         for(i=0, Dp_L2=0.0; i<m; ++i){
00690           pDp[i]=p[i] + (tmp=Dp[i]);
00691           Dp_L2+=tmp*tmp;
00692         }
00693         //Dp_L2=sqrt(Dp_L2);
00694 
00695         if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
00696         //if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */
00697           stop=2;
00698           break;
00699         }
00700 
00701        if(Dp_L2>=(p_L2+eps2)/(CNST(LM_EPSILON)*CNST(LM_EPSILON))){ /* almost singular */
00702        //if(Dp_L2>=(p_L2+eps2)/CNST(LM_EPSILON)){ /* almost singular */
00703          stop=4;
00704          break;
00705        }
00706 
00707         (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */
00708         for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */
00709           hx[i]=tmp=x[i]-hx[i];
00710           pDp_eL2+=tmp*tmp;
00711         }
00712 
00713         for(i=0, dL=0.0; i<m; ++i)
00714           dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);
00715 
00716         dF=p_eL2-pDp_eL2;
00717 
00718         if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
00719           tmp=(CNST(2.0)*dF/dL-CNST(1.0));
00720           tmp=CNST(1.0)-tmp*tmp*tmp;
00721           mu=mu*( (tmp>=CNST(LM_ONE_THIRD))? tmp : CNST(LM_ONE_THIRD) );
00722           nu=2;
00723           
00724           for(i=0 ; i<m; ++i) /* update p's estimate */
00725             p[i]=pDp[i];
00726 
00727           for(i=0; i<n; ++i) /* update e and ||e||_2 */
00728             e[i]=hx[i];
00729           p_eL2=pDp_eL2;
00730           break;
00731         }
00732       }
00733 
00734       /* if this point is reached, either the linear system could not be solved or
00735        * the error did not reduce; in any case, the increment must be rejected
00736        */
00737 
00738       mu*=nu;
00739       nu2=nu<<1; // 2*nu;
00740       if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
00741         stop=5;
00742         break;
00743       }
00744       nu=nu2;
00745 
00746       for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
00747         jacTjac[i*m+i]=diag_jacTjac[i];
00748     } /* inner loop */
00749   }
00750 
00751   if(k>=itmax) stop=3;
00752   
00753   for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
00754     jacTjac[i*m+i]=diag_jacTjac[i];
00755   
00756   if(info){
00757     info[0]=init_p_eL2;
00758     info[1]=p_eL2;
00759     info[2]=jacTe_inf;
00760     info[3]=Dp_L2;
00761     for(i=0, tmp=DBL_MIN; i<m; ++i)
00762       if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
00763     info[4]=mu/tmp;
00764     info[5]=(double)k;
00765     info[6]=(double)stop;
00766     info[7]=(double)nfev;
00767     info[8]=(double)njev;
00768   }
00769   
00770   /* covariance matrix */
00771   if(covar){
00772     nr_lmcovar(jacTjac, covar, p_eL2, m, n);
00773   }
00774 
00775   if(freework) free(work);
00776 
00777   return (stop!=4)?  k : -1;
00778 }
00779 
00780 
00781 /* Secant version of the LEVMAR_DER() function above: the jacobian is approximated with 
00782  * the aid of finite differences (forward or central, see the comment for the opts argument)
00783  */
00784 int nr_lmdif(
00785   void (*func)(double *p, double *hx, int m, int n, void *adata), 
00786                      /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
00787   double *p,         /* I/O: initial parameter estimates. On output has the estimated solution */
00788   double *x,         /* I: measurement vector */
00789   int m,             /* I: parameter vector dimension (i.e. #unknowns) */
00790   int n,             /* I: measurement vector dimension */
00791   int itmax,         /* I: maximum number of iterations */
00792   double opts[5],    /* I: opts[0-4] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the
00793                        * scale factor for initial \mu, stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2 and
00794                        * the step used in difference approximation to the jacobian. Set to NULL for defaults to be used.
00795                        * If \delta<0, the jacobian is approximated with central differences which are more accurate
00796                        * (but slower!) compared to the forward differences employed by default. 
00797                        */
00798   double info[LM_INFO_SZ],
00799                      /* O: information regarding the minimization. Set to NULL if don't care
00800                       * info[0]= ||e||_2 at initial p.
00801                       * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
00802                       * info[5]= # iterations,
00803                       * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
00804                       *                                 2 - stopped by small Dp
00805                       *                                 3 - stopped by itmax
00806                       *                                 4 - singular matrix. Restart from current p with increased mu 
00807                       *                                 5 - no further error reduction is possible. Restart with increased mu
00808                       *                                 6 - stopped by small ||e||_2
00809                       * info[7]= # function evaluations
00810                       * info[8]= # jacobian evaluations
00811                       */
00812   double *work,       /* working memory, allocate if NULL */
00813   double *covar,      /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
00814   void *adata )       /* pointer to possibly additional data, passed uninterpreted to func.
00815                        * Set to NULL if not needed
00816                        */
00817 {
00818   register int i, j, k, l;
00819   int worksz, freework=0, issolved;
00820 /* temp work arrays */
00821   double *e,          /* nx1 */
00822     *hx,         /* \hat{x}_i, nx1 */
00823     *jacTe,      /* J^T e_i mx1 */
00824     *jac,        /* nxm */
00825     *jacTjac,    /* mxm */
00826     *Dp,         /* mx1 */
00827     *diag_jacTjac,   /* diagonal of J^T J, mx1 */
00828     *pDp,        /* p + Dp, mx1 */
00829     *wrk;        /* nx1 */
00830   
00831   int using_ffdif=1;
00832   double *wrk2=NULL; /* nx1, used for differentiating with central differences only */
00833   
00834   register double mu,  /* damping constant */
00835     tmp; /* mainly used in matrix & vector multiplications */
00836   double p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */
00837   double p_L2, Dp_L2=DBL_MAX, dF, dL;
00838   double tau, eps1, eps2, eps2_sq, eps3, delta;
00839   double init_p_eL2;
00840   int nu, nu2, stop, nfev, njap=0, K=(m>=10)? m: 10, updjac, updp=1, newjac;
00841   const int nm=n*m;
00842   
00843   mu=jacTe_inf=p_L2=0.0; /* -Wall */
00844   stop=updjac=newjac=0; /* -Wall */
00845   
00846   if(n<m){
00847     bpm_error( "Cannot solve a problem with fewer measurements than unknowns", 
00848                __FILE__, __LINE__ );
00849     exit(1);
00850   }
00851 
00852   if(opts){
00853     tau=opts[0];
00854     eps1=opts[1];
00855     eps2=opts[2];
00856     eps2_sq=opts[2]*opts[2];
00857     eps3=opts[3];
00858     delta=opts[4];
00859     if(delta<0.0){
00860       delta=-delta; /* make positive */
00861       using_ffdif=0; /* use central differencing */
00862       wrk2=(double *)malloc(n*sizeof(double));
00863       if(!wrk2){
00864         bpm_error( "memory allocation request failed in nr_lmdif(...)",           
00865                    __FILE__, __LINE__ );
00866         exit(1);
00867       }
00868     }
00869   }
00870   else{ // use default values
00871     tau=CNST(LM_INIT_MU);
00872     eps1=CNST(LM_STOP_THRESH);
00873     eps2=CNST(LM_STOP_THRESH);
00874     eps2_sq=CNST(LM_STOP_THRESH)*CNST(LM_STOP_THRESH);
00875     eps3=CNST(LM_STOP_THRESH);
00876     delta=CNST(LM_DIFF_DELTA);
00877   }
00878   
00879   if(!work){
00880     worksz=LM_DIF_WORKSZ(m, n); //3*n+4*m + n*m + m*m;
00881     work=(double *)malloc(worksz*sizeof(double)); /* allocate a big chunk in one step */
00882     if(!work){
00883       bpm_error( "memory allocation request failed in nr_lmdif(...)",           
00884                  __FILE__, __LINE__ );
00885       exit(1);
00886     }
00887     freework=1;
00888   }
00889   
00890   /* set up work arrays */
00891   e=work;
00892   hx=e + n;
00893   jacTe=hx + n;
00894   jac=jacTe + m;
00895   jacTjac=jac + nm;
00896   Dp=jacTjac + m*m;
00897   diag_jacTjac=Dp + m;
00898   pDp=diag_jacTjac + m;
00899   wrk=pDp + m;
00900 
00901   /* compute e=x - f(p) and its L2 norm */
00902   (*func)(p, hx, m, n, adata); nfev=1;
00903   for(i=0, p_eL2=0.0; i<n; ++i){
00904     e[i]=tmp=x[i]-hx[i];
00905     p_eL2+=tmp*tmp;
00906   }
00907   init_p_eL2=p_eL2;
00908   
00909   nu=20; /* force computation of J */
00910   
00911   for(k=0; k<itmax; ++k){
00912 
00913       
00914     /* Note that p and e have been updated at a previous iteration */
00915     
00916     if(p_eL2<=eps3){ /* error is small */
00917       stop=6;
00918       break;
00919     }
00920     
00921     /* Compute the jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
00922      * The symmetry of J^T J is again exploited for speed
00923      */
00924     
00925     if((updp && nu>16) || updjac==K){ /* compute difference approximation to J */
00926       if(using_ffdif){ /* use forward differences */
00927         nr_fdif_forw_jac_approx(func, p, hx, wrk, delta, jac, m, n, adata);
00928         ++njap; nfev+=m;
00929       }
00930       else{ /* use central differences */
00931         nr_fdif_cent_jac_approx(func, p, wrk, wrk2, delta, jac, m, n, adata);
00932         ++njap; nfev+=2*m;
00933       }
00934       nu=2; updjac=0; updp=0; newjac=1;
00935     }
00936 
00937     if(newjac){ /* jacobian has changed, recompute J^T J, J^t e, etc */
00938       newjac=0;
00939       
00940       /* J^T J, J^T e */
00941       if(nm<=__LM_BLOCKSZ__SQ){ // this is a small problem
00942         // This is the straightforward way to compute J^T J, J^T e. However, due to
00943         // its noncontinuous memory access pattern, it incures many cache misses when
00944         // applied to large minimization problems (i.e. problems involving a large
00945         // number of free variables and measurements), in which J is too large to
00946         // fit in the L1 cache. For such problems, a cache-efficient blocking scheme
00947         // is preferable.
00948         //
00949         // Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
00950         // performance problem.
00951         //
00952         // On the other hand, the straightforward algorithm is faster on small
00953         // problems since in this case it avoids the overheads of blocking. 
00955 
00956         for(i=0; i<m; ++i){
00957           for(j=i; j<m; ++j){
00958             int lm;
00959 
00960             for(l=0, tmp=0.0; l<n; ++l){
00961               lm=l*m;
00962               tmp+=jac[lm+i]*jac[lm+j];
00963             }
00964 
00965             jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
00966           }
00967 
00968           // J^T e   
00969           for(l=0, tmp=0.0; l<n; ++l)
00970             tmp+=jac[l*m+i]*e[l];
00971           jacTe[i]=tmp;
00972         }
00973       }
00974       else{ // this is a large problem
00975 
00976         /* Cache efficient computation of J^T J based on blocking
00977          */
00978         nr_trans_mat_mat_mult(jac, jacTjac, n, m);
00979 
00980         /* cache efficient computation of J^T e */
00981         for(i=0; i<m; ++i)
00982           jacTe[i]=0.0;
00983 
00984         for(i=0; i<n; ++i){
00985           register double *jacrow;
00986 
00987           for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l)
00988             jacTe[l]+=jacrow[l]*tmp;
00989         }
00990       }
00991       
00992       /* Compute ||J^T e||_inf and ||p||^2 */
00993       for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i){
00994         if ( jacTe_inf < ( tmp=FABS(jacTe[i]) )  ) jacTe_inf=tmp;
00995 
00996         diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
00997         p_L2+=p[i]*p[i];
00998       }
00999       //p_L2=sqrt(p_L2);
01000     }
01001 
01002 
01003 #if 0
01004     if(!(k%100)){
01005       printf("Current estimate: ");
01006       for(i=0; i<m; ++i)
01007         printf("%.9g (%.9g) ", p[i], jacTe[i]);
01008       printf("-- errors %.9g %0.9g \n", jacTe_inf, p_eL2 );
01009     }
01010 #endif
01011 
01012     /* check for convergence */
01013     if((jacTe_inf <= eps1)){
01014       Dp_L2=0.0; /* no increment for p in this case */
01015       stop=1;
01016       break;
01017     }
01018 
01019    /* compute initial damping factor */
01020     if(k==0){
01021       for(i=0, tmp=DBL_MIN; i<m; ++i)
01022         if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
01023       mu=tau*tmp;
01024     }
01025 
01026     /* determine increment using adaptive damping */
01027 
01028     /* augment normal equations */
01029     for(i=0; i<m; ++i)
01030       jacTjac[i*m+i]+=mu;
01031 
01032     /* solve augmented equations */
01033     issolved=nr_ax_eq_b_LU(jacTjac, jacTe, Dp, m);
01034 
01035 
01036     if(issolved){
01037       /* compute p's new estimate and ||Dp||^2 */
01038       for(i=0, Dp_L2=0.0; i<m; ++i){
01039         pDp[i]=p[i] + (tmp=Dp[i]);
01040         Dp_L2+=tmp*tmp;
01041       }
01042       //Dp_L2=sqrt(Dp_L2);
01043 
01044       if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
01045       //if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */
01046         stop=2;
01047         break;
01048       }
01049 
01050       if(Dp_L2>=(p_L2+eps2)/(CNST(LM_EPSILON)*CNST(LM_EPSILON))){ /* almost singular */
01051       //if(Dp_L2>=(p_L2+eps2)/CNST(LM_EPSILON)){ /* almost singular */
01052         stop=4;
01053         break;
01054       }
01055 
01056       (*func)(pDp, wrk, m, n, adata); ++nfev; /* evaluate function at p + Dp */
01057       for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */
01058         tmp=x[i]-wrk[i];
01059         pDp_eL2+=tmp*tmp;
01060       }
01061 
01062       dF=p_eL2-pDp_eL2;
01063       if(updp || dF>0){ /* update jac */
01064         for(i=0; i<n; ++i){
01065           for(l=0, tmp=0.0; l<m; ++l)
01066             tmp+=jac[i*m+l]*Dp[l]; /* (J * Dp)[i] */
01067           tmp=(wrk[i] - hx[i] - tmp)/Dp_L2; /* (f(p+dp)[i] - f(p)[i] - (J * Dp)[i])/(dp^T*dp) */
01068           for(j=0; j<m; ++j)
01069             jac[i*m+j]+=tmp*Dp[j];
01070         }
01071         ++updjac;
01072         newjac=1;
01073       }
01074 
01075       for(i=0, dL=0.0; i<m; ++i)
01076         dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);
01077 
01078       if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
01079         dF=(CNST(2.0)*dF/dL-CNST(1.0));
01080         tmp=dF*dF*dF;
01081         tmp=CNST(1.0)-tmp*tmp*dF;
01082         mu=mu*( (tmp>=CNST(LM_ONE_THIRD))? tmp : CNST(LM_ONE_THIRD) );
01083         nu=2;
01084 
01085         for(i=0 ; i<m; ++i) /* update p's estimate */
01086           p[i]=pDp[i];
01087 
01088         for(i=0; i<n; ++i){ /* update e, hx and ||e||_2 */
01089           e[i]=x[i]-wrk[i];
01090           hx[i]=wrk[i];
01091         }
01092         p_eL2=pDp_eL2;
01093         updp=1;
01094         continue;
01095       }
01096     }
01097 
01098     /* if this point is reached, either the linear system could not be solved or
01099      * the error did not reduce; in any case, the increment must be rejected
01100      */
01101 
01102     mu*=nu;
01103     nu2=nu<<1; // 2*nu;
01104     if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
01105       stop=5;
01106       break;
01107     }
01108     nu=nu2;
01109 
01110     for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
01111       jacTjac[i*m+i]=diag_jacTjac[i];
01112 
01113   } // iteration loop
01114 
01115   if(k>=itmax) stop=3;
01116 
01117   for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
01118     jacTjac[i*m+i]=diag_jacTjac[i];
01119 
01120   if(info){
01121     info[0]=init_p_eL2;
01122     info[1]=p_eL2;
01123     info[2]=jacTe_inf;
01124     info[3]=Dp_L2;
01125     for(i=0, tmp=DBL_MIN; i<m; ++i)
01126       if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
01127     info[4]=mu/tmp;
01128     info[5]=(double)k;
01129     info[6]=(double)stop;
01130     info[7]=(double)nfev;
01131     info[8]=(double)njap;
01132   }
01133 
01134   /* covariance matrix */
01135   if(covar){
01136     nr_lmcovar(jacTjac, covar, p_eL2, m, n);
01137   }
01138 
01139                                                                
01140   if ( freework && ( work != NULL )) free(work);
01141 
01142   if ( wrk2 ) free(wrk2);
01143 
01144   return (stop!=4)?  k : -1;
01145 }
01146 
01147 
01148 /*
01149  * This function returns the solution of Ax = b
01150  *
01151  * The function employs LU decomposition followed by forward/back substitution (see 
01152  * also the LAPACK-based LU solver above)
01153  *
01154  * A is mxm, b is mx1
01155  *
01156  * The function returns 0 in case of error,
01157  * 1 if successfull
01158  *
01159  * This function is often called repetitively to solve problems of identical
01160  * dimensions. To avoid repetitive malloc's and free's, allocated memory is
01161  * retained between calls and free'd-malloc'ed when not of the appropriate size.
01162  * A call with NULL as the first argument forces this memory to be released.
01163  */
01164 int nr_ax_eq_b_LU(double *A, double *B, double *x, int m ) {
01165   __LM_STATIC__ void *buf=NULL;
01166   __LM_STATIC__ int buf_sz=0;
01167 
01168   register int i, j, k;
01169   int *idx, maxi=-1, idx_sz, a_sz, work_sz, tot_sz;
01170   double *a, *work, max, sum, tmp;
01171   
01172 #ifdef LINSOLVERS_RETAIN_MEMORY
01173   if(!A){
01174     if(buf) free(buf);
01175     buf_sz=0;
01176     return 1;
01177   }
01178 #endif /* LINSOLVERS_RETAIN_MEMORY */
01179    
01180   /* calculate required memory size */
01181   idx_sz=m;
01182   a_sz=m*m;
01183   work_sz=m;
01184   tot_sz=idx_sz*sizeof(int) + (a_sz+work_sz)*sizeof(double);
01185   
01186 #ifdef LINSOLVERS_RETAIN_MEMORY
01187   if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
01188     if(buf) free(buf); /* free previously allocated memory */
01189     
01190     buf_sz=tot_sz;
01191     buf=(void *)malloc(tot_sz);
01192     if(!buf){
01193       bpm_error( "memory allocation request failed in nr_ax_eq_b_LU(...)",           
01194                  __FILE__, __LINE__ ); 
01195       exit(1);
01196     }
01197   }
01198 #else
01199   buf_sz=tot_sz;
01200   buf=(void *)malloc(tot_sz);
01201   if(!buf){
01202     bpm_error( "memory allocation request failed in nr_ax_eq_b_LU(...)",           
01203                __FILE__, __LINE__ ); 
01204     exit(1);
01205   }
01206 #endif /* LINSOLVERS_RETAIN_MEMORY */
01207   
01208   idx=(int *)buf;
01209   a=(double *)(idx + idx_sz);
01210   work=a + a_sz;
01211   
01212   /* avoid destroying A, B by copying them to a, x resp. */
01213   for(i=0; i<m; ++i){ // B & 1st row of A
01214     a[i]=A[i];
01215     x[i]=B[i];
01216   }
01217   for(  ; i<a_sz; ++i) a[i]=A[i]; // copy A's remaining rows
01218   /****
01219        for(i=0; i<m; ++i){
01220        for(j=0; j<m; ++j)
01221        a[i*m+j]=A[i*m+j];
01222        x[i]=B[i];
01223        }
01224   ****/
01225   
01226   /* compute the LU decomposition of a row permutation of matrix a; the permutation itself is saved in idx[] */
01227   for(i=0; i<m; ++i){
01228     max=0.0;
01229     for(j=0; j<m; ++j)
01230       if((tmp=FABS(a[i*m+j]))>max)
01231         max=tmp;
01232     if(max==0.0){
01233       bpm_error( "Singular matrix A in nr_ax_eq_b_LU(...)", __FILE__, __LINE__ ); 
01234 
01235 #ifndef LINSOLVERS_RETAIN_MEMORY
01236       free(buf);
01237 #endif
01238       
01239       return 0;
01240     }
01241     work[i]=CNST(1.0)/max;
01242   }
01243   
01244   for(j=0; j<m; ++j){
01245     for(i=0; i<j; ++i){
01246       sum=a[i*m+j];
01247       for(k=0; k<i; ++k)
01248         sum-=a[i*m+k]*a[k*m+j];
01249       a[i*m+j]=sum;
01250     }
01251     max=0.0;
01252     for(i=j; i<m; ++i){
01253       sum=a[i*m+j];
01254       for(k=0; k<j; ++k)
01255         sum-=a[i*m+k]*a[k*m+j];
01256       a[i*m+j]=sum;
01257       if((tmp=work[i]*FABS(sum))>=max){
01258         max=tmp;
01259         maxi=i;
01260       }
01261     }
01262     if(j!=maxi){
01263       for(k=0; k<m; ++k){
01264         tmp=a[maxi*m+k];
01265         a[maxi*m+k]=a[j*m+k];
01266         a[j*m+k]=tmp;
01267       }
01268       work[maxi]=work[j];
01269     }
01270     idx[j]=maxi;
01271     if(a[j*m+j]==0.0)
01272       a[j*m+j]=DBL_EPSILON;
01273     if(j!=m-1){
01274       tmp=CNST(1.0)/(a[j*m+j]);
01275       for(i=j+1; i<m; ++i)
01276         a[i*m+j]*=tmp;
01277     }
01278   }
01279   
01280   /* The decomposition has now replaced a. Solve the linear system using
01281    * forward and back substitution
01282    */
01283   for(i=k=0; i<m; ++i){
01284     j=idx[i];
01285     sum=x[j];
01286     x[j]=x[i];
01287     if(k!=0)
01288       for(j=k-1; j<i; ++j)
01289         sum-=a[i*m+j]*x[j];
01290     else
01291       if(sum!=0.0)
01292         k=i+1;
01293     x[i]=sum;
01294   }
01295   
01296   for(i=m-1; i>=0; --i){
01297     sum=x[i];
01298     for(j=i+1; j<m; ++j)
01299       sum-=a[i*m+j]*x[j];
01300     x[i]=sum/a[i*m+i];
01301   }
01302   
01303 #ifndef LINSOLVERS_RETAIN_MEMORY
01304   free(buf);
01305 #endif
01306   
01307   return 1;
01308 }
01309 
01310 // ---------------------------------------------------------------
01311 
01312 
01313 static void
01314 lm_lnsrch(int m, double *x, double f, double *g, double *p, double alpha, double *xpls,
01315        double *ffpls, void (*func)(double *p, double *hx, int m, int n, void *adata), struct lm_fstate state,
01316        int *mxtake, int *iretcd, double stepmx, double steptl, double *sx)
01317 {
01318 /* Find a next newton iterate by backtracking line search.
01319  * Specifically, finds a \lambda such that for a fixed alpha<0.5 (usually 1e-4),
01320  * f(x + \lambda*p) <= f(x) + alpha * \lambda * g^T*p
01321  *
01322  * Translated (with minor changes) from Schnabel, Koontz & Weiss uncmin.f,  v1.3
01323 
01324  * PARAMETERS :
01325 
01326  *      m       --> dimension of problem (i.e. number of variables)
01327  *      x(m)    --> old iterate:        x[k-1]
01328  *      f       --> function value at old iterate, f(x)
01329  *      g(m)    --> gradient at old iterate, g(x), or approximate
01330  *      p(m)    --> non-zero newton step
01331  *      alpha   --> fixed constant < 0.5 for line search (see above)
01332  *      xpls(m) <--      new iterate x[k]
01333  *      ffpls   <--      function value at new iterate, f(xpls)
01334  *      func    --> name of subroutine to evaluate function
01335  *      state   <--> information other than x and m that func requires.
01336  *                          state is not modified in xlnsrch (but can be modified by func).
01337  *      iretcd  <--      return code
01338  *      mxtake  <--      boolean flag indicating step of maximum length used
01339  *      stepmx  --> maximum allowable step size
01340  *      steptl  --> relative step size at which successive iterates
01341  *                          considered close enough to terminate algorithm
01342  *      sx(m)     --> diagonal scaling matrix for x, can be NULL
01343 
01344  *      internal variables
01345 
01346  *      sln              newton length
01347  *      rln              relative length of newton step
01348 */
01349 
01350     register int i;
01351     int firstback = 1;
01352     double disc;
01353     double a3, b;
01354     double t1, t2, t3, lambda, tlmbda, rmnlmb;
01355     double scl, rln, sln, slp;
01356     double tmp1, tmp2;
01357     double fpls, pfpls = 0., plmbda = 0.; /* -Wall */
01358 
01359     f*=CNST(0.5);
01360     *mxtake = 0;
01361     *iretcd = 2;
01362     tmp1 = 0.;
01363     if(!sx) /* no scaling */
01364       for (i = 0; i < m; ++i)
01365         tmp1 += p[i] * p[i];
01366     else
01367       for (i = 0; i < m; ++i)
01368         tmp1 += sx[i] * sx[i] * p[i] * p[i];
01369     sln = (double)sqrt(tmp1);
01370     if (sln > stepmx) {
01371           /*    newton step longer than maximum allowed */
01372             scl = stepmx / sln;
01373       for(i=0; i<m; ++i) /* p * scl */
01374         p[i]*=scl;
01375             sln = stepmx;
01376     }
01377     for(i=0, slp=0.; i<m; ++i) /* g^T * p */
01378       slp+=g[i]*p[i];
01379     rln = 0.;
01380     if(!sx) /* no scaling */
01381       for (i = 0; i < m; ++i) {
01382               tmp1 = (FABS(x[i])>=CNST(1.))? FABS(x[i]) : CNST(1.);
01383               tmp2 = FABS(p[i])/tmp1;
01384               if(rln < tmp2) rln = tmp2;
01385       }
01386     else
01387       for (i = 0; i < m; ++i) {
01388               tmp1 = (FABS(x[i])>=CNST(1.)/sx[i])? FABS(x[i]) : CNST(1.)/sx[i];
01389               tmp2 = FABS(p[i])/tmp1;
01390               if(rln < tmp2) rln = tmp2;
01391       }
01392     rmnlmb = steptl / rln;
01393     lambda = CNST(1.0);
01394 
01395     /*  check if new iterate satisfactory.  generate new lambda if necessary. */
01396 
01397     while(*iretcd > 1) {
01398             for (i = 0; i < m; ++i)
01399               xpls[i] = x[i] + lambda * p[i];
01400 
01401       /* evaluate function at new point */
01402       (*func)(xpls, state.hx, m, state.n, state.adata);
01403       for(i=0, tmp1=0.0; i<state.n; ++i){
01404         state.hx[i]=tmp2=state.x[i]-state.hx[i];
01405         tmp1+=tmp2*tmp2;
01406       }
01407       fpls=CNST(0.5)*tmp1; *ffpls=tmp1; ++(*(state.nfev));
01408 
01409             if (fpls <= f + slp * alpha * lambda) { /* solution found */
01410               *iretcd = 0;
01411               if (lambda == CNST(1.) && sln > stepmx * CNST(.99)) *mxtake = 1;
01412               return;
01413             }
01414 
01415             /* else : solution not (yet) found */
01416 
01417       /* First find a point with a finite value */
01418 
01419             if (lambda < rmnlmb) {
01420               /* no satisfactory xpls found sufficiently distinct from x */
01421 
01422               *iretcd = 1;
01423               return;
01424             }
01425             else { /*   calculate new lambda */
01426 
01427               /* modifications to cover non-finite values */
01428               if (fpls >= DBL_MAX) {
01429                       lambda *= CNST(0.1);
01430                       firstback = 1;
01431               }
01432               else {
01433                       if (firstback) { /*       first backtrack: quadratic fit */
01434                         tlmbda = -lambda * slp / ((fpls - f - slp) * CNST(2.));
01435                         firstback = 0;
01436                       }
01437                       else { /* all subsequent backtracks: cubic fit */
01438                         t1 = fpls - f - lambda * slp;
01439                         t2 = pfpls - f - plmbda * slp;
01440                         t3 = CNST(1.) / (lambda - plmbda);
01441                         a3 = CNST(3.) * t3 * (t1 / (lambda * lambda)
01442                                       - t2 / (plmbda * plmbda));
01443                         b = t3 * (t2 * lambda / (plmbda * plmbda)
01444                                   - t1 * plmbda / (lambda * lambda));
01445                         disc = b * b - a3 * slp;
01446                         if (disc > b * b)
01447                             /* only one positive critical point, must be minimum */
01448                                 tlmbda = (-b + ((a3 < 0)? -(double)sqrt(disc): (double)sqrt(disc))) /a3;
01449                         else
01450                             /* both critical points positive, first is minimum */
01451                                 tlmbda = (-b + ((a3 < 0)? (double)sqrt(disc): -(double)sqrt(disc))) /a3;
01452 
01453                         if (tlmbda > lambda * CNST(.5))
01454                                 tlmbda = lambda * CNST(.5);
01455                     }
01456                     plmbda = lambda;
01457                     pfpls = fpls;
01458                     if (tlmbda < lambda * CNST(.1))
01459                       lambda *= CNST(.1);
01460                     else
01461                       lambda = tlmbda;
01462       }
01463           }
01464   }
01465 } /* lm_lnsrch */
01466 
01467 /* Projections to feasible set \Omega: P_{\Omega}(y) := arg min { ||x - y|| : x \in \Omega},  y \in R^m */
01468 
01469 /* project vector p to a box shaped feasible set. p is a mx1 vector.
01470  * Either lb, ub can be NULL. If not NULL, they are mx1 vectors
01471  */
01472 static void boxProject(double *p, double *lb, double *ub, int m) {
01473   register int i;
01474   
01475   if(!lb){ /* no lower bounds */
01476     if(!ub) /* no upper bounds */
01477       return;
01478     else{ /* upper bounds only */
01479       for(i=0; i<m; ++i)
01480         if(p[i]>ub[i]) p[i]=ub[i];
01481     }
01482   }
01483   else
01484     if(!ub){ /* lower bounds only */
01485       for(i=0; i<m; ++i)
01486         if(p[i]<lb[i]) p[i]=lb[i];
01487     }
01488     else /* box bounds */
01489       for(i=0; i<m; ++i)
01490         p[i]=__LM_MEDIAN3(lb[i], p[i], ub[i]);
01491 
01492   return;
01493 }
01494 
01495 /* check box constraints for consistency */
01496 static int boxCheck(double *lb, double *ub, int m) {
01497   register int i;
01498   
01499   if(!lb || !ub) return 1;
01500   
01501   for(i=0; i<m; ++i)
01502     if(lb[i]>ub[i]) return 0;
01503   
01504   return 1;
01505 }
01506 
01507 /* 
01508  * This function seeks the parameter vector p that best describes the measurements
01509  * vector x under box constraints.
01510  * More precisely, given a vector function  func : R^m --> R^n with n>=m,
01511  * it finds p s.t. func(p) ~= x, i.e. the squared second order (i.e. L2) norm of
01512  * e=x-func(p) is minimized under the constraints lb[i]<=p[i]<=ub[i].
01513  * If no lower bound constraint applies for p[i], use -DBL_MAX/-FLT_MAX for lb[i];
01514  * If no upper bound constraint applies for p[i], use DBL_MAX/FLT_MAX for ub[i].
01515  *
01516  * This function requires an analytic jacobian. In case the latter is unavailable,
01517  * use LEVMAR_BC_DIF() bellow
01518  *
01519  * Returns the number of iterations (>=0) if successfull, -1 if failed
01520  *
01521  * For details, see C. Kanzow, N. Yamashita and M. Fukushima: "Levenberg-Marquardt
01522  * methods for constrained nonlinear equations with strong local convergence properties",
01523  * Journal of Computational and Applied Mathematics 172, 2004, pp. 375-397.
01524  * Also, see H.B. Nielsen's (http://www.imm.dtu.dk/~hbn) IMM/DTU tutorial on
01525  * unconrstrained Levenberg-Marquardt at http://www.imm.dtu.dk/courses/02611/nllsq.pdf
01526  */
01527 
01528 int nr_lmder_bc(
01529   void (*func)(double *p, double *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
01530   void (*jacf)(double *p, double *j, int m, int n, void *adata),  /* function to evaluate the jacobian \part x / \part p */ 
01531   double *p,         /* I/O: initial parameter estimates. On output has the estimated solution */
01532   double *x,         /* I: measurement vector */
01533   int m,              /* I: parameter vector dimension (i.e. #unknowns) */
01534   int n,              /* I: measurement vector dimension */
01535   double *lb,        /* I: vector of lower bounds. If NULL, no lower bounds apply */
01536   double *ub,        /* I: vector of upper bounds. If NULL, no upper bounds apply */
01537   int itmax,          /* I: maximum number of iterations */
01538   double opts[4],    /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
01539                       * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used.
01540                       * Note that ||J^T e||_inf is computed on free (not equal to lb[i] or ub[i]) variables only.
01541                       */
01542   double info[LM_INFO_SZ],
01543   /* O: information regarding the minimization. Set to NULL if don't care
01544    * info[0]= ||e||_2 at initial p.
01545    * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
01546    * info[5]= # iterations,
01547    * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
01548    *                                 2 - stopped by small Dp
01549    *                                 3 - stopped by itmax
01550    *                                 4 - singular matrix. Restart from current p with increased mu 
01551    *                                 5 - no further error reduction is possible. Restart with increased mu
01552    *                                 6 - stopped by small ||e||_2
01553    * info[7]= # function evaluations
01554    * info[8]= # jacobian evaluations
01555    */
01556   double *work,     /* working memory, allocate if NULL */
01557   double *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
01558   void *adata)       /* pointer to possibly additional data, passed uninterpreted to func & jacf.
01559                       * Set to NULL if not needed
01560                       */
01561 {
01562   register int i, j, k, l;
01563   int worksz, freework=0, issolved;
01564 /* temp work arrays */
01565   double *e,          /* nx1 */
01566     *hx,         /* \hat{x}_i, nx1 */
01567     *jacTe,      /* J^T e_i mx1 */
01568     *jac,        /* nxm */
01569     *jacTjac,    /* mxm */
01570     *Dp,         /* mx1 */
01571     *diag_jacTjac,   /* diagonal of J^T J, mx1 */
01572     *pDp;        /* p + Dp, mx1 */
01573   
01574   register double mu,  /* damping constant */
01575     tmp; /* mainly used in matrix & vector multiplications */
01576   double p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */
01577   double p_L2, Dp_L2=DBL_MAX, dF, dL;
01578   double tau, eps1, eps2, eps2_sq, eps3;
01579   double init_p_eL2;
01580   int nu=2, nu2, stop, nfev, njev=0;
01581   const int nm=n*m;
01582   
01583 /* variables for constrained LM */
01584   struct lm_fstate fstate;
01585   double alpha=CNST(1e-4), beta=CNST(0.9), gamma=CNST(0.99995), gamma_sq=gamma*gamma, rho=CNST(1e-8);
01586   double t, t0;
01587   double steptl=CNST(1e3)*(double)sqrt(DBL_EPSILON), jacTeDp;
01588   double tmin=CNST(1e-12), tming=CNST(1e-18); /* minimum step length for LS and PG steps */
01589   const double tini=CNST(1.0); /* initial step length for LS and PG steps */
01590   int nLMsteps=0, nLSsteps=0, nPGsteps=0, gprevtaken=0;
01591   int numactive;
01592   
01593   mu=jacTe_inf=t=0.0;  tmin=tmin; /* -Wall */
01594   
01595   if(n<m){
01596     bpm_error( "Cannot solve a problem with fewer measurements than unknowns",           
01597                __FILE__, __LINE__ );
01598     exit(1);
01599   }
01600 
01601   if(!jacf){
01602     bpm_error( "No function specified for computing the jacobian in",
01603                __FILE__, __LINE__ );
01604     exit(1);
01605   }
01606 
01607   if(!boxCheck(lb, ub, m)){
01608     bpm_error( "At least one lower bound exceeds the upper one", 
01609                __FILE__, __LINE__ );
01610     exit(1);
01611   }
01612   
01613   if(opts){
01614     tau=opts[0];
01615     eps1=opts[1];
01616     eps2=opts[2];
01617     eps2_sq=opts[2]*opts[2];
01618     eps3=opts[3];
01619   }
01620   else{ // use default values
01621     tau=CNST(LM_INIT_MU);
01622     eps1=CNST(LM_STOP_THRESH);
01623     eps2=CNST(LM_STOP_THRESH);
01624     eps2_sq=CNST(LM_STOP_THRESH)*CNST(LM_STOP_THRESH);
01625     eps3=CNST(LM_STOP_THRESH);
01626   }
01627   
01628   if(!work){
01629     worksz=LM_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m;
01630     work=(double *)malloc(worksz*sizeof(double)); /* allocate a big chunk in one step */
01631     if(!work){
01632       bpm_error( "Memory allocation request failed", __FILE__, __LINE__ );
01633       exit(1);
01634     }
01635     freework=1;
01636   }
01637 
01638   /* set up work arrays */
01639   e=work;
01640   hx=e + n;
01641   jacTe=hx + n;
01642   jac=jacTe + m;
01643   jacTjac=jac + nm;
01644   Dp=jacTjac + m*m;
01645   diag_jacTjac=Dp + m;
01646   pDp=diag_jacTjac + m;
01647 
01648   fstate.n=n;
01649   fstate.hx=hx;
01650   fstate.x=x;
01651   fstate.adata=adata;
01652   fstate.nfev=&nfev;
01653   
01654   /* see if starting point is within the feasile set */
01655   for(i=0; i<m; ++i)
01656     pDp[i]=p[i];
01657   boxProject(p, lb, ub, m); /* project to feasible set */
01658   for(i=0; i<m; ++i)
01659     if(pDp[i]!=p[i])
01660       fprintf(stderr, "Warning: component %d of starting point not feasible in nr_lmder_bc! [%g projected to %g]\n", i, p[i], pDp[i]);
01661   
01662   /* compute e=x - f(p) and its L2 norm */
01663   (*func)(p, hx, m, n, adata); nfev=1;
01664   for(i=0, p_eL2=0.0; i<n; ++i){
01665     e[i]=tmp=x[i]-hx[i];
01666     p_eL2+=tmp*tmp;
01667   }
01668   init_p_eL2=p_eL2;
01669   
01670   for(k=stop=0; k<itmax && !stop; ++k){
01671     //printf("%d  %.15g\n", k, 0.5*p_eL2);
01672     /* Note that p and e have been updated at a previous iteration */
01673     
01674     if(p_eL2<=eps3){ /* error is small */
01675       stop=6;
01676       break;
01677     }
01678     
01679     /* Compute the jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
01680      * Since J^T J is symmetric, its computation can be speeded up by computing
01681      * only its upper triangular part and copying it to the lower part
01682      */
01683     
01684     (*jacf)(p, jac, m, n, adata); ++njev;
01685     
01686     /* J^T J, J^T e */
01687     if(nm<__LM_BLOCKSZ__SQ){ // this is a small problem
01688       /* This is the straightforward way to compute J^T J, J^T e. However, due to
01689        * its noncontinuous memory access pattern, it incures many cache misses when
01690        * applied to large minimization problems (i.e. problems involving a large
01691        * number of free variables and measurements), in which J is too large to
01692        * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
01693        * is preferable.
01694        *
01695        * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
01696        * performance problem.
01697        *
01698        * On the other hand, the straightforward algorithm is faster on small
01699        * problems since in this case it avoids the overheads of blocking. 
01700        */
01701       
01702       for(i=0; i<m; ++i){
01703         for(j=i; j<m; ++j){
01704           int lm;
01705           
01706           for(l=0, tmp=0.0; l<n; ++l){
01707             lm=l*m;
01708             tmp+=jac[lm+i]*jac[lm+j];
01709           }
01710           
01711           /* store tmp in the corresponding upper and lower part elements */
01712           jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
01713         }
01714         
01715         /* J^T e */
01716         for(l=0, tmp=0.0; l<n; ++l)
01717           tmp+=jac[l*m+i]*e[l];
01718         jacTe[i]=tmp;
01719       }
01720     }
01721     else{ // this is a large problem
01722       /* Cache efficient computation of J^T J based on blocking
01723        */
01724       nr_trans_mat_mat_mult(jac, jacTjac, n, m);
01725 
01726       /* cache efficient computation of J^T e */
01727       for(i=0; i<m; ++i)
01728         jacTe[i]=0.0;
01729       
01730       for(i=0; i<n; ++i){
01731         register double *jacrow;
01732         
01733         for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l)
01734           jacTe[l]+=jacrow[l]*tmp;
01735       }
01736     }
01737     
01738     /* Compute ||J^T e||_inf and ||p||^2. Note that ||J^T e||_inf
01739      * is computed for free (i.e. inactive) variables only. 
01740      * At a local minimum, if p[i]==ub[i] then g[i]>0;
01741      * if p[i]==lb[i] g[i]<0; otherwise g[i]=0 
01742      */
01743     for(i=j=numactive=0, p_L2=jacTe_inf=0.0; i<m; ++i){
01744       if(ub && p[i]==ub[i]){ ++numactive; if(jacTe[i]>0.0) ++j; }
01745       else if(lb && p[i]==lb[i]){ ++numactive; if(jacTe[i]<0.0) ++j; }
01746       else if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
01747       
01748       diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
01749       p_L2+=p[i]*p[i];
01750     }
01751     //p_L2=sqrt(p_L2);
01752     
01753 #if 0
01754     if(!(k%100)){
01755       printf("Current estimate: ");
01756       for(i=0; i<m; ++i)
01757         printf("%.9g ", p[i]);
01758       printf("-- errors %.9g %0.9g, #active %d [%d]\n", jacTe_inf, p_eL2, numactive, j);
01759     }
01760 #endif
01761     
01762     /* check for convergence */
01763     if(j==numactive && (jacTe_inf <= eps1)){
01764       Dp_L2=0.0; /* no increment for p in this case */
01765       stop=1;
01766       break;
01767     }
01768     
01769     /* compute initial damping factor */
01770     if(k==0){
01771       if(!lb && !ub){ /* no bounds */
01772         for(i=0, tmp=DBL_MIN; i<m; ++i)
01773           if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
01774         mu=tau*tmp;
01775       }
01776       else 
01777         mu=CNST(0.5)*tau*p_eL2; /* use Kanzow's starting mu */
01778     }
01779     
01780     /* determine increment using a combination of adaptive damping, line search and projected gradient search */
01781     while(1){
01782       /* augment normal equations */
01783       for(i=0; i<m; ++i)
01784         jacTjac[i*m+i]+=mu;
01785       
01786       /* solve augmented equations, USE INTERNAL ROUTINE, OTHERWISE USE LAPACK... */
01787       issolved = nr_ax_eq_b_LU(jacTjac, jacTe, Dp, m);
01788 
01789       if(issolved){
01790         for(i=0; i<m; ++i)
01791           pDp[i]=p[i] + Dp[i];
01792         
01793         /* compute p's new estimate and ||Dp||^2 */
01794         boxProject(pDp, lb, ub, m); /* project to feasible set */
01795         for(i=0, Dp_L2=0.0; i<m; ++i){
01796           Dp[i]=tmp=pDp[i]-p[i];
01797           Dp_L2+=tmp*tmp;
01798         }
01799         //Dp_L2=sqrt(Dp_L2);
01800         
01801         if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
01802           stop=2;
01803           break;
01804         }
01805         
01806         if(Dp_L2>=(p_L2+eps2)/(CNST(LM_EPSILON)*CNST(LM_EPSILON))){ /* almost singular */
01807           stop=4;
01808           break;
01809         }
01810         
01811         (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */
01812         for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */
01813           hx[i]=tmp=x[i]-hx[i];
01814           pDp_eL2+=tmp*tmp;
01815         }
01816         
01817         if(pDp_eL2<=gamma_sq*p_eL2){
01818           for(i=0, dL=0.0; i<m; ++i)
01819             dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);
01820           
01821 #if 1
01822           if(dL>0.0){
01823             dF=p_eL2-pDp_eL2;
01824             tmp=(CNST(2.0)*dF/dL-CNST(1.0));
01825             tmp=CNST(1.0)-tmp*tmp*tmp;
01826             mu=mu*( (tmp>=CNST(LM_ONE_THIRD))? tmp : CNST(LM_ONE_THIRD) );
01827           }
01828           else
01829             mu=(mu>=pDp_eL2)? pDp_eL2 : mu; /* pDp_eL2 is the new pDp_eL2 */
01830 #else
01831           
01832           mu=(mu>=pDp_eL2)? pDp_eL2 : mu; /* pDp_eL2 is the new pDp_eL2 */
01833 #endif
01834           
01835           nu=2;
01836           
01837           for(i=0 ; i<m; ++i) /* update p's estimate */
01838             p[i]=pDp[i];
01839           
01840           for(i=0; i<n; ++i) /* update e and ||e||_2 */
01841             e[i]=hx[i];
01842           p_eL2=pDp_eL2;
01843           ++nLMsteps;
01844           gprevtaken=0;
01845           break;
01846         }
01847       }
01848       else{
01849         
01850         /* the augmented linear system could not be solved, increase mu */
01851         
01852         mu*=nu;
01853         nu2=nu<<1; // 2*nu;
01854         if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
01855           stop=5;
01856           break;
01857         }
01858         nu=nu2;
01859         
01860         for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
01861           jacTjac[i*m+i]=diag_jacTjac[i];
01862         
01863         continue; /* solve again with increased nu */
01864       }
01865       
01866       /* if this point is reached, the LM step did not reduce the error;
01867        * see if it is a descent direction
01868        */
01869       
01870       /* negate jacTe (i.e. g) & compute g^T * Dp */
01871       for(i=0, jacTeDp=0.0; i<m; ++i){
01872         jacTe[i]=-jacTe[i];
01873         jacTeDp+=jacTe[i]*Dp[i];
01874       }
01875       
01876       if(jacTeDp<=-rho*pow(Dp_L2, _LM_POW_/CNST(2.0))){
01877         /* Dp is a descent direction; do a line search along it */
01878         int mxtake, iretcd;
01879         double stepmx;
01880         
01881         tmp=(double)sqrt(p_L2); stepmx=CNST(1e3)*( (tmp>=CNST(1.0))? tmp : CNST(1.0) );
01882         
01883 #if 1
01884         /* use Schnabel's backtracking line search; it requires fewer "func" evaluations */
01885         lm_lnsrch(m, p, p_eL2, jacTe, Dp, alpha, pDp, &pDp_eL2, func, fstate,
01886                   &mxtake, &iretcd, stepmx, steptl, NULL); /* NOTE: lm_lnsrch() updates hx */
01887         if(iretcd!=0) goto gradproj; /* rather inelegant but effective way to handle lm_lnsrch() failures... */
01888 #else
01889         /* use the simpler (but slower!) line search described by Kanzow */
01890         for(t=tini; t>tmin; t*=beta){
01891           for(i=0; i<m; ++i){
01892             pDp[i]=p[i] + t*Dp[i];
01893             //pDp[i]=__LM_MEDIAN3(lb[i], pDp[i], ub[i]); /* project to feasible set */
01894           }
01895           
01896           (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + t*Dp */
01897           for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */
01898             hx[i]=tmp=x[i]-hx[i];
01899             pDp_eL2+=tmp*tmp;
01900           }
01901           //if(CNST(0.5)*pDp_eL2<=CNST(0.5)*p_eL2 + t*alpha*jacTeDp) break;
01902           if(pDp_eL2<=p_eL2 + CNST(2.0)*t*alpha*jacTeDp) break;
01903         }
01904 #endif
01905         ++nLSsteps;
01906         gprevtaken=0;
01907         
01908         /* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2.
01909          * These values are used below to update their corresponding variables 
01910          */
01911       }
01912       else{
01913       gradproj: /* Note that this point can also be reached via a goto when lm_lnsrch() fails */
01914         
01915         /* jacTe is a descent direction; make a projected gradient step */
01916         
01917         /* if the previous step was along the gradient descent, try to use the t employed in that step */
01918         /* compute ||g|| */
01919         for(i=0, tmp=0.0; i<m; ++i)
01920           tmp=jacTe[i]*jacTe[i];
01921         tmp=(double)sqrt(tmp);
01922         tmp=CNST(100.0)/(CNST(1.0)+tmp);
01923         t0=(tmp<=tini)? tmp : tini; /* guard against poor scaling & large steps; see (3.50) in C.T. Kelley's book */
01924         
01925         for(t=(gprevtaken)? t : t0; t>tming; t*=beta){
01926           for(i=0; i<m; ++i)
01927             pDp[i]=p[i] - t*jacTe[i];
01928           boxProject(pDp, lb, ub, m); /* project to feasible set */
01929           for(i=0; i<m; ++i)
01930             Dp[i]=pDp[i]-p[i];
01931           
01932           (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p - t*g */
01933           for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */
01934             hx[i]=tmp=x[i]-hx[i];
01935             pDp_eL2+=tmp*tmp;
01936           }
01937           for(i=0, tmp=0.0; i<m; ++i) /* compute ||g^T * Dp|| */
01938             tmp+=jacTe[i]*Dp[i];
01939           
01940           if(gprevtaken && pDp_eL2<=p_eL2 + CNST(2.0)*CNST(0.99999)*tmp){ /* starting t too small */
01941             t=t0;
01942             gprevtaken=0;
01943             continue;
01944           }
01945           //if(CNST(0.5)*pDp_eL2<=CNST(0.5)*p_eL2 + alpha*tmp) break;
01946           if(pDp_eL2<=p_eL2 + CNST(2.0)*alpha*tmp) break;
01947         }
01948         
01949         ++nPGsteps;
01950         gprevtaken=1;
01951         /* NOTE: new estimate for p is in pDp, associated error in hx and its norm in pDp_eL2 */
01952       }
01953       
01954       /* update using computed values */
01955 
01956       for(i=0, Dp_L2=0.0; i<m; ++i){
01957         tmp=pDp[i]-p[i];
01958         Dp_L2+=tmp*tmp;
01959       }
01960       //Dp_L2=sqrt(Dp_L2);
01961 
01962       if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
01963         stop=2;
01964         break;
01965       }
01966 
01967       for(i=0 ; i<m; ++i) /* update p's estimate */
01968         p[i]=pDp[i];
01969 
01970       for(i=0; i<n; ++i) /* update e and ||e||_2 */
01971         e[i]=hx[i];
01972       p_eL2=pDp_eL2;
01973       break;
01974     } /* inner loop */
01975   }
01976 
01977   if(k>=itmax) stop=3;
01978 
01979   for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
01980     jacTjac[i*m+i]=diag_jacTjac[i];
01981   
01982   if(info){
01983     info[0]=init_p_eL2;
01984     info[1]=p_eL2;
01985     info[2]=jacTe_inf;
01986     info[3]=Dp_L2;
01987     for(i=0, tmp=DBL_MIN; i<m; ++i)
01988       if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
01989     info[4]=mu/tmp;
01990     info[5]=(double)k;
01991     info[6]=(double)stop;
01992     info[7]=(double)nfev;
01993     info[8]=(double)njev;
01994   }
01995 
01996   /* covariance matrix */
01997   if(covar){
01998     nr_lmcovar(jacTjac, covar, p_eL2, m, n);
01999   }
02000   
02001   if(freework) free(work);
02002   
02003 #if 0
02004   printf("%d LM steps, %d line search, %d projected gradient\n", nLMsteps, nLSsteps, nPGsteps);
02005 #endif
02006   
02007   return (stop!=4)?  k : -1;
02008 }
02009 
02010 
02011 /* following struct & LMBC_DIF_XXX functions won't be necessary if a true secant
02012  * version of LEVMAR_BC_DIF() is implemented...
02013  */
02014 struct lmbc_dif_data{
02015   void (*func)(double *p, double *hx, int m, int n, void *adata);
02016   double *hx, *hxx;
02017   void *adata;
02018   double delta;
02019 };
02020 
02021 void lmbc_dif_func(double *p, double *hx, int m, int n, void *data) {
02022   struct lmbc_dif_data *dta=(struct lmbc_dif_data *)data;
02023   
02024   /* call user-supplied function passing it the user-supplied data */
02025   (*(dta->func))(p, hx, m, n, dta->adata);
02026   return;
02027 }
02028 
02029 void lmbc_dif_jacf(double *p, double *jac, int m, int n, void *data) {
02030   struct lmbc_dif_data *dta=(struct lmbc_dif_data *)data;
02031   
02032   /* evaluate user-supplied function at p */
02033   (*(dta->func))(p, dta->hx, m, n, dta->adata);
02034   nr_fdif_forw_jac_approx(dta->func, p, dta->hx, dta->hxx, dta->delta, jac, m, n, dta->adata);
02035   return;
02036 }
02037 
02038 
02039 /* No jacobian version of the LEVMAR_BC_DER() function above: the jacobian is approximated with 
02040  * the aid of finite differences (forward or central, see the comment for the opts argument)
02041  * Ideally, this function should be implemented with a secant approach. Currently, it just calls
02042  * LEVMAR_BC_DER()
02043  */
02044 int nr_lmdif_bc(
02045   void (*func)(double *p, double *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
02046   double *p,         /* I/O: initial parameter estimates. On output has the estimated solution */
02047   double *x,         /* I: measurement vector */
02048   int m,              /* I: parameter vector dimension (i.e. #unknowns) */
02049   int n,              /* I: measurement vector dimension */
02050   double *lb,        /* I: vector of lower bounds. If NULL, no lower bounds apply */
02051   double *ub,        /* I: vector of upper bounds. If NULL, no upper bounds apply */
02052   int itmax,          /* I: maximum number of iterations */
02053   double opts[5],    /* I: opts[0-4] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the
02054                       * scale factor for initial \mu, stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2 and
02055                       * the step used in difference approximation to the jacobian. Set to NULL for defaults to be used.
02056                       * If \delta<0, the jacobian is approximated with central differences which are more accurate
02057                       * (but slower!) compared to the forward differences employed by default. 
02058                       */
02059   double info[LM_INFO_SZ],
02060   /* O: information regarding the minimization. Set to NULL if don't care
02061    * info[0]= ||e||_2 at initial p.
02062    * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
02063    * info[5]= # iterations,
02064    * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
02065    *                                 2 - stopped by small Dp
02066    *                                 3 - stopped by itmax
02067    *                                 4 - singular matrix. Restart from current p with increased mu 
02068    *                                 5 - no further error reduction is possible. Restart with increased mu
02069    *                                 6 - stopped by small ||e||_2
02070    * info[7]= # function evaluations
02071    * info[8]= # jacobian evaluations
02072    */
02073   double *work,     /* working memory, allocate if NULL */
02074   double *covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
02075   void *adata)       /* pointer to possibly additional data, passed uninterpreted to func.
02076                       * Set to NULL if not needed
02077                       */
02078 {
02079   struct lmbc_dif_data data;
02080   int ret;
02081   
02082   //fprintf(stderr, RCAT("\nWarning: current implementation of ", LEVMAR_BC_DIF) "() does not use a secant approach!\n\n");
02083   
02084   data.func=func;
02085   data.hx=(double *)malloc(2*n*sizeof(double)); /* allocate a big chunk in one step */
02086   if(!data.hx){
02087     bpm_error( "memory allocation request failed in nr_lmdif_bc(...)",           
02088                __FILE__, __LINE__ );
02089     exit(1);
02090   }
02091   data.hxx=data.hx+n;
02092   data.adata=adata;
02093   data.delta=(opts)? FABS(opts[4]) : (double)LM_DIFF_DELTA; // no central differences here...
02094   
02095   ret=nr_lmder_bc( lmbc_dif_func, lmbc_dif_jacf, p, x, m, n, lb, ub, 
02096                    itmax, opts, info, work, covar, (void *)&data );
02097   
02098   if(info) /* correct the number of function calls */
02099     info[7]+=info[8]*(m+1); /* each jacobian evaluation costs m+1 function calls */
02100   
02101   free(data.hx);
02102   
02103   return ret;
02104 }