31 # ifndef WIN32_LEAN_AND_MEAN 32 # define WIN32_LEAN_AND_MEAN 33 # endif // WIN32_LEAN_AND_MEAN 53 template<
class T>
int SparseMatrix<T>::UseAlloc=0;
61 internalAllocator.set(blockSize);
73 _maxEntriesPerRow = 0;
85 if( M._contiguous )
Resize( M.
rows , M._maxEntriesPerRow );
87 for(
int i=0 ; i<
rows ; i++ )
103 if( M._contiguous )
Resize( M.
rows , M._maxEntriesPerRow );
105 for(
int i=0 ; i<
rows ; i++ )
119 FILE* fp = fopen( fileName ,
"wb" );
120 if( !fp )
return false;
121 bool ret =
write( fp );
128 FILE* fp = fopen( fileName ,
"rb" );
129 if( !fp )
return false;
130 bool ret =
read( fp );
137 if( fwrite( &
rows ,
sizeof(
int ) , 1 , fp )!=1 )
return false;
139 for(
int i=0 ; i<rows ; i++ ) if( fwrite( (*this)[i] , sizeof( MatrixEntry< T > ) ,
rowSizes[i] , fp )!=
rowSizes[i] )
return false;
146 if( fread( &r ,
sizeof(
int ) , 1 , fp )!=1 )
return false;
149 for(
int i=0 ; i<
rows ; i++ )
167 if( _contiguous ){
if( _maxEntriesPerRow ) free(
m_ppElements[0] ); }
175 rowSizes = (
int* )malloc(
sizeof(
int ) * r );
176 memset(
rowSizes , 0 ,
sizeof(
int ) * r );
180 _maxEntriesPerRow = 0;
188 if( _contiguous ){
if( _maxEntriesPerRow ) free(
m_ppElements[0] ); }
196 rowSizes = (
int* )malloc(
sizeof(
int ) * r );
197 memset(
rowSizes , 0 ,
sizeof(
int ) * r );
203 _maxEntriesPerRow = e;
211 if( count>_maxEntriesPerRow ) fprintf( stderr ,
"[ERROR] Cannot set row size on contiguous matrix: %d<=%d\n" , count , _maxEntriesPerRow ) , exit( 0 );
214 else if( row>=0 && row<
rows )
229 Resize(this->m_N, this->m_M);
236 for(
int ij=0; ij < Min( this->Rows(), this->Columns() ); ij++)
237 (*
this)(ij,ij) = T(1);
251 for (
int i=0; i<this->Rows(); i++)
262 for(
int i=0; i<R.Rows(); i++){
280 for (
int i=0; i<
rows; i++)
295 #pragma omp parallel for num_threads( threads ) schedule( static ) 296 for(
int i=0 ; i<
rows ; i++ )
322 for (
int i=0; i<this->Rows(); i++)
345 double delta_new , delta_0;
346 for(
int i=0 ; i<r.Dimensions() ; i++ ) delta_new += r.
m_pV[i] * r.m_pV[i];
348 if( delta_new<eps )
return 0;
352 for( ii=0; ii<iters && delta_new>eps*delta_0 ; ii++ )
355 double dDotQ = 0 , alpha = 0;
357 alpha = delta_new / dDotQ;
358 #pragma omp parallel
for num_threads( threads ) schedule(
static )
359 for(
int i=0 ; i<r.Dimensions() ; i++ ) solution.
m_pV[i] += d.
m_pV[i] * T2( alpha );
363 M.
Multiply( solution , r , threads );
367 #pragma omp parallel for num_threads( threads ) schedule( static ) 368 for(
int i=0 ; i<r.Dimensions() ; i++ ) r.
m_pV[i] = r.m_pV[i] - q.
m_pV[i] * T2(alpha);
370 double delta_old = delta_new , beta;
372 for(
int i=0 ; i<r.Dimensions() ; i++ ) delta_new += r.m_pV[i]*r.m_pV[i];
373 beta = delta_new / delta_old;
374 #pragma omp parallel
for num_threads( threads ) schedule(
static )
375 for(
int i=0 ; i<d.
Dimensions() ; i++ ) d.
m_pV[i] = r.m_pV[i] + d.
m_pV[i] * T2( beta );
389 solution.
Resize(M.Columns());
394 for(i=0;i<iters && rDotR>eps;i++){
397 alpha=rDotR/d.
Dot(Md);
423 for(
int i=0; i<SparseMatrix<T>::rows; i++){
424 for(
int ii=0;ii<SparseMatrix<T>::rowSizes[i];ii++){
438 const T2* in = &In[0];
443 for(
int i=0 ; i<SparseMatrix<T>::rows ; i++ ) dcTerm += in[i];
450 const T2& in_i_ = in[i];
452 for( ; temp!=end ; temp++ )
461 if( addDCTerm )
for(
int i=0 ; i<SparseMatrix<T>::rows ; i++ ) out[i] += dcTerm;
468 const T2* in = &In[0];
469 int threads = OutScratch.
threads();
473 #pragma omp parallel for num_threads( threads ) reduction ( + : dcTerm ) 474 for(
int t=0 ; t<threads ; t++ )
476 T2* out = OutScratch[t];
477 memset( out , 0 ,
sizeof( T2 ) * dim );
480 const T2& in_i_ = in[i];
495 #pragma omp parallel for num_threads( threads ) schedule( static ) 496 for(
int i=0 ; i<dim ; i++ )
499 for(
int t=0 ; t<threads ; t++ ) _out += OutScratch[t][i];
505 #pragma omp parallel for num_threads( threads ) 506 for(
int t=0 ; t<threads ; t++ )
508 T2* out = OutScratch[t];
509 memset( out , 0 ,
sizeof( T2 ) * dim );
512 const T2& in_i_ = in[i];
525 #pragma omp parallel for num_threads( threads ) schedule( static ) 526 for(
int i=0 ; i<dim ; i++ )
529 for(
int t=0 ; t<threads ; t++ ) _out += OutScratch[t][i];
539 const T2* in = &In[0];
540 int threads = OutScratch.size();
541 #pragma omp parallel for num_threads( threads ) 542 for(
int t=0 ; t<threads ; t++ )
543 for(
int i=0 ; i<dim ; i++ ) OutScratch[t][i] = T2(0);
544 #pragma omp parallel for num_threads( threads ) 545 for(
int t=0 ; t<threads ; t++ )
547 T2* out = OutScratch[t];
548 for(
int i=bounds[t] ; i<bounds[t+1] ; i++ )
552 const T2& in_i_ = in[i];
554 for( ; temp!=end ; temp++ )
564 #pragma omp parallel for num_threads( threads ) schedule( static ) 569 for(
int t=0 ; t<threads ; t++ ) _out += OutScratch[t][i];
572 #if defined _WIN32 && !defined __MINGW32__ 573 #ifndef _AtomicIncrement_ 574 #define _AtomicIncrement_ 575 inline void AtomicIncrement(
volatile float* ptr ,
float addend )
577 float newValue = *ptr;
578 LONG& _newValue = *( (LONG*)&newValue );
582 _oldValue = _newValue;
584 _newValue = InterlockedCompareExchange( (LONG*) ptr , _newValue , _oldValue );
585 if( _newValue==_oldValue )
break;
588 inline void AtomicIncrement(
volatile double* ptr ,
double addend )
591 double newValue = *ptr;
592 LONGLONG& _newValue = *( (LONGLONG*)&newValue );
596 _oldValue = _newValue;
598 _newValue = InterlockedCompareExchange64( (LONGLONG*) ptr , _newValue , _oldValue );
600 while( _newValue!=_oldValue );
602 #endif // _AtomicIncrement_ 607 const float* in = &In[0];
608 float* out = &Out[0];
610 #pragma omp parallel for num_threads( threads ) 611 for(
int t=0 ; t<threads ; t++ )
612 for(
int i=partition[t] ; i<partition[t+1] ; i++ )
616 const float& in_i = in[i];
618 for( ; temp!=end ; temp++ )
621 float v = temp->
Value;
623 AtomicIncrement( out+j , v * in_i );
625 AtomicIncrement( out+i , out_i );
628 #pragma omp parallel for num_threads( threads ) 629 for(
int i=0 ; i<A.
rows ; i++ )
633 const float& in_i = in[i];
635 for( ; temp!=end ; temp++ )
638 float v = temp->
Value;
640 AtomicIncrement( out+j , v * in_i );
642 AtomicIncrement( out+i , out_i );
649 const double* in = &In[0];
650 double* out = &Out[0];
653 #pragma omp parallel for num_threads( threads ) 654 for(
int t=0 ; t<threads ; t++ )
655 for(
int i=partition[t] ; i<partition[t+1] ; i++ )
659 const double& in_i = in[i];
661 for( ; temp!=end ; temp++ )
666 AtomicIncrement( out+j , v * in_i );
668 AtomicIncrement( out+i , out_i );
671 #pragma omp parallel for num_threads( threads ) 672 for(
int i=0 ; i<A.
rows ; i++ )
676 const double& in_i = in[i];
678 for( ; temp!=end ; temp++ )
683 AtomicIncrement( out+j , v * in_i );
685 AtomicIncrement( out+i , out_i );
702 if( solveNormal ) temp.
Resize( dim );
703 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
704 const T2* _b = &b[0];
706 std::vector< int > partition( threads+1 );
709 for(
int i=0 ; i<A.
rows ; i++ ) eCount += A.
rowSizes[i];
711 #pragma omp parallel
for num_threads( threads )
712 for(
int t=0 ; t<threads ; t++ )
715 for(
int i=0 ; i<A.
rows ; i++ )
718 if( _eCount*threads>=eCount*(t+1) )
725 partition[threads] = A.
rows;
729 MultiplyAtomic( A , x , temp , threads , &partition[0] );
730 MultiplyAtomic( A , temp , r , threads , &partition[0] );
731 MultiplyAtomic( A , b , temp , threads , &partition[0] );
732 #pragma omp parallel for num_threads( threads ) schedule( static ) 733 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i];
737 MultiplyAtomic( A , x , r , threads , &partition[0] );
738 #pragma omp parallel for num_threads( threads ) schedule( static ) 739 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i];
741 double delta_new = 0 , delta_0;
742 for(
size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
746 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
750 for( ii=0; ii<iters && delta_new>eps*delta_0 ; ii++ )
752 if( solveNormal ) MultiplyAtomic( A , d , temp , threads , &partition[0] ) , MultiplyAtomic( A , temp , q , threads , &partition[0] );
753 else MultiplyAtomic( A , d , q , threads , &partition[0] );
755 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
756 T2 alpha = T2( delta_new / dDotQ );
757 #pragma omp parallel for num_threads( threads ) schedule( static ) 758 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
759 if( (ii%50)==(50-1) )
762 if( solveNormal ) MultiplyAtomic( A , x , temp , threads , &partition[0] ) , MultiplyAtomic( A , temp , r , threads , &partition[0] );
763 else MultiplyAtomic( A , x , r , threads , &partition[0] );
764 #pragma omp parallel for num_threads( threads ) schedule( static ) 765 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i];
768 #pragma omp parallel for num_threads( threads ) schedule( static ) 769 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha;
771 double delta_old = delta_new;
773 for(
size_t i=0 ; i<dim ; i++ ) delta_new += _r[i] * _r[i];
774 T2 beta = T2( delta_new / delta_old );
775 #pragma omp parallel for num_threads( threads ) schedule( static ) 776 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
780 #endif // _WIN32 && !__MINGW32__ 785 int threads = scratch.
threads();
789 if( reset ) x.
Resize( dim );
790 if( solveNormal ) temp.Resize( dim );
791 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
792 const T2* _b = &b[0];
794 double delta_new = 0 , delta_0;
797 A.
Multiply( x , temp , scratch , addDCTerm ) , A.
Multiply( temp , r , scratch , addDCTerm ) , A.
Multiply( b , temp , scratch , addDCTerm );
798 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new ) 799 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
803 A.
Multiply( x , r , scratch , addDCTerm );
804 #pragma omp parallel for num_threads( threads ) reduction ( + : delta_new ) 805 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
810 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
814 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
816 if( solveNormal ) A.
Multiply( d , temp , scratch , addDCTerm ) , A.
Multiply( temp , q , scratch , addDCTerm );
817 else A.
Multiply( d , q , scratch , addDCTerm );
819 #pragma omp parallel for num_threads( threads ) reduction( + : dDotQ ) 820 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
821 T2 alpha = T2( delta_new / dDotQ );
822 double delta_old = delta_new;
824 if( (ii%50)==(50-1) )
826 #pragma omp parallel for num_threads( threads ) 827 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
829 if( solveNormal ) A.
Multiply( x , temp , scratch , addDCTerm ) , A.
Multiply( temp , r , scratch , addDCTerm );
830 else A.
Multiply( x , r , scratch , addDCTerm );
831 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new ) 832 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
835 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new ) 836 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
838 T2 beta = T2( delta_new / delta_old );
839 #pragma omp parallel for num_threads( threads ) 840 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
851 if( threads<1 ) threads = 1;
852 if( threads>1 ) outScratch.
resize( threads , dim );
853 if( reset ) x.
Resize( dim );
856 if( solveNormal ) temp.
Resize( dim );
857 T2 *_x = &x[0] , *_r = &r[0] , *_d = &d[0] , *_q = &q[0];
858 const T2* _b = &b[0];
860 double delta_new = 0 , delta_0;
864 if( threads>1 ) A.
Multiply( x , temp , outScratch , addDCTerm ) , A.
Multiply( temp , r , outScratch , addDCTerm ) , A.
Multiply( b , temp , outScratch , addDCTerm );
866 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new ) 867 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = temp[i] - _r[i] , delta_new += _r[i] * _r[i];
871 if( threads>1 ) A.
Multiply( x , r , outScratch , addDCTerm );
872 else A.
Multiply( x , r , addDCTerm );
873 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new ) 874 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i];
880 fprintf( stderr ,
"[WARNING] Initial residual too low: %g < %f\n" , delta_new , eps );
884 for( ii=0 ; ii<iters && delta_new>eps*delta_0 ; ii++ )
888 if( threads>1 ) A.
Multiply( d , temp , outScratch , addDCTerm ) , A.
Multiply( temp , q , outScratch , addDCTerm );
889 else A.
Multiply( d , temp , addDCTerm ) , A.
Multiply( temp , q , addDCTerm );
893 if( threads>1 ) A.
Multiply( d , q , outScratch , addDCTerm );
894 else A.
Multiply( d , q , addDCTerm );
897 #pragma omp parallel for num_threads( threads ) reduction( + : dDotQ ) 898 for(
int i=0 ; i<dim ; i++ ) dDotQ += _d[i] * _q[i];
899 T2 alpha = T2( delta_new / dDotQ );
900 double delta_old = delta_new;
903 if( (ii%50)==(50-1) )
905 #pragma omp parallel for num_threads( threads ) 906 for(
int i=0 ; i<dim ; i++ ) _x[i] += _d[i] * alpha;
910 if( threads>1 ) A.
Multiply( x , temp , outScratch , addDCTerm ) , A.
Multiply( temp , r , outScratch , addDCTerm );
911 else A.
Multiply( x , temp , addDCTerm ) , A.
Multiply( temp , r , addDCTerm );
915 if( threads>1 ) A.
Multiply( x , r , outScratch , addDCTerm );
916 else A.
Multiply( x , r , addDCTerm );
918 #pragma omp parallel for num_threads( threads ) reduction ( + : delta_new ) 919 for(
int i=0 ; i<dim ; i++ ) _r[i] = _b[i] - _r[i] , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
923 #pragma omp parallel for num_threads( threads ) reduction( + : delta_new ) 924 for(
int i=0 ; i<dim ; i++ ) _r[i] -= _q[i] * alpha , delta_new += _r[i] * _r[i] , _x[i] += _d[i] * alpha;
927 T2 beta = T2( delta_new / delta_old );
928 #pragma omp parallel for num_threads( threads ) 929 for(
int i=0 ; i<dim ; i++ ) _d[i] = _r[i] + _d[i] * beta;
946 for(
int i=0 ; i<iters ; i++ )
953 for(
int j=0 ; j<int(M.
rows) ; j++ ) solution[j] += (b[j]-Md[j])/diagonal[j];
962 for(
int i=0 ; i<SparseMatrix<T>::rows ; i++ )
static Allocator< MatrixEntry< T > > internalAllocator
SparseMatrix< T > operator*(const T &V) const
size_t Dimensions() const
SparseMatrix< T > & operator*=(const T &V)
void SetRowSize(int row, int count)
bool write(FILE *fp) const
SparseMatrix< T > Multiply(const SparseMatrix< T > &M) const
MatrixEntry< T > ** m_ppElements
void resize(int threads, int dim)
SparseMatrix< T > & operator=(const SparseMatrix< T > &M)
Vector< T2 > operator*(const Vector< T2 > &V) const
T Dot(const Vector &V) const
static int Solve(const SparseSymmetricMatrix< T > &M, const Vector< T2 > &b, int iters, Vector< T2 > &solution, T2 eps=1e-8, int reset=1, int threads=0, bool addDCTerm=false, bool solveNormal=false)
void getDiagonal(Vector< T2 > &diagonal) const
static void SetAllocator(int blockSize)
static int UseAllocator(void)
SparseMatrix< T > Transpose() const
static int Solve(const SparseMatrix< T > &M, const Vector< T > &b, int iters, Vector< T > &solution, const T eps=1e-8)
static int SolveSymmetric(const SparseMatrix< T > &M, const Vector< T2 > &b, int iters, Vector< T2 > &solution, const T2 eps=1e-8, int reset=1, int threads=1)
Vector< T2 > Multiply(const Vector< T2 > &V) const