4 #ifndef VECTORIZE_MATH_H 
    5 #define VECTORIZE_MATH_H 
   25 typedef __m512i v16si;
 
// Constant-generating macros for the AVX-512 build: each creates a 64-byte
// aligned static array broadcasting one value and binds a vector-typed
// reference to it. The *I_CONST variants take the exact IEEE-754 bit
// pattern so the constant is reproduced bit-for-bit.
// NOTE(review): the numbers fused into each line (29, 30, ...) are
// extraction artifacts, not code — verify against the original header.
// NOTE(review): the reinterpret_cast from the array to the vector type
// relies on compiler-specific aliasing guarantees for intrinsic types.
// VECD_CONST: broadcast a double literal into a v8df constant.
   29 #define VECD_CONST(name,x) \ 
   30 ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
   31 static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name) 
// VECDI_CONST: broadcast an exact double bit pattern (uint64) viewed as v8df.
   33 #define VECDI_CONST(name,x) \ 
   34 ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
   35 static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name) 
// VECF_CONST: broadcast a float literal into a v16sf constant.
   37 #define VECF_CONST(name,x) \ 
   38 ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \ 
   39 static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name) 
// VECFI_CONST: broadcast an exact float bit pattern (uint32) viewed as v16sf.
   41 #define VECFI_CONST(name,x) \ 
   42 ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \ 
   43 static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name) 
// VECI_CONST: 8 x uint32 in a 256-bit integer vector (32-byte aligned).
   45 #define VECI_CONST(name,x) \ 
   46 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
   47 static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name) 
// VECII_CONST: 16 x uint32 in a 512-bit integer vector.
   49 #define VECII_CONST(name,x) \ 
   50 ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \ 
   51 static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name) 
// VECL_CONST: 4 x uint64 in a 256-bit integer vector.
   53 #define VECL_CONST(name,x) \ 
   54 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \ 
   55 static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name) 
// VECLL_CONST: 8 x uint64 in a 512-bit integer vector.
   57 #define VECLL_CONST(name,x) \ 
   58 ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
   59 static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name) 
   73 inline 
bool getmaskbit(
unsigned int mask, 
unsigned int n)
 
   75         return ( ((mask>>n)&1) == 1 );
 
// print_pack: render the lanes of a SIMD pack into a human-readable string,
// flagging the lanes selected by the AVX-512 bit mask (tested per lane with
// getmaskbit). Used by the DEMsg domain-error reporters below.
// NOTE(review): extraction artifact — the template<class T> header, the
// ostringstream `oss` declaration, the loop braces, the if-branch bodies and
// the trailing `return oss.str();` were lost from this view; the remaining
// tokens are kept verbatim. Recover them from the original header.
   79 inline string print_pack(
const T x[], 
int npack, 
unsigned int mask, 
const string& name)
 
   82         for( 
int i=0; i < npack; ++i )
 
   84                 if( getmaskbit(mask, i) )
 
   88                 oss << name << 
"[" << i << 
"] = " << x[i] << 
"\n";
 
// DEMsg: build a diagnostic message for a SIMD math-domain error, naming the
// offending routine and listing the lanes (selected by the mask) of the
// argument pack(s). Four overloads: one- and two-argument versions for v8df
// (double) and v16sf (float) packs with the matching AVX-512 lane masks.
// NOTE(review): extraction artifact — each overload is missing its braces,
// the ostringstream `oss`, the union `p` exposing the pack as an array
// (p.d / p.f), and the final `return oss.str();`. Tokens kept verbatim.
// Overload 1: single v8df argument.
   93 inline string DEMsg(
const string& routine, v8df x, __mmask8 mask)
 
   97         oss << 
"domain error in " << routine << 
".\n";
 
   99         oss << print_pack(p.d, 8, mask, 
"x");
 
// Overload 2: single v16sf argument.
  103 inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask)
 
  107         oss << 
"domain error in " << routine << 
".\n";
 
  109         oss << print_pack(p.f, 16, mask, 
"x");
 
// Overload 3: two v8df arguments, each with its own lane mask.
  113 inline string DEMsg(
const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
 
  117         oss << 
"domain error in " << routine << 
".\n";
 
  119         oss << print_pack(p.d, 8, mask1, 
"x");
 
  121         oss << print_pack(p.d, 8, mask2, 
"y");
 
// Overload 4: two v16sf arguments, each with its own lane mask.
  125 inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
 
  129         oss << 
"domain error in " << routine << 
".\n";
 
  131         oss << print_pack(p.f, 16, mask1, 
"x");
 
  133         oss << print_pack(p.f, 16, mask2, 
"y");
 
 
// Narrower (256/128-bit) redefinitions of the VEC*_CONST macros.
// NOTE(review): the #ifdef __AVX512F__ / #else / #endif lines separating
// these from the 512-bit versions above were stripped by the extraction;
// presumably this group is the AVX/AVX2 fallback — confirm in the original.
// VECD_CONST: broadcast a double literal into a v4df constant.
  139 #define VECD_CONST(name,x) \ 
  140 ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \ 
  141 static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name) 
// VECDI_CONST: broadcast an exact double bit pattern (uint64) viewed as v4df.
  143 #define VECDI_CONST(name,x) \ 
  144 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \ 
  145 static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name) 
// VECF_CONST: broadcast a float literal into a v8sf constant.
  147 #define VECF_CONST(name,x) \ 
  148 ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
  149 static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name) 
// VECFI_CONST: broadcast an exact float bit pattern (uint32) viewed as v8sf.
  151 #define VECFI_CONST(name,x) \ 
  152 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
  153 static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name) 
// VECI_CONST: 4 x uint32 in a 128-bit integer vector (16-byte aligned).
  155 #define VECI_CONST(name,x) \ 
  156 ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \ 
  157 static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name) 
// VECII_CONST: 8 x uint32 in a 256-bit integer vector.
  159 #define VECII_CONST(name,x) \ 
  160 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \ 
  161 static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name) 
// VECL_CONST: 2 x uint64 in a 128-bit integer vector.
  163 #define VECL_CONST(name,x) \ 
  164 ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \ 
  165 static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name) 
// VECLL_CONST: 4 x uint64 in a 256-bit integer vector.
  167 #define VECLL_CONST(name,x) \ 
  168 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \ 
  169 static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name) 
// print_pack (AVX/AVX2 variant): as the masked version above, but the lane
// selection is passed as an array m[] of per-lane values (type U) instead of
// an AVX-512 bit mask.
// NOTE(review): extraction artifact — the ostringstream declaration, the
// if(m[i]) branch bodies, braces and the final return statement are missing
// from this view; remaining tokens kept verbatim.
  183 template<class T, class U>
 
  184 inline 
string print_pack(const T x[], const U m[], 
int npack, const 
string& name)
 
  187         for( 
int i=0; i < npack; ++i )
 
  193                 oss << name << 
"[" << i << 
"] = " << x[i] << 
"\n";
 
 
// DEMsg (AVX/AVX2 variants): domain-error reporters where the mask is itself
// a vector (v4df/v8sf comparison result) rather than an AVX-512 bit mask;
// the pack and the mask are exposed as arrays via unions (p.d/p.f, m.l/m.i)
// and forwarded to the array-mask print_pack.
// NOTE(review): extraction artifact — each overload is missing its braces,
// the ostringstream `oss`, the unions `p`/`m`, and `return oss.str();`.
// Overload 1: single v4df argument.
  198 inline string DEMsg(
const string& routine, v4df x, v4df mask)
 
  202         oss << 
"domain error in " << routine << 
".\n";
 
  205         oss << print_pack(p.d, m.l, 4, 
"x");
 
// Overload 2: single v8sf argument.
  209 inline string DEMsg(
const string& routine, v8sf x, v8sf mask)
 
  213         oss << 
"domain error in " << routine << 
".\n";
 
  216         oss << print_pack(p.f, m.i, 8, 
"x");
 
// Overload 3: two v4df arguments with separate vector masks.
  220 inline string DEMsg(
const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
 
  224         oss << 
"domain error in " << routine << 
".\n";
 
  227         oss << print_pack(p.d, m.l, 4, 
"x");
 
  230         oss << print_pack(p.d, m.l, 4, 
"y");
 
// Overload 4: two v8sf arguments with separate vector masks.
  234 inline string DEMsg(
const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
 
  238         oss << 
"domain error in " << routine << 
".\n";
 
  241         oss << print_pack(p.f, m.i, 8, 
"x");
 
  244         oss << print_pack(p.f, m.i, 8, 
"y");
 
 
  248 #endif // __AVX512F__ 
// Precomputed vector constants used by the vectorized math routines.
// The VECDI_CONST/VECFI_CONST arguments are exact IEEE-754 bit patterns so
// the value is reproduced bit-for-bit regardless of compile-time rounding.
  250 VECD_CONST(mthree,-3.);
 
  251 VECD_CONST(mone,-1.);
 
  252 VECD_CONST(mhalf,-0.5);
 
// 0x3fd5555555555555 is the correctly-rounded double closest to 1/3.
  254 VECDI_CONST(third,0x3fd5555555555555); 
 
  255 VECD_CONST(half,0.5);
 
  257 VECD_CONST(c1p5,1.5);
 
  259 VECD_CONST(three,3.);
 
// 1023 = IEEE double exponent bias; presumably used in exp/log lane math.
  262 VECD_CONST(c1023,1023.);
 
// logarithm conversion constants: ln(2), log2(e), ln(10), log10(e).
  264 VECDI_CONST(ln_two,0x3fe62e42fefa39ef); 
 
  265 VECDI_CONST(log2e,0x3ff71547652b82fe);  
 
  266 VECDI_CONST(ln_ten,0x40026bb1bbb55516); 
 
  267 VECDI_CONST(log10e,0x3fdbcb7b1526e50e); 
 
// DBL_MIN (smallest normal), DBL_MAX, and approx sqrt(DBL_MAX) bit patterns.
  269 VECDI_CONST(dbl_min,0x0010000000000000); 
 
  270 VECDI_CONST(dbl_max,0x7fefffffffffffff); 
 
  271 VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff); 
 
// single-precision counterparts of the constants above.
  273 VECF_CONST(mthreef,-3.f);
 
  274 VECF_CONST(monef,-1.f);
 
  275 VECF_CONST(mhalff,-0.5f);
 
  276 VECF_CONST(zerof,0.f);
 
// 0x3eaaaaab is the correctly-rounded float closest to 1/3.
  277 VECFI_CONST(thirdf,0x3eaaaaab); 
 
  278 VECF_CONST(halff,0.5f);
 
  279 VECF_CONST(onef,1.f);
 
  280 VECF_CONST(twof,2.f);
 
  281 VECF_CONST(c1p5f,1.5f);
 
  282 VECF_CONST(threef,3.f);
 
  283 VECF_CONST(sixf,6.f);
 
  284 VECF_CONST(tenf,10.f);
 
// 127 = IEEE single exponent bias.
  285 VECF_CONST(c127,127.f);
 
// float logarithm conversion constants: ln(2), log2(e), ln(10), log10(e).
  287 VECFI_CONST(ln_twof,0x3f317218); 
 
  288 VECFI_CONST(log2ef,0x3fb8aa3b);  
 
  289 VECFI_CONST(ln_tenf,0x40135d8e); 
 
  290 VECFI_CONST(log10ef,0x3ede5bd9); 
 
// FLT_MIN (smallest normal), FLT_MAX, and approx sqrt(FLT_MAX) bit patterns.
  292 VECFI_CONST(flt_min,0x00800000); 
 
  293 VECFI_CONST(flt_max,0x7f7fffff); 
 
  294 VECFI_CONST(sqrt_flt_max,0x5f7fffff); 
 
// vecfun_partial (float): remainder handler for ranges shorter than one SIMD
// pack. Gathers up to npack inputs — clamping the index with min(nlo+i,nhi-1)
// so reads never run past the valid data (trailing lanes replicate the last
// element) — applies the vector function once, then stores y[nlo..nhi).
// NOTE(review): extraction artifact — the declarations of the xx/yy pack
// unions, the braces, and the body of the final store loop are missing from
// this view; remaining tokens kept verbatim.
  297 inline void vecfun_partial(
const sys_float x[], 
sys_float y[], 
long nlo, 
long nhi, 
long npack, V (*vecfun1)(V))
 
  300         for( 
long i=0; i < npack; ++i )
 
  301                 xx.f[i] = x[
min(nlo+i,nhi-1)];
 
  302         yy.pf = vecfun1(xx.pf);
 
  303         for( 
long i=nlo; i < nhi; ++i )
 
 
// vecfun_partial (double): double-precision twin of the float version above —
// gather with clamped indices, one vector call, scatter back to [nlo,nhi).
// NOTE(review): extraction artifact — pack-union declarations, braces and the
// final store-loop body are missing from this view; tokens kept verbatim.
  308 inline void vecfun_partial(
const double x[], 
double y[], 
long nlo, 
long nhi, 
long npack, V (*vecfun1)(V))
 
  311         for( 
long i=0; i < npack; ++i )
 
  312                 xx.d[i] = x[
min(nlo+i,nhi-1)];
 
  313         yy.pd = vecfun1(xx.pd);
 
  314         for( 
long i=nlo; i < nhi; ++i )
 
 
// vecfun2_partial (float): two-argument analogue of vecfun_partial — gathers
// both input packs with clamped indices, makes one two-argument vector call,
// and scatters the result to y[nlo..nhi).
// NOTE(review): extraction artifact — the opening of this definition's
// signature (return type and leading parameters) was lost; only the trailing
// parameters and body fragments are visible here. Tokens kept verbatim.
  320                             long npack, V (*vecfun1)(V, V))
 
  323         for( 
long i=0; i < npack; ++i )
 
  325                 xx1.f[i] = x1[
min(nlo+i,nhi-1)];
 
  326                 xx2.f[i] = x2[
min(nlo+i,nhi-1)];
 
  328         yy.pf = vecfun1(xx1.pf, xx2.pf);
 
  329         for( 
long i=nlo; i < nhi; ++i )
 
 
// vecfun2_partial (double): double-precision twin of the float version —
// gather both packs with clamped indices, one two-argument vector call,
// scatter the result to y[nlo..nhi).
// NOTE(review): extraction artifact — the signature is truncated after
// `long npack,` (the V (*vecfun1)(V, V) parameter is missing), as are the
// pack-union declarations, braces and the store-loop body.
  334 inline void vecfun2_partial(
const double x1[], 
const double x2[], 
double y[], 
long nlo, 
long nhi, 
long npack,
 
  338         for( 
long i=0; i < npack; ++i )
 
  340                 xx1.d[i] = x1[
min(nlo+i,nhi-1)];
 
  341                 xx2.d[i] = x2[
min(nlo+i,nhi-1)];
 
  343         yy.pd = vecfun1(xx1.pd, xx2.pd);
 
  344         for( 
long i=nlo; i < nhi; ++i )
 
 
// vecfun: vectorized elementwise driver — applies vecfun1 across x[nlo,nhi)
// in SIMD-width strides. Strategy visible in the surviving fragments:
//  * fewer than npack elements: delegate entirely to vecfun_partial;
//  * compute alignment offsets ox/oy of x and y modulo the pack width; if
//    they differ (lgNonAligned), allocate a scratch buffer ylocal
//    (new T[nhi+npack-1]) positioned so its offset matches x, and write
//    there instead of y;
//  * a vecfun_partial handles the misaligned head [ilo, ilo+npack-ox), the
//    main loop processes whole packs by reinterpret_cast-ing &x[i]/&yl[i]
//    to vector references, and a final vecfun_partial handles the tail;
//  * when a scratch buffer was used, copy it back to y and free it.
// NOTE(review): extraction artifact — braces and several statements (the
// `yy = vecfun1(xx);` call, the copy-back loop body, the delete[] of
// ylocal, the aligned-case `yl = y;` branch) are missing from this view.
// NOTE(review): reinterpret_cast<long>(pointer) assumes long can hold a
// pointer (true on LP64, not LLP64/Windows) — flag for portability review.
  352 template<
class T, 
class V>
 
  353 inline void vecfun(
const T x[], T y[], 
long nlo, 
long nhi, T (*)(T), V (*vecfun1)(V))
 
  359         long npack = 
sizeof(V)/
sizeof(T);
 
  360         if( nhi-nlo < npack )
 
  362                 vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
 
  366         long ox = (
reinterpret_cast<long>(x)/
sizeof(T)+nlo)%npack;
 
  367         long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
 
  368         bool lgNonAligned = ( ox != oy );
 
  369         T *yl, *ylocal = 
NULL;
 
  372                 ylocal = 
new T[nhi+npack-1];
 
  373                 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
 
  377                         yl = &ylocal[ox+npack-ol];
 
  387                 vecfun_partial(x, yl, ilo, 
min(ilo+npack-ox,nhi), npack, vecfun1);
 
  388                 ilo = 
min(ilo+npack-ox,nhi);
 
  391         for( i=ilo; i < nhi-npack+1; i += npack )
 
  393                 const V& xx = *
reinterpret_cast<const V*
>(&x[i]);
 
  394                 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
 
  400                 vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
 
  403                 for( i=nlo; i < nhi; ++i )
 
 
// vecfun2: two-argument vectorized driver — applies vecfun1(x1[i], x2[i])
// across [nlo,nhi) in SIMD-width strides. Same alignment strategy as vecfun,
// extended to two inputs (fragments visible below):
//  * short ranges delegate to vecfun2_partial;
//  * alignment offsets ox1/ox2/oy are computed modulo the pack width;
//    if x2 is misaligned relative to x1 (lgNonAligned1), x2 is memcpy'd
//    into a scratch buffer x2local positioned to match x1's offset; if y is
//    misaligned relative to x1 (lgNonAligned2), results go to a scratch
//    ylocal and are copied back at the end;
//  * partial head, whole-pack main loop via reinterpret_cast vector
//    references, partial tail — then copy-back and delete[] of scratch.
// NOTE(review): extraction artifact — braces, the `x2l`/`ptr` declarations,
// the aligned-case branches, the copy-back loop body and the delete[]
// statements are missing from this view; remaining tokens kept verbatim.
// NOTE(review): reinterpret_cast<long>(pointer) is an LP64 assumption —
// flag for portability review.
  409 template<
class T, 
class V>
 
  410 inline void vecfun2(
const T x1[], 
const T x2[], T y[], 
long nlo, 
long nhi, T (*)(T, T), V (*vecfun1)(V, V))
 
  416         long npack = 
sizeof(V)/
sizeof(T);
 
  417         if( nhi-nlo < npack )
 
  419                 vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
 
  423         long ox1 = (
reinterpret_cast<long>(
x1)/
sizeof(T)+nlo)%npack;
 
  424         long ox2 = (
reinterpret_cast<long>(
x2)/
sizeof(T)+nlo)%npack;
 
  425         long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
 
  426         bool lgNonAligned1 = ( ox1 != ox2 );
 
  427         bool lgNonAligned2 = ( ox1 != oy );
 
  432                 x2local = 
new T[nhi+npack-1];
 
  433                 long ol = (
reinterpret_cast<long>(x2local)/
sizeof(T)+nlo)%npack;
 
  436                         ptr = &x2local[ox1-ol];
 
  438                         ptr = &x2local[ox1+npack-ol];
 
  439                 memcpy(ptr+nlo, x2+nlo, 
size_t((nhi-nlo)*
sizeof(T)));
 
  446         T *yl, *ylocal = 
NULL;
 
  449                 ylocal = 
new T[nhi+npack-1];
 
  450                 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
 
  452                         yl = &ylocal[ox1-ol];
 
  454                         yl = &ylocal[ox1+npack-ol];
 
  464                 vecfun2_partial(x1, x2l, yl, ilo, 
min(ilo+npack-ox1,nhi), npack, vecfun1);
 
  465                 ilo = 
min(ilo+npack-ox1,nhi);
 
  468         for( i=ilo; i < nhi-npack+1; i += npack )
 
  470                 const V& xx1 = *
reinterpret_cast<const V*
>(&x1[i]);
 
  471                 const V& xx2 = *
reinterpret_cast<const V*
>(&x2l[i]);
 
  472                 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
 
  473                 yy = vecfun1(xx1, xx2);
 
  478                 vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
 
  483                 for( i=nlo; i < nhi; ++i )
 
 
// V1FUN_* / V1FUN2_* dispatch macros: load scalar arguments x0..xN (and
// y0..yN for the two-argument forms) into a SIMD pack with _mm512_set_* or
// _mm256_set_*, invoke the vectorized routine v1<FUN>d / v1<FUN>f, and
// memcpy the active lanes back out. The V argument pads unused upper lanes.
// NOTE(review): each macro appears twice because the #ifdef __AVX512F__ /
// #else / #endif lines separating the 512-bit and 256-bit definitions were
// stripped by the extraction — only one of each pair is active in the real
// header. The yy[2] array declarations of the split 256-bit versions were
// also lost.
// 4 doubles, AVX-512: one v8df call with 4 padding lanes.
  490 #define V1FUN_PD_4(FUN, V) \ 
  491         v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \ 
  492         v8df yy = v1##FUN##d(xx); \ 
  493         memcpy(y, &yy, 4*sizeof(double)) 
// 4 doubles, AVX2: one full v4df call.
  495 #define V1FUN_PD_4(FUN, V) \ 
  496         v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \ 
  497         v4df yy = v1##FUN##d(xx); \ 
  498         memcpy(y, &yy, 4*sizeof(double)) 
// 8 doubles, AVX-512: one full v8df call.
  502 #define V1FUN_PD_8(FUN, V) \ 
  503         v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  504         v8df yy = v1##FUN##d(xx); \ 
  505         memcpy(y, &yy, 8*sizeof(double)) 
// 8 doubles, AVX2: split into two v4df calls.
  507 #define V1FUN_PD_8(FUN, V) \ 
  509         v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \ 
  510         yy[0] = v1##FUN##d(xx); \ 
  511         xx = _mm256_set_pd( x7, x6, x5, x4 ); \ 
  512         yy[1] = v1##FUN##d(xx); \ 
  513         memcpy(y, &yy, 8*sizeof(double)) 
// 4 floats, AVX-512: one v16sf call with 12 padding lanes.
  517 #define V1FUN_PS_4(FUN, V) \ 
  518         v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \ 
  519         v16sf yy = v1##FUN##f(xx); \ 
  520         memcpy(y, &yy, 4*sizeof(sys_float)) 
// 4 floats, AVX2: one v8sf call with 4 padding lanes.
  522 #define V1FUN_PS_4(FUN, V) \ 
  523         v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \ 
  524         v8sf yy = v1##FUN##f(xx); \ 
  525         memcpy(y, &yy, 4*sizeof(sys_float)) 
// 8 floats, AVX-512: one v16sf call with 8 padding lanes.
  529 #define V1FUN_PS_8(FUN, V) \ 
  530         v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  531         v16sf yy = v1##FUN##f(xx); \ 
  532         memcpy(y, &yy, 8*sizeof(sys_float)) 
// 8 floats, AVX2: one full v8sf call.
  534 #define V1FUN_PS_8(FUN, V) \ 
  535         v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  536         v8sf yy = v1##FUN##f(xx); \ 
  537         memcpy(y, &yy, 8*sizeof(sys_float)) 
// 16 floats, AVX-512: one full v16sf call.
  541 #define V1FUN_PS_16(FUN, V) \ 
  542         v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  543         v16sf yy = v1##FUN##f(xx); \ 
  544         memcpy(y, &yy, 16*sizeof(sys_float)) 
// 16 floats, AVX2: split into two v8sf calls.
  546 #define V1FUN_PS_16(FUN, V) \ 
  548         v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  549         yy[0] = v1##FUN##f(xx); \ 
  550         xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \ 
  551         yy[1] = v1##FUN##f(xx); \ 
  552         memcpy(y, &yy, 16*sizeof(sys_float)) 
// two-argument forms: load both packs, call v1<FUN> with two operands.
// 4 doubles x 2 args, AVX-512.
  556 #define V1FUN2_PD_4(FUN, V) \ 
  557         v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \ 
  558         v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \ 
  559         v8df zz = v1##FUN##d(xx, yy); \ 
  560         memcpy(z, &zz, 4*sizeof(double)) 
// 4 doubles x 2 args, AVX2.
  562 #define V1FUN2_PD_4(FUN, V) \ 
  563         v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \ 
  564         v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \ 
  565         v4df zz = v1##FUN##d(xx, yy); \ 
  566         memcpy(z, &zz, 4*sizeof(double)) 
// 4 floats x 2 args, AVX-512.
  570 #define V1FUN2_PS_4(FUN, V) \ 
  571         v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \ 
  572         v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \ 
  573         v16sf zz = v1##FUN##f(xx, yy); \ 
  574         memcpy(z, &zz, 4*sizeof(sys_float)) 
// 4 floats x 2 args, AVX2.
  576 #define V1FUN2_PS_4(FUN, V) \ 
  577         v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \ 
  578         v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \ 
  579         v8sf zz = v1##FUN##f(xx, yy); \ 
  580         memcpy(z, &zz, 4*sizeof(sys_float)) 
// 8 floats x 2 args, AVX-512.
  584 #define V1FUN2_PS_8(FUN, V) \ 
  585         v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  586         v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \ 
  587         v16sf zz = v1##FUN##f(xx, yy); \ 
  588         memcpy(z, &zz, 8*sizeof(sys_float)) 
// 8 floats x 2 args, AVX2.
  590 #define V1FUN2_PS_8(FUN, V) \ 
  591         v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \ 
  592         v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \ 
  593         v8sf zz = v1##FUN##f(xx, yy); \ 
  594         memcpy(z, &zz, 8*sizeof(sys_float)) 
  600 template<
class T, 
class V>
 
  601 inline void vecfun(
const T x[], T y[], 
long nlo, 
long nhi, T (*scalfun1)(T), V (*)(V))
 
  603         for( 
long i=nlo; i < nhi; ++i )
 
  604                 y[i] = scalfun1(x[i]);
 
  607 template<
class T, 
class V>
 
  608 inline void vecfun2(
const T x1[], 
const T x2[], T y[], 
long nlo, 
long nhi, T (*scalfun1)(T, T), V (*)(V, V))
 
  610         for( 
long i=nlo; i < nhi; ++i )
 
  611                 y[i] = scalfun1(x1[i], x2[i]);
 
// Scalar fallbacks for the V1FUN_* / V1FUN2_* dispatch macros (no-SIMD
// build): each expands to per-element calls of the scalar FUN.
// NOTE(review): extraction artifact — most macro bodies were lost (only
// fragments of V1FUN2_4 and V1FUN2_8 survive, and even those are missing
// their final z[...] lines). Every remaining line ends in a continuation
// backslash, so no interior comments can be added without changing the
// macro bodies; tokens kept verbatim. Recover from the original header.
  614 #define V1FUN_4(FUN, V) \ 
  620 #define V1FUN_8(FUN, V) \ 
  630 #define V1FUN_16(FUN, V) \ 
  648 #define V1FUN_PD_4(FUN, V) \ 
  651 #define V1FUN_PD_8(FUN, V) \ 
  654 #define V1FUN_PS_4(FUN, V) \ 
  657 #define V1FUN_PS_8(FUN, V) \ 
  660 #define V1FUN_PS_16(FUN, V) \ 
  663 #define V1FUN2_4(FUN, V) \ 
  664         z[0] = FUN(x0, y0); \ 
  665         z[1] = FUN(x1, y1); \ 
  666         z[2] = FUN(x2, y2); \ 
  669 #define V1FUN2_8(FUN, V) \ 
  670         z[0] = FUN(x0, y0); \ 
  671         z[1] = FUN(x1, y1); \ 
  672         z[2] = FUN(x2, y2); \ 
  673         z[3] = FUN(x3, y3); \ 
  674         z[4] = FUN(x4, y4); \ 
  675         z[5] = FUN(x5, y5); \ 
  676         z[6] = FUN(x6, y6); \ 
static double x2[63]
Definition: atmdat_3body.cpp:18
 
static double x1[83]
Definition: atmdat_3body.cpp:27
 
void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T(*scalfun1)(T, T), V(*)(V, V))
Definition: vectorize_math.h:608
 
ALIGNED(CD_ALIGN) static const double qg32_w[numPoints]
 
void zero(void)
Definition: zero.cpp:30
 
void vecfun(const T x[], T y[], long nlo, long nhi, T(*scalfun1)(T), V(*)(V))
Definition: vectorize_math.h:601
 
float sys_float
Definition: cddefines.h:130
 
long min(int a, long b)
Definition: cddefines.h:766
 
#define NULL
Definition: cddefines.h:115