46 #if !defined(__MITSUBA_CORE_SSEVECTOR_H_) 
   47 #define __MITSUBA_CORE_SSEVECTOR_H_ 
   53 # error "This header requires SSE support" 
   65 template <
int idx3, 
int idx2, 
int idx1, 
int idx0>
 
   66 SSEVector4f 
shuffle(
const SSEVector4f& low, 
const SSEVector4f& hi);
 
   68 template <
int idx3, 
int idx2, 
int idx1, 
int idx0>
 
   69 SSEVector4f 
shuffle(
const SSEVector4f& a);
 
   83         xmm(_mm_set_ps(f3, f2, f1, f0))
 
   87                 xmm = _mm_set1_ps(val);
 
   92                 return _mm_setzero_ps();
 
   95         operator __m128()
 const {
 
  100                 return _mm_and_ps(a.xmm, b.xmm);
 
  103                 return _mm_or_ps(a.xmm, b.xmm);
 
  106                 return _mm_xor_ps(a.xmm, b.xmm);
 
  110                 return _mm_andnot_ps(a.xmm, b.xmm);
 
  114                 xmm = _mm_and_ps(xmm, a.xmm);
 
  118                 xmm = _mm_or_ps(xmm, a.xmm);
 
  122                 xmm = _mm_xor_ps(xmm, a.xmm);
 
  127                 return _mm_add_ps(a.xmm, b.xmm);
 
  130                 return _mm_sub_ps(a.xmm, b.xmm);
 
  133                 return _mm_mul_ps(a.xmm, b.xmm);
 
  136                 return _mm_div_ps(a.xmm, b.xmm);
 
  140                 xmm = _mm_add_ps(xmm, a.xmm);
 
  144                 xmm = _mm_sub_ps(xmm, a.xmm);
 
  148                 xmm = _mm_mul_ps(xmm, a.xmm);
 
  152                 xmm = _mm_div_ps(xmm, a.xmm);
 
  161                 __m128 x0 = _mm_rcp_ps(v.xmm);
 
  162                 return _mm_sub_ps(_mm_add_ps(x0,x0),
 
  163                         _mm_mul_ps(_mm_mul_ps(x0,v.xmm), x0));
 
  167                 return _mm_rcp_ps(v.xmm);
 
  171                 return _mm_min_ps(a.xmm, b.xmm);
 
  174                 return _mm_max_ps(a.xmm, b.xmm);
 
  178                 return _mm_cmpunord_ps(a.xmm, a.xmm);
 
  181                 return _mm_cmpunord_ps(a.xmm, b.xmm);
 
  191         template <
int idx3, 
int idx2, 
int idx1, 
int idx0>
 
  193                 return _mm_shuffle_ps(low.xmm,hi.xmm,_MM_SHUFFLE(idx3,idx2,idx1,idx0));
 
  197         template <
int idx3, 
int idx2, 
int idx1, 
int idx0>
 
  199                 return _mm_shuffle_ps(a.xmm, a.xmm, _MM_SHUFFLE(idx3,idx2,idx1,idx0));
 
  204                 return _mm_cmpeq_ps(a.xmm, b.xmm);
 
  208                 return _mm_cmplt_ps(a.xmm, b.xmm);
 
  212                 return _mm_cmple_ps(a.xmm, b.xmm);
 
  216                 return _mm_cmpgt_ps(a.xmm, b.xmm);
 
  220                 return _mm_cmpge_ps(a.xmm, b.xmm);
 
  224                 return _mm_cmpneq_ps(a.xmm, b.xmm);
 
  228                 return _mm_cmpnlt_ps(a.xmm, b.xmm);
 
  232                 return _mm_cmpnle_ps(a.xmm, b.xmm);
 
  236                 return _mm_cmpngt_ps(a.xmm, b.xmm);
 
  240                 return _mm_cmpnge_ps(a.xmm, b.xmm);
 
  267                 return _mm_xor_ps(b.xmm, _mm_and_ps(mask.xmm, _mm_xor_ps(a.xmm, b.xmm)));
 
  272                 __m128i truncated = _mm_cvttps_epi32(a.xmm);
 
  273                 return _mm_cvtepi32_ps(truncated);
 
  278                 _mm_stream_ps(reinterpret_cast<float*>(dest), value.xmm);
 
  282                 _mm_stream_ps(reinterpret_cast<float*>(dest), value.xmm);
 
  286                 _mm_stream_ps(dest, value.xmm);
 
  303         xmm(_mm_set_epi32(i3, i2, i1, i0))
 
  307                 xmm = _mm_set1_epi32(val);
 
  312                 return _mm_setzero_si128();
 
  315         operator __m128i()
 const {
 
  320                 return _mm_and_si128(a.xmm, b.xmm);
 
  323                 return _mm_or_si128(a.xmm, b.xmm);
 
  326                 return _mm_xor_si128(a.xmm, b.xmm);
 
  330                 return _mm_andnot_si128(a.xmm, b.xmm);
 
  333                 xmm = _mm_and_si128(xmm, a.xmm);
 
  337                 xmm = _mm_or_si128(xmm, a.xmm);
 
  341                 xmm = _mm_xor_si128(xmm, a.xmm);
 
  346                 return _mm_add_epi32(a.xmm, b.xmm);
 
  349                 return _mm_sub_epi32(a.xmm, b.xmm);
 
  352                 xmm = _mm_add_epi32(xmm, a.xmm);
 
  356                 xmm = _mm_sub_epi32(xmm, a.xmm);
 
  362                 const __m128i mask = _mm_cmpeq_epi32(xmm, _mm_setzero_si128());
 
  363                 return _mm_movemask_epi8(mask) == 0xFFFF;
 
  368                 return _mm_cmpeq_epi32(a.xmm, b.xmm);
 
  372                 return _mm_cmplt_epi32(a.xmm, b.xmm);
 
  376                 return _mm_cmpgt_epi32(a.xmm, b.xmm);
 
  393                 return _mm_xor_si128(b.xmm,
 
  394                         _mm_and_si128(mask.xmm, _mm_xor_si128(a.xmm, b.xmm)));
 
  397         template <
int32_t i3, 
int32_t i2, 
int32_t i1, 
int32_t i0>
 
  402                 } u = {{i0, i1, i2, i3}};
 
  406         template <
int32_t value>
 
  411                 } u = {{value, value, value, value}};
 
  417                 return _mm_srli_epi32(a.xmm, count);
 
  422                 return _mm_slli_epi32(a.xmm, count);
 
  427                 _mm_stream_si128(&(dest->xmm), value);
 
  431                 _mm_stream_si128(dest, value);
 
  437         return _mm_castps_si128(a);
 
  441         return _mm_cvttps_epi32(a);
 
  445         return _mm_cvtps_epi32(a);
 
  450         return _mm_castsi128_ps(a);
 
  454         return _mm_cvtepi32_ps(a);
 
  468         __m128 tmp3, tmp2, tmp1, tmp0;
 
  469         tmp0 = _mm_unpacklo_ps(row0, row1);
 
  470         tmp2 = _mm_unpacklo_ps(row2, row3);
 
  471         tmp1 = _mm_unpackhi_ps(row0, row1);
 
  472         tmp3 = _mm_unpackhi_ps(row2, row3);
 
  474         row0 = _mm_movelh_ps(tmp0, tmp2);
 
  475         row1 = _mm_movehl_ps(tmp2, tmp0);
 
  476         row2 = _mm_movelh_ps(tmp1, tmp3);
 
  477         row3 = _mm_movehl_ps(tmp3, tmp1);
 
SSEVector4i(int32_t i3, int32_t i2, int32_t i1, int32_t i0)
Definition: ssevector.h:302
SSEVector4i(int32_t val)
Definition: ssevector.h:301
friend SSEVector4f cmple(const SSEVector4f &a, const SSEVector4f &b)
a <= b 
Definition: ssevector.h:211
SSEVector4i castAsInt(const SSEVector4f &a)
Reinterprets a as a SSEVector4i. 
Definition: ssevector.h:436
friend SSEVector4f select(const SSEVector4f &mask, const SSEVector4f &a, const SSEVector4f &b)
Select/blend operation (mask) ? a : b 
Definition: ssevector.h:263
bool isZero() const 
Test if all elements are zero. 
Definition: ssevector.h:361
SSEVector4f castAsFloat(const SSEVector4i &a)
Reinterprets a as a SSEVector4f. 
Definition: ssevector.h:449
friend SSEVector4f max(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:173
Matrix< M1, N2, T > operator*(const Matrix< M1, N1, T > &mat1, const Matrix< M2, N2, T > &mat2)
Matrix multiplication (creates a temporary) 
Definition: matrix.h:745
friend SSEVector4f cmpnlt(const SSEVector4f &a, const SSEVector4f &b)
!(a < b) 
Definition: ssevector.h:227
SSEVector4i()
Definition: ssevector.h:298
friend void stream(SSEVector4i *dest, const SSEVector4i &value)
Save to dest without polluting the cache. 
Definition: ssevector.h:426
friend SSEVector4i select(const SSEVector4i &mask, const SSEVector4i &a, const SSEVector4i &b)
Select/blend: (mask) ? a : b 
Definition: ssevector.h:389
friend SSEVector4i cmpeq(const SSEVector4i &a, const SSEVector4i &b)
a == b 
Definition: ssevector.h:367
friend SSEVector4i operator>(const SSEVector4i &a, const SSEVector4i &b)
Definition: ssevector.h:384
SSEVector4f(float val)
Definition: ssevector.h:81
friend SSEVector4f cmpnle(const SSEVector4f &a, const SSEVector4f &b)
!(a <= b) 
Definition: ssevector.h:231
Definition: ssevector.h:72
friend SSEVector4f operator<(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:249
SSEVector4i(__m128i val)
Definition: ssevector.h:300
friend void stream(__m128 *dest, const SSEVector4f &value)
Save to dest without polluting the cache. 
Definition: ssevector.h:281
friend SSEVector4f cmpge(const SSEVector4f &a, const SSEVector4f &b)
a >= b 
Definition: ssevector.h:219
Definition: ssevector.h:292
static const __m128i & constant()
Definition: ssevector.h:398
friend SSEVector4f operator==(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:243
friend SSEVector4i operator==(const SSEVector4i &a, const SSEVector4i &b)
Definition: ssevector.h:378
FINLINE void transpose(SSEVector4f &row0, SSEVector4f &row1, SSEVector4f &row2, SSEVector4f &row3)
The arguments row0, row1, row2 and row3 are __m128 values whose elements form the corresponding rows of a 4x4 matrix; on return they hold the rows of the transposed matrix.
Definition: ssevector.h:466
SSEVector4f toFloat(const SSEVector4i &a)
Convert a to floating point. 
Definition: ssevector.h:453
friend SSEVector4f rcp(const SSEVector4f &v)
Definition: ssevector.h:166
friend SSEVector4f cmpeq(const SSEVector4f &a, const SSEVector4f &b)
a == b 
Definition: ssevector.h:203
friend SSEVector4f operator!=(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:246
friend SSEVector4f rcp_nr(const SSEVector4f &v)
Newton-Raphson reciprocal: $\frac{1}{v} \approx 2x_0 - v\,x_0^2$, where $x_0$ is the `_mm_rcp_ps` estimate of $1/v$. 
Definition: ssevector.h:160
friend SSEVector4f cmplt(const SSEVector4f &a, const SSEVector4f &b)
a < b 
Definition: ssevector.h:207
friend SSEVector4f operator>(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:255
SSEVector4i(const SSEVector4i &val)
Definition: ssevector.h:299
friend SSEVector4f min(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:170
friend SSEVector4f cmpnge(const SSEVector4f &a, const SSEVector4f &b)
!(a >= b) 
Definition: ssevector.h:239
SSEVector4f shuffle(const SSEVector4f &a)
Definition: ssevector.h:198
friend void stream(SSEVector4f *dest, const SSEVector4f &value)
Save to dest without polluting the cache. 
Definition: ssevector.h:277
SSEVector4i toInt(const SSEVector4f &a)
Convert a to integer using truncation. 
Definition: ssevector.h:440
friend SSEVector4f cmpgt(const SSEVector4f &a, const SSEVector4f &b)
a > b 
Definition: ssevector.h:215
SSEVector4f(const SSEVector4f &other)
Definition: ssevector.h:79
friend SSEVector4f roundTruncate(const SSEVector4f &a)
Round a towards zero. 
Definition: ssevector.h:271
friend SSEVector4f cmpneq(const SSEVector4f &a, const SSEVector4f &b)
a != b 
Definition: ssevector.h:223
SSEVector4f()
Definition: ssevector.h:78
friend SSEVector4f andnot(const SSEVector4f &a, const SSEVector4f &b)
~a & b 
Definition: ssevector.h:109
friend SSEVector4i cmplt(const SSEVector4i &a, const SSEVector4i &b)
a < b 
Definition: ssevector.h:371
friend SSEVector4f operator<=(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:252
SSEVector4f(__m128 val)
Definition: ssevector.h:80
friend SSEVector4i operator<(const SSEVector4i &a, const SSEVector4i &b)
Definition: ssevector.h:381
friend SSEVector4i srl(const SSEVector4i &a, int count)
Shift right by count bits while shifting in zeros. 
Definition: ssevector.h:416
static const __m128i & constant()
Definition: ssevector.h:407
friend SSEVector4i andnot(const SSEVector4i &a, const SSEVector4i &b)
~a & b 
Definition: ssevector.h:329
static SSEVector4f zero()
Definition: ssevector.h:91
friend SSEVector4i cmpgt(const SSEVector4i &a, const SSEVector4i &b)
a > b 
Definition: ssevector.h:375
friend SSEVector4f isnan(const SSEVector4f &a)
Definition: ssevector.h:177
friend SSEVector4f isnan(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:180
int roundToInt(float value)
Integer round function (single precision) 
Definition: math.h:106
friend void stream(__m128i *dest, const SSEVector4i &value)
Save to dest without polluting the cache. 
Definition: ssevector.h:430
friend SSEVector4f cmpngt(const SSEVector4f &a, const SSEVector4f &b)
!(a > b) 
Definition: ssevector.h:235
friend SSEVector4i sll(const SSEVector4i &a, int count)
Shift left by count bits while shifting in zeros. 
Definition: ssevector.h:421
friend SSEVector4f operator>=(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:258
static SSEVector4i zero()
Definition: ssevector.h:311
friend void stream(float *dest, const SSEVector4f &value)
Save to dest without polluting the cache. 
Definition: ssevector.h:285
SSEVector4f(float f3, float f2, float f1, float f0)
Definition: ssevector.h:82