46 #if !defined(__MITSUBA_CORE_SSEVECTOR_H_)
47 #define __MITSUBA_CORE_SSEVECTOR_H_
53 # error "This header requires SSE support"
65 template <
int idx3,
int idx2,
int idx1,
int idx0>
66 SSEVector4f
shuffle(
const SSEVector4f& low,
const SSEVector4f& hi);
68 template <
int idx3,
int idx2,
int idx1,
int idx0>
69 SSEVector4f
shuffle(
const SSEVector4f& a);
83 xmm(_mm_set_ps(f3, f2, f1, f0))
87 xmm = _mm_set1_ps(val);
92 return _mm_setzero_ps();
95 operator __m128()
const {
100 return _mm_and_ps(a.xmm, b.xmm);
103 return _mm_or_ps(a.xmm, b.xmm);
106 return _mm_xor_ps(a.xmm, b.xmm);
110 return _mm_andnot_ps(a.xmm, b.xmm);
114 xmm = _mm_and_ps(xmm, a.xmm);
118 xmm = _mm_or_ps(xmm, a.xmm);
122 xmm = _mm_xor_ps(xmm, a.xmm);
127 return _mm_add_ps(a.xmm, b.xmm);
130 return _mm_sub_ps(a.xmm, b.xmm);
133 return _mm_mul_ps(a.xmm, b.xmm);
136 return _mm_div_ps(a.xmm, b.xmm);
140 xmm = _mm_add_ps(xmm, a.xmm);
144 xmm = _mm_sub_ps(xmm, a.xmm);
148 xmm = _mm_mul_ps(xmm, a.xmm);
152 xmm = _mm_div_ps(xmm, a.xmm);
161 __m128 x0 = _mm_rcp_ps(v.xmm);
162 return _mm_sub_ps(_mm_add_ps(x0,x0),
163 _mm_mul_ps(_mm_mul_ps(x0,v.xmm), x0));
167 return _mm_rcp_ps(v.xmm);
171 return _mm_min_ps(a.xmm, b.xmm);
174 return _mm_max_ps(a.xmm, b.xmm);
178 return _mm_cmpunord_ps(a.xmm, a.xmm);
181 return _mm_cmpunord_ps(a.xmm, b.xmm);
191 template <
int idx3,
int idx2,
int idx1,
int idx0>
193 return _mm_shuffle_ps(low.xmm,hi.xmm,_MM_SHUFFLE(idx3,idx2,idx1,idx0));
197 template <
int idx3,
int idx2,
int idx1,
int idx0>
199 return _mm_shuffle_ps(a.xmm, a.xmm, _MM_SHUFFLE(idx3,idx2,idx1,idx0));
204 return _mm_cmpeq_ps(a.xmm, b.xmm);
208 return _mm_cmplt_ps(a.xmm, b.xmm);
212 return _mm_cmple_ps(a.xmm, b.xmm);
216 return _mm_cmpgt_ps(a.xmm, b.xmm);
220 return _mm_cmpge_ps(a.xmm, b.xmm);
224 return _mm_cmpneq_ps(a.xmm, b.xmm);
228 return _mm_cmpnlt_ps(a.xmm, b.xmm);
232 return _mm_cmpnle_ps(a.xmm, b.xmm);
236 return _mm_cmpngt_ps(a.xmm, b.xmm);
240 return _mm_cmpnge_ps(a.xmm, b.xmm);
267 return _mm_xor_ps(b.xmm, _mm_and_ps(mask.xmm, _mm_xor_ps(a.xmm, b.xmm)));
272 __m128i truncated = _mm_cvttps_epi32(a.xmm);
273 return _mm_cvtepi32_ps(truncated);
278 _mm_stream_ps(reinterpret_cast<float*>(dest), value.xmm);
282 _mm_stream_ps(reinterpret_cast<float*>(dest), value.xmm);
286 _mm_stream_ps(dest, value.xmm);
303 xmm(_mm_set_epi32(i3, i2, i1, i0))
307 xmm = _mm_set1_epi32(val);
312 return _mm_setzero_si128();
315 operator __m128i()
const {
320 return _mm_and_si128(a.xmm, b.xmm);
323 return _mm_or_si128(a.xmm, b.xmm);
326 return _mm_xor_si128(a.xmm, b.xmm);
330 return _mm_andnot_si128(a.xmm, b.xmm);
333 xmm = _mm_and_si128(xmm, a.xmm);
337 xmm = _mm_or_si128(xmm, a.xmm);
341 xmm = _mm_xor_si128(xmm, a.xmm);
346 return _mm_add_epi32(a.xmm, b.xmm);
349 return _mm_sub_epi32(a.xmm, b.xmm);
352 xmm = _mm_add_epi32(xmm, a.xmm);
356 xmm = _mm_sub_epi32(xmm, a.xmm);
362 const __m128i mask = _mm_cmpeq_epi32(xmm, _mm_setzero_si128());
363 return _mm_movemask_epi8(mask) == 0xFFFF;
368 return _mm_cmpeq_epi32(a.xmm, b.xmm);
372 return _mm_cmplt_epi32(a.xmm, b.xmm);
376 return _mm_cmpgt_epi32(a.xmm, b.xmm);
393 return _mm_xor_si128(b.xmm,
394 _mm_and_si128(mask.xmm, _mm_xor_si128(a.xmm, b.xmm)));
397 template <
int32_t i3,
int32_t i2,
int32_t i1,
int32_t i0>
402 } u = {{i0, i1, i2, i3}};
406 template <
int32_t value>
411 } u = {{value, value, value, value}};
417 return _mm_srli_epi32(a.xmm, count);
422 return _mm_slli_epi32(a.xmm, count);
427 _mm_stream_si128(&(dest->xmm), value);
431 _mm_stream_si128(dest, value);
437 return _mm_castps_si128(a);
441 return _mm_cvttps_epi32(a);
445 return _mm_cvtps_epi32(a);
450 return _mm_castsi128_ps(a);
454 return _mm_cvtepi32_ps(a);
468 __m128 tmp3, tmp2, tmp1, tmp0;
469 tmp0 = _mm_unpacklo_ps(row0, row1);
470 tmp2 = _mm_unpacklo_ps(row2, row3);
471 tmp1 = _mm_unpackhi_ps(row0, row1);
472 tmp3 = _mm_unpackhi_ps(row2, row3);
474 row0 = _mm_movelh_ps(tmp0, tmp2);
475 row1 = _mm_movehl_ps(tmp2, tmp0);
476 row2 = _mm_movelh_ps(tmp1, tmp3);
477 row3 = _mm_movehl_ps(tmp3, tmp1);
SSEVector4i(int32_t i3, int32_t i2, int32_t i1, int32_t i0)
Definition: ssevector.h:302
SSEVector4i(int32_t val)
Definition: ssevector.h:301
friend SSEVector4f cmple(const SSEVector4f &a, const SSEVector4f &b)
a <= b
Definition: ssevector.h:211
SSEVector4i castAsInt(const SSEVector4f &a)
Reinterprets a as a SSEVector4i.
Definition: ssevector.h:436
friend SSEVector4f select(const SSEVector4f &mask, const SSEVector4f &a, const SSEVector4f &b)
Select/blend operation (mask) ? a : b
Definition: ssevector.h:263
bool isZero() const
Test if all elements are zero.
Definition: ssevector.h:361
SSEVector4f castAsFloat(const SSEVector4i &a)
Reinterprets a as a SSEVector4f.
Definition: ssevector.h:449
friend SSEVector4f max(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:173
Matrix< M1, N2, T > operator*(const Matrix< M1, N1, T > &mat1, const Matrix< M2, N2, T > &mat2)
Matrix multiplication (creates a temporary)
Definition: matrix.h:745
friend SSEVector4f cmpnlt(const SSEVector4f &a, const SSEVector4f &b)
!(a < b)
Definition: ssevector.h:227
SSEVector4i()
Definition: ssevector.h:298
friend void stream(SSEVector4i *dest, const SSEVector4i &value)
Save to dest without polluting the cache.
Definition: ssevector.h:426
friend SSEVector4i select(const SSEVector4i &mask, const SSEVector4i &a, const SSEVector4i &b)
Select/blend: (mask) ? a : b
Definition: ssevector.h:389
friend SSEVector4i cmpeq(const SSEVector4i &a, const SSEVector4i &b)
a == b
Definition: ssevector.h:367
friend SSEVector4i operator>(const SSEVector4i &a, const SSEVector4i &b)
Definition: ssevector.h:384
SSEVector4f(float val)
Definition: ssevector.h:81
friend SSEVector4f cmpnle(const SSEVector4f &a, const SSEVector4f &b)
!(a <= b)
Definition: ssevector.h:231
Definition: ssevector.h:72
friend SSEVector4f operator<(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:249
SSEVector4i(__m128i val)
Definition: ssevector.h:300
friend void stream(__m128 *dest, const SSEVector4f &value)
Save to dest without polluting the cache.
Definition: ssevector.h:281
friend SSEVector4f cmpge(const SSEVector4f &a, const SSEVector4f &b)
a >= b
Definition: ssevector.h:219
Definition: ssevector.h:292
static const __m128i & constant()
Definition: ssevector.h:398
friend SSEVector4f operator==(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:243
friend SSEVector4i operator==(const SSEVector4i &a, const SSEVector4i &b)
Definition: ssevector.h:378
FINLINE void transpose(SSEVector4f &row0, SSEVector4f &row1, SSEVector4f &row2, SSEVector4f &row3)
The arguments row0, row1, row2 and row3 are __m128 values whose elements form the corresponding rows of a 4-by-4 matrix. The transposed matrix is written back into row0, row1, row2 and row3.
Definition: ssevector.h:466
SSEVector4f toFloat(const SSEVector4i &a)
Convert a to floating point.
Definition: ssevector.h:453
friend SSEVector4f rcp(const SSEVector4f &v)
Definition: ssevector.h:166
friend SSEVector4f cmpeq(const SSEVector4f &a, const SSEVector4f &b)
a == b
Definition: ssevector.h:203
friend SSEVector4f operator!=(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:246
friend SSEVector4f rcp_nr(const SSEVector4f &v)
Newton-Raphson reciprocal: $\frac{1}{v} \approx 2\,\mathrm{rcp}(v) - v \cdot \mathrm{rcp}(v)^2$.
Definition: ssevector.h:160
friend SSEVector4f cmplt(const SSEVector4f &a, const SSEVector4f &b)
a < b
Definition: ssevector.h:207
friend SSEVector4f operator>(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:255
SSEVector4i(const SSEVector4i &val)
Definition: ssevector.h:299
friend SSEVector4f min(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:170
friend SSEVector4f cmpnge(const SSEVector4f &a, const SSEVector4f &b)
!(a >= b)
Definition: ssevector.h:239
SSEVector4f shuffle(const SSEVector4f &a)
Definition: ssevector.h:198
friend void stream(SSEVector4f *dest, const SSEVector4f &value)
Save to dest without polluting the cache.
Definition: ssevector.h:277
SSEVector4i toInt(const SSEVector4f &a)
Convert a to integer using truncation.
Definition: ssevector.h:440
friend SSEVector4f cmpgt(const SSEVector4f &a, const SSEVector4f &b)
a > b
Definition: ssevector.h:215
SSEVector4f(const SSEVector4f &other)
Definition: ssevector.h:79
friend SSEVector4f roundTruncate(const SSEVector4f &a)
Round a towards zero.
Definition: ssevector.h:271
friend SSEVector4f cmpneq(const SSEVector4f &a, const SSEVector4f &b)
a != b
Definition: ssevector.h:223
SSEVector4f()
Definition: ssevector.h:78
friend SSEVector4f andnot(const SSEVector4f &a, const SSEVector4f &b)
~a & b
Definition: ssevector.h:109
friend SSEVector4i cmplt(const SSEVector4i &a, const SSEVector4i &b)
a < b
Definition: ssevector.h:371
friend SSEVector4f operator<=(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:252
SSEVector4f(__m128 val)
Definition: ssevector.h:80
friend SSEVector4i operator<(const SSEVector4i &a, const SSEVector4i &b)
Definition: ssevector.h:381
friend SSEVector4i srl(const SSEVector4i &a, int count)
Shift right by count bits while shifting in zeros.
Definition: ssevector.h:416
static const __m128i & constant()
Definition: ssevector.h:407
friend SSEVector4i andnot(const SSEVector4i &a, const SSEVector4i &b)
~a & b
Definition: ssevector.h:329
static SSEVector4f zero()
Definition: ssevector.h:91
friend SSEVector4i cmpgt(const SSEVector4i &a, const SSEVector4i &b)
a > b
Definition: ssevector.h:375
friend SSEVector4f isnan(const SSEVector4f &a)
Definition: ssevector.h:177
friend SSEVector4f isnan(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:180
int roundToInt(float value)
Integer round function (single precision)
Definition: math.h:106
friend void stream(__m128i *dest, const SSEVector4i &value)
Save to dest without polluting the cache.
Definition: ssevector.h:430
friend SSEVector4f cmpngt(const SSEVector4f &a, const SSEVector4f &b)
!(a > b)
Definition: ssevector.h:235
friend SSEVector4i sll(const SSEVector4i &a, int count)
Shift left by count bits while shifting in zeros.
Definition: ssevector.h:421
friend SSEVector4f operator>=(const SSEVector4f &a, const SSEVector4f &b)
Definition: ssevector.h:258
static SSEVector4i zero()
Definition: ssevector.h:311
friend void stream(float *dest, const SSEVector4f &value)
Save to dest without polluting the cache.
Definition: ssevector.h:285
SSEVector4f(float f3, float f2, float f1, float f0)
Definition: ssevector.h:82