Mitsuba Renderer  0.5.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ssemath.h
Go to the documentation of this file.
1 /*
2  This file is part of Mitsuba, a physically based rendering system.
3 
4  Copyright (c) 2007-2014 by Wenzel Jakob and others.
5 
6  Mitsuba is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License Version 3
8  as published by the Free Software Foundation.
9 
10  Mitsuba is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License
16  along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #pragma once
20 #if !defined(__MITSUBA_CORE_SSEMATH_H_)
21 #define __MITSUBA_CORE_SSEMATH_H_
22 
23 #ifdef MTS_SSE
24 
25 #include <mitsuba/core/sse.h>
26 
28 
29 namespace math {
30  /**
31  * \brief SIMD (SSE2) implementation of \c log
32  * \author Julien Pommier
33  */
34  extern MTS_EXPORT_CORE __m128 log_ps(__m128 x);
35 
36  /**
37  * \brief SIMD (SSE2) implementation of \c exp
38  * \author Julien Pommier
39  */
40  extern MTS_EXPORT_CORE __m128 exp_ps(__m128 x);
41 
42  /**
43  * \brief SIMD (SSE2) implementation of \c sin
44  * \author Julien Pommier
45  */
46  extern MTS_EXPORT_CORE __m128 sin_ps(__m128 x);
47 
48  /**
49  * \brief SIMD (SSE2) implementation of \c cos
50  * \author Julien Pommier
51  */
52  extern MTS_EXPORT_CORE __m128 cos_ps(__m128 x);
53 
54  /**
55  * \brief SIMD (SSE2) implementation which simultaneously
56  * computes the sine and cosine of a given value
57  * \author Julien Pommier
58  */
59  extern MTS_EXPORT_CORE void sincos_ps(__m128 x, __m128* s, __m128* c);
60 
61  /**
62  * \brief Fast SIMD (SSE2) approximation of \c log
63  * which provides about 10-11 mantissa bits.
64  * Inspired by the Intel Approximate Math Library.
65  */
66  extern MTS_EXPORT_CORE __m128 fastlog_ps(__m128 x);
67 
68  /**
69  * \brief Fast SIMD (SSE2) approximation of \c pow
70  * which provides about 10-11 mantissa bits.
71  * Inspired by the Intel Approximate Math Library.
72  */
73  extern MTS_EXPORT_CORE __m128 fastpow_ps(__m128 x, __m128 y);
74 
75  /**
76  * \brief The arguments <tt>row0</tt>, <tt>row1</tt>, <tt>row2</tt> and
77  * <tt>row3</tt> are \c __m128 values whose elements form the corresponding
78  * rows of a 4-by-4 matrix. The matrix transposition is returned in
79  * arguments <tt>row0</tt>, <tt>row1</tt>, <tt>row2</tt> and <tt>row3</tt>
80  * where \c row0 now holds column 0 of the original matrix, \c row1 now
81  * holds column 1 of the original matrix, and so on.
82  * \author Intel Intrinsics Guide for AVX2
83  */
84  FINLINE void transpose_ps(__m128& row0, __m128& row1,
85  __m128& row2, __m128& row3) {
86  __m128 tmp3, tmp2, tmp1, tmp0;
87  tmp0 = _mm_unpacklo_ps(row0, row1);
88  tmp2 = _mm_unpacklo_ps(row2, row3);
89  tmp1 = _mm_unpackhi_ps(row0, row1);
90  tmp3 = _mm_unpackhi_ps(row2, row3);
91 
92  row0 = _mm_movelh_ps(tmp0, tmp2);
93  row1 = _mm_movehl_ps(tmp2, tmp0);
94  row2 = _mm_movelh_ps(tmp1, tmp3);
95  row3 = _mm_movehl_ps(tmp3, tmp1);
96  }
97 
98  /// Component-wise clamp: <tt>max(min(x, maxVal), minVal)</tt>
99  inline __m128 clamp_ps(__m128 x, __m128 minVal, __m128 maxVal) {
100  return _mm_max_ps(_mm_min_ps(x, maxVal), minVal);
101  }
102 
103  /// Sum of all elements in the vector
104  inline float hsum_ps(__m128 vec) {
105  __m128 tmp = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
106  __m128 sum_tmp = _mm_add_ps(vec, tmp);
107  tmp = _mm_shuffle_ps(sum_tmp, sum_tmp, _MM_SHUFFLE(2,3,0,1));
108  sum_tmp = _mm_add_ps(sum_tmp, tmp);
109  return _mm_cvtss_f32(sum_tmp);
110  }
111 
112  /// Maximum across all the elements of a vector
113  inline float hmax_ps(__m128 vec) {
114  __m128 tmp = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
115  __m128 tmp_max = _mm_max_ps(vec, tmp);
116  tmp = _mm_shuffle_ps(tmp_max, tmp_max, _MM_SHUFFLE(2,3,0,1));
117  tmp_max = _mm_max_ps(tmp_max, tmp);
118  return _mm_cvtss_f32(tmp_max);
119  }
120 
121  /// Minimum across all the elements of a vector
122  inline float hmin_ps(__m128 vec) {
123  __m128 tmp = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
124  __m128 tmp_min = _mm_min_ps(vec, tmp);
125  tmp = _mm_shuffle_ps(tmp_min, tmp_min, _MM_SHUFFLE(2,3,0,1));
126  tmp_min = _mm_min_ps(tmp_min, tmp);
127  return _mm_cvtss_f32(tmp_min);
128  }
129 };
130 
132 
133 #endif /* MTS_SSE */
134 
135 #endif /* __MITSUBA_CORE_SSEMATH_H_ */
#define MTS_EXPORT_CORE
Definition: getopt.h:29
#define MTS_NAMESPACE_BEGIN
Definition: platform.h:137
#define MTS_NAMESPACE_END
Definition: platform.h:138