api/ssemath_8h_source.html

 /*

     This file is part of Mitsuba, a physically based rendering system.


     Copyright (c) 2007-2014 by Wenzel Jakob and others.


     Mitsuba is free software; you can redistribute it and/or modify

     it under the terms of the GNU General Public License Version 3

     as published by the Free Software Foundation.


     Mitsuba is distributed in the hope that it will be useful,

     but WITHOUT ANY WARRANTY; without even the implied warranty of

     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

     GNU General Public License for more details.


     You should have received a copy of the GNU General Public License

     along with this program. If not, see <http://www.gnu.org/licenses/>.

 */


 #pragma once

 #if !defined(__MITSUBA_CORE_SSEMATH_H_)

 #define __MITSUBA_CORE_SSEMATH_H_


 #ifdef MTS_SSE


 #include <mitsuba/core/sse.h>


 MTS_NAMESPACE_BEGIN


 namespace math {

         /**
          * \brief SIMD (SSE2) implementation of \c log
          *
          * Only the prototype is part of this header; the definition is
          * not visible here.
          *
          * \param x Packed single-precision input vector
          * \return Packed single-precision result vector; presumably \c log
          *     evaluated independently on each lane of \c x (confirm against
          *     the implementation, including the handling of non-positive
          *     inputs)
          * \author Julien Pommier
          */
         extern MTS_EXPORT_CORE __m128 log_ps(__m128 x);


         /**
          * \brief SIMD (SSE2) implementation of \c exp
          *
          * Only the prototype is part of this header; the definition is
          * not visible here.
          *
          * \param x Packed single-precision input vector
          * \return Packed single-precision result vector; presumably \c exp
          *     evaluated independently on each lane of \c x (confirm against
          *     the implementation, including overflow/underflow behavior)
          * \author Julien Pommier
          */
         extern MTS_EXPORT_CORE __m128 exp_ps(__m128 x);


         /**
          * \brief SIMD (SSE2) implementation of \c sin
          *
          * Only the prototype is part of this header; the definition is
          * not visible here.
          *
          * \param x Packed single-precision input vector (presumably angles
          *     in radians, as for the scalar \c sin -- confirm)
          * \return Packed single-precision result vector; presumably \c sin
          *     evaluated independently on each lane of \c x
          * \author Julien Pommier
          */
         extern MTS_EXPORT_CORE __m128 sin_ps(__m128 x);


         /**
          * \brief SIMD (SSE2) implementation of \c cos
          *
          * Only the prototype is part of this header; the definition is
          * not visible here.
          *
          * \param x Packed single-precision input vector (presumably angles
          *     in radians, as for the scalar \c cos -- confirm)
          * \return Packed single-precision result vector; presumably \c cos
          *     evaluated independently on each lane of \c x
          * \author Julien Pommier
          */
         extern MTS_EXPORT_CORE __m128 cos_ps(__m128 x);


         /**
          * \brief SIMD (SSE2) implementation which simultaneously
          * computes the sine and cosine of a given value
          *
          * Only the prototype is part of this header; the definition is
          * not visible here. The function returns nothing directly; results
          * are delivered through the two pointer arguments.
          *
          * \param x Packed single-precision input vector
          * \param s Output pointer; judging by its name it receives the
          *     sine of \c x (confirm against the implementation)
          * \param c Output pointer; judging by its name it receives the
          *     cosine of \c x (confirm against the implementation)
          * \author Julien Pommier
          */
         extern MTS_EXPORT_CORE void sincos_ps(__m128 x, __m128* s, __m128* c);


         /**
          * \brief Fast SIMD (SSE2) approximation of \c log
          * which provides about 10-11 mantissa bits.
          * Inspired by the Intel Approximate Math Library.
          *
          * Only the prototype is part of this header; the definition is
          * not visible here.
          *
          * \param x Packed single-precision input vector
          * \return Packed single-precision result vector; presumably the
          *     approximate \c log of each lane of \c x (confirm against the
          *     implementation)
          */
         extern MTS_EXPORT_CORE __m128 fastlog_ps(__m128 x);


         /**
          * \brief Fast SIMD (SSE2) approximation of \c pow
          * which provides about 10-11 mantissa bits.
          * Inspired by the Intel Approximate Math Library.
          *
          * Only the prototype is part of this header; the definition is
          * not visible here.
          *
          * \param x Packed single-precision vector; presumably the bases,
          *     by analogy with the scalar \c pow(x, y) -- confirm
          * \param y Packed single-precision vector; presumably the
          *     exponents -- confirm
          * \return Packed single-precision result vector; presumably the
          *     approximate lane-wise power (confirm against the
          *     implementation)
          */
         extern MTS_EXPORT_CORE __m128 fastpow_ps(__m128 x, __m128 y);


         /**
          * \brief In-place transpose of a 4-by-4 single-precision matrix
          *
          * On entry, <tt>row0</tt> ... <tt>row3</tt> hold the four rows of the
          * matrix. On return, the same four arguments hold the transposed
          * matrix: \c row0 contains what used to be column 0, \c row1 what
          * used to be column 1, and so on.
          *
          * \param row0 Row 0 on entry, column 0 of the original on return
          * \param row1 Row 1 on entry, column 1 of the original on return
          * \param row2 Row 2 on entry, column 2 of the original on return
          * \param row3 Row 3 on entry, column 3 of the original on return
          * \author Intel Intrinsics Guide for AVX2
          */
         FINLINE void transpose_ps(__m128& row0, __m128& row1,
                 __m128& row2, __m128& row3) {
                 // Stage 1: interleave the rows pairwise. All four inputs are
                 // consumed here, before any of them is overwritten.
                 //   lo01 = (r0[0], r1[0], r0[1], r1[1])
                 //   hi01 = (r0[2], r1[2], r0[3], r1[3])   (likewise for rows 2/3)
                 const __m128 lo01 = _mm_unpacklo_ps(row0, row1);
                 const __m128 lo23 = _mm_unpacklo_ps(row2, row3);
                 const __m128 hi01 = _mm_unpackhi_ps(row0, row1);
                 const __m128 hi23 = _mm_unpackhi_ps(row2, row3);

                 // Stage 2: stitch matching 64-bit halves together so that each
                 // output collects one element from every original row.
                 row0 = _mm_movelh_ps(lo01, lo23);
                 row1 = _mm_movehl_ps(lo23, lo01);
                 row2 = _mm_movelh_ps(hi01, hi23);
                 row3 = _mm_movehl_ps(hi23, hi01);
         }


         /**
          * \brief Component-wise clamp: <tt>max(min(x, maxVal), minVal)</tt>
          *
          * The upper bound is applied first and the lower bound last, so for
          * ordinary (non-NaN) inputs the lower bound wins should the two
          * bounds cross in some lane.
          *
          * \param x Values to be clamped
          * \param minVal Per-lane lower bound
          * \param maxVal Per-lane upper bound
          * \return The clamped vector
          */
         inline __m128 clamp_ps(__m128 x, __m128 minVal, __m128 maxVal) {
                 const __m128 cappedAbove = _mm_min_ps(x, maxVal);
                 return _mm_max_ps(cappedAbove, minVal);
         }


         /**
          * \brief Sum of all elements in the vector
          *
          * Performs two shuffle-and-add steps; with lanes named
          * <tt>v0..v3</tt> the returned value is evaluated as
          * <tt>(v0 + v2) + (v1 + v3)</tt>.
          *
          * \param vec Vector whose four lanes are to be added up
          * \return The horizontal sum, taken from lane 0 of the reduction
          */
         inline float hsum_ps(__m128 vec) {
                 // Swap the 64-bit halves so lanes 0/1 meet lanes 2/3
                 const __m128 halvesSwapped = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
                 const __m128 pairSums = _mm_add_ps(vec, halvesSwapped);

                 // Swap neighbouring lanes so lane 0 meets the other partial sum
                 const __m128 neighboursSwapped =
                         _mm_shuffle_ps(pairSums, pairSums, _MM_SHUFFLE(2,3,0,1));
                 return _mm_cvtss_f32(_mm_add_ps(pairSums, neighboursSwapped));
         }


         /**
          * \brief Maximum across all the elements of a vector
          *
          * Reduces the four lanes with two shuffle-and-max steps.
          *
          * \param vec Vector whose four lanes are compared
          * \return The horizontal maximum, taken from lane 0 of the reduction
          */
         inline float hmax_ps(__m128 vec) {
                 // Swap the 64-bit halves so lanes 0/1 meet lanes 2/3
                 const __m128 halvesSwapped = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
                 const __m128 pairMax = _mm_max_ps(vec, halvesSwapped);

                 // Swap neighbouring lanes so lane 0 meets the other candidate
                 const __m128 neighboursSwapped =
                         _mm_shuffle_ps(pairMax, pairMax, _MM_SHUFFLE(2,3,0,1));
                 return _mm_cvtss_f32(_mm_max_ps(pairMax, neighboursSwapped));
         }


         /**
          * \brief Minimum across all the elements of a vector
          *
          * Reduces the four lanes with two shuffle-and-min steps.
          *
          * \param vec Vector whose four lanes are compared
          * \return The horizontal minimum, taken from lane 0 of the reduction
          */
         inline float hmin_ps(__m128 vec) {
                 // Swap the 64-bit halves so lanes 0/1 meet lanes 2/3
                 const __m128 halvesSwapped = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1,0,3,2));
                 const __m128 pairMin = _mm_min_ps(vec, halvesSwapped);

                 // Swap neighbouring lanes so lane 0 meets the other candidate
                 const __m128 neighboursSwapped =
                         _mm_shuffle_ps(pairMin, pairMin, _MM_SHUFFLE(2,3,0,1));
                 return _mm_cvtss_f32(_mm_min_ps(pairMin, neighboursSwapped));
         }

 };


 MTS_NAMESPACE_END


 #endif /* MTS_SSE */


 #endif /* __MITSUBA_CORE_SSEMATH_H_ */

sse.h

MTS_EXPORT_CORE
#define MTS_EXPORT_CORE
Definition: getopt.h:29

MTS_NAMESPACE_BEGIN
#define MTS_NAMESPACE_BEGIN
Definition: platform.h:137

MTS_NAMESPACE_END
#define MTS_NAMESPACE_END
Definition: platform.h:138