18 #ifndef __deal2__vectorization_h 19 #define __deal2__vectorization_h 21 #include <deal.II/base/config.h> 22 #include <deal.II/base/std_cxx1x/type_traits.h> 24 #include <deal.II/base/memory_consumption.h> 25 #include <deal.II/base/parallel.h> 43 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL == 2 // AVX 44 #include <immintrin.h> 45 #include <mm_malloc.h> 46 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2 47 #include <emmintrin.h> 48 #include <mm_malloc.h> 61 sqrt(const ::VectorizedArray<Number> &);
63 abs(const ::VectorizedArray<Number> &);
65 max(const ::VectorizedArray<Number> &, const ::VectorizedArray<Number> &);
67 min (const ::VectorizedArray<Number> &, const ::VectorizedArray<Number> &);
77 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL == 2 && defined(__AVX__) 90 static const unsigned int n_array_elements = 4;
97 operator = (
const double x)
99 data = _mm256_set1_pd(x);
107 operator [] (
const unsigned int comp)
110 return *(
reinterpret_cast<double *
>(&data)+comp);
117 operator [] (
const unsigned int comp)
const 120 return *(
reinterpret_cast<const double *
>(&data)+comp);
134 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 137 data = _mm256_add_pd(data,vec.data);
148 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 151 data = _mm256_sub_pd(data,vec.data);
161 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 164 data = _mm256_mul_pd(data,vec.data);
175 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 178 data = _mm256_div_pd(data,vec.data);
199 res.data = _mm256_sqrt_pd(data);
215 __m256d mask = _mm256_set1_pd (-0.);
217 res.data = _mm256_andnot_pd(mask, data);
230 res.data = _mm256_max_pd (data, other.data);
243 res.data = _mm256_min_pd (data, other.data);
273 static const unsigned int n_array_elements = 8;
280 operator = (
const float x)
282 data = _mm256_set1_ps(x);
290 operator [] (
const unsigned int comp)
293 return *(
reinterpret_cast<float *
>(&data)+comp);
300 operator [] (
const unsigned int comp)
const 303 return *(
reinterpret_cast<const float *
>(&data)+comp);
312 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 315 data = _mm256_add_ps(data,vec.data);
326 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 329 data = _mm256_sub_ps(data,vec.data);
339 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 342 data = _mm256_mul_ps(data,vec.data);
353 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 356 data = _mm256_div_ps(data,vec.data);
378 res.data = _mm256_sqrt_ps(data);
393 __m256 mask = _mm256_set1_ps (-0.f);
395 res.data = _mm256_andnot_ps(mask, data);
408 res.data = _mm256_max_ps (data, other.data);
421 res.data = _mm256_min_ps (data, other.data);
443 #elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__) 458 static const unsigned int n_array_elements = 2;
466 operator = (
const double x)
468 data = _mm_set1_pd(x);
476 operator [] (
const unsigned int comp)
479 return *(
reinterpret_cast<double *
>(&data)+comp);
486 operator [] (
const unsigned int comp)
const 489 return *(
reinterpret_cast<const double *
>(&data)+comp);
498 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 501 data = _mm_add_pd(data,vec.data);
512 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 515 data = _mm_sub_pd(data,vec.data);
525 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 528 data = _mm_mul_pd(data,vec.data);
539 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 542 data = _mm_div_pd(data,vec.data);
563 res.data = _mm_sqrt_pd(data);
579 __m128d mask = _mm_set1_pd (-0.);
581 res.data = _mm_andnot_pd(mask, data);
594 res.data = _mm_max_pd (data, other.data);
607 res.data = _mm_min_pd (data, other.data);
637 static const unsigned int n_array_elements = 4;
645 operator = (
const float x)
647 data = _mm_set1_ps(x);
655 operator [] (
const unsigned int comp)
658 return *(
reinterpret_cast<float *
>(&data)+comp);
665 operator [] (
const unsigned int comp)
const 668 return *(
reinterpret_cast<const float *
>(&data)+comp);
677 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 680 data = _mm_add_ps(data,vec.data);
691 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 694 data = _mm_sub_ps(data,vec.data);
704 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 707 data = _mm_mul_ps(data,vec.data);
718 #ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS 721 data = _mm_div_ps(data,vec.data);
742 res.data = _mm_sqrt_ps(data);
758 __m128 mask = _mm_set1_ps (-0.f);
760 res.data = _mm_andnot_ps(mask, data);
773 res.data = _mm_max_ps (data, other.data);
786 res.data = _mm_min_ps (data, other.data);
804 #endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0 856 template <
typename Number>
864 static const unsigned int n_array_elements = 1;
877 operator = (
const Number scalar)
888 operator [] (
const unsigned int comp)
899 operator [] (
const unsigned int comp)
const 961 res.
data = std::sqrt(data);
974 res.
data = std::fabs(data);
987 res.
data = std::max (data, other.
data);
1000 res.
data = std::min (data, other.
data);
1023 template <
typename Number>
1038 template <
typename Number>
1053 template <
typename Number>
1068 template <
typename Number>
1083 template <
typename Number>
1099 template <
typename Number>
1102 operator + (
const Number &u,
1120 operator + (
const double &u,
1134 template <
typename Number>
1165 template <
typename Number>
1168 operator - (
const Number &u,
1186 operator - (
const double &u,
1200 template <
typename Number>
1235 template <
typename Number>
1238 operator * (
const Number &u,
1256 operator * (
const double &u,
1270 template <
typename Number>
1301 template <
typename Number>
1304 operator / (
const Number &u,
1322 operator / (
const double &u,
1336 template <
typename Number>
1370 template <
typename Number>
1383 template <
typename Number>
1417 template <
typename T>
1420 static const std::size_t minimum_parallel_grain_size = 160000/
sizeof(T)+1;
1434 bool copy_only =
false)
1436 source_ (source_begin),
1437 destination_ (destination),
1438 copy_only_ (copy_only)
1441 const std::size_t size = source_end - source_begin;
1442 if (size < minimum_parallel_grain_size)
1443 apply_to_subrange (0, size);
1445 apply_parallel (0, size, minimum_parallel_grain_size);
1454 const std::size_t end)
const 1458 if (std_cxx1x::is_trivial<T>::value ==
true)
1459 std::memcpy (destination_+begin, source_+begin, (end-begin)*
sizeof(T));
1460 else if (copy_only_ ==
false)
1461 for (std::size_t i=begin; i<end; ++i)
1464 new (&destination_[i]) T;
1465 destination_[i] = source_[i];
1469 for (std::size_t i=begin; i<end; ++i)
1471 new (&destination_[i]) T;
1472 destination_[i] = source_[i];
1479 const bool copy_only_;
1487 template <
typename T>
1490 static const std::size_t minimum_parallel_grain_size = 160000/
sizeof(T)+1;
1502 destination_ (destination),
1503 trivial_element (false)
1508 if (std_cxx1x::is_trivial<T>::value ==
true)
1510 const unsigned char zero [
sizeof(T)] = {};
1511 if (std::memcmp(zero, &element,
sizeof(T)) == 0)
1512 trivial_element =
true;
1514 if (size < minimum_parallel_grain_size)
1515 apply_to_subrange (0, size);
1517 apply_parallel (0, size, minimum_parallel_grain_size);
1527 const std::size_t end)
const 1531 if (std_cxx1x::is_trivial<T>::value ==
true && trivial_element)
1532 std::memset (destination_+begin, 0, (end-begin)*
sizeof(T));
1534 for (std::size_t i=begin; i<end; ++i)
1537 new (&destination_[i]) T;
1538 destination_[i] = element_;
1543 mutable T *destination_;
1544 bool trivial_element;
1562 template <
class T >
1579 typedef std::size_t size_type;
1649 _end_data = _data + size;
1661 const T &init = T())
1663 const size_type old_size = size();
1664 if (std_cxx1x::is_trivial<T>::value ==
false && size_in < old_size)
1667 while (_end_data != _data+size_in)
1668 (--_end_data)->~T();
1671 resize_fast (size_in);
1674 if (size_in > old_size)
1693 const size_type old_size = _end_data - _data;
1694 const size_type allocated_size = _end_allocated - _data;
1695 if (size_alloc > allocated_size)
1701 size_type new_size = size_alloc;
1702 if (size_alloc < (2 * allocated_size))
1703 new_size = 2 * allocated_size;
1705 const size_type size_actual_allocate = new_size *
sizeof(T);
1707 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0 1712 T *new_data =
static_cast<T *
>(_mm_malloc (size_actual_allocate,
1715 T *new_data =
static_cast<T *
>(malloc (size_actual_allocate));
1718 throw std::bad_alloc();
1724 std::swap (_data, new_data);
1725 _end_data = _data + old_size;
1726 _end_allocated = _data + new_size;
1727 if (_end_data != _data)
1731 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0 1738 else if (size_alloc == 0)
1752 if (std_cxx1x::is_trivial<T>::value ==
false)
1753 while (_end_data != _data)
1754 (--_end_data)->~T();
1756 #if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0 1777 if (_end_data == _end_allocated)
1778 reserve (std::max(2*capacity(),static_cast<size_type>(16)));
1779 if (std_cxx1x::is_trivial<T>::value ==
false)
1781 *_end_data++ = in_data;
1791 T *field = _end_data - 1;
1802 const T *field = _end_data - 1;
1810 template <
typename ForwardIterator>
1812 ForwardIterator end)
1814 const unsigned int old_size = size();
1815 reserve (old_size + (end-begin));
1816 for ( ; begin != end; ++begin, ++_end_data)
1818 if (std_cxx1x::is_trivial<T>::value ==
false)
1820 *_end_data = *begin;
1830 std::swap (_data, vec.
_data);
1840 return _end_data - _data;
1851 return _end_allocated - _data;
1859 operator [] (
const size_type index)
1862 return _data[index];
1869 const_reference operator [] (
const size_type index)
const 1872 return _data[index];
1919 size_type memory =
sizeof(
this);
1920 memory +=
sizeof(T) * capacity();
1943 DEAL_II_NAMESPACE_CLOSE
1961 template <
typename Number>
1963 ::VectorizedArray<Number>
1964 sin (const ::VectorizedArray<Number> &x)
1967 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
1968 sin_val[i] = std::sin(x[i]);
1981 template <
typename Number>
1983 ::VectorizedArray<Number>
1984 tan (const ::VectorizedArray<Number> &x)
1987 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
1988 tan_val[i] = std::tan(x[i]);
2000 template <
typename Number>
2002 ::VectorizedArray<Number>
2003 cos (const ::VectorizedArray<Number> &x)
2006 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2007 cos_val[i] = std::cos(x[i]);
2019 template <
typename Number>
2021 ::VectorizedArray<Number>
2022 exp (const ::VectorizedArray<Number> &x)
2025 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2026 exp_val[i] = std::exp(x[i]);
2038 template <
typename Number>
2040 ::VectorizedArray<Number>
2041 log (const ::VectorizedArray<Number> &x)
2044 for (
unsigned int i=0; i<::VectorizedArray<Number>::n_array_elements; ++i)
2045 log_val[i] = std::log(x[i]);
2058 template <
typename Number>
2060 ::VectorizedArray<Number>
2061 sqrt (const ::VectorizedArray<Number> &x)
2063 return x.get_sqrt();
2075 template <
typename Number>
2077 ::VectorizedArray<Number>
2078 abs (const ::VectorizedArray<Number> &x)
2092 template <
typename Number>
2094 ::VectorizedArray<Number>
2095 max (const ::VectorizedArray<Number> &x,
2096 const ::VectorizedArray<Number> &y)
2098 return x.get_max(y);
2110 template <
typename Number>
2112 ::VectorizedArray<Number>
2113 min (const ::VectorizedArray<Number> &x,
2114 const ::VectorizedArray<Number> &y)
2116 return x.get_min(y);
VectorizedArray< Number > log(const ::VectorizedArray< Number > &x)
virtual void apply_to_subrange(const std::size_t begin, const std::size_t end) const
size_type capacity() const
VectorizedArray< Number > make_vectorized_array(const Number &u)
VectorizedArray< Number > tan(const ::VectorizedArray< Number > &x)
#define AssertIndexRange(index, range)
size_type memory_consumption() const
void reserve(const size_type size_alloc)
VectorizedArray< Number > exp(const ::VectorizedArray< Number > &x)
VectorizedArray get_sqrt() const
void push_back(const T in_data)
virtual void apply_to_subrange(const std::size_t begin, const std::size_t end) const
AlignedVector(const size_type size)
VectorizedArray get_min(const VectorizedArray &other) const
VectorizedArray get_abs() const
const_reference back() const
VectorizedArray< Number > min(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
#define Assert(cond, exc)
void resize(const size_type size_in, const T &init=T())
void insert_back(ForwardIterator begin, ForwardIterator end)
void swap(AlignedVector< T > &vec)
VectorizedArray< Number > sqrt(const ::VectorizedArray< Number > &x)
VectorizedArray< Number > sin(const ::VectorizedArray< Number > &x)
void resize_fast(const size_type size)
VectorizedArray get_max(const VectorizedArray &other) const
const_iterator end() const
AlignedVectorMove(T *source_begin, T *source_end, T *destination, bool copy_only=false)
const_iterator begin() const
AlignedVector(const AlignedVector< T > &vec)
::ExceptionBase & ExcInternalError()
VectorizedArray< Number > abs(const ::VectorizedArray< Number > &x)
AlignedVectorSet(const std::size_t size, const T &element, T *destination)
VectorizedArray< Number > max(const ::VectorizedArray< Number > &x, const ::VectorizedArray< Number > &y)
VectorizedArray< Number > cos(const ::VectorizedArray< Number > &x)