Grok  7.6.0
simd.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2016-2020 Grok Image Compression Inc.
3  *
4  * This source code is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU Affero General Public License, version 3,
6  * as published by the Free Software Foundation.
7  *
8  * This source code is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU Affero General Public License for more details.
12  *
13  * You should have received a copy of the GNU Affero General Public License
14  * along with this program. If not, see <http://www.gnu.org/licenses/>.
15  *
16  *
17  * This source code incorporates work covered by the BSD 2-clause license.
18  * Please see the LICENSE file in the root directory for details.
19  *
20  */
21 
22 #pragma once
23 
24 #define GRK_SKIP_POISON
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
37 
38 
/*
 * VREG_INT_COUNT: number of 32-bit integers processed per vector register.
 * It is 8 when the translation unit is compiled with AVX2 enabled and 4 in
 * every other configuration.
 */
#if !defined(__AVX2__)
#define VREG_INT_COUNT 4
#else
#define VREG_INT_COUNT 8
#endif
46 
47 
48 
#if defined(__SSE2__) || defined(__AVX2__)

/*
 * Convenience macros to improve the readability of the formulas.
 *
 * The same macro names are mapped onto 256-bit AVX2 intrinsics when AVX2 is
 * enabled, and onto 128-bit SSE intrinsics otherwise, so code written against
 * them is independent of the vector width.
 *
 *   VREG / VREGF      integer / single-precision float vector type
 *   LOAD_CST(x)       broadcast a 32-bit integer to every lane
 *   LOAD_CST_F(x)     broadcast a float to every lane
 *   LOAD / STORE      integer load/store; the address must be aligned to the
 *                     vector size
 *   LOADU / STOREU    integer load/store without alignment requirement
 *   LOADF / STOREF    float load/store; the address must be aligned to the
 *                     vector size
 *   ADD, SUB, MUL     lane-wise 32-bit integer add, subtract, and multiply
 *                     (MUL keeps the low 32 bits of each product)
 *   SAR(x,y)          lane-wise arithmetic right shift of x by y bits
 *   ADDF, SUBF, MULF  lane-wise float add, subtract, multiply
 *   ADD3(x,y,z)       lane-wise x + y + z on 32-bit integers
 */
#ifdef __AVX2__
#define VREG __m256i
#define LOAD_CST(x) _mm256_set1_epi32(x)
#define LOAD(x) _mm256_load_si256((const VREG*)(x))
#define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
#define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
#define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
#define ADD(x,y) _mm256_add_epi32((x),(y))
#define SUB(x,y) _mm256_sub_epi32((x),(y))
#define SAR(x,y) _mm256_srai_epi32((x),(y))
#define MUL(x,y) _mm256_mullo_epi32((x),(y))
#define VREGF __m256
#define LOADF(x) _mm256_load_ps((float const*)(x))
#define LOAD_CST_F(x) _mm256_set1_ps(x)
#define ADDF(x,y) _mm256_add_ps((x),(y))
#define MULF(x,y) _mm256_mul_ps((x),(y))
#define SUBF(x,y) _mm256_sub_ps((x),(y))
#define STOREF(x,y) _mm256_store_ps((float*)(x),(y))
#else
#define VREG __m128i
#define LOAD_CST(x) _mm_set1_epi32(x)
#define LOAD(x) _mm_load_si128((const VREG*)(x))
#define LOADU(x) _mm_loadu_si128((const VREG*)(x))
#define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
#define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
#define ADD(x,y) _mm_add_epi32((x),(y))
#define SUB(x,y) _mm_sub_epi32((x),(y))
#ifdef __SSE4_1__
/* Native 32-bit low multiply; the instruction exists from SSE4.1 onwards. */
#define MUL(x,y) _mm_mullo_epi32((x),(y))
#else
/*
 * SSE2-only replacement for _mm_mullo_epi32: returns, for each of the four
 * 32-bit lanes, the low 32 bits of a[i] * b[i].
 *
 * _mm_mul_epu32 only multiplies lanes 0 and 2 (producing two 64-bit
 * products), so the odd lanes are handled by first shifting both operands
 * down by one lane. The low halves of the four products are then gathered
 * back into lane order. Using the unsigned multiply is fine because the low
 * 32 bits of a product do not depend on the signedness of the operands.
 */
static inline __m128i grk_mullo_epi32_sse2(__m128i a, __m128i b)
{
	/* products of lanes 0 and 2 */
	const __m128i even = _mm_mul_epu32(a, b);
	/* products of lanes 1 and 3 */
	const __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4),
	                                  _mm_srli_si128(b, 4));
	/* move the low 32 bits of each 64-bit product into lanes 0 and 1 */
	const __m128i even_lo = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i odd_lo = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));
	/* interleave: even0, odd0, even1, odd1 == lanes 0, 1, 2, 3 */
	return _mm_unpacklo_epi32(even_lo, odd_lo);
}
#define MUL(x,y) grk_mullo_epi32_sse2((x),(y))
#endif
#define SAR(x,y) _mm_srai_epi32((x),(y))
#define VREGF __m128
#define LOADF(x) _mm_load_ps((float const*)(x))
#define LOAD_CST_F(x) _mm_set1_ps(x)
#define ADDF(x,y) _mm_add_ps((x),(y))
#define MULF(x,y) _mm_mul_ps((x),(y))
#define SUBF(x,y) _mm_sub_ps((x),(y))
#define STOREF(x,y) _mm_store_ps((float*)(x),(y))
#endif

#define ADD3(x,y,z) ADD(ADD((x),(y)),(z))

#endif