My little initializer script

This commit is contained in:
2024-09-17 23:08:59 -03:00
commit 1305c4d625
243 changed files with 300988 additions and 0 deletions

View File

@@ -0,0 +1,134 @@
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  /* BZHI: zero the bits of __X at position __Y and above.  Only the low
     byte of __Y is the index, and an index >= 32 returns __X unchanged.
     The previous shift-based form ((__X << (32 - __Y)) >> (32 - __Y))
     invoked undefined behavior for __Y == 0 and for __Y > 32 (shift
     count of 32 or more).  */
  unsigned int __idx = __Y & 0xFF;
  if (__idx >= 32)
    return __X;
  return __X & ((1U << __idx) - 1U);
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  /* MULX: full 32x32 -> 64-bit product.  The high half is stored
     through __P and the low half is the return value.  */
  unsigned long long __prod = (unsigned long long)__X * (unsigned long long)__Y;
  *__P = (unsigned int)(__prod >> 32);
  return (unsigned int)(__prod & 0xFFFFFFFFu);
}
#ifdef __PPC64__
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  /* BZHI: zero the bits of __X at position __Y and above.  Only the low
     byte of __Y is the index, and an index >= 64 returns __X unchanged.
     The previous shift-based form was undefined for __Y == 0 and for
     __Y > 64 (shift count of 64 or more).  */
  unsigned long long __idx = __Y & 0xFF;
  if (__idx >= 64)
    return __X;
  return __X & ((1ULL << __idx) - 1ULL);
}
/* __int128 requires base 64-bit. */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  /* MULX: full 64x64 -> 128-bit product.  The high half is stored
     through __P and the low half is the return value.  */
  unsigned __int128 __prod = (unsigned __int128)__X * __Y;
  *__P = (unsigned long long)(__prod >> 64);
  return (unsigned long long)(__prod & 0xFFFFFFFFFFFFFFFFULL);
}
#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum. */
/* PDEP: deposit the low-order bits of __X into the positions of the set
   bits of mask __M, scanning the mask from most- to least-significant
   set bit.  Scalar emulation of the x86 BMI2 instruction.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;
  /* The pop-count of the mask gives the number of the bits from
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl(__M);
  /* The loop is for the number of '1' bits in the mask: each iteration
     handles the current highest set mask bit and then clears it.  */
  while (__m != 0) {
    /* __c is the position (from the MSB) of the highest remaining
       mask bit; align the next source bit under it.  */
    __c = __builtin_clzl(__m);
    __t = __X << (__p - __c);
    __m ^= (__mask >> __c);
    __result |= (__t & (__mask >> __c));
    __p++;
  }
  return __result;
}
/* PEXT: extract the bits of __X selected by mask __M and pack them into
   the low-order bits of the result.  Scalar emulation of the x86 BMI2
   instruction, with a fast path using the POWER bpermd instruction when
   the mask is a compile-time constant selecting at most 8 bits.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;
  /* If the mask is constant and selects 8 bits or less we can use
     the Power8 Bit permute instruction.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* Also if the pext mask is constant, then the popcount is
       constant; we can evaluate the following loop at compile
       time and use a constant bit permute vector.  */
    long __i;
    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
      __c = __builtin_clzl(__m);
      __p = (__p << 8) | __c;
      __m ^= (__mask >> __c);
    }
    __result = __builtin_bpermd(__p, __X);
  } else {
    __p = 64 - __builtin_popcountl(__M);
    __result = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while
       loop avoids unrolling and the compiler commons the xor
       from clearing the mask bit with the (m != 0) test.  The
       result is a more compact loop setup and body.  */
    while (__m != 0) {
      unsigned long __t;
      /* Extract the bit under the highest remaining mask bit and
         shift it down to its packed position, then clear that
         mask bit.  */
      __c = __builtin_clzl(__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t);
      __p++;
    }
  }
  return __result;
}
/* these 32-bit implementations depend on 64-bit pdep/pext
which depend on _ARCH_PWR7. */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  /* Defer to the 64-bit implementation; the high mask bits are zero.  */
  unsigned long long __r = _pdep_u64(__X, __Y);
  return (unsigned int)__r;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  /* Defer to the 64-bit implementation; the high mask bits are zero.  */
  unsigned long long __r = _pext_u64(__X, __Y);
  return (unsigned int)__r;
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */
#endif /* BMI2INTRIN_H_ */

View File

@@ -0,0 +1,165 @@
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined X86GPRINTRIN_H_
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef BMIINTRIN_H_
#define BMIINTRIN_H_
extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u16(unsigned short __X) {
  /* TZCNT is defined for a zero source: it returns the operand width.
     __builtin_ctz(0) is undefined, so handle zero explicitly.  */
  return __X == 0 ? 16 : __builtin_ctz(__X);
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u32(unsigned int __X, unsigned int __Y) {
  /* ANDN: __Y with every bit that is set in __X cleared.  */
  unsigned int __notX = ~__X;
  return __notX & __Y;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
  /* BEXTR: extract __L bits of __X starting at bit position __P.
     A start position past the operand width or a zero length yields 0;
     a length reaching past bit 31 returns all bits from __P up.  The
     previous shift-based form was undefined for __L == 0 and for
     __L + __P > 32 (shift count of 32 or more).  */
  if (__P >= 32 || __L == 0)
    return 0;
  unsigned long long __bits = (unsigned long long)__X >> __P;
  if (__L < 32)
    __bits &= (1ULL << __L) - 1ULL;
  return (unsigned int)__bits;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u32(unsigned int __X, unsigned int __Y) {
  /* Packed-operand form: the start position is in bits 7:0 of __Y and
     the length is in bits 15:8.  */
  unsigned int __start = __Y & 0xFF;
  unsigned int __len = (__Y >> 8) & 0xFF;
  return _bextr_u32(__X, __start, __len);
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u32(unsigned int __X) {
  /* BLSI: isolate the lowest set bit, i.e. x & (~x + 1).  */
  return __X & (~__X + 1U);
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u32(unsigned int __X) {
  /* Intel-spelled alias for __blsi_u32.  */
  unsigned int __r = __blsi_u32(__X);
  return __r;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u32(unsigned int __X) {
  /* BLSMSK: mask up to and including the lowest set bit.  */
  unsigned int __below = __X - 1U;
  return __X ^ __below;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u32(unsigned int __X) {
  /* Intel-spelled alias for __blsmsk_u32.  */
  unsigned int __r = __blsmsk_u32(__X);
  return __r;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u32(unsigned int __X) {
  /* BLSR: clear the lowest set bit.  */
  unsigned int __below = __X - 1U;
  return __X & __below;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u32(unsigned int __X) {
  /* Intel-spelled alias for __blsr_u32.  */
  unsigned int __r = __blsr_u32(__X);
  return __r;
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u32(unsigned int __X) {
  /* TZCNT is defined for a zero source: it returns the operand width.
     __builtin_ctz(0) is undefined, so handle zero explicitly.  */
  return __X == 0 ? 32 : __builtin_ctz(__X);
}
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u32(unsigned int __X) {
  /* TZCNT is defined for a zero source: it returns the operand width.
     __builtin_ctz(0) is undefined, so handle zero explicitly.  */
  return __X == 0 ? 32 : __builtin_ctz(__X);
}
/* use the 64-bit shift, rotate, and count leading zeros instructions
for long long. */
#ifdef __PPC64__
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u64(unsigned long long __X, unsigned long long __Y) {
  /* ANDN: __Y with every bit that is set in __X cleared.  */
  unsigned long long __notX = ~__X;
  return __notX & __Y;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
  /* BEXTR: extract __L bits of __X starting at bit position __P.
     A start position past the operand width or a zero length yields 0;
     a length reaching past bit 63 returns all bits from __P up.  The
     previous shift-based form was undefined for __L == 0 and for
     __L + __P > 64 (shift count of 64 or more).  */
  if (__P >= 64 || __L == 0)
    return 0;
  unsigned long long __bits = __X >> __P;
  if (__L < 64)
    __bits &= (1ULL << __L) - 1ULL;
  return __bits;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u64(unsigned long long __X, unsigned long long __Y) {
  /* Packed-operand form: the start position is in bits 7:0 of __Y and
     the length is in bits 15:8.  */
  unsigned int __start = __Y & 0xFF;
  unsigned int __len = (__Y >> 8) & 0xFF;
  return _bextr_u64(__X, __start, __len);
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u64(unsigned long long __X) {
  /* BLSI: isolate the lowest set bit, i.e. x & (~x + 1).  */
  return __X & (~__X + 1ULL);
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u64(unsigned long long __X) {
  /* Intel-spelled alias for __blsi_u64.  */
  unsigned long long __r = __blsi_u64(__X);
  return __r;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u64(unsigned long long __X) {
  /* BLSMSK: mask up to and including the lowest set bit.  */
  unsigned long long __below = __X - 1ULL;
  return __X ^ __below;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u64(unsigned long long __X) {
  /* Intel-spelled alias for __blsmsk_u64.  */
  unsigned long long __r = __blsmsk_u64(__X);
  return __r;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u64(unsigned long long __X) {
  /* BLSR: clear the lowest set bit.  */
  unsigned long long __below = __X - 1ULL;
  return __X & __below;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u64(unsigned long long __X) {
  /* Intel-spelled alias for __blsr_u64.  */
  unsigned long long __r = __blsr_u64(__X);
  return __r;
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u64(unsigned long long __X) {
  /* TZCNT is defined for a zero source: it returns the operand width.
     __builtin_ctzll(0) is undefined, so handle zero explicitly.  */
  return __X == 0 ? 64 : __builtin_ctzll(__X);
}
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u64(unsigned long long __X) {
  /* TZCNT is defined for a zero source: it returns the operand width.
     __builtin_ctzll(0) is undefined, so handle zero explicitly.  */
  return __X == 0 ? 64 : __builtin_ctzll(__X);
}
#endif /* __PPC64__ */
#endif /* BMIINTRIN_H_ */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,27 @@
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef IMMINTRIN_H_
#define IMMINTRIN_H_
#include <x86gprintrin.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <pmmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#endif /* IMMINTRIN_H_ */

View File

@@ -0,0 +1,45 @@
/*===---- mm_malloc.h - Implementation of _mm_malloc and _mm_free ----------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <stdlib.h>
/* We can't depend on <stdlib.h> since the prototype of posix_memalign
may not be visible. */
#ifndef __cplusplus
extern int posix_memalign(void **, size_t, size_t);
#else
extern "C" int posix_memalign(void **, size_t, size_t);
#endif
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
  /* The PowerPC64 ELF V2 ABI requires at least quadword (vector)
     alignment, so never align to less than one vector register.  */
  const size_t __min_align = sizeof(__vector float);
  void *__mem = NULL;
  if (__alignment < __min_align)
    __alignment = __min_align;
  if (posix_memalign(&__mem, __alignment, __size) != 0)
    return NULL;
  return __mem;
}
static __inline void _mm_free(void *__ptr) {
  /* Memory from _mm_malloc comes from posix_memalign, so plain free
     releases it correctly.  */
  free(__ptr);
}
#else
#include_next <mm_malloc.h>
#endif
#endif /* _MM_MALLOC_H_INCLUDED */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#endif
#ifndef NMMINTRIN_H_
#define NMMINTRIN_H_
/* We just include SSE4.1 header file. */
#include <smmintrin.h>
#endif /* NMMINTRIN_H_ */

View File

@@ -0,0 +1,145 @@
/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Implemented from the specification included in the Intel C++ Compiler
User Guide and Reference, version 9.0. */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets.
In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
is a good match for most SIMD operations. However the Horizontal
add/sub requires the data pairs be permuted into a separate
registers with vertical even/odd alignment for the operation.
And the addsub operation requires the sign of only the even numbered
elements be flipped (xored with -0.0).
For larger blocks of code using these intrinsic implementations,
the compiler should be able to schedule instructions to avoid
additional latency.
In the specific case of the monitor and mwait instructions there are
no direct equivalent in the PowerISA at this time. So those
intrinsics are not implemented. */
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_addsub_ps(__m128 __X, __m128 __Y) {
  /* SSE3 ADDSUBPS: subtract on the even-indexed elements, add on the
     odd-indexed ones.  Flip the sign of the even elements of __Y by
     xoring with -0.0, then do a single vector add.  */
  const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
  return (__m128)vec_add(__X, __even_neg_Y);
}
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_addsub_pd(__m128d __X, __m128d __Y) {
  /* SSE3 ADDSUBPD: subtract on element 0, add on element 1.  Flip the
     sign of the even element of __Y (xor with -0.0), then vector add.  */
  const __v2df __even_n0 = {-0.0, 0.0};
  __v2df __even_neg_Y = vec_xor(__Y, __even_n0);
  return (__m128d)vec_add(__X, __even_neg_Y);
}
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_ps(__m128 __X, __m128 __Y) {
  /* SSE3 HADDPS: horizontal add of adjacent float pairs of __X and __Y.
     __xform2 gathers the even-indexed floats of the two inputs and
     __xform1 the odd-indexed ones; adding the two permuted vectors
     sums each adjacent pair.  */
  __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
                                     0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
                                     0x18, 0x19, 0x1A, 0x1B};
  __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
                                     0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
                                     0x1C, 0x1D, 0x1E, 0x1F};
  return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
                         vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_ps(__m128 __X, __m128 __Y) {
  /* SSE3 HSUBPS: horizontal subtract of adjacent float pairs.  Gather
     the even-indexed floats (__xform2) and odd-indexed floats
     (__xform1) of the two inputs, then subtract odd from even.  */
  __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
                                     0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
                                     0x18, 0x19, 0x1A, 0x1B};
  __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
                                     0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
                                     0x1C, 0x1D, 0x1E, 0x1F};
  return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
                         vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pd(__m128d __X, __m128d __Y) {
  /* SSE3 HADDPD: {__X[0]+__X[1], __Y[0]+__Y[1]} via merge-high/low.  */
  return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
                          vec_mergel((__v2df)__X, (__v2df)__Y));
}
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pd(__m128d __X, __m128d __Y) {
  /* SSE3 HSUBPD: {__X[0]-__X[1], __Y[0]-__Y[1]} via merge-high/low.  */
  return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
                          vec_mergel((__v2df)__X, (__v2df)__Y));
}
#ifdef _ARCH_PWR8
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movehdup_ps(__m128 __X) {
  /* SSE3 MOVSHDUP: duplicate the odd-indexed floats of __X.  */
  return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
}
#endif
#ifdef _ARCH_PWR8
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_moveldup_ps(__m128 __X) {
  /* SSE3 MOVSLDUP: duplicate the even-indexed floats of __X.  */
  return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
}
#endif
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loaddup_pd(double const *__P) {
  /* SSE3 MOVDDUP (load form): broadcast *__P into both elements.  */
  return (__m128d)vec_splats(*__P);
}
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movedup_pd(__m128d __X) {
  /* SSE3 MOVDDUP (register form): duplicate element 0 of __X.  */
  return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lddqu_si128(__m128i const *__P) {
  /* SSE3 LDDQU: unaligned 128-bit load; VSX vec_vsx_ld handles any
     alignment.  */
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
#else
#include_next <pmmintrin.h>
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* PMMINTRIN_H_ */

View File

@@ -0,0 +1,683 @@
/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Implemented from the specification included in the Intel C++ Compiler
User Guide and Reference, version 9.0.
NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#error \
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
#include <tmmintrin.h>
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
/* SSE4.1 ROUNDPD: round each double in __A according to __rounding.
   When _MM_FROUND_NO_EXC is requested, FP exception enables are saved
   and cleared around the operation and restored afterwards; for
   _MM_FROUND_TO_NEAREST_INT the FPSCR rounding-mode field is
   temporarily forced to round-to-nearest.
   NOTE(review): the switch has no default case, so a __rounding value
   outside the documented _MM_FROUND_* combinations leaves __r unset —
   callers are expected to pass only the defined macros.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  /* Aliasing union: FPSCR moves travel through an FP register (double)
     but are manipulated as a 64-bit integer.  */
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    /* mffsce reads the FPSCR and clears the enable bits in one go.  */
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : "+wa"(__A));
  }
  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    /* Save the current FPSCR so the rounding mode can be restored.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    /* Force round-to-nearest (RN = 0b00), round, then restore.  */
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : "+wa"(__A));
    __r = vec_rint((__v2df)__A);
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  /* SSE4.1 ROUNDSD: round the low double of __B, keep the high double
     of __A.  */
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}
/* SSE4.1 ROUNDPS: round each float in __A according to __rounding.
   Same FPSCR handling as _mm_round_pd: with _MM_FROUND_NO_EXC the FP
   exception enables are saved/cleared/restored, and for
   _MM_FROUND_TO_NEAREST_INT the rounding mode is temporarily forced
   to round-to-nearest.
   NOTE(review): no default case in the switch — __rounding values
   outside the defined _MM_FROUND_* combinations leave __r unset.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  /* Aliasing union: FPSCR moves travel through an FP register (double)
     but are manipulated as a 64-bit integer.  */
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    /* mffsce reads the FPSCR and clears the enable bits in one go.  */
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : "+wa"(__A));
  }
  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    /* Save the current FPSCR so the rounding mode can be restored.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    /* Force round-to-nearest (RN = 0b00), round, then restore.  */
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : "+wa"(__A));
    __r = vec_rint((__v4sf)__A);
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
    */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  /* SSE4.1 ROUNDSS: round the low float of __B, keep the upper three
     floats of __A.  */
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)
#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  /* SSE4.1 PINSRB: return __A with byte __N (mod 16) replaced by the
     low byte of __D.  */
  __v16qi __result = (__v16qi)__A;
  __result[__N & 0xf] = __D;
  return (__m128i)__result;
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  /* SSE4.1 PINSRD: return __A with 32-bit element __N (mod 4) replaced
     by __D.  */
  __v4si __result = (__v4si)__A;
  __result[__N & 3] = __D;
  return (__m128i)__result;
}
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  /* SSE4.1 PINSRQ: return __A with 64-bit element __N (mod 2) replaced
     by __D.  */
  __v2di __result = (__v2di)__A;
  __result[__N & 1] = __D;
  return (__m128i)__result;
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  /* SSE4.1 PEXTRB: byte __N (mod 16) of __X, zero-extended to int.  */
  return (unsigned char)((__v16qi)__X)[__N & 15];
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  /* SSE4.1 PEXTRD: 32-bit element __N (mod 4) of __X.  */
  return ((__v4si)__X)[__N & 3];
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  /* SSE4.1 PEXTRQ: 64-bit element __N (mod 2) of __X (truncated to the
     int return type, matching the x86 header's signature).  */
  return ((__v2di)__X)[__N & 1];
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  /* SSE4.1 EXTRACTPS: the raw bit pattern of float element __N (mod 4)
     returned as an int.  */
  return ((__v4si)__X)[__N & 3];
}
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  /* SSE4.1 PBLENDW: select each 16-bit element from __B when the
     corresponding bit of __imm8 is set, else from __A.  vec_gb expands
     each bit of the splatted immediate into a whole byte; unpacking
     widens those bytes into a per-halfword select mask.  */
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  /* Element order is reversed on big-endian, so reverse the mask.  */
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
  /* SSE4.1 PBLENDVB: per byte, take __B when the mask byte's sign bit
     is set, else __A.  */
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  /* Arithmetic shift right by 7 smears each sign bit across its byte,
     producing an all-ones/all-zeros select mask.  */
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  /* SSE4.1 BLENDPS: select each float from __B when the corresponding
     low-4 bit of __imm8 is set, else from __A.  __pcv is a table of
     the 16 possible permute control vectors, indexed by __imm8; byte
     indices 0-15 select from __A and 16-31 from __B.  */
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
  /* SSE4.1 BLENDVPS: per float, take __B when the mask element's sign
     bit is set, else __A.  */
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  /* A signed compare against zero turns each sign bit into an
     all-ones/all-zeros select mask.  */
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  /* SSE4.1 BLENDPD: select each double from __B when the corresponding
     low-2 bit of __imm8 is set, else from __A.  __pcv holds the 4
     possible permute control vectors indexed by __imm8.  */
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}
#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
  /* SSE4.1 BLENDVPD: per double, take __B when the mask element's sign
     bit is set, else __A.  */
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  /* A signed compare against zero turns each sign bit into an
     all-ones/all-zeros select mask.  */
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* SSE4.1 PTEST (ZF): return 1 when (__A & __B) is all zeros.  */
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* SSE4.1 PTEST (CF): return 1 when (~__A & __B) is all zeros.  */
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* SSE4.1 PTEST (!ZF && !CF): 1 when both (__A & __B) and (~__A & __B)
     have at least one nonzero bit.  */
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}
/* PTEST convenience macros, expressed through the _mm_test* functions
   above.  _mm_test_all_ones builds an all-ones reference vector with
   _mm_cmpeq_epi32((V), (V)).  */
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
#ifdef _ARCH_PWR8
/* SSE4.1 _mm_cmpeq_epi64: lane-wise 64-bit equality; all-ones where equal,
   zero otherwise, via vec_cmpeq on doublewords (needs POWER8's vcmpequd).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif
/* SSE4.1 _mm_min_epi8: elementwise minimum of signed bytes via vec_min.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}
/* SSE4.1 _mm_min_epu16: elementwise minimum of unsigned halfwords.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}
/* SSE4.1 _mm_min_epi32: elementwise minimum of signed words.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}
/* SSE4.1 _mm_min_epu32: elementwise minimum of unsigned words.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32(__m128i __X, __m128i __Y) {
return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}
/* SSE4.1 _mm_max_epi8: elementwise maximum of signed bytes via vec_max.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}
/* SSE4.1 _mm_max_epu16: elementwise maximum of unsigned halfwords.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}
/* SSE4.1 _mm_max_epi32: elementwise maximum of signed words.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}
/* SSE4.1 _mm_max_epu32: elementwise maximum of unsigned words.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32(__m128i __X, __m128i __Y) {
return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}
/* SSE4.1 _mm_mullo_epi32: low 32 bits of the 32x32 product per lane.  The
   low half of the product is identical for signed and unsigned inputs, so
   an unsigned vec_mul suffices.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}
#ifdef _ARCH_PWR8
/* SSE4.1 _mm_mul_epi32: signed 32x32->64 multiply of the low word of each
   doubleword lane.  vec_mule multiplies the "even" elements; the compiler
   adjusts even/odd selection for endianness so this matches the x86
   element choice — NOTE(review): relies on that ABI-mandated adjustment.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
/* SSE4.1 _mm_cvtepi8_epi16: sign-extend the low 8 bytes of __A to 16-bit
   via vec_unpackh.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16(__m128i __A) {
return (__m128i)vec_unpackh((__v16qi)__A);
}
/* SSE4.1 _mm_cvtepi8_epi32: sign-extend the low 4 bytes of __A to 32-bit,
   as two successive high-half unpacks (bytes -> halfwords -> words).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32(__m128i __A) {
__A = (__m128i)vec_unpackh((__v16qi)__A);
return (__m128i)vec_unpackh((__v8hi)__A);
}
#ifdef _ARCH_PWR8
/* SSE4.1 _mm_cvtepi8_epi64: sign-extend the low 2 bytes of __A to 64-bit,
   as three successive unpacks (the word->doubleword step needs POWER8).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64(__m128i __A) {
__A = (__m128i)vec_unpackh((__v16qi)__A);
__A = (__m128i)vec_unpackh((__v8hi)__A);
return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
/* SSE4.1 _mm_cvtepi16_epi32: sign-extend the low 4 halfwords to 32-bit.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32(__m128i __A) {
return (__m128i)vec_unpackh((__v8hi)__A);
}
#ifdef _ARCH_PWR8
/* SSE4.1 _mm_cvtepi16_epi64: sign-extend the low 2 halfwords to 64-bit
   (halfwords -> words -> doublewords; needs POWER8 for the last step).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64(__m128i __A) {
__A = (__m128i)vec_unpackh((__v8hi)__A);
return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
#ifdef _ARCH_PWR8
/* SSE4.1 _mm_cvtepi32_epi64: sign-extend the low 2 words to 64-bit.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64(__m128i __A) {
return (__m128i)vec_unpackh((__v4si)__A);
}
#endif
/* SSE4.1 _mm_cvtepu8_epi16: zero-extend the low 8 unsigned bytes to
   16-bit.  Zero-extension is a merge with a zero vector; the operand
   order flips with endianness so the zero byte lands in the high half of
   each resulting u16.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16(__m128i __A) {
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
return __A;
}
/* SSE4.1 _mm_cvtepu8_epi32: zero-extend the low 4 unsigned bytes to
   32-bit via two zero-merges (bytes -> halfwords -> words), with the
   merge operand order endian-adjusted.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32(__m128i __A) {
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__.  */
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
return __A;
}
/* SSE4.1 _mm_cvtepu8_epi64: zero-extend the low 2 unsigned bytes to
   64-bit via three zero-merges (bytes -> halfwords -> words ->
   doublewords), endian-adjusted.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64(__m128i __A) {
const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
return __A;
}
/* SSE4.1 _mm_cvtepu16_epi32: zero-extend the low 4 unsigned halfwords to
   32-bit via a single zero-merge, endian-adjusted.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32(__m128i __A) {
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__.  */
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
return __A;
}
/* SSE4.1 _mm_cvtepu16_epi64: zero-extend the low 2 unsigned halfwords to
   64-bit via two zero-merges, endian-adjusted.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64(__m128i __A) {
const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__.  */
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
return __A;
}
/* SSE4.1 _mm_cvtepu32_epi64: zero-extend the low 2 unsigned words to
   64-bit via a single zero-merge, endian-adjusted.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64(__m128i __A) {
const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__.  */
__A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
return __A;
}
/* Return horizontal packed word minimum and its index in bits [15:0]
and bits [18:16] respectively. */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16(__m128i __A) {
  /* Scalar scan over the eight unsigned halfwords, keeping the first
     (lowest-index) occurrence of the minimum: only a strictly smaller
     value updates the running result.  */
  __v8hu __vals = (__v8hu)__A;
  unsigned short __min = __vals[0];
  unsigned short __idx = 0;
  for (unsigned long __i = 1; __i < 8; __i++) {
    unsigned short __v = __vals[__i];
    if (__v < __min) {
      __min = __v;
      __idx = __i;
    }
  }
  /* Result: minimum in element 0, its index in element 1, rest zero.  */
  __v8hu __res = {__min, __idx, 0, 0, 0, 0, 0, 0};
  return (__m128i)__res;
}
/* SSE4.1 _mm_packus_epi32: pack signed words to unsigned halfwords with
   unsigned saturation, via vec_packsu.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32(__m128i __X, __m128i __Y) {
return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}
#ifdef _ARCH_PWR8
/* SSE4.2 _mm_cmpgt_epi64: lane-wise signed 64-bit greater-than mask
   (needs POWER8's vcmpgtsd).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif
#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* SMMINTRIN_H_ */

View File

@@ -0,0 +1,453 @@
/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Implemented from the specification included in the Intel C++ Compiler
User Guide and Reference, version 9.0. */
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#endif
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_
#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
/* We need definitions from the SSE header files. */
#include <pmmintrin.h>
/* SSSE3 _mm_abs_epi16: elementwise absolute value of signed halfwords.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16(__m128i __A) {
return (__m128i)vec_abs((__v8hi)__A);
}
/* SSSE3 _mm_abs_epi32: elementwise absolute value of signed words.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32(__m128i __A) {
return (__m128i)vec_abs((__v4si)__A);
}
/* SSSE3 _mm_abs_epi8: elementwise absolute value of signed bytes.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8(__m128i __A) {
return (__m128i)vec_abs((__v16qi)__A);
}
/* MMX-width _mm_abs_pi16: splat the 64-bit value into both halves of a
   vector, take vec_abs, return the low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16(__m64 __A) {
__v8hi __B = (__v8hi)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
/* MMX-width _mm_abs_pi32: same splat/abs/extract pattern for words.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32(__m64 __A) {
__v4si __B = (__v4si)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
/* MMX-width _mm_abs_pi8: same splat/abs/extract pattern for bytes.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8(__m64 __A) {
__v16qi __B = (__v16qi)(__v2du){__A, __A};
return (__m64)((__v2du)vec_abs(__B))[0];
}
/* SSSE3 _mm_alignr_epi8 (PALIGNR): conceptually concatenate __A:__B
   (__A in the high half), shift right by __count bytes, return the low
   16 bytes.  Four cases:
     - compile-time constant __count < 16: a single vec_sld, with
       vec_reve fixups on little-endian where byte order is reversed;
     - __count == 0: result is __B unchanged;
     - 16 <= __count < 32: only __A contributes, shifted down by
       (__count - 16) bytes (vec_sro/vec_slo shift by octets; direction
       depends on endianness);
     - __count >= 32: everything shifted out, result is zero;
     - otherwise (runtime __count < 16): shift the two halves toward each
       other and OR them together.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_reve((__v16qu)__A);
__B = (__m128i)vec_reve((__v16qu)__B);
#endif
__A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_reve((__v16qu)__A);
#endif
return __A;
}
if (__count == 0)
return __B;
if (__count >= 16) {
if (__count >= 32) {
const __v16qu __zero = {0};
return (__m128i)__zero;
} else {
/* Shift amount is in bits, splatted for vec_sro/vec_slo.  */
const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
}
} else {
/* Runtime count in 1..15: combine the tail of __B with the head of __A.  */
const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
__A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
__B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
__A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
__B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
}
}
/* MMX-width _mm_alignr_pi8: place {__B, __A} in one 16-byte vector, shift
   right by __count bytes (octet shift via vec_sro/vec_slo, direction per
   endianness), and return the low doubleword.  Counts >= 16 shift
   everything out, so the result is zero.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
if (__count < 16) {
__v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
const __v4su __shift = {__count << 3, 0, 0, 0};
__C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
const __v4su __shift = {0, 0, 0, __count << 3};
__C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
return (__m64)__C[0];
} else {
const __m64 __zero = {0};
return __zero;
}
}
/* SSSE3 _mm_hadd_epi16: horizontal add of adjacent halfword pairs across
   __A then __B.  __P gathers the first element of each pair, __Q the
   second (byte indices 16-31 address __B), then one vector add.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_add(__C, __D);
}
/* SSSE3 _mm_hadd_epi32: horizontal add of adjacent word pairs, same
   gather-both-halves-then-add scheme as _mm_hadd_epi16.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
16, 17, 18, 19, 24, 25, 26, 27};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
20, 21, 22, 23, 28, 29, 30, 31};
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
return (__m128i)vec_add(__C, __D);
}
/* MMX-width _mm_hadd_pi16: concatenate {__A, __B}, gather pair members
   with permutes, add.  The control vectors repeat the same pattern in
   both halves, so both doublewords of __C hold the result; element [1]
   is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __B};
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_add(__C, __D);
return (__m64)((__v2du)__C)[1];
}
/* MMX-width _mm_hadd_pi32: same duplicated-permute scheme as
   _mm_hadd_pi16, on word pairs.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __B};
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
__v4si __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_add(__C, __D);
return (__m64)((__v2du)__C)[1];
}
/* SSSE3 _mm_hadds_epi16: horizontal add with signed saturation.
   vec_sum4s with a zero accumulator yields the 32-bit sums of adjacent
   halfword pairs; vec_packs then saturates them back to 16 bits.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16(__m128i __A, __m128i __B) {
__v4si __C = {0}, __D = {0};
__C = vec_sum4s((__v8hi)__A, __C);
__D = vec_sum4s((__v8hi)__B, __D);
__C = (__v4si)vec_packs(__C, __D);
return (__m128i)__C;
}
/* MMX-width _mm_hadds_pi16: saturating horizontal add over {__A, __B}.
   vec_packs(__D, __D) puts the same saturated result in both halves;
   element [1] is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16(__m64 __A, __m64 __B) {
const __v4si __zero = {0};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v4si __D = vec_sum4s(__C, __zero);
__C = vec_packs(__D, __D);
return (__m64)((__v2du)__C)[1];
}
/* SSSE3 _mm_hsub_epi16: horizontal subtract (first - second) of adjacent
   halfword pairs across __A then __B, via the same two-permute scheme as
   _mm_hadd_epi16.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_sub(__C, __D);
}
/* SSSE3 _mm_hsub_epi32: horizontal subtract of adjacent word pairs.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
16, 17, 18, 19, 24, 25, 26, 27};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
20, 21, 22, 23, 28, 29, 30, 31};
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
return (__m128i)vec_sub(__C, __D);
}
/* MMX-width _mm_hsub_pi16: duplicated-permute horizontal subtract over
   {__A, __B}; both halves of __C hold the result, element [1] returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v8hi __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_sub(__C, __D);
return (__m64)((__v2du)__C)[1];
}
/* MMX-width _mm_hsub_pi32: same scheme as _mm_hsub_pi16, on word pairs.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
__v4si __C = (__v4si)(__v2du){__A, __B};
__v4si __D = vec_perm(__C, __C, __Q);
__C = vec_perm(__C, __C, __P);
__C = vec_sub(__C, __D);
return (__m64)((__v2du)__C)[1];
}
/* SSSE3 _mm_hsubs_epi16: horizontal subtract of adjacent halfword pairs
   with signed saturation (vec_subs).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16(__m128i __A, __m128i __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
return (__m128i)vec_subs(__C, __D);
}
/* MMX-width _mm_hsubs_pi16: saturating horizontal subtract over
   {__A, __B}; duplicated permute patterns, element [1] returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16(__m64 __A, __m64 __B) {
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
__v8hi __C = (__v8hi)(__v2du){__A, __B};
__v8hi __D = vec_perm(__C, __C, __P);
__v8hi __E = vec_perm(__C, __C, __Q);
__C = vec_subs(__D, __E);
return (__m64)((__v2du)__C)[1];
}
/* SSSE3 _mm_shuffle_epi8 (PSHUFB): permute the bytes of __A by the low
   nibbles of __B, then zero every lane whose __B byte has its sign bit
   set (vec_perm only uses the low 4/5 bits of each control byte, so the
   out-of-range control values are handled by the final vec_sel).  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8(__m128i __A, __m128i __B) {
const __v16qi __zero = {0};
__vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
__v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
return (__m128i)vec_sel(__C, __zero, __select);
}
/* MMX-width _mm_shuffle_pi8: splat both operands to full vectors, apply
   the same permute-then-zero scheme as _mm_shuffle_epi8, and return the
   low doubleword.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8(__m64 __A, __m64 __B) {
const __v16qi __zero = {0};
__v16qi __C = (__v16qi)(__v2du){__A, __A};
__v16qi __D = (__v16qi)(__v2du){__B, __B};
__vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
__C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
__C = vec_sel(__C, __zero, __select);
return (__m64)((__v2du)(__C))[0];
}
#ifdef _ARCH_PWR8
/* SSSE3 _mm_sign_epi8: per byte, result = __A * sign(__B).  __selectneg
   is -1 where __B < 0, __selectpos is +1 where __B > 0 (negated compare
   mask); their sum is the -1/0/+1 multiplier applied to __A, which also
   zeroes lanes where __B == 0.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8(__m128i __A, __m128i __B) {
const __v16qi __zero = {0};
__v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
__v16qi __selectpos =
(__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
__v16qi __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif
#ifdef _ARCH_PWR8
/* SSSE3 _mm_sign_epi16: halfword variant of _mm_sign_epi8 — multiply __A
   by the -1/0/+1 sign of __B per lane.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16(__m128i __A, __m128i __B) {
const __v8hi __zero = {0};
__v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
__v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
__v8hi __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif
#ifdef _ARCH_PWR8
/* SSSE3 _mm_sign_epi32: word variant of _mm_sign_epi8 — multiply __A by
   the -1/0/+1 sign of __B per lane.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32(__m128i __A, __m128i __B) {
const __v4si __zero = {0};
__v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
__v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
__v4si __conv = vec_add(__selectneg, __selectpos);
return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif
#ifdef _ARCH_PWR8
/* MMX-width _mm_sign_pi8: splat both 64-bit operands into full vectors,
   reuse the 128-bit _mm_sign_epi8, and return the low doubleword.
   (Fix: dropped an unused `const __v16qi __zero` local that triggered
   -Wunused-variable.)  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8(__m64 __A, __m64 __B) {
__v16qi __C = (__v16qi)(__v2du){__A, __A};
__v16qi __D = (__v16qi)(__v2du){__B, __B};
__C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
/* MMX-width _mm_sign_pi16: splat, delegate to _mm_sign_epi16, return the
   low doubleword.  (Fix: dropped an unused `const __v8hi __zero` local
   that triggered -Wunused-variable.)  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __A};
__v8hi __D = (__v8hi)(__v2du){__B, __B};
__C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
#ifdef _ARCH_PWR8
/* MMX-width _mm_sign_pi32: splat, delegate to _mm_sign_epi32, return the
   low doubleword.  (Fix: dropped an unused `const __v4si __zero` local
   that triggered -Wunused-variable.)  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __A};
__v4si __D = (__v4si)(__v2du){__B, __B};
__C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
return (__m64)((__v2du)(__C))[0];
}
#endif
/* SSSE3 _mm_maddubs_epi16 (PMADDUBSW): multiply unsigned bytes of __A by
   signed bytes of __B, then saturating-add adjacent product pairs into
   halfwords.  __A's bytes are sign-unpacked and masked with 0x00ff to
   recover their unsigned value; __B's are sign-extended.  The two permute
   controls split the products of each adjacent pair for the final
   vec_adds.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16(__m128i __A, __m128i __B) {
__v8hi __unsigned = vec_splats((signed short)0x00ff);
__v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
__v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
__v8hi __E = vec_unpackh((__v16qi)__B);
__v8hi __F = vec_unpackl((__v16qi)__B);
__C = vec_mul(__C, __E);
__D = vec_mul(__D, __F);
const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__E = vec_perm(__C, __D, __odds);
__F = vec_perm(__C, __D, __evens);
return (__m128i)vec_adds(__E, __F);
}
/* MMX-width _mm_maddubs_pi16: same unsigned-x-signed multiply-and-
   saturating-pair-add as _mm_maddubs_epi16, on one 64-bit operand pair.
   vec_unpackl picks the half of the splatted vector holding the data;
   the low doubleword of the result is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16(__m64 __A, __m64 __B) {
__v8hi __C = (__v8hi)(__v2du){__A, __A};
__C = vec_unpackl((__v16qi)__C);
/* Mask to 0x00ff to undo the sign extension of __A's unsigned bytes.  */
const __v8hi __unsigned = vec_splats((signed short)0x00ff);
__C = vec_and(__C, __unsigned);
__v8hi __D = (__v8hi)(__v2du){__B, __B};
__D = vec_unpackl((__v16qi)__D);
__D = vec_mul(__C, __D);
const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
16, 17, 20, 21, 24, 25, 28, 29};
const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
__C = vec_perm(__D, __D, __odds);
__D = vec_perm(__D, __D, __evens);
__C = vec_adds(__C, __D);
return (__m64)((__v2du)(__C))[0];
}
/* SSSE3 _mm_mulhrs_epi16 (PMULHRSW): per halfword lane, compute
   ((__A * __B) >> 14 + 1) >> 1 on the widened 32-bit product and pack
   the results back to 16 bits.  Logical shifts (vec_sr) are sufficient
   despite the signed products because vec_pack keeps only the low 16
   bits, which are unaffected by the shifted-in high bits — NOTE(review):
   worth confirming against the pmulhrsw definition.  */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16(__m128i __A, __m128i __B) {
__v4si __C = vec_unpackh((__v8hi)__A);
__v4si __D = vec_unpackh((__v8hi)__B);
__C = vec_mul(__C, __D);
__D = vec_unpackl((__v8hi)__A);
__v4si __E = vec_unpackl((__v8hi)__B);
__D = vec_mul(__D, __E);
const __v4su __shift = vec_splats((unsigned int)14);
__C = vec_sr(__C, __shift);
__D = vec_sr(__D, __shift);
/* Round: add 1 then shift right once more.  */
const __v4si __ones = vec_splats((signed int)1);
__C = vec_add(__C, __ones);
__C = vec_sr(__C, (__v4su)__ones);
__D = vec_add(__D, __ones);
__D = vec_sr(__D, (__v4su)__ones);
return (__m128i)vec_pack(__C, __D);
}
/* MMX-width _mm_mulhrs_pi16: the rounded high-multiply on four lanes.
   Only __C carries products through the shift/round steps; __D merely
   pads the second half of vec_pack, and the doubleword holding the four
   rounded results is returned.  */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16(__m64 __A, __m64 __B) {
__v4si __C = (__v4si)(__v2du){__A, __A};
__C = vec_unpackh((__v8hi)__C);
__v4si __D = (__v4si)(__v2du){__B, __B};
__D = vec_unpackh((__v8hi)__D);
__C = vec_mul(__C, __D);
const __v4su __shift = vec_splats((unsigned int)14);
__C = vec_sr(__C, __shift);
const __v4si __ones = vec_splats((signed int)1);
__C = vec_add(__C, __ones);
__C = vec_sr(__C, (__v4su)__ones);
__v8hi __E = vec_pack(__C, __D);
return (__m64)((__v2du)(__E))[0];
}
#else
#include_next <tmmintrin.h>
#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* TMMINTRIN_H_ */

View File

@@ -0,0 +1,17 @@
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef X86GPRINTRIN_H_
#define X86GPRINTRIN_H_
#include <bmiintrin.h>
#include <bmi2intrin.h>
#endif /* X86GPRINTRIN_H_ */

View File

@@ -0,0 +1,28 @@
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
makes explicit use of Intel intrinsics to powerpc64le.
It is the user's responsibility to determine if the results are
acceptable and make additional changes as necessary.
Note that much code that uses Intel intrinsics can be rewritten in
standard C or GNU C extensions, which are more portable and better
optimized across multiple targets. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef X86INTRIN_H_
#define X86INTRIN_H_
#ifdef __ALTIVEC__
#include <immintrin.h>
#endif /* __ALTIVEC__ */
#endif /* X86INTRIN_H_ */

File diff suppressed because it is too large Load Diff