Commit fca7791b authored by sletz
Browse files

Tim Blechmann SSE 4.1 memops.c optimization patch.

git-svn-id: http://subversion.jackaudio.org/jack/jack2/trunk/jackmp@3646 0c269be4-1314-0410-8aa9-9f06e86f4224
parent edbf64a7
......@@ -37,6 +37,9 @@
#if defined (__SSE2__) && !defined (__sun__)
#include <emmintrin.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
#endif
/* Notes about these *_SCALING values.
......@@ -285,6 +288,12 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
__m128i y = _mm_cvttps_epi32(clipped);
__m128i shifted = _mm_slli_epi32(y, 8);
#ifdef __SSE4_1__
*(int32_t*)dst = _mm_extract_epi32(shifted, 0);
*(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
*(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
*(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
__m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
__m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
__m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));
......@@ -294,6 +303,7 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
_mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
_mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
_mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
dst += 4*dst_skip;
src+= 4;
......@@ -421,6 +431,12 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
__m128 samples = _mm_loadu_ps(src);
__m128i converted = float_24_sse(samples);
#ifdef __SSE4_1__
z[0] = _mm_extract_epi32(converted, 0);
z[1] = _mm_extract_epi32(converted, 1);
z[2] = _mm_extract_epi32(converted, 2);
z[3] = _mm_extract_epi32(converted, 3);
#else
__m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1));
__m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2));
__m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3));
......@@ -431,13 +447,11 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
_mm_store_ss((float*)z+3, (__m128)shuffled3);
for (i = 0; i != 4; ++i) {
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy (dst, z+i, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
memcpy (dst, (float*)((char *)&z + 1)+i, 3);
#endif
dst += dst_skip;
}
#endif
nsamples -= 4;
src += 4;
}
......@@ -500,17 +514,10 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l
while (nsamples >= 4) {
int x0, x1, x2, x3;
#if __BYTE_ORDER == __LITTLE_ENDIAN
memcpy((char*)&x0 + 1, src, 3);
memcpy((char*)&x1 + 1, src+src_skip, 3);
memcpy((char*)&x2 + 1, src+2*src_skip, 3);
memcpy((char*)&x3 + 1, src+3*src_skip, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
memcpy(&x0, src, 3);
memcpy(&x1, src+src_skip, 3);
memcpy(&x2, src+2*src_skip, 3);
memcpy(&x3, src+3*src_skip, 3);
#endif
src += 4 * src_skip;
const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment