Bullet Collision Detection & Physics Library
btVector3.cpp
1 /*
2  Copyright (c) 2011 Apple Inc.
3  http://continuousphysics.com/Bullet/
4 
5  This software is provided 'as-is', without any express or implied warranty.
6  In no event will the authors be held liable for any damages arising from the use of this software.
7  Permission is granted to anyone to use this software for any purpose,
8  including commercial applications, and to alter it and redistribute it freely,
9  subject to the following restrictions:
10 
11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13  3. This notice may not be removed or altered from any source distribution.
14 
15  This source version has been altered.
16  */
17 
18 #if defined (_WIN32) || defined (__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21 
22 
23 #include "btVector3.h"
24 
25 
26 
27 #if defined BT_USE_SIMD_VECTOR3
28 
29 #if DEBUG
30 #include <string.h>//for memset
31 #endif
32 
33 
34 #ifdef __APPLE__
35 #include <stdint.h>
36 typedef float float4 __attribute__ ((vector_size(16)));
37 #else
38 #define float4 __m128
39 #endif
40 //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
41 
42 
43 #if defined BT_USE_SSE || defined _WIN32
44 
45 #define LOG2_ARRAY_SIZE 6
46 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
47 
48 #include <emmintrin.h>
49 
50 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
51 long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
52 {
53  const float4 *vertices = (const float4*) vv;
54  static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
55  float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
56  float4 vvec = _mm_loadu_ps( vec );
57  float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
58  float4 vLo = _mm_movelh_ps( vvec, vvec );
59 
60  long maxIndex = -1L;
61 
62  size_t segment = 0;
63  float4 stack_array[ STACK_ARRAY_COUNT ];
64 
65 #if DEBUG
66  //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
67 #endif
68 
69  size_t index;
70  float4 max;
71  // Faster loop without cleanup code for full tiles
72  for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
73  {
74  max = dotMax;
75 
76  for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
77  { // do four dot products at a time. Carefully avoid touching the w element.
78  float4 v0 = vertices[0];
79  float4 v1 = vertices[1];
80  float4 v2 = vertices[2];
81  float4 v3 = vertices[3]; vertices += 4;
82 
83  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
84  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
85  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
86  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
87 
88  lo0 = lo0*vLo;
89  lo1 = lo1*vLo;
90  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
91  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
92  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
93  z = z*vHi;
94  x = x+y;
95  x = x+z;
96  stack_array[index] = x;
97  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
98 
99  v0 = vertices[0];
100  v1 = vertices[1];
101  v2 = vertices[2];
102  v3 = vertices[3]; vertices += 4;
103 
104  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
105  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
106  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
107  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
108 
109  lo0 = lo0*vLo;
110  lo1 = lo1*vLo;
111  z = _mm_shuffle_ps(hi0, hi1, 0x88);
112  x = _mm_shuffle_ps(lo0, lo1, 0x88);
113  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
114  z = z*vHi;
115  x = x+y;
116  x = x+z;
117  stack_array[index+1] = x;
118  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
119 
120  v0 = vertices[0];
121  v1 = vertices[1];
122  v2 = vertices[2];
123  v3 = vertices[3]; vertices += 4;
124 
125  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
126  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
127  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
128  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
129 
130  lo0 = lo0*vLo;
131  lo1 = lo1*vLo;
132  z = _mm_shuffle_ps(hi0, hi1, 0x88);
133  x = _mm_shuffle_ps(lo0, lo1, 0x88);
134  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
135  z = z*vHi;
136  x = x+y;
137  x = x+z;
138  stack_array[index+2] = x;
139  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
140 
141  v0 = vertices[0];
142  v1 = vertices[1];
143  v2 = vertices[2];
144  v3 = vertices[3]; vertices += 4;
145 
146  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
147  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
148  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
149  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
150 
151  lo0 = lo0*vLo;
152  lo1 = lo1*vLo;
153  z = _mm_shuffle_ps(hi0, hi1, 0x88);
154  x = _mm_shuffle_ps(lo0, lo1, 0x88);
155  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
156  z = z*vHi;
157  x = x+y;
158  x = x+z;
159  stack_array[index+3] = x;
160  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
161 
162  // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
163  }
164 
165  // If we found a new max
166  if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
167  {
168  // copy the new max across all lanes of our max accumulator
169  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
170  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
171 
172  dotMax = max;
173 
174  // find first occurrence of that max
175  size_t test;
176  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
177  {}
178  // record where it is.
179  maxIndex = 4*index + segment + indexTable[test];
180  }
181  }
182 
183  // account for work we've already done
184  count -= segment;
185 
186  // Deal with the last < STACK_ARRAY_COUNT vectors
187  max = dotMax;
188  index = 0;
189 
190 
191  if( btUnlikely( count > 16) )
192  {
193  for( ; index + 4 <= count / 4; index+=4 )
194  { // do four dot products at a time. Carefully avoid touching the w element.
195  float4 v0 = vertices[0];
196  float4 v1 = vertices[1];
197  float4 v2 = vertices[2];
198  float4 v3 = vertices[3]; vertices += 4;
199 
200  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
201  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
202  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
203  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
204 
205  lo0 = lo0*vLo;
206  lo1 = lo1*vLo;
207  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210  z = z*vHi;
211  x = x+y;
212  x = x+z;
213  stack_array[index] = x;
214  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
215 
216  v0 = vertices[0];
217  v1 = vertices[1];
218  v2 = vertices[2];
219  v3 = vertices[3]; vertices += 4;
220 
221  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
222  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
223  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
224  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
225 
226  lo0 = lo0*vLo;
227  lo1 = lo1*vLo;
228  z = _mm_shuffle_ps(hi0, hi1, 0x88);
229  x = _mm_shuffle_ps(lo0, lo1, 0x88);
230  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
231  z = z*vHi;
232  x = x+y;
233  x = x+z;
234  stack_array[index+1] = x;
235  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
236 
237  v0 = vertices[0];
238  v1 = vertices[1];
239  v2 = vertices[2];
240  v3 = vertices[3]; vertices += 4;
241 
242  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
243  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
244  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
245  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
246 
247  lo0 = lo0*vLo;
248  lo1 = lo1*vLo;
249  z = _mm_shuffle_ps(hi0, hi1, 0x88);
250  x = _mm_shuffle_ps(lo0, lo1, 0x88);
251  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
252  z = z*vHi;
253  x = x+y;
254  x = x+z;
255  stack_array[index+2] = x;
256  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
257 
258  v0 = vertices[0];
259  v1 = vertices[1];
260  v2 = vertices[2];
261  v3 = vertices[3]; vertices += 4;
262 
263  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
264  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
265  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
266  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
267 
268  lo0 = lo0*vLo;
269  lo1 = lo1*vLo;
270  z = _mm_shuffle_ps(hi0, hi1, 0x88);
271  x = _mm_shuffle_ps(lo0, lo1, 0x88);
272  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
273  z = z*vHi;
274  x = x+y;
275  x = x+z;
276  stack_array[index+3] = x;
277  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
278 
279  // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
280  }
281  }
282 
283  size_t localCount = (count & -4L) - 4*index;
284  if( localCount )
285  {
286 #ifdef __APPLE__
287  float4 t0, t1, t2, t3, t4;
288  float4 * sap = &stack_array[index + localCount / 4];
289  vertices += localCount; // counter the offset
290  size_t byteIndex = -(localCount) * sizeof(float);
291  // AT&T-style inline assembly
292  asm volatile
293  ( ".align 4 \n\
294  0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
295  movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
296  movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
297  movaps %[t0], %[max] // vertices[0] \n\
298  movlhps %[t1], %[max] // x0y0x1y1 \n\
299  movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
300  movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
301  mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
302  movhlps %[t0], %[t1] // z0w0z1w1 \n\
303  movaps %[t3], %[t0] // vertices[2] \n\
304  movlhps %[t4], %[t0] // x2y2x3y3 \n\
305  mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
306  movhlps %[t3], %[t4] // z2w2z3w3 \n\
307  shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
308  mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
309  movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
310  shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
311  shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
312  addps %[t3], %[max] // x + y \n\
313  addps %[t1], %[max] // x + y + z \n\
314  movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
315  maxps %[t2], %[max] // record max, restore max \n\
316  add $16, %[byteIndex] // advance loop counter\n\
317  jnz 0b \n\
318  "
319  : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
320  : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
321  : "memory", "cc"
322  );
323  index += localCount/4;
324 #else
325  {
326  for( unsigned int i=0; i<localCount/4; i++,index++)
327  { // do four dot products at a time. Carefully avoid touching the w element.
328  float4 v0 = vertices[0];
329  float4 v1 = vertices[1];
330  float4 v2 = vertices[2];
331  float4 v3 = vertices[3];
332  vertices += 4;
333 
334  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
335  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
336  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
337  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
338 
339  lo0 = lo0*vLo;
340  lo1 = lo1*vLo;
341  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
342  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
343  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
344  z = z*vHi;
345  x = x+y;
346  x = x+z;
347  stack_array[index] = x;
348  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
349  }
350  }
351 #endif //__APPLE__
352  }
353 
354  // process the last few points
355  if( count & 3 )
356  {
357  float4 v0, v1, v2, x, y, z;
358  switch( count & 3 )
359  {
360  case 3:
361  {
362  v0 = vertices[0];
363  v1 = vertices[1];
364  v2 = vertices[2];
365 
366  // Calculate 3 dot products, transpose, duplicate v2
367  float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
368  float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
369  lo0 = lo0*vLo;
370  z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
371  z = z*vHi;
372  float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
373  lo1 = lo1*vLo;
374  x = _mm_shuffle_ps(lo0, lo1, 0x88);
375  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
376  }
377  break;
378  case 2:
379  {
380  v0 = vertices[0];
381  v1 = vertices[1];
382  float4 xy = _mm_movelh_ps(v0, v1);
383  z = _mm_movehl_ps(v1, v0);
384  xy = xy*vLo;
385  z = _mm_shuffle_ps( z, z, 0xa8);
386  x = _mm_shuffle_ps( xy, xy, 0xa8);
387  y = _mm_shuffle_ps( xy, xy, 0xfd);
388  z = z*vHi;
389  }
390  break;
391  case 1:
392  {
393  float4 xy = vertices[0];
394  z = _mm_shuffle_ps( xy, xy, 0xaa);
395  xy = xy*vLo;
396  z = z*vHi;
397  x = _mm_shuffle_ps(xy, xy, 0);
398  y = _mm_shuffle_ps(xy, xy, 0x55);
399  }
400  break;
401  }
402  x = x+y;
403  x = x+z;
404  stack_array[index] = x;
405  max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
406  index++;
407  }
408 
409  // if we found a new max.
410  if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
411  { // we found a new max. Search for it
412  // find max across the max vector, place in all elements of max -- big latency hit here
413  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
414  max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
415 
416  // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
417  // this where it actually makes a difference is handled in the early out at the top of the function,
418  // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
419  // complexity, and removed it.
420 
421  dotMax = max;
422 
423  // scan for the first occurrence of max in the array
424  size_t test;
425  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
426  {}
427  maxIndex = 4*index + segment + indexTable[test];
428  }
429 
430  _mm_store_ss( dotResult, dotMax);
431  return maxIndex;
432 }
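
For reference, a minimal scalar sketch of the contract _maxdot_large implements (illustration only, not part of the Bullet sources): each vertex occupies four floats (x, y, z, w), only x, y, z enter the dot product, and the function returns the index of the first vertex whose dot product with vec is largest, storing that value through dotResult. The tiling over STACK_ARRAY_COUNT groups and the stack_array rescan above serve only to speed this up.

// Hypothetical scalar reference -- not Bullet code.
static long maxdot_scalar_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    long  maxIndex = -1L;
    float maxDot   = -BT_INFINITY;       // same starting sentinel as the SIMD path
    for( unsigned long i = 0; i < count; i++ )
    {
        const float *v = vv + 4*i;       // 16-byte stride; v[3] (w) never participates
        float dot = v[0]*vec[0] + v[1]*vec[1] + v[2]*vec[2];
        if( dot > maxDot )               // strict '>' keeps the earliest index on ties
        {
            maxDot   = dot;
            maxIndex = (long)i;
        }
    }
    *dotResult = maxDot;
    return maxIndex;
}
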
433 
434 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
435 
436 long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
437 {
438  const float4 *vertices = (const float4*) vv;
439  static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
440  float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
441  float4 vvec = _mm_loadu_ps( vec );
442  float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa ));
443  float4 vLo = _mm_movelh_ps( vvec, vvec );
444 
445  long minIndex = -1L;
446 
447  size_t segment = 0;
448  float4 stack_array[ STACK_ARRAY_COUNT ];
449 
450 #if DEBUG
451  //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
452 #endif
453 
454  size_t index;
455  float4 min;
456  // Faster loop without cleanup code for full tiles
457  for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
458  {
459  min = dotmin;
460 
461  for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
462  { // do four dot products at a time. Carefully avoid touching the w element.
463  float4 v0 = vertices[0];
464  float4 v1 = vertices[1];
465  float4 v2 = vertices[2];
466  float4 v3 = vertices[3]; vertices += 4;
467 
468  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
469  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
470  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
471  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
472 
473  lo0 = lo0*vLo;
474  lo1 = lo1*vLo;
475  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
476  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
477  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
478  z = z*vHi;
479  x = x+y;
480  x = x+z;
481  stack_array[index] = x;
482  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
483 
484  v0 = vertices[0];
485  v1 = vertices[1];
486  v2 = vertices[2];
487  v3 = vertices[3]; vertices += 4;
488 
489  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
490  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
491  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
492  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
493 
494  lo0 = lo0*vLo;
495  lo1 = lo1*vLo;
496  z = _mm_shuffle_ps(hi0, hi1, 0x88);
497  x = _mm_shuffle_ps(lo0, lo1, 0x88);
498  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
499  z = z*vHi;
500  x = x+y;
501  x = x+z;
502  stack_array[index+1] = x;
503  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
504 
505  v0 = vertices[0];
506  v1 = vertices[1];
507  v2 = vertices[2];
508  v3 = vertices[3]; vertices += 4;
509 
510  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
511  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
512  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
513  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
514 
515  lo0 = lo0*vLo;
516  lo1 = lo1*vLo;
517  z = _mm_shuffle_ps(hi0, hi1, 0x88);
518  x = _mm_shuffle_ps(lo0, lo1, 0x88);
519  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
520  z = z*vHi;
521  x = x+y;
522  x = x+z;
523  stack_array[index+2] = x;
524  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
525 
526  v0 = vertices[0];
527  v1 = vertices[1];
528  v2 = vertices[2];
529  v3 = vertices[3]; vertices += 4;
530 
531  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
532  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
533  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
534  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
535 
536  lo0 = lo0*vLo;
537  lo1 = lo1*vLo;
538  z = _mm_shuffle_ps(hi0, hi1, 0x88);
539  x = _mm_shuffle_ps(lo0, lo1, 0x88);
540  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
541  z = z*vHi;
542  x = x+y;
543  x = x+z;
544  stack_array[index+3] = x;
545  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
546 
547  // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
548  }
549 
550  // If we found a new min
551  if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
552  {
553  // copy the new min across all lanes of our min accumulator
554  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
555  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
556 
557  dotmin = min;
558 
559  // find first occurrence of that min
560  size_t test;
561  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
562  {}
563  // record where it is.
564  minIndex = 4*index + segment + indexTable[test];
565  }
566  }
567 
568  // account for work we've already done
569  count -= segment;
570 
571  // Deal with the last < STACK_ARRAY_COUNT vectors
572  min = dotmin;
573  index = 0;
574 
575 
576  if(btUnlikely( count > 16) )
577  {
578  for( ; index + 4 <= count / 4; index+=4 )
579  { // do four dot products at a time. Carefully avoid touching the w element.
580  float4 v0 = vertices[0];
581  float4 v1 = vertices[1];
582  float4 v2 = vertices[2];
583  float4 v3 = vertices[3]; vertices += 4;
584 
585  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
586  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
587  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
588  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
589 
590  lo0 = lo0*vLo;
591  lo1 = lo1*vLo;
592  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
593  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
594  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
595  z = z*vHi;
596  x = x+y;
597  x = x+z;
598  stack_array[index] = x;
599  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
600 
601  v0 = vertices[0];
602  v1 = vertices[1];
603  v2 = vertices[2];
604  v3 = vertices[3]; vertices += 4;
605 
606  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
607  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
608  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
609  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
610 
611  lo0 = lo0*vLo;
612  lo1 = lo1*vLo;
613  z = _mm_shuffle_ps(hi0, hi1, 0x88);
614  x = _mm_shuffle_ps(lo0, lo1, 0x88);
615  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
616  z = z*vHi;
617  x = x+y;
618  x = x+z;
619  stack_array[index+1] = x;
620  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
621 
622  v0 = vertices[0];
623  v1 = vertices[1];
624  v2 = vertices[2];
625  v3 = vertices[3]; vertices += 4;
626 
627  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
628  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
629  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
630  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
631 
632  lo0 = lo0*vLo;
633  lo1 = lo1*vLo;
634  z = _mm_shuffle_ps(hi0, hi1, 0x88);
635  x = _mm_shuffle_ps(lo0, lo1, 0x88);
636  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
637  z = z*vHi;
638  x = x+y;
639  x = x+z;
640  stack_array[index+2] = x;
641  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
642 
643  v0 = vertices[0];
644  v1 = vertices[1];
645  v2 = vertices[2];
646  v3 = vertices[3]; vertices += 4;
647 
648  lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
649  hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
650  lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
651  hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
652 
653  lo0 = lo0*vLo;
654  lo1 = lo1*vLo;
655  z = _mm_shuffle_ps(hi0, hi1, 0x88);
656  x = _mm_shuffle_ps(lo0, lo1, 0x88);
657  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
658  z = z*vHi;
659  x = x+y;
660  x = x+z;
661  stack_array[index+3] = x;
662  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
663 
664  // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
665  }
666  }
667 
668  size_t localCount = (count & -4L) - 4*index;
669  if( localCount )
670  {
671 
672 
673 #ifdef __APPLE__
674  vertices += localCount; // counter the offset
675  float4 t0, t1, t2, t3, t4;
676  size_t byteIndex = -(localCount) * sizeof(float);
677  float4 * sap = &stack_array[index + localCount / 4];
678 
679  asm volatile
680  ( ".align 4 \n\
681  0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
682  movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
683  movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
684  movaps %[t0], %[min] // vertices[0] \n\
685  movlhps %[t1], %[min] // x0y0x1y1 \n\
686  movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
687  movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
688  mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
689  movhlps %[t0], %[t1] // z0w0z1w1 \n\
690  movaps %[t3], %[t0] // vertices[2] \n\
691  movlhps %[t4], %[t0] // x2y2x3y3 \n\
692  movhlps %[t3], %[t4] // z2w2z3w3 \n\
693  mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
694  shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
695  mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
696  movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
697  shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
698  shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
699  addps %[t3], %[min] // x + y \n\
700  addps %[t1], %[min] // x + y + z \n\
701  movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
702  minps %[t2], %[min] // record min, restore min \n\
703  add $16, %[byteIndex] // advance loop counter\n\
704  jnz 0b \n\
705  "
706  : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
707  : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
708  : "memory", "cc"
709  );
710  index += localCount/4;
711 #else
712  {
713  for( unsigned int i=0; i<localCount/4; i++,index++)
714  { // do four dot products at a time. Carefully avoid touching the w element.
715  float4 v0 = vertices[0];
716  float4 v1 = vertices[1];
717  float4 v2 = vertices[2];
718  float4 v3 = vertices[3];
719  vertices += 4;
720 
721  float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
722  float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
723  float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
724  float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
725 
726  lo0 = lo0*vLo;
727  lo1 = lo1*vLo;
728  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
729  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
730  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
731  z = z*vHi;
732  x = x+y;
733  x = x+z;
734  stack_array[index] = x;
735  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
736  }
737  }
738 
739 #endif
740  }
741 
742  // process the last few points
743  if( count & 3 )
744  {
745  float4 v0, v1, v2, x, y, z;
746  switch( count & 3 )
747  {
748  case 3:
749  {
750  v0 = vertices[0];
751  v1 = vertices[1];
752  v2 = vertices[2];
753 
754  // Calculate 3 dot products, transpose, duplicate v2
755  float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
756  float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
757  lo0 = lo0*vLo;
758  z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
759  z = z*vHi;
760  float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
761  lo1 = lo1*vLo;
762  x = _mm_shuffle_ps(lo0, lo1, 0x88);
763  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
764  }
765  break;
766  case 2:
767  {
768  v0 = vertices[0];
769  v1 = vertices[1];
770  float4 xy = _mm_movelh_ps(v0, v1);
771  z = _mm_movehl_ps(v1, v0);
772  xy = xy*vLo;
773  z = _mm_shuffle_ps( z, z, 0xa8);
774  x = _mm_shuffle_ps( xy, xy, 0xa8);
775  y = _mm_shuffle_ps( xy, xy, 0xfd);
776  z = z*vHi;
777  }
778  break;
779  case 1:
780  {
781  float4 xy = vertices[0];
782  z = _mm_shuffle_ps( xy, xy, 0xaa);
783  xy = xy*vLo;
784  z = z*vHi;
785  x = _mm_shuffle_ps(xy, xy, 0);
786  y = _mm_shuffle_ps(xy, xy, 0x55);
787  }
788  break;
789  }
790  x = x+y;
791  x = x+z;
792  stack_array[index] = x;
793  min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
794  index++;
795  }
796 
797  // if we found a new min.
798  if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
799  { // we found a new min. Search for it
800  // find min across the min vector, place in all elements of min -- big latency hit here
801  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
802  min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
803 
804  // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
805  // this where it actually makes a difference is handled in the early out at the top of the function,
806  // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
807  // complexity, and removed it.
808 
809  dotmin = min;
810 
811  // scan for the first occurrence of min in the array
812  size_t test;
813  for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
814  {}
815  minIndex = 4*index + segment + indexTable[test];
816  }
817 
818  _mm_store_ss( dotResult, dotmin);
819  return minIndex;
820 }
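
_mindot_large is the exact mirror of _maxdot_large, tracking a running minimum instead of a maximum. A hypothetical caller, assuming the declarations from btVector3.h are in scope and the vertex buffer is 16-byte aligned as the movaps loads above require, might look like this:

// Usage sketch -- names and data are invented for illustration.
alignas(16) float verts[8*4] = { /* x, y, z, w for each of 8 vertices */ };
float dir[4] = { 0.0f, 1.0f, 0.0f, 0.0f };
float bestDot, worstDot;
long  best  = _maxdot_large( verts, dir, 8, &bestDot );   // support vertex along dir
long  worst = _mindot_large( verts, dir, 8, &worstDot );  // support vertex along -dir
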
821 
822 
823 #elif defined BT_USE_NEON
824 
825 #define ARM_NEON_GCC_COMPATIBILITY 1
826 #include <arm_neon.h>
827 #include <sys/types.h>
828 #include <sys/sysctl.h> //for sysctlbyname
829 
830 static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
831 static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
832 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
833 static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
834 static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
835 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
836 
837 long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
838 long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
839 
840 
841 static inline uint32_t btGetCpuCapabilities( void )
842 {
843  static uint32_t capabilities = 0;
844  static bool testedCapabilities = false;
845 
846  if( 0 == testedCapabilities)
847  {
848  uint32_t hasFeature = 0;
849  size_t featureSize = sizeof( hasFeature );
850  int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
851 
852  if( 0 == err && hasFeature)
853  capabilities |= 0x2000;
854 
855  testedCapabilities = true;
856  }
857 
858  return capabilities;
859 }
860 
861 
862 
863 
864 static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
865 {
866 
867  if( btGetCpuCapabilities() & 0x2000 )
868  _maxdot_large = _maxdot_large_v1;
869  else
870  _maxdot_large = _maxdot_large_v0;
871 
872  return _maxdot_large(vv, vec, count, dotResult);
873 }
874 
875 static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
876 {
877 
878  if( btGetCpuCapabilities() & 0x2000 )
879  _mindot_large = _mindot_large_v1;
880  else
881  _mindot_large = _mindot_large_v0;
882 
883  return _mindot_large(vv, vec, count, dotResult);
884 }
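
The two _sel functions above implement a lazy dispatch: the exported symbols are function pointers that initially aim at a selector, and the first call probes the CPU once (hw.optional.neon_hpfp via sysctlbyname), rebinds the pointer to the best kernel, and forwards the call. Distilled into a standalone sketch (identifier names invented for illustration):

// Self-rebinding function pointer -- a sketch of the pattern used above.
typedef long (*dot_kernel_t)( const float *vv, const float *vec, unsigned long count, float *dotResult );
static long dot_kernel_select( const float *vv, const float *vec, unsigned long count, float *dotResult );
static dot_kernel_t dot_kernel = dot_kernel_select;        // first call routes through the selector

static long dot_kernel_select( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    dot_kernel = ( btGetCpuCapabilities() & 0x2000 ) ? _maxdot_large_v1 : _maxdot_large_v0;
    return dot_kernel( vv, vec, count, dotResult );        // later calls skip the probe entirely
}
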
885 
886 
887 
888 #if defined __arm__
889 # define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890 #else
891 // support 64-bit ARM
892 # define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893 #endif
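
On 32-bit ARM the macro above uses inline assembly so the load can assert 128-bit alignment (the ", :128" qualifier) and post-increment the pointer in one instruction; the 64-bit branch simply dereferences and bumps the pointer. Roughly the same thing expressed with intrinsics (function name invented for illustration):

// Intrinsic-only sketch of vld1q_f32_aligned_postincrement -- illustration, not the Bullet macro.
static inline float32x4_t load_float4_and_advance( const float **pp )
{
    float32x4_t r = vld1q_f32( *pp );   // plain vld1q_f32 only requires element (4-byte) alignment
    *pp += 4;                           // advance by one float32x4_t (16 bytes)
    return r;
}
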
894 
895 
896 long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
897 {
898  unsigned long i = 0;
899  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
900  float32x2_t vLo = vget_low_f32(vvec);
901  float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
902  float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
903  float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
904  uint32x2_t indexLo = (uint32x2_t) {0, 1};
905  uint32x2_t indexHi = (uint32x2_t) {2, 3};
906  uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907  uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
908  const uint32x2_t four = (uint32x2_t) {4,4};
909 
910  for( ; i+8 <= count; i+= 8 )
911  {
912  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
913  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
914  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
915  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
916 
917  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
918  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
919  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
920  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
921 
922  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
923  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
924  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
925  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
926 
927  float32x2_t rLo = vpadd_f32( xy0, xy1);
928  float32x2_t rHi = vpadd_f32( xy2, xy3);
929  rLo = vadd_f32(rLo, zLo);
930  rHi = vadd_f32(rHi, zHi);
931 
932  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
933  uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
934  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
935  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
936  iLo = vbsl_u32(maskLo, indexLo, iLo);
937  iHi = vbsl_u32(maskHi, indexHi, iHi);
938  indexLo = vadd_u32(indexLo, four);
939  indexHi = vadd_u32(indexHi, four);
940 
941  v0 = vld1q_f32_aligned_postincrement( vv );
942  v1 = vld1q_f32_aligned_postincrement( vv );
943  v2 = vld1q_f32_aligned_postincrement( vv );
944  v3 = vld1q_f32_aligned_postincrement( vv );
945 
946  xy0 = vmul_f32( vget_low_f32(v0), vLo);
947  xy1 = vmul_f32( vget_low_f32(v1), vLo);
948  xy2 = vmul_f32( vget_low_f32(v2), vLo);
949  xy3 = vmul_f32( vget_low_f32(v3), vLo);
950 
951  z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
952  z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
953  zLo = vmul_f32( z0.val[0], vHi);
954  zHi = vmul_f32( z1.val[0], vHi);
955 
956  rLo = vpadd_f32( xy0, xy1);
957  rHi = vpadd_f32( xy2, xy3);
958  rLo = vadd_f32(rLo, zLo);
959  rHi = vadd_f32(rHi, zHi);
960 
961  maskLo = vcgt_f32( rLo, dotMaxLo );
962  maskHi = vcgt_f32( rHi, dotMaxHi );
963  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
964  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
965  iLo = vbsl_u32(maskLo, indexLo, iLo);
966  iHi = vbsl_u32(maskHi, indexHi, iHi);
967  indexLo = vadd_u32(indexLo, four);
968  indexHi = vadd_u32(indexHi, four);
969  }
970 
971  for( ; i+4 <= count; i+= 4 )
972  {
973  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
974  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
975  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
976  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
977 
978  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
979  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
980  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
981  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
982 
983  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
984  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
985  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
986  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
987 
988  float32x2_t rLo = vpadd_f32( xy0, xy1);
989  float32x2_t rHi = vpadd_f32( xy2, xy3);
990  rLo = vadd_f32(rLo, zLo);
991  rHi = vadd_f32(rHi, zHi);
992 
993  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
994  uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
995  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
996  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
997  iLo = vbsl_u32(maskLo, indexLo, iLo);
998  iHi = vbsl_u32(maskHi, indexHi, iHi);
999  indexLo = vadd_u32(indexLo, four);
1000  indexHi = vadd_u32(indexHi, four);
1001  }
1002 
1003  switch( count & 3 )
1004  {
1005  case 3:
1006  {
1007  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1008  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1009  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1010 
1011  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1012  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1013  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1014 
1015  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1016  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1017  float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1018 
1019  float32x2_t rLo = vpadd_f32( xy0, xy1);
1020  float32x2_t rHi = vpadd_f32( xy2, xy2);
1021  rLo = vadd_f32(rLo, zLo);
1022  rHi = vadd_f32(rHi, zHi);
1023 
1024  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1025  uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
1026  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1027  dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
1028  iLo = vbsl_u32(maskLo, indexLo, iLo);
1029  iHi = vbsl_u32(maskHi, indexHi, iHi);
1030  }
1031  break;
1032  case 2:
1033  {
1034  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1035  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1036 
1037  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1038  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1039 
1040  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1041  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1042 
1043  float32x2_t rLo = vpadd_f32( xy0, xy1);
1044  rLo = vadd_f32(rLo, zLo);
1045 
1046  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1047  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1048  iLo = vbsl_u32(maskLo, indexLo, iLo);
1049  }
1050  break;
1051  case 1:
1052  {
1053  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1054  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1055  float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1056  float32x2_t zLo = vmul_f32( z0, vHi);
1057  float32x2_t rLo = vpadd_f32( xy0, xy0);
1058  rLo = vadd_f32(rLo, zLo);
1059  uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
1060  dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
1061  iLo = vbsl_u32(maskLo, indexLo, iLo);
1062  }
1063  break;
1064 
1065  default:
1066  break;
1067  }
1068 
1069  // select best answer between hi and lo results
1070  uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
1071  dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1072  iLo = vbsl_u32(mask, iHi, iLo);
1073 
1074  // select best answer between even and odd results
1075  dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1076  iHi = vdup_lane_u32(iLo, 1);
1077  mask = vcgt_f32( dotMaxHi, dotMaxLo );
1078  dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1079  iLo = vbsl_u32(mask, iHi, iLo);
1080 
1081  *dotResult = vget_lane_f32( dotMaxLo, 0);
1082  return vget_lane_u32(iLo, 0);
1083 }
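
The reduction at the end of _maxdot_large_v0 folds the four tracked (dot, index) pairs down to one: first the high lanes against the low lanes, then the odd lane against the even lane. In scalar terms each vcgt_f32/vbsl fold step amounts to the following (illustration only; ties keep the candidate already held in the low half):

// Scalar equivalent of one fold step in the final reduction -- illustration only.
static long fold_candidate( float dotLo, long idxLo, float dotHi, long idxHi, float *outDot )
{
    if( dotHi > dotLo ) { dotLo = dotHi; idxLo = idxHi; }
    *outDot = dotLo;
    return idxLo;
}
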
1084 
1085 
1086 long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1087 {
1088  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1089  float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1090  float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1091  const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1092  uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1093  uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1094  float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
1095 
1096  unsigned long i = 0;
1097  for( ; i + 8 <= count; i += 8 )
1098  {
1099  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1100  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1101  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1102  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1103 
1104  // the next two lines should resolve to a single vswp d, d
1105  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1106  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1107  // the next two lines should resolve to a single vswp d, d
1108  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1109  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1110 
1111  xy0 = vmulq_f32(xy0, vLo);
1112  xy1 = vmulq_f32(xy1, vLo);
1113 
1114  float32x4x2_t zb = vuzpq_f32( z0, z1);
1115  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1116  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1117  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1118  x = vaddq_f32(x, z);
1119 
1120  uint32x4_t mask = vcgtq_f32(x, maxDot);
1121  maxDot = vbslq_f32( mask, x, maxDot);
1122  index = vbslq_u32(mask, local_index, index);
1123  local_index = vaddq_u32(local_index, four);
1124 
1125  v0 = vld1q_f32_aligned_postincrement( vv );
1126  v1 = vld1q_f32_aligned_postincrement( vv );
1127  v2 = vld1q_f32_aligned_postincrement( vv );
1128  v3 = vld1q_f32_aligned_postincrement( vv );
1129 
1130  // the next two lines should resolve to a single vswp d, d
1131  xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1132  xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1133  // the next two lines should resolve to a single vswp d, d
1134  z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1135  z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1136 
1137  xy0 = vmulq_f32(xy0, vLo);
1138  xy1 = vmulq_f32(xy1, vLo);
1139 
1140  zb = vuzpq_f32( z0, z1);
1141  z = vmulq_f32( zb.val[0], vHi);
1142  xy = vuzpq_f32( xy0, xy1);
1143  x = vaddq_f32(xy.val[0], xy.val[1]);
1144  x = vaddq_f32(x, z);
1145 
1146  mask = vcgtq_f32(x, maxDot);
1147  maxDot = vbslq_f32( mask, x, maxDot);
1148  index = vbslq_u32(mask, local_index, index);
1149  local_index = vaddq_u32(local_index, four);
1150  }
1151 
1152  for( ; i + 4 <= count; i += 4 )
1153  {
1154  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1155  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1156  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1157  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1158 
1159  // the next two lines should resolve to a single vswp d, d
1160  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1161  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1162  // the next two lines should resolve to a single vswp d, d
1163  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1164  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1165 
1166  xy0 = vmulq_f32(xy0, vLo);
1167  xy1 = vmulq_f32(xy1, vLo);
1168 
1169  float32x4x2_t zb = vuzpq_f32( z0, z1);
1170  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1171  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1172  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1173  x = vaddq_f32(x, z);
1174 
1175  uint32x4_t mask = vcgtq_f32(x, maxDot);
1176  maxDot = vbslq_f32( mask, x, maxDot);
1177  index = vbslq_u32(mask, local_index, index);
1178  local_index = vaddq_u32(local_index, four);
1179  }
1180 
1181  switch (count & 3) {
1182  case 3:
1183  {
1184  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1185  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1186  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1187 
1188  // the next two lines should resolve to a single vswp d, d
1189  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1190  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1191  // the next two lines should resolve to a single vswp d, d
1192  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1193  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1194 
1195  xy0 = vmulq_f32(xy0, vLo);
1196  xy1 = vmulq_f32(xy1, vLo);
1197 
1198  float32x4x2_t zb = vuzpq_f32( z0, z1);
1199  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1200  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1201  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1202  x = vaddq_f32(x, z);
1203 
1204  uint32x4_t mask = vcgtq_f32(x, maxDot);
1205  maxDot = vbslq_f32( mask, x, maxDot);
1206  index = vbslq_u32(mask, local_index, index);
1207  local_index = vaddq_u32(local_index, four);
1208  }
1209  break;
1210 
1211  case 2:
1212  {
1213  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1214  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1215 
1216  // the next two lines should resolve to a single vswp d, d
1217  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1218  // the next two lines should resolve to a single vswp d, d
1219  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1220 
1221  xy0 = vmulq_f32(xy0, vLo);
1222 
1223  float32x4x2_t zb = vuzpq_f32( z0, z0);
1224  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1225  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1226  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1227  x = vaddq_f32(x, z);
1228 
1229  uint32x4_t mask = vcgtq_f32(x, maxDot);
1230  maxDot = vbslq_f32( mask, x, maxDot);
1231  index = vbslq_u32(mask, local_index, index);
1232  local_index = vaddq_u32(local_index, four);
1233  }
1234  break;
1235 
1236  case 1:
1237  {
1238  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1239 
1240  // the next two lines should resolve to a single vswp d, d
1241  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1242  // the next two lines should resolve to a single vswp d, d
1243  float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1244 
1245  xy0 = vmulq_f32(xy0, vLo);
1246 
1247  z = vmulq_f32( z, vHi);
1248  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1249  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1250  x = vaddq_f32(x, z);
1251 
1252  uint32x4_t mask = vcgtq_f32(x, maxDot);
1253  maxDot = vbslq_f32( mask, x, maxDot);
1254  index = vbslq_u32(mask, local_index, index);
1255  local_index = vaddq_u32(local_index, four);
1256  }
1257  break;
1258 
1259  default:
1260  break;
1261  }
1262 
1263 
1264  // select best answer between hi and lo results
1265  uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
1266  float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1267  uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1268 
1269  // select best answer between even and odd results
1270  float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1271  uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1272  mask = vcgt_f32( maxDotO, maxDot2 );
1273  maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1274  index2 = vbsl_u32(mask, indexHi, index2);
1275 
1276  *dotResult = vget_lane_f32( maxDot2, 0);
1277  return vget_lane_u32(index2, 0);
1278 
1279 }
1280 
1281 long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
1282 {
1283  unsigned long i = 0;
1284  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1285  float32x2_t vLo = vget_low_f32(vvec);
1286  float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1287  float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1288  float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
1289  uint32x2_t indexLo = (uint32x2_t) {0, 1};
1290  uint32x2_t indexHi = (uint32x2_t) {2, 3};
1291  uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1292  uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1293  const uint32x2_t four = (uint32x2_t) {4,4};
1294 
1295  for( ; i+8 <= count; i+= 8 )
1296  {
1297  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1298  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1299  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1300  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1301 
1302  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1303  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1304  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1305  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1306 
1307  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1308  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1309  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1310  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1311 
1312  float32x2_t rLo = vpadd_f32( xy0, xy1);
1313  float32x2_t rHi = vpadd_f32( xy2, xy3);
1314  rLo = vadd_f32(rLo, zLo);
1315  rHi = vadd_f32(rHi, zHi);
1316 
1317  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1318  uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1319  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1320  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1321  iLo = vbsl_u32(maskLo, indexLo, iLo);
1322  iHi = vbsl_u32(maskHi, indexHi, iHi);
1323  indexLo = vadd_u32(indexLo, four);
1324  indexHi = vadd_u32(indexHi, four);
1325 
1326  v0 = vld1q_f32_aligned_postincrement( vv );
1327  v1 = vld1q_f32_aligned_postincrement( vv );
1328  v2 = vld1q_f32_aligned_postincrement( vv );
1329  v3 = vld1q_f32_aligned_postincrement( vv );
1330 
1331  xy0 = vmul_f32( vget_low_f32(v0), vLo);
1332  xy1 = vmul_f32( vget_low_f32(v1), vLo);
1333  xy2 = vmul_f32( vget_low_f32(v2), vLo);
1334  xy3 = vmul_f32( vget_low_f32(v3), vLo);
1335 
1336  z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1337  z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1338  zLo = vmul_f32( z0.val[0], vHi);
1339  zHi = vmul_f32( z1.val[0], vHi);
1340 
1341  rLo = vpadd_f32( xy0, xy1);
1342  rHi = vpadd_f32( xy2, xy3);
1343  rLo = vadd_f32(rLo, zLo);
1344  rHi = vadd_f32(rHi, zHi);
1345 
1346  maskLo = vclt_f32( rLo, dotMinLo );
1347  maskHi = vclt_f32( rHi, dotMinHi );
1348  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1349  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1350  iLo = vbsl_u32(maskLo, indexLo, iLo);
1351  iHi = vbsl_u32(maskHi, indexHi, iHi);
1352  indexLo = vadd_u32(indexLo, four);
1353  indexHi = vadd_u32(indexHi, four);
1354  }
1355 
1356  for( ; i+4 <= count; i+= 4 )
1357  {
1358  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1359  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1360  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1361  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1362 
1363  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1364  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1365  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1366  float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
1367 
1368  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1369  float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
1370  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1371  float32x2_t zHi = vmul_f32( z1.val[0], vHi);
1372 
1373  float32x2_t rLo = vpadd_f32( xy0, xy1);
1374  float32x2_t rHi = vpadd_f32( xy2, xy3);
1375  rLo = vadd_f32(rLo, zLo);
1376  rHi = vadd_f32(rHi, zHi);
1377 
1378  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1379  uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1380  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1381  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1382  iLo = vbsl_u32(maskLo, indexLo, iLo);
1383  iHi = vbsl_u32(maskHi, indexHi, iHi);
1384  indexLo = vadd_u32(indexLo, four);
1385  indexHi = vadd_u32(indexHi, four);
1386  }
1387  switch( count & 3 )
1388  {
1389  case 3:
1390  {
1391  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1392  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1393  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1394 
1395  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1396  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1397  float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
1398 
1399  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1400  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1401  float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1402 
1403  float32x2_t rLo = vpadd_f32( xy0, xy1);
1404  float32x2_t rHi = vpadd_f32( xy2, xy2);
1405  rLo = vadd_f32(rLo, zLo);
1406  rHi = vadd_f32(rHi, zHi);
1407 
1408  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1409  uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
1410  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1411  dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
1412  iLo = vbsl_u32(maskLo, indexLo, iLo);
1413  iHi = vbsl_u32(maskHi, indexHi, iHi);
1414  }
1415  break;
1416  case 2:
1417  {
1418  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1419  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1420 
1421  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1422  float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
1423 
1424  float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
1425  float32x2_t zLo = vmul_f32( z0.val[0], vHi);
1426 
1427  float32x2_t rLo = vpadd_f32( xy0, xy1);
1428  rLo = vadd_f32(rLo, zLo);
1429 
1430  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1431  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1432  iLo = vbsl_u32(maskLo, indexLo, iLo);
1433  }
1434  break;
1435  case 1:
1436  {
1437  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1438  float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
1439  float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1440  float32x2_t zLo = vmul_f32( z0, vHi);
1441  float32x2_t rLo = vpadd_f32( xy0, xy0);
1442  rLo = vadd_f32(rLo, zLo);
1443  uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
1444  dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
1445  iLo = vbsl_u32(maskLo, indexLo, iLo);
1446  }
1447  break;
1448 
1449  default:
1450  break;
1451  }
1452 
1453  // select best answer between hi and lo results
1454  uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
1455  dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1456  iLo = vbsl_u32(mask, iHi, iLo);
1457 
1458  // select best answer between even and odd results
1459  dotMinHi = vdup_lane_f32(dotMinLo, 1);
1460  iHi = vdup_lane_u32(iLo, 1);
1461  mask = vclt_f32( dotMinHi, dotMinLo );
1462  dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1463  iLo = vbsl_u32(mask, iHi, iLo);
1464 
1465  *dotResult = vget_lane_f32( dotMinLo, 0);
1466  return vget_lane_u32(iLo, 0);
1467 }
1468 
1469 long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
1470 {
1471  float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
1472  float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1473  float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1474  const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
1475  uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
1476  uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
1477  float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
1478 
1479  unsigned long i = 0;
1480  for( ; i + 8 <= count; i += 8 )
1481  {
1482  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1483  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1484  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1485  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1486 
1487  // the next two lines should resolve to a single vswp d, d
1488  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1489  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1490  // the next two lines should resolve to a single vswp d, d
1491  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1492  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1493 
1494  xy0 = vmulq_f32(xy0, vLo);
1495  xy1 = vmulq_f32(xy1, vLo);
1496 
1497  float32x4x2_t zb = vuzpq_f32( z0, z1);
1498  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1499  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1500  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1501  x = vaddq_f32(x, z);
1502 
1503  uint32x4_t mask = vcltq_f32(x, minDot);
1504  minDot = vbslq_f32( mask, x, minDot);
1505  index = vbslq_u32(mask, local_index, index);
1506  local_index = vaddq_u32(local_index, four);
1507 
1508  v0 = vld1q_f32_aligned_postincrement( vv );
1509  v1 = vld1q_f32_aligned_postincrement( vv );
1510  v2 = vld1q_f32_aligned_postincrement( vv );
1511  v3 = vld1q_f32_aligned_postincrement( vv );
1512 
1513  // the next two lines should resolve to a single vswp d, d
1514  xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1515  xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1516  // the next two lines should resolve to a single vswp d, d
1517  z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1518  z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1519 
1520  xy0 = vmulq_f32(xy0, vLo);
1521  xy1 = vmulq_f32(xy1, vLo);
1522 
1523  zb = vuzpq_f32( z0, z1);
1524  z = vmulq_f32( zb.val[0], vHi);
1525  xy = vuzpq_f32( xy0, xy1);
1526  x = vaddq_f32(xy.val[0], xy.val[1]);
1527  x = vaddq_f32(x, z);
1528 
1529  mask = vcltq_f32(x, minDot);
1530  minDot = vbslq_f32( mask, x, minDot);
1531  index = vbslq_u32(mask, local_index, index);
1532  local_index = vaddq_u32(local_index, four);
1533  }
1534 
1535  for( ; i + 4 <= count; i += 4 )
1536  {
1537  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1538  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1539  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1540  float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
1541 
1542  // the next two lines should resolve to a single vswp d, d
1543  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1544  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
1545  // the next two lines should resolve to a single vswp d, d
1546  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1547  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
1548 
1549  xy0 = vmulq_f32(xy0, vLo);
1550  xy1 = vmulq_f32(xy1, vLo);
1551 
1552  float32x4x2_t zb = vuzpq_f32( z0, z1);
1553  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1554  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1555  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1556  x = vaddq_f32(x, z);
1557 
1558  uint32x4_t mask = vcltq_f32(x, minDot);
1559  minDot = vbslq_f32( mask, x, minDot);
1560  index = vbslq_u32(mask, local_index, index);
1561  local_index = vaddq_u32(local_index, four);
1562  }
1563 
1564  switch (count & 3) {
1565  case 3:
1566  {
1567  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1568  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1569  float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
1570 
1571  // the next two lines should resolve to a single vswp d, d
1572  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1573  float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
1574  // the next two lines should resolve to a single vswp d, d
1575  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1576  float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
1577 
1578  xy0 = vmulq_f32(xy0, vLo);
1579  xy1 = vmulq_f32(xy1, vLo);
1580 
1581  float32x4x2_t zb = vuzpq_f32( z0, z1);
1582  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1583  float32x4x2_t xy = vuzpq_f32( xy0, xy1);
1584  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1585  x = vaddq_f32(x, z);
1586 
1587  uint32x4_t mask = vcltq_f32(x, minDot);
1588  minDot = vbslq_f32( mask, x, minDot);
1589  index = vbslq_u32(mask, local_index, index);
1590  local_index = vaddq_u32(local_index, four);
1591  }
1592  break;
1593 
1594  case 2:
1595  {
1596  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1597  float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
1598 
1599  // the next two lines should resolve to a single vswp d, d
1600  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
1601  // the next two lines should resolve to a single vswp d, d
1602  float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
1603 
1604  xy0 = vmulq_f32(xy0, vLo);
1605 
1606  float32x4x2_t zb = vuzpq_f32( z0, z0);
1607  float32x4_t z = vmulq_f32( zb.val[0], vHi);
1608  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1609  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1610  x = vaddq_f32(x, z);
1611 
1612  uint32x4_t mask = vcltq_f32(x, minDot);
1613  minDot = vbslq_f32( mask, x, minDot);
1614  index = vbslq_u32(mask, local_index, index);
1615  local_index = vaddq_u32(local_index, four);
1616  }
1617  break;
1618 
1619  case 1:
1620  {
1621  float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
1622 
1623  // the next two lines should resolve to a single vswp d, d
1624  float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
1625  // the next two lines should resolve to a single vswp d, d
1626  float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1627 
1628  xy0 = vmulq_f32(xy0, vLo);
1629 
1630  z = vmulq_f32( z, vHi);
1631  float32x4x2_t xy = vuzpq_f32( xy0, xy0);
1632  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1633  x = vaddq_f32(x, z);
1634 
1635  uint32x4_t mask = vcltq_f32(x, minDot);
1636  minDot = vbslq_f32( mask, x, minDot);
1637  index = vbslq_u32(mask, local_index, index);
1638  local_index = vaddq_u32(local_index, four);
1639  }
1640  break;
1641 
1642  default:
1643  break;
1644  }
1645 
1646 
1647  // select best answer between hi and lo results
1648  uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
1649  float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1650  uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1651 
1652  // select best answer between even and odd results
1653  float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1654  uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1655  mask = vclt_f32( minDotO, minDot2 );
1656  minDot2 = vbsl_f32(mask, minDotO, minDot2);
1657  index2 = vbsl_u32(mask, indexHi, index2);
1658 
1659  *dotResult = vget_lane_f32( minDot2, 0);
1660  return vget_lane_u32(index2, 0);
1661 
1662 }
1663 
1664 #else
1665  #error Unhandled __APPLE__ arch
1666 #endif
1667 
1668 #endif /* BT_USE_SIMD_VECTOR3 */
1669 
1670 