Bullet Collision Detection & Physics Library
btThreadSupportWin32.cpp
Go to the documentation of this file.
1 /*
2 Bullet Continuous Collision Detection and Physics Library
3 Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
4 
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
10 
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15 
16 #if defined( _WIN32 ) && BT_THREADSAFE
17 
18 #include "LinearMath/btScalar.h"
19 #include "LinearMath/btMinMax.h"
21 #include "LinearMath/btThreads.h"
23 #include <windows.h>
24 #include <stdio.h>
25 
26 
27 struct btProcessorInfo
28 {
29  int numLogicalProcessors;
30  int numCores;
31  int numNumaNodes;
32  int numL1Cache;
33  int numL2Cache;
34  int numL3Cache;
35  int numPhysicalPackages;
36  static const int maxNumTeamMasks = 32;
37  int numTeamMasks;
38  UINT64 processorTeamMasks[ maxNumTeamMasks ];
39 };
40 
41 UINT64 getProcessorTeamMask( const btProcessorInfo& procInfo, int procId )
42 {
43  UINT64 procMask = UINT64( 1 ) << procId;
44  for ( int i = 0; i < procInfo.numTeamMasks; ++i )
45  {
46  if ( procMask & procInfo.processorTeamMasks[ i ] )
47  {
48  return procInfo.processorTeamMasks[ i ];
49  }
50  }
51  return 0;
52 }
53 
54 int getProcessorTeamIndex( const btProcessorInfo& procInfo, int procId )
55 {
56  UINT64 procMask = UINT64( 1 ) << procId;
57  for ( int i = 0; i < procInfo.numTeamMasks; ++i )
58  {
59  if ( procMask & procInfo.processorTeamMasks[ i ] )
60  {
61  return i;
62  }
63  }
64  return -1;
65 }
66 
67 int countSetBits( ULONG64 bits )
68 {
69  int count = 0;
70  while ( bits )
71  {
72  if ( bits & 1 )
73  {
74  count++;
75  }
76  bits >>= 1;
77  }
78  return count;
79 }
80 
81 
82 typedef BOOL( WINAPI *Pfn_GetLogicalProcessorInformation )( PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD );
83 
84 
85 void getProcessorInformation( btProcessorInfo* procInfo )
86 {
87  memset( procInfo, 0, sizeof( *procInfo ) );
88  Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
89  (Pfn_GetLogicalProcessorInformation) GetProcAddress( GetModuleHandle( TEXT( "kernel32" ) ), "GetLogicalProcessorInformation" );
90  if ( getLogicalProcInfo == NULL )
91  {
92  // no info
93  return;
94  }
95  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
96  DWORD bufSize = 0;
97  while ( true )
98  {
99  if ( getLogicalProcInfo( buf, &bufSize ) )
100  {
101  break;
102  }
103  else
104  {
105  if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER )
106  {
107  if ( buf )
108  {
109  free( buf );
110  }
111  buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION) malloc( bufSize );
112  }
113  }
114  }
115 
116  int len = bufSize / sizeof( *buf );
117  for ( int i = 0; i < len; ++i )
118  {
119  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
120  switch ( info->Relationship )
121  {
122  case RelationNumaNode:
123  procInfo->numNumaNodes++;
124  break;
125 
126  case RelationProcessorCore:
127  procInfo->numCores++;
128  procInfo->numLogicalProcessors += countSetBits( info->ProcessorMask );
129  break;
130 
131  case RelationCache:
132  if ( info->Cache.Level == 1 )
133  {
134  procInfo->numL1Cache++;
135  }
136  else if ( info->Cache.Level == 2 )
137  {
138  procInfo->numL2Cache++;
139  }
140  else if ( info->Cache.Level == 3 )
141  {
142  procInfo->numL3Cache++;
143  // processors that share L3 cache are considered to be on the same team
144  // because they can more easily work together on the same data.
145  // Large performance penalties will occur if 2 or more threads from different
146  // teams attempt to frequently read and modify the same cache lines.
147  //
148  // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
149  // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
150  // CCXs are operating on the same data, many cycles will be spent keeping the
151  // two caches coherent.
152  if ( procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks )
153  {
154  procInfo->processorTeamMasks[ procInfo->numTeamMasks ] = info->ProcessorMask;
155  procInfo->numTeamMasks++;
156  }
157  }
158  break;
159 
160  case RelationProcessorPackage:
161  procInfo->numPhysicalPackages++;
162  break;
163  }
164  }
165  free( buf );
166 }
167 
168 
169 
171 class btThreadSupportWin32 : public btThreadSupportInterface
172 {
173 public:
174  struct btThreadStatus
175  {
176  int m_taskId;
177  int m_commandId;
178  int m_status;
179 
180  ThreadFunc m_userThreadFunc;
181  void* m_userPtr; //for taskDesc etc
182 
183  void* m_threadHandle; //this one is calling 'Win32ThreadFunc'
184 
185  void* m_eventStartHandle;
186  char m_eventStartHandleName[ 32 ];
187 
188  void* m_eventCompleteHandle;
189  char m_eventCompleteHandleName[ 32 ];
190  };
191 
192 private:
193  btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
194  btAlignedObjectArray<void*> m_completeHandles;
195  int m_numThreads;
196  DWORD_PTR m_startedThreadMask;
197  btProcessorInfo m_processorInfo;
198 
199  void startThreads( const ConstructionInfo& threadInfo );
200  void stopThreads();
201  int waitForResponse();
202 
203 public:
204 
205  btThreadSupportWin32( const ConstructionInfo& threadConstructionInfo );
206  virtual ~btThreadSupportWin32();
207 
208  virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
209  virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
210  virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; }
211 
212  virtual void runTask( int threadIndex, void* userData ) BT_OVERRIDE;
213  virtual void waitForAllTasks() BT_OVERRIDE;
214 
215  virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
216  virtual void deleteCriticalSection( btCriticalSection* criticalSection ) BT_OVERRIDE;
217 };
218 
219 
220 btThreadSupportWin32::btThreadSupportWin32( const ConstructionInfo & threadConstructionInfo )
221 {
222  startThreads( threadConstructionInfo );
223 }
224 
225 
226 btThreadSupportWin32::~btThreadSupportWin32()
227 {
228  stopThreads();
229 }
230 
231 
232 DWORD WINAPI win32threadStartFunc( LPVOID lpParam )
233 {
234  btThreadSupportWin32::btThreadStatus* status = ( btThreadSupportWin32::btThreadStatus* )lpParam;
235 
236  while ( 1 )
237  {
238  WaitForSingleObject( status->m_eventStartHandle, INFINITE );
239  void* userPtr = status->m_userPtr;
240 
241  if ( userPtr )
242  {
243  btAssert( status->m_status );
244  status->m_userThreadFunc( userPtr );
245  status->m_status = 2;
246  SetEvent( status->m_eventCompleteHandle );
247  }
248  else
249  {
250  //exit Thread
251  status->m_status = 3;
252  printf( "Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle );
253  SetEvent( status->m_eventCompleteHandle );
254  break;
255  }
256  }
257  printf( "Thread TERMINATED\n" );
258  return 0;
259 }
260 
261 
262 void btThreadSupportWin32::runTask( int threadIndex, void* userData )
263 {
264  btThreadStatus& threadStatus = m_activeThreadStatus[ threadIndex ];
265  btAssert( threadIndex >= 0 );
266  btAssert( int( threadIndex ) < m_activeThreadStatus.size() );
267 
268  threadStatus.m_commandId = 1;
269  threadStatus.m_status = 1;
270  threadStatus.m_userPtr = userData;
271  m_startedThreadMask |= DWORD_PTR( 1 ) << threadIndex;
272 
274  SetEvent( threadStatus.m_eventStartHandle );
275 }
276 
277 
278 int btThreadSupportWin32::waitForResponse()
279 {
280  btAssert( m_activeThreadStatus.size() );
281 
282  int last = -1;
283  DWORD res = WaitForMultipleObjects( m_completeHandles.size(), &m_completeHandles[ 0 ], FALSE, INFINITE );
284  btAssert( res != WAIT_FAILED );
285  last = res - WAIT_OBJECT_0;
286 
287  btThreadStatus& threadStatus = m_activeThreadStatus[ last ];
288  btAssert( threadStatus.m_threadHandle );
289  btAssert( threadStatus.m_eventCompleteHandle );
290 
291  //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
292  btAssert( threadStatus.m_status > 1 );
293  threadStatus.m_status = 0;
294 
296  btAssert( last >= 0 );
297  m_startedThreadMask &= ~( DWORD_PTR( 1 ) << last );
298 
299  return last;
300 }
301 
302 
303 void btThreadSupportWin32::waitForAllTasks()
304 {
305  while ( m_startedThreadMask )
306  {
307  waitForResponse();
308  }
309 }
310 
311 
312 void btThreadSupportWin32::startThreads( const ConstructionInfo& threadConstructionInfo )
313 {
314  static int uniqueId = 0;
315  uniqueId++;
316  btProcessorInfo& procInfo = m_processorInfo;
317  getProcessorInformation( &procInfo );
318  DWORD_PTR dwProcessAffinityMask = 0;
319  DWORD_PTR dwSystemAffinityMask = 0;
320  if ( !GetProcessAffinityMask( GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask ) )
321  {
322  dwProcessAffinityMask = 0;
323  }
325  m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
326 
327  m_activeThreadStatus.resize( m_numThreads );
328  m_completeHandles.resize( m_numThreads );
329  m_startedThreadMask = 0;
330 
331  // set main thread affinity
332  if ( DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask( procInfo, 0 ))
333  {
334  SetThreadAffinityMask( GetCurrentThread(), mask );
335  SetThreadIdealProcessor( GetCurrentThread(), 0 );
336  }
337 
338  for ( int i = 0; i < m_numThreads; i++ )
339  {
340  printf( "starting thread %d\n", i );
341 
342  btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
343 
344  LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
345  SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
346  LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
347  LPVOID lpParameter = &threadStatus;
348  DWORD dwCreationFlags = 0;
349  LPDWORD lpThreadId = 0;
350 
351  threadStatus.m_userPtr = 0;
352 
353  sprintf( threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
354  threadStatus.m_eventStartHandle = CreateEventA( 0, false, false, threadStatus.m_eventStartHandleName );
355 
356  sprintf( threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i );
357  threadStatus.m_eventCompleteHandle = CreateEventA( 0, false, false, threadStatus.m_eventCompleteHandleName );
358 
359  m_completeHandles[ i ] = threadStatus.m_eventCompleteHandle;
360 
361  HANDLE handle = CreateThread( lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId );
362  //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
363  // highest priority -- can cause erratic performance when numThreads > numCores
364  // we don't want worker threads to be higher priority than the main thread or the main thread could get
365  // totally shut out and unable to tell the workers to stop
366  //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
367 
368  {
369  int processorId = i + 1; // leave processor 0 for main thread
370  DWORD_PTR teamMask = getProcessorTeamMask( procInfo, processorId );
371  if ( teamMask )
372  {
373  // bind each thread to only execute on processors of it's assigned team
374  // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
375  // - for multi-socket Intel this will keep threads from migrating from one socket to another
376  // - for AMD Ryzen this will keep threads from migrating from one CCX to another
377  DWORD_PTR mask = teamMask & dwProcessAffinityMask;
378  if ( mask )
379  {
380  SetThreadAffinityMask( handle, mask );
381  }
382  }
383  SetThreadIdealProcessor( handle, processorId );
384  }
385 
386  threadStatus.m_taskId = i;
387  threadStatus.m_commandId = 0;
388  threadStatus.m_status = 0;
389  threadStatus.m_threadHandle = handle;
390  threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
391 
392  printf( "started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle );
393  }
394 }
395 
397 void btThreadSupportWin32::stopThreads()
398 {
399  for ( int i = 0; i < m_activeThreadStatus.size(); i++ )
400  {
401  btThreadStatus& threadStatus = m_activeThreadStatus[ i ];
402  if ( threadStatus.m_status > 0 )
403  {
404  WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
405  }
406 
407  threadStatus.m_userPtr = NULL;
408  SetEvent( threadStatus.m_eventStartHandle );
409  WaitForSingleObject( threadStatus.m_eventCompleteHandle, INFINITE );
410 
411  CloseHandle( threadStatus.m_eventCompleteHandle );
412  CloseHandle( threadStatus.m_eventStartHandle );
413  CloseHandle( threadStatus.m_threadHandle );
414 
415  }
416 
417  m_activeThreadStatus.clear();
418  m_completeHandles.clear();
419 }
420 
421 
422 class btWin32CriticalSection : public btCriticalSection
423 {
424 private:
425  CRITICAL_SECTION mCriticalSection;
426 
427 public:
428  btWin32CriticalSection()
429  {
430  InitializeCriticalSection( &mCriticalSection );
431  }
432 
433  ~btWin32CriticalSection()
434  {
435  DeleteCriticalSection( &mCriticalSection );
436  }
437 
438  void lock()
439  {
440  EnterCriticalSection( &mCriticalSection );
441  }
442 
443  void unlock()
444  {
445  LeaveCriticalSection( &mCriticalSection );
446  }
447 };
448 
449 
450 btCriticalSection* btThreadSupportWin32::createCriticalSection()
451 {
452  unsigned char* mem = (unsigned char*) btAlignedAlloc( sizeof( btWin32CriticalSection ), 16 );
453  btWin32CriticalSection* cs = new( mem ) btWin32CriticalSection();
454  return cs;
455 }
456 
457 void btThreadSupportWin32::deleteCriticalSection( btCriticalSection* criticalSection )
458 {
459  criticalSection->~btCriticalSection();
460  btAlignedFree( criticalSection );
461 }
462 
463 
464 btThreadSupportInterface* btThreadSupportInterface::create( const ConstructionInfo& info )
465 {
466  return new btThreadSupportWin32( info );
467 }
468 
469 
470 
471 #endif //defined(_WIN32) && BT_THREADSAFE
472 
The btAlignedObjectArray template class uses a subset of the stl::vector interface for its methods It...
static btThreadSupportInterface * create(const ConstructionInfo &info)
#define btAssert(x)
Definition: btScalar.h:131
const unsigned int BT_MAX_THREAD_COUNT
Definition: btThreads.h:33
void clear()
clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
int size() const
return the number of elements in the array
#define BT_OVERRIDE
Definition: btThreads.h:28
#define btAlignedFree(ptr)
void resize(int newsize, const T &fillData=T())
#define btAlignedAlloc(size, alignment)
const T & btMin(const T &a, const T &b)
Definition: btMinMax.h:23
static int uniqueId
Definition: btRigidBody.cpp:27