LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files, as it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, we made this return KMP_GTID_DNE to force serial_initialize
132  by the caller. We had to handle KMP_GTID_DNE at all call sites, or else guarantee
133  __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* fall back to thread-specific storage to try to determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
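/* Illustrative sketch (not part of this runtime): the loop above identifies the
   calling thread by checking whether the address of a local variable falls inside
   a registered thread's stack window [stackbase - stacksize, stackbase], relying
   on the stack growing downward. The hypothetical helpers below show just that
   containment test in isolation. */
#if 0
#include <cstddef>

struct example_stack_desc {
  char *stack_base;  // highest address of the stack (stacks grow down here)
  size_t stack_size; // bytes reserved below stack_base
};

// True if 'addr' lies within the stack described by 'desc'.
static bool example_addr_on_stack(const example_stack_desc &desc,
                                  const char *addr) {
  if (addr > desc.stack_base)
    return false; // above the base: cannot be this stack
  return (size_t)(desc.stack_base - addr) <= desc.stack_size;
}

// Probe each registered descriptor with the address of a local variable.
static int example_find_owning_thread(const example_stack_desc *descs, int n) {
  char probe; // lives on the calling thread's stack
  for (int i = 0; i < n; ++i)
    if (example_addr_on_stack(descs[i], &probe))
      return i;
  return -1; // unknown, e.g. an unregistered thread
}
#endif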
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
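/* Illustrative sketch (not part of this runtime): __kmp_get_global_thread_id_reg()
   registers a new root lazily -- try the fast paths first, and only when no gtid
   exists take the bootstrap lock, re-check initialization, then register. The
   names below are hypothetical and std::mutex stands in for the bootstrap lock. */
#if 0
#include <mutex>

static bool example_initialized = false;
static std::mutex example_init_lock;
static int example_next_id = 0;

static int example_register_root() { return example_next_id++; } // toy stand-in

static int example_get_or_register_id(int fast_path_id) {
  if (fast_path_id >= 0)
    return fast_path_id; // the fast path (e.g. TLS) already knew our id
  std::lock_guard<std::mutex> guard(example_init_lock);
  if (!example_initialized) { // re-check under the lock
    /* one-time serial initialization would happen here */
    example_initialized = true;
  }
  return example_register_root(); // safe to register this thread as a new root
}
#endif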
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
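/* Illustrative sketch (not part of this runtime): the check above reports a
   conflict when either endpoint of the current stack falls strictly inside
   another thread's [beg, end) range. The same predicate, stated compactly: */
#if 0
static bool example_stacks_overlap(const char *a_beg, const char *a_end,
                                   const char *b_beg, const char *b_end) {
  return (a_beg > b_beg && a_beg < b_end) ||
         (a_end > b_beg && a_end < b_end);
}
#endif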
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
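/* Illustrative sketch (not part of this runtime): __kmp_print_storage_map_gtid()
   and __kmp_warn() both splice the caller's format string into a fixed prefix
   with KMP_SNPRINTF and then hand the untouched va_list to a vprintf-style sink.
   A standalone version of that wrapper pattern, using the standard C library: */
#if 0
#include <cstdarg>
#include <cstdio>

static void example_warn(const char *format, ...) {
  char wrapped[512];
  // Build "example warning: <format>\n" first; the caller's arguments
  // are not consumed at this point.
  std::snprintf(wrapped, sizeof(wrapped), "example warning: %s\n", format);

  va_list ap;
  va_start(ap, format);
  std::vfprintf(stderr, wrapped, ap); // now expand the caller's arguments
  va_end(ap);
}

// Usage: example_warn("thread %d exceeded limit %d", tid, limit);
#endif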
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS by default abort() causes pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for a DLL, but it is a problem for the
445  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446  help, at least in some versions of MS C RTL.
447 
448  It seems the following sequence is the only way to simulate abort() and
449  avoid pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort, just call abort(); it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new sibling threads every time, call
592  * __kmp_get_gtid() here */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
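/* Illustrative sketch (not part of this runtime): the DllMain above performs a
   full shutdown only when lpReserved == NULL (FreeLibrary) and otherwise leaves
   cleanup to the OS. A bare Windows-only skeleton of that decision, with a
   hypothetical cleanup hook: */
#if 0
#include <windows.h>

static void example_library_shutdown(); // hypothetical cleanup hook

BOOL WINAPI ExampleDllMain(HINSTANCE, DWORD fdwReason, LPVOID lpReserved) {
  switch (fdwReason) {
  case DLL_PROCESS_DETACH:
    // lpReserved == NULL: FreeLibrary() -- worker threads are still alive and
    // the state is consistent, so a proper shutdown is safe.
    // lpReserved != NULL: process termination -- threads are already gone;
    // rely on the OS to reclaim resources.
    if (lpReserved == NULL)
      example_library_shutdown();
    break;
  default:
    break;
  }
  return TRUE;
}
#endif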
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
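/* Illustrative sketch (not part of this runtime): __kmp_parallel_deo/_dxo turn
   ordered sections into turn-taking on a shared counter -- each thread waits
   until t_value equals its tid, runs its ordered chunk, then publishes
   (tid + 1) % nproc. The sketch below uses std::atomic and yield; the runtime
   itself uses KMP_WAIT with memory fences. */
#if 0
#include <atomic>
#include <thread>

static std::atomic<int> example_turn{0}; // tid whose ordered chunk may run next

// Block until it is this thread's turn.
static void example_ordered_enter(int tid) {
  while (example_turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield();
}

// Pass the turn to the next thread in the team, wrapping around.
static void example_ordered_exit(int tid, int nproc) {
  example_turn.store((tid + 1) % nproc, std::memory_order_release);
}

// Per ordered chunk, each of the nproc threads would do:
//   example_ordered_enter(tid);  /* ordered work */
//   example_ordered_exit(tid, nproc);
#endif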
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
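/* Illustrative sketch (not part of this runtime): __kmp_enter_single picks the
   single-construct winner with an atomic compare-and-swap -- every thread bumps
   its private construct counter, and only the thread whose CAS advances the
   team counter executes the block. Hypothetical names, std::atomic instead of
   __kmp_atomic_compare_store_acq: */
#if 0
#include <atomic>

static std::atomic<int> example_team_construct{0}; // shared, one per team

// 'my_construct' is the calling thread's private count of single constructs.
// Returns true for exactly one thread per construct: the one whose CAS succeeds.
static bool example_enter_single(int &my_construct) {
  int expected = my_construct; // value both counters held before this construct
  ++my_construct;              // every thread advances its private counter
  return example_team_construct.compare_exchange_strong(
      expected, my_construct, std::memory_order_acquire);
}
#endif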
721 
722 /* Determine whether we can go parallel or must use a serialized parallel region,
723  * and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
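/* Illustrative sketch (not part of this runtime): each stage of
   __kmp_reserve_threads applies the same shape of check -- work out how many
   threads the request would add beyond those already accounted for, clamp
   against a limit, warn once if dyn-var is false, and never drop below one
   thread. One such clamp in isolation, with hypothetical names (the real code
   distinguishes device, contention-group, and capacity limits): */
#if 0
static int example_clamp_request(int requested, int already_running, int limit,
                                 bool dyn_var, bool *warned_once) {
  if (already_running + requested <= limit)
    return requested;                    // fits: keep the full request
  int allowed = limit - already_running; // how many more we may still add
  if (allowed <= 0)
    allowed = 1;                         // always allow at least the primary thread
  if (!dyn_var && !*warned_once) {
    *warned_once = true; // dyn-var false: emit a one-time "can't form team" warning
  }
  return allowed;
}
#endif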
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked that
915  earlier while holding the forkjoin lock. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's setup the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014  }
1015 
1016  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017  for (i = 0; i < team->t.t_nproc; i++) {
1018  kmp_info_t *thr = team->t.t_threads[i];
1019  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020  thr->th.th_prev_level != team->t.t_level) {
1021  team->t.t_display_affinity = 1;
1022  break;
1023  }
1024  }
1025  }
1026 
1027  KMP_MB();
1028 }
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035  if (__kmp_inherit_fp_control) {
1036  kmp_int16 x87_fpu_control_word;
1037  kmp_uint32 mxcsr;
1038 
1039  // Get primary thread's values of FPU control flags (both X87 and vector)
1040  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041  __kmp_store_mxcsr(&mxcsr);
1042  mxcsr &= KMP_X86_MXCSR_MASK;
1043 
1044  // There is no point looking at t_fp_control_saved here.
1045  // If it is TRUE, we still have to update the values if they are different
1046  // from those we now have. If it is FALSE we didn't save anything yet, but
1047  // our objective is the same. We have to ensure that the values in the team
1048  // are the same as those we have.
1049  // So, this code achieves what we need whether or not t_fp_control_saved is
1050  // true. By checking whether the value needs updating we avoid unnecessary
1051  // writes that would put the cache-line into a written state, causing all
1052  // threads in the team to have to read it again.
1053  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055  // Although we don't use this value, other code in the runtime wants to know
1056  // whether it should restore them. So we must ensure it is correct.
1057  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058  } else {
1059  // Similarly here. Don't write to this cache-line in the team structure
1060  // unless we have to.
1061  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062  }
1063 }
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069  // Only reset the fp control regs if they have been changed in
1070  // the parallel region that we are exiting.
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074  __kmp_store_mxcsr(&mxcsr);
1075  mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078  __kmp_clear_x87_fpu_status_word();
1079  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080  }
1081 
1082  if (team->t.t_mxcsr != mxcsr) {
1083  __kmp_load_mxcsr(&team->t.t_mxcsr);
1084  }
1085  }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
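/* Illustrative sketch (not part of this runtime): propagateFPControl combines two
   ideas -- read the primary thread's x87 control word and MXCSR, and store them
   into the team only when the values actually changed, so the shared cache line
   is not dirtied needlessly. The sketch below shows the same "compare before
   store" idiom for MXCSR using the SSE intrinsics _mm_getcsr/_mm_setcsr; the
   runtime's own wrappers and KMP_CHECK_UPDATE are not reproduced. */
#if 0
#include <xmmintrin.h> // _mm_getcsr / _mm_setcsr (x86 with SSE assumed)

struct example_team_fp_state {
  unsigned mxcsr; // shared copy read by every worker in the team
};

// Store only if the value differs, to avoid invalidating the shared line.
static void example_check_update(unsigned &dst, unsigned src) {
  if (dst != src)
    dst = src;
}

// Primary thread publishes its current MXCSR into the team state.
static void example_propagate_fp_control(example_team_fp_state &team) {
  example_check_update(team.mxcsr, _mm_getcsr());
}

// Workers restore the hardware register only when it actually differs.
static void example_update_hw_fp_control(const example_team_fp_state &team) {
  if (_mm_getcsr() != team.mxcsr)
    _mm_setcsr(team.mxcsr);
}
#endif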
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093  int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so it runs only in a team of
1096  the single primary thread. */
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098  kmp_info_t *this_thr;
1099  kmp_team_t *serial_team;
1100 
1101  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103  /* Skip all this code for autopar serialized loops since it results in
1104  unacceptable overhead */
1105  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106  return;
1107 
1108  if (!TCR_4(__kmp_init_parallel))
1109  __kmp_parallel_initialize();
1110  __kmp_resume_if_soft_paused();
1111 
1112  this_thr = __kmp_threads[global_tid];
1113  serial_team = this_thr->th.th_serial_team;
1114 
1115  /* utilize the serialized team held by this thread */
1116  KMP_DEBUG_ASSERT(serial_team);
1117  KMP_MB();
1118 
1119  if (__kmp_tasking_mode != tskm_immediate_exec) {
1120  KMP_DEBUG_ASSERT(
1121  this_thr->th.th_task_team ==
1122  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124  NULL);
1125  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126  "team %p, new task_team = NULL\n",
1127  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128  this_thr->th.th_task_team = NULL;
1129  }
1130 
1131  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133  proc_bind = proc_bind_false;
1134  } else if (proc_bind == proc_bind_default) {
1135  // No proc_bind clause was specified, so use the current value
1136  // of proc-bind-var for this parallel region.
1137  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138  }
1139  // Reset for next parallel region
1140  this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143  ompt_data_t ompt_parallel_data = ompt_data_none;
1144  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145  if (ompt_enabled.enabled &&
1146  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148  ompt_task_info_t *parent_task_info;
1149  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152  if (ompt_enabled.ompt_callback_parallel_begin) {
1153  int team_size = 1;
1154 
1155  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156  &(parent_task_info->task_data), &(parent_task_info->frame),
1157  &ompt_parallel_data, team_size,
1158  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159  }
1160  }
1161 #endif // OMPT_SUPPORT
1162 
1163  if (this_thr->th.th_team != serial_team) {
1164  // Nested level will be an index in the nested nthreads array
1165  int level = this_thr->th.th_team->t.t_level;
1166 
1167  if (serial_team->t.t_serialized) {
1168  /* this serial team was already used
1169  TODO: increase performance by making these locks more specific */
1170  kmp_team_t *new_team;
1171 
1172  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174  new_team =
1175  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177  ompt_parallel_data,
1178 #endif
1179  proc_bind, &this_thr->th.th_current_task->td_icvs,
1180  0 USE_NESTED_HOT_ARG(NULL));
1181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182  KMP_ASSERT(new_team);
1183 
1184  /* setup new serialized team and install it */
1185  new_team->t.t_threads[0] = this_thr;
1186  new_team->t.t_parent = this_thr->th.th_team;
1187  serial_team = new_team;
1188  this_thr->th.th_serial_team = serial_team;
1189 
1190  KF_TRACE(
1191  10,
1192  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193  global_tid, serial_team));
1194 
1195  /* TODO the above breaks the requirement that if we run out of resources,
1196  then we can still guarantee that serialized teams are ok, since we may
1197  need to allocate a new one */
1198  } else {
1199  KF_TRACE(
1200  10,
1201  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202  global_tid, serial_team));
1203  }
1204 
1205  /* we have to initialize this serial team */
1206  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209  serial_team->t.t_ident = loc;
1210  serial_team->t.t_serialized = 1;
1211  serial_team->t.t_nproc = 1;
1212  serial_team->t.t_parent = this_thr->th.th_team;
1213  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214  this_thr->th.th_team = serial_team;
1215  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218  this_thr->th.th_current_task));
1219  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220  this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225  implicit task for each serialized task represented by
1226  team->t.t_serialized? */
1227  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228  &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230  // Thread value exists in the nested nthreads array for the next nested
1231  // level
1232  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233  this_thr->th.th_current_task->td_icvs.nproc =
1234  __kmp_nested_nth.nth[level + 1];
1235  }
1236 
1237  if (__kmp_nested_proc_bind.used &&
1238  (level + 1 < __kmp_nested_proc_bind.used)) {
1239  this_thr->th.th_current_task->td_icvs.proc_bind =
1240  __kmp_nested_proc_bind.bind_types[level + 1];
1241  }
1242 
1243 #if USE_DEBUGGER
1244  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246  this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248  /* set thread cache values */
1249  this_thr->th.th_team_nproc = 1;
1250  this_thr->th.th_team_master = this_thr;
1251  this_thr->th.th_team_serialized = 1;
1252 
1253  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257  propagateFPControl(serial_team);
1258 
1259  /* check if we need to allocate dispatch buffers stack */
1260  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262  serial_team->t.t_dispatch->th_disp_buffer =
1263  (dispatch_private_info_t *)__kmp_allocate(
1264  sizeof(dispatch_private_info_t));
1265  }
1266  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268  KMP_MB();
1269 
1270  } else {
1271  /* this serialized team is already being used,
1272  * that's fine, just add another nested level */
1273  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276  ++serial_team->t.t_serialized;
1277  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279  // Nested level will be an index in the nested nthreads array
1280  int level = this_thr->th.th_team->t.t_level;
1281  // Thread value exists in the nested nthreads array for the next nested
1282  // level
1283  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284  this_thr->th.th_current_task->td_icvs.nproc =
1285  __kmp_nested_nth.nth[level + 1];
1286  }
1287  serial_team->t.t_level++;
1288  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289  "of serial team %p to %d\n",
1290  global_tid, serial_team, serial_team->t.t_level));
1291 
1292  /* allocate/push dispatch buffers stack */
1293  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294  {
1295  dispatch_private_info_t *disp_buffer =
1296  (dispatch_private_info_t *)__kmp_allocate(
1297  sizeof(dispatch_private_info_t));
1298  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300  }
1301  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303  KMP_MB();
1304  }
1305  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307  // Perform the display affinity functionality for
1308  // serialized parallel regions
1309  if (__kmp_display_affinity) {
1310  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311  this_thr->th.th_prev_num_threads != 1) {
1312  // NULL means use the affinity-format-var ICV
1313  __kmp_aux_display_affinity(global_tid, NULL);
1314  this_thr->th.th_prev_level = serial_team->t.t_level;
1315  this_thr->th.th_prev_num_threads = 1;
1316  }
1317  }
1318 
1319  if (__kmp_env_consistency_check)
1320  __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322  serial_team->t.ompt_team_info.master_return_address = codeptr;
1323  if (ompt_enabled.enabled &&
1324  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326  OMPT_GET_FRAME_ADDRESS(0);
1327 
1328  ompt_lw_taskteam_t lw_taskteam;
1329  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330  &ompt_parallel_data, codeptr);
1331 
1332  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333  // don't use lw_taskteam after linking. content was swapped
1334 
1335  /* OMPT implicit task begin */
1336  if (ompt_enabled.ompt_callback_implicit_task) {
1337  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342  __kmp_tid_from_gtid(global_tid);
1343  }
1344 
1345  /* OMPT state */
1346  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348  OMPT_GET_FRAME_ADDRESS(0);
1349  }
1350 #endif
1351 }
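/* Illustrative sketch (not part of this runtime): each extra serialized nesting
   level above pushes a freshly allocated dispatch buffer onto the head of an
   intrusive singly-linked list (disp_buffer->next points at the previous head).
   The push/pop pattern with hypothetical types: */
#if 0
#include <cstdlib>

struct example_disp_buffer {
  example_disp_buffer *next; // buffer of the enclosing nesting level
  /* per-level dispatch state would live here */
};

// Entering one more nesting level: a new buffer becomes the current head.
static void example_push_buffer(example_disp_buffer **head) {
  example_disp_buffer *buf =
      (example_disp_buffer *)std::calloc(1, sizeof(example_disp_buffer));
  if (!buf)
    return; // allocation failure: keep the current level's buffer
  buf->next = *head;
  *head = buf;
}

// Leaving the level: restore the enclosing level's buffer and free ours.
static void example_pop_buffer(example_disp_buffer **head) {
  example_disp_buffer *buf = *head;
  *head = buf->next;
  std::free(buf);
}
#endif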
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356  enum fork_context_e call_context, // Intel, GNU, ...
1357  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358  kmp_va_list ap) {
1359  void **argv;
1360  int i;
1361  int master_tid;
1362  int master_this_cons;
1363  kmp_team_t *team;
1364  kmp_team_t *parent_team;
1365  kmp_info_t *master_th;
1366  kmp_root_t *root;
1367  int nthreads;
1368  int master_active;
1369  int master_set_numthreads;
1370  int level;
1371  int active_level;
1372  int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374  kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376  { // KMP_TIME_BLOCK
1377  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382  /* Some systems prefer the stack for the root thread(s) to start with */
1383  /* some gap from the parent stack to prevent false sharing. */
1384  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385  /* These 2 lines below are so this does not get optimized out */
1386  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387  __kmp_stkpadding += (short)((kmp_int64)dummy);
1388  }
1389 
1390  /* initialize if needed */
1391  KMP_DEBUG_ASSERT(
1392  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393  if (!TCR_4(__kmp_init_parallel))
1394  __kmp_parallel_initialize();
1395  __kmp_resume_if_soft_paused();
1396 
1397  /* setup current data */
1398  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399  // shutdown
1400  parent_team = master_th->th.th_team;
1401  master_tid = master_th->th.th_info.ds.ds_tid;
1402  master_this_cons = master_th->th.th_local.this_construct;
1403  root = master_th->th.th_root;
1404  master_active = root->r.r_active;
1405  master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408  ompt_data_t ompt_parallel_data = ompt_data_none;
1409  ompt_data_t *parent_task_data;
1410  ompt_frame_t *ompt_frame;
1411  ompt_data_t *implicit_task_data;
1412  void *return_address = NULL;
1413 
1414  if (ompt_enabled.enabled) {
1415  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416  NULL, NULL);
1417  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418  }
1419 #endif
1420 
1421  // Assign affinity to root thread if it hasn't happened yet
1422  __kmp_assign_root_init_mask();
1423 
1424  // Nested level will be an index in the nested nthreads array
1425  level = parent_team->t.t_level;
1426  // used to launch non-serial teams even if nested is not allowed
1427  active_level = parent_team->t.t_active_level;
1428  // needed to check nesting inside the teams
1429  teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431  p_hot_teams = &master_th->th.th_hot_teams;
1432  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436  // it is either actual or not needed (when active_level > 0)
1437  (*p_hot_teams)[0].hot_team_nth = 1;
1438  }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442  if (ompt_enabled.enabled) {
1443  if (ompt_enabled.ompt_callback_parallel_begin) {
1444  int team_size = master_set_numthreads
1445  ? master_set_numthreads
1446  : get__nproc_2(parent_team, master_tid);
1447  int flags = OMPT_INVOKER(call_context) |
1448  ((microtask == (microtask_t)__kmp_teams_master)
1449  ? ompt_parallel_league
1450  : ompt_parallel_team);
1451  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453  return_address);
1454  }
1455  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456  }
1457 #endif
1458 
1459  master_th->th.th_ident = loc;
1460 
1461  if (master_th->th.th_teams_microtask && ap &&
1462  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463  // AC: This is start of parallel that is nested inside teams construct.
1464  // The team is actual (hot), all workers are ready at the fork barrier.
1465  // No lock needed to initialize the team a bit, then free workers.
1466  parent_team->t.t_ident = loc;
1467  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468  parent_team->t.t_argc = argc;
1469  argv = (void **)parent_team->t.t_argv;
1470  for (i = argc - 1; i >= 0; --i)
1471  *argv++ = va_arg(kmp_va_deref(ap), void *);
1472  // Increment our nested depth level, but do not increase the serialization
1473  if (parent_team == master_th->th.th_serial_team) {
1474  // AC: we are in serialized parallel
1475  __kmpc_serialized_parallel(loc, gtid);
1476  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478  if (call_context == fork_context_gnu) {
1479  // AC: need to decrement t_serialized for enquiry functions to work
1480  // correctly, will restore at join time
1481  parent_team->t.t_serialized--;
1482  return TRUE;
1483  }
1484 
1485 #if OMPD_SUPPORT
1486  parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490  void *dummy;
1491  void **exit_frame_p;
1492 
1493  ompt_lw_taskteam_t lw_taskteam;
1494 
1495  if (ompt_enabled.enabled) {
1496  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497  &ompt_parallel_data, return_address);
1498  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501  // don't use lw_taskteam after linking. content was swapped
1502 
1503  /* OMPT implicit task begin */
1504  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505  if (ompt_enabled.ompt_callback_implicit_task) {
1506  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507  __kmp_tid_from_gtid(gtid);
1508  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510  implicit_task_data, 1,
1511  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512  }
1513 
1514  /* OMPT state */
1515  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516  } else {
1517  exit_frame_p = &dummy;
1518  }
1519 #endif
1520  // AC: need to decrement t_serialized for enquiry functions to work
1521  // correctly, will restore at join time
1522  parent_team->t.t_serialized--;
1523 
1524  {
1525  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529  ,
1530  exit_frame_p
1531 #endif
1532  );
1533  }
1534 
1535 #if OMPT_SUPPORT
1536  if (ompt_enabled.enabled) {
1537  *exit_frame_p = NULL;
1538  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539  if (ompt_enabled.ompt_callback_implicit_task) {
1540  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541  ompt_scope_end, NULL, implicit_task_data, 1,
1542  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543  }
1544  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545  __ompt_lw_taskteam_unlink(master_th);
1546  if (ompt_enabled.ompt_callback_parallel_end) {
1547  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549  OMPT_INVOKER(call_context) | ompt_parallel_team,
1550  return_address);
1551  }
1552  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553  }
1554 #endif
1555  return TRUE;
1556  }
1557 
1558  parent_team->t.t_pkfn = microtask;
1559  parent_team->t.t_invoke = invoker;
1560  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561  parent_team->t.t_active_level++;
1562  parent_team->t.t_level++;
1563  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566  if (ompt_enabled.enabled) {
1567  ompt_lw_taskteam_t lw_taskteam;
1568  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569  &ompt_parallel_data, return_address);
1570  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571  }
1572 #endif
1573 
1574  /* Change number of threads in the team if requested */
1575  if (master_set_numthreads) { // The parallel has num_threads clause
1576  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577  // AC: we can only reduce the number of threads dynamically; we can't increase it
1578  kmp_info_t **other_threads = parent_team->t.t_threads;
1579  // NOTE: if using distributed barrier, we need to run this code block
1580  // even when the team size appears not to have changed from the max.
1581  int old_proc = master_th->th.th_teams_size.nth;
1582  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583  bp_dist_bar) {
1584  __kmp_resize_dist_barrier(parent_team, old_proc,
1585  master_set_numthreads);
1586  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587  }
1588  parent_team->t.t_nproc = master_set_numthreads;
1589  for (i = 0; i < master_set_numthreads; ++i) {
1590  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591  }
1592  }
1593  // Keep extra threads hot in the team for possible next parallels
1594  master_th->th.th_set_nproc = 0;
1595  }
1596 
1597 #if USE_DEBUGGER
1598  if (__kmp_debugging) { // Let debugger override number of threads.
1599  int nth = __kmp_omp_num_threads(loc);
1600  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601  master_set_numthreads = nth;
1602  }
1603  }
1604 #endif
1605 
1606  // Figure out the proc_bind policy for the nested parallel within teams
1607  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608  // proc_bind_default means don't update
1609  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611  proc_bind = proc_bind_false;
1612  } else {
1613  // No proc_bind clause specified; use current proc-bind-var
1614  if (proc_bind == proc_bind_default) {
1615  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616  }
1617  /* else: The proc_bind policy was specified explicitly on parallel
1618  clause.
1619  This overrides proc-bind-var for this parallel region, but does not
1620  change proc-bind-var. */
1621  // Figure the value of proc-bind-var for the child threads.
1622  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624  master_th->th.th_current_task->td_icvs.proc_bind)) {
1625  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626  }
1627  }
1628  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629  // Need to change the bind-var ICV to correct value for each implicit task
1630  if (proc_bind_icv != proc_bind_default &&
1631  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632  kmp_info_t **other_threads = parent_team->t.t_threads;
1633  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634  other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635  proc_bind_icv;
1636  }
1637  }
1638  // Reset for next parallel region
1639  master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643  KMP_ITT_DEBUG) &&
1644  __kmp_forkjoin_frames_mode == 3 &&
1645  parent_team->t.t_active_level == 1 // only report frames at level 1
1646  && master_th->th.th_teams_size.nteams == 1) {
1647  kmp_uint64 tmp_time = __itt_get_timestamp();
1648  master_th->th.th_frame_time = tmp_time;
1649  parent_team->t.t_region_time = tmp_time;
1650  }
1651  if (__itt_stack_caller_create_ptr) {
1652  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653  // create new stack stitching id before entering fork barrier
1654  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655  }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658  __kmp_partition_places(parent_team);
1659 #endif
1660 
1661  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662  "master_th=%p, gtid=%d\n",
1663  root, parent_team, master_th, gtid));
1664  __kmp_internal_fork(loc, gtid, parent_team);
1665  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666  "master_th=%p, gtid=%d\n",
1667  root, parent_team, master_th, gtid));
1668 
1669  if (call_context == fork_context_gnu)
1670  return TRUE;
1671 
1672  /* Invoke microtask for PRIMARY thread */
1673  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674  parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676  if (!parent_team->t.t_invoke(gtid)) {
1677  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678  }
1679  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680  parent_team->t.t_id, parent_team->t.t_pkfn));
1681  KMP_MB(); /* Flush all pending memory write invalidates. */
1682 
1683  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685  return TRUE;
1686  } // Parallel closely nested in teams construct
1687 
1688 #if KMP_DEBUG
1689  if (__kmp_tasking_mode != tskm_immediate_exec) {
1690  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691  parent_team->t.t_task_team[master_th->th.th_task_state]);
1692  }
1693 #endif
1694 
1695  // Need this to happen before we determine the number of threads, not while
1696  // we are allocating the team
1697  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698  int enter_teams = 0;
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703  enter_teams = ((ap == NULL && active_level == 0) ||
1704  (ap && teams_level > 0 && teams_level == level));
1705  nthreads = master_set_numthreads
1706  ? master_set_numthreads
1707  // TODO: get nproc directly from current task
1708  : get__nproc_2(parent_team, master_tid);
1709  // Check whether we need to take the forkjoin lock (no need for a serialized
1710  // parallel outside of a teams construct). This code was moved here from
1711  // __kmp_reserve_threads() to speed up nested serialized parallels.
1712  if (nthreads > 1) {
1713  if ((get__max_active_levels(master_th) == 1 &&
1714  (root->r.r_in_parallel && !enter_teams)) ||
1715  (__kmp_library == library_serial)) {
1716  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717  " threads\n",
1718  gtid, nthreads));
1719  nthreads = 1;
1720  }
1721  }
1722  if (nthreads > 1) {
1723  /* determine how many new threads we can use */
1724  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725  /* AC: If we execute teams from a parallel region (on the host), then the
1726  teams should be created, but each can have only 1 thread if nesting is
1727  disabled. If teams is called from a serial region, then the teams and
1728  their threads should be created regardless of the nesting setting. */
1729  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730  nthreads, enter_teams);
1731  if (nthreads == 1) {
1732  // Free lock for single thread execution here; for multi-thread
1733  // execution it will be freed later after team of threads created
1734  // and initialized
1735  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736  }
1737  }
1738  }
1739  KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741  // If we temporarily changed the set number of threads then restore it now
1742  master_th->th.th_set_nproc = 0;
1743 
1744  /* create a serialized parallel region? */
1745  if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
1747 #if KMP_OS_LINUX && \
1748  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749  void *args[argc];
1750 #else
1751  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753  KMP_ARCH_AARCH64) */
1754 
1755  KA_TRACE(20,
1756  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758  __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761  master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764  if (call_context == fork_context_intel) {
1765  /* TODO this sucks, use the compiler itself to pass args! :) */
1766  master_th->th.th_serial_team->t.t_ident = loc;
1767  if (!ap) {
1768  // revert change made in __kmpc_serialized_parallel()
1769  master_th->th.th_serial_team->t.t_level--;
1770  // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773  void *dummy;
1774  void **exit_frame_p;
1775  ompt_task_info_t *task_info;
1776 
1777  ompt_lw_taskteam_t lw_taskteam;
1778 
1779  if (ompt_enabled.enabled) {
1780  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781  &ompt_parallel_data, return_address);
1782 
1783  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784  // don't use lw_taskteam after linking; its content was swapped
1785 
1786  task_info = OMPT_CUR_TASK_INFO(master_th);
1787  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788  if (ompt_enabled.ompt_callback_implicit_task) {
1789  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790  __kmp_tid_from_gtid(gtid);
1791  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793  &(task_info->task_data), 1,
1794  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795  ompt_task_implicit);
1796  }
1797 
1798  /* OMPT state */
1799  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800  } else {
1801  exit_frame_p = &dummy;
1802  }
1803 #endif
1804 
1805  {
1806  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809  parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811  ,
1812  exit_frame_p
1813 #endif
1814  );
1815  }
1816 
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  *exit_frame_p = NULL;
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 1,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824  ompt_task_implicit);
1825  }
1826  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827  __ompt_lw_taskteam_unlink(master_th);
1828  if (ompt_enabled.ompt_callback_parallel_end) {
1829  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830  &ompt_parallel_data, parent_task_data,
1831  OMPT_INVOKER(call_context) | ompt_parallel_team,
1832  return_address);
1833  }
1834  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835  }
1836 #endif
1837  } else if (microtask == (microtask_t)__kmp_teams_master) {
1838  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839  master_th->th.th_serial_team);
1840  team = master_th->th.th_team;
1841  // team->t.t_pkfn = microtask;
1842  team->t.t_invoke = invoker;
1843  __kmp_alloc_argv_entries(argc, team, TRUE);
1844  team->t.t_argc = argc;
1845  argv = (void **)team->t.t_argv;
1846  if (ap) {
1847  for (i = argc - 1; i >= 0; --i)
1848  *argv++ = va_arg(kmp_va_deref(ap), void *);
1849  } else {
1850  for (i = 0; i < argc; ++i)
1851  // Get args from parent team for teams construct
1852  argv[i] = parent_team->t.t_argv[i];
1853  }
1854  // AC: revert change made in __kmpc_serialized_parallel()
1855  // because initial code in teams should have level=0
1856  team->t.t_level--;
1857  // AC: call special invoker for outer "parallel" of teams construct
1858  invoker(gtid);
1859 #if OMPT_SUPPORT
1860  if (ompt_enabled.enabled) {
1861  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862  if (ompt_enabled.ompt_callback_implicit_task) {
1863  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864  ompt_scope_end, NULL, &(task_info->task_data), 0,
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866  }
1867  if (ompt_enabled.ompt_callback_parallel_end) {
1868  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869  &ompt_parallel_data, parent_task_data,
1870  OMPT_INVOKER(call_context) | ompt_parallel_league,
1871  return_address);
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876  } else {
1877  argv = args;
1878  for (i = argc - 1; i >= 0; --i)
1879  *argv++ = va_arg(kmp_va_deref(ap), void *);
1880  KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883  void *dummy;
1884  void **exit_frame_p;
1885  ompt_task_info_t *task_info;
1886 
1887  ompt_lw_taskteam_t lw_taskteam;
1888 
1889  if (ompt_enabled.enabled) {
1890  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891  &ompt_parallel_data, return_address);
1892  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893  // don't use lw_taskteam after linking; its content was swapped
1894  task_info = OMPT_CUR_TASK_INFO(master_th);
1895  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897  /* OMPT implicit task begin */
1898  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899  if (ompt_enabled.ompt_callback_implicit_task) {
1900  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903  ompt_task_implicit);
1904  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905  __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_frame_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_frame_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_frame_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933  ompt_task_implicit);
1934  }
1935 
1936  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937  __ompt_lw_taskteam_unlink(master_th);
1938  if (ompt_enabled.ompt_callback_parallel_end) {
1939  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940  &ompt_parallel_data, parent_task_data,
1941  OMPT_INVOKER(call_context) | ompt_parallel_team,
1942  return_address);
1943  }
1944  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945  }
1946 #endif
1947  }
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking; its content was swapped
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the case when not serialized;
1973  // the serialized case is handled in __kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984  if (!master_th->th.th_teams_microtask || level > teams_level) {
1985  /* Increment our nested depth level */
1986  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987  }
1988 
1989  // See if we need to make a copy of the ICVs.
1990  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991  if ((level + 1 < __kmp_nested_nth.used) &&
1992  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994  } else {
1995  nthreads_icv = 0; // don't update
1996  }
1997 
1998  // Figure out the proc_bind_policy for the new team.
1999  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000  // proc_bind_default means don't update
2001  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003  proc_bind = proc_bind_false;
2004  } else {
2005  // No proc_bind clause specified; use current proc-bind-var for this
2006  // parallel region
2007  if (proc_bind == proc_bind_default) {
2008  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009  }
2010  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011  if (master_th->th.th_teams_microtask &&
2012  microtask == (microtask_t)__kmp_teams_master) {
2013  proc_bind = __kmp_teams_proc_bind;
2014  }
2015  /* else: The proc_bind policy was specified explicitly on parallel clause.
2016  This overrides proc-bind-var for this parallel region, but does not
2017  change proc-bind-var. */
2018  // Figure the value of proc-bind-var for the child threads.
2019  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021  master_th->th.th_current_task->td_icvs.proc_bind)) {
2022  // Do not modify the proc-bind ICV for the two teams construct forks;
2023  // they just let the proc-bind ICV pass through
2024  if (!master_th->th.th_teams_microtask ||
2025  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027  }
2028  }
2029 
2030  // Reset for next parallel region
2031  master_th->th.th_set_proc_bind = proc_bind_default;
2032 
2033  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040  if (proc_bind_icv != proc_bind_default) {
2041  new_icvs.proc_bind = proc_bind_icv;
2042  }
2043 
2044  /* allocate a new parallel team */
2045  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046  team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048  ompt_parallel_data,
2049 #endif
2050  proc_bind, &new_icvs,
2051  argc USE_NESTED_HOT_ARG(master_th));
2052  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054  } else {
2055  /* allocate a new parallel team */
2056  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057  team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059  ompt_parallel_data,
2060 #endif
2061  proc_bind,
2062  &master_th->th.th_current_task->td_icvs,
2063  argc USE_NESTED_HOT_ARG(master_th));
2064  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066  &master_th->th.th_current_task->td_icvs);
2067  }
2068  KF_TRACE(
2069  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071  /* setup the new team */
2072  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079  return_address);
2080 #endif
2081  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082  // TODO: parent_team->t.t_level == INT_MAX ???
2083  if (!master_th->th.th_teams_microtask || level > teams_level) {
2084  int new_level = parent_team->t.t_level + 1;
2085  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086  new_level = parent_team->t.t_active_level + 1;
2087  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088  } else {
2089  // AC: Do not increase parallel level at start of the teams construct
2090  int new_level = parent_team->t.t_level;
2091  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092  new_level = parent_team->t.t_active_level;
2093  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094  }
2095  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096  // set primary thread's schedule as new run-time schedule
2097  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102  // Update the floating point rounding in the team if required.
2103  propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105  if (ompd_state & OMPD_ENABLE_BP)
2106  ompd_bp_parallel_begin();
2107 #endif
2108 
2109  if (__kmp_tasking_mode != tskm_immediate_exec) {
2110  // Set the primary thread's task team to the team's task team. Unless this
2111  // is the hot team, it should be NULL.
2112  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113  parent_team->t.t_task_team[master_th->th.th_task_state]);
2114  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115  "%p, new task_team %p / team %p\n",
2116  __kmp_gtid_from_thread(master_th),
2117  master_th->th.th_task_team, parent_team,
2118  team->t.t_task_team[master_th->th.th_task_state], team));
2119 
2120  if (active_level || master_th->th.th_task_team) {
2121  // Take a memo of primary thread's task_state
2122  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123  if (master_th->th.th_task_state_top >=
2124  master_th->th.th_task_state_stack_sz) { // increase size
2125  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126  kmp_uint8 *old_stack, *new_stack;
2127  kmp_uint32 i;
2128  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131  }
2132  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133  ++i) { // zero-init rest of stack
2134  new_stack[i] = 0;
2135  }
2136  old_stack = master_th->th.th_task_state_memo_stack;
2137  master_th->th.th_task_state_memo_stack = new_stack;
2138  master_th->th.th_task_state_stack_sz = new_size;
2139  __kmp_free(old_stack);
2140  }
2141  // Store primary thread's task_state on stack
2142  master_th->th
2143  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144  master_th->th.th_task_state;
2145  master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147  if (master_th->th.th_hot_teams &&
2148  active_level < __kmp_hot_teams_max_level &&
2149  team == master_th->th.th_hot_teams[active_level].hot_team) {
2150  // Restore primary thread's nested state if nested hot team
2151  master_th->th.th_task_state =
2152  master_th->th
2153  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154  } else {
2155 #endif
2156  master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158  }
2159 #endif
2160  }
2161 #if !KMP_NESTED_HOT_TEAMS
2162  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163  (team == root->r.r_hot_team));
2164 #endif
2165  }
2166 
2167  KA_TRACE(
2168  20,
2169  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171  team->t.t_nproc));
2172  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173  (team->t.t_master_tid == 0 &&
2174  (team->t.t_parent == root->r.r_root_team ||
2175  team->t.t_parent->t.t_serialized)));
2176  KMP_MB();
2177 
2178  /* now, setup the arguments */
2179  argv = (void **)team->t.t_argv;
2180  if (ap) {
2181  for (i = argc - 1; i >= 0; --i) {
2182  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183  KMP_CHECK_UPDATE(*argv, new_argv);
2184  argv++;
2185  }
2186  } else {
2187  for (i = 0; i < argc; ++i) {
2188  // Get args from parent team for teams construct
2189  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190  }
2191  }
2192 
2193  /* now actually fork the threads */
2194  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196  root->r.r_active = TRUE;
2197 
2198  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199  __kmp_setup_icv_copy(team, nthreads,
2200  &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209  if (team->t.t_active_level == 1 // only report frames at level 1
2210  && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213  (__kmp_forkjoin_frames_mode == 3 ||
2214  __kmp_forkjoin_frames_mode == 1)) {
2215  kmp_uint64 tmp_time = 0;
2216  if (__itt_get_timestamp_ptr)
2217  tmp_time = __itt_get_timestamp();
2218  // Internal fork - report frame begin
2219  master_th->th.th_frame_time = tmp_time;
2220  if (__kmp_forkjoin_frames_mode == 3)
2221  team->t.t_region_time = tmp_time;
2222  } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229  }
2230  }
2231 #endif /* USE_ITT_BUILD */
2232 
2233  /* now go on and do the work */
2234  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235  KMP_MB();
2236  KF_TRACE(10,
2237  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241  if (__itt_stack_caller_create_ptr) {
2242  // create new stack stitching id before entering fork barrier
2243  if (!enter_teams) {
2244  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246  } else if (parent_team->t.t_serialized) {
2247  // keep stack stitching id in the serialized parent_team;
2248  // current team will be used for parallel inside the teams;
2249  // if parent_team is active, then it already keeps stack stitching id
2250  // for the league of teams
2251  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253  }
2254  }
2255 #endif /* USE_ITT_BUILD */
2256 
2257  // AC: skip __kmp_internal_fork at teams construct, let only primary
2258  // threads execute
2259  if (ap) {
2260  __kmp_internal_fork(loc, gtid, team);
2261  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262  "master_th=%p, gtid=%d\n",
2263  root, team, master_th, gtid));
2264  }
2265 
2266  if (call_context == fork_context_gnu) {
2267  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268  return TRUE;
2269  }
2270 
2271  /* Invoke microtask for PRIMARY thread */
2272  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273  team->t.t_id, team->t.t_pkfn));
2274  } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277  // If beginning a teams construct, then change thread state
2278  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279  if (!ap) {
2280  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281  }
2282 #endif
2283 
2284  if (!team->t.t_invoke(gtid)) {
2285  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286  }
2287 
2288 #if KMP_STATS_ENABLED
2289  // If this was the beginning of a teams construct, then reset the thread state
2290  if (!ap) {
2291  KMP_SET_THREAD_STATE(previous_state);
2292  }
2293 #endif
2294 
2295  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296  team->t.t_id, team->t.t_pkfn));
2297  KMP_MB(); /* Flush all pending memory write invalidates. */
2298 
2299  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301  if (ompt_enabled.enabled) {
2302  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303  }
2304 #endif
2305 
2306  return TRUE;
2307 }
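/* Usage sketch (illustrative): user code does not call __kmp_fork_call
   directly. A compiler-outlined parallel region is typically routed through a
   compiler-facing entry point such as __kmpc_fork_call, which passes the
   outlined microtask and its shared arguments to this routine. The user-level
   code that ultimately drives this path looks like:

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       int x = 42; // shared variable forwarded to the microtask as an argument
       #pragma omp parallel num_threads(4) // num_threads feeds master_set_numthreads
       {
         printf("thread %d of %d sees x=%d\n", omp_get_thread_num(),
                omp_get_num_threads(), x);
       }
       return 0; // the matching join is handled by __kmp_join_call below
     }
*/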
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311  kmp_team_t *team) {
2312  // restore state outside the region
2313  thread->th.ompt_thread_info.state =
2314  ((team->t.t_serialized) ? ompt_state_work_serial
2315  : ompt_state_work_parallel);
2316 }
2317 
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319  kmp_team_t *team, ompt_data_t *parallel_data,
2320  int flags, void *codeptr) {
2321  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322  if (ompt_enabled.ompt_callback_parallel_end) {
2323  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324  parallel_data, &(task_info->task_data), flags, codeptr);
2325  }
2326 
2327  task_info->frame.enter_frame = ompt_data_none;
2328  __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
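/* __kmp_join_call is the join-side counterpart of __kmp_fork_call above. For a
   serialized team it defers to __kmpc_end_serialized_parallel; otherwise it
   waits at the join barrier, emits the OMPT/ITT end-of-region events, frees or
   shrinks the team, and restores the primary thread's state from the parent
   team. A non-zero exit_teams argument marks the join at the end of a teams
   construct, where the internal join barrier is skipped. */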
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334  ,
2335  enum fork_context_e fork_context
2336 #endif
2337  ,
2338  int exit_teams) {
2339  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340  kmp_team_t *team;
2341  kmp_team_t *parent_team;
2342  kmp_info_t *master_th;
2343  kmp_root_t *root;
2344  int master_active;
2345 
2346  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348  /* setup current data */
2349  master_th = __kmp_threads[gtid];
2350  root = master_th->th.th_root;
2351  team = master_th->th.th_team;
2352  parent_team = team->t.t_parent;
2353 
2354  master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357  void *team_microtask = (void *)team->t.t_pkfn;
2358  // For the GOMP interface with a serialized parallel, we need
2359  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2360  // end-implicit-task and end-parallel events.
2361  if (ompt_enabled.enabled &&
2362  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364  }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370  "th_task_team = %p\n",
2371  __kmp_gtid_from_thread(master_th), team,
2372  team->t.t_task_team[master_th->th.th_task_state],
2373  master_th->th.th_task_team));
2374  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375  team->t.t_task_team[master_th->th.th_task_state]);
2376  }
2377 #endif
2378 
2379  if (team->t.t_serialized) {
2380  if (master_th->th.th_teams_microtask) {
2381  // We are in teams construct
2382  int level = team->t.t_level;
2383  int tlevel = master_th->th.th_teams_level;
2384  if (level == tlevel) {
2385  // AC: we haven't incremented it earlier at start of teams construct,
2386  // so do it here - at the end of teams construct
2387  team->t.t_level++;
2388  } else if (level == tlevel + 1) {
2389  // AC: we are exiting parallel inside teams, need to increment
2390  // serialization in order to restore it in the next call to
2391  // __kmpc_end_serialized_parallel
2392  team->t.t_serialized++;
2393  }
2394  }
2395  __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398  if (ompt_enabled.enabled) {
2399  __kmp_join_restore_state(master_th, parent_team);
2400  }
2401 #endif
2402 
2403  return;
2404  }
2405 
2406  master_active = team->t.t_master_active;
2407 
2408  if (!exit_teams) {
2409  // AC: No barrier for internal teams at exit from teams construct.
2410  // But there is barrier for external team (league).
2411  __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413  if (__itt_stack_caller_create_ptr) {
2414  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415  // destroy the stack stitching id after join barrier
2416  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417  team->t.t_stack_id = NULL;
2418  }
2419 #endif
2420  } else {
2421  master_th->th.th_task_state =
2422  0; // AC: no tasking in teams (out of any parallel)
2423 #if USE_ITT_BUILD
2424  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426  // destroy the stack stitching id on exit from the teams construct
2427  // if parent_team is active, then the id will be destroyed later on
2428  // by master of the league of teams
2429  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430  parent_team->t.t_stack_id = NULL;
2431  }
2432 #endif
2433 
2434  if (team->t.t_nproc > 1 &&
2435  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436  team->t.b->update_num_threads(team->t.t_nproc);
2437  __kmp_add_threads_to_team(team, team->t.t_nproc);
2438  }
2439  }
2440 
2441  KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445  void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450  if (team->t.t_active_level == 1 &&
2451  (!master_th->th.th_teams_microtask || /* not in teams construct */
2452  master_th->th.th_teams_size.nteams == 1)) {
2453  master_th->th.th_ident = loc;
2454  // only one notification scheme (either "submit" or "forking/joined", not
2455  // both)
2456  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457  __kmp_forkjoin_frames_mode == 3)
2458  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459  master_th->th.th_frame_time, 0, loc,
2460  master_th->th.th_team_nproc, 1);
2461  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463  __kmp_itt_region_joined(gtid);
2464  } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468  if (!exit_teams) {
2469  // Restore master thread's partition.
2470  master_th->th.th_first_place = team->t.t_first_place;
2471  master_th->th.th_last_place = team->t.t_last_place;
2472  }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475  if (master_th->th.th_teams_microtask && !exit_teams &&
2476  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477  team->t.t_level == master_th->th.th_teams_level + 1) {
2478 // AC: We need to leave the team structure intact at the end of parallel
2479 // inside the teams construct, so that at the next parallel same (hot) team
2480 // works, only adjust nesting levels
2481 #if OMPT_SUPPORT
2482  ompt_data_t ompt_parallel_data = ompt_data_none;
2483  if (ompt_enabled.enabled) {
2484  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485  if (ompt_enabled.ompt_callback_implicit_task) {
2486  int ompt_team_size = team->t.t_nproc;
2487  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490  }
2491  task_info->frame.exit_frame = ompt_data_none;
2492  task_info->task_data = ompt_data_none;
2493  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494  __ompt_lw_taskteam_unlink(master_th);
2495  }
2496 #endif
2497  /* Decrement our nested depth level */
2498  team->t.t_level--;
2499  team->t.t_active_level--;
2500  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502  // Restore number of threads in the team if needed. This code relies on
2503  // the proper adjustment of th_teams_size.nth after the fork in
2504  // __kmp_teams_master on each teams primary thread in the case that
2505  // __kmp_reserve_threads reduced it.
2506  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507  int old_num = master_th->th.th_team_nproc;
2508  int new_num = master_th->th.th_teams_size.nth;
2509  kmp_info_t **other_threads = team->t.t_threads;
2510  team->t.t_nproc = new_num;
2511  for (int i = 0; i < old_num; ++i) {
2512  other_threads[i]->th.th_team_nproc = new_num;
2513  }
2514  // Adjust states of non-used threads of the team
2515  for (int i = old_num; i < new_num; ++i) {
2516  // Re-initialize thread's barrier data.
2517  KMP_DEBUG_ASSERT(other_threads[i]);
2518  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519  for (int b = 0; b < bs_last_barrier; ++b) {
2520  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525  }
2526  if (__kmp_tasking_mode != tskm_immediate_exec) {
2527  // Synchronize thread's task state
2528  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529  }
2530  }
2531  }
2532 
2533 #if OMPT_SUPPORT
2534  if (ompt_enabled.enabled) {
2535  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537  }
2538 #endif
2539 
2540  return;
2541  }
2542 
2543  /* do cleanup and restore the parent team */
2544  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
2547  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549  /* jc: The following lock has instructions with REL and ACQ semantics,
2550  separating the parallel user code called in this parallel region
2551  from the serial user code called after this function returns. */
2552  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554  if (!master_th->th.th_teams_microtask ||
2555  team->t.t_level > master_th->th.th_teams_level) {
2556  /* Decrement our nested depth level */
2557  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558  }
2559  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562  if (ompt_enabled.enabled) {
2563  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564  if (ompt_enabled.ompt_callback_implicit_task) {
2565  int flags = (team_microtask == (void *)__kmp_teams_master)
2566  ? ompt_task_initial
2567  : ompt_task_implicit;
2568  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572  }
2573  task_info->frame.exit_frame = ompt_data_none;
2574  task_info->task_data = ompt_data_none;
2575  }
2576 #endif
2577 
2578  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579  master_th, team));
2580  __kmp_pop_current_task_from_thread(master_th);
2581 
2582  master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585  if (ompd_state & OMPD_ENABLE_BP)
2586  ompd_bp_parallel_end();
2587 #endif
2588  updateHWFPControl(team);
2589 
2590  if (root->r.r_active != master_active)
2591  root->r.r_active = master_active;
2592 
2593  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594  master_th)); // this will free worker threads
2595 
2596  /* This race was fun to find. Make sure the following is in the critical
2597  region; otherwise assertions may fail occasionally, since the old team may be
2598  reallocated and the hierarchy appears inconsistent. It is actually safe to
2599  run and won't cause any bugs, but it will cause those assertion failures. It's
2600  only one deref & assign, so we might as well put it in the critical region. */
2601  master_th->th.th_team = parent_team;
2602  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603  master_th->th.th_team_master = parent_team->t.t_threads[0];
2604  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606  /* restore serialized team, if need be */
2607  if (parent_team->t.t_serialized &&
2608  parent_team != master_th->th.th_serial_team &&
2609  parent_team != root->r.r_root_team) {
2610  __kmp_free_team(root,
2611  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612  master_th->th.th_serial_team = parent_team;
2613  }
2614 
2615  if (__kmp_tasking_mode != tskm_immediate_exec) {
2616  if (master_th->th.th_task_state_top >
2617  0) { // Restore task state from memo stack
2618  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619  // Remember primary thread's state if we re-use this nested hot team
2620  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621  master_th->th.th_task_state;
2622  --master_th->th.th_task_state_top; // pop
2623  // Now restore state at this level
2624  master_th->th.th_task_state =
2625  master_th->th
2626  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627  }
2628  // Copy the task team from the parent team to the primary thread
2629  master_th->th.th_task_team =
2630  parent_team->t.t_task_team[master_th->th.th_task_state];
2631  KA_TRACE(20,
2632  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634  parent_team));
2635  }
2636 
2637  // TODO: GEH - cannot do this assertion because root thread not set up as
2638  // executing
2639  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640  master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if KMP_AFFINITY_SUPPORTED
2645  if (master_th->th.th_team->t.t_level == 0 && __kmp_affin_reset) {
2646  __kmp_reset_root_init_mask(gtid);
2647  }
2648 #endif
2649 #if OMPT_SUPPORT
2650  int flags =
2651  OMPT_INVOKER(fork_context) |
2652  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2653  : ompt_parallel_team);
2654  if (ompt_enabled.enabled) {
2655  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2656  codeptr);
2657  }
2658 #endif
2659 
2660  KMP_MB();
2661  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2662 }
2663 
2664 /* Check whether we should push an internal control record onto the
2665  serial team stack. If so, do it. */
2666 void __kmp_save_internal_controls(kmp_info_t *thread) {
2667 
2668  if (thread->th.th_team != thread->th.th_serial_team) {
2669  return;
2670  }
2671  if (thread->th.th_team->t.t_serialized > 1) {
2672  int push = 0;
2673 
2674  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2675  push = 1;
2676  } else {
2677  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2678  thread->th.th_team->t.t_serialized) {
2679  push = 1;
2680  }
2681  }
2682  if (push) { /* push a record on the serial team's stack */
2683  kmp_internal_control_t *control =
2684  (kmp_internal_control_t *)__kmp_allocate(
2685  sizeof(kmp_internal_control_t));
2686 
2687  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2688 
2689  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2690 
2691  control->next = thread->th.th_team->t.t_control_stack_top;
2692  thread->th.th_team->t.t_control_stack_top = control;
2693  }
2694  }
2695 }
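/* Illustrative sketch: the ICV setters below (__kmp_set_num_threads,
   __kmp_set_max_active_levels, __kmp_set_schedule, ...) call
   __kmp_save_internal_controls before modifying the current task's ICVs, so
   that values changed inside a nested serialized parallel can be restored when
   the serialized region ends. The user-visible scoping this supports is:

     #include <omp.h>

     void icv_scoping_example(void) {
       omp_set_num_threads(8); // outer value of the nthreads-var ICV
       #pragma omp parallel num_threads(1) // serialized inner region
       {
         omp_set_num_threads(2); // affects only this region's data environment
       }
       // after the region ends, the outer setting (8) is in effect again
     }
*/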
2696 
2697 /* Changes set_nproc */
2698 void __kmp_set_num_threads(int new_nth, int gtid) {
2699  kmp_info_t *thread;
2700  kmp_root_t *root;
2701 
2702  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2703  KMP_DEBUG_ASSERT(__kmp_init_serial);
2704 
2705  if (new_nth < 1)
2706  new_nth = 1;
2707  else if (new_nth > __kmp_max_nth)
2708  new_nth = __kmp_max_nth;
2709 
2710  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2711  thread = __kmp_threads[gtid];
2712  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2713  return; // nothing to do
2714 
2715  __kmp_save_internal_controls(thread);
2716 
2717  set__nproc(thread, new_nth);
2718 
2719  // If this omp_set_num_threads() call will cause the hot team size to be
2720  // reduced (in the absence of a num_threads clause), then reduce it now,
2721  // rather than waiting for the next parallel region.
2722  root = thread->th.th_root;
2723  if (__kmp_init_parallel && (!root->r.r_active) &&
2724  (root->r.r_hot_team->t.t_nproc > new_nth)
2725 #if KMP_NESTED_HOT_TEAMS
2726  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2727 #endif
2728  ) {
2729  kmp_team_t *hot_team = root->r.r_hot_team;
2730  int f;
2731 
2732  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2733 
2734  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2735  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2736  }
2737  // Release the extra threads we don't need any more.
2738  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2739  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2740  if (__kmp_tasking_mode != tskm_immediate_exec) {
2741  // When decreasing the team size, threads no longer in the team should
2742  // unreference the task team.
2743  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2744  }
2745  __kmp_free_thread(hot_team->t.t_threads[f]);
2746  hot_team->t.t_threads[f] = NULL;
2747  }
2748  hot_team->t.t_nproc = new_nth;
2749 #if KMP_NESTED_HOT_TEAMS
2750  if (thread->th.th_hot_teams) {
2751  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2752  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2753  }
2754 #endif
2755 
2756  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2757  hot_team->t.b->update_num_threads(new_nth);
2758  __kmp_add_threads_to_team(hot_team, new_nth);
2759  }
2760 
2761  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2762 
2763  // Update the t_nproc field in the threads that are still active.
2764  for (f = 0; f < new_nth; f++) {
2765  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2766  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2767  }
2768  // Special flag to indicate the size change came from an omp_set_num_threads() call
2769  hot_team->t.t_size_changed = -1;
2770  }
2771 }
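/* Usage sketch (illustrative): __kmp_set_num_threads backs the user-level
   omp_set_num_threads entry point. Note the hot-team handling above: when the
   root is not active and the new value is smaller than the current hot team,
   the surplus threads are released immediately instead of at the next parallel
   region.

     #include <omp.h>

     void shrink_example(void) {
       #pragma omp parallel
       { } // hot team created with the default size
       omp_set_num_threads(2); // may shrink the hot team right away
       #pragma omp parallel
       { } // subsequent region starts with at most 2 threads
     }
*/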
2772 
2773 /* Changes max_active_levels */
2774 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2775  kmp_info_t *thread;
2776 
2777  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2778  "%d = (%d)\n",
2779  gtid, max_active_levels));
2780  KMP_DEBUG_ASSERT(__kmp_init_serial);
2781 
2782  // validate max_active_levels
2783  if (max_active_levels < 0) {
2784  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2785  // We ignore this call if the user has specified a negative value.
2786  // The current setting won't be changed. The last valid setting will be
2787  // used. A warning will be issued (if warnings are allowed as controlled by
2788  // the KMP_WARNINGS env var).
2789  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2790  "max_active_levels for thread %d = (%d)\n",
2791  gtid, max_active_levels));
2792  return;
2793  }
2794  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2795  // it's OK, the max_active_levels is within the valid range: [ 0;
2796  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2797  // We allow a zero value. (implementation defined behavior)
2798  } else {
2799  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2800  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2801  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2802  // Current upper limit is MAX_INT. (implementation defined behavior)
2803  // If the input exceeds the upper limit, we correct the input to be the
2804  // upper limit. (implementation defined behavior)
2805  // Actually, control should never get here while the limit is MAX_INT.
2806  }
2807  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2808  "max_active_levels for thread %d = (%d)\n",
2809  gtid, max_active_levels));
2810 
2811  thread = __kmp_threads[gtid];
2812 
2813  __kmp_save_internal_controls(thread);
2814 
2815  set__max_active_levels(thread, max_active_levels);
2816 }
2817 
2818 /* Gets max_active_levels */
2819 int __kmp_get_max_active_levels(int gtid) {
2820  kmp_info_t *thread;
2821 
2822  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2823  KMP_DEBUG_ASSERT(__kmp_init_serial);
2824 
2825  thread = __kmp_threads[gtid];
2826  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2827  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2828  "curtask_maxaclevel=%d\n",
2829  gtid, thread->th.th_current_task,
2830  thread->th.th_current_task->td_icvs.max_active_levels));
2831  return thread->th.th_current_task->td_icvs.max_active_levels;
2832 }
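/* Usage sketch (illustrative): these two routines back omp_set_max_active_levels
   and omp_get_max_active_levels. Negative values are ignored with a warning and
   values above KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped, as handled above.

     #include <omp.h>
     #include <stdio.h>

     void nesting_example(void) {
       omp_set_max_active_levels(2); // allow two active (non-serialized) levels
       printf("max-active-levels-var = %d\n", omp_get_max_active_levels());
       #pragma omp parallel num_threads(2) // active level 1
       #pragma omp parallel num_threads(2) // active level 2, still parallel
       #pragma omp parallel num_threads(2) // would exceed the limit: serialized
       { }
     }
*/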
2833 
2834 // nteams-var per-device ICV
2835 void __kmp_set_num_teams(int num_teams) {
2836  if (num_teams > 0)
2837  __kmp_nteams = num_teams;
2838 }
2839 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2840 // teams-thread-limit-var per-device ICV
2841 void __kmp_set_teams_thread_limit(int limit) {
2842  if (limit > 0)
2843  __kmp_teams_thread_limit = limit;
2844 }
2845 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
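/* Usage sketch (illustrative): these per-device ICV accessors correspond to the
   OpenMP 5.1 routines omp_set_num_teams / omp_get_max_teams and
   omp_set_teams_thread_limit / omp_get_teams_thread_limit; non-positive values
   are ignored, mirroring the checks above.

     #include <omp.h>
     #include <stdio.h>

     void teams_icv_example(void) {
       omp_set_num_teams(4);          // nteams-var
       omp_set_teams_thread_limit(8); // teams-thread-limit-var
       printf("%d teams, at most %d threads each\n", omp_get_max_teams(),
              omp_get_teams_thread_limit());
       #pragma omp teams
       { } // league size is bounded by the values set above
     }
*/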
2846 
2847 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2848 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2849 
2850 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2851 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2852  kmp_info_t *thread;
2853  kmp_sched_t orig_kind;
2854  // kmp_team_t *team;
2855 
2856  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2857  gtid, (int)kind, chunk));
2858  KMP_DEBUG_ASSERT(__kmp_init_serial);
2859 
2860  // Check if the kind parameter is valid, correct if needed.
2861  // Valid parameters should fit in one of two intervals - standard or extended:
2862  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2863  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2864  orig_kind = kind;
2865  kind = __kmp_sched_without_mods(kind);
2866 
2867  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2868  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2869  // TODO: Hint needs attention in case we change the default schedule.
2870  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2871  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2872  __kmp_msg_null);
2873  kind = kmp_sched_default;
2874  chunk = 0; // ignore chunk value in case of bad kind
2875  }
2876 
2877  thread = __kmp_threads[gtid];
2878 
2879  __kmp_save_internal_controls(thread);
2880 
2881  if (kind < kmp_sched_upper_std) {
2882  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2883  // differentiate static chunked vs. unchunked: chunk should be invalid to
2884  // indicate an unchunked schedule (which is the default)
2885  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2886  } else {
2887  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2888  __kmp_sch_map[kind - kmp_sched_lower - 1];
2889  }
2890  } else {
2891  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2892  // kmp_sched_lower - 2 ];
2893  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2894  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2895  kmp_sched_lower - 2];
2896  }
2897  __kmp_sched_apply_mods_intkind(
2898  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2899  if (kind == kmp_sched_auto || chunk < 1) {
2900  // ignore parameter chunk for schedule auto
2901  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2902  } else {
2903  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2904  }
2905 }
2906 
2907 /* Gets def_sched_var ICV values */
2908 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2909  kmp_info_t *thread;
2910  enum sched_type th_type;
2911 
2912  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2913  KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915  thread = __kmp_threads[gtid];
2916 
2917  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2918  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2919  case kmp_sch_static:
2920  case kmp_sch_static_greedy:
2921  case kmp_sch_static_balanced:
2922  *kind = kmp_sched_static;
2923  __kmp_sched_apply_mods_stdkind(kind, th_type);
2924  *chunk = 0; // chunk was not set; convey this fact via a zero value
2925  return;
2926  case kmp_sch_static_chunked:
2927  *kind = kmp_sched_static;
2928  break;
2929  case kmp_sch_dynamic_chunked:
2930  *kind = kmp_sched_dynamic;
2931  break;
2932  case kmp_sch_guided_chunked:
2933  case kmp_sch_guided_iterative_chunked:
2934  case kmp_sch_guided_analytical_chunked:
2935  *kind = kmp_sched_guided;
2936  break;
2937  case kmp_sch_auto:
2938  *kind = kmp_sched_auto;
2939  break;
2940  case kmp_sch_trapezoidal:
2941  *kind = kmp_sched_trapezoidal;
2942  break;
2943 #if KMP_STATIC_STEAL_ENABLED
2944  case kmp_sch_static_steal:
2945  *kind = kmp_sched_static_steal;
2946  break;
2947 #endif
2948  default:
2949  KMP_FATAL(UnknownSchedulingType, th_type);
2950  }
2951 
2952  __kmp_sched_apply_mods_stdkind(kind, th_type);
2953  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2954 }
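/* Usage sketch (illustrative): this pair backs omp_set_schedule and
   omp_get_schedule. The internal kmp_sched_t kinds map onto the standard
   omp_sched_t values, and the zero chunk returned for plain static above shows
   up as an unchunked schedule on the user side.

     #include <omp.h>
     #include <stdio.h>

     void schedule_example(void) {
       omp_set_schedule(omp_sched_dynamic, 4); // run-sched-var = dynamic,4
       omp_sched_t kind;
       int chunk;
       omp_get_schedule(&kind, &chunk);
       printf("kind=%d chunk=%d\n", (int)kind, chunk);
       #pragma omp parallel for schedule(runtime) // uses the values set above
       for (int i = 0; i < 100; i++) { }
     }
*/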
2955 
2956 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2957 
2958  int ii, dd;
2959  kmp_team_t *team;
2960  kmp_info_t *thr;
2961 
2962  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2963  KMP_DEBUG_ASSERT(__kmp_init_serial);
2964 
2965  // validate level
2966  if (level == 0)
2967  return 0;
2968  if (level < 0)
2969  return -1;
2970  thr = __kmp_threads[gtid];
2971  team = thr->th.th_team;
2972  ii = team->t.t_level;
2973  if (level > ii)
2974  return -1;
2975 
2976  if (thr->th.th_teams_microtask) {
2977  // AC: we are in teams region where multiple nested teams have same level
2978  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2979  if (level <=
2980  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2981  KMP_DEBUG_ASSERT(ii >= tlevel);
2982  // AC: As we need to pass through the teams league, we artificially
2983  // increase ii
2984  if (ii == tlevel) {
2985  ii += 2; // three teams have same level
2986  } else {
2987  ii++; // two teams have same level
2988  }
2989  }
2990  }
2991 
2992  if (ii == level)
2993  return __kmp_tid_from_gtid(gtid);
2994 
2995  dd = team->t.t_serialized;
2996  level++;
2997  while (ii > level) {
2998  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2999  }
3000  if ((team->t.t_serialized) && (!dd)) {
3001  team = team->t.t_parent;
3002  continue;
3003  }
3004  if (ii > level) {
3005  team = team->t.t_parent;
3006  dd = team->t.t_serialized;
3007  ii--;
3008  }
3009  }
3010 
3011  return (dd > 1) ? (0) : (team->t.t_master_tid);
3012 }
3013 
3014 int __kmp_get_team_size(int gtid, int level) {
3015 
3016  int ii, dd;
3017  kmp_team_t *team;
3018  kmp_info_t *thr;
3019 
3020  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3021  KMP_DEBUG_ASSERT(__kmp_init_serial);
3022 
3023  // validate level
3024  if (level == 0)
3025  return 1;
3026  if (level < 0)
3027  return -1;
3028  thr = __kmp_threads[gtid];
3029  team = thr->th.th_team;
3030  ii = team->t.t_level;
3031  if (level > ii)
3032  return -1;
3033 
3034  if (thr->th.th_teams_microtask) {
3035  // AC: we are in teams region where multiple nested teams have same level
3036  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3037  if (level <=
3038  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3039  KMP_DEBUG_ASSERT(ii >= tlevel);
3040  // AC: As we need to pass through the teams league, we artificially
3041  // increase ii
3042  if (ii == tlevel) {
3043  ii += 2; // three teams have same level
3044  } else {
3045  ii++; // two teams have same level
3046  }
3047  }
3048  }
3049 
3050  while (ii > level) {
3051  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3052  }
3053  if (team->t.t_serialized && (!dd)) {
3054  team = team->t.t_parent;
3055  continue;
3056  }
3057  if (ii > level) {
3058  team = team->t.t_parent;
3059  ii--;
3060  }
3061  }
3062 
3063  return team->t.t_nproc;
3064 }
3065 
3066 kmp_r_sched_t __kmp_get_schedule_global() {
3067  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3068  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3069  // independently, so the updated schedule can be obtained here.
3070 
3071  kmp_r_sched_t r_sched;
3072 
3073  // Create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3074  // __kmp_guided. __kmp_sched should keep its original value, so that the user
3075  // can set KMP_SCHEDULE multiple times and thus have different run-time
3076  // schedules in different roots (even in OMP 2.5).
3077  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3078  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3079  if (s == kmp_sch_static) {
3080  // replace STATIC with more detailed schedule (balanced or greedy)
3081  r_sched.r_sched_type = __kmp_static;
3082  } else if (s == kmp_sch_guided_chunked) {
3083  // replace GUIDED with more detailed schedule (iterative or analytical)
3084  r_sched.r_sched_type = __kmp_guided;
3085  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3086  r_sched.r_sched_type = __kmp_sched;
3087  }
3088  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3089 
3090  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3091  // __kmp_chunk may be wrong here (if it was not ever set)
3092  r_sched.chunk = KMP_DEFAULT_CHUNK;
3093  } else {
3094  r_sched.chunk = __kmp_chunk;
3095  }
3096 
3097  return r_sched;
3098 }
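
// Worked example (informal; the exact KMP_SCHEDULE spelling is an assumption,
// see kmp_settings.cpp for the accepted syntax). With KMP_SCHEDULE="static,greedy",
// __kmp_static becomes kmp_sch_static_greedy, so a plain "static" runtime
// schedule is refined to the greedy variant here, while the monotonic /
// nonmonotonic modifier bits of __kmp_sched are carried over unchanged:
//
//   kmp_r_sched_t rs = __kmp_get_schedule_global();
//   // SCHEDULE_WITHOUT_MODIFIERS(rs.r_sched_type) == __kmp_static
//   // rs.chunk >= KMP_DEFAULT_CHUNK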
3099 
3100 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3101  at least argc entries in t_argv for the requested team. */
3102 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3103 
3104  KMP_DEBUG_ASSERT(team);
3105  if (!realloc || argc > team->t.t_max_argc) {
3106 
3107  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3108  "current entries=%d\n",
3109  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3110  /* if previously allocated heap space for args, free them */
3111  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3112  __kmp_free((void *)team->t.t_argv);
3113 
3114  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3115  /* use unused space in the cache line for arguments */
3116  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3117  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3118  "argv entries\n",
3119  team->t.t_id, team->t.t_max_argc));
3120  team->t.t_argv = &team->t.t_inline_argv[0];
3121  if (__kmp_storage_map) {
3122  __kmp_print_storage_map_gtid(
3123  -1, &team->t.t_inline_argv[0],
3124  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3125  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3126  team->t.t_id);
3127  }
3128  } else {
3129  /* allocate space for arguments in the heap */
3130  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3131  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3132  : 2 * argc;
3133  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3134  "argv entries\n",
3135  team->t.t_id, team->t.t_max_argc));
3136  team->t.t_argv =
3137  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3138  if (__kmp_storage_map) {
3139  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3140  &team->t.t_argv[team->t.t_max_argc],
3141  sizeof(void *) * team->t.t_max_argc,
3142  "team_%d.t_argv", team->t.t_id);
3143  }
3144  }
3145  }
3146 }
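
// Sizing note (informal): the policy above keeps short argument lists inside
// the team structure and over-allocates heap lists so that a slowly growing
// argc does not reallocate on every fork:
//
//   argc <= KMP_INLINE_ARGV_ENTRIES          -> use t_inline_argv, no heap
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> KMP_MIN_MALLOC_ARGV_ENTRIES
//   otherwise                                -> 2 * argc entries
//
// t_max_argc records whichever capacity was chosen, so later calls with
// realloc == TRUE are no-ops until argc exceeds it.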
3147 
3148 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3149  int i;
3150  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3151  team->t.t_threads =
3152  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3153  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3154  sizeof(dispatch_shared_info_t) * num_disp_buff);
3155  team->t.t_dispatch =
3156  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3157  team->t.t_implicit_task_taskdata =
3158  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3159  team->t.t_max_nproc = max_nth;
3160 
3161  /* setup dispatch buffers */
3162  for (i = 0; i < num_disp_buff; ++i) {
3163  team->t.t_disp_buffer[i].buffer_index = i;
3164  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3165  }
3166 }
3167 
3168 static void __kmp_free_team_arrays(kmp_team_t *team) {
3169  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3170  int i;
3171  for (i = 0; i < team->t.t_max_nproc; ++i) {
3172  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3173  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3174  team->t.t_dispatch[i].th_disp_buffer = NULL;
3175  }
3176  }
3177 #if KMP_USE_HIER_SCHED
3178  __kmp_dispatch_free_hierarchies(team);
3179 #endif
3180  __kmp_free(team->t.t_threads);
3181  __kmp_free(team->t.t_disp_buffer);
3182  __kmp_free(team->t.t_dispatch);
3183  __kmp_free(team->t.t_implicit_task_taskdata);
3184  team->t.t_threads = NULL;
3185  team->t.t_disp_buffer = NULL;
3186  team->t.t_dispatch = NULL;
3187  team->t.t_implicit_task_taskdata = 0;
3188 }
3189 
3190 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3191  kmp_info_t **oldThreads = team->t.t_threads;
3192 
3193  __kmp_free(team->t.t_disp_buffer);
3194  __kmp_free(team->t.t_dispatch);
3195  __kmp_free(team->t.t_implicit_task_taskdata);
3196  __kmp_allocate_team_arrays(team, max_nth);
3197 
3198  KMP_MEMCPY(team->t.t_threads, oldThreads,
3199  team->t.t_nproc * sizeof(kmp_info_t *));
3200 
3201  __kmp_free(oldThreads);
3202 }
3203 
3204 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3205 
3206  kmp_r_sched_t r_sched =
3207  __kmp_get_schedule_global(); // get current state of scheduling globals
3208 
3209  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3210 
3211  kmp_internal_control_t g_icvs = {
3212  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3213  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3214  // adjustment of threads (per thread)
3215  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3216  // whether blocktime is explicitly set
3217  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3218 #if KMP_USE_MONITOR
3219  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3220 // intervals
3221 #endif
3222  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3223  // next parallel region (per thread)
3224  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3225  __kmp_cg_max_nth, // int thread_limit;
3226  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3227  // for max_active_levels
3228  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3229  // {sched,chunk} pair
3230  __kmp_nested_proc_bind.bind_types[0],
3231  __kmp_default_device,
3232  NULL // struct kmp_internal_control *next;
3233  };
3234 
3235  return g_icvs;
3236 }
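
// Reader's map (informal, approximate): most fields above are global snapshots
// of user-visible controls parsed in kmp_settings.cpp, roughly
//
//   blocktime          <- KMP_BLOCKTIME
//   nproc              <- OMP_NUM_THREADS (first level)
//   max_active_levels  <- OMP_MAX_ACTIVE_LEVELS
//   sched              <- OMP_SCHEDULE / KMP_SCHEDULE (see
//                         __kmp_get_schedule_global above)
//   proc_bind          <- OMP_PROC_BIND (outermost entry)
//   default_device     <- OMP_DEFAULT_DEVICE
//
// These values seed every new root and serial team; per-task copies may then
// diverge through the usual ICV-setting API calls.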
3237 
3238 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3239 
3240  kmp_internal_control_t gx_icvs;
3241  gx_icvs.serial_nesting_level =
3242  0; // probably =team->t.t_serial like in save_inter_controls
3243  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3244  gx_icvs.next = NULL;
3245 
3246  return gx_icvs;
3247 }
3248 
3249 static void __kmp_initialize_root(kmp_root_t *root) {
3250  int f;
3251  kmp_team_t *root_team;
3252  kmp_team_t *hot_team;
3253  int hot_team_max_nth;
3254  kmp_r_sched_t r_sched =
3255  __kmp_get_schedule_global(); // get current state of scheduling globals
3256  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3257  KMP_DEBUG_ASSERT(root);
3258  KMP_ASSERT(!root->r.r_begin);
3259 
3260  /* setup the root state structure */
3261  __kmp_init_lock(&root->r.r_begin_lock);
3262  root->r.r_begin = FALSE;
3263  root->r.r_active = FALSE;
3264  root->r.r_in_parallel = 0;
3265  root->r.r_blocktime = __kmp_dflt_blocktime;
3266 #if KMP_AFFINITY_SUPPORTED
3267  root->r.r_affinity_assigned = FALSE;
3268 #endif
3269 
3270  /* setup the root team for this task */
3271  /* allocate the root team structure */
3272  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3273 
3274  root_team =
3275  __kmp_allocate_team(root,
3276  1, // new_nproc
3277  1, // max_nproc
3278 #if OMPT_SUPPORT
3279  ompt_data_none, // root parallel id
3280 #endif
3281  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3282  0 // argc
3283  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3284  );
3285 #if USE_DEBUGGER
3286  // Non-NULL value should be assigned to make the debugger display the root
3287  // team.
3288  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3289 #endif
3290 
3291  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3292 
3293  root->r.r_root_team = root_team;
3294  root_team->t.t_control_stack_top = NULL;
3295 
3296  /* initialize root team */
3297  root_team->t.t_threads[0] = NULL;
3298  root_team->t.t_nproc = 1;
3299  root_team->t.t_serialized = 1;
3300  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3301  root_team->t.t_sched.sched = r_sched.sched;
3302  KA_TRACE(
3303  20,
3304  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3305  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3306 
3307  /* setup the hot team for this task */
3308  /* allocate the hot team structure */
3309  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3310 
3311  hot_team =
3312  __kmp_allocate_team(root,
3313  1, // new_nproc
3314  __kmp_dflt_team_nth_ub * 2, // max_nproc
3315 #if OMPT_SUPPORT
3316  ompt_data_none, // root parallel id
3317 #endif
3318  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3319  0 // argc
3320  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3321  );
3322  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3323 
3324  root->r.r_hot_team = hot_team;
3325  root_team->t.t_control_stack_top = NULL;
3326 
3327  /* first-time initialization */
3328  hot_team->t.t_parent = root_team;
3329 
3330  /* initialize hot team */
3331  hot_team_max_nth = hot_team->t.t_max_nproc;
3332  for (f = 0; f < hot_team_max_nth; ++f) {
3333  hot_team->t.t_threads[f] = NULL;
3334  }
3335  hot_team->t.t_nproc = 1;
3336  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3337  hot_team->t.t_sched.sched = r_sched.sched;
3338  hot_team->t.t_size_changed = 0;
3339 }
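
// Note (informal): each root keeps two teams. The root team is a permanently
// serialized team of one thread representing the sequential part of the
// program; the hot team is kept allocated between parallel regions (note the
// max_nproc of __kmp_dflt_team_nth_ub * 2 above) so a typical fork/join does
// not have to rebuild the team structure every time. The invariants set up
// here are essentially
//
//   root->r.r_root_team->t.t_serialized == 1
//   root->r.r_hot_team->t.t_parent      == root->r.r_root_team
//   both teams start from the same ICV snapshot (r_icvs)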
3340 
3341 #ifdef KMP_DEBUG
3342 
3343 typedef struct kmp_team_list_item {
3344  kmp_team_p const *entry;
3345  struct kmp_team_list_item *next;
3346 } kmp_team_list_item_t;
3347 typedef kmp_team_list_item_t *kmp_team_list_t;
3348 
3349 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3350  kmp_team_list_t list, // List of teams.
3351  kmp_team_p const *team // Team to add.
3352 ) {
3353 
3354  // List must terminate with item where both entry and next are NULL.
3355  // Team is added to the list only once.
3356  // List is sorted in ascending order by team id.
3357  // Team id is *not* a key.
3358 
3359  kmp_team_list_t l;
3360 
3361  KMP_DEBUG_ASSERT(list != NULL);
3362  if (team == NULL) {
3363  return;
3364  }
3365 
3366  __kmp_print_structure_team_accum(list, team->t.t_parent);
3367  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3368 
3369  // Search list for the team.
3370  l = list;
3371  while (l->next != NULL && l->entry != team) {
3372  l = l->next;
3373  }
3374  if (l->next != NULL) {
3375  return; // Team has been added before, exit.
3376  }
3377 
3378  // Team is not found. Search list again for insertion point.
3379  l = list;
3380  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3381  l = l->next;
3382  }
3383 
3384  // Insert team.
3385  {
3386  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3387  sizeof(kmp_team_list_item_t));
3388  *item = *l;
3389  l->entry = team;
3390  l->next = item;
3391  }
3392 }
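
// Implementation note (informal): the list is terminated by a sentinel node
// whose entry and next are both NULL, which lets sorted insertion avoid a
// special case for inserting at the head. The trick is to copy the node at
// the insertion point into the freshly allocated cell and then overwrite the
// insertion point in place:
//
//   *item = *l;        // old (entry, next) pair moves into the new cell
//   l->entry = team;   // insertion point now holds the new team
//   l->next = item;    // and links to the moved-down remainder
//
// The sentinel also explains why callers free the list by walking next until
// it becomes NULL.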
3393 
3394 static void __kmp_print_structure_team(char const *title,
3395                                        kmp_team_p const *team) {
3396 
3397  __kmp_printf("%s", title);
3398  if (team != NULL) {
3399  __kmp_printf("%2x %p\n", team->t.t_id, team);
3400  } else {
3401  __kmp_printf(" - (nil)\n");
3402  }
3403 }
3404 
3405 static void __kmp_print_structure_thread(char const *title,
3406  kmp_info_p const *thread) {
3407  __kmp_printf("%s", title);
3408  if (thread != NULL) {
3409  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3410  } else {
3411  __kmp_printf(" - (nil)\n");
3412  }
3413 }
3414 
3415 void __kmp_print_structure(void) {
3416 
3417  kmp_team_list_t list;
3418 
3419  // Initialize list of teams.
3420  list =
3421  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3422  list->entry = NULL;
3423  list->next = NULL;
3424 
3425  __kmp_printf("\n------------------------------\nGlobal Thread "
3426  "Table\n------------------------------\n");
3427  {
3428  int gtid;
3429  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3430  __kmp_printf("%2d", gtid);
3431  if (__kmp_threads != NULL) {
3432  __kmp_printf(" %p", __kmp_threads[gtid]);
3433  }
3434  if (__kmp_root != NULL) {
3435  __kmp_printf(" %p", __kmp_root[gtid]);
3436  }
3437  __kmp_printf("\n");
3438  }
3439  }
3440 
3441  // Print out __kmp_threads array.
3442  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3443  "----------\n");
3444  if (__kmp_threads != NULL) {
3445  int gtid;
3446  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3447  kmp_info_t const *thread = __kmp_threads[gtid];
3448  if (thread != NULL) {
3449  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3450  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3451  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3452  __kmp_print_structure_team(" Serial Team: ",
3453  thread->th.th_serial_team);
3454  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3455  __kmp_print_structure_thread(" Primary: ",
3456  thread->th.th_team_master);
3457  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3458  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3459  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3460  __kmp_print_structure_thread(" Next in pool: ",
3461  thread->th.th_next_pool);
3462  __kmp_printf("\n");
3463  __kmp_print_structure_team_accum(list, thread->th.th_team);
3464  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3465  }
3466  }
3467  } else {
3468  __kmp_printf("Threads array is not allocated.\n");
3469  }
3470 
3471  // Print out __kmp_root array.
3472  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3473  "--------\n");
3474  if (__kmp_root != NULL) {
3475  int gtid;
3476  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3477  kmp_root_t const *root = __kmp_root[gtid];
3478  if (root != NULL) {
3479  __kmp_printf("GTID %2d %p:\n", gtid, root);
3480  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3481  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3482  __kmp_print_structure_thread(" Uber Thread: ",
3483  root->r.r_uber_thread);
3484  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3485  __kmp_printf(" In Parallel: %2d\n",
3486  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3487  __kmp_printf("\n");
3488  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3489  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3490  }
3491  }
3492  } else {
3493  __kmp_printf("Ubers array is not allocated.\n");
3494  }
3495 
3496  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3497  "--------\n");
3498  while (list->next != NULL) {
3499  kmp_team_p const *team = list->entry;
3500  int i;
3501  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3502  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3503  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3504  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3505  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3506  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3507  for (i = 0; i < team->t.t_nproc; ++i) {
3508  __kmp_printf(" Thread %2d: ", i);
3509  __kmp_print_structure_thread("", team->t.t_threads[i]);
3510  }
3511  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3512  __kmp_printf("\n");
3513  list = list->next;
3514  }
3515 
3516  // Print out __kmp_thread_pool and __kmp_team_pool.
3517  __kmp_printf("\n------------------------------\nPools\n----------------------"
3518  "--------\n");
3519  __kmp_print_structure_thread("Thread pool: ",
3520  CCAST(kmp_info_t *, __kmp_thread_pool));
3521  __kmp_print_structure_team("Team pool: ",
3522  CCAST(kmp_team_t *, __kmp_team_pool));
3523  __kmp_printf("\n");
3524 
3525  // Free team list.
3526  while (list != NULL) {
3527  kmp_team_list_item_t *item = list;
3528  list = list->next;
3529  KMP_INTERNAL_FREE(item);
3530  }
3531 }
3532 
3533 #endif
3534 
3535 //---------------------------------------------------------------------------
3536 // Stuff for per-thread fast random number generator
3537 // Table of primes
3538 static const unsigned __kmp_primes[] = {
3539  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3540  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3541  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3542  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3543  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3544  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3545  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3546  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3547  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3548  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3549  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3550 
3551 //---------------------------------------------------------------------------
3552 // __kmp_get_random: Get a random number using a linear congruential method.
3553 unsigned short __kmp_get_random(kmp_info_t *thread) {
3554  unsigned x = thread->th.th_x;
3555  unsigned short r = (unsigned short)(x >> 16);
3556 
3557  thread->th.th_x = x * thread->th.th_a + 1;
3558 
3559  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3560  thread->th.th_info.ds.ds_tid, r));
3561 
3562  return r;
3563 }
3564 //--------------------------------------------------------
3565 // __kmp_init_random: Initialize a random number generator
3566 void __kmp_init_random(kmp_info_t *thread) {
3567  unsigned seed = thread->th.th_info.ds.ds_tid;
3568 
3569  thread->th.th_a =
3570  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3571  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3572  KA_TRACE(30,
3573  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3574 }
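
// Background (informal): th_x / th_a form a per-thread linear congruential
// generator x(n+1) = a * x(n) + 1 (mod 2^32). The multiplier a is chosen from
// __kmp_primes using the thread id, and only the high 16 bits of x are
// returned because the low-order bits of such an LCG have short periods. A
// minimal standalone equivalent (relying on unsigned 32-bit wrap-around):
//
//   unsigned x = (tid + 1) * a + 1;              // seeding, as above
//   unsigned short next(unsigned &x, unsigned a) {
//     unsigned short r = (unsigned short)(x >> 16);
//     x = x * a + 1;
//     return r;
//   }
//
// This generator is meant for cheap randomized decisions inside the runtime
// (for example picking a victim when stealing), not for statistical quality.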
3575 
3576 #if KMP_OS_WINDOWS
3577 /* reclaim array entries for root threads that are already dead, returns number
3578  * reclaimed */
3579 static int __kmp_reclaim_dead_roots(void) {
3580  int i, r = 0;
3581 
3582  for (i = 0; i < __kmp_threads_capacity; ++i) {
3583  if (KMP_UBER_GTID(i) &&
3584  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3585  !__kmp_root[i]
3586  ->r.r_active) { // AC: reclaim only roots died in non-active state
3587  r += __kmp_unregister_root_other_thread(i);
3588  }
3589  }
3590  return r;
3591 }
3592 #endif
3593 
3594 /* This function attempts to create free entries in __kmp_threads and
3595  __kmp_root, and returns the number of free entries generated.
3596 
3597  For Windows* OS static library, the first mechanism used is to reclaim array
3598  entries for root threads that are already dead.
3599 
3600  On all platforms, expansion is attempted on the arrays __kmp_threads and
3601  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3602  capacity is increased by doubling, with clipping to __kmp_tp_capacity if
3603  the threadprivate cache array has been created. Synchronization with
3604  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3605 
3606  After any dead root reclamation, if the clipping value allows array expansion
3607  to result in the generation of a total of nNeed free slots, the function does
3608  that expansion. If not, nothing is done beyond the possible initial root
3609  thread reclamation.
3610 
3611  If any argument is negative, the behavior is undefined. */
3612 static int __kmp_expand_threads(int nNeed) {
3613  int added = 0;
3614  int minimumRequiredCapacity;
3615  int newCapacity;
3616  kmp_info_t **newThreads;
3617  kmp_root_t **newRoot;
3618 
3619  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3620  // resizing __kmp_threads does not need additional protection if foreign
3621  // threads are present
3622 
3623 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3624  /* only for Windows static library */
3625  /* reclaim array entries for root threads that are already dead */
3626  added = __kmp_reclaim_dead_roots();
3627 
3628  if (nNeed) {
3629  nNeed -= added;
3630  if (nNeed < 0)
3631  nNeed = 0;
3632  }
3633 #endif
3634  if (nNeed <= 0)
3635  return added;
3636 
3637  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3638  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3639  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3640  // > __kmp_max_nth in one of two ways:
3641  //
3642  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3643  // may not be reused by another thread, so we may need to increase
3644  // __kmp_threads_capacity to __kmp_max_nth + 1.
3645  //
3646  // 2) New foreign root(s) are encountered. We always register new foreign
3647  // roots. This may cause a smaller # of threads to be allocated at
3648  // subsequent parallel regions, but the worker threads hang around (and
3649  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3650  //
3651  // Anyway, that is the reason for moving the check to see if
3652  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3653  // instead of having it performed here. -BB
3654 
3655  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3656 
3657  /* compute expansion headroom to check if we can expand */
3658  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3659  /* possible expansion too small -- give up */
3660  return added;
3661  }
3662  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3663 
3664  newCapacity = __kmp_threads_capacity;
3665  do {
3666  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3667  : __kmp_sys_max_nth;
3668  } while (newCapacity < minimumRequiredCapacity);
3669  newThreads = (kmp_info_t **)__kmp_allocate(
3670  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3671  newRoot =
3672  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3673  KMP_MEMCPY(newThreads, __kmp_threads,
3674  __kmp_threads_capacity * sizeof(kmp_info_t *));
3675  KMP_MEMCPY(newRoot, __kmp_root,
3676  __kmp_threads_capacity * sizeof(kmp_root_t *));
3677  // Put old __kmp_threads array on a list. Any ongoing references to the old
3678  // list will be valid. This list is cleaned up at library shutdown.
3679  kmp_old_threads_list_t *node =
3680  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3681  node->threads = __kmp_threads;
3682  node->next = __kmp_old_threads_list;
3683  __kmp_old_threads_list = node;
3684 
3685  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3686  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3687  added += newCapacity - __kmp_threads_capacity;
3688  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3689 
3690  if (newCapacity > __kmp_tp_capacity) {
3691  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3692  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3693  __kmp_threadprivate_resize_cache(newCapacity);
3694  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3695  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3696  }
3697  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3698  }
3699 
3700  return added;
3701 }
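
// Growth illustration (informal): capacity grows geometrically, so the total
// copying work over the life of the process stays linear in the final
// capacity. For example, assuming a current capacity of 64 and a large
// __kmp_sys_max_nth (no clipping):
//
//   nNeed = 1   : 64 -> 128
//   nNeed = 100 : 64 -> 128 -> 256   (first value >= 164)
//
// The old __kmp_threads block is deliberately kept on __kmp_old_threads_list
// rather than freed, so code still holding a stale pointer remains valid
// until library shutdown.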
3702 
3703 /* Register the current thread as a root thread and obtain our gtid. We must
3704  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3705  the thread that calls from __kmp_do_serial_initialize() */
3706 int __kmp_register_root(int initial_thread) {
3707  kmp_info_t *root_thread;
3708  kmp_root_t *root;
3709  int gtid;
3710  int capacity;
3711  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3712  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3713  KMP_MB();
3714 
3715  /* 2007-03-02:
3716  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3717  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3718  condition does not work as expected -- it may return false (meaning there
3719  is at least one empty slot in the __kmp_threads array), but it is possible
3720  that the only free slot is #0, which is reserved for the initial thread and
3721  so cannot be used for this one. The following code works around this bug.
3722 
3723  However, the right solution seems to be not to reserve slot #0 for the
3724  initial thread because:
3725  (1) there is no magic in slot #0,
3726  (2) we cannot detect the initial thread reliably (the first thread that
3727  performs serial initialization may not be a real initial thread).
3728  */
3729  capacity = __kmp_threads_capacity;
3730  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3731  --capacity;
3732  }
3733 
3734  // If it is not for initializing the hidden helper team, we need to take
3735  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3736  // in __kmp_threads_capacity.
3737  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3738  capacity -= __kmp_hidden_helper_threads_num;
3739  }
3740 
3741  /* see if there are too many threads */
3742  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3743  if (__kmp_tp_cached) {
3744  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3745  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3746  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3747  } else {
3748  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3749  __kmp_msg_null);
3750  }
3751  }
3752 
3753  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3754  // 0: initial thread, also a regular OpenMP thread.
3755  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3756  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3757  // regular OpenMP threads.
3758  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3759  // Find an available thread slot for hidden helper thread. Slots for hidden
3760  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3761  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3762  gtid <= __kmp_hidden_helper_threads_num;
3763  gtid++)
3764  ;
3765  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3766  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3767  "hidden helper thread: T#%d\n",
3768  gtid));
3769  } else {
3770  /* find an available thread slot */
3771  // Don't reassign the zero slot since we need that to only be used by
3772  // initial thread. Slots for hidden helper threads should also be skipped.
3773  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3774  gtid = 0;
3775  } else {
3776  for (gtid = __kmp_hidden_helper_threads_num + 1;
3777  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3778  ;
3779  }
3780  KA_TRACE(
3781  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3782  KMP_ASSERT(gtid < __kmp_threads_capacity);
3783  }
3784 
3785  /* update global accounting */
3786  __kmp_all_nth++;
3787  TCW_4(__kmp_nth, __kmp_nth + 1);
3788 
3789  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3790  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3791  if (__kmp_adjust_gtid_mode) {
3792  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3793  if (TCR_4(__kmp_gtid_mode) != 2) {
3794  TCW_4(__kmp_gtid_mode, 2);
3795  }
3796  } else {
3797  if (TCR_4(__kmp_gtid_mode) != 1) {
3798  TCW_4(__kmp_gtid_mode, 1);
3799  }
3800  }
3801  }
3802 
3803 #ifdef KMP_ADJUST_BLOCKTIME
3804  /* Adjust blocktime to zero if necessary */
3805  /* Middle initialization might not have occurred yet */
3806  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3807  if (__kmp_nth > __kmp_avail_proc) {
3808  __kmp_zero_bt = TRUE;
3809  }
3810  }
3811 #endif /* KMP_ADJUST_BLOCKTIME */
3812 
3813  /* setup this new hierarchy */
3814  if (!(root = __kmp_root[gtid])) {
3815  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3816  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3817  }
3818 
3819 #if KMP_STATS_ENABLED
3820  // Initialize stats as soon as possible (right after gtid assignment).
3821  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3822  __kmp_stats_thread_ptr->startLife();
3823  KMP_SET_THREAD_STATE(SERIAL_REGION);
3824  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3825 #endif
3826  __kmp_initialize_root(root);
3827 
3828  /* setup new root thread structure */
3829  if (root->r.r_uber_thread) {
3830  root_thread = root->r.r_uber_thread;
3831  } else {
3832  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3833  if (__kmp_storage_map) {
3834  __kmp_print_thread_storage_map(root_thread, gtid);
3835  }
3836  root_thread->th.th_info.ds.ds_gtid = gtid;
3837 #if OMPT_SUPPORT
3838  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3839 #endif
3840  root_thread->th.th_root = root;
3841  if (__kmp_env_consistency_check) {
3842  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3843  }
3844 #if USE_FAST_MEMORY
3845  __kmp_initialize_fast_memory(root_thread);
3846 #endif /* USE_FAST_MEMORY */
3847 
3848 #if KMP_USE_BGET
3849  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3850  __kmp_initialize_bget(root_thread);
3851 #endif
3852  __kmp_init_random(root_thread); // Initialize random number generator
3853  }
3854 
3855  /* setup the serial team held in reserve by the root thread */
3856  if (!root_thread->th.th_serial_team) {
3857  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3858  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3859  root_thread->th.th_serial_team = __kmp_allocate_team(
3860  root, 1, 1,
3861 #if OMPT_SUPPORT
3862  ompt_data_none, // root parallel id
3863 #endif
3864  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3865  }
3866  KMP_ASSERT(root_thread->th.th_serial_team);
3867  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3868  root_thread->th.th_serial_team));
3869 
3870  /* drop root_thread into place */
3871  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3872 
3873  root->r.r_root_team->t.t_threads[0] = root_thread;
3874  root->r.r_hot_team->t.t_threads[0] = root_thread;
3875  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3876  // AC: the team is created in reserve, not for execution (it is unused for now).
3877  root_thread->th.th_serial_team->t.t_serialized = 0;
3878  root->r.r_uber_thread = root_thread;
3879 
3880  /* initialize the thread, get it ready to go */
3881  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3882  TCW_4(__kmp_init_gtid, TRUE);
3883 
3884  /* prepare the primary thread for get_gtid() */
3885  __kmp_gtid_set_specific(gtid);
3886 
3887 #if USE_ITT_BUILD
3888  __kmp_itt_thread_name(gtid);
3889 #endif /* USE_ITT_BUILD */
3890 
3891 #ifdef KMP_TDATA_GTID
3892  __kmp_gtid = gtid;
3893 #endif
3894  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3895  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3896 
3897  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3898  "plain=%u\n",
3899  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3900  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3901  KMP_INIT_BARRIER_STATE));
3902  { // Initialize barrier data.
3903  int b;
3904  for (b = 0; b < bs_last_barrier; ++b) {
3905  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3906 #if USE_DEBUGGER
3907  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3908 #endif
3909  }
3910  }
3911  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3912  KMP_INIT_BARRIER_STATE);
3913 
3914 #if KMP_AFFINITY_SUPPORTED
3915  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3916  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3917  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3918  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3919 #endif /* KMP_AFFINITY_SUPPORTED */
3920  root_thread->th.th_def_allocator = __kmp_def_allocator;
3921  root_thread->th.th_prev_level = 0;
3922  root_thread->th.th_prev_num_threads = 1;
3923 
3924  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3925  tmp->cg_root = root_thread;
3926  tmp->cg_thread_limit = __kmp_cg_max_nth;
3927  tmp->cg_nthreads = 1;
3928  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3929  " cg_nthreads init to 1\n",
3930  root_thread, tmp));
3931  tmp->up = NULL;
3932  root_thread->th.th_cg_roots = tmp;
3933 
3934  __kmp_root_counter++;
3935 
3936 #if OMPT_SUPPORT
3937  if (!initial_thread && ompt_enabled.enabled) {
3938 
3939  kmp_info_t *root_thread = ompt_get_thread();
3940 
3941  ompt_set_thread_state(root_thread, ompt_state_overhead);
3942 
3943  if (ompt_enabled.ompt_callback_thread_begin) {
3944  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3945  ompt_thread_initial, __ompt_get_thread_data_internal());
3946  }
3947  ompt_data_t *task_data;
3948  ompt_data_t *parallel_data;
3949  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3950  NULL);
3951  if (ompt_enabled.ompt_callback_implicit_task) {
3952  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3953  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3954  }
3955 
3956  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3957  }
3958 #endif
3959 #if OMPD_SUPPORT
3960  if (ompd_state & OMPD_ENABLE_BP)
3961  ompd_bp_thread_begin();
3962 #endif
3963 
3964  KMP_MB();
3965  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3966 
3967  return gtid;
3968 }
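
// Flow summary (informal): registering a root amounts to
//
//   gtid = first free slot       (0 is reserved for the initial thread;
//                                 foreign roots land above the hidden-helper
//                                 range)
//   root = __kmp_root[gtid]      (allocated on demand, then
//                                 __kmp_initialize_root)
//   thr  = root->r.r_uber_thread (allocated; RNG / bget / fast memory set up)
//
// followed by installing thr in __kmp_threads[gtid], in the root team, in the
// hot team and in its reserve serial team, and initializing barrier state and
// the contention-group root. Foreign threads that enter the runtime on their
// own are expected to come through this same path, which is why capacity
// expansion is attempted before a slot is chosen.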
3969 
3970 #if KMP_NESTED_HOT_TEAMS
3971 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3972  const int max_level) {
3973  int i, n, nth;
3974  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3975  if (!hot_teams || !hot_teams[level].hot_team) {
3976  return 0;
3977  }
3978  KMP_DEBUG_ASSERT(level < max_level);
3979  kmp_team_t *team = hot_teams[level].hot_team;
3980  nth = hot_teams[level].hot_team_nth;
3981  n = nth - 1; // primary thread is not freed
3982  if (level < max_level - 1) {
3983  for (i = 0; i < nth; ++i) {
3984  kmp_info_t *th = team->t.t_threads[i];
3985  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3986  if (i > 0 && th->th.th_hot_teams) {
3987  __kmp_free(th->th.th_hot_teams);
3988  th->th.th_hot_teams = NULL;
3989  }
3990  }
3991  }
3992  __kmp_free_team(root, team, NULL);
3993  return n;
3994 }
3995 #endif
3996 
3997 // Resets a root thread and clears its root and hot teams.
3998 // Returns the number of __kmp_threads entries directly and indirectly freed.
3999 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4000  kmp_team_t *root_team = root->r.r_root_team;
4001  kmp_team_t *hot_team = root->r.r_hot_team;
4002  int n = hot_team->t.t_nproc;
4003  int i;
4004 
4005  KMP_DEBUG_ASSERT(!root->r.r_active);
4006 
4007  root->r.r_root_team = NULL;
4008  root->r.r_hot_team = NULL;
4009  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4010  // before call to __kmp_free_team().
4011  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4012 #if KMP_NESTED_HOT_TEAMS
4013  if (__kmp_hot_teams_max_level >
4014  0) { // need to free nested hot teams and their threads if any
4015  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4016  kmp_info_t *th = hot_team->t.t_threads[i];
4017  if (__kmp_hot_teams_max_level > 1) {
4018  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4019  }
4020  if (th->th.th_hot_teams) {
4021  __kmp_free(th->th.th_hot_teams);
4022  th->th.th_hot_teams = NULL;
4023  }
4024  }
4025  }
4026 #endif
4027  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4028 
4029  // Before we can reap the thread, we need to make certain that all other
4030  // threads in the teams that had this root as ancestor have stopped trying to
4031  // steal tasks.
4032  if (__kmp_tasking_mode != tskm_immediate_exec) {
4033  __kmp_wait_to_unref_task_teams();
4034  }
4035 
4036 #if KMP_OS_WINDOWS
4037  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4038  KA_TRACE(
4039  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4040  "\n",
4041  (LPVOID) & (root->r.r_uber_thread->th),
4042  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4043  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4044 #endif /* KMP_OS_WINDOWS */
4045 
4046 #if OMPD_SUPPORT
4047  if (ompd_state & OMPD_ENABLE_BP)
4048  ompd_bp_thread_end();
4049 #endif
4050 
4051 #if OMPT_SUPPORT
4052  ompt_data_t *task_data;
4053  ompt_data_t *parallel_data;
4054  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4055  NULL);
4056  if (ompt_enabled.ompt_callback_implicit_task) {
4057  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4058  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4059  }
4060  if (ompt_enabled.ompt_callback_thread_end) {
4061  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4062  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4063  }
4064 #endif
4065 
4066  TCW_4(__kmp_nth,
4067  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4068  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4069  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4070  " to %d\n",
4071  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4072  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4073  if (i == 1) {
4074  // need to free contention group structure
4075  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4076  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4077  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4078  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4079  root->r.r_uber_thread->th.th_cg_roots = NULL;
4080  }
4081  __kmp_reap_thread(root->r.r_uber_thread, 1);
4082 
4083  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4084  // it instead of freeing it.
4085  root->r.r_uber_thread = NULL;
4086  /* mark root as no longer in use */
4087  root->r.r_begin = FALSE;
4088 
4089  return n;
4090 }
4091 
4092 void __kmp_unregister_root_current_thread(int gtid) {
4093  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4094  /* This lock should be OK, since unregister_root_current_thread is never
4095  called during an abort, only during a normal close. Furthermore, if you
4096  have the forkjoin lock, you should never try to get the initz lock. */
4097  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4098  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4099  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4100  "exiting T#%d\n",
4101  gtid));
4102  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4103  return;
4104  }
4105  kmp_root_t *root = __kmp_root[gtid];
4106 
4107  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4108  KMP_ASSERT(KMP_UBER_GTID(gtid));
4109  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4110  KMP_ASSERT(root->r.r_active == FALSE);
4111 
4112  KMP_MB();
4113 
4114  kmp_info_t *thread = __kmp_threads[gtid];
4115  kmp_team_t *team = thread->th.th_team;
4116  kmp_task_team_t *task_team = thread->th.th_task_team;
4117 
4118  // we need to wait for the proxy tasks before finishing the thread
4119  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4120  task_team->tt.tt_hidden_helper_task_encountered)) {
4121 #if OMPT_SUPPORT
4122  // the runtime is shutting down so we won't report any events
4123  thread->th.ompt_thread_info.state = ompt_state_undefined;
4124 #endif
4125  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4126  }
4127 
4128  __kmp_reset_root(gtid, root);
4129 
4130  KMP_MB();
4131  KC_TRACE(10,
4132  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4133 
4134  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4135 }
4136 
4137 #if KMP_OS_WINDOWS
4138 /* __kmp_forkjoin_lock must be already held
4139  Unregisters a root thread that is not the current thread. Returns the number
4140  of __kmp_threads entries freed as a result. */
4141 static int __kmp_unregister_root_other_thread(int gtid) {
4142  kmp_root_t *root = __kmp_root[gtid];
4143  int r;
4144 
4145  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4146  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4147  KMP_ASSERT(KMP_UBER_GTID(gtid));
4148  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4149  KMP_ASSERT(root->r.r_active == FALSE);
4150 
4151  r = __kmp_reset_root(gtid, root);
4152  KC_TRACE(10,
4153  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4154  return r;
4155 }
4156 #endif
4157 
4158 #if KMP_DEBUG
4159 void __kmp_task_info() {
4160 
4161  kmp_int32 gtid = __kmp_entry_gtid();
4162  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4163  kmp_info_t *this_thr = __kmp_threads[gtid];
4164  kmp_team_t *steam = this_thr->th.th_serial_team;
4165  kmp_team_t *team = this_thr->th.th_team;
4166 
4167  __kmp_printf(
4168  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4169  "ptask=%p\n",
4170  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4171  team->t.t_implicit_task_taskdata[tid].td_parent);
4172 }
4173 #endif // KMP_DEBUG
4174 
4175 /* TODO optimize with one big memclr, take out what isn't needed, split
4176  responsibility to workers as much as possible, and delay initialization of
4177  features as much as possible */
4178 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4179  int tid, int gtid) {
4180  /* this_thr->th.th_info.ds.ds_gtid is setup in
4181  kmp_allocate_thread/create_worker.
4182  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4183  KMP_DEBUG_ASSERT(this_thr != NULL);
4184  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4185  KMP_DEBUG_ASSERT(team);
4186  KMP_DEBUG_ASSERT(team->t.t_threads);
4187  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4188  kmp_info_t *master = team->t.t_threads[0];
4189  KMP_DEBUG_ASSERT(master);
4190  KMP_DEBUG_ASSERT(master->th.th_root);
4191 
4192  KMP_MB();
4193 
4194  TCW_SYNC_PTR(this_thr->th.th_team, team);
4195 
4196  this_thr->th.th_info.ds.ds_tid = tid;
4197  this_thr->th.th_set_nproc = 0;
4198  if (__kmp_tasking_mode != tskm_immediate_exec)
4199  // When tasking is possible, threads are not safe to reap until they are
4200  // done tasking; this will be set when tasking code is exited in wait
4201  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4202  else // no tasking --> always safe to reap
4203  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4204  this_thr->th.th_set_proc_bind = proc_bind_default;
4205 #if KMP_AFFINITY_SUPPORTED
4206  this_thr->th.th_new_place = this_thr->th.th_current_place;
4207 #endif
4208  this_thr->th.th_root = master->th.th_root;
4209 
4210  /* setup the thread's cache of the team structure */
4211  this_thr->th.th_team_nproc = team->t.t_nproc;
4212  this_thr->th.th_team_master = master;
4213  this_thr->th.th_team_serialized = team->t.t_serialized;
4214 
4215  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4216 
4217  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4218  tid, gtid, this_thr, this_thr->th.th_current_task));
4219 
4220  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4221  team, tid, TRUE);
4222 
4223  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4224  tid, gtid, this_thr, this_thr->th.th_current_task));
4225  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4226  // __kmp_initialize_team()?
4227 
4228  /* TODO no worksharing in speculative threads */
4229  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4230 
4231  this_thr->th.th_local.this_construct = 0;
4232 
4233  if (!this_thr->th.th_pri_common) {
4234  this_thr->th.th_pri_common =
4235  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4236  if (__kmp_storage_map) {
4237  __kmp_print_storage_map_gtid(
4238  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4239  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4240  }
4241  this_thr->th.th_pri_head = NULL;
4242  }
4243 
4244  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4245  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4246  // Make new thread's CG root same as primary thread's
4247  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4248  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4249  if (tmp) {
4250  // worker changes CG, need to check if old CG should be freed
4251  int i = tmp->cg_nthreads--;
4252  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4253  " on node %p of thread %p to %d\n",
4254  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4255  if (i == 1) {
4256  __kmp_free(tmp); // last thread left CG --> free it
4257  }
4258  }
4259  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4260  // Increment new thread's CG root's counter to add the new thread
4261  this_thr->th.th_cg_roots->cg_nthreads++;
4262  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4263  " node %p of thread %p to %d\n",
4264  this_thr, this_thr->th.th_cg_roots,
4265  this_thr->th.th_cg_roots->cg_root,
4266  this_thr->th.th_cg_roots->cg_nthreads));
4267  this_thr->th.th_current_task->td_icvs.thread_limit =
4268  this_thr->th.th_cg_roots->cg_thread_limit;
4269  }
4270 
4271  /* Initialize dynamic dispatch */
4272  {
4273  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4274  // Use team max_nproc since this will never change for the team.
4275  size_t disp_size =
4276  sizeof(dispatch_private_info_t) *
4277  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4278  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4279  team->t.t_max_nproc));
4280  KMP_ASSERT(dispatch);
4281  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4282  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4283 
4284  dispatch->th_disp_index = 0;
4285  dispatch->th_doacross_buf_idx = 0;
4286  if (!dispatch->th_disp_buffer) {
4287  dispatch->th_disp_buffer =
4288  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4289 
4290  if (__kmp_storage_map) {
4291  __kmp_print_storage_map_gtid(
4292  gtid, &dispatch->th_disp_buffer[0],
4293  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4294  ? 1
4295  : __kmp_dispatch_num_buffers],
4296  disp_size,
4297  "th_%d.th_dispatch.th_disp_buffer "
4298  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4299  gtid, team->t.t_id, gtid);
4300  }
4301  } else {
4302  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4303  }
4304 
4305  dispatch->th_dispatch_pr_current = 0;
4306  dispatch->th_dispatch_sh_current = 0;
4307 
4308  dispatch->th_deo_fcn = 0; /* ORDERED */
4309  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4310  }
4311 
4312  this_thr->th.th_next_pool = NULL;
4313 
4314  if (!this_thr->th.th_task_state_memo_stack) {
4315  size_t i;
4316  this_thr->th.th_task_state_memo_stack =
4317  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4318  this_thr->th.th_task_state_top = 0;
4319  this_thr->th.th_task_state_stack_sz = 4;
4320  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4321  ++i) // zero init the stack
4322  this_thr->th.th_task_state_memo_stack[i] = 0;
4323  }
4324 
4325  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4326  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4327 
4328  KMP_MB();
4329 }
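
// Note on contention groups (informal): th_cg_roots ties a worker to the
// kmp_cg_root_t of the primary thread whose thread_limit governs it. The
// reference counting above handles a pooled thread being re-used under a
// different primary thread: it leaves its old CG (freeing it if it was the
// last member), joins the primary thread's CG, and refreshes its ICV from
// cg_thread_limit, so afterwards
//
//   this_thr->th.th_cg_roots == master->th.th_cg_roots
//   this_thr->th.th_current_task->td_icvs.thread_limit
//       == master->th.th_cg_roots->cg_thread_limit
//
// i.e. the thread-limit ICV is enforced per contention group rather than per
// process.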
4330 
4331 /* Allocate a new thread for the requesting team. This is only called from
4332  within a forkjoin critical section. We first try to get an available thread
4333  from the thread pool; if none is available, we fork a new one, assuming we
4334  are able to create one. This should be assured, as the caller is expected
4335  to check for that first. */
4336 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4337  int new_tid) {
4338  kmp_team_t *serial_team;
4339  kmp_info_t *new_thr;
4340  int new_gtid;
4341 
4342  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4343  KMP_DEBUG_ASSERT(root && team);
4344 #if !KMP_NESTED_HOT_TEAMS
4345  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4346 #endif
4347  KMP_MB();
4348 
4349  /* first, try to get one from the thread pool */
4350  if (__kmp_thread_pool) {
4351  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4352  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4353  if (new_thr == __kmp_thread_pool_insert_pt) {
4354  __kmp_thread_pool_insert_pt = NULL;
4355  }
4356  TCW_4(new_thr->th.th_in_pool, FALSE);
4357  __kmp_suspend_initialize_thread(new_thr);
4358  __kmp_lock_suspend_mx(new_thr);
4359  if (new_thr->th.th_active_in_pool == TRUE) {
4360  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4361  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4362  new_thr->th.th_active_in_pool = FALSE;
4363  }
4364  __kmp_unlock_suspend_mx(new_thr);
4365 
4366  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4367  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4368  KMP_ASSERT(!new_thr->th.th_team);
4369  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4370 
4371  /* setup the thread structure */
4372  __kmp_initialize_info(new_thr, team, new_tid,
4373  new_thr->th.th_info.ds.ds_gtid);
4374  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4375 
4376  TCW_4(__kmp_nth, __kmp_nth + 1);
4377 
4378  new_thr->th.th_task_state = 0;
4379  new_thr->th.th_task_state_top = 0;
4380  new_thr->th.th_task_state_stack_sz = 4;
4381 
4382  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4383  // Make sure pool thread has transitioned to waiting on own thread struct
4384  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4385  // Thread activated in __kmp_allocate_team when increasing team size
4386  }
4387 
4388 #ifdef KMP_ADJUST_BLOCKTIME
4389  /* Adjust blocktime back to zero if necessary */
4390  /* Middle initialization might not have occurred yet */
4391  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4392  if (__kmp_nth > __kmp_avail_proc) {
4393  __kmp_zero_bt = TRUE;
4394  }
4395  }
4396 #endif /* KMP_ADJUST_BLOCKTIME */
4397 
4398 #if KMP_DEBUG
4399  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4400  // not equal KMP_BARRIER_PARENT_FLAG.
4401  int b;
4402  kmp_balign_t *balign = new_thr->th.th_bar;
4403  for (b = 0; b < bs_last_barrier; ++b)
4404  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4405 #endif
4406 
4407  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4408  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4409 
4410  KMP_MB();
4411  return new_thr;
4412  }
4413 
4414  /* no, we'll fork a new one */
4415  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4416  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4417 
4418 #if KMP_USE_MONITOR
4419  // If this is the first worker thread the RTL is creating, then also
4420  // launch the monitor thread. We try to do this as early as possible.
4421  if (!TCR_4(__kmp_init_monitor)) {
4422  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4423  if (!TCR_4(__kmp_init_monitor)) {
4424  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4425  TCW_4(__kmp_init_monitor, 1);
4426  __kmp_create_monitor(&__kmp_monitor);
4427  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4428 #if KMP_OS_WINDOWS
4429  // AC: wait until the monitor has started. This is a fix for CQ232808.
4430  // The reason is that if the library is loaded/unloaded in a loop with
4431  // small (parallel) work in between, then there is a high probability that
4432  // the monitor thread starts after the library shutdown. At shutdown it is
4433  // too late to cope with the problem, because when the primary thread is
4434  // in DllMain (process detach) the monitor has no chance to start (it is
4435  // blocked), and the primary thread has no means to inform the monitor that
4436  // the library has gone, because all the memory which the monitor can
4437  // access is going to be released/reset.
4438  while (TCR_4(__kmp_init_monitor) < 2) {
4439  KMP_YIELD(TRUE);
4440  }
4441  KF_TRACE(10, ("after monitor thread has started\n"));
4442 #endif
4443  }
4444  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4445  }
4446 #endif
4447 
4448  KMP_MB();
4449 
4450  {
4451  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4452  ? 1
4453  : __kmp_hidden_helper_threads_num + 1;
4454 
4455  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4456  ++new_gtid) {
4457  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4458  }
4459 
4460  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4461  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4462  }
4463  }
4464 
4465  /* allocate space for it. */
4466  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4467 
4468  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4469 
4470 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4471  // suppress race conditions detection on synchronization flags in debug mode
4472  // this helps to analyze library internals eliminating false positives
4473  __itt_suppress_mark_range(
4474  __itt_suppress_range, __itt_suppress_threading_errors,
4475  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4476  __itt_suppress_mark_range(
4477  __itt_suppress_range, __itt_suppress_threading_errors,
4478  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4479 #if KMP_OS_WINDOWS
4480  __itt_suppress_mark_range(
4481  __itt_suppress_range, __itt_suppress_threading_errors,
4482  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4483 #else
4484  __itt_suppress_mark_range(__itt_suppress_range,
4485  __itt_suppress_threading_errors,
4486  &new_thr->th.th_suspend_init_count,
4487  sizeof(new_thr->th.th_suspend_init_count));
4488 #endif
4489  // TODO: check if we need to also suppress b_arrived flags
4490  __itt_suppress_mark_range(__itt_suppress_range,
4491  __itt_suppress_threading_errors,
4492  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4493  sizeof(new_thr->th.th_bar[0].bb.b_go));
4494  __itt_suppress_mark_range(__itt_suppress_range,
4495  __itt_suppress_threading_errors,
4496  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4497  sizeof(new_thr->th.th_bar[1].bb.b_go));
4498  __itt_suppress_mark_range(__itt_suppress_range,
4499  __itt_suppress_threading_errors,
4500  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4501  sizeof(new_thr->th.th_bar[2].bb.b_go));
4502 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4503  if (__kmp_storage_map) {
4504  __kmp_print_thread_storage_map(new_thr, new_gtid);
4505  }
4506 
4507  // add the reserve serialized team, initialized from the team's primary thread
4508  {
4509  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4510  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4511  new_thr->th.th_serial_team = serial_team =
4512  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4513 #if OMPT_SUPPORT
4514  ompt_data_none, // root parallel id
4515 #endif
4516  proc_bind_default, &r_icvs,
4517  0 USE_NESTED_HOT_ARG(NULL));
4518  }
4519  KMP_ASSERT(serial_team);
4520  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4521  // for execution (it is unused for now).
4522  serial_team->t.t_threads[0] = new_thr;
4523  KF_TRACE(10,
4524  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4525  new_thr));
4526 
4527  /* setup the thread structures */
4528  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4529 
4530 #if USE_FAST_MEMORY
4531  __kmp_initialize_fast_memory(new_thr);
4532 #endif /* USE_FAST_MEMORY */
4533 
4534 #if KMP_USE_BGET
4535  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4536  __kmp_initialize_bget(new_thr);
4537 #endif
4538 
4539  __kmp_init_random(new_thr); // Initialize random number generator
4540 
4541  /* Initialize these only once when thread is grabbed for a team allocation */
4542  KA_TRACE(20,
4543  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4544  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4545 
4546  int b;
4547  kmp_balign_t *balign = new_thr->th.th_bar;
4548  for (b = 0; b < bs_last_barrier; ++b) {
4549  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4550  balign[b].bb.team = NULL;
4551  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4552  balign[b].bb.use_oncore_barrier = 0;
4553  }
4554 
4555  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4556  new_thr->th.th_sleep_loc_type = flag_unset;
4557 
4558  new_thr->th.th_spin_here = FALSE;
4559  new_thr->th.th_next_waiting = 0;
4560 #if KMP_OS_UNIX
4561  new_thr->th.th_blocking = false;
4562 #endif
4563 
4564 #if KMP_AFFINITY_SUPPORTED
4565  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4566  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4567  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4568  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4569 #endif
4570  new_thr->th.th_def_allocator = __kmp_def_allocator;
4571  new_thr->th.th_prev_level = 0;
4572  new_thr->th.th_prev_num_threads = 1;
4573 
4574  TCW_4(new_thr->th.th_in_pool, FALSE);
4575  new_thr->th.th_active_in_pool = FALSE;
4576  TCW_4(new_thr->th.th_active, TRUE);
4577 
4578  /* adjust the global counters */
4579  __kmp_all_nth++;
4580  __kmp_nth++;
4581 
4582  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4583  // numbers of procs, and method #2 (keyed API call) for higher numbers.
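// (Roughly: method #1 finds the gtid by searching the registered thread
// stacks in __kmp_threads for the current stack pointer, which is cheap only
// while few threads exist; method #2 reads it back from thread-specific
// storage via the keyed TLS API, which scales better but costs a call.)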
4584  if (__kmp_adjust_gtid_mode) {
4585  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4586  if (TCR_4(__kmp_gtid_mode) != 2) {
4587  TCW_4(__kmp_gtid_mode, 2);
4588  }
4589  } else {
4590  if (TCR_4(__kmp_gtid_mode) != 1) {
4591  TCW_4(__kmp_gtid_mode, 1);
4592  }
4593  }
4594  }
4595 
4596 #ifdef KMP_ADJUST_BLOCKTIME
4597  /* Adjust blocktime back to zero if necessary */
4598  /* Middle initialization might not have occurred yet */
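  /* (Presumably: once more threads exist than available processors,
     __kmp_zero_bt forces an effective blocktime of zero so that idle threads
     go to sleep immediately instead of spin-waiting on an oversubscribed
     machine.) */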
4599  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4600  if (__kmp_nth > __kmp_avail_proc) {
4601  __kmp_zero_bt = TRUE;
4602  }
4603  }
4604 #endif /* KMP_ADJUST_BLOCKTIME */
4605 
4606  /* actually fork it and create the new worker thread */
4607  KF_TRACE(
4608  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4609  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4610  KF_TRACE(10,
4611  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4612 
4613  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4614  new_gtid));
4615  KMP_MB();
4616  return new_thr;
4617 }
4618 
4619 /* Reinitialize team for reuse.
4620  The hot team code calls this routine at every fork barrier, so the EPCC
4621  barrier tests are extremely sensitive to changes in it, esp. writes to the
4622  team struct, which cause a cache invalidation in all threads.
4623  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4624 static void __kmp_reinitialize_team(kmp_team_t *team,
4625  kmp_internal_control_t *new_icvs,
4626  ident_t *loc) {
4627  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4628  team->t.t_threads[0], team));
4629  KMP_DEBUG_ASSERT(team && new_icvs);
4630  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4631  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4632 
4633  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4634  // Copy ICVs to the primary thread's implicit taskdata
4635  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4636  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4637 
4638  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4639  team->t.t_threads[0], team));
4640 }
4641 
4642 /* Initialize the team data structure.
4643  This assumes the t_threads and t_max_nproc are already set.
4644  Also, we don't touch the arguments */
4645 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4646  kmp_internal_control_t *new_icvs,
4647  ident_t *loc) {
4648  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4649 
4650  /* verify */
4651  KMP_DEBUG_ASSERT(team);
4652  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4653  KMP_DEBUG_ASSERT(team->t.t_threads);
4654  KMP_MB();
4655 
4656  team->t.t_master_tid = 0; /* not needed */
4657  /* team->t.t_master_bar; not needed */
4658  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4659  team->t.t_nproc = new_nproc;
4660 
4661  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4662  team->t.t_next_pool = NULL;
4663  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4664  * up hot team */
4665 
4666  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4667  team->t.t_invoke = NULL; /* not needed */
4668 
4669  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4670  team->t.t_sched.sched = new_icvs->sched.sched;
4671 
4672 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4673  team->t.t_fp_control_saved = FALSE; /* not needed */
4674  team->t.t_x87_fpu_control_word = 0; /* not needed */
4675  team->t.t_mxcsr = 0; /* not needed */
4676 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4677 
4678  team->t.t_construct = 0;
4679 
4680  team->t.t_ordered.dt.t_value = 0;
4681  team->t.t_master_active = FALSE;
4682 
4683 #ifdef KMP_DEBUG
4684  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4685 #endif
4686 #if KMP_OS_WINDOWS
4687  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4688 #endif
4689 
4690  team->t.t_control_stack_top = NULL;
4691 
4692  __kmp_reinitialize_team(team, new_icvs, loc);
4693 
4694  KMP_MB();
4695  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4696 }
4697 
4698 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4699 /* Sets the full mask for the thread and stores the old mask via *old_mask; no changes to structures. */
4700 static void
4701 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4702  if (KMP_AFFINITY_CAPABLE()) {
4703  int status;
4704  if (old_mask != NULL) {
4705  status = __kmp_get_system_affinity(old_mask, TRUE);
4706  int error = errno;
4707  if (status != 0) {
4708  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4709  __kmp_msg_null);
4710  }
4711  }
4712  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4713  }
4714 }
4715 #endif
4716 
4717 #if KMP_AFFINITY_SUPPORTED
4718 
4719 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4720 // It calculates the worker + primary thread's partition based upon the parent
4721 // thread's partition, and binds each worker to a thread in their partition.
4722 // The primary thread's partition should already include its current binding.
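// Illustrative example (assuming the primary is currently on place 2 of a
// parent partition [2,5] with four threads): proc_bind_close keeps the
// primary on place 2 and binds the workers to places 3, 4 and 5, while
// proc_bind_spread splits [2,5] into one single-place sub-partition per
// thread and binds each thread to the first place of its own sub-partition.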
4723 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4724  // Do not partition places for the hidden helper team
4725  if (KMP_HIDDEN_HELPER_TEAM(team))
4726  return;
4727  // Copy the primary thread's place partition to the team struct
4728  kmp_info_t *master_th = team->t.t_threads[0];
4729  KMP_DEBUG_ASSERT(master_th != NULL);
4730  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4731  int first_place = master_th->th.th_first_place;
4732  int last_place = master_th->th.th_last_place;
4733  int masters_place = master_th->th.th_current_place;
4734  team->t.t_first_place = first_place;
4735  team->t.t_last_place = last_place;
4736 
4737  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4738  "bound to place %d partition = [%d,%d]\n",
4739  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4740  team->t.t_id, masters_place, first_place, last_place));
4741 
4742  switch (proc_bind) {
4743 
4744  case proc_bind_default:
4745  // Serial teams might have the proc_bind policy set to proc_bind_default.
4746  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4747  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4748  break;
4749 
4750  case proc_bind_primary: {
4751  int f;
4752  int n_th = team->t.t_nproc;
4753  for (f = 1; f < n_th; f++) {
4754  kmp_info_t *th = team->t.t_threads[f];
4755  KMP_DEBUG_ASSERT(th != NULL);
4756  th->th.th_first_place = first_place;
4757  th->th.th_last_place = last_place;
4758  th->th.th_new_place = masters_place;
4759  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4760  team->t.t_display_affinity != 1) {
4761  team->t.t_display_affinity = 1;
4762  }
4763 
4764  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4765  "partition = [%d,%d]\n",
4766  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4767  f, masters_place, first_place, last_place));
4768  }
4769  } break;
4770 
4771  case proc_bind_close: {
4772  int f;
4773  int n_th = team->t.t_nproc;
4774  int n_places;
4775  if (first_place <= last_place) {
4776  n_places = last_place - first_place + 1;
4777  } else {
4778  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4779  }
4780  if (n_th <= n_places) {
4781  int place = masters_place;
4782  for (f = 1; f < n_th; f++) {
4783  kmp_info_t *th = team->t.t_threads[f];
4784  KMP_DEBUG_ASSERT(th != NULL);
4785 
4786  if (place == last_place) {
4787  place = first_place;
4788  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4789  place = 0;
4790  } else {
4791  place++;
4792  }
4793  th->th.th_first_place = first_place;
4794  th->th.th_last_place = last_place;
4795  th->th.th_new_place = place;
4796  if (__kmp_display_affinity && place != th->th.th_current_place &&
4797  team->t.t_display_affinity != 1) {
4798  team->t.t_display_affinity = 1;
4799  }
4800 
4801  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4802  "partition = [%d,%d]\n",
4803  __kmp_gtid_from_thread(team->t.t_threads[f]),
4804  team->t.t_id, f, place, first_place, last_place));
4805  }
4806  } else {
4807  int S, rem, gap, s_count;
4808  S = n_th / n_places;
4809  s_count = 0;
4810  rem = n_th - (S * n_places);
4811  gap = rem > 0 ? n_places / rem : n_places;
4812  int place = masters_place;
4813  int gap_ct = gap;
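  // Illustrative example: with n_th = 10 and n_places = 4 we get S = 2,
  // rem = 2 and gap = 2, so every second place receives one extra thread and
  // the resulting distribution is 3,2,3,2 threads per place, starting at the
  // primary thread's place.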
4814  for (f = 0; f < n_th; f++) {
4815  kmp_info_t *th = team->t.t_threads[f];
4816  KMP_DEBUG_ASSERT(th != NULL);
4817 
4818  th->th.th_first_place = first_place;
4819  th->th.th_last_place = last_place;
4820  th->th.th_new_place = place;
4821  if (__kmp_display_affinity && place != th->th.th_current_place &&
4822  team->t.t_display_affinity != 1) {
4823  team->t.t_display_affinity = 1;
4824  }
4825  s_count++;
4826 
4827  if ((s_count == S) && rem && (gap_ct == gap)) {
4828  // do nothing, add an extra thread to place on next iteration
4829  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4830  // we added an extra thread to this place; move to next place
4831  if (place == last_place) {
4832  place = first_place;
4833  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4834  place = 0;
4835  } else {
4836  place++;
4837  }
4838  s_count = 0;
4839  gap_ct = 1;
4840  rem--;
4841  } else if (s_count == S) { // place full; don't add extra
4842  if (place == last_place) {
4843  place = first_place;
4844  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4845  place = 0;
4846  } else {
4847  place++;
4848  }
4849  gap_ct++;
4850  s_count = 0;
4851  }
4852 
4853  KA_TRACE(100,
4854  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4855  "partition = [%d,%d]\n",
4856  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4857  th->th.th_new_place, first_place, last_place));
4858  }
4859  KMP_DEBUG_ASSERT(place == masters_place);
4860  }
4861  } break;
4862 
4863  case proc_bind_spread: {
4864  int f;
4865  int n_th = team->t.t_nproc;
4866  int n_places;
4867  int thidx;
4868  if (first_place <= last_place) {
4869  n_places = last_place - first_place + 1;
4870  } else {
4871  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4872  }
4873  if (n_th <= n_places) {
4874  int place = -1;
4875 
4876  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4877  int S = n_places / n_th;
4878  int s_count, rem, gap, gap_ct;
4879 
4880  place = masters_place;
4881  rem = n_places - n_th * S;
4882  gap = rem ? n_th / rem : 1;
4883  gap_ct = gap;
4884  thidx = n_th;
4885  if (update_master_only == 1)
4886  thidx = 1;
4887  for (f = 0; f < thidx; f++) {
4888  kmp_info_t *th = team->t.t_threads[f];
4889  KMP_DEBUG_ASSERT(th != NULL);
4890 
4891  th->th.th_first_place = place;
4892  th->th.th_new_place = place;
4893  if (__kmp_display_affinity && place != th->th.th_current_place &&
4894  team->t.t_display_affinity != 1) {
4895  team->t.t_display_affinity = 1;
4896  }
4897  s_count = 1;
4898  while (s_count < S) {
4899  if (place == last_place) {
4900  place = first_place;
4901  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4902  place = 0;
4903  } else {
4904  place++;
4905  }
4906  s_count++;
4907  }
4908  if (rem && (gap_ct == gap)) {
4909  if (place == last_place) {
4910  place = first_place;
4911  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4912  place = 0;
4913  } else {
4914  place++;
4915  }
4916  rem--;
4917  gap_ct = 0;
4918  }
4919  th->th.th_last_place = place;
4920  gap_ct++;
4921 
4922  if (place == last_place) {
4923  place = first_place;
4924  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4925  place = 0;
4926  } else {
4927  place++;
4928  }
4929 
4930  KA_TRACE(100,
4931  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4932  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4933  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4934  f, th->th.th_new_place, th->th.th_first_place,
4935  th->th.th_last_place, __kmp_affinity_num_masks));
4936  }
4937  } else {
4938  /* Having a uniform space of available computation places, we can create
4939  T partitions of roughly round(P/T) places each and put one thread into
4940  the first place of each partition. */
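  /* Illustrative example (assuming the primary is on place 0): with P = 8
     places and T = 3 threads, spacing = 3.0 and the partitions become
     [0,2], [3,5] and [6,7], each thread bound to the first place of its
     partition. */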
4941  double current = static_cast<double>(masters_place);
4942  double spacing =
4943  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4944  int first, last;
4945  kmp_info_t *th;
4946 
4947  thidx = n_th + 1;
4948  if (update_master_only == 1)
4949  thidx = 1;
4950  for (f = 0; f < thidx; f++) {
4951  first = static_cast<int>(current);
4952  last = static_cast<int>(current + spacing) - 1;
4953  KMP_DEBUG_ASSERT(last >= first);
4954  if (first >= n_places) {
4955  if (masters_place) {
4956  first -= n_places;
4957  last -= n_places;
4958  if (first == (masters_place + 1)) {
4959  KMP_DEBUG_ASSERT(f == n_th);
4960  first--;
4961  }
4962  if (last == masters_place) {
4963  KMP_DEBUG_ASSERT(f == (n_th - 1));
4964  last--;
4965  }
4966  } else {
4967  KMP_DEBUG_ASSERT(f == n_th);
4968  first = 0;
4969  last = 0;
4970  }
4971  }
4972  if (last >= n_places) {
4973  last = (n_places - 1);
4974  }
4975  place = first;
4976  current += spacing;
4977  if (f < n_th) {
4978  KMP_DEBUG_ASSERT(0 <= first);
4979  KMP_DEBUG_ASSERT(n_places > first);
4980  KMP_DEBUG_ASSERT(0 <= last);
4981  KMP_DEBUG_ASSERT(n_places > last);
4982  KMP_DEBUG_ASSERT(last_place >= first_place);
4983  th = team->t.t_threads[f];
4984  KMP_DEBUG_ASSERT(th);
4985  th->th.th_first_place = first;
4986  th->th.th_new_place = place;
4987  th->th.th_last_place = last;
4988  if (__kmp_display_affinity && place != th->th.th_current_place &&
4989  team->t.t_display_affinity != 1) {
4990  team->t.t_display_affinity = 1;
4991  }
4992  KA_TRACE(100,
4993  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4994  "partition = [%d,%d], spacing = %.4f\n",
4995  __kmp_gtid_from_thread(team->t.t_threads[f]),
4996  team->t.t_id, f, th->th.th_new_place,
4997  th->th.th_first_place, th->th.th_last_place, spacing));
4998  }
4999  }
5000  }
5001  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5002  } else {
5003  int S, rem, gap, s_count;
5004  S = n_th / n_places;
5005  s_count = 0;
5006  rem = n_th - (S * n_places);
5007  gap = rem > 0 ? n_places / rem : n_places;
5008  int place = masters_place;
5009  int gap_ct = gap;
5010  thidx = n_th;
5011  if (update_master_only == 1)
5012  thidx = 1;
5013  for (f = 0; f < thidx; f++) {
5014  kmp_info_t *th = team->t.t_threads[f];
5015  KMP_DEBUG_ASSERT(th != NULL);
5016 
5017  th->th.th_first_place = place;
5018  th->th.th_last_place = place;
5019  th->th.th_new_place = place;
5020  if (__kmp_display_affinity && place != th->th.th_current_place &&
5021  team->t.t_display_affinity != 1) {
5022  team->t.t_display_affinity = 1;
5023  }
5024  s_count++;
5025 
5026  if ((s_count == S) && rem && (gap_ct == gap)) {
5027  // do nothing, add an extra thread to place on next iteration
5028  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5029  // we added an extra thread to this place; move on to next place
5030  if (place == last_place) {
5031  place = first_place;
5032  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5033  place = 0;
5034  } else {
5035  place++;
5036  }
5037  s_count = 0;
5038  gap_ct = 1;
5039  rem--;
5040  } else if (s_count == S) { // place is full; don't add extra thread
5041  if (place == last_place) {
5042  place = first_place;
5043  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5044  place = 0;
5045  } else {
5046  place++;
5047  }
5048  gap_ct++;
5049  s_count = 0;
5050  }
5051 
5052  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5053  "partition = [%d,%d]\n",
5054  __kmp_gtid_from_thread(team->t.t_threads[f]),
5055  team->t.t_id, f, th->th.th_new_place,
5056  th->th.th_first_place, th->th.th_last_place));
5057  }
5058  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5059  }
5060  } break;
5061 
5062  default:
5063  break;
5064  }
5065 
5066  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5067 }
5068 
5069 #endif // KMP_AFFINITY_SUPPORTED
5070 
5071 /* Allocate a new team data structure to use; take one off of the free pool
5072  if available. */
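/* Broadly, three paths are taken below: (1) reuse the root's "hot" team,
   resizing it if the requested thread count changed; (2) otherwise take a
   sufficiently large team from the team free pool; (3) failing that, allocate
   and initialize a brand-new team. */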
5073 kmp_team_t *
5074 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5075 #if OMPT_SUPPORT
5076  ompt_data_t ompt_parallel_data,
5077 #endif
5078  kmp_proc_bind_t new_proc_bind,
5079  kmp_internal_control_t *new_icvs,
5080  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5081  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5082  int f;
5083  kmp_team_t *team;
5084  int use_hot_team = !root->r.r_active;
5085  int level = 0;
5086  int do_place_partition = 1;
5087 
5088  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5089  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5090  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5091  KMP_MB();
5092 
5093 #if KMP_NESTED_HOT_TEAMS
5094  kmp_hot_team_ptr_t *hot_teams;
5095  if (master) {
5096  team = master->th.th_team;
5097  level = team->t.t_active_level;
5098  if (master->th.th_teams_microtask) { // in teams construct?
5099  if (master->th.th_teams_size.nteams > 1 &&
5100  ( // #teams > 1
5101  team->t.t_pkfn ==
5102  (microtask_t)__kmp_teams_master || // inner fork of the teams
5103  master->th.th_teams_level <
5104  team->t.t_level)) { // or nested parallel inside the teams
5105  ++level; // do not increment if #teams==1 or for the outer fork of the
5106  // teams; increment otherwise
5107  }
5108  // Do not perform the place partition for the inner fork of the teams;
5109  // wait until a nested parallel region is encountered inside the construct
5110  if ((master->th.th_teams_size.nteams == 1 &&
5111  master->th.th_teams_level >= team->t.t_level) ||
5112  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5113  do_place_partition = 0;
5114  }
5115  hot_teams = master->th.th_hot_teams;
5116  if (level < __kmp_hot_teams_max_level && hot_teams &&
5117  hot_teams[level].hot_team) {
5118  // hot team has already been allocated for given level
5119  use_hot_team = 1;
5120  } else {
5121  use_hot_team = 0;
5122  }
5123  } else {
5124  // check we won't access uninitialized hot_teams, just in case
5125  KMP_DEBUG_ASSERT(new_nproc == 1);
5126  }
5127 #endif
5128  // Optimization to use a "hot" team
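  // (The hot team is the team cached on the root -- or, with nested hot
  // teams, per nesting level -- so that consecutive parallel regions can
  // reuse threads and team storage instead of reallocating them each time.)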
5129  if (use_hot_team && new_nproc > 1) {
5130  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5131 #if KMP_NESTED_HOT_TEAMS
5132  team = hot_teams[level].hot_team;
5133 #else
5134  team = root->r.r_hot_team;
5135 #endif
5136 #if KMP_DEBUG
5137  if (__kmp_tasking_mode != tskm_immediate_exec) {
5138  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5139  "task_team[1] = %p before reinit\n",
5140  team->t.t_task_team[0], team->t.t_task_team[1]));
5141  }
5142 #endif
5143 
5144  if (team->t.t_nproc != new_nproc &&
5145  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5146  // Distributed barrier may need a resize
5147  int old_nthr = team->t.t_nproc;
5148  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5149  }
5150 
5151  // If not doing the place partition, then reset the team's proc bind
5152  // to indicate that partitioning of all threads still needs to take place
5153  if (do_place_partition == 0)
5154  team->t.t_proc_bind = proc_bind_default;
5155  // Has the number of threads changed?
5156  /* Let's assume the most common case is that the number of threads is
5157  unchanged, and put that case first. */
5158  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5159  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5160  // This case can mean that omp_set_num_threads() was called and the hot
5161  // team size was already reduced, so we check the special flag
5162  if (team->t.t_size_changed == -1) {
5163  team->t.t_size_changed = 1;
5164  } else {
5165  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5166  }
5167 
5168  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5169  kmp_r_sched_t new_sched = new_icvs->sched;
5170  // set primary thread's schedule as new run-time schedule
5171  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5172 
5173  __kmp_reinitialize_team(team, new_icvs,
5174  root->r.r_uber_thread->th.th_ident);
5175 
5176  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5177  team->t.t_threads[0], team));
5178  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5179 
5180 #if KMP_AFFINITY_SUPPORTED
5181  if ((team->t.t_size_changed == 0) &&
5182  (team->t.t_proc_bind == new_proc_bind)) {
5183  if (new_proc_bind == proc_bind_spread) {
5184  if (do_place_partition) {
5185  // add flag to update only master for spread
5186  __kmp_partition_places(team, 1);
5187  }
5188  }
5189  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5190  "proc_bind = %d, partition = [%d,%d]\n",
5191  team->t.t_id, new_proc_bind, team->t.t_first_place,
5192  team->t.t_last_place));
5193  } else {
5194  if (do_place_partition) {
5195  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5196  __kmp_partition_places(team);
5197  }
5198  }
5199 #else
5200  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5201 #endif /* KMP_AFFINITY_SUPPORTED */
5202  } else if (team->t.t_nproc > new_nproc) {
5203  KA_TRACE(20,
5204  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5205  new_nproc));
5206 
5207  team->t.t_size_changed = 1;
5208  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5209  // Barrier size already reduced earlier in this function
5210  // Activate team threads via th_used_in_team
5211  __kmp_add_threads_to_team(team, new_nproc);
5212  }
5213 #if KMP_NESTED_HOT_TEAMS
5214  if (__kmp_hot_teams_mode == 0) {
5215  // AC: saved number of threads should correspond to team's value in this
5216  // mode, can be bigger in mode 1, when hot team has threads in reserve
5217  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5218  hot_teams[level].hot_team_nth = new_nproc;
5219 #endif // KMP_NESTED_HOT_TEAMS
5220  /* release the extra threads we don't need any more */
5221  for (f = new_nproc; f < team->t.t_nproc; f++) {
5222  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5223  if (__kmp_tasking_mode != tskm_immediate_exec) {
5224  // When decreasing team size, threads no longer in the team should
5225  // unref task team.
5226  team->t.t_threads[f]->th.th_task_team = NULL;
5227  }
5228  __kmp_free_thread(team->t.t_threads[f]);
5229  team->t.t_threads[f] = NULL;
5230  }
5231 #if KMP_NESTED_HOT_TEAMS
5232  } // (__kmp_hot_teams_mode == 0)
5233  else {
5234  // When keeping extra threads in team, switch threads to wait on own
5235  // b_go flag
5236  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5237  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5238  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5239  for (int b = 0; b < bs_last_barrier; ++b) {
5240  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5241  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5242  }
5243  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5244  }
5245  }
5246  }
5247 #endif // KMP_NESTED_HOT_TEAMS
5248  team->t.t_nproc = new_nproc;
5249  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5250  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5251  __kmp_reinitialize_team(team, new_icvs,
5252  root->r.r_uber_thread->th.th_ident);
5253 
5254  // Update remaining threads
5255  for (f = 0; f < new_nproc; ++f) {
5256  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5257  }
5258 
5259  // restore the current task state of the primary thread: should be the
5260  // implicit task
5261  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5262  team->t.t_threads[0], team));
5263 
5264  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5265 
5266 #ifdef KMP_DEBUG
5267  for (f = 0; f < team->t.t_nproc; f++) {
5268  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5269  team->t.t_threads[f]->th.th_team_nproc ==
5270  team->t.t_nproc);
5271  }
5272 #endif
5273 
5274  if (do_place_partition) {
5275  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5276 #if KMP_AFFINITY_SUPPORTED
5277  __kmp_partition_places(team);
5278 #endif
5279  }
5280  } else { // team->t.t_nproc < new_nproc
5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5282  kmp_affin_mask_t *old_mask;
5283  if (KMP_AFFINITY_CAPABLE()) {
5284  KMP_CPU_ALLOC(old_mask);
5285  }
5286 #endif
5287 
5288  KA_TRACE(20,
5289  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5290  new_nproc));
5291  int old_nproc = team->t.t_nproc; // save old value and use to update only
5292  team->t.t_size_changed = 1;
5293 
5294 #if KMP_NESTED_HOT_TEAMS
5295  int avail_threads = hot_teams[level].hot_team_nth;
5296  if (new_nproc < avail_threads)
5297  avail_threads = new_nproc;
5298  kmp_info_t **other_threads = team->t.t_threads;
5299  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5300  // Adjust barrier data of reserved threads (if any) of the team
5301  // Other data will be set in __kmp_initialize_info() below.
5302  int b;
5303  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5304  for (b = 0; b < bs_last_barrier; ++b) {
5305  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5306  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5307 #if USE_DEBUGGER
5308  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5309 #endif
5310  }
5311  }
5312  if (hot_teams[level].hot_team_nth >= new_nproc) {
5313  // we have all needed threads in reserve, no need to allocate any;
5314  // this is only possible in mode 1, cannot have reserved threads in mode 0
5315  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5316  team->t.t_nproc = new_nproc; // just get reserved threads involved
5317  } else {
5318  // We may have some threads in reserve, but not enough;
5319  // get reserved threads involved if any.
5320  team->t.t_nproc = hot_teams[level].hot_team_nth;
5321  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5322 #endif // KMP_NESTED_HOT_TEAMS
5323  if (team->t.t_max_nproc < new_nproc) {
5324  /* reallocate larger arrays */
5325  __kmp_reallocate_team_arrays(team, new_nproc);
5326  __kmp_reinitialize_team(team, new_icvs, NULL);
5327  }
5328 
5329 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5330  /* Temporarily set full mask for primary thread before creation of
5331  workers. The reason is that workers inherit the affinity from the
5332  primary thread, so if a lot of workers are created on the single
5333  core quickly, they don't get a chance to set their own affinity for
5334  a long time. */
5335  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5336 #endif
5337 
5338  /* allocate new threads for the hot team */
5339  for (f = team->t.t_nproc; f < new_nproc; f++) {
5340  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5341  KMP_DEBUG_ASSERT(new_worker);
5342  team->t.t_threads[f] = new_worker;
5343 
5344  KA_TRACE(20,
5345  ("__kmp_allocate_team: team %d init T#%d arrived: "
5346  "join=%llu, plain=%llu\n",
5347  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5348  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5349  team->t.t_bar[bs_plain_barrier].b_arrived));
5350 
5351  { // Initialize barrier data for new threads.
5352  int b;
5353  kmp_balign_t *balign = new_worker->th.th_bar;
5354  for (b = 0; b < bs_last_barrier; ++b) {
5355  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5356  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5357  KMP_BARRIER_PARENT_FLAG);
5358 #if USE_DEBUGGER
5359  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5360 #endif
5361  }
5362  }
5363  }
5364 
5365 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5366  if (KMP_AFFINITY_CAPABLE()) {
5367  /* Restore initial primary thread's affinity mask */
5368  __kmp_set_system_affinity(old_mask, TRUE);
5369  KMP_CPU_FREE(old_mask);
5370  }
5371 #endif
5372 #if KMP_NESTED_HOT_TEAMS
5373  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5374 #endif // KMP_NESTED_HOT_TEAMS
5375  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5376  // Barrier size already increased earlier in this function
5377  // Activate team threads via th_used_in_team
5378  __kmp_add_threads_to_team(team, new_nproc);
5379  }
5380  /* make sure everyone is synchronized */
5381  // new threads below
5382  __kmp_initialize_team(team, new_nproc, new_icvs,
5383  root->r.r_uber_thread->th.th_ident);
5384 
5385  /* reinitialize the threads */
5386  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5387  for (f = 0; f < team->t.t_nproc; ++f)
5388  __kmp_initialize_info(team->t.t_threads[f], team, f,
5389  __kmp_gtid_from_tid(f, team));
5390 
5391  if (level) { // set th_task_state for new threads in nested hot team
5392  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5393  // only need to set the th_task_state for the new threads. th_task_state
5394  // for primary thread will not be accurate until after this in
5395  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5396  // get the correct value.
5397  for (f = old_nproc; f < team->t.t_nproc; ++f)
5398  team->t.t_threads[f]->th.th_task_state =
5399  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5400  } else { // set th_task_state for new threads in non-nested hot team
5401  // copy primary thread's state
5402  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5403  for (f = old_nproc; f < team->t.t_nproc; ++f)
5404  team->t.t_threads[f]->th.th_task_state = old_state;
5405  }
5406 
5407 #ifdef KMP_DEBUG
5408  for (f = 0; f < team->t.t_nproc; ++f) {
5409  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5410  team->t.t_threads[f]->th.th_team_nproc ==
5411  team->t.t_nproc);
5412  }
5413 #endif
5414 
5415  if (do_place_partition) {
5416  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5417 #if KMP_AFFINITY_SUPPORTED
5418  __kmp_partition_places(team);
5419 #endif
5420  }
5421  } // Check changes in number of threads
5422 
5423  kmp_info_t *master = team->t.t_threads[0];
5424  if (master->th.th_teams_microtask) {
5425  for (f = 1; f < new_nproc; ++f) {
5426  // propagate teams construct specific info to workers
5427  kmp_info_t *thr = team->t.t_threads[f];
5428  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5429  thr->th.th_teams_level = master->th.th_teams_level;
5430  thr->th.th_teams_size = master->th.th_teams_size;
5431  }
5432  }
5433 #if KMP_NESTED_HOT_TEAMS
5434  if (level) {
5435  // Sync barrier state for nested hot teams, not needed for outermost hot
5436  // team.
5437  for (f = 1; f < new_nproc; ++f) {
5438  kmp_info_t *thr = team->t.t_threads[f];
5439  int b;
5440  kmp_balign_t *balign = thr->th.th_bar;
5441  for (b = 0; b < bs_last_barrier; ++b) {
5442  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5443  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5444 #if USE_DEBUGGER
5445  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5446 #endif
5447  }
5448  }
5449  }
5450 #endif // KMP_NESTED_HOT_TEAMS
5451 
5452  /* reallocate space for arguments if necessary */
5453  __kmp_alloc_argv_entries(argc, team, TRUE);
5454  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5455  // The hot team re-uses the previous task team,
5456  // if untouched during the previous release->gather phase.
5457 
5458  KF_TRACE(10, (" hot_team = %p\n", team));
5459 
5460 #if KMP_DEBUG
5461  if (__kmp_tasking_mode != tskm_immediate_exec) {
5462  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5463  "task_team[1] = %p after reinit\n",
5464  team->t.t_task_team[0], team->t.t_task_team[1]));
5465  }
5466 #endif
5467 
5468 #if OMPT_SUPPORT
5469  __ompt_team_assign_id(team, ompt_parallel_data);
5470 #endif
5471 
5472  KMP_MB();
5473 
5474  return team;
5475  }
5476 
5477  /* next, let's try to take one from the team pool */
5478  KMP_MB();
5479  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5480  /* TODO: consider resizing undersized teams instead of reaping them, now
5481  that we have a resizing mechanism */
5482  if (team->t.t_max_nproc >= max_nproc) {
5483  /* take this team from the team pool */
5484  __kmp_team_pool = team->t.t_next_pool;
5485 
5486  if (max_nproc > 1 &&
5487  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5488  if (!team->t.b) { // Allocate barrier structure
5489  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5490  }
5491  }
5492 
5493  /* setup the team for fresh use */
5494  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5495 
5496  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5497  "task_team[1] %p to NULL\n",
5498  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5499  team->t.t_task_team[0] = NULL;
5500  team->t.t_task_team[1] = NULL;
5501 
5502  /* reallocate space for arguments if necessary */
5503  __kmp_alloc_argv_entries(argc, team, TRUE);
5504  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5505 
5506  KA_TRACE(
5507  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5508  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5509  { // Initialize barrier data.
5510  int b;
5511  for (b = 0; b < bs_last_barrier; ++b) {
5512  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5513 #if USE_DEBUGGER
5514  team->t.t_bar[b].b_master_arrived = 0;
5515  team->t.t_bar[b].b_team_arrived = 0;
5516 #endif
5517  }
5518  }
5519 
5520  team->t.t_proc_bind = new_proc_bind;
5521 
5522  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5523  team->t.t_id));
5524 
5525 #if OMPT_SUPPORT
5526  __ompt_team_assign_id(team, ompt_parallel_data);
5527 #endif
5528 
5529  KMP_MB();
5530 
5531  return team;
5532  }
5533 
5534  /* reap team if it is too small, then loop back and check the next one */
5535  // not sure if this is wise, but it will be redone during the hot-teams
5536  // rewrite.
5537  /* TODO: Use technique to find the right size hot-team, don't reap them */
5538  team = __kmp_reap_team(team);
5539  __kmp_team_pool = team;
5540  }
5541 
5542  /* nothing available in the pool, no matter, make a new team! */
5543  KMP_MB();
5544  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5545 
5546  /* and set it up */
5547  team->t.t_max_nproc = max_nproc;
5548  if (max_nproc > 1 &&
5549  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5550  // Allocate barrier structure
5551  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5552  }
5553 
5554  /* NOTE well, for some reason allocating one big buffer and dividing it up
5555  seems to really hurt performance a lot on the P4, so let's not use this */
5556  __kmp_allocate_team_arrays(team, max_nproc);
5557 
5558  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5559  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5560 
5561  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5562  "%p to NULL\n",
5563  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5564  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5565  // memory, no need to duplicate
5566  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5567  // memory, no need to duplicate
5568 
5569  if (__kmp_storage_map) {
5570  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5571  }
5572 
5573  /* allocate space for arguments */
5574  __kmp_alloc_argv_entries(argc, team, FALSE);
5575  team->t.t_argc = argc;
5576 
5577  KA_TRACE(20,
5578  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5579  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5580  { // Initialize barrier data.
5581  int b;
5582  for (b = 0; b < bs_last_barrier; ++b) {
5583  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5584 #if USE_DEBUGGER
5585  team->t.t_bar[b].b_master_arrived = 0;
5586  team->t.t_bar[b].b_team_arrived = 0;
5587 #endif
5588  }
5589  }
5590 
5591  team->t.t_proc_bind = new_proc_bind;
5592 
5593 #if OMPT_SUPPORT
5594  __ompt_team_assign_id(team, ompt_parallel_data);
5595  team->t.ompt_serialized_team_info = NULL;
5596 #endif
5597 
5598  KMP_MB();
5599 
5600  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5601  team->t.t_id));
5602 
5603  return team;
5604 }
5605 
5606 /* TODO implement hot-teams at all levels */
5607 /* TODO implement lazy thread release on demand (disband request) */
5608 
5609 /* free the team. return it to the team pool. release all the threads
5610  * associated with it */
5611 void __kmp_free_team(kmp_root_t *root,
5612  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5613  int f;
5614  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5615  team->t.t_id));
5616 
5617  /* verify state */
5618  KMP_DEBUG_ASSERT(root);
5619  KMP_DEBUG_ASSERT(team);
5620  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5621  KMP_DEBUG_ASSERT(team->t.t_threads);
5622 
5623  int use_hot_team = team == root->r.r_hot_team;
5624 #if KMP_NESTED_HOT_TEAMS
5625  int level;
5626  if (master) {
5627  level = team->t.t_active_level - 1;
5628  if (master->th.th_teams_microtask) { // in teams construct?
5629  if (master->th.th_teams_size.nteams > 1) {
5630  ++level; // level was not increased in teams construct for
5631  // team_of_masters
5632  }
5633  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5634  master->th.th_teams_level == team->t.t_level) {
5635  ++level; // level was not increased in teams construct for
5636  // team_of_workers before the parallel
5637  } // team->t.t_level will be increased inside parallel
5638  }
5639 #if KMP_DEBUG
5640  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5641 #endif
5642  if (level < __kmp_hot_teams_max_level) {
5643  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5644  use_hot_team = 1;
5645  }
5646  }
5647 #endif // KMP_NESTED_HOT_TEAMS
5648 
5649  /* team is done working */
5650  TCW_SYNC_PTR(team->t.t_pkfn,
5651  NULL); // Important for Debugging Support Library.
5652 #if KMP_OS_WINDOWS
5653  team->t.t_copyin_counter = 0; // init counter for possible reuse
5654 #endif
5655  // Do not reset pointer to parent team to NULL for hot teams.
5656 
5657  /* if we are non-hot team, release our threads */
5658  if (!use_hot_team) {
5659  if (__kmp_tasking_mode != tskm_immediate_exec) {
5660  // Wait for threads to reach reapable state
5661  for (f = 1; f < team->t.t_nproc; ++f) {
5662  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5663  kmp_info_t *th = team->t.t_threads[f];
5664  volatile kmp_uint32 *state = &th->th.th_reap_state;
5665  while (*state != KMP_SAFE_TO_REAP) {
5666 #if KMP_OS_WINDOWS
5667  // On Windows a thread can be killed at any time, check this
5668  DWORD ecode;
5669  if (!__kmp_is_thread_alive(th, &ecode)) {
5670  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5671  break;
5672  }
5673 #endif
5674  // first check if thread is sleeping
5675  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5676  if (fl.is_sleeping())
5677  fl.resume(__kmp_gtid_from_thread(th));
5678  KMP_CPU_PAUSE();
5679  }
5680  }
5681 
5682  // Delete task teams
5683  int tt_idx;
5684  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5685  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5686  if (task_team != NULL) {
5687  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5688  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5689  team->t.t_threads[f]->th.th_task_team = NULL;
5690  }
5691  KA_TRACE(
5692  20,
5693  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5694  __kmp_get_gtid(), task_team, team->t.t_id));
5695 #if KMP_NESTED_HOT_TEAMS
5696  __kmp_free_task_team(master, task_team);
5697 #endif
5698  team->t.t_task_team[tt_idx] = NULL;
5699  }
5700  }
5701  }
5702 
5703  // Reset pointer to parent team only for non-hot teams.
5704  team->t.t_parent = NULL;
5705  team->t.t_level = 0;
5706  team->t.t_active_level = 0;
5707 
5708  /* free the worker threads */
5709  for (f = 1; f < team->t.t_nproc; ++f) {
5710  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5711  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5712  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5713  1, 2);
5714  }
5715  __kmp_free_thread(team->t.t_threads[f]);
5716  }
5717 
5718  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5719  if (team->t.b) {
5720  // wake up thread at old location
5721  team->t.b->go_release();
5722  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5723  for (f = 1; f < team->t.t_nproc; ++f) {
5724  if (team->t.b->sleep[f].sleep) {
5725  __kmp_atomic_resume_64(
5726  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5727  (kmp_atomic_flag_64<> *)NULL);
5728  }
5729  }
5730  }
5731  // Wait for threads to be removed from team
5732  for (int f = 1; f < team->t.t_nproc; ++f) {
5733  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5734  KMP_CPU_PAUSE();
5735  }
5736  }
5737  }
5738 
5739  for (f = 1; f < team->t.t_nproc; ++f) {
5740  team->t.t_threads[f] = NULL;
5741  }
5742 
5743  if (team->t.t_max_nproc > 1 &&
5744  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5745  distributedBarrier::deallocate(team->t.b);
5746  team->t.b = NULL;
5747  }
5748  /* put the team back in the team pool */
5749  /* TODO limit size of team pool, call reap_team if pool too large */
5750  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5751  __kmp_team_pool = (volatile kmp_team_t *)team;
5752  } else { // Check if team was created for primary threads in teams construct
5753  // See if first worker is a CG root
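    // (A CG root anchors a contention group: the set of threads whose total
    // count is bounded by the thread-limit ICV kept in cg_thread_limit.)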
5754  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5755  team->t.t_threads[1]->th.th_cg_roots);
5756  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5757  // Clean up the CG root nodes on workers so that this team can be re-used
5758  for (f = 1; f < team->t.t_nproc; ++f) {
5759  kmp_info_t *thr = team->t.t_threads[f];
5760  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5761  thr->th.th_cg_roots->cg_root == thr);
5762  // Pop current CG root off list
5763  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5764  thr->th.th_cg_roots = tmp->up;
5765  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5766  " up to node %p. cg_nthreads was %d\n",
5767  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5768  int i = tmp->cg_nthreads--;
5769  if (i == 1) {
5770  __kmp_free(tmp); // free CG if we are the last thread in it
5771  }
5772  // Restore current task's thread_limit from CG root
5773  if (thr->th.th_cg_roots)
5774  thr->th.th_current_task->td_icvs.thread_limit =
5775  thr->th.th_cg_roots->cg_thread_limit;
5776  }
5777  }
5778  }
5779 
5780  KMP_MB();
5781 }
5782 
5783 /* reap the team. destroy it, reclaim all its resources and free its memory */
5784 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5785  kmp_team_t *next_pool = team->t.t_next_pool;
5786 
5787  KMP_DEBUG_ASSERT(team);
5788  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5789  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5790  KMP_DEBUG_ASSERT(team->t.t_threads);
5791  KMP_DEBUG_ASSERT(team->t.t_argv);
5792 
5793  /* TODO clean the threads that are a part of this? */
5794 
5795  /* free stuff */
5796  __kmp_free_team_arrays(team);
5797  if (team->t.t_argv != &team->t.t_inline_argv[0])
5798  __kmp_free((void *)team->t.t_argv);
5799  __kmp_free(team);
5800 
5801  KMP_MB();
5802  return next_pool;
5803 }
5804 
5805 // Free the thread. Don't reap it, just place it on the pool of available
5806 // threads.
5807 //
5808 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5809 // binding for the affinity mechanism to be useful.
5810 //
5811 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5812 // However, we want to avoid a potential performance problem by always
5813 // scanning through the list to find the correct point at which to insert
5814 // the thread (potential N**2 behavior). To do this we keep track of the
5815 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5816 // With single-level parallelism, threads will always be added to the tail
5817 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5818 // parallelism, all bets are off and we may need to scan through the entire
5819 // free list.
5820 //
5821 // This change also has a potentially large performance benefit, for some
5822 // applications. Previously, as threads were freed from the hot team, they
5823 // would be placed back on the free list in inverse order. If the hot team
5824 // grew back to its original size, then the freed threads would be placed
5825 // back on the hot team in reverse order. This could cause bad cache
5826 // locality problems on programs where the size of the hot team regularly
5827 // grew and shrunk.
5828 //
5829 // Now, for single-level parallelism, the OMP tid is always == gtid.
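// Illustrative example: if the pool holds gtids {2, 3, 5, 7} with the insert
// point cached at 5, freeing gtid 6 starts the scan at 5's link, splices the
// thread in between 5 and 7, and caches 6 as the new insert point; freeing
// gtid 4 instead would discard the cached point (5 > 4) and re-scan from the
// head of the pool.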
5830 void __kmp_free_thread(kmp_info_t *this_th) {
5831  int gtid;
5832  kmp_info_t **scan;
5833 
5834  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5835  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5836 
5837  KMP_DEBUG_ASSERT(this_th);
5838 
5839  // When moving the thread to the pool, switch it to wait on its own b_go
5840  // flag and leave it with an uninitialized (NULL) team.
5841  int b;
5842  kmp_balign_t *balign = this_th->th.th_bar;
5843  for (b = 0; b < bs_last_barrier; ++b) {
5844  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5845  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5846  balign[b].bb.team = NULL;
5847  balign[b].bb.leaf_kids = 0;
5848  }
5849  this_th->th.th_task_state = 0;
5850  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5851 
5852  /* put thread back on the free pool */
5853  TCW_PTR(this_th->th.th_team, NULL);
5854  TCW_PTR(this_th->th.th_root, NULL);
5855  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5856 
5857  while (this_th->th.th_cg_roots) {
5858  this_th->th.th_cg_roots->cg_nthreads--;
5859  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5860  " %p of thread %p to %d\n",
5861  this_th, this_th->th.th_cg_roots,
5862  this_th->th.th_cg_roots->cg_root,
5863  this_th->th.th_cg_roots->cg_nthreads));
5864  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5865  if (tmp->cg_root == this_th) { // Thread is a cg_root
5866  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5867  KA_TRACE(
5868  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5869  this_th->th.th_cg_roots = tmp->up;
5870  __kmp_free(tmp);
5871  } else { // Worker thread
5872  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5873  __kmp_free(tmp);
5874  }
5875  this_th->th.th_cg_roots = NULL;
5876  break;
5877  }
5878  }
5879 
5880  /* If the implicit task assigned to this thread can be used by other threads
5881  * -> multiple threads can share the data and try to free the task at
5882  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5883  * with higher probability when the hot team is disabled but can occur even
5884  * when the hot team is enabled */
5885  __kmp_free_implicit_task(this_th);
5886  this_th->th.th_current_task = NULL;
5887 
5888  // If the __kmp_thread_pool_insert_pt is already past the new insert
5889  // point, then we need to re-scan the entire list.
5890  gtid = this_th->th.th_info.ds.ds_gtid;
5891  if (__kmp_thread_pool_insert_pt != NULL) {
5892  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5893  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5894  __kmp_thread_pool_insert_pt = NULL;
5895  }
5896  }
5897 
5898  // Scan down the list to find the place to insert the thread.
5899  // scan is the address of a link in the list, possibly the address of
5900  // __kmp_thread_pool itself.
5901  //
5902  // In the absence of nested parallelism, the for loop will have 0 iterations.
5903  if (__kmp_thread_pool_insert_pt != NULL) {
5904  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5905  } else {
5906  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5907  }
5908  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5909  scan = &((*scan)->th.th_next_pool))
5910  ;
5911 
5912  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5913  // to its address.
5914  TCW_PTR(this_th->th.th_next_pool, *scan);
5915  __kmp_thread_pool_insert_pt = *scan = this_th;
5916  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5917  (this_th->th.th_info.ds.ds_gtid <
5918  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5919  TCW_4(this_th->th.th_in_pool, TRUE);
5920  __kmp_suspend_initialize_thread(this_th);
5921  __kmp_lock_suspend_mx(this_th);
5922  if (this_th->th.th_active == TRUE) {
5923  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5924  this_th->th.th_active_in_pool = TRUE;
5925  }
5926 #if KMP_DEBUG
5927  else {
5928  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5929  }
5930 #endif
5931  __kmp_unlock_suspend_mx(this_th);
5932 
5933  TCW_4(__kmp_nth, __kmp_nth - 1);
5934 
5935 #ifdef KMP_ADJUST_BLOCKTIME
5936  /* Adjust blocktime back to user setting or default if necessary */
5937  /* Middle initialization might never have occurred */
5938  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5939  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5940  if (__kmp_nth <= __kmp_avail_proc) {
5941  __kmp_zero_bt = FALSE;
5942  }
5943  }
5944 #endif /* KMP_ADJUST_BLOCKTIME */
5945 
5946  KMP_MB();
5947 }
5948 
5949 /* ------------------------------------------------------------------------ */
5950 
5951 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5952 #if OMP_PROFILING_SUPPORT
5953  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5954  // TODO: add a configuration option for time granularity
5955  if (ProfileTraceFile)
5956  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5957 #endif
5958 
5959  int gtid = this_thr->th.th_info.ds.ds_gtid;
5960  /* void *stack_data;*/
5961  kmp_team_t **volatile pteam;
5962 
5963  KMP_MB();
5964  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5965 
5966  if (__kmp_env_consistency_check) {
5967  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5968  }
5969 
5970 #if OMPD_SUPPORT
5971  if (ompd_state & OMPD_ENABLE_BP)
5972  ompd_bp_thread_begin();
5973 #endif
5974 
5975 #if OMPT_SUPPORT
5976  ompt_data_t *thread_data = nullptr;
5977  if (ompt_enabled.enabled) {
5978  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5979  *thread_data = ompt_data_none;
5980 
5981  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5982  this_thr->th.ompt_thread_info.wait_id = 0;
5983  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5984  this_thr->th.ompt_thread_info.parallel_flags = 0;
5985  if (ompt_enabled.ompt_callback_thread_begin) {
5986  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5987  ompt_thread_worker, thread_data);
5988  }
5989  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5990  }
5991 #endif
5992 
5993  /* This is the place where threads wait for work */
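  /* (Worker lifecycle, roughly: sleep in the fork barrier until the primary
     releases the team, run the team's microtask, pass through the join
     barrier, then loop back and wait again -- until __kmp_global.g.g_done
     signals library shutdown.) */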
5994  while (!TCR_4(__kmp_global.g.g_done)) {
5995  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5996  KMP_MB();
5997 
5998  /* wait for work to do */
5999  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6000 
6001  /* No tid yet since not part of a team */
6002  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6003 
6004 #if OMPT_SUPPORT
6005  if (ompt_enabled.enabled) {
6006  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6007  }
6008 #endif
6009 
6010  pteam = &this_thr->th.th_team;
6011 
6012  /* have we been allocated? */
6013  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6014  /* we were just woken up, so run our new task */
6015  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6016  int rc;
6017  KA_TRACE(20,
6018  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6019  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6020  (*pteam)->t.t_pkfn));
6021 
6022  updateHWFPControl(*pteam);
6023 
6024 #if OMPT_SUPPORT
6025  if (ompt_enabled.enabled) {
6026  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6027  }
6028 #endif
6029 
6030  rc = (*pteam)->t.t_invoke(gtid);
6031  KMP_ASSERT(rc);
6032 
6033  KMP_MB();
6034  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6035  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6036  (*pteam)->t.t_pkfn));
6037  }
6038 #if OMPT_SUPPORT
6039  if (ompt_enabled.enabled) {
6040  /* no frame set while outside task */
6041  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6042 
6043  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6044  }
6045 #endif
6046  /* join barrier after parallel region */
6047  __kmp_join_barrier(gtid);
6048  }
6049  }
6050  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6051 
6052 #if OMPD_SUPPORT
6053  if (ompd_state & OMPD_ENABLE_BP)
6054  ompd_bp_thread_end();
6055 #endif
6056 
6057 #if OMPT_SUPPORT
6058  if (ompt_enabled.ompt_callback_thread_end) {
6059  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6060  }
6061 #endif
6062 
6063  this_thr->th.th_task_team = NULL;
6064  /* run the destructors for the threadprivate data for this thread */
6065  __kmp_common_destroy_gtid(gtid);
6066 
6067  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6068  KMP_MB();
6069 
6070 #if OMP_PROFILING_SUPPORT
6071  llvm::timeTraceProfilerFinishThread();
6072 #endif
6073  return this_thr;
6074 }
6075 
6076 /* ------------------------------------------------------------------------ */
6077 
6078 void __kmp_internal_end_dest(void *specific_gtid) {
6079  // Make sure no significant bits are lost
6080  int gtid;
6081  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6082 
6083  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6084  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6085  * because 0 is reserved for the nothing-stored case */
6086 
6087  __kmp_internal_end_thread(gtid);
6088 }
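
// Illustrative sketch (hypothetical, compiled out): a minimal version of the
// gtid-in-TLS convention described above, assuming a plain pthreads key named
// example_gtid_key. The stored value is gtid+1 so that a raw 0 (pthread's
// "nothing stored") remains distinguishable from gtid 0.
#if 0
#include <pthread.h>
#include <stdint.h>
static pthread_key_t example_gtid_key; // assumed created via pthread_key_create()

static void example_store_gtid(int gtid) {
  // Store gtid+1; a stored value of 0 still means "no gtid assigned".
  pthread_setspecific(example_gtid_key, (void *)(intptr_t)(gtid + 1));
}

static int example_load_gtid(void) {
  intptr_t v = (intptr_t)pthread_getspecific(example_gtid_key);
  return (v == 0) ? -1 /* nothing stored */ : (int)(v - 1);
}
#endif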
6089 
6090 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6091 
6092 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6093  __kmp_internal_end_atexit();
6094 }
6095 
6096 #endif
6097 
6098 /* [Windows] josh: when the atexit handler is called, there may still be more
6099  than one thread alive */
6100 void __kmp_internal_end_atexit(void) {
6101  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6102  /* [Windows]
6103  josh: ideally, we want to completely shut down the library in this atexit
6104  handler, but stat code that depends on thread specific data for gtid fails
6105  because that data becomes unavailable at some point during the shutdown, so
6106  we call __kmp_internal_end_thread instead. We should eventually remove the
6107  dependency on __kmp_get_specific_gtid in the stat code and use
6108  __kmp_internal_end_library to cleanly shut down the library.
6109 
6110  // TODO: Can some of this comment about GVS be removed?
6111  I suspect that the offending stat code is executed when the calling thread
6112  tries to clean up a dead root thread's data structures, resulting in GVS
6113  code trying to close the GVS structures for that thread, but since the stat
6114  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6115  the calling thread is cleaning up itself instead of another thread, it gets
6116  confused. This happens because allowing a thread to unregister and clean up
6117  another thread is a recent modification for addressing an issue.
6118  Based on the current design (20050722), a thread may end up
6119  trying to unregister another thread only if thread death does not trigger
6120  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6121  thread specific data destructor function to detect thread death. For
6122  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6123  is nothing. Thus, the workaround is applicable only for Windows static
6124  stat library. */
6125  __kmp_internal_end_library(-1);
6126 #if KMP_OS_WINDOWS
6127  __kmp_close_console();
6128 #endif
6129 }
6130 
6131 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6132  // It is assumed __kmp_forkjoin_lock is acquired.
6133 
6134  int gtid;
6135 
6136  KMP_DEBUG_ASSERT(thread != NULL);
6137 
6138  gtid = thread->th.th_info.ds.ds_gtid;
6139 
6140  if (!is_root) {
6141  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6142  /* Assume the threads are at the fork barrier here */
6143  KA_TRACE(
6144  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6145  gtid));
6146  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6147  while (
6148  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6149  KMP_CPU_PAUSE();
6150  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6151  } else {
6152  /* Need release fence here to prevent seg faults for tree forkjoin
6153  barrier (GEH) */
6154  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6155  thread);
6156  __kmp_release_64(&flag);
6157  }
6158  }
6159 
6160  // Terminate OS thread.
6161  __kmp_reap_worker(thread);
6162 
6163  // The thread was killed asynchronously. If it was actively
6164  // spinning in the thread pool, decrement the global count.
6165  //
6166  // There is a small timing hole here - if the worker thread was just waking
6167  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6168  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6169  // the global counter might not get updated.
6170  //
6171  // Currently, this can only happen as the library is unloaded,
6172  // so there are no harmful side effects.
6173  if (thread->th.th_active_in_pool) {
6174  thread->th.th_active_in_pool = FALSE;
6175  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6176  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6177  }
6178  }
6179 
6180  __kmp_free_implicit_task(thread);
6181 
6182 // Free the fast memory for tasking
6183 #if USE_FAST_MEMORY
6184  __kmp_free_fast_memory(thread);
6185 #endif /* USE_FAST_MEMORY */
6186 
6187  __kmp_suspend_uninitialize_thread(thread);
6188 
6189  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6190  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6191 
6192  --__kmp_all_nth;
6193  // __kmp_nth was decremented when thread is added to the pool.
6194 
6195 #ifdef KMP_ADJUST_BLOCKTIME
6196  /* Adjust blocktime back to user setting or default if necessary */
6197  /* Middle initialization might never have occurred */
6198  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6199  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6200  if (__kmp_nth <= __kmp_avail_proc) {
6201  __kmp_zero_bt = FALSE;
6202  }
6203  }
6204 #endif /* KMP_ADJUST_BLOCKTIME */
6205 
6206  /* free the memory being used */
6207  if (__kmp_env_consistency_check) {
6208  if (thread->th.th_cons) {
6209  __kmp_free_cons_stack(thread->th.th_cons);
6210  thread->th.th_cons = NULL;
6211  }
6212  }
6213 
6214  if (thread->th.th_pri_common != NULL) {
6215  __kmp_free(thread->th.th_pri_common);
6216  thread->th.th_pri_common = NULL;
6217  }
6218 
6219  if (thread->th.th_task_state_memo_stack != NULL) {
6220  __kmp_free(thread->th.th_task_state_memo_stack);
6221  thread->th.th_task_state_memo_stack = NULL;
6222  }
6223 
6224 #if KMP_USE_BGET
6225  if (thread->th.th_local.bget_data != NULL) {
6226  __kmp_finalize_bget(thread);
6227  }
6228 #endif
6229 
6230 #if KMP_AFFINITY_SUPPORTED
6231  if (thread->th.th_affin_mask != NULL) {
6232  KMP_CPU_FREE(thread->th.th_affin_mask);
6233  thread->th.th_affin_mask = NULL;
6234  }
6235 #endif /* KMP_AFFINITY_SUPPORTED */
6236 
6237 #if KMP_USE_HIER_SCHED
6238  if (thread->th.th_hier_bar_data != NULL) {
6239  __kmp_free(thread->th.th_hier_bar_data);
6240  thread->th.th_hier_bar_data = NULL;
6241  }
6242 #endif
6243 
6244  __kmp_reap_team(thread->th.th_serial_team);
6245  thread->th.th_serial_team = NULL;
6246  __kmp_free(thread);
6247 
6248  KMP_MB();
6249 
6250 } // __kmp_reap_thread
6251 
6252 static void __kmp_itthash_clean(kmp_info_t *th) {
6253 #if USE_ITT_NOTIFY
6254  if (__kmp_itt_region_domains.count > 0) {
6255  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6256  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6257  while (bucket) {
6258  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6259  __kmp_thread_free(th, bucket);
6260  bucket = next;
6261  }
6262  }
6263  }
6264  if (__kmp_itt_barrier_domains.count > 0) {
6265  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6266  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6267  while (bucket) {
6268  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6269  __kmp_thread_free(th, bucket);
6270  bucket = next;
6271  }
6272  }
6273  }
6274 #endif
6275 }
6276 
6277 static void __kmp_internal_end(void) {
6278  int i;
6279 
6280  /* First, unregister the library */
6281  __kmp_unregister_library();
6282 
6283 #if KMP_OS_WINDOWS
6284  /* In Win static library, we can't tell when a root actually dies, so we
6285  reclaim the data structures for any root threads that have died but not
6286  unregistered themselves, in order to shut down cleanly.
6287  In Win dynamic library we also can't tell when a thread dies. */
6288  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6289 // dead roots
6290 #endif
6291 
6292  for (i = 0; i < __kmp_threads_capacity; i++)
6293  if (__kmp_root[i])
6294  if (__kmp_root[i]->r.r_active)
6295  break;
6296  KMP_MB(); /* Flush all pending memory write invalidates. */
6297  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6298 
6299  if (i < __kmp_threads_capacity) {
6300 #if KMP_USE_MONITOR
6301  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6302  KMP_MB(); /* Flush all pending memory write invalidates. */
6303 
6304  // Need to check that monitor was initialized before reaping it. If we are
6305  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6306  // __kmp_monitor will appear to contain valid data, but it is only valid in
6307  // the parent process, not the child.
6308  // New behavior (201008): instead of keying off of the flag
6309  // __kmp_init_parallel, the monitor thread creation is keyed off
6310  // of the new flag __kmp_init_monitor.
6311  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6312  if (TCR_4(__kmp_init_monitor)) {
6313  __kmp_reap_monitor(&__kmp_monitor);
6314  TCW_4(__kmp_init_monitor, 0);
6315  }
6316  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6317  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6318 #endif // KMP_USE_MONITOR
6319  } else {
6320 /* TODO move this to cleanup code */
6321 #ifdef KMP_DEBUG
6322  /* make sure that everything has properly ended */
6323  for (i = 0; i < __kmp_threads_capacity; i++) {
6324  if (__kmp_root[i]) {
6325  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6326  // there can be uber threads alive here
6327  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6328  }
6329  }
6330 #endif
6331 
6332  KMP_MB();
6333 
6334  // Reap the worker threads.
6335  // This is valid for now, but be careful if threads are reaped sooner.
6336  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6337  // Get the next thread from the pool.
6338  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6339  __kmp_thread_pool = thread->th.th_next_pool;
6340  // Reap it.
6341  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6342  thread->th.th_next_pool = NULL;
6343  thread->th.th_in_pool = FALSE;
6344  __kmp_reap_thread(thread, 0);
6345  }
6346  __kmp_thread_pool_insert_pt = NULL;
6347 
6348  // Reap teams.
6349  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6350  // Get the next team from the pool.
6351  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6352  __kmp_team_pool = team->t.t_next_pool;
6353  // Reap it.
6354  team->t.t_next_pool = NULL;
6355  __kmp_reap_team(team);
6356  }
6357 
6358  __kmp_reap_task_teams();
6359 
6360 #if KMP_OS_UNIX
6361  // Threads that are not reaped should not access any resources since they
6362  // are going to be deallocated soon, so the shutdown sequence should wait
6363  // until all threads either exit the final spin-waiting loop or begin
6364  // sleeping after the given blocktime.
6365  for (i = 0; i < __kmp_threads_capacity; i++) {
6366  kmp_info_t *thr = __kmp_threads[i];
6367  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6368  KMP_CPU_PAUSE();
6369  }
6370 #endif
6371 
6372  for (i = 0; i < __kmp_threads_capacity; ++i) {
6373  // TBD: Add some checking...
6374  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6375  }
6376 
6377  /* Make sure all threadprivate destructors get run by joining with all
6378  worker threads before resetting this flag */
6379  TCW_SYNC_4(__kmp_init_common, FALSE);
6380 
6381  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6382  KMP_MB();
6383 
6384 #if KMP_USE_MONITOR
6385  // See note above: One of the possible fixes for CQ138434 / CQ140126
6386  //
6387  // FIXME: push both code fragments down and CSE them?
6388  // push them into __kmp_cleanup() ?
6389  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6390  if (TCR_4(__kmp_init_monitor)) {
6391  __kmp_reap_monitor(&__kmp_monitor);
6392  TCW_4(__kmp_init_monitor, 0);
6393  }
6394  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6395  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6396 #endif
6397  } /* else !__kmp_global.t_active */
6398  TCW_4(__kmp_init_gtid, FALSE);
6399  KMP_MB(); /* Flush all pending memory write invalidates. */
6400 
6401  __kmp_cleanup();
6402 #if OMPT_SUPPORT
6403  ompt_fini();
6404 #endif
6405 }
6406 
6407 void __kmp_internal_end_library(int gtid_req) {
6408  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6409  /* this shouldn't be a race condition because __kmp_internal_end() is the
6410  only place to clear __kmp_serial_init */
6411  /* we'll check this later too, after we get the lock */
6412  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6413  // redundant, because the next check will work in any case.
6414  if (__kmp_global.g.g_abort) {
6415  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6416  /* TODO abort? */
6417  return;
6418  }
6419  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6420  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6421  return;
6422  }
6423 
6424  // If hidden helper team has been initialized, we need to deinit it
6425  if (TCR_4(__kmp_init_hidden_helper) &&
6426  !TCR_4(__kmp_hidden_helper_team_done)) {
6427  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6428  // First release the main thread to let it continue its work
6429  __kmp_hidden_helper_main_thread_release();
6430  // Wait until the hidden helper team has been destroyed
6431  __kmp_hidden_helper_threads_deinitz_wait();
6432  }
6433 
6434  KMP_MB(); /* Flush all pending memory write invalidates. */
6435  /* find out who we are and what we should do */
6436  {
6437  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6438  KA_TRACE(
6439  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6440  if (gtid == KMP_GTID_SHUTDOWN) {
6441  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6442  "already shutdown\n"));
6443  return;
6444  } else if (gtid == KMP_GTID_MONITOR) {
6445  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6446  "registered, or system shutdown\n"));
6447  return;
6448  } else if (gtid == KMP_GTID_DNE) {
6449  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6450  "shutdown\n"));
6451  /* we don't know who we are, but we may still shutdown the library */
6452  } else if (KMP_UBER_GTID(gtid)) {
6453  /* unregister ourselves as an uber thread. gtid is no longer valid */
6454  if (__kmp_root[gtid]->r.r_active) {
6455  __kmp_global.g.g_abort = -1;
6456  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6457  __kmp_unregister_library();
6458  KA_TRACE(10,
6459  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6460  gtid));
6461  return;
6462  } else {
6463  __kmp_itthash_clean(__kmp_threads[gtid]);
6464  KA_TRACE(
6465  10,
6466  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6467  __kmp_unregister_root_current_thread(gtid);
6468  }
6469  } else {
6470 /* worker threads may call this function through the atexit handler, if they
6471  * call exit() */
6472 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6473  TODO: do a thorough shutdown instead */
6474 #ifdef DUMP_DEBUG_ON_EXIT
6475  if (__kmp_debug_buf)
6476  __kmp_dump_debug_buffer();
6477 #endif
6478  // Added an unregister-library call here when we switched to shm on Linux;
6479  // if we don't, lots of files are left behind in /dev/shm.
6480  // Clean up the shared memory file before exiting.
6481  __kmp_unregister_library();
6482  return;
6483  }
6484  }
6485  /* synchronize the termination process */
6486  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6487 
6488  /* have we already finished */
6489  if (__kmp_global.g.g_abort) {
6490  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6491  /* TODO abort? */
6492  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6493  return;
6494  }
6495  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6496  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6497  return;
6498  }
6499 
6500  /* We need this lock to enforce mutual exclusion between this reading of
6501  __kmp_threads_capacity and the writing by __kmp_register_root.
6502  Alternatively, we can use a counter of roots that is atomically updated by
6503  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6504  __kmp_internal_end_*. */
6505  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6506 
6507  /* now we can safely conduct the actual termination */
6508  __kmp_internal_end();
6509 
6510  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6511  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6512 
6513  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6514 
6515 #ifdef DUMP_DEBUG_ON_EXIT
6516  if (__kmp_debug_buf)
6517  __kmp_dump_debug_buffer();
6518 #endif
6519 
6520 #if KMP_OS_WINDOWS
6521  __kmp_close_console();
6522 #endif
6523 
6524  __kmp_fini_allocator();
6525 
6526 } // __kmp_internal_end_library
6527 
6528 void __kmp_internal_end_thread(int gtid_req) {
6529  int i;
6530 
6531  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6532  /* this shouldn't be a race condition because __kmp_internal_end() is the
6533  * only place to clear __kmp_serial_init */
6534  /* we'll check this later too, after we get the lock */
6535  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6536  // redundant, because the next check will work in any case.
6537  if (__kmp_global.g.g_abort) {
6538  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6539  /* TODO abort? */
6540  return;
6541  }
6542  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6543  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6544  return;
6545  }
6546 
6547  // If hidden helper team has been initialized, we need to deinit it
6548  if (TCR_4(__kmp_init_hidden_helper) &&
6549  !TCR_4(__kmp_hidden_helper_team_done)) {
6550  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6551  // First release the main thread to let it continue its work
6552  __kmp_hidden_helper_main_thread_release();
6553  // Wait until the hidden helper team has been destroyed
6554  __kmp_hidden_helper_threads_deinitz_wait();
6555  }
6556 
6557  KMP_MB(); /* Flush all pending memory write invalidates. */
6558 
6559  /* find out who we are and what we should do */
6560  {
6561  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6562  KA_TRACE(10,
6563  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6564  if (gtid == KMP_GTID_SHUTDOWN) {
6565  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6566  "already shutdown\n"));
6567  return;
6568  } else if (gtid == KMP_GTID_MONITOR) {
6569  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6570  "registered, or system shutdown\n"));
6571  return;
6572  } else if (gtid == KMP_GTID_DNE) {
6573  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6574  "shutdown\n"));
6575  return;
6576  /* we don't know who we are */
6577  } else if (KMP_UBER_GTID(gtid)) {
6578  /* unregister ourselves as an uber thread. gtid is no longer valid */
6579  if (__kmp_root[gtid]->r.r_active) {
6580  __kmp_global.g.g_abort = -1;
6581  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6582  KA_TRACE(10,
6583  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6584  gtid));
6585  return;
6586  } else {
6587  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6588  gtid));
6589  __kmp_unregister_root_current_thread(gtid);
6590  }
6591  } else {
6592  /* just a worker thread, let's leave */
6593  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6594 
6595  if (gtid >= 0) {
6596  __kmp_threads[gtid]->th.th_task_team = NULL;
6597  }
6598 
6599  KA_TRACE(10,
6600  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6601  gtid));
6602  return;
6603  }
6604  }
6605 #if KMP_DYNAMIC_LIB
6606  if (__kmp_pause_status != kmp_hard_paused)
6607  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6608  // because it is better to shut down later in the library destructor.
6609  {
6610  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6611  return;
6612  }
6613 #endif
6614  /* synchronize the termination process */
6615  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6616 
6617  /* have we already finished */
6618  if (__kmp_global.g.g_abort) {
6619  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6620  /* TODO abort? */
6621  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6622  return;
6623  }
6624  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6625  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6626  return;
6627  }
6628 
6629  /* We need this lock to enforce mutual exclusion between this reading of
6630  __kmp_threads_capacity and the writing by __kmp_register_root.
6631  Alternatively, we can use a counter of roots that is atomically updated by
6632  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6633  __kmp_internal_end_*. */
6634 
6635  /* should we finish the run-time? are all siblings done? */
6636  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6637 
6638  for (i = 0; i < __kmp_threads_capacity; ++i) {
6639  if (KMP_UBER_GTID(i)) {
6640  KA_TRACE(
6641  10,
6642  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6643  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6644  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6645  return;
6646  }
6647  }
6648 
6649  /* now we can safely conduct the actual termination */
6650 
6651  __kmp_internal_end();
6652 
6653  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6654  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655 
6656  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6657 
6658 #ifdef DUMP_DEBUG_ON_EXIT
6659  if (__kmp_debug_buf)
6660  __kmp_dump_debug_buffer();
6661 #endif
6662 } // __kmp_internal_end_thread
6663 
6664 // -----------------------------------------------------------------------------
6665 // Library registration stuff.
6666 
6667 static long __kmp_registration_flag = 0;
6668 // Random value used to indicate library initialization.
6669 static char *__kmp_registration_str = NULL;
6670 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6671 
6672 static inline char *__kmp_reg_status_name() {
6673 /* On RHEL 3u5, if linked statically, getpid() returns different values in
6674  each thread. If registration and unregistration happen in different threads
6675  (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6676  env var cannot be found, because its name will contain a different pid. */
6677 // macOS* complains about name being too long with additional getuid()
6678 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6679  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6680  (int)getuid());
6681 #else
6682  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6683 #endif
6684 } // __kmp_reg_status_name
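
// Illustrative sketch (hypothetical, compiled out): the names produced above
// have the shape __KMP_REGISTERED_LIB_<pid> or, on non-Darwin UNIX dynamic
// builds, __KMP_REGISTERED_LIB_<pid>_<uid>. A plain snprintf() equivalent:
#if 0
#include <stdio.h>
#include <unistd.h>
static void example_reg_status_name(char *buf, size_t len) {
  // Same shape as __kmp_reg_status_name(); the example_* name is made up here.
  snprintf(buf, len, "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), (int)getuid());
}
#endif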
6685 
6686 void __kmp_register_library_startup(void) {
6687 
6688  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6689  int done = 0;
6690  union {
6691  double dtime;
6692  long ltime;
6693  } time;
6694 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6695  __kmp_initialize_system_tick();
6696 #endif
6697  __kmp_read_system_time(&time.dtime);
6698  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6699  __kmp_registration_str =
6700  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6701  __kmp_registration_flag, KMP_LIBRARY_FILE);
6702 
6703  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6704  __kmp_registration_str));
6705 
6706  while (!done) {
6707 
6708  char *value = NULL; // Actual value of the environment variable.
6709 
6710 #if defined(KMP_USE_SHM)
6711  char *shm_name = __kmp_str_format("/%s", name);
6712  int shm_preexist = 0;
6713  char *data1;
6714  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6715  if ((fd1 == -1) && (errno == EEXIST)) {
6716  // file didn't open because it already exists.
6717  // try opening existing file
6718  fd1 = shm_open(shm_name, O_RDWR, 0666);
6719  if (fd1 == -1) { // file didn't open
6720  // error out here
6721  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6722  __kmp_msg_null);
6723  } else {
6724  // able to open existing file
6725  shm_preexist = 1;
6726  }
6727  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6728  // "already exists".
6729  // error out here.
6730  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6731  __kmp_msg_null);
6732  }
6733  if (shm_preexist == 0) {
6734  // we created the SHM; now set its size
6735  if (ftruncate(fd1, SHM_SIZE) == -1) {
6736  // an error occurred setting the size
6737  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6738  KMP_ERR(errno), __kmp_msg_null);
6739  }
6740  }
6741  data1 =
6742  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6743  if (data1 == MAP_FAILED) {
6744  // failed to map shared memory
6745  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6746  __kmp_msg_null);
6747  }
6748  if (shm_preexist == 0) { // set data to SHM, set value
6749  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6750  }
6751  // Read the value back, either from what we just wrote or from the existing file.
6752  value = __kmp_str_format("%s", data1); // read value from SHM
6753  munmap(data1, SHM_SIZE);
6754  close(fd1);
6755 #else // Windows and unix with static library
6756  // Set the environment variable, but do not overwrite it if it already exists.
6757  __kmp_env_set(name, __kmp_registration_str, 0);
6758  // read value to see if it got set
6759  value = __kmp_env_get(name);
6760 #endif
6761 
6762  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6763  done = 1; // Ok, environment variable set successfully, exit the loop.
6764  } else {
6765  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6766  // Check whether it is alive or dead.
6767  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6768  char *tail = value;
6769  char *flag_addr_str = NULL;
6770  char *flag_val_str = NULL;
6771  char const *file_name = NULL;
6772  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6773  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6774  file_name = tail;
6775  if (tail != NULL) {
6776  unsigned long *flag_addr = 0;
6777  unsigned long flag_val = 0;
6778  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6779  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6780  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6781  // First, check whether environment-encoded address is mapped into
6782  // addr space.
6783  // If so, dereference it to see if it still has the right value.
6784  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6785  neighbor = 1;
6786  } else {
6787  // If not, then we know the other copy of the library is no longer
6788  // running.
6789  neighbor = 2;
6790  }
6791  }
6792  }
6793  switch (neighbor) {
6794  case 0: // Cannot parse environment variable -- neighbor status unknown.
6795  // Assume it is the incompatible format of a future version of the
6796  // library. Assume the other library is alive.
6797  // WARN( ... ); // TODO: Issue a warning.
6798  file_name = "unknown library";
6799  KMP_FALLTHROUGH();
6800  // Attention! Falling through to the next case is intentional.
6801  case 1: { // Neighbor is alive.
6802  // Check it is allowed.
6803  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6804  if (!__kmp_str_match_true(duplicate_ok)) {
6805  // That's not allowed. Issue fatal error.
6806  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6807  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6808  }
6809  KMP_INTERNAL_FREE(duplicate_ok);
6810  __kmp_duplicate_library_ok = 1;
6811  done = 1; // Exit the loop.
6812  } break;
6813  case 2: { // Neighbor is dead.
6814 
6815 #if defined(KMP_USE_SHM)
6816  // close shared memory.
6817  shm_unlink(shm_name); // this removes file in /dev/shm
6818 #else
6819  // Clear the variable and try to register library again.
6820  __kmp_env_unset(name);
6821 #endif
6822  } break;
6823  default: {
6824  KMP_DEBUG_ASSERT(0);
6825  } break;
6826  }
6827  }
6828  KMP_INTERNAL_FREE((void *)value);
6829 #if defined(KMP_USE_SHM)
6830  KMP_INTERNAL_FREE((void *)shm_name);
6831 #endif
6832  } // while
6833  KMP_INTERNAL_FREE((void *)name);
6834 
6835 } // func __kmp_register_library_startup
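
// Illustrative sketch (hypothetical, compiled out): the value registered above
// has the shape "<flag address>-<flag value in hex>-<library file>", and the
// neighbor check parses it back with the same fields. A standalone parser in
// the spirit of the __kmp_str_split()/KMP_SSCANF logic might look like:
#if 0
#include <stdio.h>
static int example_parse_registration(const char *value, void **flag_addr,
                                       unsigned long *flag_val,
                                       char file_name[256]) {
  // Returns nonzero when all three fields were recovered; the liveness test
  // then dereferences *flag_addr (if mapped) and compares it with *flag_val.
  return sscanf(value, "%p-%lx-%255s", flag_addr, flag_val, file_name) == 3;
}
#endif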
6836 
6837 void __kmp_unregister_library(void) {
6838 
6839  char *name = __kmp_reg_status_name();
6840  char *value = NULL;
6841 
6842 #if defined(KMP_USE_SHM)
6843  char *shm_name = __kmp_str_format("/%s", name);
6844  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6845  if (fd1 == -1) {
6846  // file did not open. return.
6847  return;
6848  }
6849  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6850  if (data1 != MAP_FAILED) {
6851  value = __kmp_str_format("%s", data1); // read value from SHM
6852  munmap(data1, SHM_SIZE);
6853  }
6854  close(fd1);
6855 #else
6856  value = __kmp_env_get(name);
6857 #endif
6858 
6859  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6860  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6861  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6862 // Ok, this is our variable. Delete it.
6863 #if defined(KMP_USE_SHM)
6864  shm_unlink(shm_name); // this removes file in /dev/shm
6865 #else
6866  __kmp_env_unset(name);
6867 #endif
6868  }
6869 
6870 #if defined(KMP_USE_SHM)
6871  KMP_INTERNAL_FREE(shm_name);
6872 #endif
6873 
6874  KMP_INTERNAL_FREE(__kmp_registration_str);
6875  KMP_INTERNAL_FREE(value);
6876  KMP_INTERNAL_FREE(name);
6877 
6878  __kmp_registration_flag = 0;
6879  __kmp_registration_str = NULL;
6880 
6881 } // __kmp_unregister_library
6882 
6883 // End of Library registration stuff.
6884 // -----------------------------------------------------------------------------
6885 
6886 #if KMP_MIC_SUPPORTED
6887 
6888 static void __kmp_check_mic_type() {
6889  kmp_cpuid_t cpuid_state = {0};
6890  kmp_cpuid_t *cs_p = &cpuid_state;
6891  __kmp_x86_cpuid(1, 0, cs_p);
6892  // We don't support mic1 at the moment
6893  if ((cs_p->eax & 0xff0) == 0xB10) {
6894  __kmp_mic_type = mic2;
6895  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6896  __kmp_mic_type = mic3;
6897  } else {
6898  __kmp_mic_type = non_mic;
6899  }
6900 }
6901 
6902 #endif /* KMP_MIC_SUPPORTED */
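
// Illustrative sketch (hypothetical, compiled out): the masks used in
// __kmp_check_mic_type() select the standard CPUID leaf-1 family/model bits;
// 0xB10 under mask 0xff0 is family 0x0B, model 0x1 (KNC), and 0x50670 under
// mask 0xf0ff0 is family 0x6, combined model 0x57 (KNL). A generic decode:
#if 0
static void example_decode_cpuid_eax(unsigned eax, unsigned *family,
                                     unsigned *model) {
  *family = (eax >> 8) & 0xf;                         // EAX[11:8]
  *model = ((eax >> 12) & 0xf0) | ((eax >> 4) & 0xf); // EAX[19:16]<<4 | EAX[7:4]
}
#endif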
6903 
6904 #if KMP_HAVE_UMWAIT
6905 static void __kmp_user_level_mwait_init() {
6906  struct kmp_cpuid buf;
6907  __kmp_x86_cpuid(7, 0, &buf);
6908  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6909  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6910  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6911  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6912  __kmp_umwait_enabled));
6913 }
6914 #elif KMP_HAVE_MWAIT
6915 #ifndef AT_INTELPHIUSERMWAIT
6916 // Spurious, non-existent value that should always fail to return anything.
6917 // Will be replaced with the correct value once it is known.
6918 #define AT_INTELPHIUSERMWAIT 10000
6919 #endif
6920 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6921 // earlier OS is used to build the RTL, we'll use the following internal
6922 // function when the entry is not found.
6923 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6924 unsigned long getauxval(unsigned long) { return 0; }
6925 
6926 static void __kmp_user_level_mwait_init() {
6927  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6928  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6929  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6930  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6931  if (__kmp_mic_type == mic3) {
6932  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6933  if ((res & 0x1) || __kmp_user_level_mwait) {
6934  __kmp_mwait_enabled = TRUE;
6935  if (__kmp_user_level_mwait) {
6936  KMP_INFORM(EnvMwaitWarn);
6937  }
6938  } else {
6939  __kmp_mwait_enabled = FALSE;
6940  }
6941  }
6942  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6943  "__kmp_mwait_enabled = %d\n",
6944  __kmp_mic_type, __kmp_mwait_enabled));
6945 }
6946 #endif /* KMP_HAVE_UMWAIT */
6947 
6948 static void __kmp_do_serial_initialize(void) {
6949  int i, gtid;
6950  size_t size;
6951 
6952  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6953 
6954  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6955  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6956  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6957  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6958  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6959 
6960 #if OMPT_SUPPORT
6961  ompt_pre_init();
6962 #endif
6963 #if OMPD_SUPPORT
6964  __kmp_env_dump();
6965  ompd_init();
6966 #endif
6967 
6968  __kmp_validate_locks();
6969 
6970  /* Initialize internal memory allocator */
6971  __kmp_init_allocator();
6972 
6973  /* Register the library startup via an environment variable or via mapped
6974  shared memory file and check to see whether another copy of the library is
6975  already registered. Since a forked child process is often terminated, we
6976  postpone registration until middle initialization in the child. */
6977  if (__kmp_need_register_serial)
6978  __kmp_register_library_startup();
6979 
6980  /* TODO reinitialization of library */
6981  if (TCR_4(__kmp_global.g.g_done)) {
6982  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6983  }
6984 
6985  __kmp_global.g.g_abort = 0;
6986  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6987 
6988 /* initialize the locks */
6989 #if KMP_USE_ADAPTIVE_LOCKS
6990 #if KMP_DEBUG_ADAPTIVE_LOCKS
6991  __kmp_init_speculative_stats();
6992 #endif
6993 #endif
6994 #if KMP_STATS_ENABLED
6995  __kmp_stats_init();
6996 #endif
6997  __kmp_init_lock(&__kmp_global_lock);
6998  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6999  __kmp_init_lock(&__kmp_debug_lock);
7000  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7001  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7002  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7003  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7004  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7005  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7006  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7007  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7008  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7009  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7010  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7011  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7012  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7013  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7014  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7015 #if KMP_USE_MONITOR
7016  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7017 #endif
7018  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7019 
7020  /* conduct initialization and initial setup of configuration */
7021 
7022  __kmp_runtime_initialize();
7023 
7024 #if KMP_MIC_SUPPORTED
7025  __kmp_check_mic_type();
7026 #endif
7027 
7028 // Some global variable initialization moved here from kmp_env_initialize()
7029 #ifdef KMP_DEBUG
7030  kmp_diag = 0;
7031 #endif
7032  __kmp_abort_delay = 0;
7033 
7034  // From __kmp_init_dflt_team_nth()
7035  /* assume the entire machine will be used */
7036  __kmp_dflt_team_nth_ub = __kmp_xproc;
7037  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7038  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7039  }
7040  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7041  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7042  }
7043  __kmp_max_nth = __kmp_sys_max_nth;
7044  __kmp_cg_max_nth = __kmp_sys_max_nth;
7045  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7046  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7047  __kmp_teams_max_nth = __kmp_sys_max_nth;
7048  }
7049 
7050  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7051  // part
7052  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7053 #if KMP_USE_MONITOR
7054  __kmp_monitor_wakeups =
7055  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7056  __kmp_bt_intervals =
7057  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7058 #endif
7059  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7060  __kmp_library = library_throughput;
7061  // From KMP_SCHEDULE initialization
7062  __kmp_static = kmp_sch_static_balanced;
7063 // AC: do not use analytical here, because it is non-monotonic
7064 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7065 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7066 // need to repeat assignment
7067 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7068 // bit control and barrier method control parts
7069 #if KMP_FAST_REDUCTION_BARRIER
7070 #define kmp_reduction_barrier_gather_bb ((int)1)
7071 #define kmp_reduction_barrier_release_bb ((int)1)
7072 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7073 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7074 #endif // KMP_FAST_REDUCTION_BARRIER
7075  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7076  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7077  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7078  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7079  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7080 #if KMP_FAST_REDUCTION_BARRIER
7081  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7082  // lin_64 ): hyper,1
7083  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7084  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7085  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7086  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7087  }
7088 #endif // KMP_FAST_REDUCTION_BARRIER
7089  }
7090 #if KMP_FAST_REDUCTION_BARRIER
7091 #undef kmp_reduction_barrier_release_pat
7092 #undef kmp_reduction_barrier_gather_pat
7093 #undef kmp_reduction_barrier_release_bb
7094 #undef kmp_reduction_barrier_gather_bb
7095 #endif // KMP_FAST_REDUCTION_BARRIER
7096 #if KMP_MIC_SUPPORTED
7097  if (__kmp_mic_type == mic2) { // KNC
7098  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7099  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7100  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7101  1; // forkjoin release
7102  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7103  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7104  }
7105 #if KMP_FAST_REDUCTION_BARRIER
7106  if (__kmp_mic_type == mic2) { // KNC
7107  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7108  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7109  }
7110 #endif // KMP_FAST_REDUCTION_BARRIER
7111 #endif // KMP_MIC_SUPPORTED
7112 
7113 // From KMP_CHECKS initialization
7114 #ifdef KMP_DEBUG
7115  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7116 #else
7117  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7118 #endif
7119 
7120  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7121  __kmp_foreign_tp = TRUE;
7122 
7123  __kmp_global.g.g_dynamic = FALSE;
7124  __kmp_global.g.g_dynamic_mode = dynamic_default;
7125 
7126  __kmp_init_nesting_mode();
7127 
7128  __kmp_env_initialize(NULL);
7129 
7130 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7131  __kmp_user_level_mwait_init();
7132 #endif
7133 // Print all messages in message catalog for testing purposes.
7134 #ifdef KMP_DEBUG
7135  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7136  if (__kmp_str_match_true(val)) {
7137  kmp_str_buf_t buffer;
7138  __kmp_str_buf_init(&buffer);
7139  __kmp_i18n_dump_catalog(&buffer);
7140  __kmp_printf("%s", buffer.str);
7141  __kmp_str_buf_free(&buffer);
7142  }
7143  __kmp_env_free(&val);
7144 #endif
7145 
7146  __kmp_threads_capacity =
7147  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7148  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7149  __kmp_tp_capacity = __kmp_default_tp_capacity(
7150  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7151 
7152  // If the library is shut down properly, both pools must be NULL. Just in
7153  // case, set them to NULL -- some memory may leak, but subsequent code will
7154  // work even if pools are not freed.
7155  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7156  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7157  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7158  __kmp_thread_pool = NULL;
7159  __kmp_thread_pool_insert_pt = NULL;
7160  __kmp_team_pool = NULL;
7161 
7162  /* Allocate all of the variable sized records */
7163  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7164  * expandable */
7165  /* Since allocation is cache-aligned, just add extra padding at the end */
7166  size =
7167  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7168  CACHE_LINE;
7169  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7170  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7171  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7172 
7173  /* init thread counts */
7174  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7175  0); // Asserts fail if the library is reinitializing and
7176  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7177  __kmp_all_nth = 0;
7178  __kmp_nth = 0;
7179 
7180  /* setup the uber master thread and hierarchy */
7181  gtid = __kmp_register_root(TRUE);
7182  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7183  KMP_ASSERT(KMP_UBER_GTID(gtid));
7184  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7185 
7186  KMP_MB(); /* Flush all pending memory write invalidates. */
7187 
7188  __kmp_common_initialize();
7189 
7190 #if KMP_OS_UNIX
7191  /* invoke the child fork handler */
7192  __kmp_register_atfork();
7193 #endif
7194 
7195 #if !KMP_DYNAMIC_LIB
7196  {
7197  /* Invoke the exit handler when the program finishes, only for static
7198  library. For dynamic library, we already have _fini and DllMain. */
7199  int rc = atexit(__kmp_internal_end_atexit);
7200  if (rc != 0) {
7201  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7202  __kmp_msg_null);
7203  }
7204  }
7205 #endif
7206 
7207 #if KMP_HANDLE_SIGNALS
7208 #if KMP_OS_UNIX
7209  /* NOTE: make sure that this is called before the user installs their own
7210  signal handlers so that the user handlers are called first. This way they
7211  can return false, not call our handler, avoid terminating the library, and
7212  continue execution where they left off. */
7213  __kmp_install_signals(FALSE);
7214 #endif /* KMP_OS_UNIX */
7215 #if KMP_OS_WINDOWS
7216  __kmp_install_signals(TRUE);
7217 #endif /* KMP_OS_WINDOWS */
7218 #endif
7219 
7220  /* we have finished the serial initialization */
7221  __kmp_init_counter++;
7222 
7223  __kmp_init_serial = TRUE;
7224 
7225  if (__kmp_settings) {
7226  __kmp_env_print();
7227  }
7228 
7229  if (__kmp_display_env || __kmp_display_env_verbose) {
7230  __kmp_env_print_2();
7231  }
7232 
7233 #if OMPT_SUPPORT
7234  ompt_post_init();
7235 #endif
7236 
7237  KMP_MB();
7238 
7239  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7240 }
7241 
7242 void __kmp_serial_initialize(void) {
7243  if (__kmp_init_serial) {
7244  return;
7245  }
7246  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7247  if (__kmp_init_serial) {
7248  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7249  return;
7250  }
7251  __kmp_do_serial_initialize();
7252  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7253 }
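
// Illustrative sketch (hypothetical, compiled out): __kmp_serial_initialize,
// __kmp_middle_initialize and __kmp_parallel_initialize all follow the same
// check / lock / re-check shape, which keeps the fast path lock-free once
// initialization has completed:
#if 0
static int example_init_done = 0; // stands in for __kmp_init_serial et al.
static void example_initialize(void) {
  if (example_init_done) // fast path: already initialized, no lock taken
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (!example_init_done) { // re-check under the lock to avoid double init
    /* ... one-time initialization work ... */
    example_init_done = 1;
  }
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
#endif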
7254 
7255 static void __kmp_do_middle_initialize(void) {
7256  int i, j;
7257  int prev_dflt_team_nth;
7258 
7259  if (!__kmp_init_serial) {
7260  __kmp_do_serial_initialize();
7261  }
7262 
7263  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7264 
7265  if (UNLIKELY(!__kmp_need_register_serial)) {
7266  // We are in a forked child process. The registration was skipped during
7267  // serial initialization in __kmp_atfork_child handler. Do it here.
7268  __kmp_register_library_startup();
7269  }
7270 
7271  // Save the previous value for the __kmp_dflt_team_nth so that
7272  // we can avoid some reinitialization if it hasn't changed.
7273  prev_dflt_team_nth = __kmp_dflt_team_nth;
7274 
7275 #if KMP_AFFINITY_SUPPORTED
7276  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7277  // number of cores on the machine.
7278  __kmp_affinity_initialize();
7279 
7280 #endif /* KMP_AFFINITY_SUPPORTED */
7281 
7282  KMP_ASSERT(__kmp_xproc > 0);
7283  if (__kmp_avail_proc == 0) {
7284  __kmp_avail_proc = __kmp_xproc;
7285  }
7286 
7287  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7288  // correct them now
7289  j = 0;
7290  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7291  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7292  __kmp_avail_proc;
7293  j++;
7294  }
7295 
7296  if (__kmp_dflt_team_nth == 0) {
7297 #ifdef KMP_DFLT_NTH_CORES
7298  // Default #threads = #cores
7299  __kmp_dflt_team_nth = __kmp_ncores;
7300  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7301  "__kmp_ncores (%d)\n",
7302  __kmp_dflt_team_nth));
7303 #else
7304  // Default #threads = #available OS procs
7305  __kmp_dflt_team_nth = __kmp_avail_proc;
7306  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7307  "__kmp_avail_proc(%d)\n",
7308  __kmp_dflt_team_nth));
7309 #endif /* KMP_DFLT_NTH_CORES */
7310  }
7311 
7312  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7313  __kmp_dflt_team_nth = KMP_MIN_NTH;
7314  }
7315  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7316  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7317  }
7318 
7319  if (__kmp_nesting_mode > 0)
7320  __kmp_set_nesting_mode_threads();
7321 
7322  // There's no harm in continuing if the following check fails,
7323  // but it indicates an error in the previous logic.
7324  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7325 
7326  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7327  // Run through the __kmp_threads array and set the num threads icv for each
7328  // root thread that is currently registered with the RTL (which has not
7329  // already explicitly set its nthreads-var with a call to
7330  // omp_set_num_threads()).
7331  for (i = 0; i < __kmp_threads_capacity; i++) {
7332  kmp_info_t *thread = __kmp_threads[i];
7333  if (thread == NULL)
7334  continue;
7335  if (thread->th.th_current_task->td_icvs.nproc != 0)
7336  continue;
7337 
7338  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7339  }
7340  }
7341  KA_TRACE(
7342  20,
7343  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7344  __kmp_dflt_team_nth));
7345 
7346 #ifdef KMP_ADJUST_BLOCKTIME
7347  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7348  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7349  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7350  if (__kmp_nth > __kmp_avail_proc) {
7351  __kmp_zero_bt = TRUE;
7352  }
7353  }
7354 #endif /* KMP_ADJUST_BLOCKTIME */
7355 
7356  /* we have finished middle initialization */
7357  TCW_SYNC_4(__kmp_init_middle, TRUE);
7358 
7359  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7360 }
7361 
7362 void __kmp_middle_initialize(void) {
7363  if (__kmp_init_middle) {
7364  return;
7365  }
7366  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7367  if (__kmp_init_middle) {
7368  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7369  return;
7370  }
7371  __kmp_do_middle_initialize();
7372  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7373 }
7374 
7375 void __kmp_parallel_initialize(void) {
7376  int gtid = __kmp_entry_gtid(); // this might be a new root
7377 
7378  /* synchronize parallel initialization (for sibling) */
7379  if (TCR_4(__kmp_init_parallel))
7380  return;
7381  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7382  if (TCR_4(__kmp_init_parallel)) {
7383  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7384  return;
7385  }
7386 
7387  /* TODO reinitialization after we have already shut down */
7388  if (TCR_4(__kmp_global.g.g_done)) {
7389  KA_TRACE(
7390  10,
7391  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7392  __kmp_infinite_loop();
7393  }
7394 
7395  /* jc: The lock __kmp_initz_lock is already held, so calling
7396  __kmp_serial_initialize would cause a deadlock. So we call
7397  __kmp_do_serial_initialize directly. */
7398  if (!__kmp_init_middle) {
7399  __kmp_do_middle_initialize();
7400  }
7401  __kmp_assign_root_init_mask();
7402  __kmp_resume_if_hard_paused();
7403 
7404  /* begin initialization */
7405  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7406  KMP_ASSERT(KMP_UBER_GTID(gtid));
7407 
7408 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7409  // Save the FP control regs.
7410  // Worker threads will set theirs to these values at thread startup.
7411  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7412  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7413  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7414 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7415 
7416 #if KMP_OS_UNIX
7417 #if KMP_HANDLE_SIGNALS
7418  /* must be after __kmp_serial_initialize */
7419  __kmp_install_signals(TRUE);
7420 #endif
7421 #endif
7422 
7423  __kmp_suspend_initialize();
7424 
7425 #if defined(USE_LOAD_BALANCE)
7426  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7427  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7428  }
7429 #else
7430  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7431  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7432  }
7433 #endif
7434 
7435  if (__kmp_version) {
7436  __kmp_print_version_2();
7437  }
7438 
7439  /* we have finished parallel initialization */
7440  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7441 
7442  KMP_MB();
7443  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7444 
7445  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7446 }
7447 
7448 void __kmp_hidden_helper_initialize() {
7449  if (TCR_4(__kmp_init_hidden_helper))
7450  return;
7451 
7452  // __kmp_parallel_initialize is required before we initialize hidden helper
7453  if (!TCR_4(__kmp_init_parallel))
7454  __kmp_parallel_initialize();
7455 
7456  // Double check. Note that this double check should not be placed before
7457  // __kmp_parallel_initialize as it would cause a deadlock.
7458  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7459  if (TCR_4(__kmp_init_hidden_helper)) {
7460  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7461  return;
7462  }
7463 
7464  // Set the count of hidden helper tasks to be executed to zero
7465  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7466 
7467  // Set the global variable indicating that we're initializing hidden helper
7468  // team/threads
7469  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7470 
7471  // Platform independent initialization
7472  __kmp_do_initialize_hidden_helper_threads();
7473 
7474  // Wait here for the finish of initialization of hidden helper teams
7475  __kmp_hidden_helper_threads_initz_wait();
7476 
7477  // We have finished hidden helper initialization
7478  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7479 
7480  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7481 }
7482 
7483 /* ------------------------------------------------------------------------ */
7484 
7485 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7486  kmp_team_t *team) {
7487  kmp_disp_t *dispatch;
7488 
7489  KMP_MB();
7490 
7491  /* none of the threads have encountered any constructs, yet. */
7492  this_thr->th.th_local.this_construct = 0;
7493 #if KMP_CACHE_MANAGE
7494  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7495 #endif /* KMP_CACHE_MANAGE */
7496  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7497  KMP_DEBUG_ASSERT(dispatch);
7498  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7499  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7500  // this_thr->th.th_info.ds.ds_tid ] );
7501 
7502  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7503  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7504  if (__kmp_env_consistency_check)
7505  __kmp_push_parallel(gtid, team->t.t_ident);
7506 
7507  KMP_MB(); /* Flush all pending memory write invalidates. */
7508 }
7509 
7510 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7511  kmp_team_t *team) {
7512  if (__kmp_env_consistency_check)
7513  __kmp_pop_parallel(gtid, team->t.t_ident);
7514 
7515  __kmp_finish_implicit_task(this_thr);
7516 }
7517 
7518 int __kmp_invoke_task_func(int gtid) {
7519  int rc;
7520  int tid = __kmp_tid_from_gtid(gtid);
7521  kmp_info_t *this_thr = __kmp_threads[gtid];
7522  kmp_team_t *team = this_thr->th.th_team;
7523 
7524  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7525 #if USE_ITT_BUILD
7526  if (__itt_stack_caller_create_ptr) {
7527  // inform ittnotify about entering user's code
7528  if (team->t.t_stack_id != NULL) {
7529  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7530  } else {
7531  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7532  __kmp_itt_stack_callee_enter(
7533  (__itt_caller)team->t.t_parent->t.t_stack_id);
7534  }
7535  }
7536 #endif /* USE_ITT_BUILD */
7537 #if INCLUDE_SSC_MARKS
7538  SSC_MARK_INVOKING();
7539 #endif
7540 
7541 #if OMPT_SUPPORT
7542  void *dummy;
7543  void **exit_frame_p;
7544  ompt_data_t *my_task_data;
7545  ompt_data_t *my_parallel_data;
7546  int ompt_team_size;
7547 
7548  if (ompt_enabled.enabled) {
7549  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7550  .ompt_task_info.frame.exit_frame.ptr);
7551  } else {
7552  exit_frame_p = &dummy;
7553  }
7554 
7555  my_task_data =
7556  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7557  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7558  if (ompt_enabled.ompt_callback_implicit_task) {
7559  ompt_team_size = team->t.t_nproc;
7560  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7561  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7562  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7563  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7564  }
7565 #endif
7566 
7567 #if KMP_STATS_ENABLED
7568  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7569  if (previous_state == stats_state_e::TEAMS_REGION) {
7570  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7571  } else {
7572  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7573  }
7574  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7575 #endif
7576 
7577  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7578  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7579 #if OMPT_SUPPORT
7580  ,
7581  exit_frame_p
7582 #endif
7583  );
7584 #if OMPT_SUPPORT
7585  *exit_frame_p = NULL;
7586  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7587 #endif
7588 
7589 #if KMP_STATS_ENABLED
7590  if (previous_state == stats_state_e::TEAMS_REGION) {
7591  KMP_SET_THREAD_STATE(previous_state);
7592  }
7593  KMP_POP_PARTITIONED_TIMER();
7594 #endif
7595 
7596 #if USE_ITT_BUILD
7597  if (__itt_stack_caller_create_ptr) {
7598  // inform ittnotify about leaving user's code
7599  if (team->t.t_stack_id != NULL) {
7600  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7601  } else {
7602  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7603  __kmp_itt_stack_callee_leave(
7604  (__itt_caller)team->t.t_parent->t.t_stack_id);
7605  }
7606  }
7607 #endif /* USE_ITT_BUILD */
7608  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7609 
7610  return rc;
7611 }
7612 
7613 void __kmp_teams_master(int gtid) {
7614  // This routine is called by all primary threads in the teams construct
7615  kmp_info_t *thr = __kmp_threads[gtid];
7616  kmp_team_t *team = thr->th.th_team;
7617  ident_t *loc = team->t.t_ident;
7618  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7619  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7620  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7621  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7622  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7623 
7624  // This thread is a new CG root. Set up the proper variables.
7625  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7626  tmp->cg_root = thr; // Make thr the CG root
7627  // Init to thread limit stored when league primary threads were forked
7628  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7629  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7630  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7631  " cg_nthreads to 1\n",
7632  thr, tmp));
7633  tmp->up = thr->th.th_cg_roots;
7634  thr->th.th_cg_roots = tmp;
7635 
7636 // Launch the league of teams now, but do not let workers execute
7637 // (they hang on the fork barrier until the next parallel region)
7638 #if INCLUDE_SSC_MARKS
7639  SSC_MARK_FORKING();
7640 #endif
7641  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7642  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7643  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7644 #if INCLUDE_SSC_MARKS
7645  SSC_MARK_JOINING();
7646 #endif
7647  // If the team size was reduced from the limit, set it to the new size
7648  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7649  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7650  // AC: last parameter "1" eliminates join barrier which won't work because
7651  // worker threads are in a fork barrier waiting for more parallel regions
7652  __kmp_join_call(loc, gtid
7653 #if OMPT_SUPPORT
7654  ,
7655  fork_context_intel
7656 #endif
7657  ,
7658  1);
7659 }
7660 
7661 int __kmp_invoke_teams_master(int gtid) {
7662  kmp_info_t *this_thr = __kmp_threads[gtid];
7663  kmp_team_t *team = this_thr->th.th_team;
7664 #if KMP_DEBUG
7665  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7666  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7667  (void *)__kmp_teams_master);
7668 #endif
7669  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7670 #if OMPT_SUPPORT
7671  int tid = __kmp_tid_from_gtid(gtid);
7672  ompt_data_t *task_data =
7673  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7674  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7675  if (ompt_enabled.ompt_callback_implicit_task) {
7676  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7677  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7678  ompt_task_initial);
7679  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7680  }
7681 #endif
7682  __kmp_teams_master(gtid);
7683 #if OMPT_SUPPORT
7684  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7685 #endif
7686  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7687  return 1;
7688 }
7689 
7690 /* This sets the requested number of threads for the next parallel region
7691  encountered by this team. Since this should be enclosed in the fork/join
7692  critical section, it should avoid race conditions with asymmetric nested
7693  parallelism. */
7694 
7695 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7696  kmp_info_t *thr = __kmp_threads[gtid];
7697 
7698  if (num_threads > 0)
7699  thr->th.th_set_nproc = num_threads;
7700 }
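
// Editor's sketch (illustrative, not part of the runtime): the usual compiler
// lowering for a num_threads clause, e.g.
//
//   #pragma omp parallel num_threads(4)
//   { do_work(omp_get_thread_num()); }
//
// is expected to call this routine as __kmp_push_num_threads(loc, gtid, 4)
// (via the __kmpc_push_num_threads entry point) just before the fork, so that
// th_set_nproc carries the request into the next __kmp_fork_call.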
7701 
7702 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7703  int num_threads) {
7704  KMP_DEBUG_ASSERT(thr);
7705  // Remember the number of threads for inner parallel regions
7706  if (!TCR_4(__kmp_init_middle))
7707  __kmp_middle_initialize(); // get internal globals calculated
7708  __kmp_assign_root_init_mask();
7709  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7710  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7711 
7712  if (num_threads == 0) {
7713  if (__kmp_teams_thread_limit > 0) {
7714  num_threads = __kmp_teams_thread_limit;
7715  } else {
7716  num_threads = __kmp_avail_proc / num_teams;
7717  }
7718  // adjust num_threads without a warning as it is not a user setting
7719  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7720  // no thread_limit clause specified - do not change thread-limit-var ICV
7721  if (num_threads > __kmp_dflt_team_nth) {
7722  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7723  }
7724  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7725  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7726  } // prevent the team size from exceeding thread-limit-var
7727  if (num_teams * num_threads > __kmp_teams_max_nth) {
7728  num_threads = __kmp_teams_max_nth / num_teams;
7729  }
7730  if (num_threads == 0) {
7731  num_threads = 1;
7732  }
7733  } else {
7734  if (num_threads < 0) {
7735  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7736  __kmp_msg_null);
7737  num_threads = 1;
7738  }
7739  // This thread will be the primary thread of the league of primary threads
7740  // Store new thread limit; old limit is saved in th_cg_roots list
7741  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7742  // num_threads = min(num_threads, nthreads-var)
7743  if (num_threads > __kmp_dflt_team_nth) {
7744  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7745  }
7746  if (num_teams * num_threads > __kmp_teams_max_nth) {
7747  int new_threads = __kmp_teams_max_nth / num_teams;
7748  if (new_threads == 0) {
7749  new_threads = 1;
7750  }
7751  if (new_threads != num_threads) {
7752  if (!__kmp_reserve_warn) { // user asked for too many threads
7753  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7754  __kmp_msg(kmp_ms_warning,
7755  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7756  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7757  }
7758  }
7759  num_threads = new_threads;
7760  }
7761  }
7762  thr->th.th_teams_size.nth = num_threads;
7763 }
7764 
7765 /* this sets the requested number of teams for the teams region and/or
7766  the number of threads for the next parallel region encountered */
7767 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7768  int num_threads) {
7769  kmp_info_t *thr = __kmp_threads[gtid];
7770  if (num_teams < 0) {
7771  // OpenMP specification requires requested values to be positive,
7772  // but people can send us any value, so we'd better check
7773  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7774  __kmp_msg_null);
7775  num_teams = 1;
7776  }
7777  if (num_teams == 0) {
7778  if (__kmp_nteams > 0) {
7779  num_teams = __kmp_nteams;
7780  } else {
7781  num_teams = 1; // default number of teams is 1.
7782  }
7783  }
7784  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7785  if (!__kmp_reserve_warn) {
7786  __kmp_reserve_warn = 1;
7787  __kmp_msg(kmp_ms_warning,
7788  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7789  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7790  }
7791  num_teams = __kmp_teams_max_nth;
7792  }
7793  // Set number of teams (number of threads in the outer "parallel" of the
7794  // teams)
7795  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7796 
7797  __kmp_push_thread_limit(thr, num_teams, num_threads);
7798 }
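
// Editor's sketch (illustrative): with the usual lowering, a teams construct
// such as
//
//   #pragma omp teams num_teams(4) thread_limit(8)
//   {
//     #pragma omp parallel
//     do_work(omp_get_team_num(), omp_get_thread_num());
//   }
//
// is expected to reach __kmp_push_num_teams(loc, gtid, 4, 8) (via
// __kmpc_push_num_teams), setting th_teams_size.nteams = 4 and letting
// __kmp_push_thread_limit() clamp the per-team thread count to 8, subject to
// nthreads-var and KMP_TEAMS_THREAD_LIMIT.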
7799 
7800 /* This sets the requested number of teams for the teams region and/or
7801  the number of threads for the next parallel region encountered */
7802 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7803  int num_teams_ub, int num_threads) {
7804  kmp_info_t *thr = __kmp_threads[gtid];
7805  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7806  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7807  KMP_DEBUG_ASSERT(num_threads >= 0);
7808 
7809  if (num_teams_lb > num_teams_ub) {
7810  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7811  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7812  }
7813 
7814  int num_teams = 1; // default number of teams is 1.
7815 
7816  if (num_teams_lb == 0 && num_teams_ub > 0)
7817  num_teams_lb = num_teams_ub;
7818 
7819  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7820  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7821  if (num_teams > __kmp_teams_max_nth) {
7822  if (!__kmp_reserve_warn) {
7823  __kmp_reserve_warn = 1;
7824  __kmp_msg(kmp_ms_warning,
7825  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7826  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7827  }
7828  num_teams = __kmp_teams_max_nth;
7829  }
7830  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7831  num_teams = num_teams_ub;
7832  } else { // num_teams_lb <= num_teams <= num_teams_ub
7833  if (num_threads <= 0) {
7834  if (num_teams_ub > __kmp_teams_max_nth) {
7835  num_teams = num_teams_lb;
7836  } else {
7837  num_teams = num_teams_ub;
7838  }
7839  } else {
7840  num_teams = (num_threads > __kmp_teams_max_nth)
7841  ? num_teams
7842  : __kmp_teams_max_nth / num_threads;
7843  if (num_teams < num_teams_lb) {
7844  num_teams = num_teams_lb;
7845  } else if (num_teams > num_teams_ub) {
7846  num_teams = num_teams_ub;
7847  }
7848  }
7849  }
7850  // Set number of teams (number of threads in the outer "parallel" of the
7851  // teams)
7852  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7853 
7854  __kmp_push_thread_limit(thr, num_teams, num_threads);
7855 }
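
// Editor's sketch (illustrative): the OpenMP 5.1 two-bound form of the clause,
// e.g.
//
//   #pragma omp teams num_teams(2 : 8)
//
// is expected to arrive here (via __kmpc_push_num_teams_51) with
// num_teams_lb = 2 and num_teams_ub = 8, and the team count chosen above then
// falls within [2, 8].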
7856 
7857 // Set the proc_bind var to use in the following parallel region.
7858 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7859  kmp_info_t *thr = __kmp_threads[gtid];
7860  thr->th.th_set_proc_bind = proc_bind;
7861 }
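
// Editor's sketch (illustrative): a proc_bind clause, e.g.
//
//   #pragma omp parallel proc_bind(close) num_threads(4)
//   { do_work(); }
//
// is expected to set th_set_proc_bind here (via __kmpc_push_proc_bind) so the
// following fork can honor the requested binding policy.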
7862 
7863 /* Launch the worker threads into the microtask. */
7864 
7865 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7866  kmp_info_t *this_thr = __kmp_threads[gtid];
7867 
7868 #ifdef KMP_DEBUG
7869  int f;
7870 #endif /* KMP_DEBUG */
7871 
7872  KMP_DEBUG_ASSERT(team);
7873  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7874  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7875  KMP_MB(); /* Flush all pending memory write invalidates. */
7876 
7877  team->t.t_construct = 0; /* no single directives seen yet */
7878  team->t.t_ordered.dt.t_value =
7879  0; /* thread 0 enters the ordered section first */
7880 
7881  /* Reset the identifiers on the dispatch buffer */
7882  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7883  if (team->t.t_max_nproc > 1) {
7884  int i;
7885  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7886  team->t.t_disp_buffer[i].buffer_index = i;
7887  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7888  }
7889  } else {
7890  team->t.t_disp_buffer[0].buffer_index = 0;
7891  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7892  }
7893 
7894  KMP_MB(); /* Flush all pending memory write invalidates. */
7895  KMP_ASSERT(this_thr->th.th_team == team);
7896 
7897 #ifdef KMP_DEBUG
7898  for (f = 0; f < team->t.t_nproc; f++) {
7899  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7900  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7901  }
7902 #endif /* KMP_DEBUG */
7903 
7904  /* release the worker threads so they may begin working */
7905  __kmp_fork_barrier(gtid, 0);
7906 }
7907 
7908 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7909  kmp_info_t *this_thr = __kmp_threads[gtid];
7910 
7911  KMP_DEBUG_ASSERT(team);
7912  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7913  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7914  KMP_MB(); /* Flush all pending memory write invalidates. */
7915 
7916  /* Join barrier after fork */
7917 
7918 #ifdef KMP_DEBUG
7919  if (__kmp_threads[gtid] &&
7920  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7921  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7922  __kmp_threads[gtid]);
7923  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7924  "team->t.t_nproc=%d\n",
7925  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7926  team->t.t_nproc);
7927  __kmp_print_structure();
7928  }
7929  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7930  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7931 #endif /* KMP_DEBUG */
7932 
7933  __kmp_join_barrier(gtid); /* wait for everyone */
7934 #if OMPT_SUPPORT
7935  if (ompt_enabled.enabled &&
7936  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7937  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7938  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7939  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7940 #if OMPT_OPTIONAL
7941  void *codeptr = NULL;
7942  if (KMP_MASTER_TID(ds_tid) &&
7943  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7944  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7945  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7946 
7947  if (ompt_enabled.ompt_callback_sync_region_wait) {
7948  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7949  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7950  codeptr);
7951  }
7952  if (ompt_enabled.ompt_callback_sync_region) {
7953  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7954  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7955  codeptr);
7956  }
7957 #endif
7958  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7959  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7960  ompt_scope_end, NULL, task_data, 0, ds_tid,
7961  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7962  }
7963  }
7964 #endif
7965 
7966  KMP_MB(); /* Flush all pending memory write invalidates. */
7967  KMP_ASSERT(this_thr->th.th_team == team);
7968 }
7969 
7970 /* ------------------------------------------------------------------------ */
7971 
7972 #ifdef USE_LOAD_BALANCE
7973 
7974 // Return the number of worker threads actively spinning in the hot team, if
7975 // we are at the outermost level of parallelism. Otherwise, return 0.
7976 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7977  int i;
7978  int retval;
7979  kmp_team_t *hot_team;
7980 
7981  if (root->r.r_active) {
7982  return 0;
7983  }
7984  hot_team = root->r.r_hot_team;
7985  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7986  return hot_team->t.t_nproc - 1; // Don't count primary thread
7987  }
7988 
7989  // Skip the primary thread - it is accounted for elsewhere.
7990  retval = 0;
7991  for (i = 1; i < hot_team->t.t_nproc; i++) {
7992  if (hot_team->t.t_threads[i]->th.th_active) {
7993  retval++;
7994  }
7995  }
7996  return retval;
7997 }
7998 
7999 // Perform an automatic adjustment to the number of
8000 // threads used by the next parallel region.
8001 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8002  int retval;
8003  int pool_active;
8004  int hot_team_active;
8005  int team_curr_active;
8006  int system_active;
8007 
8008  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8009  set_nproc));
8010  KMP_DEBUG_ASSERT(root);
8011  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8012  ->th.th_current_task->td_icvs.dynamic == TRUE);
8013  KMP_DEBUG_ASSERT(set_nproc > 1);
8014 
8015  if (set_nproc == 1) {
8016  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8017  return 1;
8018  }
8019 
8020  // Threads that are active in the thread pool, active in the hot team for this
8021  // particular root (if we are at the outer par level), and the currently
8022  // executing thread (to become the primary thread) are available to add to the
8023  // new team, but are currently contributing to the system load, and must be
8024  // accounted for.
8025  pool_active = __kmp_thread_pool_active_nth;
8026  hot_team_active = __kmp_active_hot_team_nproc(root);
8027  team_curr_active = pool_active + hot_team_active + 1;
8028 
8029  // Check the system load.
8030  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8031  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8032  "hot team active = %d\n",
8033  system_active, pool_active, hot_team_active));
8034 
8035  if (system_active < 0) {
8036  // There was an error reading the necessary info from /proc, so use the
8037  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8038  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8039  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8040  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8041 
8042  // Make this call behave like the thread limit algorithm.
8043  retval = __kmp_avail_proc - __kmp_nth +
8044  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8045  if (retval > set_nproc) {
8046  retval = set_nproc;
8047  }
8048  if (retval < KMP_MIN_NTH) {
8049  retval = KMP_MIN_NTH;
8050  }
8051 
8052  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8053  retval));
8054  return retval;
8055  }
8056 
8057  // There is a slight delay in the load balance algorithm in detecting new
8058  // running procs. The real system load at this instant should be at least as
8059  // large as the number of active OMP threads available to add to the team.
8060  if (system_active < team_curr_active) {
8061  system_active = team_curr_active;
8062  }
8063  retval = __kmp_avail_proc - system_active + team_curr_active;
8064  if (retval > set_nproc) {
8065  retval = set_nproc;
8066  }
8067  if (retval < KMP_MIN_NTH) {
8068  retval = KMP_MIN_NTH;
8069  }
8070 
8071  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8072  return retval;
8073 } // __kmp_load_balance_nproc()
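
// Worked example (editor's note, numbers are hypothetical): with
// __kmp_avail_proc = 8, two active threads in the pool, an empty hot team and
// the calling thread, team_curr_active = 2 + 0 + 1 = 3. If the system reports
// system_active = 5, then retval = 8 - 5 + 3 = 6, so a request for
// set_nproc = 8 is trimmed to 6 threads for the next parallel region.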
8074 
8075 #endif /* USE_LOAD_BALANCE */
8076 
8077 /* ------------------------------------------------------------------------ */
8078 
8079 /* NOTE: this is called with the __kmp_init_lock held */
8080 void __kmp_cleanup(void) {
8081  int f;
8082 
8083  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8084 
8085  if (TCR_4(__kmp_init_parallel)) {
8086 #if KMP_HANDLE_SIGNALS
8087  __kmp_remove_signals();
8088 #endif
8089  TCW_4(__kmp_init_parallel, FALSE);
8090  }
8091 
8092  if (TCR_4(__kmp_init_middle)) {
8093 #if KMP_AFFINITY_SUPPORTED
8094  __kmp_affinity_uninitialize();
8095 #endif /* KMP_AFFINITY_SUPPORTED */
8096  __kmp_cleanup_hierarchy();
8097  TCW_4(__kmp_init_middle, FALSE);
8098  }
8099 
8100  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8101 
8102  if (__kmp_init_serial) {
8103  __kmp_runtime_destroy();
8104  __kmp_init_serial = FALSE;
8105  }
8106 
8107  __kmp_cleanup_threadprivate_caches();
8108 
8109  for (f = 0; f < __kmp_threads_capacity; f++) {
8110  if (__kmp_root[f] != NULL) {
8111  __kmp_free(__kmp_root[f]);
8112  __kmp_root[f] = NULL;
8113  }
8114  }
8115  __kmp_free(__kmp_threads);
8116  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8117  // there is no need to free __kmp_root separately.
8118  __kmp_threads = NULL;
8119  __kmp_root = NULL;
8120  __kmp_threads_capacity = 0;
8121 
8122  // Free old __kmp_threads arrays if they exist.
8123  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8124  while (ptr) {
8125  kmp_old_threads_list_t *next = ptr->next;
8126  __kmp_free(ptr->threads);
8127  __kmp_free(ptr);
8128  ptr = next;
8129  }
8130 
8131 #if KMP_USE_DYNAMIC_LOCK
8132  __kmp_cleanup_indirect_user_locks();
8133 #else
8134  __kmp_cleanup_user_locks();
8135 #endif
8136 #if OMPD_SUPPORT
8137  if (ompd_state) {
8138  __kmp_free(ompd_env_block);
8139  ompd_env_block = NULL;
8140  ompd_env_block_size = 0;
8141  }
8142 #endif
8143 
8144 #if KMP_AFFINITY_SUPPORTED
8145  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8146  __kmp_cpuinfo_file = NULL;
8147 #endif /* KMP_AFFINITY_SUPPORTED */
8148 
8149 #if KMP_USE_ADAPTIVE_LOCKS
8150 #if KMP_DEBUG_ADAPTIVE_LOCKS
8151  __kmp_print_speculative_stats();
8152 #endif
8153 #endif
8154  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8155  __kmp_nested_nth.nth = NULL;
8156  __kmp_nested_nth.size = 0;
8157  __kmp_nested_nth.used = 0;
8158  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8159  __kmp_nested_proc_bind.bind_types = NULL;
8160  __kmp_nested_proc_bind.size = 0;
8161  __kmp_nested_proc_bind.used = 0;
8162  if (__kmp_affinity_format) {
8163  KMP_INTERNAL_FREE(__kmp_affinity_format);
8164  __kmp_affinity_format = NULL;
8165  }
8166 
8167  __kmp_i18n_catclose();
8168 
8169 #if KMP_USE_HIER_SCHED
8170  __kmp_hier_scheds.deallocate();
8171 #endif
8172 
8173 #if KMP_STATS_ENABLED
8174  __kmp_stats_fini();
8175 #endif
8176 
8177  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8178 }
8179 
8180 /* ------------------------------------------------------------------------ */
8181 
8182 int __kmp_ignore_mppbeg(void) {
8183  char *env;
8184 
8185  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8186  if (__kmp_str_match_false(env))
8187  return FALSE;
8188  }
8189  // By default __kmpc_begin() is a no-op.
8190  return TRUE;
8191 }
8192 
8193 int __kmp_ignore_mppend(void) {
8194  char *env;
8195 
8196  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8197  if (__kmp_str_match_false(env))
8198  return FALSE;
8199  }
8200  // By default __kmpc_end() is a no-op.
8201  return TRUE;
8202 }
8203 
8204 void __kmp_internal_begin(void) {
8205  int gtid;
8206  kmp_root_t *root;
8207 
8208  /* This is a very important step as it will register new sibling threads
8209  and assign these new uber threads a new gtid. */
8210  gtid = __kmp_entry_gtid();
8211  root = __kmp_threads[gtid]->th.th_root;
8212  KMP_ASSERT(KMP_UBER_GTID(gtid));
8213 
8214  if (root->r.r_begin)
8215  return;
8216  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8217  if (root->r.r_begin) {
8218  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8219  return;
8220  }
8221 
8222  root->r.r_begin = TRUE;
8223 
8224  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8225 }
8226 
8227 /* ------------------------------------------------------------------------ */
8228 
8229 void __kmp_user_set_library(enum library_type arg) {
8230  int gtid;
8231  kmp_root_t *root;
8232  kmp_info_t *thread;
8233 
8234  /* first, make sure we are initialized so we can get our gtid */
8235 
8236  gtid = __kmp_entry_gtid();
8237  thread = __kmp_threads[gtid];
8238 
8239  root = thread->th.th_root;
8240 
8241  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8242  library_serial));
8243  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8244  thread */
8245  KMP_WARNING(SetLibraryIncorrectCall);
8246  return;
8247  }
8248 
8249  switch (arg) {
8250  case library_serial:
8251  thread->th.th_set_nproc = 0;
8252  set__nproc(thread, 1);
8253  break;
8254  case library_turnaround:
8255  thread->th.th_set_nproc = 0;
8256  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8257  : __kmp_dflt_team_nth_ub);
8258  break;
8259  case library_throughput:
8260  thread->th.th_set_nproc = 0;
8261  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8262  : __kmp_dflt_team_nth_ub);
8263  break;
8264  default:
8265  KMP_FATAL(UnknownLibraryType, arg);
8266  }
8267 
8268  __kmp_aux_set_library(arg);
8269 }
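
// Editor's sketch (illustrative): the same selection is normally driven either
// by the KMP_LIBRARY environment variable, e.g.
//
//   KMP_LIBRARY=throughput ./a.out
//
// or programmatically through the kmp_set_library* extension entry points,
// which are expected to funnel into this routine.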
8270 
8271 void __kmp_aux_set_stacksize(size_t arg) {
8272  if (!__kmp_init_serial)
8273  __kmp_serial_initialize();
8274 
8275 #if KMP_OS_DARWIN
8276  if (arg & (0x1000 - 1)) {
8277  arg &= ~(0x1000 - 1);
8278  if (arg + 0x1000) /* check for overflow if we round up */
8279  arg += 0x1000;
8280  }
8281 #endif
8282  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8283 
8284  /* only change the default stacksize before the first parallel region */
8285  if (!TCR_4(__kmp_init_parallel)) {
8286  size_t value = arg; /* argument is in bytes */
8287 
8288  if (value < __kmp_sys_min_stksize)
8289  value = __kmp_sys_min_stksize;
8290  else if (value > KMP_MAX_STKSIZE)
8291  value = KMP_MAX_STKSIZE;
8292 
8293  __kmp_stksize = value;
8294 
8295  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8296  }
8297 
8298  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8299 }
8300 
8301 /* set the behaviour of the runtime library */
8302 /* TODO this can cause some odd behaviour with sibling parallelism... */
8303 void __kmp_aux_set_library(enum library_type arg) {
8304  __kmp_library = arg;
8305 
8306  switch (__kmp_library) {
8307  case library_serial: {
8308  KMP_INFORM(LibraryIsSerial);
8309  } break;
8310  case library_turnaround:
8311  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8312  __kmp_use_yield = 2; // only yield when oversubscribed
8313  break;
8314  case library_throughput:
8315  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8316  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8317  break;
8318  default:
8319  KMP_FATAL(UnknownLibraryType, arg);
8320  }
8321 }
8322 
8323 /* Getting team information common for all team API */
8324 // Returns NULL if not in teams construct
8325 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8326  kmp_info_t *thr = __kmp_entry_thread();
8327  teams_serialized = 0;
8328  if (thr->th.th_teams_microtask) {
8329  kmp_team_t *team = thr->th.th_team;
8330  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8331  int ii = team->t.t_level;
8332  teams_serialized = team->t.t_serialized;
8333  int level = tlevel + 1;
8334  KMP_DEBUG_ASSERT(ii >= tlevel);
8335  while (ii > level) {
8336  for (teams_serialized = team->t.t_serialized;
8337  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8338  }
8339  if (team->t.t_serialized && (!teams_serialized)) {
8340  team = team->t.t_parent;
8341  continue;
8342  }
8343  if (ii > level) {
8344  team = team->t.t_parent;
8345  ii--;
8346  }
8347  }
8348  return team;
8349  }
8350  return NULL;
8351 }
8352 
8353 int __kmp_aux_get_team_num() {
8354  int serialized;
8355  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8356  if (team) {
8357  if (serialized > 1) {
8358  return 0; // teams region is serialized ( 1 team of 1 thread ).
8359  } else {
8360  return team->t.t_master_tid;
8361  }
8362  }
8363  return 0;
8364 }
8365 
8366 int __kmp_aux_get_num_teams() {
8367  int serialized;
8368  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8369  if (team) {
8370  if (serialized > 1) {
8371  return 1;
8372  } else {
8373  return team->t.t_parent->t.t_nproc;
8374  }
8375  }
8376  return 1;
8377 }
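
// Editor's sketch (illustrative): these helpers back the user-visible queries
// inside a teams region, e.g.
//
//   #pragma omp teams num_teams(4)
//   printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//
// For a non-serialized teams region, the team number comes from t_master_tid
// of the enclosing team and the number of teams from the parent team's
// t_nproc, as implemented above.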
8378 
8379 /* ------------------------------------------------------------------------ */
8380 
8381 /*
8382  * Affinity Format Parser
8383  *
8384  * Field is in form of: %[[[0].]size]type
8385  * % and type are required (%% means print a literal '%')
8386  * type is either single char or long name surrounded by {},
8387  * e.g., N or {num_threads}
8388  * 0 => leading zeros
8389  * . => right justified when size is specified
8390  * by default output is left justified
8391  * size is the *minimum* field length
8392  * All other characters are printed as is
8393  *
8394  * Available field types:
8395  * t {team_num} - omp_get_team_num()
8396  * T {num_teams} - omp_get_num_teams()
8397  * L {nesting_level} - omp_get_level()
8398  * n {thread_num} - omp_get_thread_num()
8399  * N {num_threads} - omp_get_num_threads()
8400  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8401  * H {host} - name of host machine
8402  * P {process_id} - process id (integer), i {native_thread_id} - native thread id
8403  * A {thread_affinity} - comma separated list of integers or integer ranges (values of affinity mask)
8404  *
8405  * Implementation-specific field types can be added
8406  * If a type is unknown, print "undefined"
8407  */
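
/*
 * Example (editor's note, output is illustrative): using the table below, a
 * format such as
 *
 *   OMP_AFFINITY_FORMAT="host=%H pid=%P tid=%0.4n aff={%A}"
 *
 * might print a line like
 *
 *   host=node042 pid=12345 tid=0003 aff={0-3}
 *
 * where %0.4n is zero-padded and right-justified to a minimum width of 4.
 */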
8408 
8409 // Structure holding the short name, long name, and corresponding data type
8410 // for snprintf. A table of these will represent the entire valid keyword
8411 // field types.
8412 typedef struct kmp_affinity_format_field_t {
8413  char short_name; // from spec e.g., L -> thread level
8414  const char *long_name; // from spec thread_level -> thread level
8415  char field_format; // data type for snprintf (typically 'd' or 's'
8416  // for integer or string)
8417 } kmp_affinity_format_field_t;
8418 
8419 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8420 #if KMP_AFFINITY_SUPPORTED
8421  {'A', "thread_affinity", 's'},
8422 #endif
8423  {'t', "team_num", 'd'},
8424  {'T', "num_teams", 'd'},
8425  {'L', "nesting_level", 'd'},
8426  {'n', "thread_num", 'd'},
8427  {'N', "num_threads", 'd'},
8428  {'a', "ancestor_tnum", 'd'},
8429  {'H', "host", 's'},
8430  {'P', "process_id", 'd'},
8431  {'i', "native_thread_id", 'd'}};
8432 
8433 // Return the number of characters it takes to hold the field
8434 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8435  const char **ptr,
8436  kmp_str_buf_t *field_buffer) {
8437  int rc, format_index, field_value;
8438  const char *width_left, *width_right;
8439  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8440  static const int FORMAT_SIZE = 20;
8441  char format[FORMAT_SIZE] = {0};
8442  char absolute_short_name = 0;
8443 
8444  KMP_DEBUG_ASSERT(gtid >= 0);
8445  KMP_DEBUG_ASSERT(th);
8446  KMP_DEBUG_ASSERT(**ptr == '%');
8447  KMP_DEBUG_ASSERT(field_buffer);
8448 
8449  __kmp_str_buf_clear(field_buffer);
8450 
8451  // Skip the initial %
8452  (*ptr)++;
8453 
8454  // Check for %% first
8455  if (**ptr == '%') {
8456  __kmp_str_buf_cat(field_buffer, "%", 1);
8457  (*ptr)++; // skip over the second %
8458  return 1;
8459  }
8460 
8461  // Parse field modifiers if they are present
8462  pad_zeros = false;
8463  if (**ptr == '0') {
8464  pad_zeros = true;
8465  (*ptr)++; // skip over 0
8466  }
8467  right_justify = false;
8468  if (**ptr == '.') {
8469  right_justify = true;
8470  (*ptr)++; // skip over .
8471  }
8472  // Parse width of field: [width_left, width_right)
8473  width_left = width_right = NULL;
8474  if (**ptr >= '0' && **ptr <= '9') {
8475  width_left = *ptr;
8476  SKIP_DIGITS(*ptr);
8477  width_right = *ptr;
8478  }
8479 
8480  // Create the format for KMP_SNPRINTF based on flags parsed above
8481  format_index = 0;
8482  format[format_index++] = '%';
8483  if (!right_justify)
8484  format[format_index++] = '-';
8485  if (pad_zeros)
8486  format[format_index++] = '0';
8487  if (width_left && width_right) {
8488  int i = 0;
8489  // Only allow 8 digit number widths.
8490  // This also prevents overflowing the format variable
8491  while (i < 8 && width_left < width_right) {
8492  format[format_index++] = *width_left;
8493  width_left++;
8494  i++;
8495  }
8496  }
8497 
8498  // Parse a name (long or short)
8499  // Canonicalize the name into absolute_short_name
8500  found_valid_name = false;
8501  parse_long_name = (**ptr == '{');
8502  if (parse_long_name)
8503  (*ptr)++; // skip initial left brace
8504  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8505  sizeof(__kmp_affinity_format_table[0]);
8506  ++i) {
8507  char short_name = __kmp_affinity_format_table[i].short_name;
8508  const char *long_name = __kmp_affinity_format_table[i].long_name;
8509  char field_format = __kmp_affinity_format_table[i].field_format;
8510  if (parse_long_name) {
8511  size_t length = KMP_STRLEN(long_name);
8512  if (strncmp(*ptr, long_name, length) == 0) {
8513  found_valid_name = true;
8514  (*ptr) += length; // skip the long name
8515  }
8516  } else if (**ptr == short_name) {
8517  found_valid_name = true;
8518  (*ptr)++; // skip the short name
8519  }
8520  if (found_valid_name) {
8521  format[format_index++] = field_format;
8522  format[format_index++] = '\0';
8523  absolute_short_name = short_name;
8524  break;
8525  }
8526  }
8527  if (parse_long_name) {
8528  if (**ptr != '}') {
8529  absolute_short_name = 0;
8530  } else {
8531  (*ptr)++; // skip over the right brace
8532  }
8533  }
8534 
8535  // Attempt to fill the buffer with the requested
8536  // value using snprintf within __kmp_str_buf_print()
8537  switch (absolute_short_name) {
8538  case 't':
8539  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8540  break;
8541  case 'T':
8542  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8543  break;
8544  case 'L':
8545  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8546  break;
8547  case 'n':
8548  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8549  break;
8550  case 'H': {
8551  static const int BUFFER_SIZE = 256;
8552  char buf[BUFFER_SIZE];
8553  __kmp_expand_host_name(buf, BUFFER_SIZE);
8554  rc = __kmp_str_buf_print(field_buffer, format, buf);
8555  } break;
8556  case 'P':
8557  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8558  break;
8559  case 'i':
8560  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8561  break;
8562  case 'N':
8563  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8564  break;
8565  case 'a':
8566  field_value =
8567  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8568  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8569  break;
8570 #if KMP_AFFINITY_SUPPORTED
8571  case 'A': {
8572  kmp_str_buf_t buf;
8573  __kmp_str_buf_init(&buf);
8574  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8575  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8576  __kmp_str_buf_free(&buf);
8577  } break;
8578 #endif
8579  default:
8580  // According to the spec, if an implementation does not have info for a field
8581  // type, then "undefined" is printed
8582  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8583  // Skip the field
8584  if (parse_long_name) {
8585  SKIP_TOKEN(*ptr);
8586  if (**ptr == '}')
8587  (*ptr)++;
8588  } else {
8589  (*ptr)++;
8590  }
8591  }
8592 
8593  KMP_ASSERT(format_index <= FORMAT_SIZE);
8594  return rc;
8595 }
8596 
8597 /*
8598  * Return number of characters needed to hold the affinity string
8599  * (not including null byte character)
8600  * The resultant string is printed to buffer, which the caller can then
8601  * handle afterwards
8602  */
8603 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8604  kmp_str_buf_t *buffer) {
8605  const char *parse_ptr;
8606  size_t retval;
8607  const kmp_info_t *th;
8608  kmp_str_buf_t field;
8609 
8610  KMP_DEBUG_ASSERT(buffer);
8611  KMP_DEBUG_ASSERT(gtid >= 0);
8612 
8613  __kmp_str_buf_init(&field);
8614  __kmp_str_buf_clear(buffer);
8615 
8616  th = __kmp_threads[gtid];
8617  retval = 0;
8618 
8619  // If format is NULL or a zero-length string, then we use the
8620  // affinity-format-var ICV
8621  parse_ptr = format;
8622  if (parse_ptr == NULL || *parse_ptr == '\0') {
8623  parse_ptr = __kmp_affinity_format;
8624  }
8625  KMP_DEBUG_ASSERT(parse_ptr);
8626 
8627  while (*parse_ptr != '\0') {
8628  // Parse a field
8629  if (*parse_ptr == '%') {
8630  // Put field in the buffer
8631  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8632  __kmp_str_buf_catbuf(buffer, &field);
8633  retval += rc;
8634  } else {
8635  // Put literal character in buffer
8636  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8637  retval++;
8638  parse_ptr++;
8639  }
8640  }
8641  __kmp_str_buf_free(&field);
8642  return retval;
8643 }
8644 
8645 // Displays the affinity string to stdout
8646 void __kmp_aux_display_affinity(int gtid, const char *format) {
8647  kmp_str_buf_t buf;
8648  __kmp_str_buf_init(&buf);
8649  __kmp_aux_capture_affinity(gtid, format, &buf);
8650  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8651  __kmp_str_buf_free(&buf);
8652 }
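
// Editor's sketch (illustrative): these two routines sit behind the OpenMP 5.0
// affinity-display API, so user code along the lines of
//
//   char buf[512];
//   size_t n = omp_capture_affinity(buf, sizeof(buf), NULL); // NULL => ICV
//   omp_display_affinity("thread %n bound to %A");
//
// is expected to end up in __kmp_aux_capture_affinity /
// __kmp_aux_display_affinity with the calling thread's gtid.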
8653 
8654 /* ------------------------------------------------------------------------ */
8655 
8656 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8657  int blocktime = arg; /* argument is in milliseconds */
8658 #if KMP_USE_MONITOR
8659  int bt_intervals;
8660 #endif
8661  kmp_int8 bt_set;
8662 
8663  __kmp_save_internal_controls(thread);
8664 
8665  /* Normalize and set blocktime for the teams */
8666  if (blocktime < KMP_MIN_BLOCKTIME)
8667  blocktime = KMP_MIN_BLOCKTIME;
8668  else if (blocktime > KMP_MAX_BLOCKTIME)
8669  blocktime = KMP_MAX_BLOCKTIME;
8670 
8671  set__blocktime_team(thread->th.th_team, tid, blocktime);
8672  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8673 
8674 #if KMP_USE_MONITOR
8675  /* Calculate and set blocktime intervals for the teams */
8676  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8677 
8678  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8679  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8680 #endif
8681 
8682  /* Set whether blocktime has been set to "TRUE" */
8683  bt_set = TRUE;
8684 
8685  set__bt_set_team(thread->th.th_team, tid, bt_set);
8686  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8687 #if KMP_USE_MONITOR
8688  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8689  "bt_intervals=%d, monitor_updates=%d\n",
8690  __kmp_gtid_from_tid(tid, thread->th.th_team),
8691  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8692  __kmp_monitor_wakeups));
8693 #else
8694  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8695  __kmp_gtid_from_tid(tid, thread->th.th_team),
8696  thread->th.th_team->t.t_id, tid, blocktime));
8697 #endif
8698 }
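
// Editor's sketch (illustrative): this is the path taken when blocktime is
// changed at run time, e.g. via the kmp_set_blocktime() extension:
//
//   kmp_set_blocktime(0); // idle workers go to sleep immediately
//   #pragma omp parallel
//   { do_work(); }
//
// which has roughly the effect of starting with KMP_BLOCKTIME=0, applied from
// this call onward to the calling thread's teams.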
8699 
8700 void __kmp_aux_set_defaults(char const *str, size_t len) {
8701  if (!__kmp_init_serial) {
8702  __kmp_serial_initialize();
8703  }
8704  __kmp_env_initialize(str);
8705 
8706  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8707  __kmp_env_print();
8708  }
8709 } // __kmp_aux_set_defaults
8710 
8711 /* ------------------------------------------------------------------------ */
8712 /* internal fast reduction routines */
8713 
8714 PACKED_REDUCTION_METHOD_T
8715 __kmp_determine_reduction_method(
8716  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8717  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8718  kmp_critical_name *lck) {
8719 
8720  // Default reduction method: critical construct ( lck != NULL, like in current
8721  // PAROPT )
8722  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8723  // can be selected by RTL
8724  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8725  // can be selected by RTL
8726  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8727  // among those generated by PAROPT.
8728 
8729  PACKED_REDUCTION_METHOD_T retval;
8730 
8731  int team_size;
8732 
8733  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8734  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8735 
8736 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8737  (loc && \
8738  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8739 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8740 
8741  retval = critical_reduce_block;
8742 
8743  // another way of getting the team size (with 1 dynamic dereference) is slower
8744  team_size = __kmp_get_team_num_threads(global_tid);
8745  if (team_size == 1) {
8746 
8747  retval = empty_reduce_block;
8748 
8749  } else {
8750 
8751  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8752 
8753 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8754  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8755 
8756 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8757  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8758 
8759  int teamsize_cutoff = 4;
8760 
8761 #if KMP_MIC_SUPPORTED
8762  if (__kmp_mic_type != non_mic) {
8763  teamsize_cutoff = 8;
8764  }
8765 #endif
8766  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8767  if (tree_available) {
8768  if (team_size <= teamsize_cutoff) {
8769  if (atomic_available) {
8770  retval = atomic_reduce_block;
8771  }
8772  } else {
8773  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8774  }
8775  } else if (atomic_available) {
8776  retval = atomic_reduce_block;
8777  }
8778 #else
8779 #error "Unknown or unsupported OS"
8780 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8781  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8782 
8783 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8784 
8785 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8786 
8787  // basic tuning
8788 
8789  if (atomic_available) {
8790  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8791  retval = atomic_reduce_block;
8792  }
8793  } // otherwise: use critical section
8794 
8795 #elif KMP_OS_DARWIN
8796 
8797  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8798  if (atomic_available && (num_vars <= 3)) {
8799  retval = atomic_reduce_block;
8800  } else if (tree_available) {
8801  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8802  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8803  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8804  }
8805  } // otherwise: use critical section
8806 
8807 #else
8808 #error "Unknown or unsupported OS"
8809 #endif
8810 
8811 #else
8812 #error "Unknown or unsupported architecture"
8813 #endif
8814  }
8815 
8816  // KMP_FORCE_REDUCTION
8817 
8818  // If the team is serialized (team_size == 1), ignore the forced reduction
8819  // method and stay with the unsynchronized method (empty_reduce_block)
8820  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8821  team_size != 1) {
8822 
8823  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8824 
8825  int atomic_available, tree_available;
8826 
8827  switch ((forced_retval = __kmp_force_reduction_method)) {
8828  case critical_reduce_block:
8829  KMP_ASSERT(lck); // lck should be != 0
8830  break;
8831 
8832  case atomic_reduce_block:
8833  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8834  if (!atomic_available) {
8835  KMP_WARNING(RedMethodNotSupported, "atomic");
8836  forced_retval = critical_reduce_block;
8837  }
8838  break;
8839 
8840  case tree_reduce_block:
8841  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8842  if (!tree_available) {
8843  KMP_WARNING(RedMethodNotSupported, "tree");
8844  forced_retval = critical_reduce_block;
8845  } else {
8846 #if KMP_FAST_REDUCTION_BARRIER
8847  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8848 #endif
8849  }
8850  break;
8851 
8852  default:
8853  KMP_ASSERT(0); // "unsupported method specified"
8854  }
8855 
8856  retval = forced_retval;
8857  }
8858 
8859  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8860 
8861 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8862 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8863 
8864  return (retval);
8865 }
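
// Editor's sketch (illustrative): a user reduction such as
//
//   double sum = 0.0;
//   #pragma omp parallel for reduction(+ : sum)
//   for (int i = 0; i < n; ++i) sum += a[i];
//
// is lowered by the compiler to __kmpc_reduce / __kmpc_reduce_nowait calls,
// which use the method packed here to choose between the critical-section,
// atomic, and tree reduction paths for the current team size and platform.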
8866 // this function is for testing set/get/determine reduce method
8867 kmp_int32 __kmp_get_reduce_method(void) {
8868  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8869 }
8870 
8871 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8872 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8873 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8874 
8875 // Hard pause shuts down the runtime completely. Resume happens naturally when
8876 // OpenMP is used subsequently.
8877 void __kmp_hard_pause() {
8878  __kmp_pause_status = kmp_hard_paused;
8879  __kmp_internal_end_thread(-1);
8880 }
8881 
8882 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8883 void __kmp_resume_if_soft_paused() {
8884  if (__kmp_pause_status == kmp_soft_paused) {
8885  __kmp_pause_status = kmp_not_paused;
8886 
8887  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8888  kmp_info_t *thread = __kmp_threads[gtid];
8889  if (thread) { // Wake it if sleeping
8890  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8891  thread);
8892  if (fl.is_sleeping())
8893  fl.resume(gtid);
8894  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8895  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8896  } else { // thread holds the lock and may sleep soon
8897  do { // until either the thread sleeps, or we can get the lock
8898  if (fl.is_sleeping()) {
8899  fl.resume(gtid);
8900  break;
8901  } else if (__kmp_try_suspend_mx(thread)) {
8902  __kmp_unlock_suspend_mx(thread);
8903  break;
8904  }
8905  } while (1);
8906  }
8907  }
8908  }
8909  }
8910 }
8911 
8912 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8913 // TODO: add warning messages
8914 int __kmp_pause_resource(kmp_pause_status_t level) {
8915  if (level == kmp_not_paused) { // requesting resume
8916  if (__kmp_pause_status == kmp_not_paused) {
8917  // error message about runtime not being paused, so can't resume
8918  return 1;
8919  } else {
8920  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8921  __kmp_pause_status == kmp_hard_paused);
8922  __kmp_pause_status = kmp_not_paused;
8923  return 0;
8924  }
8925  } else if (level == kmp_soft_paused) { // requesting soft pause
8926  if (__kmp_pause_status != kmp_not_paused) {
8927  // error message about already being paused
8928  return 1;
8929  } else {
8930  __kmp_soft_pause();
8931  return 0;
8932  }
8933  } else if (level == kmp_hard_paused) { // requesting hard pause
8934  if (__kmp_pause_status != kmp_not_paused) {
8935  // error message about already being paused
8936  return 1;
8937  } else {
8938  __kmp_hard_pause();
8939  return 0;
8940  }
8941  } else {
8942  // error message about invalid level
8943  return 1;
8944  }
8945 }
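
// Editor's sketch (illustrative): this maps onto the OpenMP 5.0 pause API, so
// a host-side call such as
//
//   omp_pause_resource_all(omp_pause_soft); // park idle threads
//   /* ... serial phase ... */
//   #pragma omp parallel // resumes implicitly on next use
//   { do_work(); }
//
// is expected to reach __kmp_pause_resource(kmp_soft_paused) through
// __kmpc_pause_resource, while a hard pause additionally shuts the runtime
// down via __kmp_hard_pause().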
8946 
8947 void __kmp_omp_display_env(int verbose) {
8948  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8949  if (__kmp_init_serial == 0)
8950  __kmp_do_serial_initialize();
8951  __kmp_display_env_impl(!verbose, verbose);
8952  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8953 }
8954 
8955 // The team size is changing, so distributed barrier must be modified
8956 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8957  int new_nthreads) {
8958  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8959  bp_dist_bar);
8960  kmp_info_t **other_threads = team->t.t_threads;
8961 
8962  // We want all the workers to stop waiting on the barrier while we adjust the
8963  // size of the team.
8964  for (int f = 1; f < old_nthreads; ++f) {
8965  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8966  // Ignore threads that are already inactive or not present in the team
8967  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8968  // the teams construct causes thread_limit to get passed in, and some of
8969  // those threads could be inactive; just ignore them
8970  continue;
8971  }
8972  // If thread is transitioning still to in_use state, wait for it
8973  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8974  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8975  KMP_CPU_PAUSE();
8976  }
8977  // The thread should be in_use now
8978  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8979  // Transition to unused state
8980  team->t.t_threads[f]->th.th_used_in_team.store(2);
8981  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8982  }
8983  // Release all the workers
8984  team->t.b->go_release();
8985 
8986  KMP_MFENCE();
8987 
8988  // Workers should see transition status 2 and move to 0; but may need to be
8989  // woken up first
8990  int count = old_nthreads - 1;
8991  while (count > 0) {
8992  count = old_nthreads - 1;
8993  for (int f = 1; f < old_nthreads; ++f) {
8994  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8995  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8996  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8997  void *, other_threads[f]->th.th_sleep_loc);
8998  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8999  }
9000  } else {
9001  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9002  count--;
9003  }
9004  }
9005  }
9006  // Now update the barrier size
9007  team->t.b->update_num_threads(new_nthreads);
9008  team->t.b->go_reset();
9009 }
9010 
9011 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9012  // Add the threads back to the team
9013  KMP_DEBUG_ASSERT(team);
9014  // Threads were paused and pointed at th_used_in_team temporarily during a
9015  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9016  // the thread that it should transition itself back into the team. Then, if
9017  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9018  // to wake it up.
9019  for (int f = 1; f < new_nthreads; ++f) {
9020  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9021  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9022  3);
9023  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9024  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9025  (kmp_flag_32<false, false> *)NULL);
9026  }
9027  }
9028  // The threads should be transitioning to the team; when they are done, they
9029  // should have set th_used_in_team to 1. This loop forces the primary thread
9030  // to wait until all threads have moved into the team and are waiting in the barrier.
9031  int count = new_nthreads - 1;
9032  while (count > 0) {
9033  count = new_nthreads - 1;
9034  for (int f = 1; f < new_nthreads; ++f) {
9035  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9036  count--;
9037  }
9038  }
9039  }
9040 }
9041 
9042 // Globals and functions for hidden helper task
9043 kmp_info_t **__kmp_hidden_helper_threads;
9044 kmp_info_t *__kmp_hidden_helper_main_thread;
9045 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9046 #if KMP_OS_LINUX
9047 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9048 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9049 #else
9050 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9051 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9052 #endif
9053 
9054 namespace {
9055 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9056 
9057 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9058  // This is an explicit synchronization of all hidden helper threads, in case
9059  // a regular thread pushes a hidden helper task to a hidden helper thread
9060  // that has not yet been woken since being released by the main thread after
9061  // the team was created.
9062  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9063  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9064  __kmp_hidden_helper_threads_num)
9065  ;
9066 
9067  // If main thread, then wait for signal
9068  if (__kmpc_master(nullptr, *gtid)) {
9069  // First, unset the initial state and release the initial thread
9070  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9071  __kmp_hidden_helper_initz_release();
9072  __kmp_hidden_helper_main_thread_wait();
9073  // Now wake up all worker threads
9074  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9075  __kmp_hidden_helper_worker_thread_signal();
9076  }
9077  }
9078 }
9079 } // namespace
9080 
9081 void __kmp_hidden_helper_threads_initz_routine() {
9082  // Create a new root for hidden helper team/threads
9083  const int gtid = __kmp_register_root(TRUE);
9084  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9085  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9086  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9087  __kmp_hidden_helper_threads_num;
9088 
9089  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9090 
9091  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9092 
9093  // Set the initialization flag to FALSE
9094  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9095 
9096  __kmp_hidden_helper_threads_deinitz_release();
9097 }
9098 
9099 /* Nesting Mode:
9100  Set via KMP_NESTING_MODE, which takes an integer.
9101  Note: we skip duplicate topology levels, and skip levels with only
9102  one entity.
9103  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9104  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9105  in the topology, and initializes the number of threads at each of those
9106  levels to the number of entities at each level, respectively, below the
9107  entity at the parent level.
9108  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9109  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9110  the user to turn nesting on explicitly. This is an even more experimental
9111  option to this experimental feature, and may change or go away in the
9112  future.
9113 */
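
/* Example (editor's note, numbers are hypothetical): on a machine with 2
   sockets, 16 cores per socket and 2 hardware threads per core, running with
     KMP_NESTING_MODE=1
   would turn nesting on and seed nthreads-var per level roughly as 2 (sockets),
   then 16 (cores per socket), then 2 (threads per core), with duplicate or
   single-entity levels skipped as described above. */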
9114 
9115 // Allocate space to store nesting levels
9116 void __kmp_init_nesting_mode() {
9117  int levels = KMP_HW_LAST;
9118  __kmp_nesting_mode_nlevels = levels;
9119  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9120  for (int i = 0; i < levels; ++i)
9121  __kmp_nesting_nth_level[i] = 0;
9122  if (__kmp_nested_nth.size < levels) {
9123  __kmp_nested_nth.nth =
9124  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9125  __kmp_nested_nth.size = levels;
9126  }
9127 }
9128 
9129 // Set # threads for top levels of nesting; must be called after topology set
9130 void __kmp_set_nesting_mode_threads() {
9131  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9132 
9133  if (__kmp_nesting_mode == 1)
9134  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9135  else if (__kmp_nesting_mode > 1)
9136  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9137 
9138  if (__kmp_topology) { // use topology info
9139  int loc, hw_level;
9140  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9141  loc < __kmp_nesting_mode_nlevels;
9142  loc++, hw_level++) {
9143  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9144  if (__kmp_nesting_nth_level[loc] == 1)
9145  loc--;
9146  }
9147  // Make sure all cores are used
9148  if (__kmp_nesting_mode > 1 && loc > 1) {
9149  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9150  int num_cores = __kmp_topology->get_count(core_level);
9151  int upper_levels = 1;
9152  for (int level = 0; level < loc - 1; ++level)
9153  upper_levels *= __kmp_nesting_nth_level[level];
9154  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9155  __kmp_nesting_nth_level[loc - 1] =
9156  num_cores / __kmp_nesting_nth_level[loc - 2];
9157  }
9158  __kmp_nesting_mode_nlevels = loc;
9159  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9160  } else { // no topology info available; provide a reasonable guesstimation
9161  if (__kmp_avail_proc >= 4) {
9162  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9163  __kmp_nesting_nth_level[1] = 2;
9164  __kmp_nesting_mode_nlevels = 2;
9165  } else {
9166  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9167  __kmp_nesting_mode_nlevels = 1;
9168  }
9169  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9170  }
9171  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9172  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9173  }
9174  set__nproc(thread, __kmp_nesting_nth_level[0]);
9175  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9176  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9177  if (get__max_active_levels(thread) > 1) {
9178  // if max levels was set, set nesting mode levels to same
9179  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9180  }
9181  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9182  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9183 }