1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
 herr_t
 ss_debug_env(MPI_Comm UNUSED_SERIAL comm,       /* The library communicator. Pass any integer value when using a version of
                                                  * SSlib compiled without MPI support. */
          const char *s_const                    /* Optional string to use instead of looking at the SSLIB_DEBUG environment
                                                  * variable. Pass null to use SSLIB_DEBUG instead. Passing an empty string
                                                  * (or all white space) accomplishes nothing. Task zero broadcasts this string
                                                  * to all the other tasks. */
          )
 {
     SS_ENTER(ss_debug_env, herr_t);
     const char  *s_hdf5=NULL;                   /* value of the HDF5_DEBUG environment variable */
     int         s_len[2];                       /* length of SSLIB_DEBUG and HDF5_DEBUG environment variable values */
     char        *s=NULL, *s_here=NULL;          /* malloc'd version of SSLIB_DEBUG's value; `s' is for strtok() 1st arg */
     char        *rest=NULL;                     /* ptr into `s' for first char after parsing a number with strtol() */
     int         fd=2;                           /* file number for output, defaults to stderr */
     int         self, ntasks;                   /* MPI rank and size of COMM */
     char        buf1[1024], buf2[1024], buf3[1024];/* temporary buffers */
     char        *filename=NULL;                 /* Name of file to be opened */
     hbool_t     *task_enabled=NULL;             /* array specifying which tasks are affected by the following terms */
     hbool_t     debugger_started=FALSE;         /* has the `debugger' keyword been processed already? */
     long        nl;                             /* return value from strtol() */
     size_t      nchars;
     int         i, n, sign, nterms;
     const char  *source=NULL;

     /* Initializations */
     if ((self=ss_mpi_comm_rank(comm))<0) SS_ERROR(FAILED);
     if ((ntasks=ss_mpi_comm_size(comm))<0) SS_ERROR(FAILED);
     if (NULL==(task_enabled=malloc(ntasks*sizeof(*task_enabled)))) SS_ERROR(RESOURCE);
     for (i=0; i<ntasks; i++) task_enabled[i] = TRUE;

     /* Task zero gets the values of SSLIB_DEBUG and HDF5_DEBUG and broadcasts their lengths to the other tasks. We use
      * `int' for the string lengths because the MPI_SIZE_T type isn't initialized by ss_mpi_init() until after we've
      * parsed the SSLIB_DEBUG environment variable. */
     if (0==self) {
         if (s_const) {
             source = "supplied string";
         } else {
             source = "$SSLIB_DEBUG";
             s_const = getenv("SSLIB_DEBUG");
         }
         s_len[0] = s_const ? (int)strlen(s_const)+1 : 0;
         s_hdf5 = getenv("HDF5_DEBUG");
         s_len[1] = s_hdf5 ? (int)strlen(s_hdf5)+1 : 0;
     }
     if (ss_mpi_bcast(&s_len, 2, MPI_INT, 0, comm)<0) SS_ERROR(FAILED);

     if (s_len[0]>0) {
         /* Broadcast the SSLIB_ERROR value */
         if (NULL==(s=s_here=malloc((size_t)MAX(s_len[0], s_len[1])))) SS_ERROR(RESOURCE);
         if (0==self) strcpy(s, s_const);
         if (ss_mpi_bcast(s, (size_t)(s_len[0]), MPI_CHAR, 0, comm)<0) SS_ERROR(FAILED);

         /* Parse the SSLIB_ERROR value */
         while (s && *s) {
             if (s[0]==';') {
                 s++;
             } else if (!strncmp(s, "task=", 5)) {
                 /* Select certain tasks */
                 s += 5;
                 sign = 0;
                 nterms = 0;

                 while (*s && ';'!=*s) {

                     /* Get the leading `+' or `-' sign. It applies to all subsequent task numbers in the list */
                     while (*s && isspace(*s)) s++;
                     if ('-'==*s) {
                         sign = -1;
                         s++;
                     } else if ('+'==*s) {
                         sign = 1;
                         s++;
                     }

                     while (*s && isspace(*s)) s++;
                     if (!strncmp(s, "all", 3)) {
                         for (i=0; i<ntasks; i++) task_enabled[i] = TRUE;
                         s += 3;
                     } else if (!strncmp(s, "none", 4)) {
                         for (i=0; i<ntasks; i++) task_enabled[i] = FALSE;
                         s += 4;
                     } else if ((nl=strtol(s, &rest, 0)) || rest!=s) {
                         if (nl<0 || nl>=ntasks) SS_ERROR_FMT(DOMAIN, ("task rank %ld is not valid in %s", nl, source));
                         if (sign>0) {
                             task_enabled[nl] = TRUE;
                         } else if (sign<0) {
                             task_enabled[nl] = FALSE;
                         } else {
                             if (0==nterms) {
                                 for (i=0; i<ntasks; i++) task_enabled[i] = FALSE;
                             }
                             task_enabled[nl] = TRUE;
                         }
                         s = rest;
                     } else {
                         SS_ERROR_FMT(USAGE, ("malformed task specification in %s at: %s", source, s));
                     }
                     if (','==*s) s++;
                     nterms++;
                 }

             } else if (!strncmp(s, "error=", 6)) {
                 if ((0==(nl=strtol(s+6, &rest, 0)) && rest==s) || (*rest!=';' && *rest!='\0'))
                     SS_ERROR_FMT(USAGE, ("malformed error number in %s at: %s", source, s));
                 if (task_enabled[self]) {
                     sslib_g.show_error_ids = TRUE;
                     sslib_g.debug_error = nl;
                 }
                 s = rest;

             } else if (!strncmp(s, "error", 5) && (';'==s[5] || !s[5])) {
                 if (task_enabled[self]) sslib_g.show_error_ids = TRUE;
                 s += 5;

             } else if (!strncmp(s, "file=", 5)) {
                 /* Select file descriptor and/or open a file */
                 s += 5;

                 /* Look for a leading integer optionally followed by a comma */
                 if ((0==(nl=strtol(s, &rest, 0)) && rest==s) ||                 /* if no leading integer, or ... */
                     (*rest!=',' && *rest!=';' && *rest!='\0')) {                /* something other than ',' or ';' follows... */
                     n = -1;                                                     /* then there is no leading file descriptor. */
                 } else {
                     SS_ASSERT(nl<=INT_MAX);
                     n = (int)nl;
                     s = ','==*rest ? rest+1 : rest;                             /* otherwise skip the descriptor and ',' */
                 }

                 /* Look for a file name. It is an error if there's no name and the destination fd is negative */
                 if ((!s || !*s || ';'==*s) && n<0) {
                     SS_ERROR_FMT(USAGE, ("malformed file term in %s", source));
                 } else if (!strncmp(s, "none", 4) && (';'==s[4] || !s[4])) {
                     if (task_enabled[self]) {
                         s += 4;
                         if (n>=0) SS_ERROR_FMT(USAGE, ("file descriptor and `none' both specified in %s", source));
                         fd = -1;
                     }
                 } else if (*s && ';'!=*s) {
                     if ((rest=strchr(s, ';'))) {
                         SS_ASSERT((size_t)(rest-s) < sizeof buf1);
                         strncpy(buf1, s, (size_t)(rest-s));
                         buf1[rest-s] = '\0';
                         sprintf(buf2, buf1, self);
                         sprintf(buf3, buf1, -1); /*for comparison later*/
                         s = rest;
                     } else {
                         sprintf(buf2, s, self);
                         sprintf(buf3, s, -1); /*for comparison later*/
                         s = NULL;
                     }
                     if ('<'==buf2[0]) {
                         /* The file is being opened for reading. */
                         if (task_enabled[self]) {
                             filename = buf2+1;
                             fd = open(filename, O_RDONLY);
                             if (fd<0) SS_ERROR_FMT(FAILED, ("cannot open file `%s': %s", filename, strerror(errno)));
                         }
                     } else if (!strcmp(buf2, buf3)) {
                         /* All tasks are opening the same file. The lowest numbered task will create or truncate the file and
                          * then everyone else will open it for appending. This prevents tasks from clobbering each other's
                          * output. */
                         for (i=0; i<ntasks; i++) if (task_enabled[i]) break;
                         if (i==self) {
                             /* The first task should create/truncate the file */
                             fd = open(buf2, O_RDWR|O_CREAT|O_TRUNC|O_APPEND, 0666);
                             if (fd<0) SS_ERROR_FMT(FAILED, ("cannot create file `%s': %s", buf2, strerror(errno)));
                         }
                         ss_mpi_barrier(comm);
                         if (task_enabled[self] && i!=self) {
                             /* All other tasks just open the file */
                             fd = open(buf2, O_RDWR|O_APPEND, 0666);
                             if (fd<0) SS_ERROR_FMT(FAILED, ("cannot open file `%s': %s", buf2, strerror(errno)));
                         }
                     } else if (task_enabled[self]) {
                         /* All tasks are opening different files */
                         fd = open(buf2, O_RDWR|O_CREAT|O_TRUNC|O_APPEND, 0666);
                         if (fd<0) SS_ERROR_FMT(FAILED, ("cannot create file `%s': %s", buf2, strerror(errno)));
                     }
                     if (task_enabled[self] && n>=0) {
                         if (dup2(fd, n)<0) SS_ERROR_FMT(FAILED, ("dup2 failed: %s", strerror(errno)));
                         close(fd);
                         fd = n;
                     }
                 } else if (n>0) {
                     if (task_enabled[self]) fd = n;
                 }

             } else if (!strncmp(s, "stop", 4) && (';'==s[4] || !s[4])) {
                 /* Stop all affected tasks immediately */
                 s += 4;
                 if (task_enabled[self]) {
 #ifdef HAVE_KILL
                     sprintf(buf1, "SSLIB: MPI task %d, PID %d is stopping with SIGSTOP\n", self, getpid());
                     if (strlen(buf1)!=(size_t)write(fd, buf1, strlen(buf1))) write(2, buf1, strlen(buf1));
                     kill(getpid(), SIGSTOP);
 #else
                     SS_ERROR_FMT(NOTIMP, ("keyword `stop' is not supported in %s on this platform", source));
 #endif
                 }

             } else if (!strncmp(s, "pause=", 6)) {
                 if ((0==(nl=strtol(s+6, &rest, 0)) && rest==s) || nl<0 || (*rest!=';' && *rest!='\0'))
                     SS_ERROR_FMT(USAGE, ("malformed pause value in %s at: %s", source, s));
                 if (task_enabled[self]) {
                     fprintf(stderr, "SSLIB: MPI task %d, PID %d is pausing for %ld second%s\n", self, getpid(), nl, 1==nl?"":"s");
 #ifdef WIN32
                     if (nl) Sleep((unsigned)nl*1000);
 #else
                     if (nl) sleep((unsigned)nl);
 #endif
                 }
                 s = rest;

             } else if (!strncmp(s, "banner=", 7)) {
                 s += 7;
                 for (nchars=0; s[nchars] && ';'!=s[nchars]; nchars++) /*void*/;
                 sslib_g.banner = SS_FREE(sslib_g.banner);
                 if (nchars) {
                     if (NULL==(sslib_g.banner=malloc(nchars+1))) SS_ERROR_FMT(RESOURCE, ("banner string"));
                     strncpy(sslib_g.banner, s, nchars);
                     sslib_g.banner[nchars] = '\0';
                 }
                 s += nchars;

             } else if (!strncmp(s, "debugger=", 9)) {
                 s += 9;
                 if ((rest = strchr(s, ';'))) {
                     nchars = MIN(sizeof(sslib_g.debugger)-1, (size_t)(rest-s));
                     strncpy(sslib_g.debugger, s, nchars);
                     sslib_g.debugger[nchars] = '\0';
                     s = rest;
                 } else {
                     nchars = MIN(sizeof(sslib_g.debugger)-1, strlen(s));
                     strncpy(sslib_g.debugger, s, nchars);
                     sslib_g.debugger[nchars] = '\0';
                     s = NULL;
                 }

             } else if (!strncmp(s, "debug", 5) && (';'==s[5] || !s[5])) {

                 /* Spawn a debugger and attach it to the first affected task */
                 s += 5;
                 if (task_enabled[self]) {
                     if (ss_debug_start(sslib_g.debugger)<0) SS_ERROR(FAILED);
                     debugger_started = TRUE;
                     sslib_g.debug_signal = FALSE;
                 }

             } else if (!strncmp(s, "signal", 6) && (';'==s[6] || !s[6])) {
                 /* Cause certain program error signals to start a debugger */
 #ifdef HAVE_SIGACTION
                 struct sigaction newaction;
                 s += 6;
                 if (task_enabled[self] && !debugger_started) {
                     sslib_g.debug_signal = TRUE;
                     newaction.sa_handler = ss_debug_signal;
                     sigemptyset(&(newaction.sa_mask));
                     newaction.sa_flags = 0;
                     sigaction(SIGABRT, &newaction, NULL);
                     sigaction(SIGSEGV, &newaction, NULL);
                     sigaction(SIGILL,  &newaction, NULL);
                     sigaction(SIGBUS,  &newaction, NULL);
                     sigaction(SIGFPE,  &newaction, NULL);
                 }
 #else
                 SS_ERROR_FMT(NOTIMP, ("keyword `signal' not supported in %s on this platform", source));
 #endif

             } else if (!strncmp(s, "stack", 5) && (';'==s[5] || !s[5])) {
                 /* Set file descriptor for stack traces */
                 s += 5;
                 if (task_enabled[self]) ss_err_cntl_g.fd = fd;

             } else if (!strncmp(s, "pid", 3) && (';'==s[3] || !s[3])) {
 #ifdef HAVE_GETPID
                 s += 3;
                 if (task_enabled[self]) {
                     sprintf(buf1, "SSLIB: MPI task %d has PID %d\n", self, getpid());
                     if (strlen(buf1)!=(size_t)write(fd, buf1, strlen(buf1))) write(2, buf1, strlen(buf1));
                 }
 #else
                 SS_ERROR_FMT(NOTIMP, ("keyword `pid' not supported in %s on this platform", source));
 #endif

             } else if (!strncmp(s, "commands", 8) && (';'==s[8] || !s[8])) {
                 s += 8;
                 if (task_enabled[self]) sslib_g.command_fd = fd;

             } else if (!strncmp(s, "mpi", 3) && (';'==s[3] || !s[3])) {
                 s += 3;
                 if (task_enabled[self]) sslib_g.ignore_mpierror = TRUE;

             } else if (!strncmp(s, "warnings", 8) && (';'==s[8] || !s[8])) {
                 s += 8;
                 if (task_enabled[self]) sslib_g.warnings = fdopen(fd, "w"); /*do not close old one*/

             } else if (!strncmp(s, "check=", 6)) {
                 /* check=sync        -- turn on default synchronization checking/debugging options
                  * check=!sync       -- turn off all synchronization checking/debugging options
                  * check=sync=bcast  -- turn on sync bcast debugging
                  * check=!sync=bcast -- turn off sync bcast debugging
                  * etc. */
                 s += 6;
                 while (*s && ';'!=*s) {
                     hbool_t turn_off=FALSE;
                     if ('-'==*s) {
                         turn_off = TRUE;
                         s++;
                     }
                     if (!strncmp(s, "sync", 4) && (';'==s[4] || ','==s[4] || '='==s[4] || !s[4])) {
                         s += 4;
                         if ('='==*s) {
                             s++;
                             while (*s && ';'!=*s) {
                                 if (!strncmp(s, "error", 5) && (';'==s[5] || ','==s[5] || !s[5])) {
                                     s += 5;
                                     if (task_enabled[self]) sslib_g.sync_check = turn_off ? FALSE : SS_STRICT;
                                 } else if (!strncmp(s, "bcast", 5) && (';'==s[5] || ','==s[5] || !s[5])) {
                                     s += 5;
                                     if (task_enabled[self]) sslib_g.sync_bcast = turn_off ? FALSE : TRUE;
                                 } else {
                                     SS_ERROR_FMT(USAGE, ("unknown attribute for check=%ssync in %s at: %s",
                                                          turn_off?"-":"", source, s));
                                 }
                                 while (*s && ','==*s) s++;
                             }
                         } else if (task_enabled[self]) {
                             if (turn_off) {
                                 sslib_g.sync_check = FALSE;
                                 sslib_g.sync_bcast = FALSE;
                             } else {
                                 /* Default values */
                                 sslib_g.sync_check = TRUE;
                                 sslib_g.sync_bcast = FALSE;
                             }
                         }
                     } else if (!strncmp(s, "2pio", 4) && (';'==s[4] || ','==s[4] || !s[4])) {
                         s += 4;
                         sslib_g.tpio_alloc = turn_off ? FALSE : TRUE; /*regardless of task_enabled[]*/
                     } else {
                         SS_ERROR_FMT(USAGE, ("unknown category for check in %s at: %s%s",
                                              source, turn_off?"-":"", s));
                     }
                     while (*s && ','==*s) s++;
                 }

             } else {
                 SS_ERROR_FMT(USAGE, ("unknown term in %s at: %s", source, s));
             }

             while (s && ';'==*s) s++;
         }

 #if 0 /* disable because there is no H5debug_mask() yet. --rpm 2003-09-12 */
         /* Broadcast the HDF5_DEBUG value and reinitialize HDF5's debug settings */
         if (H5debug_mask("-all")<0) SS_ERROR(HDF5);
         if (s_hdf5) {
             if (0==self) strcpy(s_here, s_hdf5);
             if (MPI_Bcast(s_here, s_len[1], MPI_CHAR, 0, comm)) SS_ERROR(MPI);
             if (H5debug_mask(s_here)<0) SS_ERROR(HDF5);
         }
 #endif

         /* Cleanup */
         s_here = SS_FREE(s_here);
     }
  task_enabled = SS_FREE(task_enabled);

  SS_CLEANUP:
     SS_FREE(s_here);
     SS_FREE(task_enabled);

     SS_LEAVE(0);
 }