File size: 9,699 Bytes
6be3106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
/*
 * Copyright 2020 NVIDIA Corporation. All rights reserved.
 *
 * Sample CUPTI based injection to attach and detach CUPTI
 * For detaching, it uses CUPTI API cuptiFinalize().
 *
 * It is recommended to invoke API cuptiFinalize() in the
 * exit callsite of any CUDA Driver/Runtime API.
 *
 * API cuptiFinalize() destroys and cleans up all the
 * resources associated with CUPTI in the current process.
 * After CUPTI detaches from the process, the process will
 * keep on running with no CUPTI attached to it.
 *
 * CUPTI can be attached by calling any CUPTI API as CUPTI
 * supports lazy initialization. Any subsequent CUPTI API
 * call will reinitialize the CUPTI.
 *
 * You can attach and detach CUPTI any number of times.
 *
 * After building the sample, set the following environment variable
 * export CUDA_INJECTION64_PATH=<full_path>/libCuptiFinalize.so
 * Add CUPTI library in LD_LIBRARY_PATH and run any CUDA sample
 * with runtime more than 10 sec for demonstration of the
 * CUPTI sample
 */

#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <stdlib.h>

#include <cuda.h>
#include <cupti.h>

#define STDCALL

#if defined(__cplusplus)
extern "C" {
#endif

// MAcros

#define CUPTI_CALL(call)                                                       \
do {                                                                           \
    CUptiResult _status = call;                                                \
    if (_status != CUPTI_SUCCESS) {                                            \
        const char *errstr;                                                    \
        cuptiGetResultString(_status, &errstr);                                \
        fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
                __FILE__, __LINE__, #call, errstr);                            \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
} while (0)

#define PTHREAD_CALL(call)                                                         \
do {                                                                               \
    int _status = call;                                                            \
    if (_status != 0) {                                                            \
        fprintf(stderr, "%s:%d: error: function %s failed with error code %d.\n",  \
                __FILE__, __LINE__, #call, _status);                               \
        exit(EXIT_FAILURE);                                                        \
    }                                                                              \
} while (0)

#define BUF_SIZE (8 * 1024 * 1024)  // 8MB
#define ALIGN_SIZE (8)
#define ALIGN_BUFFER(buffer, align)                                            \
    (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))

// Global Structure

typedef struct {
    volatile uint32_t initialized;
    CUpti_SubscriberHandle  subscriber;
    volatile uint32_t detachCupti;
    int frequency;
    int tracingEnabled;
    int terminateThread;
    uint64_t kernelsTraced;
    pthread_t dynamicThread;
    pthread_mutex_t mutexFinalize;
    pthread_cond_t mutexCondition;
} injGlobalControl;
injGlobalControl globalControl;

// Function Declarations

static CUptiResult cuptiInitialize(void);

static void atExitHandler(void);

void CUPTIAPI callbackHandler(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, void *cbInfo);

extern int STDCALL InitializeInjection(void);

#if defined(__cplusplus)
}
#endif

// Function Definitions

static void
globalControlInit(void) {
    globalControl.initialized = 0;
    globalControl.subscriber = 0;
    globalControl.detachCupti = 0;
    globalControl.frequency = 2; // in seconds
    globalControl.tracingEnabled = 0;
    globalControl.terminateThread = 0;
    globalControl.kernelsTraced = 0;
    globalControl.mutexFinalize = PTHREAD_MUTEX_INITIALIZER;
    globalControl.mutexCondition = PTHREAD_COND_INITIALIZER;
}

void registerAtExitHandler(void) {
    // Register atExitHandler
    atexit(&atExitHandler);
}

static void
printSummary(void) {
    printf("\n-------------------------------------------------------------------\n");
    printf("\tKernels traced : %llu", (unsigned long long)globalControl.kernelsTraced);
    printf("\n-------------------------------------------------------------------\n");
}

static void
atExitHandler(void) {
    globalControl.terminateThread = 1;

    // Force flush
    if(globalControl.tracingEnabled) {
        CUPTI_CALL(cuptiActivityFlushAll(1));
    }

    PTHREAD_CALL(pthread_join(globalControl.dynamicThread, NULL));
    printSummary();
}

static void CUPTIAPI
bufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
    uint8_t *rawBuffer;

    *size = BUF_SIZE;
    rawBuffer = (uint8_t *)malloc(*size + ALIGN_SIZE);

    *buffer = ALIGN_BUFFER(rawBuffer, ALIGN_SIZE);
    *maxNumRecords = 0;

    if (*buffer == NULL) {
        printf("Error: Out of memory.\n");
        exit(EXIT_FAILURE);
    }
}

static void CUPTIAPI
bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
    CUptiResult status;
    CUpti_Activity *record = NULL;
    size_t dropped;

    do {
        status = cuptiActivityGetNextRecord(buffer, validSize, &record);
        if (status == CUPTI_SUCCESS) {
            CUpti_ActivityKind kind = record->kind;
            switch (kind) {
            case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
                globalControl.kernelsTraced++;
                break;
            default:
                break;
            }
        }
        else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            break;
        }
        else {
            CUPTI_CALL(status);
        }
    } while (1);

    // Report any records dropped from the queue
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    if (dropped != 0) {
        printf("Dropped %u activity records.\n", (unsigned int)dropped);
    }
    free(buffer);
}

static CUptiResult
cuptiInitialize(void) {
    CUptiResult status = CUPTI_SUCCESS;

    CUPTI_CALL(cuptiSubscribe(&globalControl.subscriber, (CUpti_CallbackFunc)callbackHandler, NULL));

    // Subscribe Driver and Runtime callbacks to call cuptiFinalize in the entry/exit callback of these APIs
    CUPTI_CALL(cuptiEnableDomain(1, globalControl.subscriber, CUPTI_CB_DOMAIN_RUNTIME_API));
    CUPTI_CALL(cuptiEnableDomain(1, globalControl.subscriber, CUPTI_CB_DOMAIN_DRIVER_API));

    // Enable CUPTI activities
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
    CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));

    // Register buffer callbacks
    CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));

    return status;
}

void CUPTIAPI
callbackHandler(void *userdata, CUpti_CallbackDomain domain,
    CUpti_CallbackId cbid, void *cbdata) {
    const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;

    // Check last error
    CUPTI_CALL(cuptiGetLastError());

    // This code path is taken only when we wish to perform the CUPTI teardown
    if (globalControl.detachCupti) {
        switch(domain) {
        case CUPTI_CB_DOMAIN_RUNTIME_API:
        case CUPTI_CB_DOMAIN_DRIVER_API:
            if (cbInfo->callbackSite == CUPTI_API_EXIT) {
                // Detach CUPTI calling cuptiFinalize() API
                CUPTI_CALL(cuptiFinalize());
                PTHREAD_CALL(pthread_cond_broadcast(&globalControl.mutexCondition));
            }
            break;
        default:
            break;
        }
    }
}

void *dynamicAttachDetach(void *arg) {
    while (!globalControl.terminateThread) {
        sleep(globalControl.frequency);

        // Check the condition again after sleep
        if (globalControl.terminateThread) {
            break;
        }

        // Turn on/off CUPTI at a regular interval
        if (globalControl.tracingEnabled) {
            printf("\nCUPTI detach starting ...\n");

            // Force flush
            CUPTI_CALL(cuptiActivityFlushAll(1));

            globalControl.detachCupti = 1;

            // Lock and wait for callbackHandler() to perform CUPTI teardown
            PTHREAD_CALL(pthread_mutex_lock(&globalControl.mutexFinalize));
            PTHREAD_CALL(pthread_cond_wait(&globalControl.mutexCondition, &globalControl.mutexFinalize));
            PTHREAD_CALL(pthread_mutex_unlock(&globalControl.mutexFinalize));

            printf("CUPTI detach completed.\n");

            globalControl.detachCupti = 0;
            globalControl.tracingEnabled = 0;
            globalControl.subscriber = 0;
        }
        else {
            printf("\nCUPTI attach starting ...\n");

            CUPTI_CALL(cuptiInitialize());
            globalControl.tracingEnabled = 1;

            printf("CUPTI attach completed.\n");
        }
    }
    return NULL;
}

int STDCALL
InitializeInjection(void) {

    if (globalControl.initialized) {
        return 1;
    }
    // Init globalControl
    globalControlInit();

    //Initialize Mutex
    PTHREAD_CALL(pthread_mutex_init(&globalControl.mutexFinalize, 0));

    registerAtExitHandler();

    // Initialize CUPTI
    CUPTI_CALL(cuptiInitialize());
    globalControl.tracingEnabled = 1;

    // Launch the thread
    PTHREAD_CALL(pthread_create(&globalControl.dynamicThread, NULL, dynamicAttachDetach, NULL));

    globalControl.initialized = 1;

    return 1;
}