File size: 18,449 Bytes
61ba51e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
"""
Publish performance traces to GitHub repository
"""

import argparse
import base64
import json
import os
import sys
import time
import warnings
from urllib.error import HTTPError
from urllib.request import Request, urlopen


def is_rate_limit_error(e):
    """Tell whether an exception represents a GitHub rate-limit response.

    Args:
        e: Any exception object.

    Returns:
        True for an ``HTTPError`` with status 429, or with status 403 whose
        attached ``error_body`` string mentions a rate-limit phrase
        (case-insensitive). False for everything else, including a 403 whose
        ``error_body`` is missing or is not a string.
    """
    if not isinstance(e, HTTPError):
        return False

    status = e.code
    if status == 429:
        return True
    if status != 403:
        return False

    # A 403 is ambiguous (rate limit vs. permission problem); only the
    # response text attached to the exception can tell them apart.
    body = getattr(e, "error_body", "")
    if not isinstance(body, str):
        return False

    lowered = body.lower()
    markers = ("rate limit", "abuse detection", "secondary rate limit")
    return any(marker in lowered for marker in markers)


def is_permission_error(e):
    """Tell whether an exception represents a GitHub permission failure.

    Args:
        e: Any exception object.

    Returns:
        True only for an ``HTTPError`` with status 403 whose attached
        ``error_body`` string contains one of the permission phrases
        (case-insensitive); False otherwise.
    """
    if isinstance(e, HTTPError) and e.code == 403:
        body = getattr(e, "error_body", "")
        if isinstance(body, str):
            lowered = body.lower()
            hints = (
                "resource not accessible",
                "must have push access",
                "permission",
                "denied",
            )
            return any(hint in lowered for hint in hints)
    return False


def make_github_request(url, token, method="GET", data=None, timeout=120):
    """Make an authenticated request to the GitHub API.

    Args:
        url: Full URL to request.
        token: Token sent in the ``Authorization: Bearer`` header.
        method: HTTP method name.
        data: Optional JSON-serialisable payload. Anything other than ``None``
            is encoded as a UTF-8 JSON body (including empty containers).
        timeout: Socket timeout in seconds forwarded to ``urlopen`` so that a
            stalled connection fails instead of blocking the job forever.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        HTTPError: On an HTTP error status. The exception is re-raised with an
            extra ``error_body`` attribute holding the decoded response body,
            or ``""`` when the body could not be read.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        # "User-Agent": "sglang-ci",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    # Test against None (not truthiness) so an intentionally empty payload
    # such as {} is still sent as a JSON body.
    if data is not None:
        headers["Content-Type"] = "application/json"
        data = json.dumps(data).encode("utf-8")

    req = Request(url, data=data, headers=headers, method=method)

    try:
        with urlopen(req, timeout=timeout) as response:
            return response.read().decode("utf-8")
    except HTTPError as e:
        print(f"GitHub API request failed: {e}")
        try:
            error_body = e.read().decode("utf-8")
            print(f"Error response body: {error_body}")
            e.error_body = error_body  # Attach for later inspection
        except Exception:
            # Callers inspect error_body, so always leave a string behind.
            e.error_body = ""
        raise
    except Exception as e:
        print(f"GitHub API request failed with a non-HTTP error: {e}")
        raise


def verify_token_permissions(repo_owner, repo_name, token):
    """Probe the repository endpoints to confirm the token can reach them.

    Two GET requests are made in order: the repository itself, then its
    ``/contents`` listing.

    Args:
        repo_owner: Owner of the target repository.
        repo_name: Name of the target repository.
        token: GitHub token to verify.

    Returns:
        ``True`` when both probes succeed, the string ``"rate_limited"`` when
        a probe fails with a rate-limit error, and ``False`` for any other
        failure (including an unparsable repository response).
    """
    print("Verifying token permissions...")

    # Each probe: (endpoint, success message, whether to echo the repo name).
    probes = (
        (
            f"https://api.github.com/repos/{repo_owner}/{repo_name}",
            "Repository access verified",
            True,
        ),
        (
            f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents",
            "Repository contents access verified",
            False,
        ),
    )

    for endpoint, ok_message, echo_full_name in probes:
        try:
            body = make_github_request(endpoint, token)
            if echo_full_name:
                full_name = json.loads(body)["full_name"]
                print(f"{ok_message}: {full_name}")
            else:
                print(ok_message)
        except Exception as e:
            if not is_rate_limit_error(e):
                print(f"Failed to verify permissions for {endpoint}: {e}")
                return False
            warnings.warn("GitHub API rate limit exceeded during token verification.")
            return "rate_limited"

    return True


def get_branch_sha(repo_owner, repo_name, branch, token):
    """Return the commit SHA that the given branch ref currently points to.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        branch: Branch name looked up under ``refs/heads``.
        token: GitHub token used for the request.

    Returns:
        The ``object.sha`` field of the ref returned by the API.
    """
    repo_api = f"https://api.github.com/repos/{repo_owner}/{repo_name}"
    ref = json.loads(make_github_request(f"{repo_api}/git/refs/heads/{branch}", token))
    return ref["object"]["sha"]


def get_tree_sha(repo_owner, repo_name, commit_sha, token):
    """Return the SHA of the tree referenced by a commit.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        commit_sha: SHA of the commit to inspect.
        token: GitHub token used for the request.

    Returns:
        The ``tree.sha`` field of the commit returned by the API.
    """
    repo_api = f"https://api.github.com/repos/{repo_owner}/{repo_name}"
    commit = json.loads(make_github_request(f"{repo_api}/git/commits/{commit_sha}", token))
    return commit["tree"]["sha"]


def create_blob(repo_owner, repo_name, content, token, max_retries=3):
    """Upload one file's bytes as a Git blob and return the blob SHA.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        content: Raw bytes of the file; sent base64-encoded.
        token: GitHub token used for the request.
        max_retries: Total number of attempts before the last error is
            re-raised.

    Returns:
        The ``sha`` field of the created blob.

    Raises:
        Exception: Rate-limit errors are re-raised immediately; any other
            error is re-raised once the final attempt has failed.
    """
    blobs_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/blobs"
    payload = {
        "content": base64.b64encode(content).decode("utf-8"),
        "encoding": "base64",
    }
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            reply = make_github_request(blobs_url, token, method="POST", data=payload)
            return json.loads(reply)["sha"]
        except Exception as e:
            # Rate limits are not retried, and the last attempt has nothing
            # left to wait for.
            if is_rate_limit_error(e) or attempt >= final_attempt:
                raise

            # Exponential backoff: 1s, 2s, ... doubling per failed attempt.
            delay = 2**attempt
            print(
                f"Blob creation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s..."
            )
            time.sleep(delay)


def create_blobs(repo_owner, repo_name, files, token):
    """Upload every file as a blob and describe each one as a tree entry.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        files: Sequence of ``(path, content)`` pairs.
        token: GitHub token used for the requests.

    Returns:
        A list of dicts with ``path``, ``mode`` (``"100644"``), ``type``
        (``"blob"``) and ``sha`` keys, in the same order as ``files``.
    """
    entries = []
    for done, (path, payload) in enumerate(files, start=1):
        # The blob must exist first; the tree entry refers to it by SHA.
        sha = create_blob(repo_owner, repo_name, payload, token)
        entries.append({"path": path, "mode": "100644", "type": "blob", "sha": sha})

        # Report progress every ten blobs and once more at the very end.
        if done % 10 == 0 or done == len(files):
            print(f"Created {done}/{len(files)} blobs...")
    return entries


def create_tree(repo_owner, repo_name, base_tree_sha, tree_items, token, max_retries=3):
    """Create a Git tree layered on a base tree and return its SHA.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        base_tree_sha: SHA of the tree sent as ``base_tree``.
        tree_items: Tree entries that reference already-created blob SHAs.
        token: GitHub token used for the request.
        max_retries: Total number of attempts before the last error is
            re-raised.

    Returns:
        The ``sha`` field of the created tree.

    Raises:
        Exception: Rate-limit errors are re-raised immediately; any other
            error is re-raised once the final attempt has failed.
    """
    trees_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees"
    payload = {"base_tree": base_tree_sha, "tree": tree_items}
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            reply = make_github_request(trees_url, token, method="POST", data=payload)
            return json.loads(reply)["sha"]
        except Exception as e:
            # Rate limits are not retried, and the last attempt has nothing
            # left to wait for.
            if is_rate_limit_error(e) or attempt >= final_attempt:
                raise

            delay = 2**attempt  # doubles after each failed attempt
            print(
                f"Tree creation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s..."
            )
            time.sleep(delay)


def create_commit(
    repo_owner, repo_name, tree_sha, parent_sha, message, token, max_retries=3
):
    """Create a commit for a tree and confirm it can be read back.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        tree_sha: SHA of the tree the commit records.
        parent_sha: SHA of the single parent commit.
        message: Commit message.
        token: GitHub token used for the requests.
        max_retries: Total number of attempts (create plus read-back) before
            the last error is re-raised.

    Returns:
        The SHA of the created commit.

    Raises:
        Exception: Rate-limit errors are re-raised immediately; any other
            error, including a read-back SHA mismatch, is re-raised once the
            final attempt has failed.
    """
    commits_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/commits"
    payload = {"tree": tree_sha, "parents": [parent_sha], "message": message}
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            created = json.loads(
                make_github_request(commits_url, token, method="POST", data=payload)
            )
            commit_sha = created["sha"]

            # Fetch the commit again and make sure the API reports the same SHA.
            fetched = json.loads(
                make_github_request(f"{commits_url}/{commit_sha}", token)
            )
            if fetched["sha"] == commit_sha:
                return commit_sha
            raise Exception(
                f"Commit verification failed: expected {commit_sha}, got {fetched['sha']}"
            )
        except Exception as e:
            # Rate limits are not retried, and the last attempt has nothing
            # left to wait for.
            if is_rate_limit_error(e) or attempt >= final_attempt:
                raise

            delay = 2**attempt  # doubles after each failed attempt
            print(
                f"Commit creation failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s..."
            )
            time.sleep(delay)


def update_branch_ref(repo_owner, repo_name, branch, commit_sha, token, max_retries=3):
    """Point a branch ref at a new commit via a PATCH request.

    HTTP errors are retried only when the response message contains
    "Object does not exist"; every other HTTP error is re-raised at once.
    Non-HTTP errors are retried with the same backoff.

    Args:
        repo_owner: Owner of the repository.
        repo_name: Name of the repository.
        branch: Branch name under ``refs/heads`` to update.
        commit_sha: SHA the branch should point to.
        token: GitHub token used for the request.
        max_retries: Total number of attempts before the last error is
            re-raised.

    Raises:
        HTTPError: On rate limits, on non-retryable HTTP errors, or when the
            final attempt fails.
        Exception: A non-HTTP error from the final attempt.
    """
    ref_url = (
        f"https://api.github.com/repos/{repo_owner}/{repo_name}/git/refs/heads/{branch}"
    )
    payload = {"sha": commit_sha}

    for attempt in range(max_retries):
        attempts_remain = attempt < max_retries - 1
        try:
            make_github_request(ref_url, token, method="PATCH", data=payload)
            return
        except HTTPError as e:
            # Rate limits fail fast.
            if is_rate_limit_error(e):
                raise

            # Look for the "Object does not exist" message in the JSON body;
            # a body that cannot be parsed simply counts as "not that error".
            missing_object = False
            if hasattr(e, "error_body"):
                try:
                    parsed = json.loads(e.error_body)
                    if "Object does not exist" in parsed.get("message", ""):
                        missing_object = True
                except Exception:
                    pass

            if not (missing_object and attempts_remain):
                raise

            # Possibly a transient consistency issue: wait, then try again.
            delay = 2**attempt
            print(
                f"Branch update failed with 'Object does not exist' (attempt {attempt + 1}/{max_retries}), waiting {delay}s for consistency..."
            )
            time.sleep(delay)
        except Exception as e:
            # Rate limits fail fast; the last attempt has nothing to wait for.
            if is_rate_limit_error(e) or not attempts_remain:
                raise

            delay = 2**attempt
            print(
                f"Branch update failed (attempt {attempt + 1}/{max_retries}), retrying in {delay}s..."
            )
            time.sleep(delay)


def copy_trace_files(source_dir, target_base_path):
    """Collect trace files under a directory and return them ready for upload.

    Only ``.json.gz`` files are collected. Files whose name contains ``TP-``
    are kept only when the name also contains ``TP-0``, so that traces are not
    duplicated across tensor parallel ranks.

    Args:
        source_dir: Directory that is walked recursively.
        target_base_path: Prefix joined with ``/`` in front of each file's
            path relative to ``source_dir``.

    Returns:
        A list of ``(target_path, content)`` tuples, where ``content`` is the
        file's raw bytes. The list is empty when ``source_dir`` does not
        exist. Entries are ordered by a sorted directory walk, so the result
        is the same on every run.
    """
    files_to_upload = []

    if not os.path.exists(source_dir):
        print(f"Warning: Traces directory {source_dir} does not exist")
        return files_to_upload

    for root, dirs, files in os.walk(source_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(".json.gz"):
                continue

            # NOTE: this is a substring test, so a zero-padded rank such as
            # "TP-01" would also be treated as rank 0.
            if "TP-" in file_name and "TP-0" not in file_name:
                continue

            source_file = os.path.join(root, file_name)
            # Target paths are always "/"-separated, so convert the platform
            # separator that relpath produces (a no-op on POSIX).
            rel_path = os.path.relpath(source_file, source_dir).replace(os.sep, "/")
            target_path = f"{target_base_path}/{rel_path}"

            with open(source_file, "rb") as f:
                content = f.read()

            files_to_upload.append((target_path, content))

    return files_to_upload


def publish_traces(traces_dir, run_id, run_number):
    """Collect traces from one directory and publish them in a single commit.

    Args:
        traces_dir: Directory scanned for trace files.
        run_id: Run identifier; files are placed under ``traces/<run_id>``.
        run_number: Run number passed through to the publishing step.
    """
    collected = copy_trace_files(traces_dir, f"traces/{run_id}")

    if collected:
        print(f"Found {len(collected)} files to upload")
        publish_traces_from_files(collected, run_id, run_number)
    else:
        print("No trace files found to upload")


def publish_traces_from_files(files_to_upload, run_id, run_number):
    """Publish already-collected trace files as one commit on the data repo.

    Reads the token from the ``GITHUB_TOKEN`` environment variable, verifies
    it, uploads every file as a blob once, then repeats the
    tree -> commit -> branch-update sequence up to five times when the
    failure looks transient.

    Args:
        files_to_upload: Sequence of ``(target_path, content)`` pairs.
        run_id: Run identifier used in the commit message.
        run_number: Run number used in the commit message.

    Returns:
        ``None``. Rate-limit failures only emit a warning and return early.

    Raises:
        SystemExit: With code 1 when the token is missing, fails verification,
            or lacks write permission.
        Exception: The underlying error when blob creation fails or when
            publishing fails on a non-retryable error or the final attempt.
    """
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        print("Error: GITHUB_TOKEN environment variable not set")
        sys.exit(1)

    # Destination repository and branch.
    repo_owner, repo_name, branch = "sglang-bot", "sglang-ci-data", "main"

    # Printed from both places where a write-permission failure can surface.
    write_permission_help = (
        f"ERROR: Token does not have write permission to {repo_owner}/{repo_name}. "
        "Please update the GH_PAT_FOR_NIGHTLY_CI_DATA secret with a token that has "
        "'contents: write' permission for the repository."
    )

    # Check the token before doing any upload work.
    access = verify_token_permissions(repo_owner, repo_name, token)
    if access == "rate_limited":
        warnings.warn(
            "Skipping trace upload due to GitHub API rate limit. "
            "This is expected during high CI activity and does not indicate a test failure."
        )
        return
    if not access:
        print(
            "Token permission verification failed. Please check the token permissions."
        )
        sys.exit(1)

    max_retries = 5
    retry_delay = 5  # seconds
    transient_status_codes = (422, 500, 502, 503, 504)

    # Blobs are uploaded once, outside the retry loop, so a later conflict
    # does not force every file to be sent again.
    try:
        tree_items = create_blobs(repo_owner, repo_name, files_to_upload, token)
    except Exception as e:
        if is_rate_limit_error(e):
            warnings.warn(
                "GitHub API rate limit exceeded during blob creation. Skipping trace upload."
            )
            return
        if is_permission_error(e):
            # Permission problems must fail loudly.
            print(write_permission_help)
            sys.exit(1)
        print(f"Failed to create blobs: {e}")
        raise

    for attempt in range(max_retries):
        try:
            # Re-read the branch head on every attempt so the commit is based
            # on the branch's current state.
            branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
            print(f"Current branch head: {branch_sha}")

            tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token)
            print(f"Current tree SHA: {tree_sha}")

            new_tree_sha = create_tree(
                repo_owner, repo_name, tree_sha, tree_items, token
            )
            print(f"Created new tree: {new_tree_sha}")

            commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
            commit_sha = create_commit(
                repo_owner,
                repo_name,
                new_tree_sha,
                branch_sha,
                commit_message,
                token,
            )
            print(f"Created commit: {commit_sha}")

            update_branch_ref(repo_owner, repo_name, branch, commit_sha, token)
            print("Updated branch reference")

            print("Successfully published all traces in a single commit")
            return

        except Exception as e:
            # Classify the failure: known conflict messages first, then
            # status codes that may be transient.
            error_type = "unknown"
            is_retryable = False

            if hasattr(e, "error_body"):
                if "Update is not a fast forward" in e.error_body:
                    is_retryable, error_type = True, "fast-forward conflict"
                elif "Object does not exist" in e.error_body:
                    is_retryable, error_type = True, "object consistency"

            if isinstance(e, HTTPError) and e.code in transient_status_codes:
                is_retryable, error_type = True, f"HTTP {e.code}"

            # Rate limits are non-fatal: warn and give up on the upload.
            if is_rate_limit_error(e):
                warnings.warn("GitHub API rate limit exceeded. Skipping trace upload.")
                return

            # Permission problems must fail loudly.
            if is_permission_error(e):
                print(write_permission_help)
                sys.exit(1)

            if not is_retryable or attempt >= max_retries - 1:
                print(f"Failed to publish traces after {attempt + 1} attempts: {e}")
                raise

            print(
                f"Attempt {attempt + 1}/{max_retries} failed ({error_type}). Retrying in {retry_delay} seconds..."
            )
            time.sleep(retry_delay)


def main():
    """Command-line entry point: gather traces from every directory and publish.

    Requires one or more ``--traces-dir`` options. The run id and run number
    come from ``GITHUB_RUN_ID`` and ``GITHUB_RUN_NUMBER``, falling back to
    ``"test"`` and ``"12345"`` when the variables are not set. Exits with
    code 1 when either value ends up empty.
    """
    parser = argparse.ArgumentParser(
        description="Publish performance traces to GitHub repository"
    )
    parser.add_argument(
        "--traces-dir",
        type=str,
        action="append",
        dest="traces_dirs",
        required=True,
        help="Traces directory to publish (can be specified multiple times)",
    )
    options = parser.parse_args()

    run_id = os.getenv("GITHUB_RUN_ID", "test")
    run_number = os.getenv("GITHUB_RUN_NUMBER", "12345")

    # Because of the defaults above, this only triggers when a variable is
    # set to an empty string.
    if not (run_id and run_number):
        print(
            "Error: GITHUB_RUN_ID and GITHUB_RUN_NUMBER environment variables must be set"
        )
        sys.exit(1)

    # Every directory is collected under the same traces/<run_id> prefix.
    target_base_path = f"traces/{run_id}"
    all_files = []
    for traces_dir in options.traces_dirs:
        print(f"Processing traces from directory: {traces_dir}")
        all_files += copy_trace_files(traces_dir, target_base_path)

    if not all_files:
        print("No trace files found to upload across all directories")
        return

    print(f"Found {len(all_files)} total files to upload")

    # One commit for everything that was collected.
    publish_traces_from_files(all_files, run_id, run_number)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()