import glob import json import os import re import sys import time from datetime import datetime, timezone import requests from github import Auth, Github # Configuration PERMISSIONS_FILE_PATH = ".github/CI_PERMISSIONS.json" def find_workflow_run_url( gh_repo, workflow_id, ref, target_stage, token, dispatch_time, pr_head_sha=None, max_wait=30, ): """ Poll for the workflow run URL after dispatch. Uses the dynamic run-name feature to identify runs: - Fork PRs: display_title = "[stage-name] sha" - Non-fork PRs: display_title = "[stage-name]" Args: gh_repo: PyGithub repository object workflow_id: ID of the workflow that was dispatched ref: Branch/ref the workflow was dispatched on target_stage: The stage name we're looking for token: GitHub API token dispatch_time: Unix timestamp when dispatch was triggered pr_head_sha: PR head SHA (for fork PRs, used to match display_title) max_wait: Maximum seconds to wait for the run to appear Returns: The workflow run URL if found, None otherwise. """ # Build expected display_title pattern based on workflow's run-name # Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork if pr_head_sha: expected_title = f"[{target_stage}] {pr_head_sha}" else: expected_title = f"[{target_stage}]" print(f"Looking for workflow run with display_title: {expected_title}") for attempt in range(max_wait // 5): time.sleep(5) # Get recent workflow_dispatch runs for this workflow runs_url = f"https://api.github.com/repos/{gh_repo.full_name}/actions/workflows/{workflow_id}/runs" runs_resp = requests.get( runs_url, params={"event": "workflow_dispatch", "branch": ref, "per_page": 10}, headers={ "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", }, ) if runs_resp.status_code != 200: print(f"Failed to fetch workflow runs: {runs_resp.status_code}") continue for run in runs_resp.json().get("workflow_runs", []): # Skip runs created before our dispatch (with 10s tolerance) run_created = datetime.fromisoformat( run["created_at"].replace("Z", "+00:00") ).timestamp() if run_created < dispatch_time - 10: continue # Match by display_title (set by workflow's run-name directive) # This is immediately available, unlike job names which require waiting display_title = run.get("display_title", "") if display_title == expected_title: print( f"Found matching workflow run: {run['id']} with title '{display_title}'" ) return run["html_url"] print(f"Could not find workflow run after {max_wait} seconds") return None def get_env_var(name): val = os.getenv(name) if not val: print(f"Error: Environment variable {name} not set.") sys.exit(1) return val def load_permissions(user_login): """ Reads the permissions JSON from the local file system and returns the permissions dict for the specific user. """ try: print(f"Loading permissions from {PERMISSIONS_FILE_PATH}...") if not os.path.exists(PERMISSIONS_FILE_PATH): print(f"Error: Permissions file not found at {PERMISSIONS_FILE_PATH}") return None with open(PERMISSIONS_FILE_PATH, "r") as f: data = json.load(f) user_perms = data.get(user_login) if not user_perms: print(f"User '{user_login}' not found in permissions file.") return None return user_perms except Exception as e: print(f"Failed to load or parse permissions file: {e}") sys.exit(1) def has_sgl_kernel_changes(pr): """ Check if the PR has changes to the sgl-kernel directory. This is used to determine if we need a full workflow rerun (to rebuild the kernel) vs just rerunning failed jobs. """ try: files = pr.get_files() for f in files: if f.filename.startswith("sgl-kernel/"): return True return False except Exception as e: print(f"Warning: Could not check PR files for sgl-kernel changes: {e}") # Default to False to avoid unnecessary full reruns return False def handle_tag_run_ci(gh_repo, pr, comment, user_perms, react_on_success=True): """ Handles the /tag-run-ci-label command. Returns True if action was taken, False otherwise. """ if not user_perms.get("can_tag_run_ci_label", False): print("Permission denied: can_tag_run_ci_label is false.") return False print("Permission granted. Adding 'run-ci' label.") pr.add_to_labels("run-ci") if react_on_success: comment.create_reaction("+1") print("Label added and comment reacted.") else: print("Label added (reaction suppressed).") return True def handle_rerun_failed_ci(gh_repo, pr, comment, user_perms, react_on_success=True): """ Handles the /rerun-failed-ci command. Reruns workflows with 'failure' or 'skipped' conclusions. Returns True if action was taken, False otherwise. """ if not user_perms.get("can_rerun_failed_ci", False): print("Permission denied: can_rerun_failed_ci is false.") return False print("Permission granted. Triggering rerun of failed or skipped workflows.") # Check if PR has sgl-kernel changes - if so, we need full reruns # to ensure sgl-kernel-build-wheels runs and produces fresh artifacts sgl_kernel_changes = has_sgl_kernel_changes(pr) if sgl_kernel_changes: print("PR has sgl-kernel changes - will use full rerun to rebuild kernel") # Get the SHA of the latest commit in the PR head_sha = pr.head.sha print(f"Checking workflows for commit: {head_sha}") # List all workflow runs for this commit runs = gh_repo.get_workflow_runs(head_sha=head_sha) rerun_count = 0 for run in runs: if run.status != "completed": continue if run.conclusion == "failure": print(f"Rerunning failed workflow: {run.name} (ID: {run.id})") try: if sgl_kernel_changes: # Full rerun to ensure sgl-kernel-build-wheels runs # and produces fresh artifacts for dependent jobs run.rerun() else: # Use rerun_failed_jobs for efficiency on failures run.rerun_failed_jobs() rerun_count += 1 except Exception as e: print(f"Failed to rerun workflow {run.id}: {e}") elif run.conclusion == "skipped": print(f"Rerunning skipped workflow: {run.name} (ID: {run.id})") try: # Skipped workflows don't have 'failed jobs', so we use full rerun() run.rerun() rerun_count += 1 except Exception as e: print(f"Failed to rerun workflow {run.id}: {e}") if rerun_count > 0: print(f"Triggered rerun for {rerun_count} workflows.") if react_on_success: comment.create_reaction("+1") return True else: print("No failed or skipped workflows found to rerun.") return False def handle_rerun_stage( gh_repo, pr, comment, user_perms, stage_name, token, react_on_success=True ): """ Handles the /rerun-stage command. Triggers a workflow_dispatch to run only the specified stage, skipping dependencies. Returns True if action was taken, False otherwise. """ if not user_perms.get("can_rerun_stage", False): print("Permission denied: can_rerun_stage is false.") return False if not stage_name: print("Error: No stage name provided") comment.create_reaction("confused") pr.create_issue_comment( f"❌ Please specify a stage name: `/rerun-stage `\n\n" f"Examples: `/rerun-stage unit-test-backend-4-gpu`, `/rerun-stage accuracy-test-1-gpu`" ) return False print(f"Permission granted. Triggering workflow_dispatch for stage '{stage_name}'.") # Valid NVIDIA stage names that support target_stage nvidia_stages = [ "stage-a-test-1", "stage-a-cpu-only", "stage-b-test-small-1-gpu", "stage-b-test-large-1-gpu", "stage-b-test-large-2-gpu", "stage-b-test-4-gpu-b200", "stage-c-test-4-gpu-h100", "stage-c-test-8-gpu-h200", "stage-c-test-8-gpu-h20", "stage-c-test-4-gpu-b200", "stage-c-test-4-gpu-gb200", "stage-c-test-deepep-4-gpu", "stage-c-test-deepep-8-gpu-h200", "multimodal-gen-test-1-gpu", "multimodal-gen-test-2-gpu", ] # Valid AMD stage names that support target_stage amd_stages = [ "sgl-kernel-unit-test-amd", "sgl-kernel-unit-test-2-gpu-amd", "stage-a-test-1-amd", "stage-b-test-small-1-gpu-amd", "stage-b-test-small-1-gpu-amd-nondeterministic", "stage-b-test-small-1-gpu-amd-mi35x", "stage-b-test-large-1-gpu-amd", "stage-b-test-large-2-gpu-amd", "multimodal-gen-test-1-gpu-amd", "multimodal-gen-test-2-gpu-amd", "stage-c-test-large-8-gpu-amd", "stage-c-test-large-8-gpu-amd-mi35x", ] valid_stages = nvidia_stages + amd_stages is_amd_stage = stage_name in amd_stages if stage_name not in valid_stages: comment.create_reaction("confused") pr.create_issue_comment( f"❌ Stage `{stage_name}` doesn't support isolated runs yet.\n\n" f"**NVIDIA stages:**\n" + "\n".join(f"- `{s}`" for s in nvidia_stages) + "\n\n**AMD stages:**\n" + "\n".join(f"- `{s}`" for s in amd_stages) + "\n\nOther stages will be added soon. For now, use `/rerun-failed-ci` for those stages." ) return False try: # Get the appropriate workflow based on stage type workflow_name = "PR Test (AMD)" if is_amd_stage else "PR Test" workflows = gh_repo.get_workflows() target_workflow = None for wf in workflows: if wf.name == workflow_name: target_workflow = wf break if not target_workflow: print(f"Error: {workflow_name} workflow not found") return False # Check if PR is from a fork by comparing repo owners # Handle case where fork repo may have been deleted (pr.head.repo is None) is_fork = ( pr.head.repo is None or pr.head.repo.owner.login != gh_repo.owner.login ) print(f"PR is from fork: {is_fork}") # pr_head_sha is used for fork PRs (passed to workflow and used for URL lookup) pr_head_sha = None if is_fork: # For fork PRs: dispatch on main and pass SHA as input # This is needed because fork branch names don't exist in the main repo ref = "main" pr_head_sha = pr.head.sha print( f"Triggering {workflow_name} workflow on ref: {ref}, PR head SHA: {pr_head_sha}" ) if is_amd_stage: inputs = {"target_stage": stage_name, "pr_head_sha": pr_head_sha} else: inputs = { "version": "release", "target_stage": stage_name, "pr_head_sha": pr_head_sha, } else: # For non-fork PRs: dispatch on the PR branch directly # This allows testing workflow changes before merge ref = pr.head.ref print(f"Triggering {workflow_name} workflow on branch: {ref}") if is_amd_stage: inputs = {"target_stage": stage_name} else: inputs = {"version": "release", "target_stage": stage_name} # Record dispatch time before triggering dispatch_time = time.time() # Use requests directly as PyGithub's create_dispatch only accepts HTTP 204 dispatch_url = f"https://api.github.com/repos/{gh_repo.full_name}/actions/workflows/{target_workflow.id}/dispatches" dispatch_resp = requests.post( dispatch_url, json={"ref": ref, "inputs": inputs}, headers={ "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", }, ) success = dispatch_resp.status_code in (200, 204) if not success: print(f"Dispatch failed: {dispatch_resp.status_code} {dispatch_resp.text}") if success: print(f"Successfully triggered workflow for stage '{stage_name}'") if react_on_success: comment.create_reaction("+1") pr.create_issue_comment( f"✅ Triggered `{stage_name}` to run independently (skipping dependencies)." ) # Poll for the workflow run URL and post follow-up comment run_url = find_workflow_run_url( gh_repo, target_workflow.id, ref, stage_name, token, dispatch_time, pr_head_sha=pr_head_sha, max_wait=30, ) if run_url: pr.create_issue_comment(f"🔗 [View workflow run]({run_url})") else: pr.create_issue_comment( f"⚠️ Could not retrieve workflow run URL. " f"Check the [Actions tab](https://github.com/{gh_repo.full_name}/actions) for progress." ) return True else: print("Failed to trigger workflow_dispatch") return False except Exception as e: print(f"Error triggering workflow_dispatch: {e}") comment.create_reaction("confused") pr.create_issue_comment( f"❌ Failed to trigger workflow: {str(e)}\n\n" f"Please check the logs or contact maintainers." ) return False CUDA_SUITE_TO_RUNNER = { "stage-a-test-1": "1-gpu-runner", "stage-a-cpu-only": "ubuntu-latest", "stage-b-test-small-1-gpu": "1-gpu-5090", "stage-b-test-large-1-gpu": "1-gpu-runner", "stage-b-test-large-2-gpu": "2-gpu-runner", "stage-b-test-4-gpu-b200": "4-gpu-b200", "stage-c-test-4-gpu-h100": "4-gpu-h100", "stage-c-test-8-gpu-h200": "8-gpu-h200", "stage-c-test-8-gpu-h20": "8-gpu-h20", "stage-c-test-4-gpu-b200": "4-gpu-b200", "stage-c-test-deepep-4-gpu": "4-gpu-h100", "stage-c-test-deepep-8-gpu-h200": "8-gpu-h200", } DEEPEP_SUITES = { "stage-c-test-8-gpu-h20", "stage-c-test-deepep-4-gpu", "stage-c-test-deepep-8-gpu-h200", } def resolve_test_file(file_part): """ Resolve a user-provided file path to a path relative to test/. Supports: - Full path: test/registered/core/test_srt_endpoint.py - Relative to test/: registered/core/test_srt_endpoint.py - Bare filename: test_srt_endpoint.py (glob-matched, must be unique) Returns (resolved_path, error_message). On success error_message is None. """ if file_part.startswith("test/"): file_part = file_part[len("test/") :] if "/" not in file_part: matches = glob.glob(f"test/registered/**/{file_part}", recursive=True) if len(matches) == 0: return ( None, f"No test file found matching `{file_part}` under `test/registered/`.", ) if len(matches) > 1: match_list = "\n".join(f"- `{m}`" for m in sorted(matches)) return None, ( f"Ambiguous filename `{file_part}` — matched {len(matches)} files:\n\n" f"{match_list}\n\n" f"Please provide the full path, e.g. `/rerun-ut {matches[0]}`" ) return matches[0][len("test/") :], None full_path = f"test/{file_part}" if not os.path.isfile(full_path): return None, f"File not found: `{full_path}`" return file_part, None def detect_cuda_suite(file_path_from_test): """ Read a test file and extract the suite from register_cuda_ci(suite="..."). Returns (suite_name, runner_label, use_deepep, error_message). """ full_path = f"test/{file_path_from_test}" with open(full_path, "r") as f: content = f.read() match = re.search( r'register_cuda_ci\([^)]*suite\s*=\s*["\']([^"\']+)["\']', content ) if not match: return ( None, None, False, ( f"No `register_cuda_ci()` found in `{full_path}`.\n\n" f"This file may not be a registered CUDA CI test." ), ) suite = match.group(1) runner = CUDA_SUITE_TO_RUNNER.get(suite) if not runner: known = ", ".join(f"`{s}`" for s in sorted(CUDA_SUITE_TO_RUNNER)) return ( suite, None, False, ( f"Unknown CUDA suite `{suite}` in `{full_path}`.\n\n" f"Known suites: {known}" ), ) use_deepep = suite in DEEPEP_SUITES return suite, runner, use_deepep, None def handle_rerun_ut(gh_repo, pr, comment, user_perms, test_spec, token): """ Handles the /rerun-ut :: command. Dispatches a lightweight workflow to run a single test on the correct CUDA runner. """ if not ( user_perms.get("can_rerun_ut", False) or user_perms.get("can_rerun_stage", False) ): print("Permission denied: neither can_rerun_ut nor can_rerun_stage is true.") return False if not test_spec: comment.create_reaction("confused") pr.create_issue_comment( "❌ Please specify a test: `/rerun-ut ::`\n\n" "Examples:\n" "- `/rerun-ut test/registered/core/test_srt_endpoint.py::TestSRTEndpoint.test_simple_decode`\n" "- `/rerun-ut registered/core/test_srt_endpoint.py::TestSRTEndpoint`\n" "- `/rerun-ut test_srt_endpoint.py`" ) return False # Parse spec: split on :: to get file path and optional test selector if "::" in test_spec: file_part, test_selector = test_spec.split("::", 1) else: file_part = test_spec test_selector = None file_part = file_part.strip() if test_selector: test_selector = test_selector.strip() # Resolve file path resolved_path, err = resolve_test_file(file_part) if err: comment.create_reaction("confused") pr.create_issue_comment(f"❌ {err}") return False # Detect suite and runner suite, runner_label, use_deepep, err = detect_cuda_suite(resolved_path) if err: comment.create_reaction("confused") pr.create_issue_comment(f"❌ {err}") return False # Build test_command: file path (+ optional test selector as unittest arg) test_command = resolved_path if test_selector: test_command = f"{resolved_path} {test_selector}" print( f"Resolved: file={resolved_path}, selector={test_selector}, " f"suite={suite}, runner={runner_label}, deepep={use_deepep}, command='{test_command}'" ) try: workflow_name = "Rerun UT" workflows = gh_repo.get_workflows() target_workflow = None for wf in workflows: if wf.name == workflow_name: target_workflow = wf break if not target_workflow: print(f"Error: {workflow_name} workflow not found") return False is_fork = ( pr.head.repo is None or pr.head.repo.owner.login != gh_repo.owner.login ) print(f"PR is from fork: {is_fork}") pr_head_sha = None if is_fork: ref = "main" pr_head_sha = pr.head.sha inputs = { "test_command": test_command, "runner_label": runner_label, "pr_head_sha": pr_head_sha, "use_deepep": str(use_deepep).lower(), } else: ref = pr.head.ref inputs = { "test_command": test_command, "runner_label": runner_label, "use_deepep": str(use_deepep).lower(), } dispatch_time = time.time() dispatch_url = f"https://api.github.com/repos/{gh_repo.full_name}/actions/workflows/{target_workflow.id}/dispatches" dispatch_resp = requests.post( dispatch_url, json={"ref": ref, "inputs": inputs}, headers={ "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", }, ) success = dispatch_resp.status_code in (200, 204) if not success: print(f"Dispatch failed: {dispatch_resp.status_code} {dispatch_resp.text}") if success: print(f"Successfully triggered rerun-ut: {test_command}") comment.create_reaction("+1") pr.create_issue_comment( f"✅ Triggered `/rerun-ut` on `{runner_label}` runner:\n" f"```\ncd test/ && python3 {test_command}\n```" ) run_url = find_workflow_run_url( gh_repo, target_workflow.id, ref, "rerun-ut", token, dispatch_time, pr_head_sha=pr_head_sha, max_wait=30, ) if run_url: pr.create_issue_comment(f"🔗 [View workflow run]({run_url})") else: pr.create_issue_comment( f"⚠️ Could not retrieve workflow run URL. " f"Check the [Actions tab](https://github.com/{gh_repo.full_name}/actions) for progress." ) return True else: print("Failed to trigger workflow_dispatch") return False except Exception as e: print(f"Error triggering rerun-ut: {e}") comment.create_reaction("confused") pr.create_issue_comment( f"❌ Failed to trigger rerun-ut: {str(e)}\n\n" f"Please check the logs or contact maintainers." ) return False def main(): # 1. Load Environment Variables token = get_env_var("GITHUB_TOKEN") repo_name = get_env_var("REPO_FULL_NAME") pr_number = int(get_env_var("PR_NUMBER")) comment_id = int(get_env_var("COMMENT_ID")) comment_body = get_env_var("COMMENT_BODY").strip() user_login = get_env_var("USER_LOGIN") # 2. Load Permissions (local file check first to avoid unnecessary API calls) user_perms = load_permissions(user_login) # 3. Initialize GitHub API with Auth auth = Auth.Token(token) g = Github(auth=auth) repo = g.get_repo(repo_name) pr = repo.get_pull(pr_number) comment = repo.get_issue(pr_number).get_comment(comment_id) # PR authors can always rerun failed CI and rerun individual UTs on their own PRs, # even if they are not listed in CI_PERMISSIONS.json. # Note: /tag-run-ci-label and /rerun-stage still require CI_PERMISSIONS.json. if pr.user.login == user_login: if user_perms is None: print( f"User {user_login} is the PR author (not in CI_PERMISSIONS.json). " "Granting CI rerun permissions." ) user_perms = {} else: print( f"User {user_login} is the PR author and has existing CI permissions." ) user_perms["can_rerun_failed_ci"] = True user_perms["can_rerun_ut"] = True if not user_perms: print(f"User {user_login} does not have any configured permissions. Exiting.") return # 4. Parse Command and Execute first_line = comment_body.split("\n")[0].strip() if first_line.startswith("/tag-run-ci-label"): handle_tag_run_ci(repo, pr, comment, user_perms) elif first_line.startswith("/rerun-failed-ci"): handle_rerun_failed_ci(repo, pr, comment, user_perms) elif first_line.startswith("/tag-and-rerun-ci"): # Perform both actions, but suppress individual reactions print("Processing combined command: /tag-and-rerun-ci") tagged = handle_tag_run_ci( repo, pr, comment, user_perms, react_on_success=False ) # Wait for the label to propagate before triggering rerun if tagged: print("Waiting 5 seconds for label to propagate...") time.sleep(5) rerun = handle_rerun_failed_ci( repo, pr, comment, user_perms, react_on_success=False ) # If at least one action was successful, add the reaction here if tagged or rerun: comment.create_reaction("+1") print("Combined command processed successfully; reaction added.") else: print("Combined command finished, but no actions were taken.") elif first_line.startswith("/rerun-stage"): # Extract stage name from command parts = first_line.split(maxsplit=1) stage_name = parts[1].strip() if len(parts) > 1 else None handle_rerun_stage(repo, pr, comment, user_perms, stage_name, token) elif first_line.startswith("/rerun-ut"): parts = first_line.split(maxsplit=1) test_spec = parts[1].strip() if len(parts) > 1 else None handle_rerun_ut(repo, pr, comment, user_perms, test_spec, token) else: print(f"Unknown or ignored command: {first_line}") if __name__ == "__main__": main()