| import os |
| import json |
| import argparse |
| from codelens_env.scenarios import ALL_SCENARIOS |
|
|
# System-role prompt shared by every exported example (main() places it in the
# "system" message). It lists the action types the reviewer model may emit and
# the fields required for "flag_issue" and for "approve"/"request_changes",
# and instructs the model to answer with a bare JSON object.
# NOTE: this text becomes training data verbatim; edit it deliberately.
SYSTEM_PROMPT = """You are an expert code reviewer specializing in bugs, security vulnerabilities, and architectural issues.

You will be given a code diff (PR) to review. Your job is to identify issues and output a single JSON action.

Available action types:
- "flag_issue": Flag a specific issue in the code
- "approve": Approve the PR (no issues found / all issues flagged)
- "request_changes": Request changes (issues found that must be fixed)
- "ask_question": Ask a clarifying question
- "comment": Leave a general comment

For "flag_issue", you MUST provide:
- action_type: "flag_issue"
- body: description of the issue (be specific, mention the root cause)
- filename: the file containing the issue
- line_number: approximate line number
- severity: one of "low", "medium", "high", "critical"
- category: one of "bug", "security", "architecture", "performance", "style", "design"

For "approve" or "request_changes", you MUST provide:
- action_type: "approve" or "request_changes"
- body: your overall assessment
- verdict: "LGTM" (for approve) or "REQUEST_CHANGES" (for request_changes)

IMPORTANT: Output ONLY a valid JSON object — no markdown, no explanation.
"""
|
|
def generate_diff(files_changed):
    """Render the changed files as one unified-diff-style string.

    Args:
        files_changed: Iterable of objects exposing ``filename`` and ``patch``
            attributes.

    Returns:
        For each file, a ``--- a/<filename>`` / ``+++ b/<filename>`` header
        pair followed by that file's patch text. The per-file sections are
        joined with a single newline; an empty input yields ``""``.
    """
    return "\n".join(
        f"--- a/{changed.filename}\n+++ b/{changed.filename}\n{changed.patch}"
        for changed in files_changed
    )
|
|
def build_user_message(scenario):
    """Compose the user-turn prompt for a single scenario.

    Args:
        scenario: Object exposing ``files_changed``, ``pr_title``,
            ``pr_description`` and ``task_id`` (whose ``.value`` names the
            task).

    Returns:
        A multi-line string with the PR title and description, the task name,
        a review-focus hint, the fenced code diff and a closing instruction.
        The step counter ("1/10") and noise budget (5) are fixed literals.
    """
    rendered_diff = generate_diff(scenario.files_changed)

    # Per-task guidance; tasks not listed here get a generic hint below.
    focus_by_task = {
        "bug_detection": "Focus on: off-by-one errors, None dereferences, type mismatches, mutable defaults, race conditions, exception handling.",
        "security_audit": "Focus on: SQL injection, XSS, hardcoded secrets, JWT issues, insecure deserialization, CORS, timing attacks, path traversal.",
        "architectural_review": "Focus on: SRP violations, direct DB access from wrong layers, N+1 queries, missing retry/circuit-breaker, god objects, blocking I/O.",
    }

    prompt_lines = [
        f"PR Title: {scenario.pr_title}",
        f"PR Description: {scenario.pr_description}",
        f"Task: {scenario.task_id.value} (step 1/10)",
        "Noise budget remaining: 5 (false positives consume this)",
        f"Review focus: {focus_by_task.get(scenario.task_id.value, 'General code review')}",
        "",
        "Code diff:",
        "```",
        rendered_diff,
        "```",
        "",
        "Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes.",
    ]
    return "\n".join(prompt_lines)
|
|
def build_assistant_message(scenario):
    """Produce the target assistant reply for a scenario as a JSON string.

    Args:
        scenario: Object exposing ``ground_truth_issues``, a sequence of
            issues with ``description``, ``filename``, ``line_number``,
            ``severity`` and ``category`` (the last two carry ``.value``).

    Returns:
        A 2-space-indented JSON document: an "approve" action with verdict
        "LGTM" when there are no ground-truth issues, otherwise a
        "flag_issue" action built from the first issue only.
    """
    known_issues = scenario.ground_truth_issues

    if known_issues:
        # Only the first ground-truth issue is turned into the training target.
        first = known_issues[0]
        action = {
            "action_type": "flag_issue",
            "body": first.description,
            "filename": first.filename,
            "line_number": first.line_number,
            "severity": first.severity.value,
            "category": first.category.value,
        }
    else:
        action = {
            "action_type": "approve",
            "body": "Code looks good, no issues found.",
            "verdict": "LGTM",
        }

    return json.dumps(action, indent=2)
|
|
def main():
    """Export every scenario in ``ALL_SCENARIOS`` as a chat-format JSONL file.

    Reads the destination from ``--output`` (default ``dataset.jsonl``),
    builds one ``{"messages": [...]}`` record per scenario with system, user
    and assistant turns, writes one JSON object per line, and prints a
    summary to stdout.
    """
    cli = argparse.ArgumentParser(description="Export CodeLens scenarios to a JSONL dataset for fine-tuning")
    cli.add_argument("--output", default="dataset.jsonl", help="Output file path")
    options = cli.parse_args()

    # Build every record before touching the output file, so a failure while
    # rendering a scenario does not leave a truncated dataset behind.
    dataset = [
        {
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": build_user_message(scenario)},
                {"role": "assistant", "content": build_assistant_message(scenario)},
            ]
        }
        for scenario in ALL_SCENARIOS
    ]

    with open(options.output, "w") as out_file:
        out_file.writelines(json.dumps(record) + "\n" for record in dataset)

    print(f"✅ Successfully exported {len(dataset)} examples to {options.output}")
    print("This dataset is ready to be uploaded to Kaggle for Supervised Fine-Tuning!")
|
|
# Run the export only when this file is executed as a script, so importing
# the module does not parse CLI arguments or write any file.
if __name__ == "__main__":
    main()
|
|