File size: 3,283 Bytes
5482b12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | import os
import json
import argparse
from openai import OpenAI
from pydantic import BaseModel, Field
# Module-level OpenAI client used by generate_synthetic_data for every request.
# Ensure you have OPENAI_API_KEY set in your environment
# NOTE(review): this client is constructed at import time, i.e. before the
# OPENAI_API_KEY check in the `__main__` block runs. If OpenAI() itself raises
# when the key is missing, that friendlier check is never reached -- confirm.
client = OpenAI()
# System message sent with each generation request; assembled line by line so
# every instruction sits on its own source line.
SYSTEM_PROMPT = "\n".join(
    (
        "You are an expert software engineer and data generator.",
        "Your task is to generate synthetic Pull Request scenarios containing subtle bugs, security flaws, or architectural issues.",
        "Follow the exact JSON schema provided.",
    )
)
# Structured-output schema for one generated scenario. It is passed as
# `response_format` to the parse call in generate_synthetic_data, and the
# parsed instance is then split into a user message and an assistant message.
# (Documented with `#` comments rather than a docstring; presumably a class
# docstring would be copied into the generated schema -- verify before adding.)
#
# NOTE(review): the allowed values for `action_type`, `severity` and
# `category` are only stated in the Field descriptions; the plain `str`
# annotations do not restrict them. Consider `typing.Literal[...]` if the
# dataset must never contain other values -- confirm the API accepts it.
class GeneratedScenario(BaseModel):
    # --- Fields interpolated into the user message ---
    pr_title: str
    pr_description: str
    diff: str
    # --- Fields serialized as the assistant's JSON action object ---
    action_type: str = Field(description="Must be 'flag_issue'")
    body: str = Field(description="Detailed explanation of the issue")
    filename: str
    line_number: int
    severity: str = Field(description="One of: low, medium, high, critical")
    category: str = Field(description="One of: bug, security, architecture, performance, style")
def generate_synthetic_data(count: int, output_file: str) -> None:
    """Generate synthetic code-review scenarios and append them to a JSONL file.

    For each of ``count`` iterations, one scenario is requested from the
    model, converted into a chat-style record (system / user / assistant
    messages) and appended to ``output_file`` as a single JSON line.

    Each record is written and flushed as soon as it is produced, so an
    interrupt or crash part-way through a run keeps the scenarios that were
    already generated. The output file is opened before the first request,
    so an unwritable path fails before any API call is made.

    Args:
        count: Number of scenarios to request. Zero or a negative value
            makes no requests; the output file is still opened (and created
            if missing).
        output_file: Path of the JSONL file to append to.

    Raises:
        OSError: If ``output_file`` cannot be opened or written.

    Failures of an individual generation (API errors, or a response with no
    parsed payload) are reported on stdout and skipped; they do not abort
    the run.
    """
    print(f"Generating {count} synthetic scenarios...")
    written = 0

    with open(output_file, "a", encoding="utf-8") as f:
        for i in range(count):
            print(f"Generating scenario {i+1}/{count}...")
            try:
                # NOTE(review): the prompt is identical on every iteration, so
                # uniqueness across scenarios depends entirely on the model.
                response = client.beta.chat.completions.parse(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": "Generate a unique, realistic Python code review scenario. Include a unified diff."},
                    ],
                    response_format=GeneratedScenario,
                )
                message = response.choices[0].message
                scenario = message.parsed
                if scenario is None:
                    # No structured payload (e.g. the model refused). Report
                    # it explicitly instead of failing later with an opaque
                    # AttributeError on None.
                    reason = getattr(message, "refusal", None) or "no parsed content returned"
                    print(f"Failed on generation {i+1}: {reason}")
                    continue

                # Convert to CodeLens instruction format
                user_msg = (
                    f"PR Title: {scenario.pr_title}\n"
                    f"PR Description: {scenario.pr_description}\n"
                    f"Code diff:\n```\n{scenario.diff}\n```\n"
                    "Output a single JSON action object."
                )
                assistant_msg = json.dumps(
                    {
                        "action_type": scenario.action_type,
                        "body": scenario.body,
                        "filename": scenario.filename,
                        "line_number": scenario.line_number,
                        "severity": scenario.severity,
                        "category": scenario.category,
                    },
                    indent=2,
                )
                record = {
                    "messages": [
                        {"role": "system", "content": "You are an expert code reviewer..."},
                        {"role": "user", "content": user_msg},
                        {"role": "assistant", "content": assistant_msg},
                    ]
                }
            except Exception as e:
                # Best-effort loop: one failed request must not discard the
                # rest. The index is 1-based to match the progress line above.
                print(f"Failed on generation {i+1}: {e}")
                continue

            # Written outside the try block so file-system errors propagate
            # instead of being reported as a generation failure.
            f.write(json.dumps(record) + "\n")
            f.flush()
            written += 1

    print(f"✅ Generated {written} scenarios and appended to {output_file}")
# Command-line entry point: parse --count / --output, verify that an API key
# is present in the environment, then run the generator.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--count", type=int, default=10, help="Number of scenarios to generate")
    parser.add_argument("--output", type=str, default="dataset.jsonl", help="Output file")
    args = parser.parse_args()

    if not os.getenv("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY environment variable not set.")
        # Raise SystemExit directly: the bare `exit()` helper is injected by
        # the `site` module for interactive use and is not guaranteed to
        # exist (e.g. under `python -S` or in frozen builds).
        raise SystemExit(1)

    generate_synthetic_data(args.count, args.output)
|