# Source: Hugging Face Space ArshVerma/CodeLens (Space status: Sleeping)
# File size: 3,283 bytes
# Commit: 5482b12

import os
import json
import argparse
from openai import OpenAI
from pydantic import BaseModel, Field

# Ensure you have OPENAI_API_KEY set in your environment
# NOTE(review): this client is constructed at import time, i.e. before the
# OPENAI_API_KEY check inside the `__main__` guard runs. If the constructor
# raises on a missing key, that friendlier check is never reached -- confirm.
client = OpenAI()

# System message sent with every scenario-generation request made by
# generate_synthetic_data(); it tells the model to produce synthetic pull
# request scenarios that follow the JSON schema supplied with the request.
SYSTEM_PROMPT = """You are an expert software engineer and data generator. 
Your task is to generate synthetic Pull Request scenarios containing subtle bugs, security flaws, or architectural issues.
Follow the exact JSON schema provided."""

# Schema for one model-generated code review scenario. It is passed as
# `response_format` to the completion call in generate_synthetic_data(), and
# its fields are then copied into the user/assistant messages of a dataset row.
# NOTE(review): the allowed values for action_type, severity and category are
# only stated in the Field descriptions; the plain `str` annotations do not
# appear to enforce them -- confirm whether stricter types are wanted.
class GeneratedScenario(BaseModel):
    # Pull request metadata and the diff shown to the reviewer (user message).
    pr_title: str
    pr_description: str
    diff: str
    # Fields below make up the expected reviewer action (assistant message).
    action_type: str = Field(description="Must be 'flag_issue'")
    body: str = Field(description="Detailed explanation of the issue")
    # Location of the flagged issue.
    filename: str
    line_number: int
    severity: str = Field(description="One of: low, medium, high, critical")
    category: str = Field(description="One of: bug, security, architecture, performance, style")

def _request_scenario():
    """Request one scenario from the model and return the parsed object.

    Raises:
        ValueError: If the response carries no parsed scenario, so the caller
            can report a clear reason instead of an ``AttributeError`` raised
            by reading fields off ``None``.
    """
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "Generate a unique, realistic Python code review scenario. Include a unified diff."}
        ],
        response_format=GeneratedScenario,
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Presumably this happens when the model refuses; include the refusal
        # text when the SDK exposes it so the log line is actionable.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"model returned no parsed scenario (refusal: {refusal!r})")
    return message.parsed


def _to_training_example(scenario):
    """Convert a parsed scenario into one CodeLens chat-format dataset row."""
    # Convert to CodeLens instruction format
    user_msg = f"PR Title: {scenario.pr_title}\nPR Description: {scenario.pr_description}\nCode diff:\n```\n{scenario.diff}\n```\nOutput a single JSON action object."
    assistant_msg = json.dumps({
        "action_type": scenario.action_type,
        "body": scenario.body,
        "filename": scenario.filename,
        "line_number": scenario.line_number,
        "severity": scenario.severity,
        "category": scenario.category
    }, indent=2)

    return {
        "messages": [
            {"role": "system", "content": "You are an expert code reviewer..."},
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg}
        ]
    }


def generate_synthetic_data(count: int, output_file: str) -> None:
    """Generate up to ``count`` synthetic scenarios and append them to a file.

    Each successful scenario is written to ``output_file`` as one JSON object
    per line (JSONL) as soon as it is produced, so an interruption part-way
    through a run keeps the rows that were already generated. The file is
    opened in append mode, so existing content is preserved; it is created if
    missing, even when no scenario is generated.

    Args:
        count: Number of generation attempts. Zero or a negative value results
            in no API calls.
        output_file: Path of the JSONL dataset file to append to.

    Raises:
        OSError: If ``output_file`` cannot be opened or written. Failures of
            individual generations are printed and skipped, not raised.
    """
    print(f"Generating {count} synthetic scenarios...")
    written = 0

    # Append to existing dataset. Opening before the loop also surfaces an
    # unwritable path before any API call is made.
    with open(output_file, "a", encoding="utf-8") as f:
        for number in range(1, count + 1):
            print(f"Generating scenario {number}/{count}...")
            try:
                record = _to_training_example(_request_scenario())
            except Exception as e:
                # Best-effort loop: one failed generation must not abort the
                # run. Report it with the same 1-based number as the progress
                # line above.
                print(f"Failed on generation {number}: {e}")
                continue

            # Written outside the try so file errors propagate instead of
            # being reported as a failed generation.
            f.write(json.dumps(record) + "\n")
            f.flush()
            written += 1

    print(f"✅ Generated {written} scenarios and appended to {output_file}")

if __name__ == "__main__":
    # CLI entry point: parse --count / --output, verify the API key is
    # present, then run the generator.
    parser = argparse.ArgumentParser()
    parser.add_argument("--count", type=int, default=10, help="Number of scenarios to generate")
    parser.add_argument("--output", type=str, default="dataset.jsonl", help="Output file")
    args = parser.parse_args()

    if not os.getenv("OPENAI_API_KEY"):
        # Raise SystemExit directly rather than calling the `exit` helper,
        # which is injected by the `site` module and is not guaranteed to
        # exist. A string argument is printed to stderr and the process exits
        # with status 1.
        raise SystemExit("ERROR: OPENAI_API_KEY environment variable not set.")

    generate_synthetic_data(args.count, args.output)