""" env/scenario_generator.py Dynamically generates new DB engineering scenarios using an LLM. Used by CurriculumGenerator when agent reaches ultra tier. Also useful for generating additional training data. """ import os import json import random import requests from typing import Optional ENV_URL = os.getenv("ENV_URL", "https://junaid0600-sql-db-engineer-agent.hf.space") API_BASE = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") HF_TOKEN = os.getenv("HF_TOKEN", "") MODEL = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct") # Domain templates for variety DOMAINS = [ "e-commerce platform", "healthcare records system", "financial trading platform", "social media platform", "logistics and shipping", "gaming leaderboard system", "SaaS subscription platform", "analytics and reporting DB", "inventory management system", "HR and payroll system", ] TABLE_TEMPLATES = { "e-commerce platform": ["orders", "users", "products", "payments", "reviews"], "healthcare records system": ["patients", "appointments", "prescriptions", "doctors", "billing"], "financial trading platform": ["trades", "accounts", "portfolios", "transactions", "market_data"], "social media platform": ["posts", "users", "comments", "likes", "followers"], "logistics and shipping": ["shipments", "drivers", "routes", "tracking", "warehouses"], "gaming leaderboard system": ["players", "matches", "scores", "achievements", "seasons"], "SaaS subscription platform": ["subscriptions", "users", "invoices", "features", "usage_logs"], "analytics and reporting DB": ["events", "sessions", "users", "funnels", "reports"], "inventory management system": ["products", "stock", "suppliers", "purchase_orders", "warehouses"], "HR and payroll system": ["employees", "departments", "payroll", "attendance", "performance"], } class ScenarioGenerator: """ Generates novel DB engineering scenarios procedurally. Can use LLM for richer descriptions or pure procedural generation. """ def __init__(self): self.generated_count = 0 def generate_procedural( self, difficulty: str = "hard", domain: Optional[str] = None, ) -> dict: """ Generate a scenario procedurally (no LLM needed). Fast, deterministic, infinitely scalable. """ if domain is None: domain = random.choice(DOMAINS) table_names = TABLE_TEMPLATES.get(domain, ["orders", "users", "products"]) # Scale based on difficulty scales = { "easy": {"rows": (5000, 50000), "queries": 1, "max_steps": 15, "target": (75, 85)}, "medium": {"rows": (50000, 500000), "queries": 2, "max_steps": 30, "target": (70, 80)}, "hard": {"rows": (500000, 5000000), "queries": 3, "max_steps": 50, "target": (65, 75)}, "ultra": {"rows": (1000000, 10000000),"queries": 4, "max_steps": 40, "target": (60, 70)}, } scale = scales.get(difficulty, scales["hard"]) # Pick tables n_tables = {"easy": 1, "medium": 2, "hard": 3, "ultra": random.randint(4, 6)}.get(difficulty, 3) chosen = random.sample(table_names, min(n_tables, len(table_names))) tables = [] for name in chosen: rows = random.randint(*scale["rows"]) tables.append({ "name": name, "rows": rows, "indexes": ["PRIMARY"], "size_mb": rows // 200, }) # Generate slow queries filter_cols = ["user_id", "status", "created_at", "category", "type", "date"] slow_queries = [] for i in range(scale["queries"]): table = random.choice(chosen) col1 = random.choice(filter_cols) col2 = random.choice([c for c in filter_cols if c != col1]) avg_ms = random.randint(3000, 25000) slow_queries.append({ "id": f"q{i+1}", "sql": f"SELECT * FROM {table} WHERE {col1}=? AND {col2}=?", "avg_ms": avg_ms, "main_table": table, "rows_examined": tables[chosen.index(table)]["rows"] if table in chosen else 100000, }) # Missing index hints (one per query) missing_hints = [] for i, q in enumerate(slow_queries): table = q["main_table"] cols = [c.strip() for c in q["sql"].split("WHERE")[1].split("AND")] cols = [c.split("=")[0].strip() for c in cols if "=" in c][:2] missing_hints.append({ "table": table, "columns": cols, "reason": f"Composite WHERE clause on {table}", }) baseline = round(random.uniform(3.0, 12.0), 1) target = round(random.uniform(*scale["target"]), 1) self.generated_count += 1 scenario_id = f"gen_{difficulty}_{self.generated_count:04d}" return { "id": scenario_id, "description": f"{domain.title()}: {len(tables)} tables, {len(slow_queries)} slow queries. Auto-generated.", "tables": tables, "slow_queries": slow_queries, "missing_index_hints": missing_hints, "performance_score_baseline": baseline, "target_score": target, "max_steps": scale["max_steps"], "category": domain.replace(" ", "_"), "generated": True, } def generate_batch(self, n: int = 10, difficulty: str = "hard") -> list[dict]: """Generate a batch of scenarios for training data augmentation.""" scenarios = [] domains = DOMAINS * (n // len(DOMAINS) + 1) random.shuffle(domains) for i in range(n): domain = domains[i % len(domains)] scenario = self.generate_procedural(difficulty=difficulty, domain=domain) scenarios.append(scenario) print(f"āœ… Generated {n} {difficulty} scenarios") return scenarios def save_batch(self, scenarios: list[dict], filepath: str): """Save generated scenarios to JSON file.""" with open(filepath, "w") as f: json.dump(scenarios, f, indent=2) print(f"šŸ’¾ Saved {len(scenarios)} scenarios to {filepath}") def augment_dataset(self, n_per_difficulty: int = 5): """ Augment the existing dataset with generated scenarios. Saves to dataset/generated_*.json files. """ for diff in ["easy", "medium", "hard"]: batch = self.generate_batch(n_per_difficulty, difficulty=diff) filepath = f"dataset/generated_{diff}_scenarios.json" self.save_batch(batch, filepath) print(f"āœ… Dataset augmented with {n_per_difficulty * 3} new scenarios") # Singleton scenario_generator = ScenarioGenerator() if __name__ == "__main__": # Quick test — generate one scenario per difficulty print("šŸ”§ Scenario Generator Test") print("=" * 50) for diff in ["easy", "medium", "hard"]: s = scenario_generator.generate_procedural(difficulty=diff) print(f"\n[{diff.upper()}] {s['id']}") print(f" Domain: {s['category']}") print(f" Tables: {[t['name'] for t in s['tables']]}") print(f" Queries: {len(s['slow_queries'])}") print(f" Baseline: {s['performance_score_baseline']} → Target: {s['target_score']}") print(f" Max steps: {s['max_steps']}") print("\nāœ… Generator working correctly!")