"""Main data generation class for creating synthetic datasets using AI models.""" import os from datetime import datetime from .prompts import build_user_prompt, system_message from .models import get_gpt_completion, get_claude_completion from .utils import execute_code_in_virtualenv from .constants import OUTPUT_DIR, logger class DataGen: """Handles synthetic data generation using AI models.""" def __init__(self, output_dir=None): """Initialize the data generator with output directory.""" # Use provided output_dir, or fall back to OUTPUT_DIR constant self.output_dir = output_dir or OUTPUT_DIR os.makedirs(self.output_dir, exist_ok=True) def get_timestamp(self): """Return current timestamp for file naming.""" return datetime.now().strftime("%Y%m%d_%H%M%S") def generate_dataset(self, **input_data): """Generate synthetic dataset based on input parameters and model choice.""" try: # Ensure output directory exists before generating os.makedirs(self.output_dir, exist_ok=True) # Add output directory path to input data for file generation input_data["file_path"] = self.output_dir # Build the prompt to send to the selected LLM prompt = build_user_prompt(**input_data) # Call the selected LLM based on the model parameter if input_data["model"] == "GPT": code = get_gpt_completion(prompt, system_message) elif input_data["model"] == "Claude": code = get_claude_completion(prompt, system_message) else: raise ValueError("Invalid model selected.") # Execute the generated code and return the output file path file_path = execute_code_in_virtualenv(code) return file_path except Exception as e: # Log and re-raise any errors that occur during generation logger.error(f"Error in generate_dataset: {e}") raise