Spaces:
Sleeping
Sleeping
| """Main data generation class for creating synthetic datasets using AI models.""" | |
| import os | |
| from datetime import datetime | |
| from .prompts import build_user_prompt, system_message | |
| from .models import get_gpt_completion, get_claude_completion | |
| from .utils import execute_code_in_virtualenv | |
| from .constants import OUTPUT_DIR, logger | |
class DataGen:
    """Generate synthetic datasets by asking an LLM for code and running it.

    The instance only stores the directory that is handed to the prompt
    builder as the destination for generated files.
    """

    def __init__(self, output_dir=None):
        """Remember the output directory and create it if it is missing.

        Args:
            output_dir: Destination directory. Any falsy value (``None``,
                empty string) selects the ``OUTPUT_DIR`` constant instead.
        """
        self.output_dir = output_dir if output_dir else OUTPUT_DIR
        os.makedirs(self.output_dir, exist_ok=True)

    def get_timestamp(self):
        """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
        stamp_format = "%Y%m%d_%H%M%S"
        return datetime.now().strftime(stamp_format)

    def generate_dataset(self, **input_data):
        """Build a prompt, obtain code from the chosen model, and execute it.

        Args:
            **input_data: Keyword arguments forwarded to
                ``build_user_prompt``. Must contain ``"model"`` set to
                ``"GPT"`` or ``"Claude"``. Any ``"file_path"`` entry is
                overwritten with this instance's output directory.

        Returns:
            Whatever ``execute_code_in_virtualenv`` returns for the
            generated code (used by this class as the output file path).

        Raises:
            ValueError: If ``input_data["model"]`` is neither ``"GPT"`` nor
                ``"Claude"``.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            # The directory may have been removed since construction.
            os.makedirs(self.output_dir, exist_ok=True)

            # Generated code is told to write into our output directory.
            input_data["file_path"] = self.output_dir
            prompt = build_user_prompt(**input_data)

            # Pick the completion backend only after the prompt is built,
            # then make a single call through the selected function.
            chosen_model = input_data["model"]
            if chosen_model == "GPT":
                request_completion = get_gpt_completion
            elif chosen_model == "Claude":
                request_completion = get_claude_completion
            else:
                raise ValueError("Invalid model selected.")

            generated_code = request_completion(prompt, system_message)
            return execute_code_in_virtualenv(generated_code)
        except Exception as err:
            # Record the failure, then let the caller see the original error.
            logger.error(f"Error in generate_dataset: {err}")
            raise