datagen / src /datagen.py
lisekarimi's picture
Deploy version 0.1.0
db17eb5
"""Main data generation class for creating synthetic datasets using AI models."""
import os
from datetime import datetime
from .prompts import build_user_prompt, system_message
from .models import get_gpt_completion, get_claude_completion
from .utils import execute_code_in_virtualenv
from .constants import OUTPUT_DIR, logger
class DataGen:
    """Generate synthetic datasets by executing code written by an LLM.

    A prompt is built from the caller's keyword arguments, sent to the
    selected model ("GPT" or "Claude"), and the returned code is run through
    ``execute_code_in_virtualenv``.
    """

    def __init__(self, output_dir=None):
        """Store the output directory and make sure it exists.

        Args:
            output_dir: Directory handed to the prompt builder as
                ``file_path``. Any falsy value (``None``, ``""``) falls back
                to the ``OUTPUT_DIR`` constant.
        """
        self.output_dir = output_dir or OUTPUT_DIR
        os.makedirs(self.output_dir, exist_ok=True)

    def get_timestamp(self):
        """Return the current local time formatted as ``YYYYMMDD_HHMMSS``."""
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def generate_dataset(self, **input_data):
        """Generate a synthetic dataset with the model named in ``input_data``.

        Args:
            **input_data: Keyword arguments forwarded unchanged to
                ``build_user_prompt``. Must contain ``model`` set to
                ``"GPT"`` or ``"Claude"``. Any ``file_path`` entry is
                overwritten with ``self.output_dir``.

        Returns:
            Whatever ``execute_code_in_virtualenv`` returns for the generated
            code (presumably the path of the produced file -- confirm against
            that helper).

        Raises:
            ValueError: If ``model`` is neither ``"GPT"`` nor ``"Claude"``.
            KeyError: If no ``model`` keyword was supplied.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            # Ensure the output directory exists before generating.
            os.makedirs(self.output_dir, exist_ok=True)

            # Tell the prompt builder where generated files should be written.
            input_data["file_path"] = self.output_dir
            prompt = build_user_prompt(**input_data)

            # Pick the completion backend; the model is checked only after
            # the prompt is built, matching the original order of failures.
            model = input_data["model"]
            if model == "GPT":
                code = get_gpt_completion(prompt, system_message)
            elif model == "Claude":
                code = get_claude_completion(prompt, system_message)
            else:
                raise ValueError("Invalid model selected.")

            # Run the generated code and hand its result back to the caller.
            return execute_code_in_virtualenv(code)
        except Exception as e:
            # logger.exception logs at ERROR level like logger.error did, but
            # also records the active traceback (assumes `logger` follows the
            # stdlib logging.Logger API -- confirm in .constants).
            logger.exception(f"Error in generate_dataset: {e}")
            raise