| | """ |
| | Synthetic Data Generator for MangoMAS Local |
| | |
| | This module provides a framework for generating synthetic training data |
| | for different specialized capabilities, adaptable across all modules. |
| | """ |
| |
|
| | import json |
| | import logging |
| | import os |
| | import random |
| | from abc import ABC, abstractmethod |
| | from pathlib import Path |
| | from typing import Any, Callable, Dict, List, Optional, Tuple |
| |
|
| | import yaml |
| | from tqdm import tqdm |
| |
|
logger = logging.getLogger(__name__)


class SyntheticDataGenerator(ABC):
    """
    Abstract base class for synthetic data generators.

    Each specialized module implements this interface for its own data
    generation; the base class provides dataset writing (JSONL) and
    augmentation of existing datasets with synthetic examples.
    """

    def __init__(self, config: Dict[str, Any], output_dir: str = "data/processed"):
        """
        Initialize the synthetic data generator.

        Args:
            config: Configuration for the data generator. The key
                "synthetic_examples" (default 1000) controls how many
                examples generate_dataset() produces when not overridden.
            output_dir: Directory to save generated data (created if missing).
        """
        self.config = config
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Default example count; can be overridden per call in generate_dataset().
        self.num_examples = config.get("synthetic_examples", 1000)

        # Subclass-specific templates consumed by generate_example().
        self.templates = self._load_templates()

        # Lazy %-style args: the message is only formatted if this level is enabled.
        logger.info("Initialized %s with %s examples",
                    self.__class__.__name__, self.num_examples)

    @abstractmethod
    def _load_templates(self) -> List[Dict[str, Any]]:
        """
        Load templates for data generation.
        Each implementation should define its own templates.

        Returns:
            List of template dictionaries
        """

    @abstractmethod
    def generate_example(self) -> Dict[str, Any]:
        """
        Generate a single synthetic training example.

        Returns:
            Dictionary with the generated example
        """

    def generate_dataset(self, filename: str, num_examples: Optional[int] = None) -> str:
        """
        Generate a synthetic dataset and save it to a JSONL file.

        Args:
            filename: Name of the output file (relative to output_dir)
            num_examples: Number of examples to generate (overrides config)

        Returns:
            Path to the generated dataset file
        """
        n = num_examples if num_examples is not None else self.num_examples
        output_file = self.output_dir / filename

        logger.info("Generating %s synthetic examples for %s",
                    n, self.__class__.__name__)

        with open(output_file, 'w', encoding='utf-8') as f:
            # Fix: the progress description previously contained a stray
            # "(unknown)" placeholder artifact; show the generator name instead.
            for _ in tqdm(range(n), desc=f"Generating {self.__class__.__name__}"):
                example = self.generate_example()
                f.write(json.dumps(example) + '\n')

        logger.info("Generated dataset saved to %s", output_file)
        return str(output_file)

    def augment_existing_dataset(self, input_file: str, output_file: Optional[str] = None,
                                 ratio: float = 0.5) -> str:
        """
        Augment an existing JSONL dataset with synthetic examples.

        Args:
            input_file: Path to the existing dataset
            output_file: Path to save the augmented dataset (or None to overwrite)
            ratio: Ratio of synthetic to original examples

        Returns:
            Path to the augmented dataset
        """
        if output_file is None:
            output_file = input_file

        # Load the existing dataset; a missing or malformed file degrades to
        # an empty dataset rather than failing the augmentation run.
        existing_data = []
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    # Fix: blank lines previously raised JSONDecodeError and
                    # discarded the whole dataset; skip them instead.
                    if not line:
                        continue
                    existing_data.append(json.loads(line))
        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.warning("Error loading existing data: %s", e)
            existing_data = []

        # Number of synthetic examples proportional to the existing data.
        n_existing = len(existing_data)
        n_synthetic = int(n_existing * ratio)

        synthetic_data = [self.generate_example()
                          for _ in tqdm(range(n_synthetic),
                                        desc="Generating augmentation data")]

        # Shuffle so synthetic and original examples are interleaved.
        combined_data = existing_data + synthetic_data
        random.shuffle(combined_data)

        with open(output_file, 'w', encoding='utf-8') as f:
            for item in combined_data:
                f.write(json.dumps(item) + '\n')

        logger.info("Augmented dataset with %s synthetic examples, saved to %s",
                    n_synthetic, output_file)
        return output_file
| |
|
| |
|
class SyntheticDataGeneratorRegistry:
    """Registry for all synthetic data generators in the system."""

    # Class-level mapping shared by all callers: module_type -> generator class.
    _generators: Dict[str, Any] = {}

    @classmethod
    def register(cls, module_type: str, generator_class) -> None:
        """Register a generator class for a module type."""
        cls._generators[module_type] = generator_class

    @classmethod
    def get_generator(cls, module_type: str, config: Dict[str, Any],
                      output_dir: str) -> "SyntheticDataGenerator":
        """
        Instantiate and return a generator for a module type.

        Args:
            module_type: Key the generator class was registered under
            config: Configuration passed to the generator's constructor
            output_dir: Output directory passed to the generator's constructor

        Raises:
            ValueError: If no generator is registered for module_type.
        """
        try:
            generator_cls = cls._generators[module_type]
        except KeyError:
            # Re-raise as ValueError (the registry's documented contract),
            # suppressing the internal KeyError context.
            raise ValueError(
                f"No generator registered for module type: {module_type}"
            ) from None
        return generator_cls(config, output_dir)

    @classmethod
    def list_generators(cls) -> List[str]:
        """List all registered generator types."""
        return list(cls._generators.keys())
| | |