File size: 41,341 Bytes

968c919

#!/usr/bin/env python3
"""
Advanced Training Data Generator
===============================
Generates high-quality training data from chunks with various formats and augmentations.
"""

import json
import random
import hashlib
import numpy as np
from typing import List, Dict, Any, Optional, Tuple, Generator
from dataclasses import dataclass, asdict
from datetime import datetime
import re
from pathlib import Path
from intelligent_chunking_processor import IntelligentChunk, ChunkMetadata

@dataclass
class TrainingExample:
    """A single prompt/completion training record derived from one chunk.

    The per-field notes describe how the generator methods in this module
    populate each attribute.
    """
    # Identifier of the form "<prefix>_<chunk_id>_<suffix>", e.g. "qa_<chunk_id>_0".
    example_id: str
    # Input half of the pair.
    prompt: str
    # Output half of the pair, matched to `prompt`.
    completion: str
    # Name of the generator family, e.g. 'qa', 'summarization', 'completion'.
    format_type: str
    # One of 'beginner', 'intermediate', 'advanced', 'expert'.
    difficulty_level: str
    # chunk_id of the chunk this example was built from.
    source_chunk_id: str
    # Small format-specific dictionary, e.g. {'summary_type': 'brief'}.
    metadata: Dict[str, Any]
    # Heuristic score; the generators in this module produce values in [0.5, 1.0].
    quality_score: float
    # Creation time as returned by datetime.now().isoformat().
    timestamp: str

@dataclass
class TrainingDataset:
    """A named collection of training examples plus summary statistics."""
    dataset_id: str
    dataset_name: str
    # presumably len(examples) — TODO confirm against the code that builds the dataset
    total_examples: int
    # presumably format_type -> number of examples — TODO confirm
    format_distribution: Dict[str, int]
    # presumably difficulty_level -> number of examples — TODO confirm
    difficulty_distribution: Dict[str, int]
    # Named aggregate quality figures; the exact keys are set by the dataset builder.
    quality_metrics: Dict[str, float]
    examples: List[TrainingExample]
    # NOTE(review): likely an ISO-8601 string like TrainingExample.timestamp — confirm
    created_timestamp: str

class AdvancedTrainingDataGenerator:
    """Advanced training data generator with multiple formats and augmentations."""
    
    def __init__(self, output_dir: str = "training_datasets"):
        """Create the output directory and register generators and settings.

        Args:
            output_dir: Directory used as this generator's output location.
                It is created, together with any missing parent
                directories, if it does not already exist.
        """
        self.output_dir = Path(output_dir)
        # parents=True lets nested paths such as "runs/2024/datasets" be
        # created in one call instead of raising FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # Maps a format name to the bound method that builds examples of
        # that format from a single chunk.
        self.formats = {
            'qa': self._generate_qa_examples,
            'summarization': self._generate_summarization_examples,
            'code_explanation': self._generate_code_explanation_examples,
            'translation': self._generate_translation_examples,
            'classification': self._generate_classification_examples,
            'completion': self._generate_completion_examples,
            'instruction_following': self._generate_instruction_examples,
            'reasoning': self._generate_reasoning_examples,
            'creative_writing': self._generate_creative_examples,
            'technical_documentation': self._generate_technical_examples
        }
        
        # Difficulty labels, listed from easiest to hardest.
        self.difficulty_levels = ['beginner', 'intermediate', 'advanced', 'expert']
        
        # Named quality-score thresholds.
        self.quality_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }
    
    def _generate_qa_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Generate Q&A examples from chunk."""
        examples = []
        content = chunk.content
        
        # Extract key concepts
        sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
        
        if len(sentences) < 2:
            return examples
        
        # Generate different types of questions
        question_types = [
            self._generate_what_questions,
            self._generate_how_questions,
            self._generate_why_questions,
            self._generate_when_questions,
            self._generate_where_questions
        ]
        
        for question_type in question_types:
            try:
                prompt, completion = question_type(sentences, chunk)
                if prompt and completion:
                    example = TrainingExample(
                        example_id=f"qa_{chunk.chunk_id}_{len(examples)}",
                        prompt=prompt,
                        completion=completion,
                        format_type='qa',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'question_type': question_type.__name__},
                        quality_score=self._calculate_quality_score(prompt, completion, 'qa'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Q&A generation error: {e}")
        
        return examples[:3]  # Limit to 3 examples per chunk
    
    def _generate_what_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'What' questions."""
        # Find sentences with definitions or explanations
        definition_sentences = [s for s in sentences if any(word in s.lower() for word in ['is', 'are', 'means', 'refers to', 'defined as'])]
        
        if not definition_sentences:
            return None, None
        
        sentence = random.choice(definition_sentences)
        
        # Extract the subject and definition
        if ' is ' in sentence.lower():
            parts = sentence.split(' is ', 1)
            if len(parts) == 2:
                subject = parts[0].strip()
                definition = parts[1].strip()
                prompt = f"What is {subject}?"
                completion = f"{subject} is {definition}"
                return prompt, completion
        
        return None, None
    
    def _generate_how_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'How' questions."""
        # Find sentences with processes or methods
        process_sentences = [s for s in sentences if any(word in s.lower() for word in ['process', 'method', 'step', 'procedure', 'algorithm'])]
        
        if not process_sentences:
            return None, None
        
        sentence = random.choice(process_sentences)
        
        # Create a how question
        if 'process' in sentence.lower():
            prompt = f"How does the process described work?"
            completion = sentence
            return prompt, completion
        
        return None, None
    
    def _generate_why_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'Why' questions."""
        # Find sentences with reasons or explanations
        reason_sentences = [s for s in sentences if any(word in s.lower() for word in ['because', 'due to', 'reason', 'explain', 'since'])]
        
        if not reason_sentences:
            return None, None
        
        sentence = random.choice(reason_sentences)
        
        # Create a why question
        prompt = f"Why is this important or relevant?"
        completion = sentence
        return prompt, completion
    
    def _generate_when_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'When' questions."""
        # Find sentences with temporal information
        temporal_sentences = [s for s in sentences if any(word in s.lower() for word in ['when', 'time', 'during', 'after', 'before', 'while'])]
        
        if not temporal_sentences:
            return None, None
        
        sentence = random.choice(temporal_sentences)
        
        prompt = f"When does this occur or apply?"
        completion = sentence
        return prompt, completion
    
    def _generate_where_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'Where' questions."""
        # Find sentences with location information
        location_sentences = [s for s in sentences if any(word in s.lower() for word in ['where', 'location', 'place', 'position', 'site'])]
        
        if not location_sentences:
            return None, None
        
        sentence = random.choice(location_sentences)
        
        prompt = f"Where does this occur or apply?"
        completion = sentence
        return prompt, completion
    
    def _generate_summarization_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Generate summarization examples."""
        examples = []
        content = chunk.content
        
        if len(content) < 200:  # Too short to summarize
            return examples
        
        # Create different types of summaries
        summary_types = ['brief', 'detailed', 'bullet_points']
        
        for summary_type in summary_types:
            try:
                prompt, completion = self._create_summary_pair(content, summary_type, chunk)
                if prompt and completion:
                    example = TrainingExample(
                        example_id=f"sum_{chunk.chunk_id}_{summary_type}",
                        prompt=prompt,
                        completion=completion,
                        format_type='summarization',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'summary_type': summary_type},
                        quality_score=self._calculate_quality_score(prompt, completion, 'summarization'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Summarization generation error: {e}")
        
        return examples
    
    def _create_summary_pair(self, content: str, summary_type: str, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Create a prompt-completion pair for summarization."""
        
        if summary_type == 'brief':
            prompt = f"Summarize the following text in 1-2 sentences:\n\n{content}"
            # Simple extractive summary (first and last sentences)
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            if len(sentences) >= 2:
                completion = f"{sentences[0]}. {sentences[-1]}."
            else:
                completion = sentences[0] if sentences else content[:100] + "..."
        
        elif summary_type == 'detailed':
            prompt = f"Provide a detailed summary of the following text:\n\n{content}"
            # Create a more detailed summary
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            if len(sentences) > 3:
                completion = f"{sentences[0]}. {sentences[len(sentences)//2]}. {sentences[-1]}."
            else:
                completion = content[:200] + "..."
        
        elif summary_type == 'bullet_points':
            prompt = f"Summarize the following text as bullet points:\n\n{content}"
            # Create bullet points
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            completion = "\n".join([f"• {s}" for s in sentences[:5]])
        
        return prompt, completion
    
    def _generate_code_explanation_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Generate code explanation examples."""
        examples = []
        
        # Check if chunk contains code
        if chunk.metadata.content_type != 'code':
            return examples
        
        content = chunk.content
        
        # Find code blocks
        code_blocks = re.findall(r'```[\s\S]*?```', content)
        if not code_blocks:
            # Look for inline code or function definitions
            code_blocks = re.findall(r'def\s+\w+\s*\([^)]*\):[\s\S]*?(?=\n\s*\w|\n\n|$)', content)
        
        for code_block in code_blocks[:2]:  # Limit to 2 examples
            try:
                # Clean code block
                clean_code = re.sub(r'```\w*\n?', '', code_block).strip()
                
                if len(clean_code) > 50:  # Only process substantial code
                    prompt = f"Explain what the following code does:\n\n```\n{clean_code}\n```"
                    completion = self._generate_code_explanation(clean_code, chunk)
                    
                    example = TrainingExample(
                        example_id=f"code_{chunk.chunk_id}_{len(examples)}",
                        prompt=prompt,
                        completion=completion,
                        format_type='code_explanation',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'code_language': self._detect_code_language(clean_code)},
                        quality_score=self._calculate_quality_score(prompt, completion, 'code_explanation'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Code explanation generation error: {e}")
        
        return examples
    
    def _generate_code_explanation(self, code: str, chunk: IntelligentChunk) -> str:
        """Generate explanation for code."""
        # Simple heuristics for code explanation
        
        if 'def ' in code:
            # Function definition
            func_name = re.search(r'def\s+(\w+)', code)
            if func_name:
                return f"This code defines a function called '{func_name.group(1)}'. The function performs the operations described in the code block."
        
        elif 'class ' in code:
            # Class definition
            class_name = re.search(r'class\s+(\w+)', code)
            if class_name:
                return f"This code defines a class called '{class_name.group(1)}'. The class contains methods and attributes as specified."
        
        elif 'import ' in code:
            return "This code imports external libraries or modules for use in the program."
        
        elif '=' in code and any(op in code for op in ['+', '-', '*', '/']):
            return "This code performs mathematical calculations or data processing operations."
        
        else:
            return "This code performs various programming operations as specified in the implementation."
    
    def _detect_code_language(self, code: str) -> str:
        """Detect programming language from code."""
        if 'def ' in code or 'import ' in code or 'from ' in code:
            return 'python'
        elif 'function ' in code or 'var ' in code or 'const ' in code:
            return 'javascript'
        elif '#include' in code or 'int main' in code:
            return 'c'
        elif 'public class' in code or 'System.out.println' in code:
            return 'java'
        else:
            return 'unknown'
    
    def _generate_completion_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Generate text completion examples."""
        examples = []
        content = chunk.content
        
        if len(content) < 100:
            return examples
        
        # Create completion tasks at different positions
        completion_positions = [0.3, 0.5, 0.7]  # 30%, 50%, 70% through the text
        
        for position in completion_positions:
            try:
                split_point = int(len(content) * position)
                
                # Find a good split point (end of sentence)
                sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
                if sentences:
                    sentence_lengths = [len(s) for s in sentences]
                    cumulative_length = 0
                    best_split = 0
                    
                    for i, length in enumerate(sentence_lengths):
                        cumulative_length += length
                        if cumulative_length >= split_point:
                            best_split = i
                            break
                    
                    if best_split < len(sentences) - 1:
                        prompt = ' '.join(sentences[:best_split + 1])
                        completion = ' '.join(sentences[best_split + 1:])
                        
                        if len(completion) > 20:  # Ensure meaningful completion
                            example = TrainingExample(
                                example_id=f"comp_{chunk.chunk_id}_{position}",
                                prompt=prompt,
                                completion=completion,
                                format_type='completion',
                                difficulty_level=self._determine_difficulty(chunk),
                                source_chunk_id=chunk.chunk_id,
                                metadata={'split_position': position},
                                quality_score=self._calculate_quality_score(prompt, completion, 'completion'),
                                timestamp=datetime.now().isoformat()
                            )
                            examples.append(example)
            except Exception as e:
                print(f"⚠️  Completion generation error: {e}")
        
        return examples[:2]  # Limit to 2 examples
    
    def _generate_classification_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Generate classification examples."""
        examples = []
        
        # Determine classification tasks based on content
        classification_tasks = []
        
        if chunk.metadata.content_type == 'code':
            classification_tasks.append(('programming_language', self._classify_programming_language))
        
        if chunk.metadata.content_type == 'natural_language':
            classification_tasks.append(('sentiment', self._classify_sentiment))
            classification_tasks.append(('topic', self._classify_topic))
        
        for task_name, classifier_func in classification_tasks:
            try:
                prompt, completion = classifier_func(chunk)
                if prompt and completion:
                    example = TrainingExample(
                        example_id=f"class_{chunk.chunk_id}_{task_name}",
                        prompt=prompt,
                        completion=completion,
                        format_type='classification',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'classification_task': task_name},
                        quality_score=self._calculate_quality_score(prompt, completion, 'classification'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Classification generation error: {e}")
        
        return examples
    
    def _classify_programming_language(self, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate programming language classification example."""
        content = chunk.content
        language = self._detect_code_language(content)
        
        prompt = f"Classify the programming language of the following code:\n\n```\n{content[:200]}...\n```"
        completion = f"The programming language is {language}."
        
        return prompt, completion
    
    def _classify_sentiment(self, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate sentiment classification example."""
        content = chunk.content
        sentiment = "positive" if chunk.metadata.sentiment_score > 0.1 else "negative" if chunk.metadata.sentiment_score < -0.1 else "neutral"
        
        prompt = f"Classify the sentiment of the following text:\n\n{content[:200]}..."
        completion = f"The sentiment is {sentiment}."
        
        return prompt, completion
    
    def _classify_topic(self, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate topic classification example."""
        content = chunk.content
        topic = chunk.metadata.semantic_topic
        
        prompt = f"Classify the main topic of the following text:\n\n{content[:200]}..."
        completion = f"The main topic is {topic}."
        
        return prompt, completion
    
    def _generate_instruction_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build instruction-following examples from a chunk.

        Each of the first two instruction templates is paired with the
        first 300 characters of the chunk as the prompt, and with the
        result of ``_apply_instruction`` on the full chunk text as the
        completion. A template that raises is reported on stdout and
        skipped.

        Returns:
            Up to two examples.
        """
        examples = []
        content = chunk.content
        
        # Create instruction-based prompts
        instructions = [
            "Rewrite the following text in a more formal tone:",
            "Simplify the following text for beginners:",
            "Convert the following text into bullet points:",
            "Explain the following concept step by step:"
        ]
        
        for instruction in instructions[:2]:  # Limit to 2 examples
            try:
                prompt = f"{instruction}\n\n{content[:300]}..."
                completion = self._apply_instruction(content, instruction)
                
                if completion:
                    # The built-in hash() of a str is salted per interpreter
                    # process, so it gave the same example a different ID on
                    # every run. A digest keeps the numeric suffix stable.
                    digest = hashlib.sha256(instruction.encode('utf-8')).hexdigest()
                    suffix = int(digest[:8], 16)
                    example = TrainingExample(
                        example_id=f"inst_{chunk.chunk_id}_{suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='instruction_following',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'instruction_type': instruction.split(':')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'instruction_following'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Instruction generation error: {e}")
        
        return examples
    
    def _apply_instruction(self, content: str, instruction: str) -> str:
        """Apply instruction to content."""
        if "formal tone" in instruction.lower():
            return content.replace("don't", "do not").replace("can't", "cannot").replace("won't", "will not")
        elif "simplify" in instruction.lower():
            # Simple simplification - remove complex words
            return content.replace("utilize", "use").replace("implement", "do").replace("facilitate", "help")
        elif "bullet points" in instruction.lower():
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            return "\n".join([f"• {s}" for s in sentences[:5]])
        elif "step by step" in instruction.lower():
            return f"Step 1: {content[:100]}\nStep 2: {content[100:200]}\nStep 3: {content[200:300]}"
        
        return content
    
    def _generate_reasoning_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build reasoning examples from a chunk.

        Each of the first two question templates is paired with the first
        300 characters of the chunk as the prompt, and with the response
        from ``_generate_reasoning_response`` as the completion. A template
        that raises is reported on stdout and skipped.

        Returns:
            Up to two examples.
        """
        examples = []
        content = chunk.content
        
        # Create reasoning prompts
        reasoning_prompts = [
            "What are the implications of the following statement?",
            "What can we infer from the following information?",
            "What are the potential causes of the following situation?",
            "What would be the logical next step based on the following?"
        ]
        
        for prompt_template in reasoning_prompts[:2]:  # Limit to 2 examples
            try:
                prompt = f"{prompt_template}\n\n{content[:300]}..."
                completion = self._generate_reasoning_response(content, prompt_template)
                
                if completion:
                    # The built-in hash() of a str is salted per interpreter
                    # process, so it gave the same example a different ID on
                    # every run. A digest keeps the numeric suffix stable.
                    digest = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()
                    suffix = int(digest[:8], 16)
                    example = TrainingExample(
                        example_id=f"reason_{chunk.chunk_id}_{suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='reasoning',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'reasoning_type': prompt_template.split('?')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'reasoning'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Reasoning generation error: {e}")
        
        return examples
    
    def _generate_reasoning_response(self, content: str, prompt_template: str) -> str:
        """Generate reasoning response."""
        if "implications" in prompt_template.lower():
            return "The implications suggest that this concept has broader applications and may influence related areas of study or practice."
        elif "infer" in prompt_template.lower():
            return "Based on this information, we can infer that there are underlying patterns or relationships that may not be immediately obvious."
        elif "causes" in prompt_template.lower():
            return "The potential causes likely involve multiple factors including environmental conditions, historical context, and systematic influences."
        elif "next step" in prompt_template.lower():
            return "The logical next step would be to investigate further, gather additional evidence, or implement the suggested approach."
        
        return "This requires careful analysis and consideration of multiple factors to reach a sound conclusion."
    
    def _generate_creative_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build creative-writing examples from a chunk.

        Each of the first two prompt templates is paired with the first
        200 characters of the chunk as the prompt, and with the response
        from ``_generate_creative_response`` as the completion. A template
        that raises is reported on stdout and skipped.

        Returns:
            Up to two examples.
        """
        examples = []
        content = chunk.content
        
        # Create creative prompts
        creative_prompts = [
            "Write a creative story based on the following concept:",
            "Create a poem inspired by the following theme:",
            "Write a dialogue between two characters discussing the following topic:",
            "Create an imaginative scenario based on the following information:"
        ]
        
        for prompt_template in creative_prompts[:2]:  # Limit to 2 examples
            try:
                prompt = f"{prompt_template}\n\n{content[:200]}..."
                completion = self._generate_creative_response(content, prompt_template)
                
                if completion:
                    # The built-in hash() of a str is salted per interpreter
                    # process, so it gave the same example a different ID on
                    # every run. A digest keeps the numeric suffix stable.
                    digest = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()
                    suffix = int(digest[:8], 16)
                    example = TrainingExample(
                        example_id=f"creative_{chunk.chunk_id}_{suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='creative_writing',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'creative_type': prompt_template.split(':')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'creative_writing'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Creative generation error: {e}")
        
        return examples
    
    def _generate_creative_response(self, content: str, prompt_template: str) -> str:
        """Generate creative response."""
        if "story" in prompt_template.lower():
            return f"Once upon a time, there was a concept that changed everything. This concept, drawn from the depths of knowledge, began to spread its influence across the world, touching lives and inspiring new ways of thinking."
        elif "poem" in prompt_template.lower():
            return f"In the realm of knowledge,\nWhere ideas take flight,\nThis concept emerges,\nShining bright in the night."
        elif "dialogue" in prompt_template.lower():
            return f"Character A: 'I find this concept fascinating.'\nCharacter B: 'Indeed, it opens up so many possibilities.'\nCharacter A: 'How do you think we should approach it?'\nCharacter B: 'Let's explore it together, step by step.'"
        elif "scenario" in prompt_template.lower():
            return f"In an alternate reality where this concept became the foundation of society, everything would be different. People would approach problems with new perspectives, and innovation would flourish in ways we can only imagine."
        
        return "This concept inspires creativity and imagination, opening doors to new possibilities and perspectives."
    
    def _generate_technical_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build technical-documentation examples from a chunk.

        Each of the first two prompt templates is paired with the first
        300 characters of the chunk as the prompt, and with the response
        from ``_generate_technical_response`` as the completion. A template
        that raises is reported on stdout and skipped.

        Returns:
            Up to two examples.
        """
        examples = []
        content = chunk.content
        
        # Create technical prompts
        technical_prompts = [
            "Create technical documentation for the following:",
            "Write an API documentation for the following code:",
            "Create a user manual for the following process:",
            "Write a troubleshooting guide for the following issue:"
        ]
        
        for prompt_template in technical_prompts[:2]:  # Limit to 2 examples
            try:
                prompt = f"{prompt_template}\n\n{content[:300]}..."
                completion = self._generate_technical_response(content, prompt_template)
                
                if completion:
                    # The built-in hash() of a str is salted per interpreter
                    # process, so it gave the same example a different ID on
                    # every run. A digest keeps the numeric suffix stable.
                    digest = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()
                    suffix = int(digest[:8], 16)
                    example = TrainingExample(
                        example_id=f"tech_{chunk.chunk_id}_{suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='technical_documentation',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'technical_type': prompt_template.split(' for')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'technical_documentation'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Technical generation error: {e}")
        
        return examples
    
    def _generate_technical_response(self, content: str, prompt_template: str) -> str:
        """Generate technical response."""
        if "documentation" in prompt_template.lower():
            return f"# Technical Documentation\n\n## Overview\nThis section provides comprehensive technical documentation for the described concept.\n\n## Implementation\n1. Setup and configuration\n2. Core functionality\n3. Integration guidelines\n\n## Examples\nSee the provided code samples for practical implementation."
        elif "API" in prompt_template.lower():
            return f"# API Documentation\n\n## Endpoints\n- GET /api/endpoint - Retrieve data\n- POST /api/endpoint - Create new entry\n\n## Parameters\n- param1: string (required)\n- param2: integer (optional)\n\n## Response Format\n```json\n{{\n  \"status\": \"success\",\n  \"data\": {{}}\n}}\n```"
        elif "manual" in prompt_template.lower():
            return f"# User Manual\n\n## Getting Started\n1. Install the required dependencies\n2. Configure the system settings\n3. Run the application\n\n## Usage\nFollow these steps to use the system effectively:\n1. Initialize the process\n2. Configure parameters\n3. Execute the operation"
        elif "troubleshooting" in prompt_template.lower():
            return f"# Troubleshooting Guide\n\n## Common Issues\n\n### Issue 1: Connection Problems\n**Symptoms:** Unable to connect\n**Solution:** Check network settings and firewall configuration\n\n### Issue 2: Performance Issues\n**Symptoms:** Slow response times\n**Solution:** Optimize system resources and check for bottlenecks"
        
        return "This technical documentation provides comprehensive guidance for implementation and usage."
    
    def _determine_difficulty(self, chunk: IntelligentChunk) -> str:
        """Determine difficulty level based on chunk metadata."""
        importance = chunk.metadata.importance_score
        readability = chunk.metadata.readability_score
        entity_count = chunk.metadata.entity_count
        
        # Calculate difficulty score
        difficulty_score = (1 - readability) + importance + (entity_count / 100)
        
        if difficulty_score < 0.3:
            return 'beginner'
        elif difficulty_score < 0.6:
            return 'intermediate'
        elif difficulty_score < 0.8:
            return 'advanced'
        else:
            return 'expert'
    
    def _calculate_quality_score(self, prompt: str, completion: str, format_type: str) -> float:
        """Calculate quality score for training example."""
        base_score = 0.5
        
        # Length factor
        prompt_len = len(prompt.split())
        completion_len = len(completion.split())
        
        if prompt_len > 10 and completion_len > 5:
            base_score += 0.2
        
        # Format-specific scoring
        if format_type == 'qa':
            if '?' in prompt and len(completion) > 20:
                base_score += 0.2
        elif format_type == 'summarization':
            if len(completion) < len(prompt) * 0.8:  # Good compression ratio
                base_score += 0.2
        elif format_type == 'code_explanation':
            if '```' in prompt and len(completion) > 30:
                base_score += 0.2
        
        # Coherence check
        if len(set(prompt.split()) & set(completion.split())) > 2:
            base_score += 0.1
        
        return min(base_score, 1.0)
    
    def generate_training_dataset(self,
                                chunks: List[IntelligentChunk],
                                dataset_name: str,
                                target_formats: Optional[List[str]] = None,
                                max_examples_per_chunk: int = 5,
                                quality_threshold: float = 0.5) -> TrainingDataset:
        """Generate a complete training dataset from chunks.

        For every chunk, each requested format generator is run, the resulting
        examples are filtered by quality, and the first
        ``max_examples_per_chunk`` survivors (in generation order) are kept.

        Args:
            chunks: Chunks to turn into training examples.
            dataset_name: Name stored on the dataset and hashed into its id.
            target_formats: Keys of ``self.formats`` to use. ``None`` selects
                every registered format. Unknown names are reported once and
                skipped.
            max_examples_per_chunk: Cap on examples kept per chunk, applied
                after the quality filter.
            quality_threshold: Minimum ``quality_score`` an example needs to
                be kept.

        Returns:
            A ``TrainingDataset`` holding the kept examples plus format,
            difficulty and quality statistics computed over them.
        """
        if target_formats is None:
            target_formats = list(self.formats.keys())

        # Resolve the generators once so a misspelled format name is reported
        # a single time instead of being silently ignored for every chunk.
        generators = []
        for format_name in target_formats:
            generate = self.formats.get(format_name)
            if generate is None:
                print(f"⚠️  Unknown format '{format_name}' skipped")
                continue
            generators.append((format_name, generate))

        all_examples = []

        for chunk in chunks:
            chunk_examples = []

            for format_name, generate in generators:
                try:
                    chunk_examples.extend(generate(chunk))
                except Exception as e:
                    # Best effort: one failing generator must not abort the
                    # whole run, so report it and move on.
                    print(f"⚠️  Error generating {format_name} examples: {e}")

            # Filter by quality first, then cap, so low-quality examples do
            # not use up the per-chunk budget.
            kept = [
                ex for ex in chunk_examples
                if ex.quality_score >= quality_threshold
            ][:max_examples_per_chunk]

            all_examples.extend(kept)

        # Calculate dataset statistics
        format_distribution = {}
        difficulty_distribution = {}
        quality_scores = []

        for example in all_examples:
            format_distribution[example.format_type] = format_distribution.get(example.format_type, 0) + 1
            difficulty_distribution[example.difficulty_level] = difficulty_distribution.get(example.difficulty_level, 0) + 1
            quality_scores.append(example.quality_score)

        # Cast the NumPy results to built-in floats: the metrics then have the
        # same type whether or not there are examples, and they print as plain
        # numbers rather than NumPy scalar reprs.
        if quality_scores:
            avg_quality = float(np.mean(quality_scores))
            min_quality = float(np.min(quality_scores))
            max_quality = float(np.max(quality_scores))
        else:
            avg_quality = min_quality = max_quality = 0.0

        quality_metrics = {
            'avg_quality': avg_quality,
            'min_quality': min_quality,
            'max_quality': max_quality,
            'high_quality_count': sum(1 for s in quality_scores if s >= 0.8),
            'medium_quality_count': sum(1 for s in quality_scores if 0.6 <= s < 0.8),
            'low_quality_count': sum(1 for s in quality_scores if s < 0.6)
        }

        # Take the timestamp once so the id and created_timestamp are derived
        # from the same instant.
        created_timestamp = datetime.now().isoformat()
        dataset_id = hashlib.sha256(f"{dataset_name}_{created_timestamp}".encode()).hexdigest()[:16]

        return TrainingDataset(
            dataset_id=dataset_id,
            dataset_name=dataset_name,
            total_examples=len(all_examples),
            format_distribution=format_distribution,
            difficulty_distribution=difficulty_distribution,
            quality_metrics=quality_metrics,
            examples=all_examples,
            created_timestamp=created_timestamp
        )
    
    def save_dataset(self, dataset: TrainingDataset, format: str = 'jsonl') -> str:
        """Save training dataset to file."""
        
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        if format == 'jsonl':
            filename = f"{dataset.dataset_name}_{timestamp}.jsonl"
            filepath = self.output_dir / filename
            
            with open(filepath, 'w', encoding='utf-8') as f:
                for example in dataset.examples:
                    f.write(json.dumps(asdict(example), ensure_ascii=False) + '\n')
        
        elif format == 'json':
            filename = f"{dataset.dataset_name}_{timestamp}.json"
            filepath = self.output_dir / filename
            
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(asdict(dataset), f, indent=2, ensure_ascii=False)
        
        else:
            raise ValueError(f"Unsupported format: {format}")
        
        return str(filepath)
    
    def load_dataset(self, filepath: str) -> TrainingDataset:
        """Load a training dataset from a ``.jsonl`` or JSON file.

        A path ending in ``.jsonl`` is read as one example per line. That
        layout carries no dataset-level fields, so the examples are wrapped in
        a minimal dataset: id ``"loaded"``, name taken from the file stem,
        empty distributions and metrics, and the current time as timestamp.
        Any other path is read as a single JSON document whose keys are the
        ``TrainingDataset`` fields.

        Args:
            filepath: Location of the file. A ``pathlib.Path`` is accepted as
                well as a string.

        Returns:
            The reconstructed ``TrainingDataset``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line or the document is not
                valid JSON.
            TypeError: If a record's keys do not match the dataclass fields.
        """
        # Check the suffix on the string form so Path objects work too; the
        # string test keeps the original matching rule for str inputs.
        is_jsonl = str(filepath).endswith('.jsonl')

        with open(filepath, 'r', encoding='utf-8') as f:
            if is_jsonl:
                # Skip blank lines (e.g. a trailing empty line left by an
                # editor); json.loads would raise on them.
                examples = [
                    TrainingExample(**json.loads(line))
                    for line in f
                    if line.strip()
                ]

                # Create minimal dataset object
                dataset = TrainingDataset(
                    dataset_id="loaded",
                    dataset_name=Path(filepath).stem,
                    total_examples=len(examples),
                    format_distribution={},
                    difficulty_distribution={},
                    quality_metrics={},
                    examples=examples,
                    created_timestamp=datetime.now().isoformat()
                )

            else:  # JSON format
                dataset_data = json.load(f)
                # Rebuild the nested example objects before constructing the
                # dataset, which would otherwise hold plain dicts.
                dataset_data['examples'] = [
                    TrainingExample(**ex_data) for ex_data in dataset_data['examples']
                ]
                dataset = TrainingDataset(**dataset_data)

        return dataset

def main():
    """Run the demo: chunk a sample text, build a dataset from it, and save it."""

    print("🚀 Advanced Training Data Generator Demo")
    print("=" * 50)

    # Initialize generator
    generator = AdvancedTrainingDataGenerator()

    # Sample text for the demo, written as explicit pieces so the
    # whitespace-only lines and the four-space indent are visible. It is fed
    # to the chunker exactly as written here.
    sample_content = (
        "\n"
        "    # Machine Learning Fundamentals\n"
        "    \n"
        "    Machine learning is a subset of artificial intelligence that focuses on algorithms and statistical models.\n"
        "    \n"
        "    ## Supervised Learning\n"
        "    Supervised learning uses labeled training data to learn a mapping from inputs to outputs.\n"
        "    \n"
        "    ```python\n"
        "    from sklearn.linear_model import LinearRegression\n"
        "    model = LinearRegression()\n"
        "    model.fit(X_train, y_train)\n"
        "    predictions = model.predict(X_test)\n"
        "    ```\n"
        "    \n"
        "    ## Unsupervised Learning\n"
        "    Unsupervised learning finds hidden patterns in data without labeled examples.\n"
        "    \n"
        "    The K-means algorithm is a popular clustering method that groups similar data points together.\n"
        "    "
    )

    # Local import: the processor class is needed here to build the demo chunks.
    from intelligent_chunking_processor import IntelligentChunkingProcessor
    chunk_processor = IntelligentChunkingProcessor()

    # The second argument is the SHA-256 hex digest of the sample text.
    content_digest = hashlib.sha256(sample_content.encode()).hexdigest()
    chunks = chunk_processor.create_intelligent_chunks(sample_content, content_digest)

    print(f"\n📝 Processing {len(chunks)} chunks...")

    # Generate training dataset
    dataset = generator.generate_training_dataset(
        chunks,
        dataset_name="ml_fundamentals_demo",
        target_formats=['qa', 'summarization', 'code_explanation', 'completion'],
        max_examples_per_chunk=3,
        quality_threshold=0.4,
    )

    print("\n✅ Generated training dataset:")
    print(f"   Dataset ID: {dataset.dataset_id}")
    print(f"   Total examples: {dataset.total_examples}")
    print(f"   Format distribution: {dataset.format_distribution}")
    print(f"   Difficulty distribution: {dataset.difficulty_distribution}")
    print(f"   Quality metrics: {dataset.quality_metrics}")

    # Show up to three examples, truncating long text to 100 characters.
    print("\n📄 Sample examples:")
    for number, example in enumerate(dataset.examples[:3], start=1):
        print(f"\n   Example {number} ({example.format_type}):")
        print(f"   Prompt: {example.prompt[:100]}...")
        print(f"   Completion: {example.completion[:100]}...")
        print(f"   Quality score: {example.quality_score:.2f}")

    # Save dataset
    saved_path = generator.save_dataset(dataset, format='jsonl')
    print(f"\n💾 Dataset saved to: {saved_path}")

    print("\n✅ Advanced training data generator ready!")

# Run the demo only when the file is executed as a script.
if __name__ == "__main__":
    main()