#!/usr/bin/env python3
"""
Advanced Training Data Generator
===============================

Generates high-quality training data from chunks with various formats and augmentations.
"""

import json
import random
import hashlib
import numpy as np
from typing import TYPE_CHECKING, List, Dict, Any, Optional, Tuple, Generator
from dataclasses import dataclass, asdict
from datetime import datetime
import re
from pathlib import Path

if TYPE_CHECKING:
    # These names are used only in annotations; chunks are consumed purely by
    # attribute access (chunk_id, content, metadata.*), so the chunking module
    # does not have to be importable just to load this one.
    from intelligent_chunking_processor import IntelligentChunk, ChunkMetadata

# Sentence boundary used by every heuristic below: runs of '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r'[.!?]+')
# "api" as a standalone word in an already lower-cased prompt template.
_API_WORD = re.compile(r'\bapi\b')


@dataclass
class TrainingExample:
    """A single prompt/completion pair plus the bookkeeping stored with it."""
    example_id: str
    prompt: str
    completion: str
    format_type: str
    difficulty_level: str
    source_chunk_id: str
    metadata: Dict[str, Any]
    quality_score: float
    timestamp: str


@dataclass
class TrainingDataset:
    """A collection of training examples with aggregate statistics."""
    dataset_id: str
    dataset_name: str
    total_examples: int
    format_distribution: Dict[str, int]
    difficulty_distribution: Dict[str, int]
    quality_metrics: Dict[str, float]
    examples: List[TrainingExample]
    created_timestamp: str


class AdvancedTrainingDataGenerator:
    """Heuristic generator that turns chunks into training examples in several formats."""

    def __init__(self, output_dir: str = "training_datasets"):
        """Create the output directory and register the per-format generators.

        Args:
            output_dir: Directory that ``save_dataset`` writes into. Missing
                parent directories are created as well.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Training formats: name -> bound method taking a chunk and returning
        # a list of TrainingExample.
        self.formats = {
            'qa': self._generate_qa_examples,
            'summarization': self._generate_summarization_examples,
            'code_explanation': self._generate_code_explanation_examples,
            'translation': self._generate_translation_examples,
            'classification': self._generate_classification_examples,
            'completion': self._generate_completion_examples,
            'instruction_following': self._generate_instruction_examples,
            'reasoning': self._generate_reasoning_examples,
            'creative_writing': self._generate_creative_examples,
            'technical_documentation': self._generate_technical_examples,
        }

        # Difficulty levels
        self.difficulty_levels = ['beginner', 'intermediate', 'advanced', 'expert']

        # Quality thresholds
        self.quality_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4,
        }

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _split_sentences(content: str) -> List[str]:
        """Split *content* on '.', '!' and '?' and drop empty fragments."""
        return [s.strip() for s in _SENTENCE_BOUNDARY.split(content) if s.strip()]

    @staticmethod
    def _stable_suffix(text: str) -> int:
        """Return a number in 0..999 derived from *text*.

        The built-in ``hash`` of a string is salted per interpreter process,
        so it cannot be used for identifiers that must be reproducible.
        """
        return int(hashlib.sha256(text.encode('utf-8')).hexdigest(), 16) % 1000

    @staticmethod
    def _pick_sentence(sentences: List[str], keywords: List[str]) -> Optional[str]:
        """Pick a random sentence containing any keyword (substring, case-insensitive)."""
        candidates = [s for s in sentences if any(k in s.lower() for k in keywords)]
        return random.choice(candidates) if candidates else None

    def _make_example(self, chunk: "IntelligentChunk", example_id: str, prompt: str,
                      completion: str, format_type: str,
                      metadata: Dict[str, Any]) -> TrainingExample:
        """Build a TrainingExample, filling in difficulty, quality score and timestamp."""
        return TrainingExample(
            example_id=example_id,
            prompt=prompt,
            completion=completion,
            format_type=format_type,
            difficulty_level=self._determine_difficulty(chunk),
            source_chunk_id=chunk.chunk_id,
            metadata=metadata,
            quality_score=self._calculate_quality_score(prompt, completion, format_type),
            timestamp=datetime.now().isoformat(),
        )

    def _generate_templated_examples(self, chunk: "IntelligentChunk", templates: List[str], *,
                                     preview_chars: int, respond, id_prefix: str,
                                     format_type: str, metadata_key: str, metadata_value,
                                     error_label: str) -> List[TrainingExample]:
        """Shared loop for the template-driven formats.

        Only the first two templates are used. For each one the prompt is the
        template followed by the first ``preview_chars`` characters of the
        chunk, and the completion comes from ``respond(content, template)``.
        Failures for a single template are reported and skipped.
        """
        examples = []
        content = chunk.content

        for template in templates[:2]:  # Limit to 2 examples
            try:
                prompt = f"{template}\n\n{content[:preview_chars]}..."
                completion = respond(content, template)
                if completion:
                    examples.append(self._make_example(
                        chunk,
                        f"{id_prefix}_{chunk.chunk_id}_{self._stable_suffix(template)}",
                        prompt,
                        completion,
                        format_type,
                        {metadata_key: metadata_value(template)},
                    ))
            except Exception as e:
                print(f"⚠️ {error_label} generation error: {e}")

        return examples

    # ------------------------------------------------------------------
    # Q&A
    # ------------------------------------------------------------------

    def _generate_qa_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate up to three Q&A examples from a chunk with at least two sentences."""
        examples = []
        sentences = self._split_sentences(chunk.content)

        if len(sentences) < 2:
            return examples

        question_types = [
            self._generate_what_questions,
            self._generate_how_questions,
            self._generate_why_questions,
            self._generate_when_questions,
            self._generate_where_questions,
        ]

        for question_type in question_types:
            try:
                prompt, completion = question_type(sentences, chunk)
                if prompt and completion:
                    examples.append(self._make_example(
                        chunk,
                        f"qa_{chunk.chunk_id}_{len(examples)}",
                        prompt,
                        completion,
                        'qa',
                        {'question_type': question_type.__name__},
                    ))
            except Exception as e:
                print(f"⚠️ Q&A generation error: {e}")

        return examples[:3]  # Limit to 3 examples per chunk

    def _generate_what_questions(self, sentences: List[str],
                                 chunk: "IntelligentChunk") -> Tuple[Optional[str], Optional[str]]:
        """Build a "What is X?" pair from a sentence of the form "X is Y".

        Returns ``(None, None)`` when no candidate sentence exists or the
        randomly chosen candidate cannot be split on ``' is '``.
        """
        sentence = self._pick_sentence(
            sentences, ['is', 'are', 'means', 'refers to', 'defined as'])
        if sentence is None:
            return None, None

        if ' is ' in sentence.lower():
            parts = sentence.split(' is ', 1)
            if len(parts) == 2:
                subject = parts[0].strip()
                definition = parts[1].strip()
                return f"What is {subject}?", f"{subject} is {definition}"

        return None, None

    def _generate_how_questions(self, sentences: List[str],
                                chunk: "IntelligentChunk") -> Tuple[Optional[str], Optional[str]]:
        """Build a "How" pair; only produced when the chosen sentence mentions 'process'."""
        sentence = self._pick_sentence(
            sentences, ['process', 'method', 'step', 'procedure', 'algorithm'])
        if sentence is None:
            return None, None

        if 'process' in sentence.lower():
            return "How does the process described work?", sentence

        return None, None

    def _generate_why_questions(self, sentences: List[str],
                                chunk: "IntelligentChunk") -> Tuple[Optional[str], Optional[str]]:
        """Build a "Why" pair from a sentence containing a reason keyword."""
        sentence = self._pick_sentence(
            sentences, ['because', 'due to', 'reason', 'explain', 'since'])
        if sentence is None:
            return None, None
        return "Why is this important or relevant?", sentence

    def _generate_when_questions(self, sentences: List[str],
                                 chunk: "IntelligentChunk") -> Tuple[Optional[str], Optional[str]]:
        """Build a "When" pair from a sentence containing a temporal keyword."""
        sentence = self._pick_sentence(
            sentences, ['when', 'time', 'during', 'after', 'before', 'while'])
        if sentence is None:
            return None, None
        return "When does this occur or apply?", sentence

    def _generate_where_questions(self, sentences: List[str],
                                  chunk: "IntelligentChunk") -> Tuple[Optional[str], Optional[str]]:
        """Build a "Where" pair from a sentence containing a location keyword."""
        sentence = self._pick_sentence(
            sentences, ['where', 'location', 'place', 'position', 'site'])
        if sentence is None:
            return None, None
        return "Where does this occur or apply?", sentence

    # ------------------------------------------------------------------
    # Summarization
    # ------------------------------------------------------------------

    def _generate_summarization_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate brief, detailed and bullet-point summaries for chunks of 200+ characters."""
        examples = []
        content = chunk.content

        if len(content) < 200:  # Too short to summarize
            return examples

        for summary_type in ['brief', 'detailed', 'bullet_points']:
            try:
                prompt, completion = self._create_summary_pair(content, summary_type, chunk)
                if prompt and completion:
                    examples.append(self._make_example(
                        chunk,
                        f"sum_{chunk.chunk_id}_{summary_type}",
                        prompt,
                        completion,
                        'summarization',
                        {'summary_type': summary_type},
                    ))
            except Exception as e:
                print(f"⚠️ Summarization generation error: {e}")

        return examples

    def _create_summary_pair(self, content: str, summary_type: str,
                             chunk: "IntelligentChunk") -> Tuple[Optional[str], Optional[str]]:
        """Create an extractive prompt/completion pair for one summary type.

        Args:
            content: Text to summarize.
            summary_type: ``'brief'``, ``'detailed'`` or ``'bullet_points'``.
            chunk: Source chunk (not read here).

        Returns:
            ``(prompt, completion)``, or ``(None, None)`` for an unknown
            ``summary_type``.
        """
        sentences = self._split_sentences(content)

        if summary_type == 'brief':
            prompt = f"Summarize the following text in 1-2 sentences:\n\n{content}"
            # Simple extractive summary (first and last sentences)
            if len(sentences) >= 2:
                completion = f"{sentences[0]}. {sentences[-1]}."
            elif sentences:
                completion = sentences[0]
            else:
                completion = content[:100] + "..."
        elif summary_type == 'detailed':
            prompt = f"Provide a detailed summary of the following text:\n\n{content}"
            # First, middle and last sentence when there is enough material
            if len(sentences) > 3:
                completion = f"{sentences[0]}. {sentences[len(sentences)//2]}. {sentences[-1]}."
            else:
                completion = content[:200] + "..."
        elif summary_type == 'bullet_points':
            prompt = f"Summarize the following text as bullet points:\n\n{content}"
            completion = "\n".join([f"• {s}" for s in sentences[:5]])
        else:
            # Unknown type: signal "nothing generated" instead of hitting
            # unbound locals.
            return None, None

        return prompt, completion

    # ------------------------------------------------------------------
    # Code explanation
    # ------------------------------------------------------------------

    def _generate_code_explanation_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate up to two code-explanation examples for chunks whose content_type is 'code'."""
        examples = []

        if chunk.metadata.content_type != 'code':
            return examples

        content = chunk.content

        # Fenced blocks first; otherwise fall back to bare function definitions.
        code_blocks = re.findall(r'```[\s\S]*?```', content)
        if not code_blocks:
            code_blocks = re.findall(r'def\s+\w+\s*\([^)]*\):[\s\S]*?(?=\n\s*\w|\n\n|$)', content)

        for code_block in code_blocks[:2]:  # Limit to 2 examples
            try:
                # Strip the fence markers (and optional language tag)
                clean_code = re.sub(r'```\w*\n?', '', code_block).strip()

                if len(clean_code) > 50:  # Only process substantial code
                    prompt = f"Explain what the following code does:\n\n```\n{clean_code}\n```"
                    completion = self._generate_code_explanation(clean_code, chunk)
                    examples.append(self._make_example(
                        chunk,
                        f"code_{chunk.chunk_id}_{len(examples)}",
                        prompt,
                        completion,
                        'code_explanation',
                        {'code_language': self._detect_code_language(clean_code)},
                    ))
            except Exception as e:
                print(f"⚠️ Code explanation generation error: {e}")

        return examples

    def _generate_code_explanation(self, code: str, chunk: "IntelligentChunk") -> str:
        """Return a canned explanation chosen by simple keyword heuristics.

        Checks run in order: function definition, class definition, import,
        arithmetic assignment, generic fallback. A keyword whose name pattern
        does not match falls through to the next check, so a string is always
        returned.
        """
        if 'def ' in code:
            func_name = re.search(r'def\s+(\w+)', code)
            if func_name:
                return (f"This code defines a function called '{func_name.group(1)}'. "
                        "The function performs the operations described in the code block.")

        if 'class ' in code:
            class_name = re.search(r'class\s+(\w+)', code)
            if class_name:
                return (f"This code defines a class called '{class_name.group(1)}'. "
                        "The class contains methods and attributes as specified.")

        if 'import ' in code:
            return "This code imports external libraries or modules for use in the program."

        if '=' in code and any(op in code for op in ['+', '-', '*', '/']):
            return "This code performs mathematical calculations or data processing operations."

        return "This code performs various programming operations as specified in the implementation."

    def _detect_code_language(self, code: str) -> str:
        """Guess the language from marker substrings; returns 'unknown' when none match."""
        if 'def ' in code or 'import ' in code or 'from ' in code:
            return 'python'
        elif 'function ' in code or 'var ' in code or 'const ' in code:
            return 'javascript'
        elif '#include' in code or 'int main' in code:
            return 'c'
        elif 'public class' in code or 'System.out.println' in code:
            return 'java'
        else:
            return 'unknown'

    # ------------------------------------------------------------------
    # Completion
    # ------------------------------------------------------------------

    def _generate_completion_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate up to two text-continuation examples for chunks of 100+ characters.

        The text is split at the sentence boundary nearest 30%, 50% and 70%
        of its length; the leading sentences become the prompt and the rest
        the completion.
        """
        examples = []
        content = chunk.content

        if len(content) < 100:
            return examples

        # The sentence list does not depend on the split position.
        sentences = self._split_sentences(content)
        if not sentences:
            return examples

        for position in [0.3, 0.5, 0.7]:  # 30%, 50%, 70% through the text
            try:
                split_point = int(len(content) * position)

                # Index of the first sentence whose cumulative length reaches
                # the split point (stays 0 if it is never reached).
                cumulative_length = 0
                best_split = 0
                for i, sentence in enumerate(sentences):
                    cumulative_length += len(sentence)
                    if cumulative_length >= split_point:
                        best_split = i
                        break

                if best_split < len(sentences) - 1:
                    prompt = ' '.join(sentences[:best_split + 1])
                    completion = ' '.join(sentences[best_split + 1:])

                    if len(completion) > 20:  # Ensure meaningful completion
                        examples.append(self._make_example(
                            chunk,
                            f"comp_{chunk.chunk_id}_{position}",
                            prompt,
                            completion,
                            'completion',
                            {'split_position': position},
                        ))
            except Exception as e:
                print(f"⚠️ Completion generation error: {e}")

        return examples[:2]  # Limit to 2 examples

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------

    def _generate_translation_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Return no examples for the registered 'translation' format.

        The heuristics in this class have no source of translated text, so
        nothing can be generated; the method exists so that the format table
        built in ``__init__`` is valid.
        """
        return []

    # ------------------------------------------------------------------
    # Classification
    # ------------------------------------------------------------------

    def _generate_classification_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate classification examples chosen by the chunk's content_type.

        'code' chunks get a programming-language task; 'natural_language'
        chunks get sentiment and topic tasks.
        """
        examples = []

        classification_tasks = []
        if chunk.metadata.content_type == 'code':
            classification_tasks.append(('programming_language', self._classify_programming_language))
        if chunk.metadata.content_type == 'natural_language':
            classification_tasks.append(('sentiment', self._classify_sentiment))
            classification_tasks.append(('topic', self._classify_topic))

        for task_name, classifier_func in classification_tasks:
            try:
                prompt, completion = classifier_func(chunk)
                if prompt and completion:
                    examples.append(self._make_example(
                        chunk,
                        f"class_{chunk.chunk_id}_{task_name}",
                        prompt,
                        completion,
                        'classification',
                        {'classification_task': task_name},
                    ))
            except Exception as e:
                print(f"⚠️ Classification generation error: {e}")

        return examples

    def _classify_programming_language(self, chunk: "IntelligentChunk") -> Tuple[str, str]:
        """Build a language-classification pair using ``_detect_code_language``."""
        content = chunk.content
        language = self._detect_code_language(content)

        prompt = f"Classify the programming language of the following code:\n\n```\n{content[:200]}...\n```"
        completion = f"The programming language is {language}."
        return prompt, completion

    def _classify_sentiment(self, chunk: "IntelligentChunk") -> Tuple[str, str]:
        """Build a sentiment pair from ``chunk.metadata.sentiment_score``.

        Scores above 0.1 are positive, below -0.1 negative, otherwise neutral.
        """
        content = chunk.content
        score = chunk.metadata.sentiment_score
        if score > 0.1:
            sentiment = "positive"
        elif score < -0.1:
            sentiment = "negative"
        else:
            sentiment = "neutral"

        prompt = f"Classify the sentiment of the following text:\n\n{content[:200]}..."
        completion = f"The sentiment is {sentiment}."
        return prompt, completion

    def _classify_topic(self, chunk: "IntelligentChunk") -> Tuple[str, str]:
        """Build a topic pair from ``chunk.metadata.semantic_topic``."""
        content = chunk.content
        topic = chunk.metadata.semantic_topic

        prompt = f"Classify the main topic of the following text:\n\n{content[:200]}..."
        completion = f"The main topic is {topic}."
        return prompt, completion

    # ------------------------------------------------------------------
    # Instruction following
    # ------------------------------------------------------------------

    def _generate_instruction_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate instruction-following examples from the first two instructions."""
        instructions = [
            "Rewrite the following text in a more formal tone:",
            "Simplify the following text for beginners:",
            "Convert the following text into bullet points:",
            "Explain the following concept step by step:",
        ]
        return self._generate_templated_examples(
            chunk, instructions,
            preview_chars=300,
            respond=self._apply_instruction,
            id_prefix='inst',
            format_type='instruction_following',
            metadata_key='instruction_type',
            metadata_value=lambda template: template.split(':')[0],
            error_label='Instruction',
        )

    def _apply_instruction(self, content: str, instruction: str) -> str:
        """Apply a crude textual transformation selected by keywords in *instruction*.

        Unrecognized instructions return *content* unchanged.
        """
        lowered = instruction.lower()
        if "formal tone" in lowered:
            return content.replace("don't", "do not").replace("can't", "cannot").replace("won't", "will not")
        elif "simplify" in lowered:
            # Simple simplification - swap a few complex words
            return content.replace("utilize", "use").replace("implement", "do").replace("facilitate", "help")
        elif "bullet points" in lowered:
            sentences = self._split_sentences(content)
            return "\n".join([f"• {s}" for s in sentences[:5]])
        elif "step by step" in lowered:
            return f"Step 1: {content[:100]}\nStep 2: {content[100:200]}\nStep 3: {content[200:300]}"

        return content

    # ------------------------------------------------------------------
    # Reasoning
    # ------------------------------------------------------------------

    def _generate_reasoning_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate reasoning examples from the first two reasoning prompts."""
        reasoning_prompts = [
            "What are the implications of the following statement?",
            "What can we infer from the following information?",
            "What are the potential causes of the following situation?",
            "What would be the logical next step based on the following?",
        ]
        return self._generate_templated_examples(
            chunk, reasoning_prompts,
            preview_chars=300,
            respond=self._generate_reasoning_response,
            id_prefix='reason',
            format_type='reasoning',
            metadata_key='reasoning_type',
            metadata_value=lambda template: template.split('?')[0],
            error_label='Reasoning',
        )

    def _generate_reasoning_response(self, content: str, prompt_template: str) -> str:
        """Return a canned reasoning answer selected by keywords in the template."""
        lowered = prompt_template.lower()
        if "implications" in lowered:
            return "The implications suggest that this concept has broader applications and may influence related areas of study or practice."
        elif "infer" in lowered:
            return "Based on this information, we can infer that there are underlying patterns or relationships that may not be immediately obvious."
        elif "causes" in lowered:
            return "The potential causes likely involve multiple factors including environmental conditions, historical context, and systematic influences."
        elif "next step" in lowered:
            return "The logical next step would be to investigate further, gather additional evidence, or implement the suggested approach."

        return "This requires careful analysis and consideration of multiple factors to reach a sound conclusion."

    # ------------------------------------------------------------------
    # Creative writing
    # ------------------------------------------------------------------

    def _generate_creative_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate creative-writing examples from the first two creative prompts."""
        creative_prompts = [
            "Write a creative story based on the following concept:",
            "Create a poem inspired by the following theme:",
            "Write a dialogue between two characters discussing the following topic:",
            "Create an imaginative scenario based on the following information:",
        ]
        return self._generate_templated_examples(
            chunk, creative_prompts,
            preview_chars=200,
            respond=self._generate_creative_response,
            id_prefix='creative',
            format_type='creative_writing',
            metadata_key='creative_type',
            metadata_value=lambda template: template.split(':')[0],
            error_label='Creative',
        )

    def _generate_creative_response(self, content: str, prompt_template: str) -> str:
        """Return a canned creative text selected by keywords in the template."""
        lowered = prompt_template.lower()
        if "story" in lowered:
            return "Once upon a time, there was a concept that changed everything. This concept, drawn from the depths of knowledge, began to spread its influence across the world, touching lives and inspiring new ways of thinking."
        elif "poem" in lowered:
            return "In the realm of knowledge,\nWhere ideas take flight,\nThis concept emerges,\nShining bright in the night."
        elif "dialogue" in lowered:
            return "Character A: 'I find this concept fascinating.'\nCharacter B: 'Indeed, it opens up so many possibilities.'\nCharacter A: 'How do you think we should approach it?'\nCharacter B: 'Let's explore it together, step by step.'"
        elif "scenario" in lowered:
            return "In an alternate reality where this concept became the foundation of society, everything would be different. People would approach problems with new perspectives, and innovation would flourish in ways we can only imagine."

        return "This concept inspires creativity and imagination, opening doors to new possibilities and perspectives."

    # ------------------------------------------------------------------
    # Technical documentation
    # ------------------------------------------------------------------

    def _generate_technical_examples(self, chunk: "IntelligentChunk") -> List[TrainingExample]:
        """Generate technical-documentation examples from the first two technical prompts."""
        technical_prompts = [
            "Create technical documentation for the following:",
            "Write an API documentation for the following code:",
            "Create a user manual for the following process:",
            "Write a troubleshooting guide for the following issue:",
        ]
        return self._generate_templated_examples(
            chunk, technical_prompts,
            preview_chars=300,
            respond=self._generate_technical_response,
            id_prefix='tech',
            format_type='technical_documentation',
            metadata_key='technical_type',
            metadata_value=lambda template: template.split(' for')[0],
            error_label='Technical',
        )

    def _generate_technical_response(self, content: str, prompt_template: str) -> str:
        """Return a canned technical document selected by keywords in the template.

        The API check runs first and matches the lower-cased word "api": the
        API prompt also contains "documentation", so testing that keyword
        first would make the API branch unreachable.
        """
        lowered = prompt_template.lower()
        if _API_WORD.search(lowered):
            return "# API Documentation\n\n## Endpoints\n- GET /api/endpoint - Retrieve data\n- POST /api/endpoint - Create new entry\n\n## Parameters\n- param1: string (required)\n- param2: integer (optional)\n\n## Response Format\n```json\n{\n \"status\": \"success\",\n \"data\": {}\n}\n```"
        elif "documentation" in lowered:
            return "# Technical Documentation\n\n## Overview\nThis section provides comprehensive technical documentation for the described concept.\n\n## Implementation\n1. Setup and configuration\n2. Core functionality\n3. Integration guidelines\n\n## Examples\nSee the provided code samples for practical implementation."
        elif "manual" in lowered:
            return "# User Manual\n\n## Getting Started\n1. Install the required dependencies\n2. Configure the system settings\n3. Run the application\n\n## Usage\nFollow these steps to use the system effectively:\n1. Initialize the process\n2. Configure parameters\n3. Execute the operation"
        elif "troubleshooting" in lowered:
            return "# Troubleshooting Guide\n\n## Common Issues\n\n### Issue 1: Connection Problems\n**Symptoms:** Unable to connect\n**Solution:** Check network settings and firewall configuration\n\n### Issue 2: Performance Issues\n**Symptoms:** Slow response times\n**Solution:** Optimize system resources and check for bottlenecks"

        return "This technical documentation provides comprehensive guidance for implementation and usage."

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------

    def _determine_difficulty(self, chunk: "IntelligentChunk") -> str:
        """Map chunk metadata to a difficulty label.

        The score is ``(1 - readability) + importance + entity_count / 100``;
        below 0.3 is 'beginner', below 0.6 'intermediate', below 0.8
        'advanced', otherwise 'expert'.
        """
        importance = chunk.metadata.importance_score
        readability = chunk.metadata.readability_score
        entity_count = chunk.metadata.entity_count

        difficulty_score = (1 - readability) + importance + (entity_count / 100)

        if difficulty_score < 0.3:
            return 'beginner'
        elif difficulty_score < 0.6:
            return 'intermediate'
        elif difficulty_score < 0.8:
            return 'advanced'
        else:
            return 'expert'

    def _calculate_quality_score(self, prompt: str, completion: str, format_type: str) -> float:
        """Score an example between 0.5 and 1.0 using length, format and overlap heuristics."""
        base_score = 0.5

        # Length factor (word counts)
        prompt_len = len(prompt.split())
        completion_len = len(completion.split())

        if prompt_len > 10 and completion_len > 5:
            base_score += 0.2

        # Format-specific scoring
        if format_type == 'qa':
            if '?' in prompt and len(completion) > 20:
                base_score += 0.2
        elif format_type == 'summarization':
            if len(completion) < len(prompt) * 0.8:  # Good compression ratio
                base_score += 0.2
        elif format_type == 'code_explanation':
            if '```' in prompt and len(completion) > 30:
                base_score += 0.2

        # Coherence check: prompt and completion share more than two words
        if len(set(prompt.split()) & set(completion.split())) > 2:
            base_score += 0.1

        return min(base_score, 1.0)

    # ------------------------------------------------------------------
    # Dataset assembly and persistence
    # ------------------------------------------------------------------

    def generate_training_dataset(self, chunks: List["IntelligentChunk"],
                                  dataset_name: str,
                                  target_formats: Optional[List[str]] = None,
                                  max_examples_per_chunk: int = 5,
                                  quality_threshold: float = 0.5) -> TrainingDataset:
        """Generate a complete training dataset from chunks.

        Args:
            chunks: Chunks to generate examples from.
            dataset_name: Name stored on the dataset and used in file names.
            target_formats: Format names to generate; ``None`` means every
                registered format. Unknown names are ignored.
            max_examples_per_chunk: Cap applied per chunk after filtering.
            quality_threshold: Minimum ``quality_score`` an example must have.

        Returns:
            A TrainingDataset with the kept examples and summary statistics.
        """
        if target_formats is None:
            target_formats = list(self.formats.keys())

        all_examples = []

        for chunk in chunks:
            chunk_examples = []

            for format_name in target_formats:
                generator = self.formats.get(format_name)
                if generator is None:
                    continue
                try:
                    chunk_examples.extend(generator(chunk))
                except Exception as e:
                    print(f"⚠️ Error generating {format_name} examples: {e}")

            # Filter by quality first, then cap the number kept per chunk
            kept = [ex for ex in chunk_examples if ex.quality_score >= quality_threshold]
            all_examples.extend(kept[:max_examples_per_chunk])

        # Calculate dataset statistics
        format_distribution = {}
        difficulty_distribution = {}
        quality_scores = []

        for example in all_examples:
            format_distribution[example.format_type] = format_distribution.get(example.format_type, 0) + 1
            difficulty_distribution[example.difficulty_level] = difficulty_distribution.get(example.difficulty_level, 0) + 1
            quality_scores.append(example.quality_score)

        # Cast NumPy scalars to plain floats so the metrics print and
        # serialize as ordinary numbers.
        quality_metrics = {
            'avg_quality': float(np.mean(quality_scores)) if quality_scores else 0,
            'min_quality': float(np.min(quality_scores)) if quality_scores else 0,
            'max_quality': float(np.max(quality_scores)) if quality_scores else 0,
            'high_quality_count': len([s for s in quality_scores if s >= 0.8]),
            'medium_quality_count': len([s for s in quality_scores if 0.6 <= s < 0.8]),
            'low_quality_count': len([s for s in quality_scores if s < 0.6]),
        }

        # The id is derived from the name and the creation time
        dataset_id = hashlib.sha256(f"{dataset_name}_{datetime.now().isoformat()}".encode()).hexdigest()[:16]

        return TrainingDataset(
            dataset_id=dataset_id,
            dataset_name=dataset_name,
            total_examples=len(all_examples),
            format_distribution=format_distribution,
            difficulty_distribution=difficulty_distribution,
            quality_metrics=quality_metrics,
            examples=all_examples,
            created_timestamp=datetime.now().isoformat(),
        )

    def save_dataset(self, dataset: TrainingDataset, format: str = 'jsonl') -> str:
        """Save a dataset under ``output_dir`` and return the file path.

        Args:
            dataset: Dataset to write.
            format: ``'jsonl'`` writes one example per line; ``'json'`` writes
                the whole dataset, statistics included.

        Raises:
            ValueError: If *format* is neither ``'jsonl'`` nor ``'json'``.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        if format == 'jsonl':
            filepath = self.output_dir / f"{dataset.dataset_name}_{timestamp}.jsonl"
            with open(filepath, 'w', encoding='utf-8') as f:
                for example in dataset.examples:
                    f.write(json.dumps(asdict(example), ensure_ascii=False) + '\n')
        elif format == 'json':
            filepath = self.output_dir / f"{dataset.dataset_name}_{timestamp}.json"
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(asdict(dataset), f, indent=2, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported format: {format}")

        return str(filepath)

    def load_dataset(self, filepath: str) -> TrainingDataset:
        """Load a dataset written by ``save_dataset``.

        Files ending in ``.jsonl`` are read line by line (blank lines are
        skipped) into a minimal dataset whose statistics are left empty; any
        other file is read as the full JSON form. *filepath* may be a string
        or a path object.
        """
        path_text = str(filepath)

        with open(path_text, 'r', encoding='utf-8') as f:
            if path_text.endswith('.jsonl'):
                examples = [
                    TrainingExample(**json.loads(line))
                    for line in f
                    if line.strip()
                ]

                # Create minimal dataset object
                return TrainingDataset(
                    dataset_id="loaded",
                    dataset_name=Path(path_text).stem,
                    total_examples=len(examples),
                    format_distribution={},
                    difficulty_distribution={},
                    quality_metrics={},
                    examples=examples,
                    created_timestamp=datetime.now().isoformat(),
                )

            # JSON format
            dataset_data = json.load(f)
            dataset_data['examples'] = [
                TrainingExample(**ex_data) for ex_data in dataset_data['examples']
            ]
            return TrainingDataset(**dataset_data)


def main():
    """Demo the advanced training data generator."""
    print("🚀 Advanced Training Data Generator Demo")
    print("=" * 50)

    # Initialize generator
    generator = AdvancedTrainingDataGenerator()

    # Create sample chunks
    sample_content = """
# Machine Learning Fundamentals

Machine learning is a subset of artificial intelligence that focuses on algorithms and statistical models.

## Supervised Learning

Supervised learning uses labeled training data to learn a mapping from inputs to outputs.

```python
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
```

## Unsupervised Learning

Unsupervised learning finds hidden patterns in data without labeled examples.
The K-means algorithm is a popular clustering method that groups similar data points together.
"""

    # Create a sample chunk
    from intelligent_chunking_processor import IntelligentChunkingProcessor
    chunk_processor = IntelligentChunkingProcessor()
    chunks = chunk_processor.create_intelligent_chunks(
        sample_content,
        hashlib.sha256(sample_content.encode()).hexdigest()
    )

    print(f"\n📝 Processing {len(chunks)} chunks...")

    # Generate training dataset
    dataset = generator.generate_training_dataset(
        chunks,
        dataset_name="ml_fundamentals_demo",
        target_formats=['qa', 'summarization', 'code_explanation', 'completion'],
        max_examples_per_chunk=3,
        quality_threshold=0.4
    )

    print("\n✅ Generated training dataset:")
    print(f" Dataset ID: {dataset.dataset_id}")
    print(f" Total examples: {dataset.total_examples}")
    print(f" Format distribution: {dataset.format_distribution}")
    print(f" Difficulty distribution: {dataset.difficulty_distribution}")
    print(f" Quality metrics: {dataset.quality_metrics}")

    # Show sample examples
    print("\n📄 Sample examples:")
    for i, example in enumerate(dataset.examples[:3]):
        print(f"\n Example {i+1} ({example.format_type}):")
        print(f" Prompt: {example.prompt[:100]}...")
        print(f" Completion: {example.completion[:100]}...")
        print(f" Quality score: {example.quality_score:.2f}")

    # Save dataset
    output_file = generator.save_dataset(dataset, format='jsonl')
    print(f"\n💾 Dataset saved to: {output_file}")

    print("\n✅ Advanced training data generator ready!")


if __name__ == "__main__":
    main()