File size: 41,341 Bytes
968c919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
#!/usr/bin/env python3
"""
Advanced Training Data Generator
===============================
Generates high-quality training data from chunks with various formats and augmentations.
"""

import json
import random
import hashlib
import numpy as np
from typing import List, Dict, Any, Optional, Tuple, Generator
from dataclasses import dataclass, asdict
from datetime import datetime
import re
from pathlib import Path
from intelligent_chunking_processor import IntelligentChunk, ChunkMetadata

@dataclass
class TrainingExample:
    """A single prompt/completion training pair plus bookkeeping fields.

    Instances are created by the ``_generate_*_examples`` methods of
    ``AdvancedTrainingDataGenerator``.
    """
    # Identifier assembled by the generators from a format prefix, the source
    # chunk id and a per-example suffix (e.g. "qa_<chunk_id>_<n>").
    example_id: str
    # Text presented as the input side of the pair.
    prompt: str
    # Text expected as the output side of the pair.
    completion: str
    # Name of the generator format that produced the example, e.g. 'qa',
    # 'summarization', 'code_explanation'.
    format_type: str
    # Value returned by the generator's _determine_difficulty helper.
    # NOTE(review): presumably one of 'beginner'/'intermediate'/'advanced'/
    # 'expert' -- confirm against that helper.
    difficulty_level: str
    # chunk_id of the chunk the example was derived from.
    source_chunk_id: str
    # Format-specific extras, e.g. {'question_type': ...} or
    # {'summary_type': ...}.
    metadata: Dict[str, Any]
    # Value returned by the generator's _calculate_quality_score helper.
    # NOTE(review): range not visible here -- confirm.
    quality_score: float
    # Creation time as produced by datetime.now().isoformat().
    timestamp: str

@dataclass
class TrainingDataset:
    """A named collection of ``TrainingExample`` objects with summary fields.

    NOTE(review): the code that populates this class is not in the part of
    the file reviewed here; the field notes below marked "presumably" are
    assumptions to confirm against it.
    """
    # Identifier of the dataset.
    dataset_id: str
    # Human-readable dataset name.
    dataset_name: str
    # Presumably len(examples) -- confirm.
    total_examples: int
    # Presumably example count per TrainingExample.format_type -- confirm.
    format_distribution: Dict[str, int]
    # Presumably example count per TrainingExample.difficulty_level -- confirm.
    difficulty_distribution: Dict[str, int]
    # Named float metrics; the metric names are not visible here.
    quality_metrics: Dict[str, float]
    # The examples that make up the dataset.
    examples: List[TrainingExample]
    # Creation time as a string. Presumably ISO-8601 like
    # TrainingExample.timestamp -- confirm.
    created_timestamp: str
    created_timestamp: str

class AdvancedTrainingDataGenerator:
    """Advanced training data generator with multiple formats and augmentations."""
    
    def __init__(self, output_dir: str = "training_datasets"):
        """Create the output directory and set up the generator lookup tables.

        Args:
            output_dir: Path stored on ``self.output_dir``. The directory is
                created if absent, together with any missing parents.
        """
        self.output_dir = Path(output_dir)
        # parents=True so a nested path such as "runs/2024/datasets" does not
        # raise FileNotFoundError when intermediate directories are missing.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
        # Training formats: format name -> bound method that turns one chunk
        # into a list of TrainingExample objects.
        self.formats = {
            'qa': self._generate_qa_examples,
            'summarization': self._generate_summarization_examples,
            'code_explanation': self._generate_code_explanation_examples,
            'translation': self._generate_translation_examples,
            'classification': self._generate_classification_examples,
            'completion': self._generate_completion_examples,
            'instruction_following': self._generate_instruction_examples,
            'reasoning': self._generate_reasoning_examples,
            'creative_writing': self._generate_creative_examples,
            'technical_documentation': self._generate_technical_examples
        }
        
        # Difficulty levels, listed from easiest to hardest.
        self.difficulty_levels = ['beginner', 'intermediate', 'advanced', 'expert']
        
        # Quality thresholds: label -> score cut-off.
        self.quality_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }
    
    def _generate_qa_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build question/answer training examples from one chunk.

        The chunk text is split into sentences on ``.``, ``!`` and ``?``. Each
        question builder (what/how/why/when/where) is then tried once, in that
        order. Builders that return an empty prompt or completion are skipped,
        and any exception raised while building an example is printed and
        ignored.

        Returns:
            At most three examples; an empty list when the chunk has fewer
            than two sentences.
        """
        collected: List[TrainingExample] = []

        # Sentence fragments are the raw material for every question builder.
        fragments = []
        for piece in re.split(r'[.!?]+', chunk.content):
            piece = piece.strip()
            if piece:
                fragments.append(piece)

        if len(fragments) < 2:
            return collected

        builders = (
            self._generate_what_questions,
            self._generate_how_questions,
            self._generate_why_questions,
            self._generate_when_questions,
            self._generate_where_questions,
        )

        for builder in builders:
            try:
                question, answer = builder(fragments, chunk)
                if not (question and answer):
                    continue
                collected.append(
                    TrainingExample(
                        # Suffix is the count of examples accepted so far.
                        example_id=f"qa_{chunk.chunk_id}_{len(collected)}",
                        prompt=question,
                        completion=answer,
                        format_type='qa',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'question_type': builder.__name__},
                        quality_score=self._calculate_quality_score(question, answer, 'qa'),
                        timestamp=datetime.now().isoformat(),
                    )
                )
            except Exception as e:
                print(f"⚠️  Q&A generation error: {e}")

        # Cap the output at three examples per chunk.
        return collected[:3]
    
    def _generate_what_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'What' questions."""
        # Find sentences with definitions or explanations
        definition_sentences = [s for s in sentences if any(word in s.lower() for word in ['is', 'are', 'means', 'refers to', 'defined as'])]
        
        if not definition_sentences:
            return None, None
        
        sentence = random.choice(definition_sentences)
        
        # Extract the subject and definition
        if ' is ' in sentence.lower():
            parts = sentence.split(' is ', 1)
            if len(parts) == 2:
                subject = parts[0].strip()
                definition = parts[1].strip()
                prompt = f"What is {subject}?"
                completion = f"{subject} is {definition}"
                return prompt, completion
        
        return None, None
    
    def _generate_how_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'How' questions."""
        # Find sentences with processes or methods
        process_sentences = [s for s in sentences if any(word in s.lower() for word in ['process', 'method', 'step', 'procedure', 'algorithm'])]
        
        if not process_sentences:
            return None, None
        
        sentence = random.choice(process_sentences)
        
        # Create a how question
        if 'process' in sentence.lower():
            prompt = f"How does the process described work?"
            completion = sentence
            return prompt, completion
        
        return None, None
    
    def _generate_why_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'Why' questions."""
        # Find sentences with reasons or explanations
        reason_sentences = [s for s in sentences if any(word in s.lower() for word in ['because', 'due to', 'reason', 'explain', 'since'])]
        
        if not reason_sentences:
            return None, None
        
        sentence = random.choice(reason_sentences)
        
        # Create a why question
        prompt = f"Why is this important or relevant?"
        completion = sentence
        return prompt, completion
    
    def _generate_when_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'When' questions."""
        # Find sentences with temporal information
        temporal_sentences = [s for s in sentences if any(word in s.lower() for word in ['when', 'time', 'during', 'after', 'before', 'while'])]
        
        if not temporal_sentences:
            return None, None
        
        sentence = random.choice(temporal_sentences)
        
        prompt = f"When does this occur or apply?"
        completion = sentence
        return prompt, completion
    
    def _generate_where_questions(self, sentences: List[str], chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate 'Where' questions."""
        # Find sentences with location information
        location_sentences = [s for s in sentences if any(word in s.lower() for word in ['where', 'location', 'place', 'position', 'site'])]
        
        if not location_sentences:
            return None, None
        
        sentence = random.choice(location_sentences)
        
        prompt = f"Where does this occur or apply?"
        completion = sentence
        return prompt, completion
    
    def _generate_summarization_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build summarization examples (brief, detailed, bullet points).

        One prompt/completion pair is requested per summary style from
        ``_create_summary_pair``. Pairs with an empty prompt or completion are
        skipped, and any exception raised while building an example is printed
        and ignored.

        Returns:
            The examples built; an empty list when the chunk text is shorter
            than 200 characters.
        """
        results: List[TrainingExample] = []
        text = chunk.content

        # Texts under 200 characters are treated as too short to summarize.
        if len(text) < 200:
            return results

        for style in ('brief', 'detailed', 'bullet_points'):
            try:
                request, summary = self._create_summary_pair(text, style, chunk)
                if not (request and summary):
                    continue
                results.append(
                    TrainingExample(
                        example_id=f"sum_{chunk.chunk_id}_{style}",
                        prompt=request,
                        completion=summary,
                        format_type='summarization',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'summary_type': style},
                        quality_score=self._calculate_quality_score(request, summary, 'summarization'),
                        timestamp=datetime.now().isoformat(),
                    )
                )
            except Exception as e:
                print(f"⚠️  Summarization generation error: {e}")

        return results
    
    def _create_summary_pair(self, content: str, summary_type: str, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Create a prompt-completion pair for summarization."""
        
        if summary_type == 'brief':
            prompt = f"Summarize the following text in 1-2 sentences:\n\n{content}"
            # Simple extractive summary (first and last sentences)
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            if len(sentences) >= 2:
                completion = f"{sentences[0]}. {sentences[-1]}."
            else:
                completion = sentences[0] if sentences else content[:100] + "..."
        
        elif summary_type == 'detailed':
            prompt = f"Provide a detailed summary of the following text:\n\n{content}"
            # Create a more detailed summary
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            if len(sentences) > 3:
                completion = f"{sentences[0]}. {sentences[len(sentences)//2]}. {sentences[-1]}."
            else:
                completion = content[:200] + "..."
        
        elif summary_type == 'bullet_points':
            prompt = f"Summarize the following text as bullet points:\n\n{content}"
            # Create bullet points
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            completion = "\n".join([f"• {s}" for s in sentences[:5]])
        
        return prompt, completion
    
    def _generate_code_explanation_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build "explain this code" examples from a code chunk.

        Only chunks whose ``metadata.content_type`` is ``'code'`` are
        processed. Fenced code blocks are preferred; when the text has none,
        ``def`` definitions are extracted instead. At most the first two
        snippets are considered, and a snippet must be longer than 50
        characters after its fences are stripped. Any exception raised while
        building an example is printed and ignored.

        Returns:
            The examples built (possibly empty).
        """
        built: List[TrainingExample] = []

        if chunk.metadata.content_type != 'code':
            return built

        text = chunk.content

        # Fenced blocks first; fall back to function definitions.
        snippets = re.findall(r'```[\s\S]*?```', text) or re.findall(
            r'def\s+\w+\s*\([^)]*\):[\s\S]*?(?=\n\s*\w|\n\n|$)', text
        )

        for raw_snippet in snippets[:2]:
            try:
                # Drop the opening fence (with optional language tag) and the
                # closing fence.
                snippet = re.sub(r'```\w*\n?', '', raw_snippet).strip()

                # Trivial snippets are not worth an example.
                if len(snippet) <= 50:
                    continue

                question = f"Explain what the following code does:\n\n```\n{snippet}\n```"
                explanation = self._generate_code_explanation(snippet, chunk)

                built.append(
                    TrainingExample(
                        example_id=f"code_{chunk.chunk_id}_{len(built)}",
                        prompt=question,
                        completion=explanation,
                        format_type='code_explanation',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'code_language': self._detect_code_language(snippet)},
                        quality_score=self._calculate_quality_score(question, explanation, 'code_explanation'),
                        timestamp=datetime.now().isoformat(),
                    )
                )
            except Exception as e:
                print(f"⚠️  Code explanation generation error: {e}")

        return built
    
    def _generate_code_explanation(self, code: str, chunk: IntelligentChunk) -> str:
        """Generate explanation for code."""
        # Simple heuristics for code explanation
        
        if 'def ' in code:
            # Function definition
            func_name = re.search(r'def\s+(\w+)', code)
            if func_name:
                return f"This code defines a function called '{func_name.group(1)}'. The function performs the operations described in the code block."
        
        elif 'class ' in code:
            # Class definition
            class_name = re.search(r'class\s+(\w+)', code)
            if class_name:
                return f"This code defines a class called '{class_name.group(1)}'. The class contains methods and attributes as specified."
        
        elif 'import ' in code:
            return "This code imports external libraries or modules for use in the program."
        
        elif '=' in code and any(op in code for op in ['+', '-', '*', '/']):
            return "This code performs mathematical calculations or data processing operations."
        
        else:
            return "This code performs various programming operations as specified in the implementation."
    
    def _detect_code_language(self, code: str) -> str:
        """Detect programming language from code."""
        if 'def ' in code or 'import ' in code or 'from ' in code:
            return 'python'
        elif 'function ' in code or 'var ' in code or 'const ' in code:
            return 'javascript'
        elif '#include' in code or 'int main' in code:
            return 'c'
        elif 'public class' in code or 'System.out.println' in code:
            return 'java'
        else:
            return 'unknown'
    
    def _generate_completion_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build text-continuation examples by splitting the chunk in two.

        The text is split into sentences on ``.``, ``!`` and ``?``. For each
        target position (30%, 50% and 70% of the character length) the split is
        placed after the first sentence whose running length reaches that
        target. The sentences before the split form the prompt and the rest
        form the completion, each joined with single spaces. A split is
        skipped when it leaves no sentence for the completion, when the
        completion is 20 characters or shorter, or when an earlier position
        already produced an example at the same sentence boundary. Any
        exception raised while building an example is printed and ignored.

        Returns:
            At most two examples; an empty list when the text is shorter than
            100 characters or contains no sentences.
        """
        examples: List[TrainingExample] = []
        content = chunk.content

        if len(content) < 100:
            return examples

        # The sentence list and its running lengths do not depend on the
        # target position, so compute them once.
        sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
        if not sentences:
            return examples

        running_totals = []
        total = 0
        for sentence in sentences:
            total += len(sentence)
            running_totals.append(total)

        # Sentence indexes already used as a split, to avoid emitting the same
        # prompt/completion pair under two different ids.
        used_splits = set()

        for position in (0.3, 0.5, 0.7):
            try:
                split_point = int(len(content) * position)

                # First sentence whose running length reaches the target.
                # Falls back to the first sentence when none does (running
                # lengths exclude the stripped punctuation and spacing).
                best_split = next(
                    (i for i, reached in enumerate(running_totals) if reached >= split_point),
                    0,
                )

                if best_split >= len(sentences) - 1 or best_split in used_splits:
                    continue

                prompt = ' '.join(sentences[:best_split + 1])
                completion = ' '.join(sentences[best_split + 1:])

                # Ensure a meaningful completion.
                if len(completion) <= 20:
                    continue

                used_splits.add(best_split)
                examples.append(
                    TrainingExample(
                        example_id=f"comp_{chunk.chunk_id}_{position}",
                        prompt=prompt,
                        completion=completion,
                        format_type='completion',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'split_position': position},
                        quality_score=self._calculate_quality_score(prompt, completion, 'completion'),
                        timestamp=datetime.now().isoformat(),
                    )
                )
            except Exception as e:
                print(f"⚠️  Completion generation error: {e}")

        return examples[:2]  # Limit to 2 examples
    
    def _generate_classification_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build classification examples suited to the chunk's content type.

        Code chunks get a programming-language task; natural-language chunks
        get a sentiment task followed by a topic task. Other content types
        yield nothing. Tasks that return an empty prompt or completion are
        skipped, and any exception raised while building an example is printed
        and ignored.

        Returns:
            The examples built (possibly empty).
        """
        produced: List[TrainingExample] = []

        # (task label, method that builds the prompt/completion pair)
        pending = []
        content_type = chunk.metadata.content_type
        if content_type == 'code':
            pending.append(('programming_language', self._classify_programming_language))
        if content_type == 'natural_language':
            pending.extend([
                ('sentiment', self._classify_sentiment),
                ('topic', self._classify_topic),
            ])

        for label, build_pair in pending:
            try:
                question, verdict = build_pair(chunk)
                if not (question and verdict):
                    continue
                produced.append(
                    TrainingExample(
                        example_id=f"class_{chunk.chunk_id}_{label}",
                        prompt=question,
                        completion=verdict,
                        format_type='classification',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'classification_task': label},
                        quality_score=self._calculate_quality_score(question, verdict, 'classification'),
                        timestamp=datetime.now().isoformat(),
                    )
                )
            except Exception as e:
                print(f"⚠️  Classification generation error: {e}")

        return produced
    
    def _classify_programming_language(self, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate programming language classification example."""
        content = chunk.content
        language = self._detect_code_language(content)
        
        prompt = f"Classify the programming language of the following code:\n\n```\n{content[:200]}...\n```"
        completion = f"The programming language is {language}."
        
        return prompt, completion
    
    def _classify_sentiment(self, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate sentiment classification example."""
        content = chunk.content
        sentiment = "positive" if chunk.metadata.sentiment_score > 0.1 else "negative" if chunk.metadata.sentiment_score < -0.1 else "neutral"
        
        prompt = f"Classify the sentiment of the following text:\n\n{content[:200]}..."
        completion = f"The sentiment is {sentiment}."
        
        return prompt, completion
    
    def _classify_topic(self, chunk: IntelligentChunk) -> Tuple[str, str]:
        """Generate topic classification example."""
        content = chunk.content
        topic = chunk.metadata.semantic_topic
        
        prompt = f"Classify the main topic of the following text:\n\n{content[:200]}..."
        completion = f"The main topic is {topic}."
        
        return prompt, completion
    
    def _generate_instruction_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build instruction-following examples from one chunk.

        Only the first two instruction templates are used. The prompt shows
        the instruction followed by the first 300 characters of the text and
        "..."; the completion is ``_apply_instruction`` applied to the full
        text. Empty completions are skipped, and any exception raised while
        building an example is printed and ignored.

        Returns:
            The examples built (possibly empty).
        """
        examples = []
        content = chunk.content
        
        # Create instruction-based prompts
        instructions = [
            "Rewrite the following text in a more formal tone:",
            "Simplify the following text for beginners:",
            "Convert the following text into bullet points:",
            "Explain the following concept step by step:"
        ]
        
        for instruction in instructions[:2]:  # Limit to 2 examples
            try:
                prompt = f"{instruction}\n\n{content[:300]}..."
                completion = self._apply_instruction(content, instruction)
                
                if completion:
                    # Built-in hash() of a str is salted per process, so it
                    # would give a different id suffix on every run. A content
                    # digest keeps example ids reproducible.
                    digest = hashlib.sha256(instruction.encode('utf-8')).hexdigest()
                    id_suffix = int(digest, 16) % 1000
                    example = TrainingExample(
                        example_id=f"inst_{chunk.chunk_id}_{id_suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='instruction_following',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'instruction_type': instruction.split(':')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'instruction_following'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Instruction generation error: {e}")
        
        return examples
    
    def _apply_instruction(self, content: str, instruction: str) -> str:
        """Apply instruction to content."""
        if "formal tone" in instruction.lower():
            return content.replace("don't", "do not").replace("can't", "cannot").replace("won't", "will not")
        elif "simplify" in instruction.lower():
            # Simple simplification - remove complex words
            return content.replace("utilize", "use").replace("implement", "do").replace("facilitate", "help")
        elif "bullet points" in instruction.lower():
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            return "\n".join([f"• {s}" for s in sentences[:5]])
        elif "step by step" in instruction.lower():
            return f"Step 1: {content[:100]}\nStep 2: {content[100:200]}\nStep 3: {content[200:300]}"
        
        return content
    
    def _generate_reasoning_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build reasoning examples from one chunk.

        Only the first two reasoning templates are used. The prompt shows the
        template followed by the first 300 characters of the text and "...";
        the completion comes from ``_generate_reasoning_response``. Empty
        completions are skipped, and any exception raised while building an
        example is printed and ignored.

        Returns:
            The examples built (possibly empty).
        """
        examples = []
        content = chunk.content
        
        # Create reasoning prompts
        reasoning_prompts = [
            "What are the implications of the following statement?",
            "What can we infer from the following information?",
            "What are the potential causes of the following situation?",
            "What would be the logical next step based on the following?"
        ]
        
        for prompt_template in reasoning_prompts[:2]:  # Limit to 2 examples
            try:
                prompt = f"{prompt_template}\n\n{content[:300]}..."
                completion = self._generate_reasoning_response(content, prompt_template)
                
                if completion:
                    # Built-in hash() of a str is salted per process, so it
                    # would give a different id suffix on every run. A content
                    # digest keeps example ids reproducible.
                    digest = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()
                    id_suffix = int(digest, 16) % 1000
                    example = TrainingExample(
                        example_id=f"reason_{chunk.chunk_id}_{id_suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='reasoning',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'reasoning_type': prompt_template.split('?')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'reasoning'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Reasoning generation error: {e}")
        
        return examples
    
    def _generate_reasoning_response(self, content: str, prompt_template: str) -> str:
        """Generate reasoning response."""
        if "implications" in prompt_template.lower():
            return "The implications suggest that this concept has broader applications and may influence related areas of study or practice."
        elif "infer" in prompt_template.lower():
            return "Based on this information, we can infer that there are underlying patterns or relationships that may not be immediately obvious."
        elif "causes" in prompt_template.lower():
            return "The potential causes likely involve multiple factors including environmental conditions, historical context, and systematic influences."
        elif "next step" in prompt_template.lower():
            return "The logical next step would be to investigate further, gather additional evidence, or implement the suggested approach."
        
        return "This requires careful analysis and consideration of multiple factors to reach a sound conclusion."
    
    def _generate_creative_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Build creative-writing examples from one chunk.

        Only the first two creative templates are used. The prompt shows the
        template followed by the first 200 characters of the text and "...";
        the completion comes from ``_generate_creative_response``. Empty
        completions are skipped, and any exception raised while building an
        example is printed and ignored.

        Returns:
            The examples built (possibly empty).
        """
        examples = []
        content = chunk.content
        
        # Create creative prompts
        creative_prompts = [
            "Write a creative story based on the following concept:",
            "Create a poem inspired by the following theme:",
            "Write a dialogue between two characters discussing the following topic:",
            "Create an imaginative scenario based on the following information:"
        ]
        
        for prompt_template in creative_prompts[:2]:  # Limit to 2 examples
            try:
                prompt = f"{prompt_template}\n\n{content[:200]}..."
                completion = self._generate_creative_response(content, prompt_template)
                
                if completion:
                    # Built-in hash() of a str is salted per process, so it
                    # would give a different id suffix on every run. A content
                    # digest keeps example ids reproducible.
                    digest = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()
                    id_suffix = int(digest, 16) % 1000
                    example = TrainingExample(
                        example_id=f"creative_{chunk.chunk_id}_{id_suffix}",
                        prompt=prompt,
                        completion=completion,
                        format_type='creative_writing',
                        difficulty_level=self._determine_difficulty(chunk),
                        source_chunk_id=chunk.chunk_id,
                        metadata={'creative_type': prompt_template.split(':')[0]},
                        quality_score=self._calculate_quality_score(prompt, completion, 'creative_writing'),
                        timestamp=datetime.now().isoformat()
                    )
                    examples.append(example)
            except Exception as e:
                print(f"⚠️  Creative generation error: {e}")
        
        return examples
    
    def _generate_creative_response(self, content: str, prompt_template: str) -> str:
        """Generate creative response."""
        if "story" in prompt_template.lower():
            return f"Once upon a time, there was a concept that changed everything. This concept, drawn from the depths of knowledge, began to spread its influence across the world, touching lives and inspiring new ways of thinking."
        elif "poem" in prompt_template.lower():
            return f"In the realm of knowledge,\nWhere ideas take flight,\nThis concept emerges,\nShining bright in the night."
        elif "dialogue" in prompt_template.lower():
            return f"Character A: 'I find this concept fascinating.'\nCharacter B: 'Indeed, it opens up so many possibilities.'\nCharacter A: 'How do you think we should approach it?'\nCharacter B: 'Let's explore it together, step by step.'"
        elif "scenario" in prompt_template.lower():
            return f"In an alternate reality where this concept became the foundation of society, everything would be different. People would approach problems with new perspectives, and innovation would flourish in ways we can only imagine."
        
        return "This concept inspires creativity and imagination, opening doors to new possibilities and perspectives."
    
    def _generate_technical_examples(self, chunk: IntelligentChunk) -> List[TrainingExample]:
        """Generate technical-documentation training examples for a chunk.

        Only the first two entries of the technical prompt list are used. Each
        prompt is the template followed by the first 300 characters of the
        chunk content and a literal "..."; the completion comes from
        ``_generate_technical_response``.

        Args:
            chunk: Chunk whose ``content`` and ``chunk_id`` are read.

        Returns:
            The examples that were built. A template whose completion is empty
            is skipped; a template that raises is reported on stdout and
            skipped, so the remaining templates are still processed.
        """
        examples = []
        content = chunk.content

        # Create technical prompts
        technical_prompts = [
            "Create technical documentation for the following:",
            "Write an API documentation for the following code:",
            "Create a user manual for the following process:",
            "Write a troubleshooting guide for the following issue:"
        ]

        for prompt_template in technical_prompts[:2]:  # Limit to 2 examples
            try:
                prompt = f"{prompt_template}\n\n{content[:300]}..."
                completion = self._generate_technical_response(content, prompt_template)

                if not completion:
                    continue

                # str hashes are salted per interpreter process (PYTHONHASHSEED),
                # so the built-in hash() would give a different ID suffix on
                # every run. A SHA-256 digest keeps the IDs reproducible while
                # preserving the 0-999 numeric suffix.
                template_digest = hashlib.sha256(prompt_template.encode('utf-8')).hexdigest()
                template_key = int(template_digest, 16) % 1000

                examples.append(TrainingExample(
                    example_id=f"tech_{chunk.chunk_id}_{template_key}",
                    prompt=prompt,
                    completion=completion,
                    format_type='technical_documentation',
                    difficulty_level=self._determine_difficulty(chunk),
                    source_chunk_id=chunk.chunk_id,
                    # Text before " for", e.g. "Write an API documentation"
                    metadata={'technical_type': prompt_template.split(' for')[0]},
                    quality_score=self._calculate_quality_score(prompt, completion, 'technical_documentation'),
                    timestamp=datetime.now().isoformat()
                ))
            except Exception as e:
                # Best effort: one failing template must not abort the others.
                print(f"⚠️  Technical generation error: {e}")

        return examples
    
    def _generate_technical_response(self, content: str, prompt_template: str) -> str:
        """Generate technical response."""
        if "documentation" in prompt_template.lower():
            return f"# Technical Documentation\n\n## Overview\nThis section provides comprehensive technical documentation for the described concept.\n\n## Implementation\n1. Setup and configuration\n2. Core functionality\n3. Integration guidelines\n\n## Examples\nSee the provided code samples for practical implementation."
        elif "API" in prompt_template.lower():
            return f"# API Documentation\n\n## Endpoints\n- GET /api/endpoint - Retrieve data\n- POST /api/endpoint - Create new entry\n\n## Parameters\n- param1: string (required)\n- param2: integer (optional)\n\n## Response Format\n```json\n{{\n  \"status\": \"success\",\n  \"data\": {{}}\n}}\n```"
        elif "manual" in prompt_template.lower():
            return f"# User Manual\n\n## Getting Started\n1. Install the required dependencies\n2. Configure the system settings\n3. Run the application\n\n## Usage\nFollow these steps to use the system effectively:\n1. Initialize the process\n2. Configure parameters\n3. Execute the operation"
        elif "troubleshooting" in prompt_template.lower():
            return f"# Troubleshooting Guide\n\n## Common Issues\n\n### Issue 1: Connection Problems\n**Symptoms:** Unable to connect\n**Solution:** Check network settings and firewall configuration\n\n### Issue 2: Performance Issues\n**Symptoms:** Slow response times\n**Solution:** Optimize system resources and check for bottlenecks"
        
        return "This technical documentation provides comprehensive guidance for implementation and usage."
    
    def _determine_difficulty(self, chunk: IntelligentChunk) -> str:
        """Determine difficulty level based on chunk metadata."""
        importance = chunk.metadata.importance_score
        readability = chunk.metadata.readability_score
        entity_count = chunk.metadata.entity_count
        
        # Calculate difficulty score
        difficulty_score = (1 - readability) + importance + (entity_count / 100)
        
        if difficulty_score < 0.3:
            return 'beginner'
        elif difficulty_score < 0.6:
            return 'intermediate'
        elif difficulty_score < 0.8:
            return 'advanced'
        else:
            return 'expert'
    
    def _calculate_quality_score(self, prompt: str, completion: str, format_type: str) -> float:
        """Calculate quality score for training example."""
        base_score = 0.5
        
        # Length factor
        prompt_len = len(prompt.split())
        completion_len = len(completion.split())
        
        if prompt_len > 10 and completion_len > 5:
            base_score += 0.2
        
        # Format-specific scoring
        if format_type == 'qa':
            if '?' in prompt and len(completion) > 20:
                base_score += 0.2
        elif format_type == 'summarization':
            if len(completion) < len(prompt) * 0.8:  # Good compression ratio
                base_score += 0.2
        elif format_type == 'code_explanation':
            if '```' in prompt and len(completion) > 30:
                base_score += 0.2
        
        # Coherence check
        if len(set(prompt.split()) & set(completion.split())) > 2:
            base_score += 0.1
        
        return min(base_score, 1.0)
    
    def generate_training_dataset(self, 
                                chunks: List[IntelligentChunk],
                                dataset_name: str,
                                target_formats: List[str] = None,
                                max_examples_per_chunk: int = 5,
                                quality_threshold: float = 0.5) -> TrainingDataset:
        """Build a TrainingDataset from chunks using the registered format generators.

        For every chunk, each requested format that exists in ``self.formats``
        is run; a generator that raises is reported on stdout and skipped.
        Examples scoring below ``quality_threshold`` are dropped, then the
        first ``max_examples_per_chunk`` survivors are kept. When
        ``target_formats`` is None every key of ``self.formats`` is used.
        The returned dataset carries per-format and per-difficulty counts and
        summary quality metrics.
        """
        format_names = list(self.formats.keys()) if target_formats is None else target_formats

        collected = []
        for chunk in chunks:
            candidates = []

            for name in format_names:
                # Names without a registered generator are ignored.
                if name not in self.formats:
                    continue
                try:
                    produced = self.formats[name](chunk)
                    candidates.extend(produced)
                except Exception as e:
                    print(f"⚠️  Error generating {name} examples: {e}")

            # Quality filter first, then the per-chunk cap.
            passing = [cand for cand in candidates if cand.quality_score >= quality_threshold]
            collected.extend(passing[:max_examples_per_chunk])

        # Tally statistics over the retained examples.
        format_counts = {}
        difficulty_counts = {}
        scores = []
        for ex in collected:
            fmt, level = ex.format_type, ex.difficulty_level
            format_counts[fmt] = format_counts.get(fmt, 0) + 1
            difficulty_counts[level] = difficulty_counts.get(level, 0) + 1
            scores.append(ex.quality_score)

        metrics = {
            'avg_quality': np.mean(scores) if scores else 0,
            'min_quality': np.min(scores) if scores else 0,
            'max_quality': np.max(scores) if scores else 0,
            'high_quality_count': sum(1 for s in scores if s >= 0.8),
            'medium_quality_count': sum(1 for s in scores if 0.6 <= s < 0.8),
            'low_quality_count': sum(1 for s in scores if s < 0.6)
        }

        # The ID is the first 16 hex digits of SHA-256 over name + current time.
        id_seed = f"{dataset_name}_{datetime.now().isoformat()}"
        dataset_id = hashlib.sha256(id_seed.encode()).hexdigest()[:16]

        return TrainingDataset(
            dataset_id=dataset_id,
            dataset_name=dataset_name,
            total_examples=len(collected),
            format_distribution=format_counts,
            difficulty_distribution=difficulty_counts,
            quality_metrics=metrics,
            examples=collected,
            created_timestamp=datetime.now().isoformat()
        )
    
    def save_dataset(self, dataset: TrainingDataset, format: str = 'jsonl') -> str:
        """Save training dataset to file."""
        
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        if format == 'jsonl':
            filename = f"{dataset.dataset_name}_{timestamp}.jsonl"
            filepath = self.output_dir / filename
            
            with open(filepath, 'w', encoding='utf-8') as f:
                for example in dataset.examples:
                    f.write(json.dumps(asdict(example), ensure_ascii=False) + '\n')
        
        elif format == 'json':
            filename = f"{dataset.dataset_name}_{timestamp}.json"
            filepath = self.output_dir / filename
            
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(asdict(dataset), f, indent=2, ensure_ascii=False)
        
        else:
            raise ValueError(f"Unsupported format: {format}")
        
        return str(filepath)
    
    def load_dataset(self, filepath: str) -> TrainingDataset:
        """Load a training dataset from a ``.jsonl`` or JSON file.

        A path ending in ``.jsonl`` is read as one JSON-encoded example per
        line and wrapped in a minimal dataset: the ID is "loaded", the name is
        the file stem, and the distribution and quality-metric dicts are left
        empty. Any other path is read as a single JSON document holding the
        full dataset with its examples under the ``'examples'`` key.

        Args:
            filepath: Path of the file to read. ``pathlib.Path`` objects are
                accepted as well as strings.

        Returns:
            The reconstructed TrainingDataset.
        """
        # Normalise so that Path objects work with endswith() below.
        path_str = str(filepath)

        with open(path_str, 'r', encoding='utf-8') as f:
            if path_str.endswith('.jsonl'):
                # Skip blank / whitespace-only lines (e.g. a trailing empty
                # line) instead of failing in json.loads on them.
                examples = [
                    TrainingExample(**json.loads(line))
                    for line in f
                    if line.strip()
                ]

                # Create minimal dataset object
                dataset = TrainingDataset(
                    dataset_id="loaded",
                    dataset_name=Path(path_str).stem,
                    total_examples=len(examples),
                    format_distribution={},
                    difficulty_distribution={},
                    quality_metrics={},
                    examples=examples,
                    created_timestamp=datetime.now().isoformat()
                )

            else:  # JSON format
                dataset_data = json.load(f)
                # Rebuild example objects from their dict form before
                # constructing the dataset.
                dataset_data['examples'] = [
                    TrainingExample(**ex_data) for ex_data in dataset_data['examples']
                ]
                dataset = TrainingDataset(**dataset_data)

        return dataset

def main():
    """Run the demo: chunk a sample text, build a dataset, print a summary, save as JSONL."""

    print("🚀 Advanced Training Data Generator Demo")
    print("=" * 50)

    data_generator = AdvancedTrainingDataGenerator()

    # Sample document used as input for the demo
    sample_content = """
    # Machine Learning Fundamentals
    
    Machine learning is a subset of artificial intelligence that focuses on algorithms and statistical models.
    
    ## Supervised Learning
    Supervised learning uses labeled training data to learn a mapping from inputs to outputs.
    
    ```python
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    ```
    
    ## Unsupervised Learning
    Unsupervised learning finds hidden patterns in data without labeled examples.
    
    The K-means algorithm is a popular clustering method that groups similar data points together.
    """

    # The chunking processor is imported inside the function, so it is only
    # needed when the demo actually runs.
    from intelligent_chunking_processor import IntelligentChunkingProcessor
    processor = IntelligentChunkingProcessor()

    # The SHA-256 hex digest of the sample text is passed as the second argument.
    content_digest = hashlib.sha256(sample_content.encode()).hexdigest()
    chunks = processor.create_intelligent_chunks(sample_content, content_digest)

    print(f"\n📝 Processing {len(chunks)} chunks...")

    dataset = data_generator.generate_training_dataset(
        chunks,
        dataset_name="ml_fundamentals_demo",
        target_formats=['qa', 'summarization', 'code_explanation', 'completion'],
        max_examples_per_chunk=3,
        quality_threshold=0.4
    )

    # Dataset summary
    print("\n✅ Generated training dataset:")
    summary_rows = (
        ("Dataset ID", dataset.dataset_id),
        ("Total examples", dataset.total_examples),
        ("Format distribution", dataset.format_distribution),
        ("Difficulty distribution", dataset.difficulty_distribution),
        ("Quality metrics", dataset.quality_metrics),
    )
    for label, value in summary_rows:
        print(f"   {label}: {value}")

    # Preview of the first three examples
    print("\n📄 Sample examples:")
    for number, sample in enumerate(dataset.examples[:3], start=1):
        print(f"\n   Example {number} ({sample.format_type}):")
        print(f"   Prompt: {sample.prompt[:100]}...")
        print(f"   Completion: {sample.completion[:100]}...")
        print(f"   Quality score: {sample.quality_score:.2f}")

    saved_path = data_generator.save_dataset(dataset, format='jsonl')
    print(f"\n💾 Dataset saved to: {saved_path}")

    print("\n✅ Advanced training data generator ready!")

# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()