# File size: 4,146 Bytes
# 3fc35d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Data collector for continuous learning"""

import json
import os
from datetime import datetime
from typing import Optional, Dict, List
import hashlib

from database import db
from config import (
    DATA_DIR, LEARNING_FROM_FEEDBACK, 
    SAVE_ALL_INTERACTIONS, REQUIRE_APPROVAL
)

class DataCollector:
    """Collect and manage user interaction data for continuous learning.

    Persistence is delegated to the module-level ``db`` object. On top of
    that, each instance carries its own session ID and an in-memory log of
    the interactions it has saved.
    """

    def __init__(self):
        self.current_session_id = self._generate_session_id()
        # In-memory record of the interactions saved through this instance.
        self.session_interactions = []

    def _generate_session_id(self) -> str:
        """Return a random 12-character lowercase hexadecimal session ID.

        The ID is drawn from the operating system's random source instead of
        hashing the current timestamp, so collectors created at the same
        instant (for example in parallel worker processes) do not end up
        sharing a session ID.
        """
        return os.urandom(6).hex()

    def collect_interaction(
        self,
        prompt: str,
        generated_code: str,
        temperature: float = 0.7,
        max_tokens: int = 100
    ) -> int:
        """Save one prompt/completion pair and track it in the session log.

        Args:
            prompt: The prompt text that produced the code.
            generated_code: The code generated for the prompt.
            temperature: Sampling temperature stored with the interaction.
            max_tokens: Token limit stored with the interaction.

        Returns:
            The ID returned by ``db.save_interaction``, or ``-1`` when
            ``SAVE_ALL_INTERACTIONS`` is disabled and nothing was saved.
        """
        if not SAVE_ALL_INTERACTIONS:
            return -1

        # Save to database
        interaction_id = db.save_interaction(
            prompt=prompt,
            generated_code=generated_code,
            temperature=temperature,
            max_tokens=max_tokens,
            session_id=self.current_session_id
        )

        # Track in session
        self.session_interactions.append({
            'id': interaction_id,
            'prompt': prompt,
            'code': generated_code,
            'timestamp': datetime.now().isoformat()
        })

        return interaction_id

    def record_feedback(
        self,
        interaction_id: int,
        is_positive: bool,
        edited_code: Optional[str] = None
    ):
        """Record user feedback for a previously collected interaction.

        Args:
            interaction_id: ID returned by ``collect_interaction``.
            is_positive: ``True`` for a thumbs-up, ``False`` for a thumbs-down.
            edited_code: Optional user-corrected version of the code.

        The call does nothing when ``interaction_id`` is ``None`` or negative
        (``collect_interaction`` returns ``-1`` when it did not save
        anything), or when ``LEARNING_FROM_FEEDBACK`` is disabled.
        """
        # An unsaved interaction has no row to update; skip it instead of
        # reporting feedback as recorded.
        if interaction_id is None or interaction_id < 0:
            return

        if not LEARNING_FROM_FEEDBACK:
            return

        # Feedback is stored as +1 (positive) or -1 (negative).
        feedback = 1 if is_positive else -1
        db.update_feedback(interaction_id, feedback, edited_code)

        print(f"Feedback recorded: {'👍' if is_positive else '👎'} for interaction {interaction_id}")

    def add_training_sample(self, code: str, category: str = "user_contributed"):
        """Add a code sample directly to the training data.

        The sample is stored via ``db.add_code_sample`` with source ``"user"``;
        whatever that call returns is passed back to the caller.
        """
        return db.add_code_sample(code, source="user", category=category)

    def get_training_data(self, include_base: bool = True) -> List[str]:
        """Return all available training samples as strings.

        The list contains, in order: approved interactions (prompt rendered
        as a ``# Prompt:`` comment line followed by the code), the stored
        code samples, and, when ``include_base`` is true and the file
        exists, the full text of ``programming.txt`` located one directory
        above ``DATA_DIR``.
        """
        # Approved user interactions: combine prompt and code for training.
        samples = [
            f"# Prompt: {item['prompt']}\n{item['code']}"
            for item in db.get_approved_samples()
        ]

        # Curated code samples are used as-is.
        samples.extend(item['code'] for item in db.get_all_code_samples())

        # Include base training data
        if include_base:
            base_path = os.path.join(DATA_DIR, "..", "programming.txt")
            if os.path.exists(base_path):
                with open(base_path, 'r', encoding='utf-8') as f:
                    samples.append(f.read())

        return samples

    def get_new_training_data(self) -> List[Dict]:
        """Return approved samples not yet used for training.

        Delegates to ``db.get_approved_samples(not_used=True)``.
        """
        return db.get_approved_samples(not_used=True)

    def get_pending_count(self) -> int:
        """Return the number of samples pending training, as reported by ``db``."""
        return db.get_pending_samples_count()

    def get_statistics(self) -> Dict:
        """Return the database statistics plus this session's interaction count.

        The count is added under the ``'session_interactions'`` key of the
        dictionary returned by ``db.get_statistics()``.
        """
        stats = db.get_statistics()
        stats['session_interactions'] = len(self.session_interactions)
        return stats

    def export_training_data(self, filepath: str):
        """Write all training samples to ``filepath``.

        Samples are separated by a blank line and written as UTF-8; a summary
        line is printed afterwards.
        """
        samples = self.get_training_data()

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(samples))

        print(f"Exported {len(samples)} samples to {filepath}")

# Global collector instance: created once when this module is imported, so
# every importer shares the same session ID and in-memory interaction log.
collector = DataCollector()