Spaces:
Running
Running
| """Data collector for continuous learning""" | |
| import json | |
| import os | |
| from datetime import datetime | |
| from typing import Optional, Dict, List | |
| import hashlib | |
| from database import db | |
| from config import ( | |
| DATA_DIR, LEARNING_FROM_FEEDBACK, | |
| SAVE_ALL_INTERACTIONS, REQUIRE_APPROVAL | |
| ) | |
class DataCollector:
    """Collect and manage user interaction data for continuous learning.

    Each instance represents one session: it owns a short random session ID
    and an in-memory list of the interactions saved through it. Persistence
    is delegated to the module-level ``db`` object, and the ``config`` flags
    ``SAVE_ALL_INTERACTIONS`` and ``LEARNING_FROM_FEEDBACK`` switch
    collection and feedback recording on or off.
    """

    def __init__(self):
        """Start a new session with a fresh ID and an empty interaction log."""
        self.current_session_id: str = self._generate_session_id()
        self.session_interactions: List[Dict] = []

    def _generate_session_id(self) -> str:
        """Return a fresh 12-character lowercase hexadecimal session ID.

        The ID is drawn from the operating system's random source instead of
        being derived from the current timestamp, so collectors created at
        the same instant (or in different processes) do not share an ID.
        """
        # 6 random bytes -> 12 hex characters, the same shape as the previous
        # truncated-digest IDs.
        return os.urandom(6).hex()

    def collect_interaction(
        self,
        prompt: str,
        generated_code: str,
        temperature: float = 0.7,
        max_tokens: int = 100
    ) -> int:
        """Persist one prompt/generation pair and track it in this session.

        Args:
            prompt: The prompt text that was submitted.
            generated_code: The code produced for that prompt.
            temperature: Sampling temperature, passed through to the database.
            max_tokens: Token limit, passed through to the database.

        Returns:
            The value returned by ``db.save_interaction``, or ``-1`` when
            ``SAVE_ALL_INTERACTIONS`` is falsy and nothing was stored.
        """
        if not SAVE_ALL_INTERACTIONS:
            return -1

        # Save to database
        interaction_id = db.save_interaction(
            prompt=prompt,
            generated_code=generated_code,
            temperature=temperature,
            max_tokens=max_tokens,
            session_id=self.current_session_id
        )

        # Track in session
        self.session_interactions.append({
            'id': interaction_id,
            'prompt': prompt,
            'code': generated_code,
            'timestamp': datetime.now().isoformat()
        })
        return interaction_id

    def record_feedback(
        self,
        interaction_id: int,
        is_positive: bool,
        edited_code: Optional[str] = None
    ) -> None:
        """Record user feedback for an interaction.

        Does nothing when ``LEARNING_FROM_FEEDBACK`` is falsy.

        Args:
            interaction_id: Identifier of the interaction being rated.
            is_positive: ``True`` is stored as ``1``, ``False`` as ``-1``.
            edited_code: Optional user-corrected code, forwarded unchanged to
                ``db.update_feedback``.
        """
        if not LEARNING_FROM_FEEDBACK:
            return

        feedback = 1 if is_positive else -1
        db.update_feedback(interaction_id, feedback, edited_code)

        # Distinct symbols so the log line shows which way the feedback went.
        icon = '👍' if is_positive else '👎'
        print(f"Feedback recorded: {icon} for interaction {interaction_id}")

    def add_training_sample(self, code: str, category: str = "user_contributed"):
        """Add a code sample directly to the training data.

        Returns whatever ``db.add_code_sample`` returns; the sample is always
        stored with ``source="user"``.
        """
        return db.add_code_sample(code, source="user", category=category)

    def get_training_data(self, include_base: bool = True) -> List[str]:
        """Assemble every available training sample as a list of strings.

        The result contains, in order:

        1. one entry per item from ``db.get_approved_samples()``, formatted as
           a ``# Prompt: ...`` comment line followed by the item's code;
        2. the ``'code'`` field of each item from ``db.get_all_code_samples()``;
        3. when ``include_base`` is true and the file exists, the full text of
           ``programming.txt`` located one directory above ``DATA_DIR``.
        """
        # Approved user interactions: combine prompt and code for training.
        samples = [
            f"# Prompt: {item['prompt']}\n{item['code']}"
            for item in db.get_approved_samples()
        ]

        # Curated code samples.
        samples.extend(item['code'] for item in db.get_all_code_samples())

        # Base training data is optional. Open it directly instead of checking
        # for existence first, so the file cannot vanish between check and read.
        if include_base:
            base_path = os.path.join(DATA_DIR, "..", "programming.txt")
            try:
                with open(base_path, 'r', encoding='utf-8') as f:
                    samples.append(f.read())
            except (FileNotFoundError, NotADirectoryError):
                pass

        return samples

    def get_new_training_data(self) -> List[Dict]:
        """Return the result of ``db.get_approved_samples(not_used=True)``.

        Presumably these are approved samples not yet consumed by a training
        run; the exact filter is defined by the database layer.
        """
        return db.get_approved_samples(not_used=True)

    def get_pending_count(self) -> int:
        """Return the pending-sample count reported by the database."""
        return db.get_pending_samples_count()

    def get_statistics(self) -> Dict:
        """Return database statistics plus this session's interaction count.

        The dict returned by ``db.get_statistics()`` is extended in place with
        a ``'session_interactions'`` key.
        """
        stats = db.get_statistics()
        stats['session_interactions'] = len(self.session_interactions)
        return stats

    def export_training_data(self, filepath: str) -> None:
        """Write all training samples to ``filepath`` and print a summary.

        Samples are separated by one blank line and written as UTF-8. An
        existing file at ``filepath`` is overwritten.
        """
        samples = self.get_training_data()
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(samples))
        print(f"Exported {len(samples)} samples to {filepath}")
# Shared module-level DataCollector, created once when this module is imported.
collector = DataCollector()