# veda-programming / data_collector.py
# vedaco's picture
# Create data_collector.py
# 3fc35d3 verified
"""Data collector for continuous learning"""
import json
import os
from datetime import datetime
from typing import Optional, Dict, List
import hashlib
from database import db
from config import (
DATA_DIR, LEARNING_FROM_FEEDBACK,
SAVE_ALL_INTERACTIONS, REQUIRE_APPROVAL
)
class DataCollector:
    """Collects and manages user interaction data for continuous learning.

    Persistence is delegated to the module-level ``db`` object; this class
    adds a per-instance session ID and an in-memory list of the interactions
    recorded through this instance.
    """

    def __init__(self):
        self.current_session_id = self._generate_session_id()
        # In-memory record of interactions saved via collect_interaction().
        self.session_interactions = []

    def _generate_session_id(self) -> str:
        """Generate a unique session ID.

        Returns:
            A 12-character lowercase hexadecimal string.
        """
        # A timestamp alone collides when two collectors are created within
        # the same clock tick, so random bytes are mixed in. SHA-256 is used
        # instead of MD5 because MD5 may be disabled on FIPS-restricted builds.
        seed = datetime.now().isoformat().encode() + os.urandom(16)
        return hashlib.sha256(seed).hexdigest()[:12]

    def collect_interaction(
        self,
        prompt: str,
        generated_code: str,
        temperature: float = 0.7,
        max_tokens: int = 100
    ) -> int:
        """Collect a user interaction.

        Args:
            prompt: The prompt text submitted by the user.
            generated_code: The code produced for that prompt.
            temperature: Sampling temperature stored with the interaction.
            max_tokens: Token limit stored with the interaction.

        Returns:
            The ID returned by ``db.save_interaction``, or ``-1`` when
            ``SAVE_ALL_INTERACTIONS`` is disabled and nothing was saved.
        """
        if not SAVE_ALL_INTERACTIONS:
            return -1

        # Save to database
        interaction_id = db.save_interaction(
            prompt=prompt,
            generated_code=generated_code,
            temperature=temperature,
            max_tokens=max_tokens,
            session_id=self.current_session_id
        )

        # Track in session
        self.session_interactions.append({
            'id': interaction_id,
            'prompt': prompt,
            'code': generated_code,
            'timestamp': datetime.now().isoformat()
        })
        return interaction_id

    def record_feedback(
        self,
        interaction_id: int,
        is_positive: bool,
        edited_code: Optional[str] = None
    ):
        """Record user feedback for an interaction.

        Does nothing when ``LEARNING_FROM_FEEDBACK`` is disabled.

        Args:
            interaction_id: ID of the interaction being rated.
            is_positive: ``True`` is stored as ``1``, ``False`` as ``-1``.
            edited_code: Optional user-edited code, passed through to
                ``db.update_feedback`` unchanged.
        """
        if not LEARNING_FROM_FEEDBACK:
            return

        feedback = 1 if is_positive else -1
        db.update_feedback(interaction_id, feedback, edited_code)

        # The original literals were mojibake of these two emoji.
        icon = "👍" if is_positive else "👎"
        print(f"Feedback recorded: {icon} for interaction {interaction_id}")

    def add_training_sample(self, code: str, category: str = "user_contributed"):
        """Add a code sample directly to training data.

        Args:
            code: The code text to store.
            category: Category label stored with the sample.

        Returns:
            Whatever ``db.add_code_sample`` returns.
        """
        return db.add_code_sample(code, source="user", category=category)

    def get_training_data(self, include_base: bool = True) -> List[str]:
        """Get all available training data.

        Args:
            include_base: When true, also append the contents of
                ``programming.txt`` (one directory above ``DATA_DIR``) if
                that file exists.

        Returns:
            A list of strings: approved interactions (prompt as a comment
            line followed by the code), then curated code samples, then the
            base file contents as a single entry.
        """
        # Approved user interactions: combine prompt and code for training.
        samples = [
            f"# Prompt: {item['prompt']}\n{item['code']}"
            for item in db.get_approved_samples()
        ]

        # Curated code samples
        samples.extend(item['code'] for item in db.get_all_code_samples())

        # Include base training data
        if include_base:
            base_path = os.path.join(DATA_DIR, "..", "programming.txt")
            # isfile (not exists) so a directory at this path is skipped
            # rather than making open() raise.
            if os.path.isfile(base_path):
                with open(base_path, 'r', encoding='utf-8') as f:
                    samples.append(f.read())

        return samples

    def get_new_training_data(self) -> List[Dict]:
        """Get new approved samples not yet used for training."""
        return db.get_approved_samples(not_used=True)

    def get_pending_count(self) -> int:
        """Get count of samples pending training."""
        return db.get_pending_samples_count()

    def get_statistics(self) -> Dict:
        """Get collection statistics.

        Returns:
            The mapping from ``db.get_statistics()`` with an added
            ``'session_interactions'`` entry holding the number of
            interactions tracked by this instance.
        """
        stats = db.get_statistics()
        stats['session_interactions'] = len(self.session_interactions)
        return stats

    def export_training_data(self, filepath: str):
        """Export all training data to a file.

        Samples from ``get_training_data()`` are written as UTF-8 text,
        separated by a blank line.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        samples = self.get_training_data()
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(samples))
        print(f"Exported {len(samples)} samples to {filepath}")
# Global collector instance: constructed once when this module is imported,
# which also generates the session ID held by this instance.
collector = DataCollector()