# veda-programming / data_collector.py
# vedaco's picture
# Create data_collector.py
# 3fc35d3 verified
# raw
# history blame
# 4.15 kB
"""Data collector for continuous learning"""
import json
import os
from datetime import datetime
from typing import Optional, Dict, List
import hashlib
from database import db
from config import (
DATA_DIR, LEARNING_FROM_FEEDBACK,
SAVE_ALL_INTERACTIONS, REQUIRE_APPROVAL
)
class DataCollector:
    """Collect and manage user interaction data for continuous learning.

    Persistence is delegated to the module-level ``db`` object. In addition,
    the collector keeps an in-memory list of the interactions it has saved
    during the current session.
    """

    def __init__(self):
        """Start a new session with a fresh ID and an empty interaction log."""
        self.current_session_id = self._generate_session_id()
        self.session_interactions = []

    def _generate_session_id(self) -> str:
        """Return a 12-character lowercase hexadecimal session ID.

        The timestamp alone is not unique: two collectors created within the
        same clock tick would hash to the same ID. Random bytes are mixed in
        before hashing so that IDs stay distinct.
        """
        timestamp = datetime.now().isoformat()
        nonce = os.urandom(8).hex()
        digest = hashlib.md5(f"{timestamp}-{nonce}".encode()).hexdigest()
        return digest[:12]

    def collect_interaction(
        self,
        prompt: str,
        generated_code: str,
        temperature: float = 0.7,
        max_tokens: int = 100
    ) -> int:
        """Save one prompt/response pair and track it in the session log.

        Args:
            prompt: The prompt text of the interaction.
            generated_code: The code produced for the prompt.
            temperature: Sampling temperature forwarded to the database.
            max_tokens: Token limit forwarded to the database.

        Returns:
            The value returned by ``db.save_interaction``, or ``-1`` when
            ``SAVE_ALL_INTERACTIONS`` is disabled and nothing was stored.
        """
        if not SAVE_ALL_INTERACTIONS:
            return -1

        # Save to database
        interaction_id = db.save_interaction(
            prompt=prompt,
            generated_code=generated_code,
            temperature=temperature,
            max_tokens=max_tokens,
            session_id=self.current_session_id
        )

        # Track in session
        self.session_interactions.append({
            'id': interaction_id,
            'prompt': prompt,
            'code': generated_code,
            'timestamp': datetime.now().isoformat()
        })
        return interaction_id

    def record_feedback(
        self,
        interaction_id: int,
        is_positive: bool,
        edited_code: Optional[str] = None
    ) -> None:
        """Record user feedback for an interaction.

        Does nothing when ``LEARNING_FROM_FEEDBACK`` is disabled, or when
        ``interaction_id`` is negative (the "not saved" sentinel returned by
        ``collect_interaction``).

        Args:
            interaction_id: ID previously returned by ``collect_interaction``.
            is_positive: ``True`` is stored as ``1``, ``False`` as ``-1``.
            edited_code: Optional user-corrected code, forwarded unchanged to
                ``db.update_feedback``.
        """
        if not LEARNING_FROM_FEEDBACK:
            return

        # collect_interaction() returns -1 when nothing was stored; there is
        # no row to attach feedback to, so do not report a bogus success.
        if isinstance(interaction_id, int) and interaction_id < 0:
            return

        feedback = 1 if is_positive else -1
        db.update_feedback(interaction_id, feedback, edited_code)

        # Named escapes keep this source ASCII-only; the emoji literals that
        # used to be here had been corrupted by an encoding round-trip.
        icon = "\N{THUMBS UP SIGN}" if is_positive else "\N{THUMBS DOWN SIGN}"
        print(f"Feedback recorded: {icon} for interaction {interaction_id}")

    def add_training_sample(self, code: str, category: str = "user_contributed"):
        """Add a code sample directly to the training data.

        The sample is stored through ``db.add_code_sample`` with
        ``source="user"``; that call's return value is passed through.
        """
        return db.add_code_sample(code, source="user", category=category)

    def get_training_data(self, include_base: bool = True) -> List[str]:
        """Return all available training samples as a list of strings.

        The list holds, in order: approved interactions (formatted as a
        ``# Prompt:`` comment line followed by the code), the stored code
        samples, and, if ``include_base`` is true and the file exists, the
        full text of ``programming.txt`` located one level above ``DATA_DIR``.
        """
        # Approved user interactions: combine prompt and code for training.
        samples = [
            f"# Prompt: {item['prompt']}\n{item['code']}"
            for item in db.get_approved_samples()
        ]

        # Curated code samples.
        samples.extend(item['code'] for item in db.get_all_code_samples())

        # Base training data is optional; a missing file is simply skipped.
        if include_base:
            base_path = os.path.join(DATA_DIR, "..", "programming.txt")
            try:
                with open(base_path, 'r', encoding='utf-8') as f:
                    samples.append(f.read())
            except FileNotFoundError:
                pass

        return samples

    def get_new_training_data(self) -> List[Dict]:
        """Return approved samples not yet used for training.

        Thin wrapper around ``db.get_approved_samples(not_used=True)``.
        """
        return db.get_approved_samples(not_used=True)

    def get_pending_count(self) -> int:
        """Return the count of samples pending training, as reported by ``db``."""
        return db.get_pending_samples_count()

    def get_statistics(self) -> Dict:
        """Return the database statistics plus this session's interaction count.

        The count is added under the ``'session_interactions'`` key of the
        mapping returned by ``db.get_statistics()``.
        """
        stats = db.get_statistics()
        stats['session_interactions'] = len(self.session_interactions)
        return stats

    def export_training_data(self, filepath: str) -> None:
        """Write all training samples to ``filepath`` as UTF-8 text.

        Samples are separated by a blank line. An existing file at
        ``filepath`` is overwritten.
        """
        samples = self.get_training_data()
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(samples))
        print(f"Exported {len(samples)} samples to {filepath}")
# Global collector instance, intended to be shared by code importing this
# module. It is constructed at import time, so a new session ID is generated
# each time the module is loaded.
collector = DataCollector()