"""
Persistence utilities for datasets and scores
"""
import json
from pathlib import Path
from datetime import datetime


class SessionManager:
    """Persist scoring sessions (dataset, scores, comments) on disk.

    Three files are kept inside ``data_dir``:

    * ``dataset.jsonl`` -- the dataset, one JSON object per line;
    * ``dataset_metadata.json`` -- save time, item count and source info;
    * ``session.json`` -- scores, comments and the current position.
    """

    def __init__(self, data_dir="/app/data"):
        """Create the manager and make sure the storage directory exists.

        Args:
            data_dir: Directory (str or path-like) holding the persisted
                files. It is created, together with any missing parent
                directories, if it does not exist yet.
        """
        self.data_dir = Path(data_dir)
        # parents=True: a nested directory whose parents are missing (e.g. a
        # fresh container without /app) must not make construction fail.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.dataset_file = self.data_dir / "dataset.jsonl"
        self.session_file = self.data_dir / "session.json"

    @property
    def _metadata_file(self):
        """Path of the dataset metadata file (derived from ``data_dir``)."""
        return self.data_dir / "dataset_metadata.json"

    def save_dataset(self, dataset, source_info=None):
        """Save the dataset as JSONL together with a small metadata file.

        Args:
            dataset: Iterable of JSON-serialisable dicts. Any iterable is
                accepted (list, generator, ...); items are counted while
                they are written.
            source_info: Optional dict describing where the dataset came
                from (HF name, file name, etc.). Stored as ``{}`` when
                omitted or empty.

        Returns:
            True once both files have been written.
        """
        # Write the dataset, counting rows on the fly so that iterables
        # without a len() are supported as well.
        total_items = 0
        with open(self.dataset_file, 'w', encoding='utf-8') as f:
            for item in dataset:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
                total_items += 1

        # Write the metadata next to the dataset.
        metadata = {
            'saved_at': datetime.now().isoformat(),
            'total_items': total_items,
            'source_info': source_info or {}
        }
        with open(self._metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        return True

    def load_dataset(self):
        """Load the saved dataset and its metadata.

        Returns:
            Tuple ``(dataset, metadata)``. ``(None, None)`` when no dataset
            file exists. ``metadata`` is None when the dataset file exists
            but the metadata file does not.
        """
        if not self.dataset_file.exists():
            return None, None

        # Blank lines are skipped; every other line must be valid JSON.
        with open(self.dataset_file, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f if line.strip()]

        metadata = None
        metadata_file = self._metadata_file
        if metadata_file.exists():
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

        return dataset, metadata

    def save_session(self, feedback_scores, feedback_comments, scoring_index):
        """Save the session state (scores, comments, position).

        Args:
            feedback_scores: Dict ``{idx: score}``.
            feedback_comments: Dict ``{idx: comment}``.
            scoring_index: Current index.

        Returns:
            True once the session file has been written.
        """
        # JSON object keys are always strings, so the indices are
        # stringified explicitly here and converted back in load_session().
        session_data = {
            'feedback_scores': {str(k): v for k, v in feedback_scores.items()},
            'feedback_comments': {str(k): v for k, v in feedback_comments.items()},
            'scoring_index': scoring_index,
            'last_saved': datetime.now().isoformat(),
            'total_scored': len(feedback_scores)
        }

        with open(self.session_file, 'w', encoding='utf-8') as f:
            json.dump(session_data, f, indent=2, ensure_ascii=False)

        return True

    def load_session(self):
        """Load the saved session state.

        Returns:
            Dict with ``feedback_scores``, ``feedback_comments``,
            ``scoring_index``, ``last_saved`` and ``total_scored``, or None
            when no session file exists. The keys of the two feedback dicts
            are converted back to int.
        """
        if not self.session_file.exists():
            return None

        with open(self.session_file, 'r', encoding='utf-8') as f:
            session_data = json.load(f)

        # Restore the integer indices that were stringified on save.
        for field in ('feedback_scores', 'feedback_comments'):
            session_data[field] = {
                int(k): v for k, v in session_data[field].items()
            }

        return session_data

    def has_saved_session(self):
        """Return True when both the dataset file and the session file exist."""
        return self.dataset_file.exists() and self.session_file.exists()

    def clear_session(self):
        """Delete the saved dataset, session and metadata files.

        Files that are already absent are ignored, so the call is safe to
        repeat.

        Returns:
            True.
        """
        for path in (self.dataset_file, self.session_file, self._metadata_file):
            # EAFP: unlink directly instead of exists()+unlink(), which
            # would fail if the file disappeared between the two calls.
            try:
                path.unlink()
            except FileNotFoundError:
                pass

        return True

    def get_session_info(self):
        """Summarise the saved session.

        Returns:
            Dict with ``dataset_size``, ``total_scored``, ``last_saved``,
            ``scoring_index`` and ``source_info``, or None when there is no
            saved session. None is also returned when the saved dataset is
            empty (the dataset is tested for truthiness).
        """
        if not self.has_saved_session():
            return None

        dataset, dataset_metadata = self.load_dataset()
        session_data = self.load_session()

        if not (dataset and session_data):
            return None

        source_info = dataset_metadata.get('source_info', {}) if dataset_metadata else {}
        return {
            'dataset_size': len(dataset),
            'total_scored': session_data['total_scored'],
            'last_saved': session_data['last_saved'],
            'scoring_index': session_data['scoring_index'],
            'source_info': source_info
        }