Spaces:
Sleeping
Sleeping
| """ | |
| Persistence utilities for datasets and scores | |
| """ | |
| import json | |
| from pathlib import Path | |
| from datetime import datetime | |
class SessionManager:
    """Persist a scoring session (dataset, scores, comments, position) on disk.

    Three files are kept inside ``data_dir``:

    * ``dataset.jsonl`` - one JSON object per line,
    * ``dataset_metadata.json`` - save timestamp, item count and source info,
    * ``session.json`` - scores, comments and the current scoring index.
    """

    def __init__(self, data_dir="/app/data"):
        """Prepare the storage directory and the file paths.

        Args:
            data_dir: Directory (str or path-like) holding the persisted
                files. It is created, including missing parents, if needed.
        """
        self.data_dir = Path(data_dir)
        # parents=True so a nested, not-yet-existing directory does not raise.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.dataset_file = self.data_dir / "dataset.jsonl"
        self.session_file = self.data_dir / "session.json"
        self.metadata_file = self.data_dir / "dataset_metadata.json"

    def _atomic_write(self, target, write_body):
        """Write ``target`` via a sibling temp file, then rename it into place.

        ``write_body`` receives the open text file. If it raises, the previous
        content of ``target`` is left untouched and the temp file is removed.
        """
        tmp_path = target.with_name(target.name + ".tmp")
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                write_body(f)
            tmp_path.replace(target)
        finally:
            # Only still present when the write or the rename failed.
            if tmp_path.exists():
                tmp_path.unlink()

    def _write_json(self, target, payload):
        """Dump ``payload`` as indented UTF-8 JSON to ``target`` atomically."""
        self._atomic_write(
            target,
            lambda f: json.dump(payload, f, indent=2, ensure_ascii=False),
        )

    @staticmethod
    def _remove_if_present(path):
        """Delete ``path``, ignoring the case where it does not exist."""
        try:
            path.unlink()
        except FileNotFoundError:
            pass

    def save_dataset(self, dataset, source_info=None):
        """Save the dataset as JSONL together with its metadata file.

        Args:
            dataset: Iterable of JSON-serializable dicts (a list or any
                other iterable, including a generator).
            source_info: Optional dict describing the source (HF name,
                file name, etc.).

        Returns:
            True once both files have been written.
        """
        # Materialize once so generators can be both written and counted.
        items = list(dataset)

        self._atomic_write(
            self.dataset_file,
            lambda f: f.writelines(
                json.dumps(item, ensure_ascii=False) + '\n' for item in items
            ),
        )

        metadata = {
            'saved_at': datetime.now().isoformat(),
            'total_items': len(items),
            'source_info': source_info or {},
        }
        self._write_json(self.metadata_file, metadata)
        return True

    def load_dataset(self):
        """Load the saved dataset and its metadata.

        Returns:
            Tuple ``(dataset, metadata)``; ``(None, None)`` when no dataset
            file exists. ``metadata`` is None when only the metadata file is
            missing.
        """
        if not self.dataset_file.exists():
            return None, None

        with open(self.dataset_file, 'r', encoding='utf-8') as f:
            # Blank lines are skipped rather than parsed.
            dataset = [json.loads(line) for line in f if line.strip()]

        metadata = None
        if self.metadata_file.exists():
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        return dataset, metadata

    def save_session(self, feedback_scores, feedback_comments, scoring_index):
        """Save the session state (scores, comments, position).

        Args:
            feedback_scores: Dict ``{idx: score}``.
            feedback_comments: Dict ``{idx: comment}``.
            scoring_index: Current index.

        Returns:
            True once the session file has been written.
        """
        session_data = {
            # JSON object keys must be strings; load_session converts back.
            'feedback_scores': {str(k): v for k, v in feedback_scores.items()},
            'feedback_comments': {str(k): v for k, v in feedback_comments.items()},
            'scoring_index': scoring_index,
            'last_saved': datetime.now().isoformat(),
            'total_scored': len(feedback_scores),
        }
        self._write_json(self.session_file, session_data)
        return True

    def load_session(self):
        """Load the saved session state.

        Returns:
            Dict with ``feedback_scores`` and ``feedback_comments`` (keys
            converted back to int) plus whatever else was stored, or None
            when no session file exists.
        """
        if not self.session_file.exists():
            return None

        with open(self.session_file, 'r', encoding='utf-8') as f:
            session_data = json.load(f)

        # Keys were stringified for JSON; a missing map is treated as empty.
        for field in ('feedback_scores', 'feedback_comments'):
            session_data[field] = {
                int(k): v for k, v in session_data.get(field, {}).items()
            }
        return session_data

    def has_saved_session(self):
        """Return True when both the dataset file and the session file exist."""
        return self.dataset_file.exists() and self.session_file.exists()

    def clear_session(self):
        """Delete the saved dataset, session and metadata files, if any.

        Returns:
            True (also when there was nothing to delete).
        """
        for path in (self.dataset_file, self.session_file, self.metadata_file):
            self._remove_if_present(path)
        return True

    def get_session_info(self):
        """Summarize the saved session.

        Returns:
            Dict with ``dataset_size``, ``total_scored``, ``last_saved``,
            ``scoring_index`` and ``source_info``; None when there is no
            saved session or the saved dataset is empty.
        """
        if not self.has_saved_session():
            return None

        dataset, dataset_metadata = self.load_dataset()
        session_data = self.load_session()
        if not dataset or not session_data:
            return None

        return {
            'dataset_size': len(dataset),
            'total_scored': session_data.get(
                'total_scored', len(session_data['feedback_scores'])
            ),
            'last_saved': session_data.get('last_saved'),
            'scoring_index': session_data.get('scoring_index', 0),
            'source_info': (
                dataset_metadata.get('source_info', {}) if dataset_metadata else {}
            ),
        }