Spaces:
Sleeping
Sleeping
File size: 5,020 Bytes
1d8c2e0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
"""
Persistence utilities for datasets and scores
"""
import json
from pathlib import Path
from datetime import datetime
class SessionManager:
    """Manage persistence of scoring sessions (dataset, scores, comments)."""

    def __init__(self, data_dir="/app/data"):
        """
        Args:
            data_dir: Directory where dataset and session files are stored.
        """
        self.data_dir = Path(data_dir)
        # parents=True so a nested default like /app/data can be created even
        # when intermediate directories are missing (exist_ok alone raises
        # FileNotFoundError in that case).
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.dataset_file = self.data_dir / "dataset.jsonl"
        self.session_file = self.data_dir / "session.json"

    def save_dataset(self, dataset, source_info=None):
        """
        Save the dataset as JSONL plus a metadata sidecar file.

        Args:
            dataset: List of dicts (one JSON object per line).
            source_info: Dict describing the source (HF name, file, etc.).

        Returns:
            True on success.
        """
        # Write the dataset, one JSON object per line.
        with open(self.dataset_file, 'w', encoding='utf-8') as f:
            for item in dataset:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        # Write the metadata sidecar.
        metadata = {
            'saved_at': datetime.now().isoformat(),
            'total_items': len(dataset),
            'source_info': source_info or {}
        }
        metadata_file = self.data_dir / "dataset_metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        return True

    def load_dataset(self):
        """
        Load the saved dataset.

        Returns:
            Tuple (dataset, metadata), or (None, None) if no dataset
            has been saved.
        """
        if not self.dataset_file.exists():
            return None, None
        # One JSON object per non-blank line.
        with open(self.dataset_file, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f if line.strip()]
        # Metadata sidecar is optional.
        metadata_file = self.data_dir / "dataset_metadata.json"
        metadata = None
        if metadata_file.exists():
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        return dataset, metadata

    def save_session(self, feedback_scores, feedback_comments, scoring_index):
        """
        Save session state (scores, comments, current position).

        Args:
            feedback_scores: Dict {idx: score}.
            feedback_comments: Dict {idx: comment}.
            scoring_index: Current index in the scoring loop.

        Returns:
            True on success.
        """
        # JSON object keys must be strings; load_session converts them back
        # to int.
        session_data = {
            'feedback_scores': {str(k): v for k, v in feedback_scores.items()},
            'feedback_comments': {str(k): v for k, v in feedback_comments.items()},
            'scoring_index': scoring_index,
            'last_saved': datetime.now().isoformat(),
            'total_scored': len(feedback_scores)
        }
        with open(self.session_file, 'w', encoding='utf-8') as f:
            json.dump(session_data, f, indent=2, ensure_ascii=False)
        return True

    def load_session(self):
        """
        Load saved session state.

        Returns:
            Dict with feedback_scores, feedback_comments, scoring_index,
            or None if there is no saved session.
        """
        if not self.session_file.exists():
            return None
        with open(self.session_file, 'r', encoding='utf-8') as f:
            session_data = json.load(f)
        # Convert JSON string keys back to int. Use .get so a partially
        # written or older session file does not raise KeyError.
        session_data['feedback_scores'] = {
            int(k): v for k, v in session_data.get('feedback_scores', {}).items()
        }
        session_data['feedback_comments'] = {
            int(k): v for k, v in session_data.get('feedback_comments', {}).items()
        }
        return session_data

    def has_saved_session(self):
        """Return True if both a dataset and a session file exist."""
        return self.dataset_file.exists() and self.session_file.exists()

    def clear_session(self):
        """Delete the saved dataset, session and metadata files.

        Returns:
            True on success (missing files are simply skipped).
        """
        if self.dataset_file.exists():
            self.dataset_file.unlink()
        if self.session_file.exists():
            self.session_file.unlink()
        metadata_file = self.data_dir / "dataset_metadata.json"
        if metadata_file.exists():
            metadata_file.unlink()
        return True

    def get_session_info(self):
        """
        Summarize the saved session.

        Returns:
            Dict with dataset_size, total_scored, last_saved, scoring_index
            and source_info, or None if no complete session is saved.
        """
        if not self.has_saved_session():
            return None
        dataset, dataset_metadata = self.load_dataset()
        session_data = self.load_session()
        if dataset and session_data:
            # .get with defaults tolerates session files written before
            # these fields existed.
            return {
                'dataset_size': len(dataset),
                'total_scored': session_data.get('total_scored', 0),
                'last_saved': session_data.get('last_saved'),
                'scoring_index': session_data.get('scoring_index', 0),
                'source_info': dataset_metadata.get('source_info', {}) if dataset_metadata else {}
            }
        return None
|