update evaluation progress
Browse files
backend/routes/evaluation.py
CHANGED
|
@@ -92,10 +92,15 @@ async def get_evaluation_logs(session_id: str):
|
|
| 92 |
if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
|
| 93 |
results = evaluation_task.results
|
| 94 |
|
|
|
|
|
|
|
|
|
|
| 95 |
return {
|
| 96 |
"logs": logs,
|
| 97 |
"is_completed": is_completed,
|
| 98 |
-
"results": results
|
|
|
|
|
|
|
| 99 |
}
|
| 100 |
|
| 101 |
@router.get("/evaluation-results/{session_id}")
|
|
|
|
| 92 |
if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
|
| 93 |
results = evaluation_task.results
|
| 94 |
|
| 95 |
+
# Récupérer l'information sur les étapes
|
| 96 |
+
progress = evaluation_task.get_progress()
|
| 97 |
+
|
| 98 |
return {
|
| 99 |
"logs": logs,
|
| 100 |
"is_completed": is_completed,
|
| 101 |
+
"results": results,
|
| 102 |
+
"current_step": progress["current_step"],
|
| 103 |
+
"completed_steps": progress["completed_steps"]
|
| 104 |
}
|
| 105 |
|
| 106 |
@router.get("/evaluation-results/{session_id}")
|
backend/tasks/evaluation_task.py
CHANGED
|
@@ -42,21 +42,35 @@ class EvaluationTask:
|
|
| 42 |
self.timeout = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
|
| 43 |
self.current_step = "initializing"
|
| 44 |
self.completed_steps = []
|
|
|
|
| 45 |
|
| 46 |
# Nettoyer les anciens résultats si demandé
|
| 47 |
if clean_old_results:
|
| 48 |
self.clean_old_results()
|
| 49 |
|
| 50 |
-
def update_step(self, step: str) -> None:
    """
    Record *step* as the task's current step and remember that it was reached.

    Args:
        step: Name of the step to record.
    """
    self.current_step = step
    # Keep completed_steps an ordered, duplicate-free history of steps reached.
    already_recorded = step in self.completed_steps
    if not already_recorded:
        self.completed_steps.append(step)
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def get_progress(self) -> Dict:
|
| 62 |
"""
|
|
@@ -270,7 +284,7 @@ TASKS_TABLE = [yourbench]
|
|
| 270 |
]
|
| 271 |
|
| 272 |
# Step 1: Check available providers for each model
|
| 273 |
-
self.update_step("finding_available_model_providers")
|
| 274 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
|
| 275 |
|
| 276 |
model_providers = {}
|
|
@@ -288,11 +302,11 @@ TASKS_TABLE = [yourbench]
|
|
| 288 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Found providers for {len(model_providers)} models")
|
| 289 |
|
| 290 |
# Step 2: Run evaluations in parallel
|
| 291 |
-
self.update_step("starting_evaluation_process")
|
| 292 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation process...")
|
| 293 |
|
| 294 |
# Step 3: Evaluate models
|
| 295 |
-
self.update_step("evaluating_models")
|
| 296 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluating models...")
|
| 297 |
|
| 298 |
tasks = []
|
|
@@ -306,13 +320,13 @@ TASKS_TABLE = [yourbench]
|
|
| 306 |
self.results = [r for r in results if r["status"] == "success"]
|
| 307 |
|
| 308 |
# Step 4: Save results
|
| 309 |
-
self.update_step("storing_evaluation_results")
|
| 310 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Storing evaluation results...")
|
| 311 |
self._save_results_to_hub()
|
| 312 |
|
| 313 |
# Mark task as completed
|
| 314 |
self.is_completed = True
|
| 315 |
-
self.update_step("completed")
|
| 316 |
|
| 317 |
total_time = time.time() - script_start_time
|
| 318 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation completed in {total_time:.2f}s")
|
|
|
|
| 42 |
self.timeout = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
|
| 43 |
self.current_step = "initializing"
|
| 44 |
self.completed_steps = []
|
| 45 |
+
self.step_start_time = time.time() # Enregistrer le temps de début de l'étape actuelle
|
| 46 |
|
| 47 |
# Nettoyer les anciens résultats si demandé
|
| 48 |
if clean_old_results:
|
| 49 |
self.clean_old_results()
|
| 50 |
|
| 51 |
+
async def update_step(self, step: str) -> None:
    """
    Advance the task to *step*, enforcing a minimum of one second per step.

    The transition is delayed so that the previous step stays current for at
    least one second, which keeps the frontend progress messages readable.

    Args:
        step: Name of the step that is now in progress.
    """
    # How long the previous step has been active.
    elapsed = time.time() - self.step_start_time

    # Pad the previous step out to a full second before switching.
    remaining = 1.0 - elapsed
    if remaining > 0:
        await asyncio.sleep(remaining)

    # Record the new step and restart the per-step timer.
    self.current_step = step
    self.step_start_time = time.time()

    # Keep an ordered, duplicate-free history of the steps reached so far.
    if step not in self.completed_steps:
        self.completed_steps.append(step)

    print(f"[{datetime.now().strftime('%H:%M:%S')}] Step changed to: {step}")
|
| 74 |
|
| 75 |
def get_progress(self) -> Dict:
|
| 76 |
"""
|
|
|
|
| 284 |
]
|
| 285 |
|
| 286 |
# Step 1: Check available providers for each model
|
| 287 |
+
await self.update_step("finding_available_model_providers")
|
| 288 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
|
| 289 |
|
| 290 |
model_providers = {}
|
|
|
|
| 302 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Found providers for {len(model_providers)} models")
|
| 303 |
|
| 304 |
# Step 2: Run evaluations in parallel
|
| 305 |
+
await self.update_step("starting_evaluation_process")
|
| 306 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation process...")
|
| 307 |
|
| 308 |
# Step 3: Evaluate models
|
| 309 |
+
await self.update_step("evaluating_models")
|
| 310 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluating models...")
|
| 311 |
|
| 312 |
tasks = []
|
|
|
|
| 320 |
self.results = [r for r in results if r["status"] == "success"]
|
| 321 |
|
| 322 |
# Step 4: Save results
|
| 323 |
+
await self.update_step("storing_evaluation_results")
|
| 324 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Storing evaluation results...")
|
| 325 |
self._save_results_to_hub()
|
| 326 |
|
| 327 |
# Mark task as completed
|
| 328 |
self.is_completed = True
|
| 329 |
+
await self.update_step("completed")
|
| 330 |
|
| 331 |
total_time = time.time() - script_start_time
|
| 332 |
print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation completed in {total_time:.2f}s")
|
frontend/src/components/BenchmarkEvaluation.jsx
CHANGED
|
@@ -49,19 +49,18 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
|
|
| 49 |
|
| 50 |
// Add effect to handle starting messages
|
| 51 |
useEffect(() => {
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
| 54 |
setStartingMessageIndex((prev) => {
|
| 55 |
if (prev < STARTING_MESSAGES.length - 1) {
|
| 56 |
return prev + 1;
|
| 57 |
}
|
| 58 |
return prev;
|
| 59 |
});
|
| 60 |
-
},
|
| 61 |
-
|
| 62 |
-
? MESSAGE_CHANGE_INTERVAL.PRECALCULATED
|
| 63 |
-
: MESSAGE_CHANGE_INTERVAL.DEFAULT
|
| 64 |
-
);
|
| 65 |
|
| 66 |
return () => {
|
| 67 |
if (startingMessageIntervalRef.current) {
|
|
@@ -116,16 +115,24 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
|
|
| 116 |
clearInterval(startingMessageIntervalRef.current);
|
| 117 |
}
|
| 118 |
} else {
|
| 119 |
-
// Si l'évaluation est toujours en cours,
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
}
|
| 130 |
}
|
| 131 |
} catch (error) {
|
|
@@ -191,6 +198,35 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
|
|
| 191 |
].join(":");
|
| 192 |
};
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
// Start benchmark evaluation
|
| 195 |
const startEvaluation = async () => {
|
| 196 |
if (!sessionId) {
|
|
@@ -242,25 +278,25 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
|
|
| 242 |
clearInterval(startingMessageIntervalRef.current);
|
| 243 |
}
|
| 244 |
} else {
|
| 245 |
-
//
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
setStartingMessageIndex(estimatedStepIndex);
|
| 265 |
}
|
| 266 |
}
|
|
|
|
| 49 |
|
| 50 |
// Add effect to handle starting messages
|
| 51 |
useEffect(() => {
|
| 52 |
+
// Ne configurer l'intervalle automatique que pour les documents par défaut
|
| 53 |
+
// Pour les évaluations réelles, on se fiera uniquement aux mises à jour de l'API
|
| 54 |
+
if (isDefault) {
|
| 55 |
+
startingMessageIntervalRef.current = setInterval(() => {
|
| 56 |
setStartingMessageIndex((prev) => {
|
| 57 |
if (prev < STARTING_MESSAGES.length - 1) {
|
| 58 |
return prev + 1;
|
| 59 |
}
|
| 60 |
return prev;
|
| 61 |
});
|
| 62 |
+
}, MESSAGE_CHANGE_INTERVAL.PRECALCULATED);
|
| 63 |
+
}
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
return () => {
|
| 66 |
if (startingMessageIntervalRef.current) {
|
|
|
|
| 115 |
clearInterval(startingMessageIntervalRef.current);
|
| 116 |
}
|
| 117 |
} else {
|
| 118 |
+
// Si l'évaluation est toujours en cours, utiliser l'étape actuelle du backend
|
| 119 |
+
if (logsResult.current_step) {
|
| 120 |
+
// Utiliser la fonction de mappage pour déterminer l'index du message
|
| 121 |
+
const newIndex = mapStepToMessageIndex(
|
| 122 |
+
logsResult.current_step
|
| 123 |
+
);
|
| 124 |
+
setStartingMessageIndex(newIndex);
|
| 125 |
+
} else {
|
| 126 |
+
// Fallback basé sur le temps si l'étape n'est pas disponible
|
| 127 |
+
const progress = Math.min(
|
| 128 |
+
Math.floor(
|
| 129 |
+
(Date.now() - startTimeRef.current) /
|
| 130 |
+
MESSAGE_CHANGE_INTERVAL.DEFAULT
|
| 131 |
+
),
|
| 132 |
+
STARTING_MESSAGES.length - 1
|
| 133 |
+
);
|
| 134 |
+
setStartingMessageIndex(progress);
|
| 135 |
+
}
|
| 136 |
}
|
| 137 |
}
|
| 138 |
} catch (error) {
|
|
|
|
| 198 |
].join(":");
|
| 199 |
};
|
| 200 |
|
| 201 |
+
// Fonction pour mapper le nom de l'étape backend vers l'index dans STARTING_MESSAGES
|
| 202 |
+
// Map a backend step name onto the index of the matching entry in
// STARTING_MESSAGES. Unknown steps fall back to a time-based estimate.
const mapStepToMessageIndex = (currentStep) => {
  const STEP_INDEXES = {
    initializing: 0,
    finding_available_model_providers: 1,
    starting_evaluation_process: 2,
    evaluating_models: 3,
    // Both final backend steps display the last message.
    storing_evaluation_results: 4,
    completed: 4,
  };

  if (currentStep in STEP_INDEXES) {
    return STEP_INDEXES[currentStep];
  }

  // Step not recognized: estimate progress from elapsed time instead,
  // assuming the whole evaluation takes about 80 seconds.
  const elapsedSinceStart = Date.now() - startTimeRef.current;
  const estimatedTotalTime = 80000; // 80 secondes
  const estimatedProgress = Math.min(
    elapsedSinceStart / estimatedTotalTime,
    1
  );
  return Math.min(
    Math.floor(estimatedProgress * STARTING_MESSAGES.length),
    STARTING_MESSAGES.length - 1
  );
};
|
| 229 |
+
|
| 230 |
// Start benchmark evaluation
|
| 231 |
const startEvaluation = async () => {
|
| 232 |
if (!sessionId) {
|
|
|
|
| 278 |
clearInterval(startingMessageIntervalRef.current);
|
| 279 |
}
|
| 280 |
} else {
|
| 281 |
+
// Récupérer l'étape actuelle à partir de l'API, si disponible
|
| 282 |
+
if (logsResult.current_step) {
|
| 283 |
+
// Utiliser la fonction de mappage pour déterminer l'index du message
|
| 284 |
+
const newIndex = mapStepToMessageIndex(
|
| 285 |
+
logsResult.current_step
|
| 286 |
+
);
|
| 287 |
+
setStartingMessageIndex(newIndex);
|
| 288 |
+
} else {
|
| 289 |
+
// Fallback: Si l'API ne renvoie pas d'étape, estimer en fonction du temps
|
| 290 |
+
const elapsedSinceStart = Date.now() - startTimeRef.current;
|
| 291 |
+
const estimatedTotalTime = 80000; // 80 secondes
|
| 292 |
+
const estimatedProgress = Math.min(
|
| 293 |
+
elapsedSinceStart / estimatedTotalTime,
|
| 294 |
+
1
|
| 295 |
+
);
|
| 296 |
+
const estimatedStepIndex = Math.min(
|
| 297 |
+
Math.floor(estimatedProgress * STARTING_MESSAGES.length),
|
| 298 |
+
STARTING_MESSAGES.length - 1
|
| 299 |
+
);
|
| 300 |
setStartingMessageIndex(estimatedStepIndex);
|
| 301 |
}
|
| 302 |
}
|