Spaces:
Sleeping
Sleeping
Fix improvement_data field mapping for the real backend: read baseline_val_score, optimized_val_score, and relative_improvement_percent
Browse files
app.py
CHANGED
|
@@ -351,6 +351,10 @@ def safe_optimize(seed_prompt, dataset, model, custom_model="", max_iterations=5
|
|
| 351 |
|
| 352 |
# Run optimization
|
| 353 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
result = quick_optimize_sync(
|
| 355 |
seed_prompt=seed_prompt,
|
| 356 |
dataset=dataset,
|
|
@@ -362,6 +366,22 @@ def safe_optimize(seed_prompt, dataset, model, custom_model="", max_iterations=5
|
|
| 362 |
verbose=True,
|
| 363 |
)
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
# Validate result structure
|
| 366 |
if not result:
|
| 367 |
return False, "Optimization returned no result.", None
|
|
@@ -719,16 +739,43 @@ def run_optimization_flow(seed, dataset, model, custom_model, iter_count, call_c
|
|
| 719 |
improvement_data = result.improvement_data if hasattr(result, 'improvement_data') else {}
|
| 720 |
|
| 721 |
# Convert improvement_data to display format
|
|
|
|
| 722 |
if isinstance(improvement_data, dict):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
improvement_metrics = {
|
| 724 |
-
"baseline_score":
|
| 725 |
-
"final_score":
|
| 726 |
-
"improvement":
|
| 727 |
"iterations_run": result.total_iterations if hasattr(result, 'total_iterations') else improvement_data.get("iterations", 0),
|
| 728 |
"optimization_time": f"{result.optimization_time:.2f}s" if hasattr(result, 'optimization_time') else "N/A",
|
| 729 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
else:
|
| 731 |
improvement_metrics = {}
|
|
|
|
| 732 |
|
| 733 |
# Create iteration history from reflection_history if available
|
| 734 |
iteration_history = []
|
|
|
|
| 351 |
|
| 352 |
# Run optimization
|
| 353 |
try:
|
| 354 |
+
logger.info(f"🚀 Starting optimization with model: {final_model}")
|
| 355 |
+
logger.info(f" Parameters: iterations={max_iterations}, metric_calls={max_metric_calls}, batch={batch_size}, llego={use_llego}")
|
| 356 |
+
logger.info(f" Dataset size: {len(dataset)} examples")
|
| 357 |
+
|
| 358 |
result = quick_optimize_sync(
|
| 359 |
seed_prompt=seed_prompt,
|
| 360 |
dataset=dataset,
|
|
|
|
| 366 |
verbose=True,
|
| 367 |
)
|
| 368 |
|
| 369 |
+
# Log result details for debugging
|
| 370 |
+
logger.info(f"📊 Optimization result received:")
|
| 371 |
+
logger.info(f" Type: {type(result)}")
|
| 372 |
+
logger.info(f" Has prompt: {hasattr(result, 'prompt')}")
|
| 373 |
+
logger.info(f" Has optimized_prompt: {hasattr(result, 'optimized_prompt')}")
|
| 374 |
+
if hasattr(result, 'improvement_data'):
|
| 375 |
+
logger.info(f" improvement_data: {result.improvement_data}")
|
| 376 |
+
if hasattr(result, 'total_iterations'):
|
| 377 |
+
logger.info(f" total_iterations: {result.total_iterations}")
|
| 378 |
+
if hasattr(result, 'optimization_time'):
|
| 379 |
+
logger.info(f" optimization_time: {result.optimization_time}")
|
| 380 |
+
if hasattr(result, 'status'):
|
| 381 |
+
logger.info(f" status: {result.status}")
|
| 382 |
+
if hasattr(result, 'error_message') and result.error_message:
|
| 383 |
+
logger.error(f" error_message: {result.error_message}")
|
| 384 |
+
|
| 385 |
# Validate result structure
|
| 386 |
if not result:
|
| 387 |
return False, "Optimization returned no result.", None
|
|
|
|
| 739 |
improvement_data = result.improvement_data if hasattr(result, 'improvement_data') else {}
|
| 740 |
|
| 741 |
# Convert improvement_data to display format
|
| 742 |
+
# Real backend uses: baseline_val_score, optimized_val_score, relative_improvement_percent
|
| 743 |
if isinstance(improvement_data, dict):
|
| 744 |
+
# Try real backend field names first, then fall back to alternatives (NOTE: the `or` chain also skips falsy values such as 0 or 0.0)
|
| 745 |
+
baseline_score = (
|
| 746 |
+
improvement_data.get("baseline_val_score") or
|
| 747 |
+
improvement_data.get("baseline_score") or
|
| 748 |
+
improvement_data.get("baseline_metrics", {}).get("composite_score", 0.0)
|
| 749 |
+
)
|
| 750 |
+
final_score = (
|
| 751 |
+
improvement_data.get("optimized_val_score") or
|
| 752 |
+
improvement_data.get("final_score") or
|
| 753 |
+
improvement_data.get("final_metrics", {}).get("composite_score", 0.0)
|
| 754 |
+
)
|
| 755 |
+
improvement_percent = (
|
| 756 |
+
improvement_data.get("relative_improvement_percent") or
|
| 757 |
+
improvement_data.get("improvement_percent") or
|
| 758 |
+
"N/A"
|
| 759 |
+
)
|
| 760 |
+
|
| 761 |
+
# Format improvement percent
|
| 762 |
+
if isinstance(improvement_percent, (int, float)):
|
| 763 |
+
improvement_percent = f"+{improvement_percent:.1f}%" if improvement_percent > 0 else f"{improvement_percent:.1f}%"
|
| 764 |
+
|
| 765 |
improvement_metrics = {
|
| 766 |
+
"baseline_score": round(baseline_score, 4) if isinstance(baseline_score, (int, float)) else baseline_score,
|
| 767 |
+
"final_score": round(final_score, 4) if isinstance(final_score, (int, float)) else final_score,
|
| 768 |
+
"improvement": improvement_percent,
|
| 769 |
"iterations_run": result.total_iterations if hasattr(result, 'total_iterations') else improvement_data.get("iterations", 0),
|
| 770 |
"optimization_time": f"{result.optimization_time:.2f}s" if hasattr(result, 'optimization_time') else "N/A",
|
| 771 |
}
|
| 772 |
+
|
| 773 |
+
# Log the improvement data for debugging
|
| 774 |
+
logger.info(f"📊 Improvement data received: {improvement_data}")
|
| 775 |
+
logger.info(f"📊 Formatted metrics: {improvement_metrics}")
|
| 776 |
else:
|
| 777 |
improvement_metrics = {}
|
| 778 |
+
logger.warning(f"⚠️ improvement_data is not a dict: {type(improvement_data)}")
|
| 779 |
|
| 780 |
# Create iteration history from reflection_history if available
|
| 781 |
iteration_history = []
|