Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -84,7 +84,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
| 84 |
# Load model
|
| 85 |
qa_pipeline, hf_token = evaluate_model()
|
| 86 |
if qa_pipeline is None:
|
| 87 |
-
return "❌ Failed to load model",
|
| 88 |
|
| 89 |
progress(0.1, desc="Loading CUAD dataset...")
|
| 90 |
|
|
@@ -97,7 +97,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
| 97 |
dataset = load_dataset("theatticusproject/cuad", trust_remote_code=True, token=hf_token)
|
| 98 |
test_data = dataset["test"]
|
| 99 |
except Exception as e2:
|
| 100 |
-
return f"❌ Error loading dataset: {e2}",
|
| 101 |
|
| 102 |
# Limit samples
|
| 103 |
num_samples = min(num_samples, len(test_data))
|
|
@@ -147,11 +147,15 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
| 147 |
})
|
| 148 |
|
| 149 |
except Exception as e:
|
|
|
|
| 150 |
continue
|
| 151 |
|
| 152 |
progress(0.9, desc="Calculating final metrics...")
|
| 153 |
|
| 154 |
# Calculate final metrics
|
|
|
|
|
|
|
|
|
|
| 155 |
avg_exact_match = np.mean(exact_matches) * 100
|
| 156 |
avg_f1_score = np.mean(f1_scores) * 100
|
| 157 |
|
|
@@ -178,7 +182,7 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
| 178 |
# Create detailed results DataFrame
|
| 179 |
df = pd.DataFrame(predictions)
|
| 180 |
|
| 181 |
-
# Save results
|
| 182 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 183 |
results_file = f"cuad_evaluation_results_{timestamp}.json"
|
| 184 |
|
|
@@ -192,8 +196,13 @@ def run_evaluation(num_samples, progress=gr.Progress()):
|
|
| 192 |
"predictions": predictions
|
| 193 |
}
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
progress(1.0, desc="✅ Evaluation completed!")
|
| 199 |
|
|
@@ -256,7 +265,6 @@ def create_gradio_interface():
|
|
| 256 |
|
| 257 |
with gr.Row():
|
| 258 |
detailed_results = gr.Dataframe(
|
| 259 |
-
headers=["Sample_ID", "Question", "Predicted_Answer", "Ground_Truth", "Exact_Match", "F1_Score", "Confidence"],
|
| 260 |
label="Sample-by-Sample Results",
|
| 261 |
interactive=False,
|
| 262 |
wrap=True
|
|
@@ -269,14 +277,18 @@ def create_gradio_interface():
|
|
| 269 |
)
|
| 270 |
|
| 271 |
# Event handlers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
evaluate_btn.click(
|
| 273 |
-
fn=
|
| 274 |
inputs=[num_samples],
|
| 275 |
outputs=[results_summary, detailed_results, download_file],
|
| 276 |
show_progress=True
|
| 277 |
-
).then(
|
| 278 |
-
lambda: gr.update(visible=True),
|
| 279 |
-
outputs=[download_file]
|
| 280 |
)
|
| 281 |
|
| 282 |
# Footer
|
|
|
|
| 84 |
# Load model
|
| 85 |
qa_pipeline, hf_token = evaluate_model()
|
| 86 |
if qa_pipeline is None:
|
| 87 |
+
return "❌ Failed to load model", pd.DataFrame(), None
|
| 88 |
|
| 89 |
progress(0.1, desc="Loading CUAD dataset...")
|
| 90 |
|
|
|
|
| 97 |
dataset = load_dataset("theatticusproject/cuad", trust_remote_code=True, token=hf_token)
|
| 98 |
test_data = dataset["test"]
|
| 99 |
except Exception as e2:
|
| 100 |
+
return f"❌ Error loading dataset: {e2}", pd.DataFrame(), None
|
| 101 |
|
| 102 |
# Limit samples
|
| 103 |
num_samples = min(num_samples, len(test_data))
|
|
|
|
| 147 |
})
|
| 148 |
|
| 149 |
except Exception as e:
|
| 150 |
+
print(f"Error processing sample {i}: {e}")
|
| 151 |
continue
|
| 152 |
|
| 153 |
progress(0.9, desc="Calculating final metrics...")
|
| 154 |
|
| 155 |
# Calculate final metrics
|
| 156 |
+
if len(exact_matches) == 0:
|
| 157 |
+
return "❌ No samples were successfully processed", pd.DataFrame(), None
|
| 158 |
+
|
| 159 |
avg_exact_match = np.mean(exact_matches) * 100
|
| 160 |
avg_f1_score = np.mean(f1_scores) * 100
|
| 161 |
|
|
|
|
| 182 |
# Create detailed results DataFrame
|
| 183 |
df = pd.DataFrame(predictions)
|
| 184 |
|
| 185 |
+
# Save results to file
|
| 186 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 187 |
results_file = f"cuad_evaluation_results_{timestamp}.json"
|
| 188 |
|
|
|
|
| 196 |
"predictions": predictions
|
| 197 |
}
|
| 198 |
|
| 199 |
+
try:
|
| 200 |
+
with open(results_file, "w") as f:
|
| 201 |
+
json.dump(detailed_results, f, indent=2)
|
| 202 |
+
print(f"✓ Results saved to {results_file}")
|
| 203 |
+
except Exception as e:
|
| 204 |
+
print(f"⚠ Warning: Could not save results file: {e}")
|
| 205 |
+
results_file = None
|
| 206 |
|
| 207 |
progress(1.0, desc="✅ Evaluation completed!")
|
| 208 |
|
|
|
|
| 265 |
|
| 266 |
with gr.Row():
|
| 267 |
detailed_results = gr.Dataframe(
|
|
|
|
| 268 |
label="Sample-by-Sample Results",
|
| 269 |
interactive=False,
|
| 270 |
wrap=True
|
|
|
|
| 277 |
)
|
| 278 |
|
| 279 |
# Event handlers
|
| 280 |
+
def handle_evaluation(num_samples):
|
| 281 |
+
summary, df, file_path = run_evaluation(num_samples)
|
| 282 |
+
if file_path and os.path.exists(file_path):
|
| 283 |
+
return summary, df, gr.update(visible=True, value=file_path)
|
| 284 |
+
else:
|
| 285 |
+
return summary, df, gr.update(visible=False)
|
| 286 |
+
|
| 287 |
evaluate_btn.click(
|
| 288 |
+
fn=handle_evaluation,
|
| 289 |
inputs=[num_samples],
|
| 290 |
outputs=[results_summary, detailed_results, download_file],
|
| 291 |
show_progress=True
|
|
|
|
|
|
|
|
|
|
| 292 |
)
|
| 293 |
|
| 294 |
# Footer
|