Spaces:
Sleeping
Sleeping
zhenwu0831 committed on
Commit ·
85a9e79
1
Parent(s): fd3a49e
v21
Browse files
app.py
CHANGED
|
@@ -174,7 +174,7 @@ def load_gold_map() -> Dict[str, Dict[str, Any]]:
|
|
| 174 |
if not qid:
|
| 175 |
continue
|
| 176 |
gold[qid] = {
|
| 177 |
-
"question": str(ex.get("question", ex.get("query", ""))).strip(),
|
| 178 |
"gold_answer": str(ex.get("answer", ex.get("gold", ex.get("reference", "")))).strip(),
|
| 179 |
}
|
| 180 |
|
|
@@ -475,11 +475,7 @@ def process_submission(file):
|
|
| 475 |
f1 = token_f1(pred, gold)
|
| 476 |
rec = answer_recall(pred, gold)
|
| 477 |
rouge = compute_rouge(pred, gold)
|
| 478 |
-
|
| 479 |
-
# Only use LLM judge if question is provided
|
| 480 |
-
judge = None
|
| 481 |
-
if question: # Skip judge if question is empty
|
| 482 |
-
judge = openai_judge(question, pred)
|
| 483 |
|
| 484 |
em_sum += em
|
| 485 |
f1_sum += f1
|
|
@@ -491,6 +487,7 @@ def process_submission(file):
|
|
| 491 |
if judge is not None:
|
| 492 |
judge_sum += int(judge)
|
| 493 |
judge_n += 1
|
|
|
|
| 494 |
|
| 495 |
denom = attempted if attempted > 0 else 1
|
| 496 |
|
|
@@ -550,7 +547,10 @@ def process_submission(file):
|
|
| 550 |
f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
|
| 551 |
]
|
| 552 |
if avg_judge is None:
|
| 553 |
-
|
|
|
|
|
|
|
|
|
|
| 554 |
else:
|
| 555 |
lines.append(f"LLM judge: {avg_judge:.3f} (1-5)")
|
| 556 |
|
|
@@ -596,11 +596,11 @@ We compute multiple metrics:
|
|
| 596 |
|
| 597 |
with gr.Tabs():
|
| 598 |
with gr.Tab("📤 Submit"):
|
| 599 |
-
file_input = gr.File(label="Upload submission
|
| 600 |
submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
|
| 601 |
status = gr.Textbox(label="Result", lines=10, interactive=False)
|
| 602 |
|
| 603 |
-
gr.Markdown("### Sample submission")
|
| 604 |
sample = gr.Textbox(value=sample_submission_text(), lines=6)
|
| 605 |
|
| 606 |
with gr.Tab("🏅 Leaderboard"):
|
|
|
|
| 174 |
if not qid:
|
| 175 |
continue
|
| 176 |
gold[qid] = {
|
| 177 |
+
"question": str(ex.get("question", ex.get("query", ""))).strip(),
|
| 178 |
"gold_answer": str(ex.get("answer", ex.get("gold", ex.get("reference", "")))).strip(),
|
| 179 |
}
|
| 180 |
|
|
|
|
| 475 |
f1 = token_f1(pred, gold)
|
| 476 |
rec = answer_recall(pred, gold)
|
| 477 |
rouge = compute_rouge(pred, gold)
|
| 478 |
+
judge = openai_judge(question, pred)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
|
| 480 |
em_sum += em
|
| 481 |
f1_sum += f1
|
|
|
|
| 487 |
if judge is not None:
|
| 488 |
judge_sum += int(judge)
|
| 489 |
judge_n += 1
|
| 490 |
+
judge_n += 1
|
| 491 |
|
| 492 |
denom = attempted if attempted > 0 else 1
|
| 493 |
|
|
|
|
| 547 |
f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
|
| 548 |
]
|
| 549 |
if avg_judge is None:
|
| 550 |
+
if not OPENAI_API_KEY:
|
| 551 |
+
lines.append("LLM judge: NA (set OPENAI_API to enable)")
|
| 552 |
+
else:
|
| 553 |
+
lines.append("LLM judge: NA (questions not available in gold dataset)")
|
| 554 |
else:
|
| 555 |
lines.append(f"LLM judge: {avg_judge:.3f} (1-5)")
|
| 556 |
|
|
|
|
| 596 |
|
| 597 |
with gr.Tabs():
|
| 598 |
with gr.Tab("📤 Submit"):
|
| 599 |
+
file_input = gr.File(label="Upload submission.json", file_types=[".json"])
|
| 600 |
submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
|
| 601 |
status = gr.Textbox(label="Result", lines=10, interactive=False)
|
| 602 |
|
| 603 |
+
gr.Markdown("### Sample submission.json")
|
| 604 |
sample = gr.Textbox(value=sample_submission_text(), lines=6)
|
| 605 |
|
| 606 |
with gr.Tab("🏅 Leaderboard"):
|