Spaces:
Sleeping
Sleeping
zhenwu0831 committed on
Commit ·
3bc94bc
1
Parent(s): 6e3ec67
v25
Browse files- app.py +20 -20
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -43,6 +43,17 @@ from openai import OpenAI
|
|
| 43 |
from huggingface_hub import HfApi, hf_hub_download
|
| 44 |
from huggingface_hub.utils import HfHubHTTPError
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# =========================
|
| 48 |
# Config
|
|
@@ -86,11 +97,13 @@ def _clamp01(x: float) -> float:
|
|
| 86 |
|
| 87 |
|
| 88 |
def normalize_text(s: str) -> str:
|
| 89 |
-
"""SQuAD-style normalization."""
|
| 90 |
s = str(s).lower()
|
| 91 |
s = "".join(ch for ch in s if ch not in string.punctuation)
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
return s
|
| 95 |
|
| 96 |
|
|
@@ -251,7 +264,6 @@ def get_leaderboard_display() -> pd.DataFrame:
|
|
| 251 |
"AndrewID",
|
| 252 |
"Attempts Used",
|
| 253 |
"Total (%)",
|
| 254 |
-
"EM (%)",
|
| 255 |
"F1 (%)",
|
| 256 |
"Recall (%)",
|
| 257 |
"ROUGE (%)",
|
|
@@ -267,7 +279,6 @@ def get_leaderboard_display() -> pd.DataFrame:
|
|
| 267 |
"AndrewID": andrewid,
|
| 268 |
"Attempts Used": int(lb.get("attempts", {}).get(andrewid, 0)),
|
| 269 |
"Total (%)": f"{float(e.get('total_score', 0.0)) * 100:.2f}%",
|
| 270 |
-
"EM (%)": f"{float(e.get('em', 0.0)) * 100:.2f}%",
|
| 271 |
"F1 (%)": f"{float(e.get('f1', 0.0)) * 100:.2f}%",
|
| 272 |
"Recall (%)": f"{float(e.get('recall', 0.0)) * 100:.2f}%",
|
| 273 |
"ROUGE (%)": f"{float(e.get('rouge_avg', 0.0)) * 100:.2f}%",
|
|
@@ -291,7 +302,6 @@ def get_leaderboard_display() -> pd.DataFrame:
|
|
| 291 |
return (
|
| 292 |
pct(r.get("Total (%)", "0")),
|
| 293 |
pct(r.get("F1 (%)", "0")),
|
| 294 |
-
pct(r.get("EM (%)", "0")),
|
| 295 |
pct(r.get("Recall (%)", "0")),
|
| 296 |
judge,
|
| 297 |
)
|
|
@@ -388,14 +398,12 @@ def openai_judge(question: str, answer: str) -> Optional[int]:
|
|
| 388 |
# =========================
|
| 389 |
|
| 390 |
def compute_total_score(
|
| 391 |
-
em: float,
|
| 392 |
f1: float,
|
| 393 |
recall: float,
|
| 394 |
rouge_avg: float,
|
| 395 |
judge_score: Optional[float],
|
| 396 |
) -> float:
|
| 397 |
parts: List[float] = []
|
| 398 |
-
parts.append(_clamp01(em))
|
| 399 |
parts.append(_clamp01(f1))
|
| 400 |
parts.append(_clamp01(recall))
|
| 401 |
parts.append(_clamp01(rouge_avg))
|
|
@@ -456,7 +464,7 @@ def process_submission(file):
|
|
| 456 |
return error_msg, get_leaderboard_display()
|
| 457 |
|
| 458 |
attempted = 0
|
| 459 |
-
|
| 460 |
rouge1_sum = rouge2_sum = rougeL_sum = 0.0
|
| 461 |
judge_sum = 0
|
| 462 |
judge_n = 0
|
|
@@ -471,13 +479,11 @@ def process_submission(file):
|
|
| 471 |
gold = gold_map[qid]["gold_answer"]
|
| 472 |
question = gold_map[qid]["question"]
|
| 473 |
|
| 474 |
-
em = exact_match(pred, gold)
|
| 475 |
f1 = token_f1(pred, gold)
|
| 476 |
rec = answer_recall(pred, gold)
|
| 477 |
rouge = compute_rouge(pred, gold)
|
| 478 |
judge = openai_judge(question, pred)
|
| 479 |
|
| 480 |
-
em_sum += em
|
| 481 |
f1_sum += f1
|
| 482 |
rec_sum += rec
|
| 483 |
rouge1_sum += rouge["rouge1"]
|
|
@@ -490,7 +496,6 @@ def process_submission(file):
|
|
| 490 |
|
| 491 |
denom = attempted if attempted > 0 else 1
|
| 492 |
|
| 493 |
-
avg_em = em_sum / denom
|
| 494 |
avg_f1 = f1_sum / denom
|
| 495 |
avg_rec = rec_sum / denom
|
| 496 |
avg_rouge1 = rouge1_sum / denom
|
|
@@ -501,7 +506,6 @@ def process_submission(file):
|
|
| 501 |
avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
|
| 502 |
|
| 503 |
total_score = compute_total_score(
|
| 504 |
-
em=avg_em,
|
| 505 |
f1=avg_f1,
|
| 506 |
recall=avg_rec,
|
| 507 |
rouge_avg=avg_rouge,
|
|
@@ -512,7 +516,6 @@ def process_submission(file):
|
|
| 512 |
"andrewid": andrewid,
|
| 513 |
"attempt": used + 1,
|
| 514 |
"timestamp": _now_iso(),
|
| 515 |
-
"em": round(avg_em, 6),
|
| 516 |
"f1": round(avg_f1, 6),
|
| 517 |
"recall": round(avg_rec, 6),
|
| 518 |
"rouge1": round(avg_rouge1, 6),
|
|
@@ -540,7 +543,6 @@ def process_submission(file):
|
|
| 540 |
lines = [
|
| 541 |
f"✅ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
|
| 542 |
f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
|
| 543 |
-
f"EM: {avg_em:.4f} ({avg_em * 100:.2f}%)",
|
| 544 |
f"F1: {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
|
| 545 |
f"Recall: {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
|
| 546 |
f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
|
|
@@ -573,7 +575,7 @@ with gr.Blocks(title="Leaderboard QA Judge", theme=gr.themes.Soft()) as app:
|
|
| 573 |
# π Assignment 2 Public Leaderboard
|
| 574 |
|
| 575 |
We compute multiple metrics:
|
| 576 |
-
- **Standard metrics:** Answer Recall,
|
| 577 |
- **LLM-as-judge:** rubric-based score (1–5)
|
| 578 |
|
| 579 |
**Total score** is the uniform mean of the available normalized metrics (0–1).
|
|
@@ -590,18 +592,16 @@ We compute multiple metrics:
|
|
| 590 |
```
|
| 591 |
|
| 592 |
**Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
|
| 593 |
-
|
| 594 |
-
**Please don't refresh the page during evaluation, it may take some time for scoring.**
|
| 595 |
"""
|
| 596 |
)
|
| 597 |
|
| 598 |
with gr.Tabs():
|
| 599 |
with gr.Tab("📤 Submit"):
|
| 600 |
-
file_input = gr.File(label="Upload submission
|
| 601 |
submit_btn = gr.Button("π Submit & Evaluate", variant="primary")
|
| 602 |
status = gr.Textbox(label="Result", lines=10, interactive=False)
|
| 603 |
|
| 604 |
-
gr.Markdown("### Sample submission")
|
| 605 |
sample = gr.Textbox(value=sample_submission_text(), lines=6)
|
| 606 |
|
| 607 |
with gr.Tab("🏅 Leaderboard"):
|
|
|
|
| 43 |
from huggingface_hub import HfApi, hf_hub_download
|
| 44 |
from huggingface_hub.utils import HfHubHTTPError
|
| 45 |
|
| 46 |
+
import nltk
|
| 47 |
+
from nltk.corpus import stopwords
|
| 48 |
+
|
| 49 |
+
# Download stopwords if not already present
|
| 50 |
+
try:
|
| 51 |
+
nltk.data.find('corpora/stopwords')
|
| 52 |
+
except LookupError:
|
| 53 |
+
nltk.download('stopwords', quiet=True)
|
| 54 |
+
|
| 55 |
+
STOP_WORDS = set(stopwords.words('english'))
|
| 56 |
+
|
| 57 |
|
| 58 |
# =========================
|
| 59 |
# Config
|
|
|
|
| 97 |
|
| 98 |
|
| 99 |
def normalize_text(s: str) -> str:
|
| 100 |
+
"""SQuAD-style normalization with NLTK stop words."""
|
| 101 |
s = str(s).lower()
|
| 102 |
s = "".join(ch for ch in s if ch not in string.punctuation)
|
| 103 |
+
# Remove NLTK English stop words
|
| 104 |
+
tokens = s.split()
|
| 105 |
+
tokens = [t for t in tokens if t not in STOP_WORDS]
|
| 106 |
+
s = " ".join(tokens)
|
| 107 |
return s
|
| 108 |
|
| 109 |
|
|
|
|
| 264 |
"AndrewID",
|
| 265 |
"Attempts Used",
|
| 266 |
"Total (%)",
|
|
|
|
| 267 |
"F1 (%)",
|
| 268 |
"Recall (%)",
|
| 269 |
"ROUGE (%)",
|
|
|
|
| 279 |
"AndrewID": andrewid,
|
| 280 |
"Attempts Used": int(lb.get("attempts", {}).get(andrewid, 0)),
|
| 281 |
"Total (%)": f"{float(e.get('total_score', 0.0)) * 100:.2f}%",
|
|
|
|
| 282 |
"F1 (%)": f"{float(e.get('f1', 0.0)) * 100:.2f}%",
|
| 283 |
"Recall (%)": f"{float(e.get('recall', 0.0)) * 100:.2f}%",
|
| 284 |
"ROUGE (%)": f"{float(e.get('rouge_avg', 0.0)) * 100:.2f}%",
|
|
|
|
| 302 |
return (
|
| 303 |
pct(r.get("Total (%)", "0")),
|
| 304 |
pct(r.get("F1 (%)", "0")),
|
|
|
|
| 305 |
pct(r.get("Recall (%)", "0")),
|
| 306 |
judge,
|
| 307 |
)
|
|
|
|
| 398 |
# =========================
|
| 399 |
|
| 400 |
def compute_total_score(
|
|
|
|
| 401 |
f1: float,
|
| 402 |
recall: float,
|
| 403 |
rouge_avg: float,
|
| 404 |
judge_score: Optional[float],
|
| 405 |
) -> float:
|
| 406 |
parts: List[float] = []
|
|
|
|
| 407 |
parts.append(_clamp01(f1))
|
| 408 |
parts.append(_clamp01(recall))
|
| 409 |
parts.append(_clamp01(rouge_avg))
|
|
|
|
| 464 |
return error_msg, get_leaderboard_display()
|
| 465 |
|
| 466 |
attempted = 0
|
| 467 |
+
f1_sum = rec_sum = 0.0
|
| 468 |
rouge1_sum = rouge2_sum = rougeL_sum = 0.0
|
| 469 |
judge_sum = 0
|
| 470 |
judge_n = 0
|
|
|
|
| 479 |
gold = gold_map[qid]["gold_answer"]
|
| 480 |
question = gold_map[qid]["question"]
|
| 481 |
|
|
|
|
| 482 |
f1 = token_f1(pred, gold)
|
| 483 |
rec = answer_recall(pred, gold)
|
| 484 |
rouge = compute_rouge(pred, gold)
|
| 485 |
judge = openai_judge(question, pred)
|
| 486 |
|
|
|
|
| 487 |
f1_sum += f1
|
| 488 |
rec_sum += rec
|
| 489 |
rouge1_sum += rouge["rouge1"]
|
|
|
|
| 496 |
|
| 497 |
denom = attempted if attempted > 0 else 1
|
| 498 |
|
|
|
|
| 499 |
avg_f1 = f1_sum / denom
|
| 500 |
avg_rec = rec_sum / denom
|
| 501 |
avg_rouge1 = rouge1_sum / denom
|
|
|
|
| 506 |
avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
|
| 507 |
|
| 508 |
total_score = compute_total_score(
|
|
|
|
| 509 |
f1=avg_f1,
|
| 510 |
recall=avg_rec,
|
| 511 |
rouge_avg=avg_rouge,
|
|
|
|
| 516 |
"andrewid": andrewid,
|
| 517 |
"attempt": used + 1,
|
| 518 |
"timestamp": _now_iso(),
|
|
|
|
| 519 |
"f1": round(avg_f1, 6),
|
| 520 |
"recall": round(avg_rec, 6),
|
| 521 |
"rouge1": round(avg_rouge1, 6),
|
|
|
|
| 543 |
lines = [
|
| 544 |
f"✅ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
|
| 545 |
f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
|
|
|
|
| 546 |
f"F1: {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
|
| 547 |
f"Recall: {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
|
| 548 |
f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
|
|
|
|
| 575 |
# π Assignment 2 Public Leaderboard
|
| 576 |
|
| 577 |
We compute multiple metrics:
|
| 578 |
+
- **Standard metrics:** Answer Recall, F1 (token-level), and ROUGE-1/2/L (reported as an average)
|
| 579 |
- **LLM-as-judge:** rubric-based score (1–5)
|
| 580 |
|
| 581 |
**Total score** is the uniform mean of the available normalized metrics (0–1).
|
|
|
|
| 592 |
```
|
| 593 |
|
| 594 |
**Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
|
|
|
|
|
|
|
| 595 |
"""
|
| 596 |
)
|
| 597 |
|
| 598 |
with gr.Tabs():
|
| 599 |
with gr.Tab("📤 Submit"):
|
| 600 |
+
file_input = gr.File(label="Upload submission.json", file_types=[".json"])
|
| 601 |
submit_btn = gr.Button("π Submit & Evaluate", variant="primary")
|
| 602 |
status = gr.Textbox(label="Result", lines=10, interactive=False)
|
| 603 |
|
| 604 |
+
gr.Markdown("### Sample submission.json")
|
| 605 |
sample = gr.Textbox(value=sample_submission_text(), lines=6)
|
| 606 |
|
| 607 |
with gr.Tab("🏅 Leaderboard"):
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
openai==1.109.1
|
| 2 |
-
rouge_score==0.1.2
|
|
|
|
|
|
| 1 |
openai==1.109.1
|
| 2 |
+
rouge_score==0.1.2
|
| 3 |
+
nltk==3.9.1
|