zhenwu0831 committed on
Commit
3bc94bc
·
1 Parent(s): 6e3ec67
Files changed (2) hide show
  1. app.py +20 -20
  2. requirements.txt +2 -1
app.py CHANGED
@@ -43,6 +43,17 @@ from openai import OpenAI
43
  from huggingface_hub import HfApi, hf_hub_download
44
  from huggingface_hub.utils import HfHubHTTPError
45
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # =========================
48
  # Config
@@ -86,11 +97,13 @@ def _clamp01(x: float) -> float:
86
 
87
 
88
  def normalize_text(s: str) -> str:
89
- """SQuAD-style normalization."""
90
  s = str(s).lower()
91
  s = "".join(ch for ch in s if ch not in string.punctuation)
92
- s = re.sub(r"\b(a|an|the)\b", " ", s)
93
- s = " ".join(s.split())
 
 
94
  return s
95
 
96
 
@@ -251,7 +264,6 @@ def get_leaderboard_display() -> pd.DataFrame:
251
  "AndrewID",
252
  "Attempts Used",
253
  "Total (%)",
254
- "EM (%)",
255
  "F1 (%)",
256
  "Recall (%)",
257
  "ROUGE (%)",
@@ -267,7 +279,6 @@ def get_leaderboard_display() -> pd.DataFrame:
267
  "AndrewID": andrewid,
268
  "Attempts Used": int(lb.get("attempts", {}).get(andrewid, 0)),
269
  "Total (%)": f"{float(e.get('total_score', 0.0)) * 100:.2f}%",
270
- "EM (%)": f"{float(e.get('em', 0.0)) * 100:.2f}%",
271
  "F1 (%)": f"{float(e.get('f1', 0.0)) * 100:.2f}%",
272
  "Recall (%)": f"{float(e.get('recall', 0.0)) * 100:.2f}%",
273
  "ROUGE (%)": f"{float(e.get('rouge_avg', 0.0)) * 100:.2f}%",
@@ -291,7 +302,6 @@ def get_leaderboard_display() -> pd.DataFrame:
291
  return (
292
  pct(r.get("Total (%)", "0")),
293
  pct(r.get("F1 (%)", "0")),
294
- pct(r.get("EM (%)", "0")),
295
  pct(r.get("Recall (%)", "0")),
296
  judge,
297
  )
@@ -388,14 +398,12 @@ def openai_judge(question: str, answer: str) -> Optional[int]:
388
  # =========================
389
 
390
  def compute_total_score(
391
- em: float,
392
  f1: float,
393
  recall: float,
394
  rouge_avg: float,
395
  judge_score: Optional[float],
396
  ) -> float:
397
  parts: List[float] = []
398
- parts.append(_clamp01(em))
399
  parts.append(_clamp01(f1))
400
  parts.append(_clamp01(recall))
401
  parts.append(_clamp01(rouge_avg))
@@ -456,7 +464,7 @@ def process_submission(file):
456
  return error_msg, get_leaderboard_display()
457
 
458
  attempted = 0
459
- em_sum = f1_sum = rec_sum = 0.0
460
  rouge1_sum = rouge2_sum = rougeL_sum = 0.0
461
  judge_sum = 0
462
  judge_n = 0
@@ -471,13 +479,11 @@ def process_submission(file):
471
  gold = gold_map[qid]["gold_answer"]
472
  question = gold_map[qid]["question"]
473
 
474
- em = exact_match(pred, gold)
475
  f1 = token_f1(pred, gold)
476
  rec = answer_recall(pred, gold)
477
  rouge = compute_rouge(pred, gold)
478
  judge = openai_judge(question, pred)
479
 
480
- em_sum += em
481
  f1_sum += f1
482
  rec_sum += rec
483
  rouge1_sum += rouge["rouge1"]
@@ -490,7 +496,6 @@ def process_submission(file):
490
 
491
  denom = attempted if attempted > 0 else 1
492
 
493
- avg_em = em_sum / denom
494
  avg_f1 = f1_sum / denom
495
  avg_rec = rec_sum / denom
496
  avg_rouge1 = rouge1_sum / denom
@@ -501,7 +506,6 @@ def process_submission(file):
501
  avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
502
 
503
  total_score = compute_total_score(
504
- em=avg_em,
505
  f1=avg_f1,
506
  recall=avg_rec,
507
  rouge_avg=avg_rouge,
@@ -512,7 +516,6 @@ def process_submission(file):
512
  "andrewid": andrewid,
513
  "attempt": used + 1,
514
  "timestamp": _now_iso(),
515
- "em": round(avg_em, 6),
516
  "f1": round(avg_f1, 6),
517
  "recall": round(avg_rec, 6),
518
  "rouge1": round(avg_rouge1, 6),
@@ -540,7 +543,6 @@ def process_submission(file):
540
  lines = [
541
  f"✅ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
542
  f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
543
- f"EM: {avg_em:.4f} ({avg_em * 100:.2f}%)",
544
  f"F1: {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
545
  f"Recall: {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
546
  f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
@@ -573,7 +575,7 @@ with gr.Blocks(title="Leaderboard QA Judge", theme=gr.themes.Soft()) as app:
573
  # πŸ† Assignment 2 Public Leaderboard
574
 
575
  We compute multiple metrics:
576
- - **Standard metrics:** Answer Recall, Exact Match (EM), F1, and ROUGE-1/2/L (reported as an average)
577
  - **LLM-as-judge:** rubric-based score (1–5)
578
 
579
  **Total score** is the uniform mean of the available normalized metrics (0–1).
@@ -590,18 +592,16 @@ We compute multiple metrics:
590
  ```
591
 
592
  **Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
593
-
594
- **Please don't refresh the page during evaluation, it may take some time for scoring.**
595
  """
596
  )
597
 
598
  with gr.Tabs():
599
  with gr.Tab("📤 Submit"):
600
- file_input = gr.File(label="Upload submission in json", file_types=[".json"])
601
  submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
602
  status = gr.Textbox(label="Result", lines=10, interactive=False)
603
 
604
- gr.Markdown("### Sample submission")
605
  sample = gr.Textbox(value=sample_submission_text(), lines=6)
606
 
607
  with gr.Tab("🏅 Leaderboard"):
 
43
  from huggingface_hub import HfApi, hf_hub_download
44
  from huggingface_hub.utils import HfHubHTTPError
45
 
46
+ import nltk
47
+ from nltk.corpus import stopwords
48
+
49
+ # Download stopwords if not already present
50
+ try:
51
+ nltk.data.find('corpora/stopwords')
52
+ except LookupError:
53
+ nltk.download('stopwords', quiet=True)
54
+
55
+ STOP_WORDS = set(stopwords.words('english'))
56
+
57
 
58
  # =========================
59
  # Config
 
97
 
98
 
99
  def normalize_text(s: str) -> str:
100
+ """SQuAD-style normalization with NLTK stop words."""
101
  s = str(s).lower()
102
  s = "".join(ch for ch in s if ch not in string.punctuation)
103
+ # Remove NLTK English stop words
104
+ tokens = s.split()
105
+ tokens = [t for t in tokens if t not in STOP_WORDS]
106
+ s = " ".join(tokens)
107
  return s
108
 
109
 
 
264
  "AndrewID",
265
  "Attempts Used",
266
  "Total (%)",
 
267
  "F1 (%)",
268
  "Recall (%)",
269
  "ROUGE (%)",
 
279
  "AndrewID": andrewid,
280
  "Attempts Used": int(lb.get("attempts", {}).get(andrewid, 0)),
281
  "Total (%)": f"{float(e.get('total_score', 0.0)) * 100:.2f}%",
 
282
  "F1 (%)": f"{float(e.get('f1', 0.0)) * 100:.2f}%",
283
  "Recall (%)": f"{float(e.get('recall', 0.0)) * 100:.2f}%",
284
  "ROUGE (%)": f"{float(e.get('rouge_avg', 0.0)) * 100:.2f}%",
 
302
  return (
303
  pct(r.get("Total (%)", "0")),
304
  pct(r.get("F1 (%)", "0")),
 
305
  pct(r.get("Recall (%)", "0")),
306
  judge,
307
  )
 
398
  # =========================
399
 
400
  def compute_total_score(
 
401
  f1: float,
402
  recall: float,
403
  rouge_avg: float,
404
  judge_score: Optional[float],
405
  ) -> float:
406
  parts: List[float] = []
 
407
  parts.append(_clamp01(f1))
408
  parts.append(_clamp01(recall))
409
  parts.append(_clamp01(rouge_avg))
 
464
  return error_msg, get_leaderboard_display()
465
 
466
  attempted = 0
467
+ f1_sum = rec_sum = 0.0
468
  rouge1_sum = rouge2_sum = rougeL_sum = 0.0
469
  judge_sum = 0
470
  judge_n = 0
 
479
  gold = gold_map[qid]["gold_answer"]
480
  question = gold_map[qid]["question"]
481
 
 
482
  f1 = token_f1(pred, gold)
483
  rec = answer_recall(pred, gold)
484
  rouge = compute_rouge(pred, gold)
485
  judge = openai_judge(question, pred)
486
 
 
487
  f1_sum += f1
488
  rec_sum += rec
489
  rouge1_sum += rouge["rouge1"]
 
496
 
497
  denom = attempted if attempted > 0 else 1
498
 
 
499
  avg_f1 = f1_sum / denom
500
  avg_rec = rec_sum / denom
501
  avg_rouge1 = rouge1_sum / denom
 
506
  avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
507
 
508
  total_score = compute_total_score(
 
509
  f1=avg_f1,
510
  recall=avg_rec,
511
  rouge_avg=avg_rouge,
 
516
  "andrewid": andrewid,
517
  "attempt": used + 1,
518
  "timestamp": _now_iso(),
 
519
  "f1": round(avg_f1, 6),
520
  "recall": round(avg_rec, 6),
521
  "rouge1": round(avg_rouge1, 6),
 
543
  lines = [
544
  f"✅ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
545
  f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
 
546
  f"F1: {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
547
  f"Recall: {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
548
  f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
 
575
  # πŸ† Assignment 2 Public Leaderboard
576
 
577
  We compute multiple metrics:
578
+ - **Standard metrics:** Answer Recall, F1 (token-level), and ROUGE-1/2/L (reported as an average)
579
  - **LLM-as-judge:** rubric-based score (1–5)
580
 
581
  **Total score** is the uniform mean of the available normalized metrics (0–1).
 
592
  ```
593
 
594
  **Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
 
 
595
  """
596
  )
597
 
598
  with gr.Tabs():
599
  with gr.Tab("📤 Submit"):
600
+ file_input = gr.File(label="Upload submission.json", file_types=[".json"])
601
  submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
602
  status = gr.Textbox(label="Result", lines=10, interactive=False)
603
 
604
+ gr.Markdown("### Sample submission.json")
605
  sample = gr.Textbox(value=sample_submission_text(), lines=6)
606
 
607
  with gr.Tab("🏅 Leaderboard"):
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  openai==1.109.1
2
- rouge_score==0.1.2
 
 
1
  openai==1.109.1
2
+ rouge_score==0.1.2
3
+ nltk==3.9.1