Spaces:

swzwan
/

ANLP_S26_Assignment2

Running

App Files Community

zhenwu0831 committed on Feb 12

Commit

3bc94bc

1 Parent(s): 6e3ec67

v25

Browse files

Files changed (2) hide show

app.py +20 -20
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -43,6 +43,17 @@ from openai import OpenAI
 from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.utils import HfHubHTTPError
 # =========================
 # Config
@@ -86,11 +97,13 @@ def _clamp01(x: float) -> float:
 def normalize_text(s: str) -> str:
-    """SQuAD-style normalization."""
     s = str(s).lower()
     s = "".join(ch for ch in s if ch not in string.punctuation)
-    s = re.sub(r"\b(a|an|the)\b", " ", s)
-    s = " ".join(s.split())
     return s
@@ -251,7 +264,6 @@ def get_leaderboard_display() -> pd.DataFrame:
                 "AndrewID",
                 "Attempts Used",
                 "Total (%)",
-                "EM (%)",
                 "F1 (%)",
                 "Recall (%)",
                 "ROUGE (%)",
@@ -267,7 +279,6 @@ def get_leaderboard_display() -> pd.DataFrame:
                 "AndrewID": andrewid,
                 "Attempts Used": int(lb.get("attempts", {}).get(andrewid, 0)),
                 "Total (%)": f"{float(e.get('total_score', 0.0)) * 100:.2f}%",
-                "EM (%)": f"{float(e.get('em', 0.0)) * 100:.2f}%",
                 "F1 (%)": f"{float(e.get('f1', 0.0)) * 100:.2f}%",
                 "Recall (%)": f"{float(e.get('recall', 0.0)) * 100:.2f}%",
                 "ROUGE (%)": f"{float(e.get('rouge_avg', 0.0)) * 100:.2f}%",
@@ -291,7 +302,6 @@ def get_leaderboard_display() -> pd.DataFrame:
         return (
             pct(r.get("Total (%)", "0")),
             pct(r.get("F1 (%)", "0")),
-            pct(r.get("EM (%)", "0")),
             pct(r.get("Recall (%)", "0")),
             judge,
         )
@@ -388,14 +398,12 @@ def openai_judge(question: str, answer: str) -> Optional[int]:
 # =========================
 def compute_total_score(
-    em: float,
     f1: float,
     recall: float,
     rouge_avg: float,
     judge_score: Optional[float],
 ) -> float:
     parts: List[float] = []
-    parts.append(_clamp01(em))
     parts.append(_clamp01(f1))
     parts.append(_clamp01(recall))
     parts.append(_clamp01(rouge_avg))
@@ -456,7 +464,7 @@ def process_submission(file):
             return error_msg, get_leaderboard_display()
         attempted = 0
-        em_sum = f1_sum = rec_sum = 0.0
         rouge1_sum = rouge2_sum = rougeL_sum = 0.0
         judge_sum = 0
         judge_n = 0
@@ -471,13 +479,11 @@ def process_submission(file):
             gold = gold_map[qid]["gold_answer"]
             question = gold_map[qid]["question"]
-            em = exact_match(pred, gold)
             f1 = token_f1(pred, gold)
             rec = answer_recall(pred, gold)
             rouge = compute_rouge(pred, gold)
             judge = openai_judge(question, pred)
-            em_sum += em
             f1_sum += f1
             rec_sum += rec
             rouge1_sum += rouge["rouge1"]
@@ -490,7 +496,6 @@ def process_submission(file):
         denom = attempted if attempted > 0 else 1
-        avg_em = em_sum / denom
         avg_f1 = f1_sum / denom
         avg_rec = rec_sum / denom
         avg_rouge1 = rouge1_sum / denom
@@ -501,7 +506,6 @@ def process_submission(file):
         avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
         total_score = compute_total_score(
-            em=avg_em,
             f1=avg_f1,
             recall=avg_rec,
             rouge_avg=avg_rouge,
@@ -512,7 +516,6 @@ def process_submission(file):
             "andrewid": andrewid,
             "attempt": used + 1,
             "timestamp": _now_iso(),
-            "em": round(avg_em, 6),
             "f1": round(avg_f1, 6),
             "recall": round(avg_rec, 6),
             "rouge1": round(avg_rouge1, 6),
@@ -540,7 +543,6 @@ def process_submission(file):
         lines = [
             f"✅ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
             f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
-            f"EM:          {avg_em:.4f} ({avg_em * 100:.2f}%)",
             f"F1:          {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
             f"Recall:      {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
             f"ROUGE(avg):  {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
@@ -573,7 +575,7 @@ with gr.Blocks(title="Leaderboard QA Judge", theme=gr.themes.Soft()) as app:
 # 🏆 Assignment 2 Public Leaderboard
 We compute multiple metrics:
-- **Standard metrics:** Answer Recall, Exact Match (EM), F1, and ROUGE-1/2/L (reported as an average)
 - **LLM-as-judge:** rubric-based score (1–5)
 **Total score** is the uniform mean of the available normalized metrics (0–1).
@@ -590,18 +592,16 @@ We compute multiple metrics:
 ```
 **Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
-**Please don't refresh the page during evaluation, it may take some time for scoring.**
 """
     )
     with gr.Tabs():
         with gr.Tab("📤 Submit"):
-            file_input = gr.File(label="Upload submission in json", file_types=[".json"])
             submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
             status = gr.Textbox(label="Result", lines=10, interactive=False)
-            gr.Markdown("### Sample submission")
             sample = gr.Textbox(value=sample_submission_text(), lines=6)
         with gr.Tab("🏅 Leaderboard"):

 from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.utils import HfHubHTTPError
+import nltk
+from nltk.corpus import stopwords
+# Download stopwords if not already present
+try:
+    nltk.data.find('corpora/stopwords')
+except LookupError:
+    nltk.download('stopwords', quiet=True)
+STOP_WORDS = set(stopwords.words('english'))
 # =========================
 # Config
 def normalize_text(s: str) -> str:
+    """SQuAD-style normalization with NLTK stop words."""
     s = str(s).lower()
     s = "".join(ch for ch in s if ch not in string.punctuation)
+    # Remove NLTK English stop words
+    tokens = s.split()
+    tokens = [t for t in tokens if t not in STOP_WORDS]
+    s = " ".join(tokens)
     return s
                 "AndrewID",
                 "Attempts Used",
                 "Total (%)",
                 "F1 (%)",
                 "Recall (%)",
                 "ROUGE (%)",
                 "AndrewID": andrewid,
                 "Attempts Used": int(lb.get("attempts", {}).get(andrewid, 0)),
                 "Total (%)": f"{float(e.get('total_score', 0.0)) * 100:.2f}%",
                 "F1 (%)": f"{float(e.get('f1', 0.0)) * 100:.2f}%",
                 "Recall (%)": f"{float(e.get('recall', 0.0)) * 100:.2f}%",
                 "ROUGE (%)": f"{float(e.get('rouge_avg', 0.0)) * 100:.2f}%",
         return (
             pct(r.get("Total (%)", "0")),
             pct(r.get("F1 (%)", "0")),
             pct(r.get("Recall (%)", "0")),
             judge,
         )
 # =========================
 def compute_total_score(
     f1: float,
     recall: float,
     rouge_avg: float,
     judge_score: Optional[float],
 ) -> float:
     parts: List[float] = []
     parts.append(_clamp01(f1))
     parts.append(_clamp01(recall))
     parts.append(_clamp01(rouge_avg))
             return error_msg, get_leaderboard_display()
         attempted = 0
+        f1_sum = rec_sum = 0.0
         rouge1_sum = rouge2_sum = rougeL_sum = 0.0
         judge_sum = 0
         judge_n = 0
             gold = gold_map[qid]["gold_answer"]
             question = gold_map[qid]["question"]
             f1 = token_f1(pred, gold)
             rec = answer_recall(pred, gold)
             rouge = compute_rouge(pred, gold)
             judge = openai_judge(question, pred)
             f1_sum += f1
             rec_sum += rec
             rouge1_sum += rouge["rouge1"]
         denom = attempted if attempted > 0 else 1
         avg_f1 = f1_sum / denom
         avg_rec = rec_sum / denom
         avg_rouge1 = rouge1_sum / denom
         avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
         total_score = compute_total_score(
             f1=avg_f1,
             recall=avg_rec,
             rouge_avg=avg_rouge,
             "andrewid": andrewid,
             "attempt": used + 1,
             "timestamp": _now_iso(),
             "f1": round(avg_f1, 6),
             "recall": round(avg_rec, 6),
             "rouge1": round(avg_rouge1, 6),
         lines = [
             f"✅ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
             f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
             f"F1:          {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
             f"Recall:      {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
             f"ROUGE(avg):  {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
 # 🏆 Assignment 2 Public Leaderboard
 We compute multiple metrics:
+- **Standard metrics:** Answer Recall, F1 (token-level), and ROUGE-1/2/L (reported as an average)
 - **LLM-as-judge:** rubric-based score (1–5)
 **Total score** is the uniform mean of the available normalized metrics (0–1).
 ```
 **Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
 """
     )
     with gr.Tabs():
         with gr.Tab("📤 Submit"):
+            file_input = gr.File(label="Upload submission.json", file_types=[".json"])
             submit_btn = gr.Button("🚀 Submit & Evaluate", variant="primary")
             status = gr.Textbox(label="Result", lines=10, interactive=False)
+            gr.Markdown("### Sample submission.json")
             sample = gr.Textbox(value=sample_submission_text(), lines=6)
         with gr.Tab("🏅 Leaderboard"):

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 openai==1.109.1
-rouge_score==0.1.2

 openai==1.109.1
+rouge_score==0.1.2
+nltk==3.9.1