Spaces:
Running
Running
| # app.py | |
# Hugging Face Spaces-ready Gradio app (FULL)
| # | |
| # Features | |
| # - Student submission JSON: | |
| # { | |
| # "andrewid": "zwu", | |
| # "1": "Answer 1", | |
| # "2": "Answer 2; Answer 3" | |
| # } | |
| # - Gold data loaded from HF dataset (private): | |
| # swzwan/ANLP_S26_Assignment2_leaderboard_qa_gold | |
| # NOTE: dataset contains ONLY (id, question, answer). No retrieved-doc context. | |
| # - Evaluations include: | |
| # (a) Token metrics: Answer Recall, Exact Match, F1 (SQuAD-style) | |
| # (b) ROUGE (rouge1/rouge2/rougeL) | |
| # (c) LLM-as-judge score (1-5) using OpenAI rubric (NO rationale returned) | |
| # - Total score: uniform mean over normalized sub-metrics (0..1) | |
| # total = mean([EM, F1, Recall, ROUGE_avg, Judge_norm]) | |
| # Missing components (e.g., judge disabled) are excluded from the mean. | |
| # - Leaderboard update policy: | |
| # attempts always increase; reject if attempts >= 10 | |
| # best score is kept: overwrite stored scores only if total_score improves | |
| # | |
| # IMPORTANT: Leaderboard persistence is via a *separate HF dataset repo* (Option A). | |
| # This avoids HF Spaces ephemeral filesystem resets. | |
import json
import os
import re
import string
import tempfile
from collections import Counter
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import datasets
import gradio as gr
import nltk
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import HfHubHTTPError
from nltk.corpus import stopwords
from openai import OpenAI
from rouge_score import rouge_scorer
# Download stopwords if not already present
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # Fresh container (e.g. a new HF Space build): fetch the corpus quietly.
    nltk.download('stopwords', quiet=True)
# English stop words stripped during SQuAD-style normalization (see normalize_text).
STOP_WORDS = set(stopwords.words('english'))
# =========================
# Config
# =========================
# Hard cap on scored submissions per student (attempts always increment).
MAX_ATTEMPTS = 10
# Gold dataset (read)
GOLD_DATASET_NAME = "swzwan/ANLP_S26_Assignment2_leaderboard_qa_gold"
HF_ACCESS_TOKEN = os.getenv("HF_ACCESS_TOKEN")  # token that can read the private gold dataset
# Leaderboard dataset (read+write)
# Create this dataset repo once, e.g.:
#   swzwan/ANLP_S26_Assignment2_leaderboard_state
# and add an initial file leaderboard.json with {"attempts": {}, "entries": {}}
LEADERBOARD_REPO_ID = os.getenv(
    "LEADERBOARD_REPO_ID",
    "swzwan/ANLP_S26_Assignment2_leaderboard_state",
)
LEADERBOARD_FILENAME = os.getenv("LEADERBOARD_FILENAME", "leaderboard.json")
LEADERBOARD_REPO_TYPE = "dataset"
# Token for leaderboard repo. Needs write permission.
# Recommended: set a *separate* secret HF_LEADERBOARD_TOKEN with write access.
# Falls back to the read token if no dedicated write token is configured.
HF_LEADERBOARD_TOKEN = os.getenv("HF_LEADERBOARD_TOKEN") or os.getenv("HF_ACCESS_TOKEN")
# OpenAI judge (optional)
# NOTE(review): the key is read from env var "OPENAI_API" (not "OPENAI_API_KEY");
# this matches the status message in process_submission — confirm the secret name.
OPENAI_API_KEY = os.getenv("OPENAI_API")
JUDGE_MODEL = os.getenv("OPENAI_MODEL", "gpt-4.1-mini")
| # ========================= | |
| # Helpers | |
| # ========================= | |
| def _now_iso() -> str: | |
| return datetime.now().isoformat() | |
| def _clamp01(x: float) -> float: | |
| return 0.0 if x < 0.0 else (1.0 if x > 1.0 else x) | |
# Translation table that deletes all ASCII punctuation in one C-level pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def normalize_text(s: str) -> str:
    """SQuAD-style normalization with NLTK stop words.

    Lowercases, strips ASCII punctuation (characters are deleted, not replaced
    by spaces), drops English stop words, and rejoins the remaining tokens
    with single spaces.
    """
    s = str(s).lower()
    # str.translate replaces the original per-character membership scan
    # (O(len(string.punctuation)) per char) with a single table lookup.
    s = s.translate(_PUNCT_TABLE)
    tokens = [t for t in s.split() if t not in STOP_WORDS]
    return " ".join(tokens)
def tokenize(s: str) -> List[str]:
    """Split *s* into normalized tokens (see normalize_text)."""
    normalized = normalize_text(s)
    return normalized.split()
def multiset_overlap_count(a_tokens: List[str], b_tokens: List[str]) -> int:
    """Return the size of the multiset (bag) intersection of two token lists.

    E.g. ["a", "a", "b"] vs ["a", "b", "b"] -> 2 (one "a" plus one "b").
    Used by the SQuAD-style F1 and recall metrics.
    """
    # Counter & Counter keeps the minimum count per token — exactly the
    # bookkeeping the original hand-rolled dict loop implemented.
    common = Counter(a_tokens) & Counter(b_tokens)
    return sum(common.values())
def exact_match(pred: str, gold: str) -> int:
    """1 if prediction equals gold after normalization, else 0."""
    same = normalize_text(pred) == normalize_text(gold)
    return 1 if same else 0
def token_f1(pred: str, gold: str) -> float:
    """SQuAD-style token-level F1 between prediction and gold answer."""
    pred_tokens = tokenize(pred)
    gold_tokens = tokenize(gold)
    # Both empty after normalization: count as a perfect match.
    if not pred_tokens and not gold_tokens:
        return 1.0
    # Exactly one side empty: no overlap is possible.
    if not pred_tokens or not gold_tokens:
        return 0.0
    overlap = multiset_overlap_count(pred_tokens, gold_tokens)
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
def answer_recall(pred: str, gold: str) -> float:
    """Fraction of gold tokens recovered by the prediction (multiset overlap)."""
    pred_tokens = tokenize(pred)
    gold_tokens = tokenize(gold)
    if not gold_tokens:
        # Empty gold: full credit only for an equally empty prediction.
        return 1.0 if not pred_tokens else 0.0
    if not pred_tokens:
        return 0.0
    overlap = multiset_overlap_count(pred_tokens, gold_tokens)
    return overlap / len(gold_tokens)
# Hoisted: the original rebuilt this scorer (including its stemmer) on every
# call — i.e. once per question per submission. The scorer holds only the
# rouge-type config, so one shared instance suffices.
_ROUGE_SCORER = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)


def compute_rouge(pred: str, gold: str) -> Dict[str, float]:
    """ROUGE-1/2/L F-measures of *pred* scored against *gold*."""
    scores = _ROUGE_SCORER.score(gold, pred)
    return {
        "rouge1": float(scores["rouge1"].fmeasure),
        "rouge2": float(scores["rouge2"].fmeasure),
        "rougeL": float(scores["rougeL"].fmeasure),
    }
| # ========================= | |
| # Gold loading (HF datasets) | |
| # ========================= | |
# Module-level memo so the gold dataset is fetched from the Hub at most once.
_GOLD_CACHE: Optional[Dict[str, Dict[str, Any]]] = None


def load_gold_map() -> Dict[str, Dict[str, Any]]:
    """Load and memoize the gold QA data.

    Returns:
        {qid: {"question": str, "gold_answer": str}}

    Raises:
        RuntimeError: if HF_ACCESS_TOKEN is unset.
    """
    global _GOLD_CACHE
    if _GOLD_CACHE is not None:
        return _GOLD_CACHE
    if not HF_ACCESS_TOKEN:
        raise RuntimeError("HF_ACCESS_TOKEN is not set.")
    ds = datasets.load_dataset(GOLD_DATASET_NAME, token=HF_ACCESS_TOKEN)
    # Split preference: test, then validation, then train.
    if "test" in ds:
        split = "test"
    elif "validation" in ds:
        split = "validation"
    else:
        split = "train"
    gold: Dict[str, Dict[str, Any]] = {}
    for example in ds[split]:
        qid = str(example.get("id", example.get("qid", ""))).strip()
        if not qid:
            continue
        gold[qid] = {
            "question": str(example.get("question", example.get("query", ""))).strip(),
            "gold_answer": str(
                example.get("answer", example.get("gold", example.get("reference", "")))
            ).strip(),
        }
    _GOLD_CACHE = gold
    return gold
| # ========================= | |
| # Leaderboard storage (HF dataset repo) | |
| # ========================= | |
# Shared HfApi client for leaderboard writes; None when no write token is configured.
_api = HfApi(token=HF_LEADERBOARD_TOKEN) if HF_LEADERBOARD_TOKEN else None
| def _empty_lb() -> Dict[str, Any]: | |
| return {"attempts": {}, "entries": {}} | |
def load_leaderboard() -> Dict[str, Any]:
    """Fetch the leaderboard state from the HF dataset repo.

    Schema: {"attempts": {andrewid: int}, "entries": {andrewid: entry}}.

    Any failure (missing repo/file, auth error, malformed JSON, wrong shape)
    falls back to an empty state so the UI always stays usable.

    Note: HF_LEADERBOARD_TOKEN may be None; hf_hub_download then performs an
    anonymous read, which fails for private repos and lands in the fallback.
    (The original had a dead no-op `if not HF_LEADERBOARD_TOKEN: pass` branch
    documenting this; removed here.)
    """
    try:
        path = hf_hub_download(
            repo_id=LEADERBOARD_REPO_ID,
            filename=LEADERBOARD_FILENAME,
            repo_type=LEADERBOARD_REPO_TYPE,
            token=HF_LEADERBOARD_TOKEN,
        )
        with open(path, "r", encoding="utf-8") as f:
            obj = json.load(f)
        # Validate the stored shape; anything unexpected resets to empty.
        if not isinstance(obj, dict):
            return _empty_lb()
        obj.setdefault("attempts", {})
        obj.setdefault("entries", {})
        if not isinstance(obj["attempts"], dict) or not isinstance(obj["entries"], dict):
            return _empty_lb()
        return obj
    except Exception:
        # Deliberate best-effort read: treat any error as "no leaderboard yet".
        return _empty_lb()
def save_leaderboard(obj: Dict[str, Any], commit_message: str = "Update leaderboard") -> None:
    """Atomically overwrite leaderboard.json in the dataset repo.

    Args:
        obj: Full leaderboard state ({"attempts": ..., "entries": ...}).
        commit_message: Commit message for the Hub commit.

    Raises:
        RuntimeError: if no write token is configured.
    """
    if _api is None:
        raise RuntimeError(
            "HF_LEADERBOARD_TOKEN is not set (needs write access to leaderboard dataset)."
        )
    # delete=False so the file survives the `with` block for upload_file.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)
        tmp_path = f.name
    try:
        _api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=LEADERBOARD_FILENAME,
            repo_id=LEADERBOARD_REPO_ID,
            repo_type=LEADERBOARD_REPO_TYPE,
            commit_message=commit_message,
            token=HF_LEADERBOARD_TOKEN,
        )
    finally:
        # Fix: the temp file was previously never removed, leaking one file
        # per save on the Space's filesystem.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
def get_leaderboard_display() -> pd.DataFrame:
    """Render the leaderboard as a ranked DataFrame for the Gradio UI.

    Rows are sorted by (total, F1, recall, judge) descending and assigned a
    1-based Rank. Percentages are formatted with two decimals for display.
    """
    lb = load_leaderboard()
    entries = lb.get("entries", {})
    display_columns = [
        "Rank",
        "AndrewID",
        "Attempts Used",
        "Total (%)",
        "F1 (%)",
        "Recall (%)",
        "ROUGE (%)",
        "LLM Judge (1-5)",
        "Timestamp",
    ]
    if not entries:
        return pd.DataFrame(columns=display_columns)
    attempts = lb.get("attempts", {})
    rows = []
    for andrewid, e in entries.items():
        total = float(e.get("total_score", 0.0))
        f1 = float(e.get("f1", 0.0))
        recall = float(e.get("recall", 0.0))
        rouge_avg = float(e.get("rouge_avg", 0.0))
        judge = e.get("judge")
        judge_val = 0.0 if judge is None else float(judge)
        rows.append(
            {
                # "_key" holds the raw scores used for sorting; it is stripped
                # before building the DataFrame.
                "_key": (total, f1, recall, judge_val),
                "AndrewID": andrewid,
                "Attempts Used": int(attempts.get(andrewid, 0)),
                "Total (%)": f"{total * 100:.2f}%",
                "F1 (%)": f"{f1 * 100:.2f}%",
                "Recall (%)": f"{recall * 100:.2f}%",
                "ROUGE (%)": f"{rouge_avg * 100:.2f}%",
                "LLM Judge (1-5)": "" if judge is None else f"{judge_val:.3f}",
                "Timestamp": str(e.get("timestamp", ""))[:19].replace("T", " "),
            }
        )
    # Fix: the original sorted by re-parsing the formatted percent strings,
    # which loses precision (scores differing past two decimal places tied
    # and fell through to weaker keys). Sort on the raw stored scores.
    rows.sort(key=lambda r: r["_key"], reverse=True)
    for rank, row in enumerate(rows, 1):
        del row["_key"]
        row["Rank"] = rank
    df = pd.DataFrame(rows)
    return df[["Rank"] + [c for c in df.columns if c != "Rank"]]
| # ========================= | |
| # Student submission parsing | |
| # ========================= | |
def parse_submission_json(text: str) -> Tuple[str, Dict[str, str]]:
    """Parse a student submission.

    Expected shape: {"andrewid": "...", "1": "answer", "2": "answer", ...}.

    Returns:
        (andrewid, {qid: answer}) where qids are the numeric keys as strings.

    Raises:
        ValueError: on a missing/empty andrewid or when no numeric keys exist.
        json.JSONDecodeError: if *text* is not valid JSON.
    """
    obj = json.loads(text)
    if "andrewid" not in obj:
        raise ValueError("Missing 'andrewid' in submission.")
    andrewid = str(obj["andrewid"]).strip()
    if not andrewid:
        raise ValueError("'andrewid' cannot be empty.")
    # Keep only numeric keys; any other key (besides andrewid) is ignored.
    answers: Dict[str, str] = {
        str(key): str(value).strip()
        for key, value in obj.items()
        if key != "andrewid" and str(key).isdigit()
    }
    if not answers:
        raise ValueError("No answers found (expected numeric keys).")
    return andrewid, answers
| # ========================= | |
| # LLM-as-judge (OpenAI) 1-5 rubric (NO rationale) | |
| # ========================= | |
# Rubric sent verbatim to the judge model inside the request payload.
JUDGE_RUBRIC = {
    "criteria": "Is the response factually accurate and directly addressing the query?",
    "score1_description": "The response contains major factual errors or is completely unrelated to the query.",
    "score2_description": "The response has some factual errors or partially misses the query intent.",
    "score3_description": "The response is mostly factually correct with occasional minor errors and generally addresses the query.",
    "score4_description": "The response is factually accurate with minimal errors and clearly addresses the query.",
    "score5_description": "The response is completely factually accurate and comprehensively addresses all aspects of the query.",
}
# System prompt: the model must answer with a bare JSON object, no rationale.
JUDGE_SYSTEM = (
    "You are an evaluator. Assign an integer score from 1 to 5 using the rubric. "
    "Return JSON ONLY: {\"score\": 1..5}"
)
def openai_judge(question: str, answer: str) -> Optional[int]:
    """Score *answer* for *question* on the 1-5 rubric via the OpenAI gateway.

    Returns None whenever a score cannot be produced: judge disabled (no API
    key), API/network failure, or an unparseable/out-of-range model reply.
    Callers treat None as "judge unavailable" and exclude it from the total,
    so returning None here keeps the judge strictly best-effort.
    """
    if not OPENAI_API_KEY:
        return None
    payload = {"rubric": JUDGE_RUBRIC, "question": question, "response": answer}
    try:
        client = OpenAI(api_key=OPENAI_API_KEY, base_url="https://ai-gateway.andrew.cmu.edu/v1")
        resp = client.responses.create(
            model=JUDGE_MODEL,
            instructions=JUDGE_SYSTEM,
            input=json.dumps(payload, ensure_ascii=False),
            temperature=0,
            max_output_tokens=40,
        )
    except Exception:
        # Robustness fix: a transient API error previously propagated and
        # aborted the whole submission; treat it as "judge unavailable".
        return None
    text = (resp.output_text or "").strip()
    try:
        obj = json.loads(text)
    except Exception:
        # Models sometimes wrap the JSON in prose; salvage the first {...} span.
        match = re.search(r"\{.*\}", text, flags=re.DOTALL)
        if not match:
            return None
        try:
            obj = json.loads(match.group(0))
        except Exception:
            # Fix: a malformed salvaged span previously raised out of the judge.
            return None
    try:
        score = int(obj.get("score"))
    except Exception:
        return None
    return score if 1 <= score <= 5 else None
| # ========================= | |
| # Total score (uniform mean) | |
| # ========================= | |
def compute_total_score(
    f1: float,
    recall: float,
    rouge_avg: float,
    judge_score: Optional[float],
) -> float:
    """Uniform mean of the available normalized sub-metrics, each in [0, 1].

    The judge score (1-5 scale) is rescaled to [0, 1]; when it is None it is
    left out of the mean entirely rather than counted as zero.
    """
    components = [_clamp01(f1), _clamp01(recall), _clamp01(rouge_avg)]
    if judge_score is not None:
        components.append(_clamp01((float(judge_score) - 1.0) / 4.0))
    if not components:
        return 0.0
    return sum(components) / len(components)
| # ========================= | |
| # Submission processing | |
| # ========================= | |
def process_submission(file):
    """Validate, score, and record one uploaded submission.

    Args:
        file: Gradio file payload (a filepath string or an object with .name).

    Returns:
        (status_message, leaderboard_dataframe) for the UI. All failures are
        reported through the status string; this function never raises.
    """
    try:
        if file is None:
            return "โ Please upload a submission JSON file.", get_leaderboard_display()
        # Gradio may hand us either a filepath string or a tempfile-like object.
        path = file if isinstance(file, str) else file.name
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        andrewid, user_answers = parse_submission_json(content)
        lb = load_leaderboard()
        attempts_map: Dict[str, int] = lb.get("attempts", {})
        entries_map: Dict[str, Any] = lb.get("entries", {})
        used = int(attempts_map.get(andrewid, 0))
        # Enforce the attempt cap BEFORE any expensive evaluation work.
        if used >= MAX_ATTEMPTS:
            return (
                f"โ Submission rejected: {andrewid} has already used {used}/{MAX_ATTEMPTS} attempts.",
                get_leaderboard_display(),
            )
        gold_map = load_gold_map()
        # Validate that submission has the same number of entries as gold
        if len(user_answers) != len(gold_map):
            return (
                f"โ Submission rejected: Expected {len(gold_map)} answers, but got {len(user_answers)}. "
                f"Your submission must contain answers for all questions in the gold dataset.",
                get_leaderboard_display(),
            )
        # Validate that all question IDs in submission exist in gold
        missing_qids = set(gold_map.keys()) - set(user_answers.keys())
        extra_qids = set(user_answers.keys()) - set(gold_map.keys())
        if missing_qids or extra_qids:
            error_msg = "โ Submission rejected: Question ID mismatch.\n"
            if missing_qids:
                error_msg += f"Missing question IDs: {sorted(missing_qids)}\n"
            if extra_qids:
                error_msg += f"Extra/invalid question IDs: {sorted(extra_qids)}\n"
            return error_msg, get_leaderboard_display()
        # Per-question metric accumulators.
        attempted = 0
        f1_sum = rec_sum = 0.0
        rouge1_sum = rouge2_sum = rougeL_sum = 0.0
        judge_sum = 0
        judge_n = 0
        for qid, pred in user_answers.items():
            if qid not in gold_map:
                continue
            pred = str(pred).strip()
            attempted += 1
            gold = gold_map[qid]["gold_answer"]
            question = gold_map[qid]["question"]
            f1 = token_f1(pred, gold)
            rec = answer_recall(pred, gold)
            rouge = compute_rouge(pred, gold)
            judge = openai_judge(question, pred)  # None when the judge is disabled/unavailable
            f1_sum += f1
            rec_sum += rec
            rouge1_sum += rouge["rouge1"]
            rouge2_sum += rouge["rouge2"]
            rougeL_sum += rouge["rougeL"]
            if judge is not None:
                judge_sum += int(judge)
                judge_n += 1
        # Defensive denominator; after the ID-match validation above,
        # attempted == len(gold_map) > 0, so the fallback 1 should be unreachable.
        denom = attempted if attempted > 0 else 1
        avg_f1 = f1_sum / denom
        avg_rec = rec_sum / denom
        avg_rouge1 = rouge1_sum / denom
        avg_rouge2 = rouge2_sum / denom
        avg_rougeL = rougeL_sum / denom
        avg_rouge = (avg_rouge1 + avg_rouge2 + avg_rougeL) / 3.0
        # Judge average stays on the 1-5 scale; compute_total_score rescales it.
        avg_judge = (judge_sum / judge_n) if judge_n > 0 else None
        total_score = compute_total_score(
            f1=avg_f1,
            recall=avg_rec,
            rouge_avg=avg_rouge,
            judge_score=avg_judge,
        )
        entry = {
            "andrewid": andrewid,
            "attempt": used + 1,
            "timestamp": _now_iso(),
            "f1": round(avg_f1, 6),
            "recall": round(avg_rec, 6),
            "rouge1": round(avg_rouge1, 6),
            "rouge2": round(avg_rouge2, 6),
            "rougeL": round(avg_rougeL, 6),
            "rouge_avg": round(avg_rouge, 6),
            "judge": None if avg_judge is None else round(float(avg_judge), 6),
            "total_score": round(float(total_score), 6),
            "judge_model": JUDGE_MODEL if avg_judge is not None else None,
        }
        # best-score-wins
        prev_entry = entries_map.get(andrewid)
        if prev_entry is None or float(total_score) > float(prev_entry.get("total_score", -1.0)):
            entries_map[andrewid] = entry
        # attempts always increment
        attempts_map[andrewid] = used + 1
        lb["entries"] = entries_map
        lb["attempts"] = attempts_map
        # NOTE(review): read-modify-write with no locking; two concurrent
        # submissions could race and drop an update — confirm the Space runs
        # a single worker.
        save_leaderboard(lb, commit_message=f"Update leaderboard: {andrewid} attempt {used + 1}")
        # concise status
        lines = [
            f"โ Submission {andrewid} (attempt #{used + 1}/{MAX_ATTEMPTS}).",
            f"Total score: {total_score:.4f} ({total_score * 100:.2f}%)",
            f"F1: {avg_f1:.4f} ({avg_f1 * 100:.2f}%)",
            f"Recall: {avg_rec:.4f} ({avg_rec * 100:.2f}%)",
            f"ROUGE(avg): {avg_rouge:.4f} ({avg_rouge * 100:.2f}%)",
        ]
        if avg_judge is None:
            if not OPENAI_API_KEY:
                lines.append("LLM judge: NA (set OPENAI_API to enable)")
            else:
                lines.append("LLM judge: NA (questions not available in gold dataset)")
        else:
            lines.append(f"LLM judge: {avg_judge:.3f} (1-5)")
        return "\n".join(lines), get_leaderboard_display()
    except Exception as e:
        # Broad catch is deliberate: the UI must always receive a status string.
        return f"โ Error: {e}", get_leaderboard_display()
| # ========================= | |
| # UI | |
| # ========================= | |
def sample_submission_text():
    """Pretty-printed example submission shown in the Submit tab."""
    example = {"andrewid": "user123", "1": "Paris", "2": "Answer A"}
    return json.dumps(example, indent=2)
with gr.Blocks(title="Leaderboard QA Judge", theme=gr.themes.Soft()) as app:
    # Header / instructions (f-string keeps MAX_ATTEMPTS in sync with config).
    gr.Markdown(
        f"""
# ๐ Assignment 2 Public Leaderboard
We compute multiple metrics:
- **Standard metrics:** Answer Recall, F1, and ROUGE-1/2/L (reported as an average)
- **LLM-as-judge:** rubric-based score (1โ5)
**Total score** is the uniform mean of the available normalized metrics (0โ1).
**Attempts:** up to **{MAX_ATTEMPTS}**. Attempts always increase. Your leaderboard score updates only if your **total score improves**.
**Submission format (JSON):**
```json
{{
"andrewid": "YOUR_ANDREWID",
"1": "Answer 1",
"2": "Answer 2"
}}
```
**Important:** Your submission must include answers for ALL questions in the dataset. The number of answers must exactly match the number of questions in the gold dataset.
**Please don't refresh or redirect the page during evaluation. It may take some time to finish.**
"""
    )
    with gr.Tabs():
        with gr.Tab("๐ค Submit"):
            file_input = gr.File(label="Upload submission in json", file_types=[".json"])
            submit_btn = gr.Button("๐ Submit & Evaluate", variant="primary")
            status = gr.Textbox(label="Result", lines=10, interactive=False)
            gr.Markdown("### Sample submission")
            sample = gr.Textbox(value=sample_submission_text(), lines=6)
        with gr.Tab("๐ Leaderboard"):
            # Seeded once at build time; refreshed via the events wired below.
            leaderboard_df = gr.Dataframe(value=get_leaderboard_display(), interactive=False)
            refresh_btn = gr.Button("๐ Refresh")
    # Keep the leaderboard static during evaluation; update it only after scoring finishes.
    def process_and_return_status(file):
        """Adapter: run evaluation, return only the status text (the chained
        .then() below refreshes the leaderboard afterwards)."""
        status_text, _ = process_submission(file)
        return status_text
    submit_btn.click(fn=process_and_return_status, inputs=[file_input], outputs=[status]).then(
        fn=get_leaderboard_display, outputs=[leaderboard_df]
    )
    refresh_btn.click(fn=get_leaderboard_display, outputs=[leaderboard_df])
    # Ensure leaderboard loads from HF dataset for every new browser session / hard refresh.
    app.load(fn=get_leaderboard_display, outputs=[leaderboard_df])
if __name__ == "__main__":
    # NOTE(review): share=True only matters when running locally (HF Spaces
    # provides its own URL and ignores it) — confirm it is intentional.
    app.launch(share=True)