Spaces:

iyosha
/

evaluation

Build error

App Files Files Community

iyosha committed on Apr 17, 2025

Commit

8c04306

verified ·

1 Parent(s): 91d38d3

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -28

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import gradio as gr
 from uuid import uuid4
 from datasets import load_dataset
 from collections import Counter
 from configs import configs
 from clients import backend, logger
 from backend.helpers import get_random_session_samples
@@ -46,7 +47,7 @@ def human_eval_tab():
             if p == configs.USER_PASSWORD and usr.strip() != "":
                 new_session_id = str(uuid4())
                 sample_indices, stage = get_random_session_samples(
-                    backend, dataset, STAGE_SPLITS, usr, num_samples=2
                 )
                 logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
                 return (
@@ -271,6 +272,53 @@ def human_eval_tab():
         )
 def get_admin_tab():
     with gr.Tab("Admin Console"):
         admin_password = gr.Text(label="Enter Admin Password", type="password")
@@ -281,7 +329,7 @@ def get_admin_tab():
         def calculate_majority_vote_accuracy(pw):
             if pw != configs.ADMIN_PASSWORD:
                 return gr.update(
-                    visible=True, value="\u274c Incorrect password."
                 ), gr.update(visible=False)
             df = backend.get_all_rows()
@@ -290,43 +338,131 @@ def get_admin_tab():
                     visible=False
                 )
             majority_answers = {}
             for interp_id, group in df.groupby("interpretation_id"):
                 answer_counts = Counter(group["answer"])
                 if answer_counts:
                     majority_answers[interp_id] = answer_counts.most_common(1)[0][0]
-            total = 0
-            correct = 0
-            for sample in dataset:
-                interp_id = sample["interpretation_id"]
-                if interp_id not in majority_answers:
-                    continue
-                predicted_answer = majority_answers[interp_id]
-                correct_label_idx = sample["label"]
-                correct_answer_text = sample["possible_answers"][correct_label_idx]
-                total += 1
-                if predicted_answer == correct_answer_text:
-                    correct += 1
-            acc = correct / total if total > 0 else 0
-            # calculate total answers submited
             total_answers = len(df)
-            answers_to_go = (3 * len(dataset)) - total_answers
             users_count = df["user_id"].nunique()
-            # update the admin console
-            return gr.update(visible=False), gr.update(
-                visible=True,
-                value=f"""**Accuracy over answered samples:** {acc:.3%} ({correct}/{total})
-                **Total answers submitted:** {total_answers}
-                **Answers to go:** {answers_to_go}
-                **Users count:** {users_count}""",
             )
         check_btn.click(
             fn=calculate_majority_vote_accuracy,
             inputs=admin_password,
@@ -339,5 +475,6 @@ with gr.Blocks() as demo:
     human_eval_tab()
     get_admin_tab()
-#demo.launch()

 from uuid import uuid4
 from datasets import load_dataset
 from collections import Counter
+import numpy as np
 from configs import configs
 from clients import backend, logger
 from backend.helpers import get_random_session_samples
             if p == configs.USER_PASSWORD and usr.strip() != "":
                 new_session_id = str(uuid4())
                 sample_indices, stage = get_random_session_samples(
+                    backend, dataset, STAGE_SPLITS, usr, num_samples=30
                 )
                 logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
                 return (
         )
+def compute_random_sampled_accuracy(df, dataset, n_rounds=100, seed=42, min_users=3):
+    """Estimate single-annotator accuracy by repeated random sampling.
+
+    In every round one answer row is drawn at random for each eligible
+    ``interpretation_id`` and compared with the ground truth
+    ``possible_answers[label]`` of the dataset item that the row's
+    ``index_in_dataset`` points to. The per-round accuracies are then
+    summarised.
+
+    Args:
+        df: pandas DataFrame of submitted answers. The columns read here are
+            ``interpretation_id``, ``user_id``, ``answer`` and
+            ``index_in_dataset``.
+        dataset: Object indexable by integer position whose items expose
+            ``possible_answers`` and ``label``.
+        n_rounds: Number of sampling rounds.
+        seed: Seed of the NumPy generator; a fixed seed makes the estimate
+            reproducible.
+        min_users: Minimum number of distinct users that must have answered
+            an interpretation for it to take part in the sampling.
+
+    Returns:
+        ``(mean_acc, std_acc, mean_total, ci_95)`` where ``mean_total`` is
+        the (truncated) mean number of interpretations scored per round, or
+        ``(None, None, 0, None)`` when no round scored any answer.
+    """
+    rng = np.random.default_rng(seed)
+
+    # Keep only interpretations answered by enough distinct users.
+    users_per_interp = df.groupby("interpretation_id")["user_id"].nunique()
+    eligible_ids = set(users_per_interp[users_per_interp >= min_users].index)
+
+    # Split the frame once: iterating the GroupBy object inside the round
+    # loop would redo the same partitioning n_rounds times.
+    eligible_rows = df[df["interpretation_id"].isin(eligible_ids)]
+    groups = [
+        group
+        for _, group in eligible_rows.groupby("interpretation_id")
+        if not group.empty
+    ]
+
+    # Ground-truth answers do not change between rounds, so each dataset
+    # item is looked up at most once.
+    gt_cache = {}
+
+    def ground_truth(idx):
+        if idx not in gt_cache:
+            sample = dataset[idx]
+            gt_cache[idx] = sample["possible_answers"][sample["label"]]
+        return gt_cache[idx]
+
+    all_scores = []
+    total_answered_per_round = []
+    for _ in range(n_rounds):
+        correct = 0
+        for group in groups:
+            # One generator draw per group and round, in group order, seeds
+            # the pandas sampler.
+            row = group.sample(1, random_state=rng.integers(1_000_000)).iloc[0]
+            if row["answer"] == ground_truth(int(row["index_in_dataset"])):
+                correct += 1
+        total = len(groups)
+        if total > 0:
+            all_scores.append(correct / total)
+            total_answered_per_round.append(total)
+
+    if not all_scores:
+        return None, None, 0, None
+
+    n_scores = len(all_scores)
+    mean_acc = np.mean(all_scores)
+    mean_total = int(np.mean(total_answered_per_round))
+    # The sample std (ddof=1) is undefined for a single score and would be
+    # NaN; report zero spread instead.
+    std_acc = np.std(all_scores, ddof=1) if n_scores > 1 else 0.0
+    # Normal-approximation half-width of the mean over the sampling rounds.
+    ci_95 = 1.96 * std_acc / np.sqrt(n_scores)
+    return mean_acc, std_acc, mean_total, ci_95
 def get_admin_tab():
     with gr.Tab("Admin Console"):
         admin_password = gr.Text(label="Enter Admin Password", type="password")
         def calculate_majority_vote_accuracy(pw):
             if pw != configs.ADMIN_PASSWORD:
                 return gr.update(
+                    visible=True, value="❌ Incorrect password."
                 ), gr.update(visible=False)
             df = backend.get_all_rows()
                     visible=False
                 )
+            # Majority vote per interpretation_id
             majority_answers = {}
             for interp_id, group in df.groupby("interpretation_id"):
                 answer_counts = Counter(group["answer"])
                 if answer_counts:
                     majority_answers[interp_id] = answer_counts.most_common(1)[0][0]
+            counts = df.groupby("interpretation_id")["user_id"].nunique().to_dict()
             total_answers = len(df)
             users_count = df["user_id"].nunique()
+            stage_acc = {}
+            stage_completes = {}
+            stage_counts = {}
+            stage_remaining = {}
+            # global_correct = 0
+            # global_total = 0
+            for stage in ["stage1", "stage2", "stage3"]:
+                correct, total = 0, 0
+                complete = 0
+                for i in STAGE_SPLITS[stage]:
+                    sample = dataset[i]
+                    interp_id = sample["interpretation_id"]
+                    label = sample["label"]
+                    gt = sample["possible_answers"][label]
+                    n = counts.get(interp_id, 0)
+                    if n >= 3:
+                        complete += 1
+                    if interp_id in majority_answers:
+                        pred = majority_answers[interp_id]
+                        total += 1
+                        if pred == gt:
+                            correct += 1
+                stage_counts[stage] = len(STAGE_SPLITS[stage])
+                stage_completes[stage] = complete
+                stage_remaining[stage] = 3 * len(STAGE_SPLITS[stage]) - sum(
+                    counts.get(dataset[i]["interpretation_id"], 0)
+                    for i in STAGE_SPLITS[stage]
+                )
+                if complete == len(STAGE_SPLITS[stage]):
+                    acc = correct / total if total > 0 else 0
+                    stage_acc[stage] = (acc, correct, total)
+                else:
+                    stage_acc[stage] = None  # not shown yet
+            # Determine active stage
+            if stage_completes["stage1"] < stage_counts["stage1"]:
+                current_stage = "Stage 1"
+            elif stage_completes["stage2"] < stage_counts["stage2"]:
+                current_stage = "Stage 2"
+            else:
+                current_stage = "Stage 3"
+            # Majority Vote Accuracy Section
+            agg_lines = []
+            if stage_acc["stage1"]:
+                acc1, c1, t1 = stage_acc["stage1"]
+                agg_lines.append(f"- **Stage 1:** {acc1:.2%} ({c1}/{t1})")
+            if stage_acc["stage2"]:
+                acc2, c2, t2 = stage_acc["stage2"]
+                agg_lines.append(
+                    f"- **Stage 1+2:** {(c1 + c2) / (t1 + t2):.2%} ({c1 + c2}/{t1 + t2})"
+                )
+            if stage_acc["stage3"]:
+                acc3, c3, t3 = stage_acc["stage3"]
+                agg_lines.append(
+                    f"- **All Stages:** {(c1 + c2 + c3) / (t1 + t2 + t3):.2%} ({c1 + c2 + c3}/{t1 + t2 + t3})"
+                )
+            agg_msg = "\n".join(agg_lines) if agg_lines else "No completed stages yet."
+            # Compute random-sampled accuracy
+            n_rounds = 100
+            rand_acc, rand_std, rand_total, rand_ci = compute_random_sampled_accuracy(
+                df, dataset, n_rounds=n_rounds
             )
+            # Random-sampled Accuracy
+            if rand_acc is not None:
+                rand_acc_msg = (
+                    f"**Accuracy:** {rand_acc:.2%} ± {rand_ci:.2%} (95% CI)\n\n"
+                    f"Standard deviation: {rand_std:.2%}\n\n"
+                    f"Samples used: {rand_total} × {n_rounds} rounds"
+                )
+            else:
+                rand_acc_msg = "Random sampling failed (no data)."
+            # Final message (no indentation!)
+            msg = f"""
+## ✅ Accuracy Summary
+### Majority Vote
+{agg_msg}
+---
+### Random-Sampled Accuracy
+{rand_acc_msg}
+---
+## 📊 Answer Progress
+- **Total answers submitted:** {total_answers}
+- **Answers to go (global):** {3 * len(dataset) - total_answers}
+- **Unique users:** {users_count}
+---
+## 🧱 Stage Breakdown
+| Stage | Completed | Total | Remaining Answers |
+|-------|-----------|--------|-------------------|
+|  1    | {stage_completes['stage1']} / {stage_counts['stage1']} | {stage_counts['stage1']} | {stage_remaining['stage1']} |
+|  2    | {stage_completes['stage2']} / {stage_counts['stage2']} | {stage_counts['stage2']} | {stage_remaining['stage2']} |
+|  3    | {stage_completes['stage3']} / {stage_counts['stage3']} | {stage_counts['stage3']} | {stage_remaining['stage3']} |
+**➡️ Current Active Stage:** {current_stage}
+"""
+            return gr.update(visible=False), gr.update(visible=True, value=msg)
         check_btn.click(
             fn=calculate_majority_vote_accuracy,
             inputs=admin_password,
     human_eval_tab()
     get_admin_tab()
+demo.launch()