Spaces:

iyosha
/

evaluation

Build error

App Files Community

iyosha committed on Apr 30, 2025

Commit

efb69be

verified ·

1 Parent(s): 30fc179

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -2

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from configs import configs
 from clients import backend, logger
 from backend.helpers import get_random_session_samples
-dataset = load_dataset("iyosha-huji/stressBench", token=configs.HF_API_TOKEN)["test"]
 INSTRUCTIONS = """<div align='center'>You are given an audio sample and a question with 2 answer options.\n\nListen to the audio and select the correct answer from the options below.\n\n<b>Note:</b> The question is the same for all samples, but the audio and the corresponding answers change.</div>"""
@@ -47,7 +47,7 @@ def human_eval_tab():
             if p == configs.USER_PASSWORD and usr.strip() != "":
                 new_session_id = str(uuid4())
                 sample_indices, stage = get_random_session_samples(
-                    backend, dataset, STAGE_SPLITS, usr, num_samples=20
                 )
                 logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
                 return (
@@ -428,10 +428,37 @@ def get_admin_tab():
             else:
                 rand_acc_msg = "Random sampling failed (no data)."
             # Final message (no indentation!)
             msg = f"""
 ## ✅ Accuracy Summary
 ### Majority Vote
 {agg_msg}

 from clients import backend, logger
 from backend.helpers import get_random_session_samples
+dataset = load_dataset("iyosha-huji/stressEval", token=configs.HF_API_TOKEN)["test"]
 INSTRUCTIONS = """<div align='center'>You are given an audio sample and a question with 2 answer options.\n\nListen to the audio and select the correct answer from the options below.\n\n<b>Note:</b> The question is the same for all samples, but the audio and the corresponding answers change.</div>"""
             if p == configs.USER_PASSWORD and usr.strip() != "":
                 new_session_id = str(uuid4())
                 sample_indices, stage = get_random_session_samples(
+                    backend, dataset, STAGE_SPLITS, usr, num_samples=15
                 )
                 logger.info(f"Session ID: {new_session_id}, Stage: {stage}")
                 return (
             else:
                 rand_acc_msg = "Random sampling failed (no data)."
+            correct = 0
+            total = 0
+            for _, row in df.iterrows():
+                idx = int(row["index_in_dataset"])
+                if idx >= len(dataset):
+                    continue  # skip out-of-range
+                sample = dataset[idx]
+                gt_answer = sample["possible_answers"][sample["label"]]
+                if row["answer"] == gt_answer:
+                    correct += 1
+                total += 1
+            overall_acc = correct / total if total > 0 else None
+            if overall_acc is not None:
+                overall_acc_msg = (
+                    f"Overall Accuracy: {overall_acc:.2%} ({correct}/{total})"
+                )
+            else:
+                overall_acc_msg = "No data available."
             # Final message (no indentation!)
             msg = f"""
 ## ✅ Accuracy Summary
+### Overall Accuracy
+{overall_acc_msg}
+---
 ### Majority Vote
 {agg_msg}