Spaces:

paulgavrikov
/

visualoverload-submit

Sleeping

App Files Files

Paul Gavrikov commited on Sep 4, 2025

Commit

dfe5ccf

1 Parent(s): b06fec2

updating eval

Browse files

Files changed (6) hide show

app.py +91 -29
config.py +2 -2
eval_utils.py +0 -40
ground_truth.json +0 -1
ground_truth.secret +0 -0
judge.py +473 -0

app.py CHANGED Viewed

@@ -1,15 +1,57 @@
 import gradio as gr
 import json, os, time, uuid
-from eval_utils import evaluate_submission, clean_submission
 import pandas as pd
 from rate_limiter import RateLimiter, RateLimitConfig
 from config import *
 def login_check(profile: gr.OAuthProfile | None):
     visible = profile is not None
     welcome = (
-        f"## 👋 Welcome, {profile.username}! Ready to submit?"
         if visible
         else "🔒 Please sign in to submit."
     )
@@ -22,7 +64,7 @@ def login_check(profile: gr.OAuthProfile | None):
         gr.Markdown(value=quota_details, visible=visible),
         gr.Textbox(visible=visible & SAVE_SUBMISSIONS),
         gr.File(visible=visible),
-        gr.Checkbox(visible=visible),
         gr.Checkbox(visible=visible & SAVE_SUBMISSIONS),
         gr.Button(visible=visible),
     )
@@ -53,7 +95,7 @@ def quoata_check(profile: gr.OAuthProfile | None):
 def submit(
     submission_id: str,
     submission_file: str,
-    is_cleaning: bool,
     is_private: bool,
     profile: gr.OAuthProfile | None,
 ):
@@ -71,17 +113,18 @@ def submit(
             prediction_json = json.load(file)
         except json.JSONDecodeError:
             raise gr.Error("❌ Submission file is invalid JSON.")
-    with open(GROUND_TRUTH, "rb") as file:
-        ground_truth_json = json.load(file)
     try:
-        if is_cleaning:
-            prediction_json = clean_submission(prediction_json)
-        score_dict = evaluate_submission(prediction_json, ground_truth_json)
     except Exception as e:
         print(e)
-        raise gr.Error(f"❌ Invalid submission format.")
     allowed, allowed_reason = limiter.is_allowed(username)
     status = limiter.get_status(username)
@@ -117,15 +160,29 @@ def submit(
         json.dump(data, open(os.path.join(SUB_DIR, submission_record + ".json"), "w"))
-    score_response = f"{score_dict}"
     return gr.Text(score_response, visible=True)
 def get_leaderboard() -> pd.DataFrame | str:
-    df = pd.read_csv("leaderboard.csv")
     df = df.sort_values(by="Total", ascending=False)
-    return df
 def get_quota(profile: gr.OAuthProfile | None = None):
@@ -137,18 +194,18 @@ with gr.Blocks() as app:
     if SHOW_LEADERBOARD:
         with gr.Tab("🏆 Public Leaderboard"):
             leaderboard_heading_md = gr.Markdown(
-                "TODO: Introduction to the benchmark and leaderboard"
             )
             leaderboard_table = gr.Dataframe(get_leaderboard())
             leaderboard_footer_md = gr.Markdown(
-                "TODO: Add instructions for the leaderboard, e.g. how to submit, evaluation criteria, etc."
             )
     if SHOW_EVAL_SERVER:
         with gr.Tab("🚀 Evaluation"):
             login_button = gr.LoginButton()
             welcome_md = gr.Markdown("🔒 Please sign in to submit.")
             welcome_details_md = gr.Markdown(
-                "TODO: add instructions for submission format and evaluation criteria.",
                 visible=False,
             )
             submission_file = gr.File(
@@ -157,17 +214,17 @@ with gr.Blocks() as app:
             submission_id = gr.Textbox(
                 label="(Optional) Submission identifier", visible=False
             )
-            clean_flag = gr.Checkbox(
-                label="Attempt to clean my submission (Recommended for raw responses)",
-                value=True,
-                visible=False,
-            )
             private_flag = gr.Checkbox(
                 label="Do not save my submission", value=False, visible=False
             )
             quota_details = gr.Markdown(visible=False)
             submit_btn = gr.Button("Submit", visible=False)
-            result = gr.Textbox(label="✅ Submission processed", visible=False)
             # Load login state → show/hide components
             app.load(
@@ -179,7 +236,7 @@ with gr.Blocks() as app:
                     quota_details,
                     submission_id,
                     submission_file,
-                    clean_flag,
                     private_flag,
                     submit_btn,
                 ],
@@ -187,7 +244,9 @@ with gr.Blocks() as app:
             futures = submit_btn.click(
                 fn=submit,
-                inputs=[submission_id, submission_file, clean_flag, private_flag],
                 outputs=[result],
             ).then(quoata_check, outputs=[quota_details])
@@ -197,9 +256,9 @@ with gr.Blocks() as app:
                     outputs=[leaderboard_table],
                 )
-    copyright = gr.Markdown(
-        "Based on the [gradio-eval-server-template](https://github.com/paulgavrikov/gradio-eval-server-template) by Paul Gavrikov."
-    )
 if __name__ == "__main__":
@@ -211,4 +270,7 @@ if __name__ == "__main__":
     )
     limiter = RateLimiter(config)
     app.launch()

 import gradio as gr
 import json, os, time, uuid
 import pandas as pd
 from rate_limiter import RateLimiter, RateLimitConfig
 from config import *
+from judge import build_question_index_from_json, judge
+import pprint
+SUBMISSION_TEXT = """Upload a json file with your predictions and click Submit. Your predictions should be a list of dictionaries, each containing an \"question_id\" field and a \"response\" field. For multiple choice questions, the \"response\" field should contain the predicted answer choice. For open-ended questions, the \"response\" field should contain the option letter (A-D). We will apply simple heuistics to clean the responses, but please ensure they are as accurate as possible.
+Example:
+```
+[
+    {\"question_id\": \"28deb79e\", \"response\": \"A\"},
+    {\"question_id\": \"73cbabd7\", \"response\": \"C\"},
+    ...
+]
+```
+Your file:"""
+INTRO_TEXT = """# Welcome to the VisualOverload Leaderboard!
+Below you will find the public leaderboard for the [VisualOverload benchmark](https://huggingface.co/datasets/paulgavrikov/visualoverload), which evaluates models on their ability to understand and reason about complex visual scenes. We seperate by models and 'special' inference techniques (e.g., special prompts, ICL, CoT etc.) to better understand the source of their performance.
+The leaderboard ranks models based on their overall accuracy across a six tasks (activity recognition, attribute recognition, counting, OCR, reasoning, and global scene recognition). We provide an aggregate score (Total) as well as individual scores on three distinct splits per difficulty (Easy, Medium, Hard), and each task."""
+INTRO_DETAILS = "Please see the evaluation tab for evaluation and details on how to list your results."
+def load_ground_truth():
+    from cryptography.fernet import Fernet
+    key = os.getenv("SECRET_KEY")
+    cipher = Fernet(key)
+    with open("ground_truth.secret", "rb") as f:
+        json_data = json.loads(cipher.decrypt(f.read().decode()))
+    hardness_levels = json_data["splits"]
+    df_gt = pd.DataFrame.from_dict(json_data["benchmark"])
+    df_gt.question_id = df_gt.question_id.astype(str)
+    df_gt = df_gt.set_index("question_id")
+    for level, ids in hardness_levels.items():
+        ids = [str(i) for i in ids]
+        df_gt.loc[ids, "difficulty"] = level
+    return df_gt.reset_index()
 def login_check(profile: gr.OAuthProfile | None):
     visible = profile is not None
     welcome = (
+        f"## Welcome to the evaluation server @{profile.username} 👋"
         if visible
         else "🔒 Please sign in to submit."
     )
         gr.Markdown(value=quota_details, visible=visible),
         gr.Textbox(visible=visible & SAVE_SUBMISSIONS),
         gr.File(visible=visible),
+        # gr.Checkbox(visible=visible),
         gr.Checkbox(visible=visible & SAVE_SUBMISSIONS),
         gr.Button(visible=visible),
     )
 def submit(
     submission_id: str,
     submission_file: str,
+    # is_cleaning: bool,
     is_private: bool,
     profile: gr.OAuthProfile | None,
 ):
             prediction_json = json.load(file)
         except json.JSONDecodeError:
             raise gr.Error("❌ Submission file is invalid JSON.")
     try:
+        # if is_cleaning:
+        #     prediction_json = clean_submission(prediction_json)
+        # score_dict = evaluate_submission(prediction_json, ground_truth_json)
+        _, score_dict = judge(prediction_json, question_index)
+        score_dict = {k: round(v * 100, 1) for k, v in score_dict.items() if k.startswith("accuracy/")}
     except Exception as e:
         print(e)
+        raise gr.Error(f"❌ Invalid submission format. Check logs for details.")
     allowed, allowed_reason = limiter.is_allowed(username)
     status = limiter.get_status(username)
         json.dump(data, open(os.path.join(SUB_DIR, submission_record + ".json"), "w"))
+    score_response = f"""
+Your submission has been evaluated!
+```
+{pprint.pformat(score_dict, indent=4, sort_dicts=False)}
+```
+If you want your submission to appear on the public leaderboard, please follow the instructions to open a ticket at [https://github.com/paulgavrikov/visualoverload/issues](https://github.com/paulgavrikov/visualoverload/issues).
+"""
     return gr.Text(score_response, visible=True)
 def get_leaderboard() -> pd.DataFrame | str:
+    df = pd.read_csv("leaderboard.csv").set_index(["Model", "Special Inference"])
     df = df.sort_values(by="Total", ascending=False)
+    df = df.reset_index()
+    float_cols = df.select_dtypes(include=["float"]).columns
+    styler = df.style.format('{:.1f}', subset=float_cols)
+    return styler
 def get_quota(profile: gr.OAuthProfile | None = None):
     if SHOW_LEADERBOARD:
         with gr.Tab("🏆 Public Leaderboard"):
             leaderboard_heading_md = gr.Markdown(
+                INTRO_TEXT
             )
             leaderboard_table = gr.Dataframe(get_leaderboard())
             leaderboard_footer_md = gr.Markdown(
+                INTRO_DETAILS
             )
     if SHOW_EVAL_SERVER:
         with gr.Tab("🚀 Evaluation"):
             login_button = gr.LoginButton()
             welcome_md = gr.Markdown("🔒 Please sign in to submit.")
             welcome_details_md = gr.Markdown(
+                SUBMISSION_TEXT,
                 visible=False,
             )
             submission_file = gr.File(
             submission_id = gr.Textbox(
                 label="(Optional) Submission identifier", visible=False
             )
+            # clean_flag = gr.Checkbox(
+            #     label="Attempt to clean my submission (Recommended for raw responses)",
+            #     value=True,
+            #     visible=False,
+            # )
             private_flag = gr.Checkbox(
                 label="Do not save my submission", value=False, visible=False
             )
             quota_details = gr.Markdown(visible=False)
             submit_btn = gr.Button("Submit", visible=False)
+            result = gr.Markdown(label="✅ Submission processed", visible=False)
             # Load login state → show/hide components
             app.load(
                     quota_details,
                     submission_id,
                     submission_file,
+                    # clean_flag,
                     private_flag,
                     submit_btn,
                 ],
             futures = submit_btn.click(
                 fn=submit,
+                inputs=[submission_id, submission_file,
+                        # clean_flag,
+                        private_flag],
                 outputs=[result],
             ).then(quoata_check, outputs=[quota_details])
                     outputs=[leaderboard_table],
                 )
+    # copyright = gr.Markdown(
+    #     "Based on the [gradio-eval-server-template](https://github.com/paulgavrikov/gradio-eval-server-template) by Paul Gavrikov."
+    # )
 if __name__ == "__main__":
     )
     limiter = RateLimiter(config)
+    df_ground_truth = load_ground_truth()
+    question_index = build_question_index_from_json(df_ground_truth.to_dict(orient="records"))  # TODO: this should be precomputed once and reused
     app.launch()

config.py CHANGED Viewed

@@ -5,9 +5,9 @@ SUB_DIR = "./submissions"
 # Wait time for each submission in seconds, or 0 to disable minimum wait time
 RATE_LIMIT_MIN_INT_SEC = 10
 # Maximum total number of submissions per user, or 0 for no limit
-MAX_TOTAL_SUBMISSIONS_PER_USER = 40
 # Maxmimum number of submissions per user per day, or 0 for no limit
-MAX_SUBMISSIONS_PER_USER_PER_DAY = 10
 # Save submissions
 SAVE_SUBMISSIONS = False
 # Save submissions

 # Wait time for each submission in seconds, or 0 to disable minimum wait time
 RATE_LIMIT_MIN_INT_SEC = 10
 # Maximum total number of submissions per user, or 0 for no limit
+MAX_TOTAL_SUBMISSIONS_PER_USER = 0
 # Maxmimum number of submissions per user per day, or 0 for no limit
+MAX_SUBMISSIONS_PER_USER_PER_DAY = 5
 # Save submissions
 SAVE_SUBMISSIONS = False
 # Save submissions

eval_utils.py DELETED Viewed

@@ -1,40 +0,0 @@
-from collections import defaultdict
-class AverageMeter(object):
-    """Computes and stores the average and current value"""
-    def __init__(self):
-        self.reset()
-    def reset(self):
-        self.val = 0
-        self.avg = 0
-        self.sum = 0
-        self.count = 0
-    def update(self, val, n=1):
-        self.val = val
-        self.sum += val * n
-        self.count += n
-        self.avg = self.sum / self.count
-def clean_submission(preds):
-    cleaned_preds = preds
-    # TODO: Implement your cleaning logic here
-    return cleaned_preds
-def evaluate_submission(preds, ground_truth) -> dict:
-    # TODO: Implement your evaluation logic
-    accuracy_meter = defaultdict(AverageMeter)
-    accuracy_meter["partition1"].update(0.43)
-    accuracy_meter["partition2"].update(0.42)
-    accuracy_meter["partition3"].update(0.99)
-    return dict([(f"accuracy/{k}", 100 * v.avg) for k, v in accuracy_meter.items()])

ground_truth.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {}

ground_truth.secret ADDED Viewed

The diff for this file is too large to render. See raw diff

judge.py ADDED Viewed

	@@ -0,0 +1,473 @@

+import argparse
+import json
+from glob import glob
+import math
+from collections import defaultdict
+import re
+import pandas as pd
+def make_option_labels(options):
+    option_labels = []
+    for i, option in enumerate(options):
+        option_labels.append(f"{chr(65 + i)}. {option.strip()}")
+    return option_labels
+class AverageMeter(object):
+    """
+    Computes and stores the average and current value.
+    """
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+def clean_counting_answer(answer):
+    if answer is None or answer.strip() == "":
+        return 0
+    answer_matches = re.findall(r"<answer>(.*?)</answer>", answer, flags=re.DOTALL)
+    if len(answer_matches) > 0:
+        answer = answer_matches[0]
+    clean_answer = answer.strip().lower()
+    if clean_answer.endswith("."):
+        clean_answer = clean_answer[:-1].strip()
+    if clean_answer in words_to_int:
+        clean_answer = words_to_int[clean_answer]
+    else:
+        try:
+            clean_answer = int(round(float(clean_answer)))
+        except ValueError:
+            # or pick the LAST number in the string
+            match = re.search(r'\d+(?:\.\d+)?(?=(?![\s\S]*\d))', clean_answer)
+            if match:
+                clean_answer = int(round(float(match.group(0))))
+            else:
+                matched = False
+                # or pick the FIRST spelled out number in the string
+                for word, number in words_to_int.items():
+                    if word in clean_answer:
+                        clean_answer = number
+                        matched = True
+                        break
+                if not matched:
+                    print(f"WARNING: Unable to convert answer '{answer}' to int.")
+                    clean_answer = 0
+    return clean_answer
+def clean_multiple_choice_answer(answer, options):
+    if answer is None:
+        return ""
+    answer_matches = re.findall(r"<answer>(.*?)</answer>", answer, flags=re.DOTALL)
+    if len(answer_matches) > 0:
+        answer = answer_matches[0]
+    clean_answer = answer.strip()
+    if answer.startswith("Answer:"):
+        clean_answer = clean_answer.replace("Answer:", "").strip()
+    if answer.startswith("The answer is "):
+        clean_answer = clean_answer.replace("The answer is ", "").strip()
+    if answer.startswith("The best answer is "):
+        clean_answer = clean_answer.replace("The best answer is ", "").strip()
+    if answer.endswith("."):
+        clean_answer = clean_answer[:-1].strip()
+    if len(clean_answer) > 1:
+        # If the answer is longer than one character, we assume it may contain the full label, e.g. "A. option text"
+        for option in options:
+            if option in clean_answer:
+                clean_answer = option[0]
+                break
+    if len(clean_answer) > 1:
+        # If the answer is still longer than one character, we assume it is a short label, e.g. "A" or "B"
+        for option in options:
+            if option[0] in clean_answer:
+                clean_answer = option[0]
+                break
+    return clean_answer
+def clean_ocr_answer(answer):
+    answer_matches = re.findall(r"<answer>(.*?)</answer>", answer, flags=re.DOTALL)
+    if len(answer_matches) > 0:
+        answer = answer_matches[0]
+    clean_answer = answer.strip()
+    clean_answer = extract_text_from_quotes(clean_answer)
+    clean_answer = clean_text(clean_answer)
+    return clean_answer
+def validate_choice_answer(answer, benchmark_truth):
+    all_options = make_option_labels(benchmark_truth["options"])
+     # clean the reponse
+    clean_answer = clean_multiple_choice_answer(answer["response"], all_options)
+    # map the truth to option key
+    correct_option_index = benchmark_truth["options"].index(benchmark_truth["ground_truth"])
+    correct_option_enum = all_options[correct_option_index][0]
+    return (clean_answer == correct_option_enum)
+words_to_int = {
+    "zero": 0,
+    "one": 1,
+    "two": 2,
+    "three": 3,
+    "four": 4,
+    "five": 5,
+    "six": 6,
+    "seven": 7,
+    "eight": 8,
+    "nine": 9,
+    "ten": 10,
+    "eleven": 11,
+    "twelve": 12,
+    "thirteen": 13,
+    "fourteen": 14,
+    "fifteen": 15,
+    "sixteen": 16,
+    "seventeen": 17,
+    "eighteen": 18,
+    "nineteen": 19,
+    "twenty": 20,
+    "thirty": 30,
+    "forty": 40,
+    "fifty": 50,
+    "sixty": 60,
+    "seventy": 70,
+    "eighty": 80,
+    "ninety": 90,
+    "hundred": 100,
+    "thousand": 1000,
+    "million": 1000000,
+}
+def clean_text(s):
+    replace_characters = {
+        "ä": "a",
+        "á": "a",
+        "à": "a",
+        "â": "a",
+        "ã": "a",
+        "å": "a",
+        "ā": "a",
+        "ö": "o",
+        "ó": "o",
+        "ò": "o",
+        "ô": "o",
+        "õ": "o",
+        "ō": "o",
+        "ü": "u",
+        "ú": "u",
+        "ù": "u",
+        "û": "u",
+        "ū": "u",
+        "é": "e",
+        "ĕ": "e",
+        "ė": "e",
+        "ę": "e",
+        "ě": "e",
+        "ç": "c",
+        "ć": "c",
+        "č": "c",
+        "ñ": "n",
+        "ń": "n",
+        "ń": "n",
+        "ł": "l",
+        "ś": "s",
+        "š": "s",
+        "ź": "z",
+        "ż": "z",
+        "ý": "y",
+        "ŷ": "y",
+        "ÿ": "y",
+        "œ": "oe",
+        "æ": "ae",
+        "v": "u"
+    }
+    s = s.strip().lower()
+    # replace special characters
+    for char, replacement in replace_characters.items():
+        s = s.replace(char, replacement)
+    s = s.replace("\t", "").replace("\n", "").replace(".", "").replace("&", "").replace(";", "")\
+        .replace(",", "").replace("-", "").replace("–", "").replace("’", "'").replace(":", "").replace("·", " ")\
+            .replace("'", "").replace("“", "").replace("”", "").replace('"', "").replace("•", " ")\
+                .replace("  ", " ")
+    return s
+def extract_text_from_quotes(s):
+    pattern = r"'([^']*?)\"|\"([^\"]*?)\"|`([^`]*?)`|“([^”]*?)”|‘([^’]*?)’"
+    matches = re.findall(pattern, s)
+    if matches:
+        # Extract and return the single letter
+        matched_group = [match for group in matches for match in group if match][0]
+        return matched_group
+    else:
+        # Return the original string if no match is found
+        return s
+def main(args):
+    try:
+        from pprint import pprint as print
+    except ImportError:
+        pass
+    question_index = build_question_index_from_file(args.benchmark_file)
+    for file in glob(args.results_file):
+        print(f"Judging {file}")
+        _, scores = judge_file(file, question_index)
+        print(scores)
+def build_question_index_from_file(benchmark_file):
+    with open(benchmark_file, "r") as f:
+        json_data = json.load(f)
+    hardness_levels = json_data["splits"]
+    df_gt = pd.DataFrame.from_dict(json_data["benchmark"]).set_index("question_id")
+    for level, ids in hardness_levels.items():
+        df_gt.loc[ids, "difficulty"] = level
+    return build_question_index_from_json(df_gt.reset_index().to_dict(orient="records"))
+def build_question_index_from_json(benchmark_data):
+    question_index = {}
+    for question in benchmark_data:
+        question_id = question["question_id"]
+        question_index[question_id] = question
+    return question_index
+def judge_file(results_file, question_index):
+    with open(results_file, "r") as f:
+        results_data = json.load(f)
+        for answer in results_data:
+            answer["question_id"] = str(answer["question_id"]) # ensure question_id is string
+    return judge(results_data, question_index)
+def judge(results_data, question_index):
+    answer_index = {}
+    for answer in results_data:
+        answer_index[answer["question_id"]] = answer
+    non_answered_questions = set(question_index.keys()) - set(answer_index.keys())
+    excessive_answers = set(answer_index.keys()) - set(question_index.keys())
+    if len(non_answered_questions) > 0:
+        print("WARNING: Some question IDs in benchmark data are not found in results file:")
+        print(non_answered_questions)
+        results_data = results_data + [{"question_id": qid, "response": ""} for qid in non_answered_questions]
+    else:
+        print("All question IDs in benchmark data are found in results file.")
+    if len(excessive_answers) > 0:
+        print("WARNING: Some question IDs in results file are not found in benchmark data:")
+        print(excessive_answers)
+        print("These questions will be ignored in the evaluation.")
+        results_data = [answer for answer in results_data if answer["question_id"] in question_index]
+    else:
+        print("All question IDs in results file are found in benchmark data.")
+    print()
+    accuracy_meters = defaultdict(AverageMeter)
+    # process counting data and compute accuracy and MAE
+    correct = 0
+    total = 0
+    mae = 0
+    mse = 0
+    for answer in results_data:
+        if answer["question_id"] not in question_index:
+            continue
+        benchmark_truth = question_index[answer["question_id"]]
+        if benchmark_truth["question_type"] != "counting":
+            continue
+        clean_answer = clean_counting_answer(answer["response"])
+        gt = benchmark_truth["ground_truth"]
+        difference = abs(clean_answer - gt)
+        mae += difference
+        mse += difference ** 2
+        is_correct = (clean_answer == gt)
+        correct_count = 1 if is_correct else 0
+        answer["judge/correct"] = is_correct
+        answer["judge/extracted_answer"] = clean_answer
+        correct += correct_count
+        total += 1
+        accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+    # process OCR data and compute accuracy and ESD
+    correct = 0
+    total = 0
+    for answer in results_data:
+        if answer["question_id"] not in question_index:
+            continue
+        benchmark_truth = question_index[answer["question_id"]]
+        if benchmark_truth["question_type"] != "ocr":
+            continue
+        # clean the reponse
+        clean_answer = clean_ocr_answer(answer["response"])
+        clean_gt = clean_text(benchmark_truth["ground_truth"])
+        answer["judge/extracted_answer"] = clean_answer
+        is_correct = clean_answer == clean_gt
+        correct_count = 1 if is_correct else 0
+        answer["judge/correct"] = is_correct
+        correct += correct_count
+        total += 1
+        accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+    # process multiple choice data without binary options and compute accuracy
+    correct = 0
+    total = 0
+    for answer in results_data:
+        if answer["question_id"] not in question_index:
+            continue
+        benchmark_truth = question_index[answer["question_id"]]
+        if benchmark_truth["question_type"] != "choice" or len(benchmark_truth["options"]) != 4:
+            continue
+        is_correct = validate_choice_answer(answer, benchmark_truth)
+        correct_count = 1 if is_correct else 0
+        answer["judge/correct"] = is_correct
+        correct += correct_count
+        total += 1
+        accuracy_meters[benchmark_truth["source_file"]].update(correct_count)
+    # process binary choice data and compute accuracy
+    correct = 0
+    total = 0
+    for answer in results_data:
+        if answer["question_id"] not in question_index:
+            continue
+        benchmark_truth = question_index[answer["question_id"]]
+        if benchmark_truth["question_type"] != "choice" or len(benchmark_truth["options"]) != 2:
+            continue
+        is_correct = validate_choice_answer(answer, benchmark_truth)
+        correct_count = 1 if is_correct else 0
+        answer["judge/correct"] = is_correct
+        correct += correct_count
+        total += 1
+    # process binary choice data with correction and compute accuracy
+    correct = 0
+    total = 0
+    opposite_error_pairs = []
+    for answer in results_data:
+        if answer["question_id"] not in question_index:
+            continue
+        benchmark_truth = question_index[answer["question_id"]]
+        # skip if the question is not a binary choice or does not have an opposite
+        if benchmark_truth["question_type"] != "choice" or len(benchmark_truth["options"]) != 2 or pd.isna(benchmark_truth["opposite_of"]):
+            continue
+        is_correct = validate_choice_answer(answer, benchmark_truth)
+        if benchmark_truth["opposite_of"] in answer_index:
+            is_opposite_correct = validate_choice_answer(answer_index[benchmark_truth["opposite_of"]], question_index[benchmark_truth["opposite_of"]])
+            answer_index[benchmark_truth["opposite_of"]]["judge/correct"] = is_opposite_correct
+        else:
+            is_opposite_correct = False
+        answer["judge/correct"] = is_correct
+        if is_correct and is_opposite_correct:
+            correct += 1
+            accuracy_meters[benchmark_truth["source_file"]].update(1)
+        else:
+            opposite_error_pairs.append((answer["question_id"], benchmark_truth["opposite_of"]))
+            accuracy_meters[benchmark_truth["source_file"]].update(0)
+        total += 1
+    df_preds = pd.DataFrame(results_data).set_index("question_id")
+    df_gt = pd.DataFrame.from_dict(question_index).T.set_index("question_id")
+    df = df_preds.join(df_gt)
+    scores = {
+        "is_complete": len(non_answered_questions) == 0,
+        "is_excessive": len(excessive_answers) > 0,
+        **dict([
+            ("accuracy/" + k.replace(".csv", ""), (v.sum/v.count)) for k, v in accuracy_meters.items()
+        ]),
+        "accuracy/easy": df.query("difficulty == 'easy'")["judge/correct"].mean(),
+        "accuracy/medium": df.query("difficulty == 'medium'")["judge/correct"].mean(),
+        "accuracy/hard": df.query("difficulty == 'hard'")["judge/correct"].mean(),
+        "accuracy/total": df["judge/correct"].mean(),
+    }
+    return results_data, scores