Spaces:

UIIAmerica
/

MedVidBench-Leaderboard

Sleeping

MedGRPO Team committed 12 days ago

Commit

2bd924c

1 Parent(s): ebf8102

Filter DVC/VS/RC from tasks list when skip-llm-judge is set

Files changed (1) hide show

evaluation/evaluate_all_pai.py CHANGED Viewed

@@ -773,6 +773,14 @@ def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=
     print(f"\nRunning evaluation for tasks: {tasks}", flush=True)
     print(f"Total tasks to evaluate: {len(tasks)}", flush=True)
     # Dictionary to store all evaluation results
     all_task_results = {}

     print(f"\nRunning evaluation for tasks: {tasks}", flush=True)
     print(f"Total tasks to evaluate: {len(tasks)}", flush=True)
+    # Filter out LLM judge tasks if skip flag is set
+    if skip_llm_judge:
+        original_tasks = tasks.copy()
+        tasks = [t for t in tasks if t not in ['dvc', 'vs', 'rc']]
+        if len(tasks) < len(original_tasks):
+            print(f"Skipping LLM judge tasks: {[t for t in original_tasks if t not in tasks]}", flush=True)
+            print(f"Evaluating {len(tasks)} tasks: {tasks}", flush=True)
     # Dictionary to store all evaluation results
     all_task_results = {}