Spaces:
Sleeping
Sleeping
MedGRPO Team
committed on
Commit
·
2bd924c
1
Parent(s):
ebf8102
Filter DVC/VS/RC from tasks list when skip-llm-judge is set
Browse files
evaluation/evaluate_all_pai.py
CHANGED
|
@@ -773,6 +773,14 @@ def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=
|
|
| 773 |
print(f"\nRunning evaluation for tasks: {tasks}", flush=True)
|
| 774 |
print(f"Total tasks to evaluate: {len(tasks)}", flush=True)
|
| 775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
# Dictionary to store all evaluation results
|
| 777 |
all_task_results = {}
|
| 778 |
|
|
|
|
| 773 |
print(f"\nRunning evaluation for tasks: {tasks}", flush=True)
|
| 774 |
print(f"Total tasks to evaluate: {len(tasks)}", flush=True)
|
| 775 |
|
| 776 |
+
# Filter out LLM judge tasks if skip flag is set
|
| 777 |
+
if skip_llm_judge:
|
| 778 |
+
original_tasks = tasks.copy()
|
| 779 |
+
tasks = [t for t in tasks if t not in ['dvc', 'vs', 'rc']]
|
| 780 |
+
if len(tasks) < len(original_tasks):
|
| 781 |
+
print(f"Skipping LLM judge tasks: {[t for t in original_tasks if t not in tasks]}", flush=True)
|
| 782 |
+
print(f"Evaluating {len(tasks)} tasks: {tasks}", flush=True)
|
| 783 |
+
|
| 784 |
# Dictionary to store all evaluation results
|
| 785 |
all_task_results = {}
|
| 786 |
|