EvalArena

Running

dror44 committed on Apr 24, 2025

Commit

5bed6f3

1 Parent(s): 3df66f9

wip

Files changed (9) hide show

data/examples/example1.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "example1", "input": "Write a poem about the ocean.", "output": "The waves crash and foam,\nSalt spray fills the air like mist,\nOcean breathes deeply."}

data/examples/example2.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "example2", "input": "Explain how photosynthesis works.", "output": "Photosynthesis is the process where plants convert sunlight, water, and carbon dioxide into glucose and oxygen. The chlorophyll in plant cells captures light energy, which is then used to convert CO2 and water into glucose, releasing oxygen as a byproduct."}

data/examples/example3.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "example3", "input": "Solve this math problem: If x + y = 10 and x - y = 4, what are x and y?", "output": "To solve this system of equations:\nx + y = 10\nx - y = 4\n\nAdd these equations:\n2x = 14\nx = 7\n\nSubstitute back:\n7 + y = 10\ny = 3\n\nTherefore, x = 7 and y = 3."}

data/judges/judge1.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "judge1", "name": "EvalGPT", "description": "A comprehensive evaluation model focused on accuracy and completeness"}

data/judges/judge2.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "judge2", "name": "CritiqueBot", "description": "An evaluation model specializing in identifying factual errors"}

data/judges/judge3.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "judge3", "name": "GradeAssist", "description": "A holistic evaluation model that balances substance and style"}

data/judges/judge4.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"id": "judge4", "name": "PrecisionJudge", "description": "A technical evaluator that emphasizes precision and correctness"}

src/app.py CHANGED Viewed

@@ -120,7 +120,16 @@ def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
     except Exception as e:
         logger.error(f"Error getting example: {e}")
         # Return empty strings for all fields
-        return "", "", "", "", "", "", "", ""
 def submit_example(
@@ -212,7 +221,13 @@ def get_evaluation1(
         # Format inputs based on test type
         input_text, output_text = format_inputs_for_evaluation(
-            text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type
         )
         # Get evaluation from the first judge
@@ -249,13 +264,29 @@ def get_evaluation2(
     try:
         if not selected_judges or len(selected_judges) < 2:
-            return "No judges selected", gr.update(visible=False), gr.update(visible=False)
-        logger.info(f"Starting evaluation 2 with judge {selected_judges[1]['name']}")
         # Format inputs based on test type
         input_text, output_text = format_inputs_for_evaluation(
-            text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type
         )
         # Get evaluation from the second judge

     except Exception as e:
         logger.error(f"Error getting example: {e}")
         # Return empty strings for all fields
+        return (
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+            "",
+        )
 def submit_example(
         # Format inputs based on test type
         input_text, output_text = format_inputs_for_evaluation(
+            text_input,
+            claim_input,
+            single_text_input,
+            policy_input,
+            policy_output,
+            policy_assertion,
+            test_type,
         )
         # Get evaluation from the first judge
     try:
         if not selected_judges or len(selected_judges) < 2:
+            return (
+                "No judges selected",
+                gr.update(
+                    visible=False,
+                ),
+                gr.update(
+                    visible=False,
+                ),
+            )
+        logger.info(
+            f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
+        )
         # Format inputs based on test type
         input_text, output_text = format_inputs_for_evaluation(
+            text_input,
+            claim_input,
+            single_text_input,
+            policy_input,
+            policy_output,
+            policy_assertion,
+            test_type,
         )
         # Get evaluation from the second judge

src/data_manager.py CHANGED Viewed

@@ -26,7 +26,10 @@ class DatasetManager:
                 test_type_kebab = test_type.replace(" ", "-")
                 dataset_name = f"{DEFAULT_DATASET_PREFIX}-{test_type_kebab}"
                 logger.info(f"Loading dataset: {dataset_name}")
-                self.datasets[test_type] = load_dataset(dataset_name, split="train")
                 dataset_names.append(dataset_name)
             except Exception as e:
                 logger.error(f"Failed to load dataset {dataset_name}: {e}")
@@ -65,7 +68,9 @@ def load_models() -> List[Dict[str, Any]]:
                     try:
                         models.append(json.loads(line))
                     except json.JSONDecodeError as json_err:
-                        logger.warning(f"Skipping invalid JSON in line: {line}. Error: {json_err}")
         return models
     except Exception as e:
         logger.error(f"Error loading models: {e}")
@@ -83,7 +88,9 @@ def get_random_example(test_type: str) -> Dict[str, str]:
     try:
         dataset_name = DATASET_MAPPING.get(test_type)
         if not dataset_name:
-            logger.warning(f"No dataset mapping found for test type: {test_type}")
             return {
                 "text": f"Sample text for {test_type}",
                 "claim": f"Sample claim for {test_type}",

                 test_type_kebab = test_type.replace(" ", "-")
                 dataset_name = f"{DEFAULT_DATASET_PREFIX}-{test_type_kebab}"
                 logger.info(f"Loading dataset: {dataset_name}")
+                self.datasets[test_type] = load_dataset(
+                    dataset_name,
+                    split="train",
+                )
                 dataset_names.append(dataset_name)
             except Exception as e:
                 logger.error(f"Failed to load dataset {dataset_name}: {e}")
                     try:
                         models.append(json.loads(line))
                     except json.JSONDecodeError as json_err:
+                        logger.warning(
+                            f"Skipping invalid JSON in line: {line}. Error: {json_err}",
+                        )
         return models
     except Exception as e:
         logger.error(f"Error loading models: {e}")
     try:
         dataset_name = DATASET_MAPPING.get(test_type)
         if not dataset_name:
+            logger.warning(
+                f"No dataset mapping found for test type: {test_type}",
+            )
             return {
                 "text": f"Sample text for {test_type}",
                 "claim": f"Sample claim for {test_type}",