wip
Browse files
- data/examples/example1.json +0 -1
- data/examples/example2.json +0 -1
- data/examples/example3.json +0 -1
- data/judges/judge1.json +0 -1
- data/judges/judge2.json +0 -1
- data/judges/judge3.json +0 -1
- data/judges/judge4.json +0 -1
- src/app.py +36 -5
- src/data_manager.py +10 -3
data/examples/example1.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "example1", "input": "Write a poem about the ocean.", "output": "The waves crash and foam,\nSalt spray fills the air like mist,\nOcean breathes deeply."}
|
|
|
|
|
|
data/examples/example2.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "example2", "input": "Explain how photosynthesis works.", "output": "Photosynthesis is the process where plants convert sunlight, water, and carbon dioxide into glucose and oxygen. The chlorophyll in plant cells captures light energy, which is then used to convert CO2 and water into glucose, releasing oxygen as a byproduct."}
|
|
|
|
|
|
data/examples/example3.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "example3", "input": "Solve this math problem: If x + y = 10 and x - y = 4, what are x and y?", "output": "To solve this system of equations:\nx + y = 10\nx - y = 4\n\nAdd these equations:\n2x = 14\nx = 7\n\nSubstitute back:\n7 + y = 10\ny = 3\n\nTherefore, x = 7 and y = 3."}
|
|
|
|
|
|
data/judges/judge1.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "judge1", "name": "EvalGPT", "description": "A comprehensive evaluation model focused on accuracy and completeness"}
|
|
|
|
|
|
data/judges/judge2.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "judge2", "name": "CritiqueBot", "description": "An evaluation model specializing in identifying factual errors"}
|
|
|
|
|
|
data/judges/judge3.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "judge3", "name": "GradeAssist", "description": "A holistic evaluation model that balances substance and style"}
|
|
|
|
|
|
data/judges/judge4.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"id": "judge4", "name": "PrecisionJudge", "description": "A technical evaluator that emphasizes precision and correctness"}
|
|
|
|
|
|
src/app.py
CHANGED
|
@@ -120,7 +120,16 @@ def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
|
|
| 120 |
except Exception as e:
|
| 121 |
logger.error(f"Error getting example: {e}")
|
| 122 |
# Return empty strings for all fields
|
| 123 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def submit_example(
|
|
@@ -212,7 +221,13 @@ def get_evaluation1(
|
|
| 212 |
|
| 213 |
# Format inputs based on test type
|
| 214 |
input_text, output_text = format_inputs_for_evaluation(
|
| 215 |
-
text_input,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
)
|
| 217 |
|
| 218 |
# Get evaluation from the first judge
|
|
@@ -249,13 +264,29 @@ def get_evaluation2(
|
|
| 249 |
|
| 250 |
try:
|
| 251 |
if not selected_judges or len(selected_judges) < 2:
|
| 252 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
logger.info(
|
|
|
|
|
|
|
| 255 |
|
| 256 |
# Format inputs based on test type
|
| 257 |
input_text, output_text = format_inputs_for_evaluation(
|
| 258 |
-
text_input,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
)
|
| 260 |
|
| 261 |
# Get evaluation from the second judge
|
|
|
|
| 120 |
except Exception as e:
|
| 121 |
logger.error(f"Error getting example: {e}")
|
| 122 |
# Return empty strings for all fields
|
| 123 |
+
return (
|
| 124 |
+
"",
|
| 125 |
+
"",
|
| 126 |
+
"",
|
| 127 |
+
"",
|
| 128 |
+
"",
|
| 129 |
+
"",
|
| 130 |
+
"",
|
| 131 |
+
"",
|
| 132 |
+
)
|
| 133 |
|
| 134 |
|
| 135 |
def submit_example(
|
|
|
|
| 221 |
|
| 222 |
# Format inputs based on test type
|
| 223 |
input_text, output_text = format_inputs_for_evaluation(
|
| 224 |
+
text_input,
|
| 225 |
+
claim_input,
|
| 226 |
+
single_text_input,
|
| 227 |
+
policy_input,
|
| 228 |
+
policy_output,
|
| 229 |
+
policy_assertion,
|
| 230 |
+
test_type,
|
| 231 |
)
|
| 232 |
|
| 233 |
# Get evaluation from the first judge
|
|
|
|
| 264 |
|
| 265 |
try:
|
| 266 |
if not selected_judges or len(selected_judges) < 2:
|
| 267 |
+
return (
|
| 268 |
+
"No judges selected",
|
| 269 |
+
gr.update(
|
| 270 |
+
visible=False,
|
| 271 |
+
),
|
| 272 |
+
gr.update(
|
| 273 |
+
visible=False,
|
| 274 |
+
),
|
| 275 |
+
)
|
| 276 |
|
| 277 |
+
logger.info(
|
| 278 |
+
f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
|
| 279 |
+
)
|
| 280 |
|
| 281 |
# Format inputs based on test type
|
| 282 |
input_text, output_text = format_inputs_for_evaluation(
|
| 283 |
+
text_input,
|
| 284 |
+
claim_input,
|
| 285 |
+
single_text_input,
|
| 286 |
+
policy_input,
|
| 287 |
+
policy_output,
|
| 288 |
+
policy_assertion,
|
| 289 |
+
test_type,
|
| 290 |
)
|
| 291 |
|
| 292 |
# Get evaluation from the second judge
|
src/data_manager.py
CHANGED
|
@@ -26,7 +26,10 @@ class DatasetManager:
|
|
| 26 |
test_type_kebab = test_type.replace(" ", "-")
|
| 27 |
dataset_name = f"{DEFAULT_DATASET_PREFIX}-{test_type_kebab}"
|
| 28 |
logger.info(f"Loading dataset: {dataset_name}")
|
| 29 |
-
self.datasets[test_type] = load_dataset(
|
|
|
|
|
|
|
|
|
|
| 30 |
dataset_names.append(dataset_name)
|
| 31 |
except Exception as e:
|
| 32 |
logger.error(f"Failed to load dataset {dataset_name}: {e}")
|
|
@@ -65,7 +68,9 @@ def load_models() -> List[Dict[str, Any]]:
|
|
| 65 |
try:
|
| 66 |
models.append(json.loads(line))
|
| 67 |
except json.JSONDecodeError as json_err:
|
| 68 |
-
logger.warning(
|
|
|
|
|
|
|
| 69 |
return models
|
| 70 |
except Exception as e:
|
| 71 |
logger.error(f"Error loading models: {e}")
|
|
@@ -83,7 +88,9 @@ def get_random_example(test_type: str) -> Dict[str, str]:
|
|
| 83 |
try:
|
| 84 |
dataset_name = DATASET_MAPPING.get(test_type)
|
| 85 |
if not dataset_name:
|
| 86 |
-
logger.warning(
|
|
|
|
|
|
|
| 87 |
return {
|
| 88 |
"text": f"Sample text for {test_type}",
|
| 89 |
"claim": f"Sample claim for {test_type}",
|
|
|
|
| 26 |
test_type_kebab = test_type.replace(" ", "-")
|
| 27 |
dataset_name = f"{DEFAULT_DATASET_PREFIX}-{test_type_kebab}"
|
| 28 |
logger.info(f"Loading dataset: {dataset_name}")
|
| 29 |
+
self.datasets[test_type] = load_dataset(
|
| 30 |
+
dataset_name,
|
| 31 |
+
split="train",
|
| 32 |
+
)
|
| 33 |
dataset_names.append(dataset_name)
|
| 34 |
except Exception as e:
|
| 35 |
logger.error(f"Failed to load dataset {dataset_name}: {e}")
|
|
|
|
| 68 |
try:
|
| 69 |
models.append(json.loads(line))
|
| 70 |
except json.JSONDecodeError as json_err:
|
| 71 |
+
logger.warning(
|
| 72 |
+
f"Skipping invalid JSON in line: {line}. Error: {json_err}",
|
| 73 |
+
)
|
| 74 |
return models
|
| 75 |
except Exception as e:
|
| 76 |
logger.error(f"Error loading models: {e}")
|
|
|
|
| 88 |
try:
|
| 89 |
dataset_name = DATASET_MAPPING.get(test_type)
|
| 90 |
if not dataset_name:
|
| 91 |
+
logger.warning(
|
| 92 |
+
f"No dataset mapping found for test type: {test_type}",
|
| 93 |
+
)
|
| 94 |
return {
|
| 95 |
"text": f"Sample text for {test_type}",
|
| 96 |
"claim": f"Sample claim for {test_type}",
|