dror44 committed on
Commit
5bed6f3
·
1 Parent(s): 3df66f9
data/examples/example1.json DELETED
@@ -1 +0,0 @@
1
- {"id": "example1", "input": "Write a poem about the ocean.", "output": "The waves crash and foam,\nSalt spray fills the air like mist,\nOcean breathes deeply."}
 
 
data/examples/example2.json DELETED
@@ -1 +0,0 @@
1
- {"id": "example2", "input": "Explain how photosynthesis works.", "output": "Photosynthesis is the process where plants convert sunlight, water, and carbon dioxide into glucose and oxygen. The chlorophyll in plant cells captures light energy, which is then used to convert CO2 and water into glucose, releasing oxygen as a byproduct."}
 
 
data/examples/example3.json DELETED
@@ -1 +0,0 @@
1
- {"id": "example3", "input": "Solve this math problem: If x + y = 10 and x - y = 4, what are x and y?", "output": "To solve this system of equations:\nx + y = 10\nx - y = 4\n\nAdd these equations:\n2x = 14\nx = 7\n\nSubstitute back:\n7 + y = 10\ny = 3\n\nTherefore, x = 7 and y = 3."}
 
 
data/judges/judge1.json DELETED
@@ -1 +0,0 @@
1
- {"id": "judge1", "name": "EvalGPT", "description": "A comprehensive evaluation model focused on accuracy and completeness"}
 
 
data/judges/judge2.json DELETED
@@ -1 +0,0 @@
1
- {"id": "judge2", "name": "CritiqueBot", "description": "An evaluation model specializing in identifying factual errors"}
 
 
data/judges/judge3.json DELETED
@@ -1 +0,0 @@
1
- {"id": "judge3", "name": "GradeAssist", "description": "A holistic evaluation model that balances substance and style"}
 
 
data/judges/judge4.json DELETED
@@ -1 +0,0 @@
1
- {"id": "judge4", "name": "PrecisionJudge", "description": "A technical evaluator that emphasizes precision and correctness"}
 
 
src/app.py CHANGED
@@ -120,7 +120,16 @@ def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
120
  except Exception as e:
121
  logger.error(f"Error getting example: {e}")
122
  # Return empty strings for all fields
123
- return "", "", "", "", "", "", "", ""
 
 
 
 
 
 
 
 
 
124
 
125
 
126
  def submit_example(
@@ -212,7 +221,13 @@ def get_evaluation1(
212
 
213
  # Format inputs based on test type
214
  input_text, output_text = format_inputs_for_evaluation(
215
- text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type
 
 
 
 
 
 
216
  )
217
 
218
  # Get evaluation from the first judge
@@ -249,13 +264,29 @@ def get_evaluation2(
249
 
250
  try:
251
  if not selected_judges or len(selected_judges) < 2:
252
- return "No judges selected", gr.update(visible=False), gr.update(visible=False)
 
 
 
 
 
 
 
 
253
 
254
- logger.info(f"Starting evaluation 2 with judge {selected_judges[1]['name']}")
 
 
255
 
256
  # Format inputs based on test type
257
  input_text, output_text = format_inputs_for_evaluation(
258
- text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type
 
 
 
 
 
 
259
  )
260
 
261
  # Get evaluation from the second judge
 
120
  except Exception as e:
121
  logger.error(f"Error getting example: {e}")
122
  # Return empty strings for all fields
123
+ return (
124
+ "",
125
+ "",
126
+ "",
127
+ "",
128
+ "",
129
+ "",
130
+ "",
131
+ "",
132
+ )
133
 
134
 
135
  def submit_example(
 
221
 
222
  # Format inputs based on test type
223
  input_text, output_text = format_inputs_for_evaluation(
224
+ text_input,
225
+ claim_input,
226
+ single_text_input,
227
+ policy_input,
228
+ policy_output,
229
+ policy_assertion,
230
+ test_type,
231
  )
232
 
233
  # Get evaluation from the first judge
 
264
 
265
  try:
266
  if not selected_judges or len(selected_judges) < 2:
267
+ return (
268
+ "No judges selected",
269
+ gr.update(
270
+ visible=False,
271
+ ),
272
+ gr.update(
273
+ visible=False,
274
+ ),
275
+ )
276
 
277
+ logger.info(
278
+ f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
279
+ )
280
 
281
  # Format inputs based on test type
282
  input_text, output_text = format_inputs_for_evaluation(
283
+ text_input,
284
+ claim_input,
285
+ single_text_input,
286
+ policy_input,
287
+ policy_output,
288
+ policy_assertion,
289
+ test_type,
290
  )
291
 
292
  # Get evaluation from the second judge
src/data_manager.py CHANGED
@@ -26,7 +26,10 @@ class DatasetManager:
26
  test_type_kebab = test_type.replace(" ", "-")
27
  dataset_name = f"{DEFAULT_DATASET_PREFIX}-{test_type_kebab}"
28
  logger.info(f"Loading dataset: {dataset_name}")
29
- self.datasets[test_type] = load_dataset(dataset_name, split="train")
 
 
 
30
  dataset_names.append(dataset_name)
31
  except Exception as e:
32
  logger.error(f"Failed to load dataset {dataset_name}: {e}")
@@ -65,7 +68,9 @@ def load_models() -> List[Dict[str, Any]]:
65
  try:
66
  models.append(json.loads(line))
67
  except json.JSONDecodeError as json_err:
68
- logger.warning(f"Skipping invalid JSON in line: {line}. Error: {json_err}")
 
 
69
  return models
70
  except Exception as e:
71
  logger.error(f"Error loading models: {e}")
@@ -83,7 +88,9 @@ def get_random_example(test_type: str) -> Dict[str, str]:
83
  try:
84
  dataset_name = DATASET_MAPPING.get(test_type)
85
  if not dataset_name:
86
- logger.warning(f"No dataset mapping found for test type: {test_type}")
 
 
87
  return {
88
  "text": f"Sample text for {test_type}",
89
  "claim": f"Sample claim for {test_type}",
 
26
  test_type_kebab = test_type.replace(" ", "-")
27
  dataset_name = f"{DEFAULT_DATASET_PREFIX}-{test_type_kebab}"
28
  logger.info(f"Loading dataset: {dataset_name}")
29
+ self.datasets[test_type] = load_dataset(
30
+ dataset_name,
31
+ split="train",
32
+ )
33
  dataset_names.append(dataset_name)
34
  except Exception as e:
35
  logger.error(f"Failed to load dataset {dataset_name}: {e}")
 
68
  try:
69
  models.append(json.loads(line))
70
  except json.JSONDecodeError as json_err:
71
+ logger.warning(
72
+ f"Skipping invalid JSON in line: {line}. Error: {json_err}",
73
+ )
74
  return models
75
  except Exception as e:
76
  logger.error(f"Error loading models: {e}")
 
88
  try:
89
  dataset_name = DATASET_MAPPING.get(test_type)
90
  if not dataset_name:
91
+ logger.warning(
92
+ f"No dataset mapping found for test type: {test_type}",
93
+ )
94
  return {
95
  "text": f"Sample text for {test_type}",
96
  "claim": f"Sample claim for {test_type}",