Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -112,6 +112,7 @@ def add_new_eval(
|
|
| 112 |
|
| 113 |
with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
|
| 114 |
with open(file_path, 'r') as f:
|
|
|
|
| 115 |
for ix, line in enumerate(f):
|
| 116 |
try:
|
| 117 |
task = json.loads(line)
|
|
@@ -141,12 +142,17 @@ def add_new_eval(
|
|
| 141 |
)
|
| 142 |
|
| 143 |
all_scores.append({"score": score, "has_ans": has_ans, "model_answer": answer, 'id': task_id})
|
| 144 |
-
|
| 145 |
scores += score
|
| 146 |
num_questions += 1
|
| 147 |
difficulty_scores[difficulty] += score
|
| 148 |
difficulty_counts[difficulty] += 1
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
|
| 151 |
accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
|
| 152 |
accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0
|
|
|
|
| 112 |
|
| 113 |
with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
|
| 114 |
with open(file_path, 'r') as f:
|
| 115 |
+
submitted_ids = set()
|
| 116 |
for ix, line in enumerate(f):
|
| 117 |
try:
|
| 118 |
task = json.loads(line)
|
|
|
|
| 142 |
)
|
| 143 |
|
| 144 |
all_scores.append({"score": score, "has_ans": has_ans, "model_answer": answer, 'id': task_id})
|
| 145 |
+
submitted_ids.add(task["id"])
|
| 146 |
scores += score
|
| 147 |
num_questions += 1
|
| 148 |
difficulty_scores[difficulty] += score
|
| 149 |
difficulty_counts[difficulty] += 1
|
| 150 |
|
| 151 |
+
# Reject the submission unless every gold-answer ID in the test split is present in it
|
| 152 |
+
missing_ids = set(gold_answers["test"].keys()) - submitted_ids
|
| 153 |
+
if missing_ids:
|
| 154 |
+
return format_error(f"Submission is missing the following IDs: {', '.join(missing_ids)}")
|
| 155 |
+
|
| 156 |
accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
|
| 157 |
accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
|
| 158 |
accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0
|