Spaces:
Sleeping
Sleeping
Update server/code_assessment_environment.py
#16
by rsaibhargav - opened
server/code_assessment_environment.py
CHANGED
|
@@ -685,13 +685,16 @@ class CodeAssessmentEnvironment(Environment):
|
|
| 685 |
# Grading dispatch
|
| 686 |
# ------------------------------------------------------------------
|
| 687 |
def _grade(self, task_type: str, answer: str, problem: Dict) -> Tuple[bool, float, str]:
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
| 695 |
|
| 696 |
# ── Task 1: Correctness Check ─────────────────────────────────────
|
| 697 |
def _grade_correctness(self, answer: str, problem: Dict) -> Tuple[bool, float, str]:
|
|
@@ -729,17 +732,19 @@ class CodeAssessmentEnvironment(Environment):
|
|
| 729 |
given_rating = parts[0] if parts else ""
|
| 730 |
rating_match = expected_rating in given_rating or given_rating in expected_rating
|
| 731 |
|
| 732 |
-
# Parse issues
|
| 733 |
-
ALL_ISSUES =
|
| 734 |
"age-inappropriate", "too-technical", "insensitive",
|
| 735 |
"tone-mismatch", "contains-pii", "harmful", "biased", "none",
|
| 736 |
-
|
|
|
|
| 737 |
found_issues: set = set()
|
| 738 |
-
for
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
|
|
|
| 743 |
|
| 744 |
# Score issues via F1
|
| 745 |
if "none" in expected_issues:
|
|
|
|
| 685 |
# Grading dispatch
|
| 686 |
# ------------------------------------------------------------------
|
| 687 |
def _grade(self, task_type: str, answer: str, problem: Dict) -> Tuple[bool, float, str]:
|
| 688 |
+
try:
|
| 689 |
+
if task_type == "correctness_check":
|
| 690 |
+
is_correct, score, fb = self._grade_correctness(answer, problem)
|
| 691 |
+
elif task_type == "tone_appropriateness":
|
| 692 |
+
is_correct, score, fb = self._grade_tone(answer, problem)
|
| 693 |
+
else:
|
| 694 |
+
is_correct, score, fb = self._grade_multi_dimensional(answer, problem)
|
| 695 |
+
return is_correct, self._clamp(score), fb
|
| 696 |
+
except Exception as e:
|
| 697 |
+
return False, 0.05, f"Grading error: {str(e)}"
|
| 698 |
|
| 699 |
# ── Task 1: Correctness Check ─────────────────────────────────────
|
| 700 |
def _grade_correctness(self, answer: str, problem: Dict) -> Tuple[bool, float, str]:
|
|
|
|
| 732 |
given_rating = parts[0] if parts else ""
|
| 733 |
rating_match = expected_rating in given_rating or given_rating in expected_rating
|
| 734 |
|
| 735 |
+
# Parse issues from comma-separated parts (skip first part which is the rating)
|
| 736 |
+
ALL_ISSUES = {
|
| 737 |
"age-inappropriate", "too-technical", "insensitive",
|
| 738 |
"tone-mismatch", "contains-pii", "harmful", "biased", "none",
|
| 739 |
+
}
|
| 740 |
+
answer_parts = [p.strip() for p in cleaned.split(",")]
|
| 741 |
found_issues: set = set()
|
| 742 |
+
for part in answer_parts[1:]: # skip the rating
|
| 743 |
+
normalized = part.strip()
|
| 744 |
+
if normalized in ALL_ISSUES:
|
| 745 |
+
found_issues.add(normalized)
|
| 746 |
+
elif normalized.replace(" ", "-") in ALL_ISSUES:
|
| 747 |
+
found_issues.add(normalized.replace(" ", "-"))
|
| 748 |
|
| 749 |
# Score issues via F1
|
| 750 |
if "none" in expected_issues:
|