Update server/code_assessment_environment.py

#16
Files changed (1) hide show
  1. server/code_assessment_environment.py +20 -15
server/code_assessment_environment.py CHANGED
@@ -685,13 +685,16 @@ class CodeAssessmentEnvironment(Environment):
685
  # Grading dispatch
686
  # ------------------------------------------------------------------
687
  def _grade(self, task_type: str, answer: str, problem: Dict) -> Tuple[bool, float, str]:
688
- if task_type == "correctness_check":
689
- is_correct, score, fb = self._grade_correctness(answer, problem)
690
- elif task_type == "tone_appropriateness":
691
- is_correct, score, fb = self._grade_tone(answer, problem)
692
- else:
693
- is_correct, score, fb = self._grade_multi_dimensional(answer, problem)
694
- return is_correct, self._clamp(score), fb
 
 
 
695
 
696
  # ── Task 1: Correctness Check ─────────────────────────────────────
697
  def _grade_correctness(self, answer: str, problem: Dict) -> Tuple[bool, float, str]:
@@ -729,17 +732,19 @@ class CodeAssessmentEnvironment(Environment):
729
  given_rating = parts[0] if parts else ""
730
  rating_match = expected_rating in given_rating or given_rating in expected_rating
731
 
732
- # Parse issues
733
- ALL_ISSUES = [
734
  "age-inappropriate", "too-technical", "insensitive",
735
  "tone-mismatch", "contains-pii", "harmful", "biased", "none",
736
- ]
 
737
  found_issues: set = set()
738
- for issue in ALL_ISSUES:
739
- if issue in cleaned or issue.replace("-", " ") in cleaned:
740
- found_issues.add(issue)
741
- # Remove the rating word itself from issues if it crept in
742
- found_issues -= {"appropriate", "needs-adjustment", "inappropriate"}
 
743
 
744
  # Score issues via F1
745
  if "none" in expected_issues:
 
685
  # Grading dispatch
686
  # ------------------------------------------------------------------
687
  def _grade(self, task_type: str, answer: str, problem: Dict) -> Tuple[bool, float, str]:
688
+ try:
689
+ if task_type == "correctness_check":
690
+ is_correct, score, fb = self._grade_correctness(answer, problem)
691
+ elif task_type == "tone_appropriateness":
692
+ is_correct, score, fb = self._grade_tone(answer, problem)
693
+ else:
694
+ is_correct, score, fb = self._grade_multi_dimensional(answer, problem)
695
+ return is_correct, self._clamp(score), fb
696
+ except Exception as e:
697
+ return False, 0.05, f"Grading error: {str(e)}"
698
 
699
  # ── Task 1: Correctness Check ─────────────────────────────────────
700
  def _grade_correctness(self, answer: str, problem: Dict) -> Tuple[bool, float, str]:
 
732
  given_rating = parts[0] if parts else ""
733
  rating_match = expected_rating in given_rating or given_rating in expected_rating
734
 
735
+ # Parse issues from comma-separated parts (skip first part which is the rating)
736
+ ALL_ISSUES = {
737
  "age-inappropriate", "too-technical", "insensitive",
738
  "tone-mismatch", "contains-pii", "harmful", "biased", "none",
739
+ }
740
+ answer_parts = [p.strip() for p in cleaned.split(",")]
741
  found_issues: set = set()
742
+ for part in answer_parts[1:]: # skip the rating
743
+ normalized = part.strip()
744
+ if normalized in ALL_ISSUES:
745
+ found_issues.add(normalized)
746
+ elif normalized.replace(" ", "-") in ALL_ISSUES:
747
+ found_issues.add(normalized.replace(" ", "-"))
748
 
749
  # Score issues via F1
750
  if "none" in expected_issues: