Spaces:
Sleeping
Sleeping
Update grader.py
Browse files
grader.py
CHANGED
|
@@ -1,7 +1,15 @@
|
|
| 1 |
import numpy as np
|
| 2 |
from sentence_transformers import SentenceTransformer, util
|
| 3 |
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
_model = None
|
| 6 |
|
| 7 |
def _get_model():
|
|
@@ -11,30 +19,16 @@ def _get_model():
|
|
| 11 |
return _model
|
| 12 |
|
| 13 |
def grade_comment(comment: str, expected_keywords: list, expert_comment: str) -> float:
|
| 14 |
-
"""
|
| 15 |
-
Returns a score in [0,1] based on:
|
| 16 |
-
- semantic similarity with expert comment (70%)
|
| 17 |
-
- keyword coverage (30%)
|
| 18 |
-
- length bonus/penalty
|
| 19 |
-
"""
|
| 20 |
if not comment:
|
| 21 |
-
return 0.0
|
| 22 |
-
|
| 23 |
-
# 1. Semantic similarity
|
| 24 |
model = _get_model()
|
| 25 |
emb_comment = model.encode(comment, convert_to_tensor=True)
|
| 26 |
emb_expert = model.encode(expert_comment, convert_to_tensor=True)
|
| 27 |
-
sim = util.pytorch_cos_sim(emb_comment, emb_expert).item()
|
| 28 |
-
|
| 29 |
-
# 2. Keyword coverage
|
| 30 |
comment_lower = comment.lower()
|
| 31 |
matched = sum(1 for kw in expected_keywords if kw in comment_lower)
|
| 32 |
kw_score = min(1.0, matched / max(1, len(expected_keywords) // 2))
|
| 33 |
-
|
| 34 |
-
# 3. Combine (70% semantic, 30% keywords)
|
| 35 |
combined = 0.7 * sim + 0.3 * kw_score
|
| 36 |
-
|
| 37 |
-
# 4. Length bonus/penalty
|
| 38 |
words = comment.split()
|
| 39 |
if len(words) >= 15:
|
| 40 |
length_bonus = 0.1
|
|
@@ -42,36 +36,27 @@ def grade_comment(comment: str, expected_keywords: list, expert_comment: str) ->
|
|
| 42 |
length_bonus = -0.2
|
| 43 |
else:
|
| 44 |
length_bonus = 0.0
|
| 45 |
-
|
| 46 |
-
# 5. Final score, clamped
|
| 47 |
final = combined + length_bonus
|
| 48 |
-
|
| 49 |
-
|
| 50 |
|
| 51 |
def grade_question(question: str) -> float:
|
| 52 |
-
"""Simple heuristic for question quality."""
|
| 53 |
words = question.split()
|
| 54 |
if len(words) < 3:
|
| 55 |
-
return 0.0
|
| 56 |
-
# Check for question words
|
| 57 |
if any(q in question.lower() for q in ["what", "how", "why", "where", "when", "does", "is"]):
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
|
| 62 |
def grade_fix(proposed_fix: str, expected_fix_keywords: list, hidden_test: callable) -> float:
    """Score a proposed code fix from keyword coverage and an optional test.

    The result is ``0.6 * test_score + 0.4 * kw_score``.

    Args:
        proposed_fix: Text of the proposed fix. ``None`` is treated as empty.
        expected_fix_keywords: Keywords looked for (case-insensitively) in the
            fix. Matching half of them (integer division) already yields the
            full keyword score of 1.0.
        hidden_test: Optional callable invoked with ``proposed_fix``; its
            return value is used as the test score. ``None`` skips the test.

    Returns:
        The weighted score. A test that raises, or returns something that is
        not a number, contributes 0.0.
    """
    keywords = expected_fix_keywords or []
    # Lower-case the fix once, and lower-case each keyword too: comparing a
    # mixed-case keyword against lower-cased text could never match.
    fix_lower = (proposed_fix or "").lower()
    matched = sum(1 for kw in keywords if kw.lower() in fix_lower)
    kw_score = min(1.0, matched / max(1, len(keywords) // 2))

    test_score = 0.0
    if hidden_test is not None:
        try:
            # float() sits inside the try so a None / non-numeric result is
            # handled like a failing test instead of crashing the arithmetic.
            test_score = float(hidden_test(proposed_fix))
        except Exception:
            # Best-effort boundary: the test callable is arbitrary code.
            test_score = 0.0
        if test_score != test_score:  # NaN would poison the weighted sum
            test_score = 0.0

    return 0.6 * test_score + 0.4 * kw_score
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
from sentence_transformers import SentenceTransformer, util
|
| 3 |
|
| 4 |
+
EPS = 0.001  # margin keeping every clamped score away from exactly 0 and 1


def clamp_score(score):
    """Clamp *score* into the closed interval ``[EPS, 1 - EPS]``.

    Values at or below ``EPS`` (including negatives and NaN) map to ``EPS``,
    values at or above ``1 - EPS`` map to ``1 - EPS``, and anything in
    between is returned unchanged. The mapping is monotonic: a lower raw
    score never ends up above a higher one.
    """
    upper = 1.0 - EPS
    # Written as `not score > EPS` rather than `score <= EPS` so that NaN,
    # for which every comparison is False, falls to the floor as well.
    if not score > EPS:
        return EPS
    if score >= upper:
        return upper
    return score
|
| 12 |
+
|
| 13 |
_model = None
|
| 14 |
|
| 15 |
def _get_model():
|
|
|
|
| 19 |
return _model
|
| 20 |
|
| 21 |
def grade_comment(comment: str, expected_keywords: list, expert_comment: str) -> float:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
if not comment:
|
| 23 |
+
return clamp_score(0.0)
|
|
|
|
|
|
|
| 24 |
model = _get_model()
|
| 25 |
emb_comment = model.encode(comment, convert_to_tensor=True)
|
| 26 |
emb_expert = model.encode(expert_comment, convert_to_tensor=True)
|
| 27 |
+
sim = util.pytorch_cos_sim(emb_comment, emb_expert).item()
|
|
|
|
|
|
|
| 28 |
comment_lower = comment.lower()
|
| 29 |
matched = sum(1 for kw in expected_keywords if kw in comment_lower)
|
| 30 |
kw_score = min(1.0, matched / max(1, len(expected_keywords) // 2))
|
|
|
|
|
|
|
| 31 |
combined = 0.7 * sim + 0.3 * kw_score
|
|
|
|
|
|
|
| 32 |
words = comment.split()
|
| 33 |
if len(words) >= 15:
|
| 34 |
length_bonus = 0.1
|
|
|
|
| 36 |
length_bonus = -0.2
|
| 37 |
else:
|
| 38 |
length_bonus = 0.0
|
|
|
|
|
|
|
| 39 |
final = combined + length_bonus
|
| 40 |
+
# Clamp to (0,1) using EPS
|
| 41 |
+
return clamp_score(final)
|
| 42 |
|
| 43 |
def grade_question(question: str) -> float:
    """Heuristically score a question; every result goes through clamp_score.

    Scoring rules:
        - empty input, or fewer than three whitespace-separated words: 0.0
        - contains a question word: ``len(words) / 20`` capped at 1.0
        - otherwise: a flat 0.2

    Args:
        question: The question text. ``None`` is treated as empty.

    Returns:
        The clamped heuristic score.
    """
    import re  # local import so the block does not depend on file-level imports

    if not question:
        return clamp_score(0.0)

    words = question.split()
    if len(words) < 3:
        return clamp_score(0.0)

    # Compare whole words, not substrings: a substring test would find "is"
    # inside "this" or "how" inside "show" and treat nearly any sentence as
    # a question.
    tokens = set(re.findall(r"[a-z]+", question.lower()))
    question_words = {"what", "how", "why", "where", "when", "does", "is"}
    if tokens & question_words:
        return clamp_score(min(1.0, len(words) / 20))

    return clamp_score(0.2)
|
| 51 |
|
| 52 |
def grade_fix(proposed_fix: str, expected_fix_keywords: list, hidden_test: callable) -> float:
    """Score a proposed code fix from keyword coverage and an optional test.

    The raw score is ``0.6 * test_score + 0.4 * kw_score`` and is passed
    through clamp_score before being returned.

    Args:
        proposed_fix: Text of the proposed fix. ``None`` is treated as empty.
        expected_fix_keywords: Keywords looked for (case-insensitively) in the
            fix. Matching half of them (integer division) already yields the
            full keyword score of 1.0.
        hidden_test: Optional callable invoked with ``proposed_fix``; its
            return value is used as the test score. ``None`` skips the test.

    Returns:
        The clamped weighted score. A test that raises, or returns something
        that is not a number, contributes 0.0.
    """
    keywords = expected_fix_keywords or []
    # Lower-case the fix once, and lower-case each keyword too: comparing a
    # mixed-case keyword against lower-cased text could never match.
    fix_lower = (proposed_fix or "").lower()
    matched = sum(1 for kw in keywords if kw.lower() in fix_lower)
    kw_score = min(1.0, matched / max(1, len(keywords) // 2))

    test_score = 0.0
    if hidden_test is not None:
        try:
            # float() sits inside the try so a None / non-numeric result is
            # handled like a failing test instead of crashing the arithmetic.
            test_score = float(hidden_test(proposed_fix))
        except Exception:
            # Best-effort boundary: the test callable is arbitrary code.
            test_score = 0.0
        if test_score != test_score:  # NaN would poison the weighted sum
            test_score = 0.0

    return clamp_score(0.6 * test_score + 0.4 * kw_score)
|
|
|