Improve arch grader scoring and LLM JSON parsing
Browse files
Enhance the architectural grader by computing the quality score as a mix of semantic term matches and message length: extract the full text of the flag actions, match it against a list of architectural keywords, compute semantic_score (capped at 1.0 once three keywords match) and length_score (capped at 1.0 at 200 characters, and zero for bodies of 20 characters or fewer), then combine them as 0.7 * semantic_score + 0.3 * length_score. Clarify the final-weighting comment and keep the final score normalization. Add an extract_json utility to robustly parse JSON from LLM outputs (it handles raw JSON, fenced code blocks, and a brace-based fallback), increase the LLM max_tokens to 800, and use extract_json in call_llm for more reliable parsing of model responses.
- codelens_env/graders/arch_grader.py +19 -4
- inference.py +40 -11
codelens_env/graders/arch_grader.py
CHANGED
|
@@ -46,16 +46,31 @@ def grade_architectural_review(scenario: Scenario, history: List[ActionRecord])
|
|
| 46 |
|
| 47 |
verdict_avg = sum(verdict_scores) / len(verdict_scores) if verdict_scores else 0.0
|
| 48 |
|
| 49 |
-
# 3. Quality Score
|
| 50 |
max_body_len = 0
|
|
|
|
| 51 |
for action in flag_actions:
|
| 52 |
-
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
if max_body_len > 20:
|
| 56 |
-
|
|
|
|
|
|
|
| 57 |
|
| 58 |
# 4. Final Weighted Calculation
|
|
|
|
| 59 |
final_score = 0.6 * issue_score_avg + 0.2 * verdict_avg + 0.2 * quality_score
|
| 60 |
return float(max(0.0, min(1.0, final_score)))
|
| 61 |
|
|
|
|
| 46 |
|
| 47 |
verdict_avg = sum(verdict_scores) / len(verdict_scores) if verdict_scores else 0.0
|
| 48 |
|
| 49 |
+
# 3. Quality Score (Semantic + Length)
|
| 50 |
max_body_len = 0
|
| 51 |
+
full_text = ""
|
| 52 |
for action in flag_actions:
|
| 53 |
+
body = action.body or ""
|
| 54 |
+
max_body_len = max(max_body_len, len(body))
|
| 55 |
+
full_text += " " + body.lower()
|
| 56 |
|
| 57 |
+
# Reward professional architectural terminology (Phase 3 Human Review polish)
|
| 58 |
+
arch_keywords = [
|
| 59 |
+
"responsibility", "coupling", "cohesion", "dependency", "abstraction",
|
| 60 |
+
"interface", "pattern", "n+1", "god", "scalability", "latency",
|
| 61 |
+
"concurrency", "layer", "separation", "solid", "dry"
|
| 62 |
+
]
|
| 63 |
+
match_count = sum(1 for kw in arch_keywords if kw in full_text)
|
| 64 |
+
semantic_score = min(1.0, match_count / 3) # Reward up to 3 high-quality terms
|
| 65 |
+
|
| 66 |
+
length_score = 0.0
|
| 67 |
if max_body_len > 20:
|
| 68 |
+
length_score = min(1.0, max_body_len / 200)
|
| 69 |
+
|
| 70 |
+
quality_score = 0.7 * semantic_score + 0.3 * length_score
|
| 71 |
|
| 72 |
# 4. Final Weighted Calculation
|
| 73 |
+
# issue_detection (60%), verdict (20%), quality (20%)
|
| 74 |
final_score = 0.6 * issue_score_avg + 0.2 * verdict_avg + 0.2 * quality_score
|
| 75 |
return float(max(0.0, min(1.0, final_score)))
|
| 76 |
|
inference.py
CHANGED
|
@@ -126,8 +126,44 @@ Code diff:
|
|
| 126 |
Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes."""
|
| 127 |
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
def call_llm(messages: list) -> dict:
|
| 130 |
-
"""Call the LLM with retries and parse its JSON response."""
|
| 131 |
last_err = None
|
| 132 |
for attempt in range(3):
|
| 133 |
try:
|
|
@@ -135,18 +171,11 @@ def call_llm(messages: list) -> dict:
|
|
| 135 |
model=MODEL_NAME,
|
| 136 |
messages=messages,
|
| 137 |
temperature=0.1,
|
| 138 |
-
max_tokens=
|
| 139 |
response_format={"type": "json_object"},
|
| 140 |
)
|
| 141 |
-
content = response.choices[0].message.content
|
| 142 |
-
|
| 143 |
-
# Robust JSON extract (some models might still use markdown)
|
| 144 |
-
if "```json" in content:
|
| 145 |
-
content = content.split("```json")[1].split("```")[0].strip()
|
| 146 |
-
elif "```" in content:
|
| 147 |
-
content = content.split("```")[1].split("```")[0].strip()
|
| 148 |
-
|
| 149 |
-
return json.loads(content)
|
| 150 |
except Exception as e:
|
| 151 |
last_err = e
|
| 152 |
if attempt < 2:
|
|
|
|
| 126 |
Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes."""
|
| 127 |
|
| 128 |
|
| 129 |
+
def extract_json(text: str) -> dict:
    """Robustly extract the first JSON object from a string.

    Three strategies are tried in order, and the first one that yields a
    JSON object (a ``dict``) wins:

    1. Parse the whole (stripped) string as JSON.
    2. Parse the contents of the first Markdown code fence, preferring a
       fence tagged ``json`` over an untagged one.
    3. Scan for each ``{`` and decode the JSON object that starts there,
       ignoring any text that follows it.

    Args:
        text: Raw model output that is expected to contain a JSON object.

    Returns:
        The first JSON object found, as a dict.

    Raises:
        ValueError: If ``text`` is not a string or contains no decodable
            JSON object.
    """
    # The API may hand back ``None`` content; report that through the same
    # error as any other unparseable response instead of an AttributeError.
    if not isinstance(text, str):
        raise ValueError("Could not extract valid JSON from LLM response")
    text = text.strip()

    # 1. Try direct parse. Valid non-object JSON (list, string, number) is
    #    not accepted here because callers are promised a dict; fall through
    #    so an object embedded in it can still be found.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, dict):
            return parsed

    # 2. Try markdown extraction: explicit ```json fence first, then any fence.
    for fence in ("```json", "```"):
        if fence not in text:
            continue
        candidate = text.split(fence, 1)[1].split("```", 1)[0].strip()
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # 3. Last resort: decode the first well-formed object starting at a "{".
    #    raw_decode stops at the end of that object, so trailing prose, a
    #    second object, or stray braces elsewhere do not break parsing the
    #    way a first-"{"-to-last-"}" slice would.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return parsed

    raise ValueError("Could not extract valid JSON from LLM response")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
def call_llm(messages: list) -> dict:
|
| 166 |
+
"""Call the LLM with retries and robustly parse its JSON response."""
|
| 167 |
last_err = None
|
| 168 |
for attempt in range(3):
|
| 169 |
try:
|
|
|
|
| 171 |
model=MODEL_NAME,
|
| 172 |
messages=messages,
|
| 173 |
temperature=0.1,
|
| 174 |
+
max_tokens=800,
|
| 175 |
response_format={"type": "json_object"},
|
| 176 |
)
|
| 177 |
+
content = response.choices[0].message.content
|
| 178 |
+
return extract_json(content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
except Exception as e:
|
| 180 |
last_err = e
|
| 181 |
if attempt < 2:
|