ArshVerma committed on
Commit
f68fa46
·
1 Parent(s): 2ef7f43

Improve arch grader scoring and LLM JSON parsing

Browse files

Enhance architectural grader by computing quality as a mix of semantic term matches and message length: extract full text, match against architectural keywords, compute semantic_score (capped) and length_score, then combine as 0.7*semantic + 0.3*length. Clarified final weighting comment and kept final score normalization. Add extract_json utility to robustly parse JSON from LLM outputs (handles raw JSON, fenced code blocks, and brace-based fallback), increase LLM max_tokens to 800, and use extract_json in call_llm for more reliable parsing of model responses.

Files changed (2) hide show
  1. codelens_env/graders/arch_grader.py +19 -4
  2. inference.py +40 -11
codelens_env/graders/arch_grader.py CHANGED
@@ -46,16 +46,31 @@ def grade_architectural_review(scenario: Scenario, history: List[ActionRecord])
46
 
47
  verdict_avg = sum(verdict_scores) / len(verdict_scores) if verdict_scores else 0.0
48
 
49
- # 3. Quality Score
50
  max_body_len = 0
 
51
  for action in flag_actions:
52
- max_body_len = max(max_body_len, len(action.body or ""))
 
 
53
 
54
- quality_score = 0.0
 
 
 
 
 
 
 
 
 
55
  if max_body_len > 20:
56
- quality_score = min(1.0, max_body_len / 200)
 
 
57
 
58
  # 4. Final Weighted Calculation
 
59
  final_score = 0.6 * issue_score_avg + 0.2 * verdict_avg + 0.2 * quality_score
60
  return float(max(0.0, min(1.0, final_score)))
61
 
 
46
 
47
  verdict_avg = sum(verdict_scores) / len(verdict_scores) if verdict_scores else 0.0
48
 
49
+ # 3. Quality Score (Semantic + Length)
50
  max_body_len = 0
51
+ full_text = ""
52
  for action in flag_actions:
53
+ body = action.body or ""
54
+ max_body_len = max(max_body_len, len(body))
55
+ full_text += " " + body.lower()
56
 
57
+ # Reward professional architectural terminology (Phase 3 Human Review polish)
58
+ arch_keywords = [
59
+ "responsibility", "coupling", "cohesion", "dependency", "abstraction",
60
+ "interface", "pattern", "n+1", "god", "scalability", "latency",
61
+ "concurrency", "layer", "separation", "solid", "dry"
62
+ ]
63
+ match_count = sum(1 for kw in arch_keywords if kw in full_text)
64
+ semantic_score = min(1.0, match_count / 3) # Reward up to 3 high-quality terms
65
+
66
+ length_score = 0.0
67
  if max_body_len > 20:
68
+ length_score = min(1.0, max_body_len / 200)
69
+
70
+ quality_score = 0.7 * semantic_score + 0.3 * length_score
71
 
72
  # 4. Final Weighted Calculation
73
+ # issue_detection (60%), verdict (20%), quality (20%)
74
  final_score = 0.6 * issue_score_avg + 0.2 * verdict_avg + 0.2 * quality_score
75
  return float(max(0.0, min(1.0, final_score)))
76
 
inference.py CHANGED
@@ -126,8 +126,44 @@ Code diff:
126
  Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes."""
127
 
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  def call_llm(messages: list) -> dict:
130
- """Call the LLM with retries and parse its JSON response."""
131
  last_err = None
132
  for attempt in range(3):
133
  try:
@@ -135,18 +171,11 @@ def call_llm(messages: list) -> dict:
135
  model=MODEL_NAME,
136
  messages=messages,
137
  temperature=0.1,
138
- max_tokens=600,
139
  response_format={"type": "json_object"},
140
  )
141
- content = response.choices[0].message.content.strip()
142
-
143
- # Robust JSON extract (some models might still use markdown)
144
- if "```json" in content:
145
- content = content.split("```json")[1].split("```")[0].strip()
146
- elif "```" in content:
147
- content = content.split("```")[1].split("```")[0].strip()
148
-
149
- return json.loads(content)
150
  except Exception as e:
151
  last_err = e
152
  if attempt < 2:
 
126
  Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes."""
127
 
128
 
129
def extract_json(text: str) -> dict:
    """Robustly extract the first JSON object from a string.

    Strategies are tried in order:

    1. Parse the whole (stripped) text as JSON.
    2. Parse the contents of the first Markdown code fence
       (```json ... ``` preferred, then a bare ``` ... ```).
    3. Scan for each ``{`` and decode the first complete JSON object
       that starts there, ignoring any trailing text.

    Args:
        text: Raw text returned by the LLM.

    Returns:
        The first JSON object found, as a dict.

    Raises:
        ValueError: If ``text`` is not a string or contains no JSON object.
    """
    error_message = "Could not extract valid JSON from LLM response"

    # The model may return no content at all; report that the same way as
    # unparseable content instead of failing on ``None.strip()``.
    if not isinstance(text, str):
        raise ValueError(error_message)
    text = text.strip()

    # 1. Try direct parse. Only accept an object: a bare list/number/string
    #    is valid JSON but does not honour the ``-> dict`` contract.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, dict):
            return parsed

    # 2. Try markdown extraction, preferring an explicit ```json fence.
    for fence in ("```json", "```"):
        _, found, remainder = text.partition(fence)
        if not found:
            continue
        candidate = remainder.partition("```")[0].strip()
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # 3. Last resort: decode the first complete object starting at any '{'.
    #    raw_decode stops at the end of the object, so trailing prose, stray
    #    braces or additional objects after it do not break extraction.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return parsed

    raise ValueError(error_message)
163
+
164
+
165
  def call_llm(messages: list) -> dict:
166
+ """Call the LLM with retries and robustly parse its JSON response."""
167
  last_err = None
168
  for attempt in range(3):
169
  try:
 
171
  model=MODEL_NAME,
172
  messages=messages,
173
  temperature=0.1,
174
+ max_tokens=800,
175
  response_format={"type": "json_object"},
176
  )
177
+ content = response.choices[0].message.content
178
+ return extract_json(content)
 
 
 
 
 
 
 
179
  except Exception as e:
180
  last_err = e
181
  if attempt < 2: