Spaces:

ArshVerma
/

CodeLens

Sleeping

ArshVerma committed on Apr 8

Commit

f68fa46

1 Parent(s): 2ef7f43

Improve arch grader scoring and LLM JSON parsing

Enhance architectural grader by computing quality as a mix of semantic term matches and message length: extract full text, match against architectural keywords, compute semantic_score (capped) and length_score, then combine as 0.7*semantic + 0.3*length. Clarified final weighting comment and kept final score normalization. Add extract_json utility to robustly parse JSON from LLM outputs (handles raw JSON, fenced code blocks, and brace-based fallback), increase LLM max_tokens to 800, and use extract_json in call_llm for more reliable parsing of model responses.

Files changed (2) hide show

codelens_env/graders/arch_grader.py +19 -4
inference.py +40 -11

codelens_env/graders/arch_grader.py CHANGED Viewed

@@ -46,16 +46,31 @@ def grade_architectural_review(scenario: Scenario, history: List[ActionRecord])
     verdict_avg = sum(verdict_scores) / len(verdict_scores) if verdict_scores else 0.0
-    # 3. Quality Score
     max_body_len = 0
     for action in flag_actions:
-        max_body_len = max(max_body_len, len(action.body or ""))
-    quality_score = 0.0
     if max_body_len > 20:
-        quality_score = min(1.0, max_body_len / 200)
     # 4. Final Weighted Calculation
     final_score = 0.6 * issue_score_avg + 0.2 * verdict_avg + 0.2 * quality_score
     return float(max(0.0, min(1.0, final_score)))

     verdict_avg = sum(verdict_scores) / len(verdict_scores) if verdict_scores else 0.0
+    # 3. Quality Score (Semantic + Length)
     max_body_len = 0
+    full_text = ""
     for action in flag_actions:
+        body = action.body or ""
+        max_body_len = max(max_body_len, len(body))
+        full_text += " " + body.lower()
+    # Reward professional architectural terminology (Phase 3 Human Review polish)
+    arch_keywords = [
+        "responsibility", "coupling", "cohesion", "dependency", "abstraction",
+        "interface", "pattern", "n+1", "god", "scalability", "latency",
+        "concurrency", "layer", "separation", "solid", "dry"
+    ]
+    match_count = sum(1 for kw in arch_keywords if kw in full_text)
+    semantic_score = min(1.0, match_count / 3) # Reward up to 3 high-quality terms
+    length_score = 0.0
     if max_body_len > 20:
+        length_score = min(1.0, max_body_len / 200)
+    quality_score = 0.7 * semantic_score + 0.3 * length_score
     # 4. Final Weighted Calculation
+    # issue_detection (60%), verdict (20%), quality (20%)
     final_score = 0.6 * issue_score_avg + 0.2 * verdict_avg + 0.2 * quality_score
     return float(max(0.0, min(1.0, final_score)))

inference.py CHANGED Viewed

@@ -126,8 +126,44 @@ Code diff:
 Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes."""
 def call_llm(messages: list) -> dict:
-    """Call the LLM with retries and parse its JSON response."""
     last_err = None
     for attempt in range(3):
         try:
@@ -135,18 +171,11 @@ def call_llm(messages: list) -> dict:
                 model=MODEL_NAME,
                 messages=messages,
                 temperature=0.1,
-                max_tokens=600,
                 response_format={"type": "json_object"},
             )
-            content = response.choices[0].message.content.strip()
-            # Robust JSON extract (some models might still use markdown)
-            if "```json" in content:
-                content = content.split("```json")[1].split("```")[0].strip()
-            elif "```" in content:
-                 content = content.split("```")[1].split("```")[0].strip()
-            return json.loads(content)
         except Exception as e:
             last_err = e
             if attempt < 2:

 Output a single JSON action object. If you've already flagged the main issues, submit approve or request_changes."""
def extract_json(text: str) -> dict:
    """Robustly extract the first JSON object from a string.

    Strategies are tried in order until one yields a JSON object:

    1. Parse the whole (stripped) string as JSON.
    2. Parse the contents of the first Markdown code fence
       (```` ```json ```` preferred, otherwise a bare ```` ``` ````).
    3. Decode a single JSON value starting at the first ``{``, ignoring
       whatever follows it.

    Args:
        text: Raw text, e.g. the content of an LLM response.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If *text* is not a string or no strategy produces a
            JSON object.
    """
    if not isinstance(text, str):
        # e.g. a response whose content is None; fail with the documented error
        # instead of an AttributeError from .strip().
        raise ValueError("Could not extract valid JSON from LLM response")

    text = text.strip()

    # 1. Try direct parse. Only a JSON *object* satisfies the return contract;
    #    a bare list/number/string falls through to the later strategies.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, dict):
            return parsed

    # 2. Try markdown extraction (a "```json" fence wins over a bare "```").
    fence = None
    if "```json" in text:
        fence = "```json"
    elif "```" in text:
        fence = "```"
    if fence is not None:
        try:
            content = text.split(fence)[1].split("```")[0].strip()
            parsed = json.loads(content)
        except (IndexError, json.JSONDecodeError):
            pass
        else:
            if isinstance(parsed, dict):
                return parsed

    # 3. Last resort: decode exactly one JSON value starting at the first "{".
    #    raw_decode stops at the end of that value, so trailing prose, stray
    #    "}" characters, or additional objects after it do not break parsing
    #    (slicing up to the *last* "}" would).
    start = text.find("{")
    if start != -1:
        try:
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed

    raise ValueError("Could not extract valid JSON from LLM response")
 def call_llm(messages: list) -> dict:
+    """Call the LLM with retries and robustly parse its JSON response."""
     last_err = None
     for attempt in range(3):
         try:
                 model=MODEL_NAME,
                 messages=messages,
                 temperature=0.1,
+                max_tokens=800,
                 response_format={"type": "json_object"},
             )
+            content = response.choices[0].message.content
+            return extract_json(content)
         except Exception as e:
             last_err = e
             if attempt < 2: