Spaces:
Sleeping
Sleeping
Update parse_llm_review_markdown() with a complete implementation, replacing the skeleton code
Browse files
- src/langgraph_logic/nodes.py +103 -4
src/langgraph_logic/nodes.py
CHANGED
|
@@ -184,10 +184,109 @@ def _parse_bullet_comments(text_block: str) -> List[ParsedComment]:
|
|
| 184 |
return comments
|
| 185 |
|
| 186 |
def parse_llm_review_markdown(markdown_review: str) -> LLMReviewOutput:
|
| 187 |
-
"""
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
# --- Graph Nodes ---
|
| 193 |
|
|
|
|
| 184 |
return comments
|
| 185 |
|
| 186 |
def parse_llm_review_markdown(markdown_review: str) -> LLMReviewOutput:
    """
    Parse an LLM-generated Markdown review into a structured LLMReviewOutput.

    Top-level section headers are located first, in either of two forms:
    ``## Section Title:`` or ``1. **Section Title:**``. The text between one
    header and the next (or the end of the input) is that section's content;
    text before the first header is not captured. Each section is then routed
    by its title, checked in this order:

    * a title containing "Overall Impression" fills ``overall_impression``;
    * a title containing "Specific Observations and Suggestions" is split
      into per-file blocks, and each block is passed to
      ``_parse_bullet_comments``; files that yield no comments are skipped;
    * a title containing "Summary" fills ``summary`` and, when a
      ``**Action:**`` / ``**Recommended Action:**`` / ``**Status:**`` line is
      present, sets ``approval_status``;
    * any other section with non-empty content is stored as a
      ``ParsedReviewSection`` in ``general_sections``.

    Args:
        markdown_review (str): The full Markdown string generated by the LLM.

    Returns:
        LLMReviewOutput: Built from the keyword arguments
            ``overall_impression``, ``file_reviews``, ``general_sections``,
            ``summary`` and ``approval_status``. ``approval_status`` defaults
            to ``'Comment'``. If no section header is found and the input is
            not blank, the raw input is returned as ``overall_impression``
            with a fixed "could not parse" summary. If headers are found but
            no summary text results, ``summary`` falls back to
            ``"Automated review completed."``.
    """
    review_fields: Dict[str, Any] = {
        'overall_impression': None,
        'file_reviews': [],
        'general_sections': [],
        'summary': None,
        'approval_status': 'Comment',
    }

    # Canonical spellings of the statuses the approval regex can match.
    # str.capitalize() would turn "Request Changes" into "Request changes",
    # so the matched text is mapped back to its canonical form instead.
    canonical_status = {
        'approve': 'Approve',
        'request changes': 'Request Changes',
        'comment': 'Comment',
    }

    # --- 1. Find all major section headers and their positions ---
    # This pattern recognizes "## Section Title:" and "1. **Section Title:**"
    # NOTE(review): `\s` inside the title character class also matches
    # newlines, so a title could in principle span lines - confirm whether
    # that is intended before tightening the pattern.
    section_header_pattern = re.compile(
        r"^(?:##\s+([\w\s/()]+):|(\d+)\.\s+\*\*([\w\s/()]+):\*\*)\s*$",
        re.MULTILINE
    )

    # This pattern handles "- **`file.py`:**", "### file.py", and "**File: file.py**"
    # Compiled once here rather than on every pass through the section loop.
    file_header_line_pattern = re.compile(
        r"^\s*(?:-\s+\*\*(?:`?)([\w\/\.\-_]+\.\w+)(?:`?):\*\*|###\s*`?([\w\/\.\-_]+\.\w+)`?|\*\*File:\s*`?([\w\/\.\-_]+\.\w+)`?\*\*)\s*$",
        re.MULTILINE
    )

    approval_pattern = re.compile(
        r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment)",
        re.IGNORECASE | re.MULTILINE
    )

    sections = []
    for header_match in section_header_pattern.finditer(markdown_review):
        # Title comes from group 1 ('## Title:') or group 3 ('1. **Title:**').
        header_title = header_match.group(1) or header_match.group(3)
        if header_title:
            sections.append({
                'title': header_title.strip(),
                'content_start': header_match.end(),
                'header_start': header_match.start(),
            })

    if not sections:
        # Unrecognized format: keep the raw text so nothing is lost.
        if markdown_review.strip():
            review_fields['summary'] = "Could not parse the review markdown. The format was not recognized."
            review_fields['overall_impression'] = markdown_review
        return LLMReviewOutput(**review_fields)

    # --- 2. Process each identified section by extracting content between headers ---
    for index, section in enumerate(sections):
        title = section['title']

        # The content ends where the next section's header begins.
        # For the last section, it ends at the end of the string.
        if index + 1 < len(sections):
            content_end = sections[index + 1]['header_start']
        else:
            content_end = len(markdown_review)

        content = markdown_review[section['content_start']:content_end].strip()

        if "Overall Impression" in title:
            review_fields['overall_impression'] = content

        elif "Specific Observations and Suggestions" in title:
            file_matches = list(file_header_line_pattern.finditer(content))

            for position, file_match in enumerate(file_matches):
                # Exactly one of the three alternatives captured the file name.
                file_name = next(
                    (group for group in file_match.groups() if group is not None),
                    None,
                )
                if not file_name:
                    continue

                file_name = file_name.strip().replace('`', '')

                # A file's block runs up to the next file header, or to the
                # end of the section for the last file.
                block_start = file_match.end()
                if position + 1 < len(file_matches):
                    block_end = file_matches[position + 1].start()
                else:
                    block_end = len(content)
                file_content_block = content[block_start:block_end].strip()

                # Assumes _parse_bullet_comments is defined elsewhere and works correctly
                parsed_comments = _parse_bullet_comments(file_content_block)

                if parsed_comments:
                    review_fields['file_reviews'].append(FileReviewComments(
                        file_path=file_name,
                        sections={"General_File_Comments": parsed_comments}
                    ))

        elif "Summary" in title:
            review_fields['summary'] = content
            # Extract approval status from the summary
            approval_match = approval_pattern.search(content)
            if approval_match:
                matched_status = approval_match.group(1).strip().casefold()
                # Fall back to the current value if the matched text somehow
                # has no canonical spelling.
                review_fields['approval_status'] = canonical_status.get(
                    matched_status, review_fields['approval_status']
                )

        else:  # Any other section is treated as a general section
            if content:
                review_fields['general_sections'].append(ParsedReviewSection(
                    title=title,
                    content=content
                ))

    # --- 3. Final fallbacks and cleanup ---
    if not review_fields['summary']:
        review_fields['summary'] = "Automated review completed."

    return LLMReviewOutput(**review_fields)
|
| 290 |
|
| 291 |
# --- Graph Nodes ---
|
| 292 |
|