nikhmr1235 committed on
Commit
269a5f3
·
verified ·
1 Parent(s): d669e92

update parse_llm_review_markdown() to be complete instead of skeleton code

Browse files
Files changed (1) hide show
  1. src/langgraph_logic/nodes.py +103 -4
src/langgraph_logic/nodes.py CHANGED
@@ -184,10 +184,109 @@ def _parse_bullet_comments(text_block: str) -> List[ParsedComment]:
184
  return comments
185
 
186
  def parse_llm_review_markdown(markdown_review: str) -> LLMReviewOutput:
187
- """Parses the LLM-generated Markdown review into a structured LLMReviewOutput Pydantic model."""
188
- # Implementation from the original file, simplified for brevity
189
- # ... (The full parsing logic would be here) ...
190
- return LLMReviewOutput(overall_impression=markdown_review) # Placeholder for actual parsing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  # --- Graph Nodes ---
193
 
 
184
  return comments
185
 
186
  def parse_llm_review_markdown(markdown_review: str) -> LLMReviewOutput:
187
+ """
188
+ Parses the LLM-generated Markdown review into a structured LLMReviewOutput Pydantic model.
189
+ This version is designed to be robust against formatting variations by dynamically finding
190
+ section headers and extracting content between them.
191
+ Args:
192
+ markdown_review (str): The full Markdown string generated by the LLM.
193
+ Returns:
194
+ LLMReviewOutput: A Pydantic model containing structured review data.
195
+ """
196
+ temp_structured_data: Dict[str, Any] = {
197
+ 'overall_impression': None,
198
+ 'file_reviews': [],
199
+ 'general_sections': [],
200
+ 'summary': None,
201
+ 'approval_status': 'Comment'
202
+ }
203
+
204
+ # --- 1. Find all major section headers and their positions ---
205
+ # This pattern recognizes "## Section Title:" and "1. **Section Title:**"
206
+ section_header_pattern = re.compile(
207
+ r"^(?:##\s+([\w\s/()]+):|(\d+)\.\s+\*\*([\w\s/()]+):\*\*)\s*$",
208
+ re.MULTILINE
209
+ )
210
+
211
+ sections = []
212
+ for match in section_header_pattern.finditer(markdown_review):
213
+ # Consolidate title from group 1 (for '## Title:') or group 3 (for '1. **Title:**')
214
+ title = match.group(1) or match.group(3)
215
+ if title:
216
+ sections.append({
217
+ 'title': title.strip(),
218
+ 'content_start': match.end(),
219
+ 'header_start': match.start()
220
+ })
221
+
222
+ if not sections:
223
+ if markdown_review.strip():
224
+ temp_structured_data['summary'] = "Could not parse the review markdown. The format was not recognized."
225
+ temp_structured_data['overall_impression'] = markdown_review
226
+ return LLMReviewOutput(**temp_structured_data)
227
+
228
+ # --- 2. Process each identified section by extracting content between headers ---
229
+ for i in range(len(sections)):
230
+ current_section = sections[i]
231
+ title = current_section['title']
232
+ content_start = current_section['content_start']
233
+
234
+ # The content ends where the next section's header begins.
235
+ # For the last section, it ends at the end of the string.
236
+ content_end = sections[i+1]['header_start'] if i + 1 < len(sections) else len(markdown_review)
237
+
238
+ content = markdown_review[content_start:content_end].strip()
239
+
240
+ if "Overall Impression" in title:
241
+ temp_structured_data['overall_impression'] = content
242
+
243
+ elif "Specific Observations and Suggestions" in title:
244
+ # This pattern handles "- **`file.py`:**", "### file.py", and "**File: file.py**"
245
+ file_header_line_pattern = re.compile(
246
+ r"^\s*(?:-\s+\*\*(?:`?)([\w\/\.\-_]+\.\w+)(?:`?):\*\*|###\s*`?([\w\/\.\-_]+\.\w+)`?|\*\*File:\s*`?([\w\/\.\-_]+\.\w+)`?\*\*)\s*$",
247
+ re.MULTILINE
248
+ )
249
+
250
+ file_matches = list(file_header_line_pattern.finditer(content))
251
+
252
+ for j, match in enumerate(file_matches):
253
+ file_name = next((g for g in match.groups() if g is not None), None)
254
+ if not file_name: continue
255
+
256
+ file_name = file_name.strip().replace('`', '')
257
+
258
+ start_idx = match.end()
259
+ end_idx = file_matches[j+1].start() if j + 1 < len(file_matches) else len(content)
260
+ file_content_block = content[start_idx:end_idx].strip()
261
+
262
+ # Assumes _parse_bullet_comments is defined elsewhere and works correctly
263
+ parsed_comments = _parse_bullet_comments(file_content_block)
264
+
265
+ if parsed_comments:
266
+ temp_structured_data['file_reviews'].append(FileReviewComments(
267
+ file_path=file_name,
268
+ sections={"General_File_Comments": parsed_comments}
269
+ ))
270
+
271
+ elif "Summary" in title:
272
+ temp_structured_data['summary'] = content
273
+ # Extract approval status from the summary
274
+ approval_match = re.search(r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment)", content, re.IGNORECASE | re.MULTILINE)
275
+ if approval_match:
276
+ temp_structured_data['approval_status'] = approval_match.group(1).strip().capitalize()
277
+
278
+ else: # Any other section is treated as a general section
279
+ if content:
280
+ temp_structured_data['general_sections'].append(ParsedReviewSection(
281
+ title=title,
282
+ content=content
283
+ ))
284
+
285
+ # --- 3. Final fallbacks and cleanup ---
286
+ if not temp_structured_data['summary']:
287
+ temp_structured_data['summary'] = "Automated review completed."
288
+
289
+ return LLMReviewOutput(**temp_structured_data)
290
 
291
  # --- Graph Nodes ---
292