nikhmr1235 commited on
Commit
158410c
·
verified ·
1 Parent(s): 224666a

Create langgraph_pr_review_bot.py

Browse files
Files changed (1) hide show
  1. langgraph_pr_review_bot.py +782 -0
langgraph_pr_review_bot.py ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, Optional
2
+ from pydantic import BaseModel, Field
3
+ from uuid import uuid4
4
+
5
class PRReviewState(BaseModel):
    """Shared state threaded through the LangGraph PR-review workflow.

    Only ``pr_id`` and ``repo_name`` are required to start a run; every other
    field is filled in by downstream nodes as the review progresses.
    """

    # --- GitHub PR information (mandatory) ---
    pr_id: int  # Pull Request number
    repo_name: str  # e.g., "owner/repo"

    # --- GitHub PR information (optional) ---
    diff_url: Optional[str] = None
    pr_title: Optional[str] = None
    pr_author: Optional[str] = None
    # review_run_id: str = Field(default_factory=lambda: str(uuid4()))  # Optional: unique ID for this specific review run

    # --- Code content (optional) ---
    code_diff: Optional[str] = None  # The fetched raw diff content
    # Map of filename -> full file content, used as extra LLM context.
    # Field(default_factory=dict) is the idiomatic pydantic way to declare a
    # mutable default (avoids a shared class-level dict literal).
    file_contents: Dict[str, str] = Field(default_factory=dict)

    # --- LLM review outputs ---
    llm_markdown_review: Optional[str] = None  # Raw Markdown output from the LLM (generate_code_review_markdown)
    parsed_llm_review_data: Optional[Dict[str, Any]] = None  # Structured dict from parse_llm_review_markdown

    # --- Human-in-the-loop (simplified for phase 1) ---
    require_human_approval: bool = False  # Config flag, set at graph initialization
    human_approval_status: Optional[bool] = None  # True if approved, False if rejected
    human_feedback_message: Optional[str] = None  # Any message from human rejection

    # --- System status ---
    # Lifecycle values: "initiated", "fetching_code", "code_retrieved",
    # "generating_llm_review", "llm_review_generated", "parsing_llm_review",
    # "review_parsed", "awaiting_human_approval", "posting_review", "posted",
    # "rejected", "failed"
    review_status: str = "initiated"
    last_error: Optional[str] = None  # Stores the last encountered error message
    # error_traceback: Optional[str] = None  # Optional: for more detailed error debugging
    review_id: Optional[int] = None  # Pull-request-review id (pending -> approved/discarded based on HIL)
    review_comment_url: Optional[str] = None  # URL of the main posted GitHub review comment
35
+
36
+ '''
37
+ from kaggle_secrets import UserSecretsClient
38
+ user_secrets = UserSecretsClient()
39
+ git_hub_token = user_secrets.get_secret("GITHUB_token_ID")
40
+ google_api_key = user_secrets.get_secret("GOOGLE_API_KEY")
41
+ '''
42
+
43
+ import os
44
+ from typing import Dict, Any, List, Optional, Tuple # Ensure Tuple is imported
45
+ from github import Github, PullRequest
46
+ from github.GithubException import GithubException, UnknownObjectException
47
+ import requests # Make sure requests is imported for patch_url
48
+ from dotenv import load_dotenv
49
+
50
# For local testing, environment variables may come from a .env file; in a
# deployed environment they are normally set directly on the process.
# Load the .env only when the token is not already present, so production
# setups (or a main script that already called load_dotenv) are not
# redundantly re-loaded.
if not os.getenv("GITHUB_TOKEN"): # Only load if token not already set
    load_dotenv() # Load environment variables from .env file
57
+
58
+ # Assuming 'git_hub_token' is defined globally or passed in a larger context
59
+ # If git_hub_token is expected to be a global variable, ensure it's imported or declared.
60
+ # For better practice, pass it as an argument or rely solely on os.getenv.
61
+ # Let's adjust to purely rely on os.getenv for this function.
62
+ # github_token = os.getenv("GITHUB_TOKEN") # Moved inside function for safety
63
+
64
+
65
def fetch_pr_code_changes(repo_name: str, pr_id: int) -> Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
    """
    Fetches the raw diff content, the full contents of changed files,
    and the head commit SHA for a given PR.

    Args:
        repo_name (str): The full name of the repository (e.g., "octocat/Spoon-Knife").
        pr_id (int): The ID of the Pull Request.

    Returns:
        Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
            - raw_diff_content (str or None): The raw diff content of the PR.
            - file_contents (Dict[str, str] or None): Dictionary mapping filename to its full content (after changes).
            - head_commit_sha (str or None): The SHA of the head commit of the PR.
            - error_message (str or None): An error message if something went wrong.
    """
    # BUGFIX: previously this read the undefined global `git_hub_token`
    # (defined only inside a commented-out Kaggle snippet), which raised
    # NameError at call time. Read the token from the environment instead.
    github_token = os.getenv("GITHUB_TOKEN")

    if not github_token:
        print("Error: GITHUB_TOKEN environment variable not set.")
        return None, None, None, "GitHub token not found in environment variables."

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pull_request = repo.get_pull(pr_id)

        # Pin everything we fetch to the PR's head commit.
        head_commit_sha = pull_request.head.sha
        print(f"Fetched PR {pr_id} head commit SHA: {head_commit_sha}")

        # 1. Fetch raw diff content (patch) via plain HTTP.
        # A timeout prevents the node from hanging on a stalled connection,
        # and raise_for_status keeps an HTML error page from being silently
        # treated as diff text (any HTTPError is handled by the outer except).
        patch_url = pull_request.patch_url
        headers = {"Authorization": f"token {github_token}"}
        patch_response = requests.get(patch_url, headers=headers, timeout=30)
        patch_response.raise_for_status()
        raw_diff_content = patch_response.text

        # 2. Fetch full content of each changed file at the PR head.
        file_contents: Dict[str, str] = {}
        for file in pull_request.get_files():
            # Deleted files have no content at the head commit.
            if file.status == 'deleted':
                file_contents[file.filename] = "[FILE DELETED]"
                continue

            try:
                # Use the head SHA (not the branch ref) so the content is
                # fixed to the exact commit under review even if the branch
                # moves while we run.
                file_content_obj = repo.get_contents(file.filename, ref=pull_request.head.sha)

                # get_contents returns a list for directories.
                if isinstance(file_content_obj, list):
                    print(f"Warning: '{file.filename}' is a directory or multiple files, skipping content retrieval for now.")
                    file_contents[file.filename] = "[DIRECTORY OR MULTIPLE FILES]"
                    continue

                file_contents[file.filename] = file_content_obj.decoded_content.decode('utf-8')

            except GithubException as e:
                # Per-file failures are recorded inline rather than aborting
                # the whole fetch — the review can still proceed on the rest.
                print(f"Warning: GitHub API error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e.status} - {e.data.get('message', 'No message')}")
                file_contents[file.filename] = f"[ERROR: Could not fetch content. Status: {e.status}, Message: {e.data.get('message', 'No message')}]"
            except Exception as e:
                print(f"Unexpected error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e}")
                file_contents[file.filename] = f"[ERROR: Unexpected error fetching content: {e}]"

        return raw_diff_content, file_contents, head_commit_sha, None  # No error message if successful

    except UnknownObjectException as e:
        error_msg = f"GitHub object not found (repo or PR): {e.data.get('message', 'No message')}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
    except GithubException as e:
        error_msg = f"GitHub API error for PR {pr_id} from {repo_name}: {e.status} - {e.data.get('message', 'No message')}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
    except Exception as e:
        error_msg = f"An unexpected error occurred while fetching PR {pr_id} from {repo_name}: {e}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
148
+
149
def code_retriever_node(state: PRReviewState):
    """LangGraph node: fetch the PR diff and changed-file contents into state.

    BUGFIX: the original discarded the error returned by
    fetch_pr_code_changes and always reported "code_retrieved"; failures are
    now recorded in ``last_error`` with a "failed" status so downstream
    routing can react.
    """
    repo_name = state.repo_name
    pull_req_id = state.pr_id

    print(f"repo_name :{repo_name}-------- pull_req_id:{pull_req_id}")

    diff, contents, head_commit_sha, error = fetch_pr_code_changes(repo_name, pull_req_id)

    if error:
        # Surface the failure instead of silently proceeding with None data.
        return state.model_copy(update={
            "review_status": "failed",
            "last_error": error,
        })

    # Nodes in LangGraph should always return an updated state.
    return state.model_copy(update={
        "review_status": "code_retrieved",
        "code_diff": diff,
        "file_contents": contents,
    })
165
+
166
+ import os
167
+ from typing import Dict, Any
168
+ from langchain_core.prompts import ChatPromptTemplate
169
+ # Ensure you have your LLM provider installed, e.g., pip install langchain-google-genai
170
+ from langchain_google_genai import ChatGoogleGenerativeAI # Using Gemini as per your preference
171
+
172
# Initialize the LLM once at import time.
# BUGFIX: the original passed the undefined global `google_api_key` (it only
# existed inside a commented-out Kaggle snippet), raising NameError at import.
# Read the key from the GOOGLE_API_KEY environment variable instead.
# temperature=0.0 keeps the review output as deterministic as possible.
# Other models such as "gemini-1.5-flash" or "gemini-1.5-pro" can be configured here.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.0, api_key=os.getenv("GOOGLE_API_KEY"))
175
+
176
def generate_code_review_markdown(code_diff: str, file_contents: Dict[str, str]) -> str:
    """
    Generates a detailed, human-readable code review in Markdown format from the LLM.

    The prompt is designed to elicit structured Markdown output that can then be
    parsed for GitHub PR comments, grouped by file and function.

    Args:
        code_diff (str): The string representation of the code diff.
        file_contents (Dict[str, str]): A dictionary where keys are file paths
                                        and values are their full content.

    Returns:
        str: A Markdown string representing the code review (or an error
             message string if the LLM call fails).
    """

    # Prepare full-file context: one separated, fenced block per changed file.
    full_contents_str = ""
    if file_contents:
        for filename, content in file_contents.items():
            # BUGFIX: the header previously hard-coded "(unknown)" and never
            # used the loop's `filename`, so every file block was unlabeled
            # and the LLM could not attribute context to files.
            full_contents_str += f"--- Full Content of {filename} ---\n```python\n{content}\n```\n\n"
    else:
        full_contents_str = "No full file contents provided for additional context."

    # Construct the Prompt Template
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system",
             "You are an expert Senior Software Engineer and a meticulous code reviewer.\n"
             "Your task is to review the provided code changes in a Pull Request.\n"
             "Analyze the `code_diff` for potential bugs, performance issues, security vulnerabilities, code style violations, maintainability concerns, and missing tests or documentation.\n"
             "Refer to the `full_file_contents` for additional context if the diff alone is insufficient to understand the changes or their implications.\n"
             "Provide a comprehensive, actionable, and constructive review.\n"
             "Format your review clearly using Markdown. Structure it with the following top-level sections:\n"
             "1. **Overall Impression:** A brief summary of the PR's purpose and overall quality.\n"
             "2. **Specific Observations and Suggestions:** Detailed feedback, grouped by file.\n"
             "   - Within each file's section, group related comments, ideally by function or logical block.\n"
             "   - For each observation/suggestion, include relevant line numbers from the *new* file for context (e.g., 'Line X-Y:').\n"
             "3. **Potential Issues and Edge Cases:** Discuss any missed scenarios or potential problems.\n"
             "4. **Security Implications:** Highlight any security concerns.\n"
             "5. **Adherence to Best Practices (PEP 8):** Comment on style and best practice compliance.\n"
             "6. **Performance Considerations:** Discuss performance aspects.\n"
             "7. **Unit Testing Suggestions:** Recommend additional tests.\n"
             "8. **Docstring/Comment Improvements:** Suggest documentation enhancements.\n"
             "9. **Clarity and Conciseness:** Feedback on code readability.\n"
             "10. **Summary:** A concise conclusion and recommended action (e.g., 'Approve', 'Request Changes', 'Comment').\n\n"
             "For code suggestions, use GitHub's Markdown code block with 'suggestion' annotation, like this:\n"
             "```suggestion\n"
             "your_suggested_code_here\n"
             "```\n"
             "Ensure file paths are correctly formatted (e.g., `src/utils/data_processor.py`)."
             ),
            ("human",
             "Here are the code changes (diff):\n"
             "```diff\n"
             "{code_diff}\n"
             "```\n\n"
             "Here are the full contents of the changed files (for additional context, use only if necessary to understand the diff):\n"
             "{full_contents_context}\n\n"
             "Please provide your structured code review in Markdown."
             ),
        ]
    )

    # Create the Chain
    review_chain = prompt | llm

    # Invoke the Chain; diff and file context are passed as template values,
    # so braces inside them are not re-interpreted by the template.
    try:
        review_markdown = review_chain.invoke({
            "code_diff": code_diff,
            "full_contents_context": full_contents_str
        }).content  # Access the content attribute for Chat model output
        return review_markdown
    except Exception as e:
        # Best-effort: return an error string rather than raising, so the
        # graph can continue and record the failure.
        print(f"Error generating code review: {e}")
        return f"Error: Could not generate code review. {e}\n\n" \
               f"Please check the LLM API call or token limits."
255
+
256
def code_reviewer_node(state: PRReviewState):
    """LangGraph node: run the LLM reviewer over the state's diff and files.

    Stores the raw Markdown review on the state and advances the status.
    """
    markdown = generate_code_review_markdown(state.code_diff, state.file_contents)

    # Every LangGraph node must hand back an updated state object.
    return state.model_copy(update={
        "review_status": "code_reviewed",
        "llm_markdown_review": markdown,
    })
269
+
270
+
271
+ import re
272
+ from typing import List, Dict, Tuple, Optional, Any
273
+
274
class ParsedComment:
    """
    A single review comment parsed out of the LLM's Markdown output,
    ready to be grouped by file/function.
    """

    def __init__(self, message: str, suggestion: Optional[str] = None):
        # `suggestion` holds code from a ```suggestion``` fence, if any.
        self.message = message
        self.suggestion = suggestion

    def __repr__(self):
        preview = self.message[:50]
        return f"ParsedComment(msg='{preview}...', has_suggestion={self.suggestion is not None})"
285
+
286
class ParsedReviewSection:
    """
    A named, free-text section of the review (e.g. 'Potential Issues').
    """

    def __init__(self, title: str, content: str):
        self.title = title
        self.content = content

    def __repr__(self):
        snippet = self.content[:50]
        return f"ParsedReviewSection(title='{self.title}', content='{snippet}...')"
296
+
297
+
298
+ # Helper to extract suggestion block and clean message
299
+ def _extract_suggestion(text: str) -> Tuple[Optional[str], str]:
300
+ """Helper to extract suggestion block and clean message."""
301
+ suggestion_match = re.search(r"```suggestion\n([\s\S]*?)\n```", text, re.MULTILINE)
302
+ suggestion_code = suggestion_match.group(1).strip() if suggestion_match else None
303
+
304
+ # Remove suggestion from the main message
305
+ cleaned_message = re.sub(r"```suggestion[\s\S]*?```", "", text).strip()
306
+ return suggestion_code, cleaned_message
307
+
308
def _parse_bullet_comments(text_block: str) -> List[ParsedComment]:
    """Parse the bullet-point comments out of one block of review text.

    Each bullet runs from its `-`/`*` marker up to the next bullet (or the
    end of the block), so multi-line comments and embedded suggestion
    fences stay attached to their bullet.
    """
    bullet_re = re.compile(r"(^ *[-*]\s*[\s\S]*?)(?=\n *[-*]\s*|\Z)", re.MULTILINE | re.DOTALL)

    parsed: List[ParsedComment] = []
    for match in bullet_re.finditer(text_block):
        raw = match.group(1).strip()
        if not raw:
            continue
        code, message = _extract_suggestion(raw)
        parsed.append(ParsedComment(message=message, suggestion=code))
    return parsed
321
+
322
+
323
def parse_llm_review_markdown(markdown_review: str) -> Dict[str, Any]:
    """
    Parses the LLM-generated Markdown review into a structured dictionary.
    It extracts the overall summary, file-specific/function-specific comments,
    and other general review sections.

    Args:
        markdown_review (str): The full Markdown string generated by the LLM.

    Returns:
        Dict[str, Any]: A dictionary containing structured review data:
            - 'overall_impression': str
            - 'file_comments': Dict[str, Dict[str, List[ParsedComment]]]
              (file_path -> function_name -> List[ParsedComment])
            - 'general_sections': List[ParsedReviewSection]
            - 'summary': str
            - 'approval_status': str (extracted from summary, if present)
    """
    structured_review: Dict[str, Any] = {
        'overall_impression': '',
        'file_comments': {},
        'general_sections': [],
        'summary': '',
        'approval_status': 'Comment'  # Default status when no explicit action is found
    }

    # Helper to extract content between two headers.
    # Flexible on purpose: allows optional numbering ("## 1. Title:") and
    # either '##' or '###' as the header level for top sections.
    def extract_section_content(text: str, start_header_text: str, end_header_text: str) -> Optional[str]:
        # Pattern to match headers with optional numbering and flexible spacing
        start_pattern = r"^(?:##|###)\s*\d*\.?\s*" + re.escape(start_header_text) + r":\s*$"
        end_pattern = r"^(?:##|###)\s*\d*\.?\s*" + re.escape(end_header_text) + r":\s*$"

        # Lookahead stops at the end header (or end of string) without consuming it.
        match = re.search(f"{start_pattern}([\\s\\S]*?)(?={end_pattern}|\\Z)", text, re.MULTILINE | re.DOTALL)
        if match:
            return match.group(1).strip()
        return None

    # --- 1. Extract Overall Impression ---
    overall_impression_content = extract_section_content(markdown_review, "Overall Impression", "Specific Observations and Suggestions")
    if overall_impression_content:
        structured_review['overall_impression'] = overall_impression_content

    # --- 2. Extract Specific Observations and Suggestions (File/Function Comments) ---
    specific_obs_section_content = extract_section_content(markdown_review, "Specific Observations and Suggestions", "Potential Issues and Edge Cases")

    # Debug prints for specific_obs_section_content (kept for verification)
    print(f"\n--- DEBUG: specific_obs_section_content (extracted from markdown_review) ---")
    if specific_obs_section_content is None:
        print("specific_obs_section_content is None")
    elif not specific_obs_section_content.strip():
        print("specific_obs_section_content is empty or only whitespace")
    else:
        print(specific_obs_section_content[:500] + "..." if len(specific_obs_section_content) > 500 else specific_obs_section_content)
    print(f"--- END DEBUG: specific_obs_section_content ---\n")

    print(f"\n--- DEBUG: Raw specific_obs_section_content (using repr()):")
    if specific_obs_section_content is not None:
        print(repr(specific_obs_section_content))
        print(f"Length of specific_obs_section_content: {len(specific_obs_section_content)}")
        # NOTE(review): 'data_processor.py' here is a leftover from a specific
        # test fixture — it only affects this debug print, not parsing.
        print(f"Does it start with '### `data_processor.py`'? {specific_obs_section_content.startswith('### `data_processor.py`')}")
        starts_as_file_header = False
        if specific_obs_section_content.startswith('### `') or specific_obs_section_content.startswith('**File:'):
            starts_as_file_header = True
        print(f"Does it start with a common file header pattern? {starts_as_file_header}")
    else:
        print("specific_obs_section_content is None.")
    print(f"--- END DEBUG: Raw specific_obs_section_content ---\n")


    if specific_obs_section_content:
        # File-block parsing strategy:
        # Step 1: find every file header line first, then slice the text
        # between consecutive headers into per-file content blocks.
        # Accepts either '**File: `path`**' or '### `path`' header styles.
        file_header_line_pattern = re.compile(
            r"^(?:\*\*File:\s*`?([\w\/\.\-_]+\.\w+)`?\*\*|###\s*`?([\w\/\.\-_]+\.\w+)`?)\s*$",
            re.MULTILINE
        )

        header_matches = list(file_header_line_pattern.finditer(specific_obs_section_content))

        print(f"--- DEBUG: Number of file_header_line_pattern matches found (New Strategy): {len(header_matches)} ---")
        if not header_matches:
            print("No file headers were found. Cannot parse file blocks.")
            pass
        else:
            # Step 2: iterate through header matches and extract content blocks
            for i, header_match in enumerate(header_matches):
                # The filename lives in group 1 ('**File:' form) or group 2 ('###' form).
                file_name = (header_match.group(1) or header_match.group(2)).strip().replace('`', '')

                # Content for this file starts right after its header line...
                content_start_index = header_match.end()

                # ...and ends at the next file header (or end of the section).
                content_end_index = len(specific_obs_section_content)
                if i + 1 < len(header_matches):
                    content_end_index = header_matches[i+1].start()

                file_content_block = specific_obs_section_content[content_start_index:content_end_index].strip()

                print(f"\n--- DEBUG: Processing file (new strategy): {file_name} ---")
                print(f"File content block (first 200 chars):\n{file_content_block[:200]}..." if len(file_content_block) > 200 else file_content_block)

                if not file_name: continue

                structured_review['file_comments'][file_name] = {}
                general_comments_for_file: List[ParsedComment] = []

                # Split the file block into sub-sections:
                # matches '#### Function: `func_name`' (group 2 = name) OR any
                # other '#### ...' section title.
                sub_section_header_pattern = re.compile(
                    r"^(####\s*(?:Function:\s*`?([\w_]+)`?|[\s\S]+?))\s*$",
                    re.MULTILINE
                )

                sub_section_matches_list = list(sub_section_header_pattern.finditer(file_content_block))
                print(f"--- DEBUG: Number of sub-section (####) matches for {file_name}: {len(sub_section_matches_list)} ---")
                if not sub_section_matches_list:
                    # No '####' headers: treat the whole block as file-level comments.
                    print(f"No '####' sub-sections were found in the block for {file_name}. All content will be general comments or missed.")
                    if file_content_block.strip():
                        parsed_general_comments = _parse_bullet_comments(file_content_block.strip())
                        structured_review['file_comments'][file_name]["General_File_Comments"] = parsed_general_comments
                        print(f" - DEBUG: Parsed {len(parsed_general_comments)} general comments for {file_name}.")
                    continue

                # Sub-sections exist: text before the first '####' header is
                # treated as file-level (general) comments.
                first_match_start_index = sub_section_matches_list[0].start()
                pre_section_comments_content = file_content_block[:first_match_start_index].strip()
                if pre_section_comments_content:
                    general_comments_for_file.extend(_parse_bullet_comments(pre_section_comments_content))
                    print(f" - DEBUG: Added {len(general_comments_for_file)} general comments (before first sub-section) for {file_name}.")


                # Process each sub-section: slice its body up to the next
                # sub-section header (or the end of the file block).
                for k, current_match in enumerate(sub_section_matches_list):
                    section_header_raw = current_match.group(1).strip()
                    func_name_from_group = current_match.group(2)

                    section_title_key = ""
                    if func_name_from_group:
                        section_title_key = func_name_from_group.replace('`', '')
                    else:
                        # Fall back to the header text after '####' as the key.
                        section_title_key = section_header_raw[section_header_raw.find('####') + 4:].strip().replace('`', '')

                    content_start_index = current_match.end()
                    content_end_index = (sub_section_matches_list[k+1].start()
                                         if k + 1 < len(sub_section_matches_list)
                                         else len(file_content_block))

                    sub_section_content = file_content_block[content_start_index:content_end_index].strip()

                    print(f" - DEBUG: Sub-section '{section_title_key}' content (first 100 chars): {sub_section_content[:100]}..." if len(sub_section_content) > 100 else sub_section_content)
                    if sub_section_content:
                        parsed_comments_for_section = _parse_bullet_comments(sub_section_content)
                        structured_review['file_comments'][file_name][section_title_key] = parsed_comments_for_section
                        print(f" - DEBUG: Parsed {len(parsed_comments_for_section)} comments for '{section_title_key}'.")
                    else:
                        structured_review['file_comments'][file_name][section_title_key] = []
                        print(f" - DEBUG: No content for sub-section '{section_title_key}'.")

                if general_comments_for_file:
                    structured_review['file_comments'][file_name]["General_File_Comments"] = general_comments_for_file


    # --- 3. Extract General Sections ---
    # (title to store, header text to search for) — kept identical here, but
    # separated so display titles could diverge from header text if needed.
    general_section_headers = [
        ("Potential Issues and Edge Cases", "Potential Issues and Edge Cases"),
        ("Security Implications", "Security Implications"),
        ("Adherence to Best Practices (PEP 8)", "Adherence to Best Practices (PEP 8)"),
        ("Performance Considerations", "Performance Considerations"),
        ("Unit Testing Suggestions", "Unit Testing Suggestions"),
        ("Docstring/Comment Improvements", "Docstring/Comment Improvements"),
        ("Clarity and Conciseness", "Clarity and Conciseness"),
        ("Summary", "Summary"),
    ]

    current_markdown_to_parse = markdown_review

    # Fast-forward past the file-specific sections so general-section headers
    # inside file comments cannot be matched by mistake.
    start_parsing_from_match = re.search(r"^##\s*\d*\.?\s*Potential Issues and Edge Cases:\s*$", current_markdown_to_parse, re.MULTILINE)
    if not start_parsing_from_match:
        specific_obs_end_idx = 0
        specific_obs_match = re.search(r"^##\s*\d*\.?\s*Specific Observations and Suggestions:\s*([\s\S]*?)(?=^##\s*\d*\.?\s*[\w\s\(\)\/]+:|\Z)", current_markdown_to_parse, re.MULTILINE | re.DOTALL)
        if specific_obs_match:
            current_markdown_to_parse = current_markdown_to_parse[specific_obs_match.end():].strip()
        else:
            pass
    else:
        current_markdown_to_parse = current_markdown_to_parse[start_parsing_from_match.start():].strip()


    # Walk the expected section order; current_markdown_to_parse shrinks as
    # each section is consumed, so the order of headers matters.
    for i, (title, header_text) in enumerate(general_section_headers):
        current_header_pattern = r"^##\s*\d*\.?\s*" + re.escape(header_text) + r":\s*$"

        start_match = re.search(current_header_pattern, current_markdown_to_parse, re.MULTILINE)
        if not start_match:
            continue

        section_start_idx = start_match.end()

        section_end_idx = len(current_markdown_to_parse)

        # Stop at the next expected section header, if it is present.
        if i + 1 < len(general_section_headers):
            next_header_text = general_section_headers[i+1][1]
            next_header_pattern = r"^##\s*\d*\.?\s*" + re.escape(next_header_text) + r":\s*$"
            next_match = re.search(next_header_pattern, current_markdown_to_parse[section_start_idx:], re.MULTILINE)
            if next_match:
                section_end_idx = section_start_idx + next_match.start()

        content_raw = current_markdown_to_parse[section_start_idx:section_end_idx].strip()

        if title == "Summary":
            structured_review['summary'] = content_raw
            # Strip a trailing code fence the LLM sometimes appends.
            structured_review['summary'] = re.sub(r'(`{3,})\s*$', '', structured_review['summary']).strip()

            # Pull the recommended action out of the summary, e.g.
            # '**Recommended Action:** Request Changes'.
            approval_match = re.search(r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment|No action required)", structured_review['summary'], re.IGNORECASE | re.MULTILINE)
            if approval_match:
                # NOTE(review): this normalizes e.g. 'Request Changes' to
                # 'Requestchanges' (spaces removed, then capitalized) —
                # confirm downstream consumers expect that form.
                structured_review['approval_status'] = approval_match.group(1).strip().replace(' ', '').capitalize()
            else:
                structured_review['approval_status'] = 'Comment'
        else:
            structured_review['general_sections'].append(ParsedReviewSection(title=title, content=content_raw))

        # Consume this section before searching for the next one.
        current_markdown_to_parse = current_markdown_to_parse[section_end_idx:].strip()

    # Fallback: if the ordered walk never filled the summary, search the whole
    # original Markdown for a Summary header.
    if not structured_review['summary']:
        summary_match = re.search(r"^##\s*\d*\.?\s*Summary:\s*([\s\S]*)$", markdown_review, re.MULTILINE | re.DOTALL)
        if summary_match:
            structured_review['summary'] = summary_match.group(1).strip()
            structured_review['summary'] = re.sub(r'(`{3,})\s*$', '', structured_review['summary']).strip()

            approval_match = re.search(r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment|No action required)", structured_review['summary'], re.IGNORECASE | re.MULTILINE)
            if approval_match:
                structured_review['approval_status'] = approval_match.group(1).strip().replace(' ', '').capitalize()
            else:
                structured_review['approval_status'] = 'Comment'
        else:
            structured_review['summary'] = "Automated review completed."

    return structured_review
562
+
563
def feedback_formatter_node(state: PRReviewState):
    """LangGraph node: parse the raw Markdown review into structured data."""
    structured = parse_llm_review_markdown(state.llm_markdown_review)

    # Every LangGraph node must hand back an updated state object.
    return state.model_copy(update={
        "review_status": "review_parsed",
        "parsed_llm_review_data": structured,
    })
575
+
576
+
577
+ from github import Github, PullRequest
578
+ from github.GithubException import GithubException, UnknownObjectException
579
+ from github.Commit import Commit # Import Commit type for clarity and correctness
580
+ from typing import Dict, Any, List, Optional
581
+ import os
582
+ import re
583
+ import logging
584
+
585
+ # IMPORTANT: These classes should be imported from src.utils.markdown_parser
586
+ # For standalone execution or if import paths are complex, ensure they are correctly defined or imported.
587
class ParsedComment:
    # NOTE(review): duplicate of the ParsedComment defined earlier in this
    # file; per the comment above, both should live in (and be imported
    # from) src.utils.markdown_parser instead of being redefined here.
    def __init__(self, message: str, suggestion: Optional[str] = None):
        self.message = message
        self.suggestion = suggestion

    def __repr__(self):
        has_code = self.suggestion is not None
        return f"ParsedComment(msg='{self.message[:50]}...', has_suggestion={has_code})"
593
+
594
class ParsedReviewSection:
    """A titled free-text section extracted from the LLM review markdown."""

    def __init__(self, title: str, content: str):
        self.title = title      # section heading text
        self.content = content  # markdown body of the section

    def __repr__(self):
        snippet = self.content[:50]
        return f"ParsedReviewSection(title='{self.title}', content='{snippet}...')"
600
+
601
+
602
# Configure logging (optional, but good practice)
# NOTE: basicConfig only takes effect if the root logger has no handlers yet
# (e.g. in a notebook that already configured logging, this is a no-op).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
604
+
605
+
606
def post_review_comments_on_github(
    repo_name: str,
    pr_id: int,
    parsed_review_data: Dict[str, Any],
    github_token: Optional[str] = None
) -> Dict[str, Any]:
    """
    Posts a structured code review to a GitHub Pull Request.

    Args:
        repo_name (str): The full name of the repository (e.g., "owner/repo").
        pr_id (int): The Pull Request number.
        parsed_review_data (Dict[str, Any]): The structured review data
                                             as returned by parse_llm_review_markdown.
        github_token (str, optional): GitHub Personal Access Token.
                                      If None, tries to read from GITHUB_TOKEN env var.

    Returns:
        Dict[str, Any]: A dictionary containing details of the posted review,
                        e.g., {'status': 'success', 'review_url': '...', 'review_id': ...,
                        'main_comment_body': '...'}.

    Raises:
        ValueError: if no token is available, or the repo/PR cannot be found.
        RuntimeError: on GitHub API errors or any unexpected failure.
    """
    if github_token is None:
        github_token = os.getenv("GITHUB_TOKEN")
    if github_token is None:
        logging.error("GitHub token not provided and GITHUB_TOKEN environment variable not set.")
        raise ValueError("GitHub token not provided and GITHUB_TOKEN environment variable not set.")

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pr = repo.get_pull(pr_id)
        logging.info(f"Connected to GitHub repo '{repo_name}', PR #{pr_id}.")

        # --- 1. Prepare the Main Review Body ---
        overall_impression = parsed_review_data.get('overall_impression', '')
        general_sections = parsed_review_data.get('general_sections', [])
        summary = parsed_review_data.get('summary', '')
        approval_status = parsed_review_data.get('approval_status', 'COMMENT').upper()

        main_review_body = f"### 🤖 Automated Code Review\n\n"

        if overall_impression.strip():
            main_review_body += f"**Overall Impression:**\n{overall_impression}\n\n---\n\n"

        for section in general_sections:
            # Skip sections whose content is only whitespace.
            if section.content.strip():
                main_review_body += f"### {section.title}\n{section.content}\n\n---\n\n"

        if summary.strip():
            main_review_body += f"### Summary\n{summary}\n\n"

        main_review_body += f"**Recommended Action:** {approval_status}\n"

        # FIX: the upstream parser strips spaces from the status (e.g.
        # "Request Changes" -> "Requestchanges"), so the old exact comparison
        # with "REQUEST CHANGES" never matched and such reviews were silently
        # posted as COMMENT. Normalize spaces/underscores before matching.
        normalized_status = approval_status.replace(" ", "").replace("_", "")
        if normalized_status == "APPROVE":
            github_event = "APPROVE"
        elif normalized_status == "REQUESTCHANGES":
            github_event = "REQUEST_CHANGES"
        else:
            github_event = "COMMENT"

        logging.info(f"Calculated GitHub review event: {github_event}")

        # --- 2. Prepare Line/File Comments ---
        github_comments: List[Dict[str, Any]] = []
        file_comments_data = parsed_review_data.get('file_comments', {})

        head_commit_sha = pr.head.sha
        # create_review expects a Commit object, not a bare SHA string.
        pr_commit_obj = repo.get_commit(head_commit_sha)
        logging.info(f"Using head commit SHA: {head_commit_sha} (as Commit object)")

        if file_comments_data:
            logging.info(f"Preparing {len(file_comments_data)} file-specific comments.")
            for file_path, functions_data in file_comments_data.items():
                consolidated_file_comment_body = f"### Review for `{file_path}`\n\n"
                has_content = False  # tracks whether any comment text was added

                # "General_File_Comments" sorts first, then functions alphabetically.
                sorted_func_names = sorted(
                    functions_data.keys(),
                    key=lambda x: (0 if x == "General_File_Comments" else 1, x)
                )

                for func_name in sorted_func_names:
                    comments_for_func = functions_data[func_name]

                    if not comments_for_func:
                        continue
                    has_content = True

                    if func_name != "General_File_Comments":
                        consolidated_file_comment_body += f"#### ⚙️ Function: `{func_name}`\n\n"
                    else:
                        # (The original length-based guard here was always true
                        # in this branch, so the header is added unconditionally.)
                        consolidated_file_comment_body += f"#### 📄 General File Comments\n\n"

                    for comment in comments_for_func:
                        consolidated_file_comment_body += f"{comment.message}\n"
                        if comment.suggestion:
                            consolidated_file_comment_body += f"\n```suggestion\n{comment.suggestion}\n```\n\n"
                    consolidated_file_comment_body += "\n---\n\n"

                if has_content:
                    github_comments.append({
                        "path": file_path,
                        # NOTE(review): position 1 anchors the comment to the first
                        # position of the file's diff; the API call will fail if that
                        # file has no diff position 1 — TODO map comments to real
                        # diff positions instead of hard-coding 1.
                        "position": 1,
                        "body": consolidated_file_comment_body.strip(),
                    })

        # --- 3. Submit the Review ---
        # Pass the Commit object to the 'commit' parameter.
        review = pr.create_review(
            commit=pr_commit_obj,
            body=main_review_body,
            event=github_event,
            comments=github_comments
        )

        logging.info(f"Successfully posted GitHub review. URL: {review.html_url}")
        return {
            'status': 'success',
            'review_url': review.html_url,
            'review_id': review.id,
            'main_comment_body': main_review_body
        }

    except UnknownObjectException as e:
        logging.error(f"GitHub object not found (repo or PR): {e}")
        raise ValueError(f"GitHub object not found (repo or PR): {e}")
    except GithubException as e:
        logging.error(f"GitHub API error: {e}")
        raise RuntimeError(f"GitHub API error: {e}")
    except Exception as e:
        logging.critical(f"An unexpected error occurred while posting review: {e}", exc_info=True)
        raise RuntimeError(f"An unexpected error occurred while posting review: {e}")
738
+
739
+
740
def post_review_coments_on_github_node(state: PRReviewState):
    """LangGraph node: publish the parsed review back to the GitHub PR.

    Calls ``post_review_comments_on_github`` with the repo/PR identifiers held
    on the state and returns a state copy recording the posting outcome.
    """
    repo_name = state.repo_name
    pr_id = state.pr_id
    parsed_llm_review_data = state.parsed_llm_review_data

    # FIX: `git_hub_token` was referenced as a bare global and raised
    # NameError when the surrounding notebook/session had not defined it.
    # Fall back to None so the helper reads the GITHUB_TOKEN env var instead.
    try:
        token = git_hub_token  # noqa: F821 — may be defined by the caller's session
    except NameError:
        token = None

    result = post_review_comments_on_github(repo_name, pr_id, parsed_llm_review_data, token)

    # LangGraph nodes must return an updated state, never mutate in place.
    return state.model_copy(update={
        "review_status": "posted",
        "review_comment_url": result['review_url'],
        "review_id": result['review_id'],
        "last_error": result['status'],  # TODO: repurpose this field for real errors
    })
757
+
758
+
759
from IPython.display import Image, display
from langgraph.graph import StateGraph, START, END

# Assemble the linear PR-review pipeline:
# retrieve code -> LLM review -> parse feedback -> post review to GitHub.
builder = StateGraph(PRReviewState)

_pipeline = [
    ("code_retriever_node", code_retriever_node),
    ("code_reviewer_node", code_reviewer_node),
    ("feedback_formatter_node", feedback_formatter_node),
    ("post_review_coments_on_github_node", post_review_coments_on_github_node),
]
for _name, _fn in _pipeline:
    builder.add_node(_name, _fn)

# Wire the nodes in sequence: START -> n1 -> n2 -> ... -> END.
builder.add_edge(START, _pipeline[0][0])
for (_src, _), (_dst, _) in zip(_pipeline, _pipeline[1:]):
    builder.add_edge(_src, _dst)
builder.add_edge(_pipeline[-1][0], END)

# need to fix ParsedComment serializable error
#graph = builder.compile(checkpointer=memory)
graph = builder.compile()


# View
#display(Image(graph.get_graph().draw_mermaid_png()))