# nikhmr1235's picture
# update nodes with changes lost during refactoring
# d47a09f verified
import os
import re
import sys
import logging
from typing import Dict, Any, List, Optional, Tuple
from github import Github, PullRequest
from github.GithubException import GithubException, UnknownObjectException
import requests
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from .state import PRReviewState, LLMReviewOutput, ParsedReviewSection, ParsedComment, FileReviewComments
# --- Environment Variable Loading ---
# Load the .env file BEFORE reading any variable: os.getenv() only sees what is
# already in the process environment, so calling load_dotenv() afterwards means
# values that exist only in .env are never picked up.
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
git_hub_token = os.getenv("GITHUB_token_ID")
if not google_api_key:
    print("Google API key not found in environment variables.")
if not git_hub_token:
    print("GITHUB_token_ID not found in environment variables.")
# --- LLM Initialization ---
# Module-level chat model instance; temperature 0.0 is requested explicitly.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-lite",
    temperature=0.0,
    api_key=google_api_key,
)
# --- Logging Configuration ---
# Configure the root logger: INFO level, timestamped records, written to stdout.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)
# --- GitHub API Functions ---
def _github_error_message(exc: GithubException) -> str:
    """Return the 'message' field of a GithubException payload, or 'No message'."""
    data = getattr(exc, "data", None)
    # The payload is not guaranteed to be a dict; only then can we look up 'message'.
    if isinstance(data, dict):
        return data.get('message', 'No message')
    return 'No message'


def fetch_pr_code_changes(repo_name: str, pr_id: int) -> Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
    """
    Fetches the patch text, the full contents of changed files,
    and the head commit SHA for a given PR.

    Args:
        repo_name (str): The full name of the repository (e.g., "octocat/Spoon-Knife").
        pr_id (int): The ID of the Pull Request.

    Returns:
        Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
            - raw_diff_content (str or None): Text downloaded from the PR's patch URL.
            - file_contents (Dict[str, str] or None): Maps each changed filename to its
              content at the PR head. Deleted files, directories and files whose content
              could not be fetched map to a bracketed placeholder string instead.
            - head_commit_sha (str or None): The SHA of the head commit of the PR.
            - error_message (str or None): Set (with the other three values None) when
              the PR itself could not be fetched; None on success.
    """
    github_token = git_hub_token
    if not github_token:
        print("Error: GITHUB_token_ID environment variable not set.")
        return None, None, None, "GitHub token not found in environment variables."

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pull_request = repo.get_pull(pr_id)

        head_commit_sha = pull_request.head.sha
        print(f"Fetched PR {pr_id} head commit SHA: {head_commit_sha}")

        # 1. Fetch the patch text directly over HTTP.
        headers = {"Authorization": f"token {github_token}"}
        # A timeout keeps a stalled connection from hanging the caller forever, and
        # raise_for_status() prevents an HTTP error page from being returned as
        # if it were the diff. Both failures land in the generic handler below.
        response = requests.get(pull_request.patch_url, headers=headers, timeout=30)
        response.raise_for_status()
        raw_diff_content = response.text

        # 2. Fetch full content of changed files (as of the PR head commit).
        file_contents: Dict[str, str] = {}
        for file in pull_request.get_files():
            # Deleted files have no content at the head commit.
            if file.status == 'deleted':
                file_contents[file.filename] = "[FILE DELETED]"
                continue

            try:
                # Pin to the head SHA rather than the branch ref, which may move.
                file_content_obj = repo.get_contents(file.filename, ref=head_commit_sha)
                if isinstance(file_content_obj, list):
                    print(f"Warning: '{file.filename}' is a directory or multiple files, skipping content retrieval for now.")
                    file_contents[file.filename] = "[DIRECTORY OR MULTIPLE FILES]"
                    continue
                file_contents[file.filename] = file_content_obj.decoded_content.decode('utf-8')
            except GithubException as e:
                message = _github_error_message(e)
                print(f"Warning: GitHub API error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e.status} - {message}")
                file_contents[file.filename] = f"[ERROR: Could not fetch content. Status: {e.status}, Message: {message}]"
            except Exception as e:
                # Per-file failures (e.g. content that is not valid UTF-8) are recorded
                # as placeholders so one bad file does not abort the whole fetch.
                print(f"Unexpected error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e}")
                file_contents[file.filename] = f"[ERROR: Unexpected error fetching content: {e}]"

        return raw_diff_content, file_contents, head_commit_sha, None  # No error message if successful

    except UnknownObjectException as e:
        error_msg = f"GitHub object not found (repo or PR): {_github_error_message(e)}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
    except GithubException as e:
        error_msg = f"GitHub API error for PR {pr_id} from {repo_name}: {e.status} - {_github_error_message(e)}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
    except Exception as e:
        error_msg = f"An unexpected error occurred while fetching PR {pr_id} from {repo_name}: {e}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
def _format_main_review_body(parsed_review_data: LLMReviewOutput) -> str:
    """Build the top-level review body: impression, general sections, summary, action."""
    parts = ["### 🤖 Automated Code Review\n\n"]
    if parsed_review_data.overall_impression:
        parts.append(f"**Overall Impression:**\n{parsed_review_data.overall_impression}\n\n---\n\n")
    for section in parsed_review_data.general_sections:
        # Sections whose content is only whitespace are omitted.
        if section.content.strip():
            parts.append(f"### {section.title}\n{section.content}\n\n---\n\n")
    if parsed_review_data.summary:
        parts.append(f"### Summary\n{parsed_review_data.summary}\n\n")
    parts.append(f"**LLM Recommended Action:** {parsed_review_data.approval_status.upper()}\n")
    return "".join(parts)


def _format_file_comment(file_review: FileReviewComments) -> Optional[str]:
    """Build one consolidated comment for a file, or None if it has no comments."""
    parts = []
    for func_name, comments in file_review.sections.items():
        if not comments:
            continue
        if func_name == "General_File_Comments":
            parts.append("#### 📄 General File Comments\n\n")
        else:
            parts.append(f"#### ⚙️ Function: `{func_name}`\n\n")
        for comment in comments:
            parts.append(f"{comment.message}\n")
            if comment.suggestion:
                parts.append(f"\n```suggestion\n{comment.suggestion}\n```\n\n")
        parts.append("\n---\n\n")
    if not parts:
        return None
    heading = f"### Review for `{file_review.file_path}`\n\n"
    return (heading + "".join(parts)).strip()


def post_review_comments_on_github(
    repo_name: str, pr_id: int, parsed_review_data: LLMReviewOutput,
    github_token: Optional[str] = None, final_event: Optional[str] = "COMMENT"
) -> Dict[str, Any]:
    """Posts a structured code review to a GitHub Pull Request.

    Args:
        repo_name: Full repository name, e.g. "owner/repo".
        pr_id: Pull request number.
        parsed_review_data: Parsed review whose sections form the main body and
            whose file reviews become one consolidated comment per file.
        github_token: Token to authenticate with. When None, the GITHUB_TOKEN
            environment variable is used, then GITHUB_token_ID (the name the
            rest of this module reads).
        final_event: Review event passed to GitHub. When None, no event is
            sent at all instead of passing None through to PyGithub.

    Returns:
        Dict with 'status', 'review_url', 'review_id' and 'main_comment_body'.

    Raises:
        ValueError: If no token is supplied or found in the environment.
        RuntimeError: If any step of talking to GitHub fails; the original
            exception is attached as the cause.
    """
    if github_token is None:
        github_token = os.getenv("GITHUB_TOKEN") or os.getenv("GITHUB_token_ID")
    if not github_token:
        raise ValueError("GitHub token not provided.")

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pr = repo.get_pull(pr_id)

        main_review_body = _format_main_review_body(parsed_review_data)

        # The review is attached to the PR's current head commit.
        pr_commit_obj = repo.get_commit(pr.head.sha)

        github_comments = []
        for file_review in parsed_review_data.file_reviews:
            comment_body = _format_file_comment(file_review)
            if comment_body is None:
                continue
            github_comments.append({
                # Every file comment is anchored at diff position 1; the parsed
                # review carries no per-line position information.
                "path": file_review.file_path, "position": 1,
                "body": comment_body,
            })

        review_kwargs: Dict[str, Any] = {
            "commit": pr_commit_obj,
            "body": main_review_body,
            "comments": github_comments,
        }
        # Only pass an event when one was requested; None is not a valid event value.
        if final_event is not None:
            review_kwargs["event"] = final_event
        review = pr.create_review(**review_kwargs)

        return {'status': 'success', 'review_url': review.html_url, 'review_id': review.id, 'main_comment_body': main_review_body}
    except Exception as e:
        logging.error(f"Error posting review: {e}")
        raise RuntimeError(f"Failed to post review: {e}") from e
def update_submitted_review_body(
    repo_name: str, pr_id: int, review_id: int, new_body: str, github_token: Optional[str] = None
) -> Dict[str, Any]:
    """Updates the main body of an already submitted GitHub PR review.

    Args:
        repo_name: Full repository name, e.g. "owner/repo".
        pr_id: Pull request number.
        review_id: ID of the review to edit.
        new_body: Replacement text for the review body.
        github_token: Token to authenticate with. When None, the GITHUB_TOKEN
            environment variable is used, then GITHUB_token_ID (the name the
            rest of this module reads).

    Returns:
        On success: dict with 'status' == 'success', 'review_url', 'review_id'
        and 'updated_body'. If the review is still PENDING: dict with
        'status' == 'error' and a 'message' (nothing is edited).

    Raises:
        ValueError: If no token is supplied or found in the environment.
        RuntimeError: If any GitHub call fails; the original exception is
            attached as the cause.
    """
    if github_token is None:
        github_token = os.getenv("GITHUB_TOKEN") or os.getenv("GITHUB_token_ID")
    if not github_token:
        raise ValueError("GitHub token not provided.")

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pr = repo.get_pull(pr_id)
        review = pr.get_review(review_id)

        # Pending reviews are reported as an error result instead of being edited.
        if review.state == "PENDING":
            return {'status': 'error', 'message': 'Cannot update body of a pending review.'}

        review.edit(body=new_body)
        return {'status': 'success', 'review_url': review.html_url, 'review_id': review.id, 'updated_body': review.body}
    except Exception as e:
        logging.error(f"Error updating review body: {e}")
        raise RuntimeError(f"Failed to update review body: {e}") from e
# --- LLM and Parsing Functions ---
def generate_code_review_markdown(code_diff: str, file_contents: Dict[str, str]) -> str:
    """
    Ask the module-level LLM for a Markdown code review of a PR.

    The system prompt requests a fixed set of numbered top-level sections so the
    result can later be parsed into per-file comments.

    Args:
        code_diff (str): The diff text of the change set.
        file_contents (Dict[str, str]): Maps file path to full file content,
            supplied to the model as extra context. May be empty/falsy.

    Returns:
        str: The model's Markdown review. If the chain raises, an
        "Error: Could not generate code review. ..." string is returned
        instead of propagating the exception.
    """
    # Render every file as its own fenced block, or fall back to a fixed notice.
    if file_contents:
        context_text = "".join(
            f"--- Full Content of {path} ---\n```python\n{body}\n```\n\n"
            for path, body in file_contents.items()
        )
    else:
        context_text = "No full file contents provided for additional context."

    system_message = (
        "You are an expert Senior Software Engineer and a meticulous code reviewer.\n"
        "Your task is to review the provided code changes in a Pull Request.\n"
        "Analyze the `code_diff` for potential bugs, performance issues, security vulnerabilities, code style violations, maintainability concerns, and missing tests or documentation.\n"
        "Refer to the `full_file_contents` for additional context if the diff alone is insufficient to understand the changes or their implications.\n"
        "Provide a comprehensive, actionable, and constructive review.\n"
        "Format your review clearly using Markdown. Structure it with the following top-level sections:\n"
        "1.  **Overall Impression:** A brief summary of the PR's purpose and overall quality.\n"
        "2.  **Specific Observations and Suggestions:** Detailed feedback, grouped by file.\n"
        "    - Within each file's section, group related comments, ideally by function or logical block.\n"
        "    - For each observation/suggestion, include relevant line numbers from the *new* file for context (e.g., 'Line X-Y:').\n"
        "3.  **Potential Issues and Edge Cases:** Discuss any missed scenarios or potential problems.\n"
        "4.  **Security Implications:** Highlight any security concerns.\n"
        "5.  **Adherence to Best Practices (PEP 8):** Comment on style and best practice compliance.\n"
        "6.  **Performance Considerations:** Discuss performance aspects.\n"
        "7.  **Unit Testing Suggestions:** Recommend additional tests.\n"
        "8.  **Docstring/Comment Improvements:** Suggest documentation enhancements.\n"
        "9.  **Clarity and Conciseness:** Feedback on code readability.\n"
        "10. **Summary:** A concise conclusion and recommended action (e.g., 'Approve', 'Request Changes', 'Comment').\n\n"
        "For code suggestions, use GitHub's Markdown code block with 'suggestion' annotation, like this:\n"
        "```suggestion\n"
        "your_suggested_code_here\n"
        "```\n"
        "Ensure file paths are correctly formatted (e.g., `src/utils/data_processor.py`)."
    )
    human_message = (
        "Here are the code changes (diff):\n"
        "```diff\n"
        "{code_diff}\n"
        "```\n\n"
        "Here are the full contents of the changed files (for additional context, use only if necessary to understand the diff):\n"
        "{full_contents_context}\n\n"
        "Please provide your structured code review in Markdown."
    )

    template = ChatPromptTemplate.from_messages(
        [("system", system_message), ("human", human_message)]
    )
    chain = template | llm

    chain_inputs = {
        "code_diff": code_diff,
        "full_contents_context": context_text,
    }
    try:
        # Chat models return a message object; the text lives on .content.
        return chain.invoke(chain_inputs).content
    except Exception as e:
        print(f"Error generating code review: {e}")
        return (
            f"Error: Could not generate code review. {e}\n\n"
            f"Please check the LLM API call or token limits."
        )
# Helper to extract suggestion block and clean message (No change needed)
def _extract_suggestion(text: str) -> Tuple[Optional[str], str]:
"""Helper to extract suggestion block and clean message."""
suggestion_match = re.search(r"```suggestion\n([\s\S]*?)\n```", text, re.MULTILINE)
suggestion_code = suggestion_match.group(1).strip() if suggestion_match else None
# Remove suggestion from the main message
cleaned_message = re.sub(r"```suggestion[\s\S]*?```", "", text).strip()
return suggestion_code, cleaned_message
# Helper to parse bullet-point comments into ParsedComment objects.
def _parse_bullet_comments(text_block: str) -> List[ParsedComment]:
    """Split *text_block* into bullet items and parse each into a ParsedComment.

    A bullet starts at a line beginning with optional spaces and '-' or '*'
    and runs until the next such line or the end of the text. Text before the
    first bullet is ignored.

    A '-'/'*' line that falls inside an open ``` fence (for example a list item
    or diff line inside a suggestion) does not start a new comment: pieces are
    merged until the fences in the accumulated text are balanced.

    Args:
        text_block: Markdown text containing bullet-point comments.

    Returns:
        One ParsedComment per non-empty bullet, with the suggestion code (if
        any) separated from the message by _extract_suggestion.
    """
    bullet_pattern = re.compile(
        r"(^ *[-*]\s*[\s\S]*?)(?=\n *[-*]\s*|\Z)", re.MULTILINE | re.DOTALL
    )

    chunks: List[str] = []
    open_start: Optional[int] = None  # start index of a chunk whose fence is still open
    for match in bullet_pattern.finditer(text_block):
        start = match.start() if open_start is None else open_start
        chunk = text_block[start:match.end()]
        # An odd number of ``` markers means a code fence is still open, so the
        # next "bullet" is really a line of code inside that fence.
        if chunk.count("```") % 2:
            open_start = start
            continue
        open_start = None
        chunks.append(chunk)
    if open_start is not None:
        # Fence never closed: keep the remaining text rather than dropping it.
        chunks.append(text_block[open_start:])

    comments: List[ParsedComment] = []
    for chunk in chunks:
        full_comment_text = chunk.strip()
        if full_comment_text:
            suggestion_code, cleaned_message = _extract_suggestion(full_comment_text)
            comments.append(ParsedComment(message=cleaned_message, suggestion=suggestion_code))
    return comments
def parse_llm_review_markdown(markdown_review: str) -> LLMReviewOutput:
    """
    Parses the LLM-generated Markdown review into a structured LLMReviewOutput model.

    Section headers of the form "## Title:" or "1. **Title:**" (alone on a line)
    are located first; each section's content is the text up to the next header.
    Sections are then routed by title:

    - "Overall Impression"  -> overall_impression
    - "Specific Observations and Suggestions" -> per-file bullet comments
      (file_reviews). If no file review can be extracted from the section, its
      raw content is kept as a general section instead of being dropped.
    - "Summary" -> summary, plus approval_status when an
      "**Action|Recommended Action|Status:**" line is present.
    - anything else with content -> general_sections

    Args:
        markdown_review (str): The full Markdown string generated by the LLM.

    Returns:
        LLMReviewOutput: Structured review data. When no header is recognised,
        the whole text is returned as overall_impression with a summary saying
        the format was not recognised.
    """
    temp_structured_data: Dict[str, Any] = {
        'overall_impression': None,
        'file_reviews': [],
        'general_sections': [],
        'summary': None,
        'approval_status': 'Comment'
    }
    # Canonical spellings for the statuses the approval regex can match
    # (case-insensitively); str.capitalize() would yield "Request changes".
    canonical_status = {
        'approve': 'Approve',
        'request changes': 'Request Changes',
        'comment': 'Comment',
    }

    # --- 1. Find all major section headers and their positions ---
    # Recognizes "## Section Title:" and "1. **Section Title:**"
    section_header_pattern = re.compile(
        r"^(?:##\s+([\w\s/()]+):|(\d+)\.\s+\*\*([\w\s/()]+):\*\*)\s*$",
        re.MULTILINE
    )
    # Handles "- **`file.py`:**", "### file.py", and "**File: file.py**"
    file_header_line_pattern = re.compile(
        r"^\s*(?:-\s+\*\*(?:`?)([\w\/\.\-_]+\.\w+)(?:`?):\*\*|###\s*`?([\w\/\.\-_]+\.\w+)`?|\*\*File:\s*`?([\w\/\.\-_]+\.\w+)`?\*\*)\s*$",
        re.MULTILINE
    )

    sections = []
    for match in section_header_pattern.finditer(markdown_review):
        # Title is in group 1 (for '## Title:') or group 3 (for '1. **Title:**')
        title = match.group(1) or match.group(3)
        if title:
            sections.append({
                'title': title.strip(),
                'content_start': match.end(),
                'header_start': match.start()
            })

    if not sections:
        if markdown_review.strip():
            temp_structured_data['summary'] = "Could not parse the review markdown. The format was not recognized."
            temp_structured_data['overall_impression'] = markdown_review
        return LLMReviewOutput(**temp_structured_data)

    # --- 2. Process each identified section by extracting content between headers ---
    for index, section in enumerate(sections):
        title = section['title']
        # Content runs to the next header, or to the end of the text for the last section.
        if index + 1 < len(sections):
            content_end = sections[index + 1]['header_start']
        else:
            content_end = len(markdown_review)
        content = markdown_review[section['content_start']:content_end].strip()

        if "Overall Impression" in title:
            temp_structured_data['overall_impression'] = content

        elif "Specific Observations and Suggestions" in title:
            file_matches = list(file_header_line_pattern.finditer(content))
            extracted_any = False
            for j, match in enumerate(file_matches):
                file_name = next((g for g in match.groups() if g is not None), None)
                if not file_name:
                    continue
                file_name = file_name.strip().replace('`', '')

                # A file's block runs from its header to the next file header.
                start_idx = match.end()
                end_idx = file_matches[j + 1].start() if j + 1 < len(file_matches) else len(content)
                file_content_block = content[start_idx:end_idx].strip()

                parsed_comments = _parse_bullet_comments(file_content_block)
                if parsed_comments:
                    extracted_any = True
                    temp_structured_data['file_reviews'].append(FileReviewComments(
                        file_path=file_name,
                        sections={"General_File_Comments": parsed_comments}
                    ))

            # Nothing could be attributed to a file (unrecognised file headers or
            # no bullet items): keep the feedback as a general section so it is
            # still part of the review instead of silently disappearing.
            if not extracted_any and content:
                temp_structured_data['general_sections'].append(ParsedReviewSection(
                    title=title,
                    content=content
                ))

        elif "Summary" in title:
            temp_structured_data['summary'] = content
            # Extract approval status from the summary
            approval_match = re.search(r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment)", content, re.IGNORECASE | re.MULTILINE)
            if approval_match:
                matched_status = approval_match.group(1).strip()
                temp_structured_data['approval_status'] = canonical_status.get(
                    matched_status.lower(), matched_status.capitalize()
                )

        else:  # Any other section is treated as a general section
            if content:
                temp_structured_data['general_sections'].append(ParsedReviewSection(
                    title=title,
                    content=content
                ))

    # --- 3. Final fallbacks and cleanup ---
    if not temp_structured_data['summary']:
        temp_structured_data['summary'] = "Automated review completed."

    return LLMReviewOutput(**temp_structured_data)
# --- Graph Nodes ---
def code_retriever_node(state: PRReviewState):
    """Graph node: fetch the PR's diff and changed-file contents into the state.

    Reads repo_name and pr_id from the state and calls fetch_pr_code_changes.

    Returns:
        A copy of the state with code_diff and file_contents set. On success
        review_status becomes "code_fetched". If the fetch reports an error,
        review_status becomes "error" and last_error carries the message, so a
        failed fetch is no longer recorded as a successful one.
    """
    repo_name = state.repo_name
    pull_req_id = state.pr_id
    print("code_retriever_node started")
    print(f"repo_name :{repo_name}-------- pull_req_id:{pull_req_id}")

    # The head commit SHA is returned by the fetch but is not stored by this node.
    diff, contents, _head_commit_sha, error = fetch_pr_code_changes(repo_name, pull_req_id)

    if error:
        print(f"code_retriever_node failed: {error}")
        return state.model_copy(update={
            "review_status": "error",
            "code_diff": diff,
            "file_contents": contents,
            "last_error": f"Failed to fetch PR code changes: {error}",
        })

    return state.model_copy(update={
        "review_status": "code_fetched",
        "code_diff": diff,
        "file_contents": contents,
    })
def code_reviewer_node(state: PRReviewState):
    """Graph node: generate the Markdown review for the diff held in the state.

    Passes state.code_diff and state.file_contents to
    generate_code_review_markdown, prints the raw result for debugging, and
    returns a copy of the state with llm_markdown_review set and
    review_status "code_reviewed".
    """
    diff_text = state.code_diff
    contents_by_file = state.file_contents
    print("code_reviewer_node started")

    review_markdown = generate_code_review_markdown(diff_text, contents_by_file)

    # Debug dump of the unparsed model output, framed by 50-character rules.
    rule = "=" * 50
    print("\n" + rule)
    print("--- RAW LLM MARKDOWN OUTPUT ---")
    print(review_markdown)
    print(rule + "\n")

    state_changes = {
        "review_status": "code_reviewed",
        "llm_markdown_review": review_markdown,
    }
    return state.model_copy(update=state_changes)
def feedback_formatter_node(state: PRReviewState):
    """Graph node: parse the raw Markdown review into structured review data.

    Runs parse_llm_review_markdown on state.llm_markdown_review and returns a
    copy of the state with parsed_llm_review_data set and review_status
    "review_parsed".
    """
    print("feedback_formatter_node started")
    structured_review = parse_llm_review_markdown(state.llm_markdown_review)
    state_changes = {
        "review_status": "review_parsed",
        "parsed_llm_review_data": structured_review,
    }
    return state.model_copy(update=state_changes)
def post_code_review_node(state: PRReviewState) -> PRReviewState:
    """
    Graph node: post the parsed LLM review to the pull request on GitHub.

    Calls post_review_comments_on_github with the module-level token and no
    explicit review event, so that helper's default event applies.

    Returns:
        On success, a copy of the state with review_status
        "initial_review_posted", the review id/URL, the posted main body, and
        last_error cleared. On any exception from posting, a copy with
        review_status "error" and last_error describing the failure.

    Raises:
        ValueError: If state.parsed_llm_review_data is missing.
    """
    print("--- NODE: post_code_review_node ---")

    if not state.parsed_llm_review_data:
        raise ValueError("Cannot post pending review: parsed_llm_review_data is missing.")

    target_repo = state.repo_name
    target_pr = state.pr_id
    review_data = state.parsed_llm_review_data

    try:
        result = post_review_comments_on_github(
            repo_name=target_repo,
            pr_id=target_pr,
            parsed_review_data=review_data,
            github_token=git_hub_token,
        )
        print(f"result from post_pending_review_node():result = {result}")
        success_changes = {
            "review_status": "initial_review_posted",
            "original_review_id": result['review_id'],
            "original_review_url": result['review_url'],
            "main_comment_body": result['main_comment_body'],
            "last_error": None,  # Clear previous errors
        }
        return state.model_copy(update=success_changes)
    except Exception as e:
        print(f"Error posting pending review: {e}")
        failure_changes = {
            "review_status": "error",
            "last_error": f"Failed to post pending review: {e}",
        }
        return state.model_copy(update=failure_changes)
def update_review_body_based_on_human_input_node(state: PRReviewState) -> PRReviewState:
    """
    Graph node: prepend the human reviewer's decision to the already-posted review body.

    Behaviour:
        - If require_human_approval is falsy, or human_approval_status is
          neither True nor False, the state is returned unchanged.
        - Otherwise a new body is built from the decision (approved / Reject),
          the optional human_feedback_message and the original
          main_comment_body, and sent via update_submitted_review_body.

    Returns:
        On success, a copy of the state with review_status "review_submitted",
        final_review_id/final_review_url, the updated main_comment_body and
        last_error cleared. If the helper reports a non-success result or
        raises, a copy with review_status "error" and last_error set.

    Raises:
        ValueError: If state.main_comment_body is missing.
    """
    print("--- NODE: update_review_body_based_on_human_input_node ---")

    if not state.main_comment_body:
        raise ValueError("Cannot update submitted review body: main_comment_body is missing.")

    if not state.require_human_approval:
        print("require_human_approval is False so exiting this function")
        logging.info("require_human_approval is False so exiting this function")
        return state

    repo_name = state.repo_name
    pr_id = state.pr_id
    original_review_id = state.original_review_id
    main_comment_body = state.main_comment_body
    feedback = state.human_feedback_message

    # The four body templates are kept verbatim, one per (decision, feedback) case.
    if state.human_approval_status is True:
        if feedback is not None:
            updated_review_body = f"""**Human Decision:** approved\n---\n\nHuman Feedback: {feedback}\n\n Please go ahead and incorporate these Automated Bots review comments\n\n{main_comment_body}"""
        else:
            updated_review_body = f"""**Human Decision:** approved\n---\n\n Please go ahead and incorporate these Automated Bots review comments\n\n{main_comment_body}"""
    elif state.human_approval_status is False:
        if feedback is not None:
            updated_review_body = f"""**Human Decision:** Reject\n---\n\nHuman Feedback: {feedback}\n\n Please IGNORE these Automated Bots review comments and wait for new review comments from your team\n\n{main_comment_body}"""
        else:
            updated_review_body = f"""**Human Decision:** Reject\n---\n\nPlease IGNORE these Automated Bots review comments and wait for new review comments from your team\n\n{main_comment_body}"""
    else:
        # No human decision recorded yet: nothing to update.
        return state

    try:
        result = update_submitted_review_body(
            repo_name=repo_name,
            pr_id=pr_id,
            review_id=original_review_id,
            new_body=updated_review_body,
            github_token=git_hub_token
        )
        print(f"result from update_submitted_review_body_node():result = {result}")

        # The helper reports a pending review as a {'status': 'error', ...} dict
        # without review fields; surface its message instead of a KeyError.
        if result.get('status') != 'success':
            reason = result.get('message', 'unknown error')
            print(f"Error updating review body: {reason}")
            return state.model_copy(update={
                "review_status": "error",
                "last_error": f"Failed to update review body: {reason}"
            })

        return state.model_copy(update={
            "review_status": "review_submitted",
            "final_review_id": result['review_id'],
            "final_review_url": result['review_url'],
            "main_comment_body": result['updated_body'],
            "last_error": None  # Clear previous errors
        })
    except Exception as e:
        print(f"Error updating review body: {e}")
        return state.model_copy(update={
            "review_status": "error",
            "last_error": f"Failed to update review body: {e}"
        })