langgraph_PR_Review_Bot

Sleeping

App Files Files Community

langgraph_PR_Review_Bot / src /langgraph_logic /nodes.py

nikhmr1235

update nodes with changes lost during refactoring

d47a09f verified 4 months ago

raw

history blame contribute delete

26.4 kB

	import os
	import re
	import sys
	import logging
	from typing import Dict, Any, List, Optional, Tuple
	from github import Github, PullRequest
	from github.GithubException import GithubException, UnknownObjectException
	import requests
	from dotenv import load_dotenv
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_google_genai import ChatGoogleGenerativeAI

	from .state import PRReviewState, LLMReviewOutput, ParsedReviewSection, ParsedComment, FileReviewComments

	# --- Environment Variable Loading ---
	# Load the .env file BEFORE reading any variables: os.getenv() only sees
	# values from .env once load_dotenv() has put them into the environment.
	load_dotenv()

	google_api_key = os.getenv("GOOGLE_API_KEY")
	git_hub_token = os.getenv("GITHUB_token_ID")

	# Missing credentials are only reported here; the functions that need them
	# perform their own checks when they are called.
	if not google_api_key:
	    print("Google API key not found in environment variables.")
	if not git_hub_token:
	    print("GITHUB_token_ID not found in environment variables.")

	# --- LLM Initialization ---
	# temperature=0.0 is requested so repeated reviews of the same diff vary as little as possible.
	llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", temperature=0.0, api_key=google_api_key)

	# --- Logging Configuration ---
	logging.basicConfig(
	    format='%(asctime)s | %(levelname)s : %(message)s',
	    level=logging.INFO,
	    stream=sys.stdout
	)

	# --- GitHub API Functions ---

	def fetch_pr_code_changes(repo_name: str, pr_id: int) -> Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
	    """
	    Fetches the raw patch content, the full contents of changed files,
	    and the head commit SHA for a given PR.

	    Args:
	        repo_name (str): The full name of the repository (e.g., "octocat/Spoon-Knife").
	        pr_id (int): The ID of the Pull Request.

	    Returns:
	        Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
	            - raw_diff_content: Text downloaded from the PR's patch URL, or None on failure.
	            - file_contents: Maps each changed filename to its content at the PR head.
	              Deleted files, directories and files whose content could not be fetched
	              are mapped to a bracketed placeholder string instead. None on failure.
	            - head_commit_sha: SHA of the PR's head commit, or None on failure.
	            - error_message: None on success, otherwise a description of what went wrong.
	    """
	    github_token = git_hub_token

	    if not github_token:
	        print("Error: GITHUB_token_ID environment variable not set.")
	        return None, None, None, "GitHub token not found in environment variables."

	    try:
	        g = Github(github_token)
	        repo = g.get_repo(repo_name)
	        pull_request = repo.get_pull(pr_id)

	        head_commit_sha = pull_request.head.sha
	        print(f"Fetched PR {pr_id} head commit SHA: {head_commit_sha}")

	        # 1. Fetch the raw patch. A timeout keeps a stalled connection from
	        # hanging the caller, and raise_for_status() prevents an HTTP error page
	        # from being passed on as if it were the diff.
	        patch_url = pull_request.patch_url
	        headers = {"Authorization": f"token {github_token}"}
	        response = requests.get(patch_url, headers=headers, timeout=30)
	        response.raise_for_status()
	        raw_diff_content = response.text

	        # 2. Fetch full content of changed files as they exist at the PR head.
	        file_contents: Dict[str, str] = {}
	        for file in pull_request.get_files():
	            # Deleted files have no content at the head commit.
	            if file.status == 'deleted':
	                file_contents[file.filename] = "[FILE DELETED]"
	                continue

	            try:
	                # Pin to the head SHA rather than the branch name so the content
	                # matches the commit recorded above even if the branch moves.
	                file_content_obj = repo.get_contents(file.filename, ref=head_commit_sha)

	                if isinstance(file_content_obj, list):
	                    print(f"Warning: '{file.filename}' is a directory or multiple files, skipping content retrieval for now.")
	                    file_contents[file.filename] = "[DIRECTORY OR MULTIPLE FILES]"
	                    continue

	                file_contents[file.filename] = file_content_obj.decoded_content.decode('utf-8')

	            except GithubException as e:
	                message = _github_error_message(e)
	                print(f"Warning: GitHub API error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e.status} - {message}")
	                file_contents[file.filename] = f"[ERROR: Could not fetch content. Status: {e.status}, Message: {message}]"
	            except Exception as e:
	                # Best effort: one unreadable file (e.g. binary content that is not
	                # valid UTF-8) must not abort the whole retrieval.
	                print(f"Unexpected error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e}")
	                file_contents[file.filename] = f"[ERROR: Unexpected error fetching content: {e}]"

	        return raw_diff_content, file_contents, head_commit_sha, None  # No error message if successful

	    except UnknownObjectException as e:
	        error_msg = f"GitHub object not found (repo or PR): {_github_error_message(e)}"
	        print(f"Error in fetch_pr_code_changes: {error_msg}")
	        return None, None, None, error_msg
	    except GithubException as e:
	        error_msg = f"GitHub API error for PR {pr_id} from {repo_name}: {e.status} - {_github_error_message(e)}"
	        print(f"Error in fetch_pr_code_changes: {error_msg}")
	        return None, None, None, error_msg
	    except Exception as e:
	        error_msg = f"An unexpected error occurred while fetching PR {pr_id} from {repo_name}: {e}"
	        print(f"Error in fetch_pr_code_changes: {error_msg}")
	        return None, None, None, error_msg


	def _github_error_message(exc: Exception) -> str:
	    """Return the 'message' field of a GitHub exception payload, or 'No message'."""
	    # The payload may be absent or not a dict; calling .get() on it directly
	    # would raise inside the except handler and mask the original error.
	    payload = getattr(exc, "data", None)
	    if isinstance(payload, dict):
	        return payload.get('message', 'No message')
	    return 'No message'

	def post_review_comments_on_github(
	    repo_name: str, pr_id: int, parsed_review_data: LLMReviewOutput,
	    github_token: Optional[str] = None, final_event: Optional[str] = "COMMENT"
	) -> Dict[str, Any]:
	    """
	    Posts a structured code review to a GitHub Pull Request.

	    The review consists of one main body (overall impression, general sections,
	    summary and the LLM's recommended action) plus one consolidated comment per
	    reviewed file.

	    Args:
	        repo_name (str): Full repository name, e.g. "owner/repo".
	        pr_id (int): Pull Request number.
	        parsed_review_data (LLMReviewOutput): Parsed review to publish.
	        github_token (Optional[str]): Token to authenticate with. When None, the
	            GITHUB_TOKEN and then GITHUB_token_ID environment variables are tried.
	        final_event (Optional[str]): Review event passed to GitHub.

	    Returns:
	        Dict[str, Any]: 'status', 'review_url', 'review_id' and 'main_comment_body'.

	    Raises:
	        ValueError: If no GitHub token is available.
	        RuntimeError: If anything fails while building or posting the review.
	    """
	    if github_token is None:
	        # The rest of this module reads GITHUB_token_ID, so accept it as well;
	        # GITHUB_TOKEN keeps precedence when both are set.
	        github_token = os.getenv("GITHUB_TOKEN") or os.getenv("GITHUB_token_ID")
	    if not github_token:
	        raise ValueError("GitHub token not provided.")

	    try:
	        g = Github(github_token)
	        repo = g.get_repo(repo_name)
	        pr = repo.get_pull(pr_id)

	        # --- Main review body ---
	        body_parts = ["### 🤖 Automated Code Review\n\n"]
	        if parsed_review_data.overall_impression:
	            body_parts.append(f"Overall Impression:\n{parsed_review_data.overall_impression}\n\n---\n\n")
	        for section in parsed_review_data.general_sections:
	            if section.content.strip():
	                body_parts.append(f"### {section.title}\n{section.content}\n\n---\n\n")
	        if parsed_review_data.summary:
	            body_parts.append(f"### Summary\n{parsed_review_data.summary}\n\n")
	        body_parts.append(f"LLM Recommended Action: {parsed_review_data.approval_status.upper()}\n")
	        main_review_body = "".join(body_parts)

	        # --- Per-file comments ---
	        github_comments = []
	        head_commit_sha = pr.head.sha
	        pr_commit_obj = repo.get_commit(head_commit_sha)

	        for file_review in parsed_review_data.file_reviews:
	            file_parts = []
	            for func_name, comments in file_review.sections.items():
	                if not comments:
	                    continue
	                if func_name == "General_File_Comments":
	                    file_parts.append("#### 📄 General File Comments\n\n")
	                else:
	                    file_parts.append(f"#### ⚙️ Function: `{func_name}`\n\n")
	                for comment in comments:
	                    file_parts.append(f"{comment.message}\n")
	                    if comment.suggestion:
	                        file_parts.append(f"\n```suggestion\n{comment.suggestion}\n```\n\n")
	                file_parts.append("\n---\n\n")

	            # A file whose sections are all empty produces no comment.
	            if not file_parts:
	                continue

	            file_body = f"### Review for `{file_review.file_path}`\n\n" + "".join(file_parts)
	            github_comments.append({
	                # Every file's feedback is attached at position 1 of that file's diff.
	                "path": file_review.file_path, "position": 1,
	                "body": file_body.strip(),
	            })

	        review = pr.create_review(commit=pr_commit_obj, body=main_review_body, event=final_event, comments=github_comments)
	        return {'status': 'success', 'review_url': review.html_url, 'review_id': review.id, 'main_comment_body': main_review_body}
	    except Exception as e:
	        # Callers catch a single exception type; chain the cause so the
	        # original traceback is not lost.
	        logging.error(f"Error posting review: {e}")
	        raise RuntimeError(f"Failed to post review: {e}") from e

	def update_submitted_review_body(
	    repo_name: str, pr_id: int, review_id: int, new_body: str, github_token: Optional[str] = None
	) -> Dict[str, Any]:
	    """
	    Updates the main body of an already submitted GitHub PR review.

	    Args:
	        repo_name (str): Full repository name, e.g. "owner/repo".
	        pr_id (int): Pull Request number.
	        review_id (int): ID of the review whose body is replaced.
	        new_body (str): Replacement text for the review body.
	        github_token (Optional[str]): Token to authenticate with. When None, the
	            GITHUB_TOKEN and then GITHUB_token_ID environment variables are tried.

	    Returns:
	        Dict[str, Any]: On success 'status', 'review_url', 'review_id' and
	        'updated_body'; for a review still in the PENDING state, a dict with
	        'status' == 'error' and a 'message'.

	    Raises:
	        ValueError: If no GitHub token is available.
	        RuntimeError: If looking up or editing the review fails.
	    """
	    if github_token is None:
	        # The rest of this module reads GITHUB_token_ID, so accept it as well;
	        # GITHUB_TOKEN keeps precedence when both are set.
	        github_token = os.getenv("GITHUB_TOKEN") or os.getenv("GITHUB_token_ID")
	    if not github_token:
	        raise ValueError("GitHub token not provided.")

	    try:
	        g = Github(github_token)
	        repo = g.get_repo(repo_name)
	        pr = repo.get_pull(pr_id)
	        review = pr.get_review(review_id)

	        # Pending reviews are reported back as an error result, not edited.
	        if review.state == "PENDING":
	            return {'status': 'error', 'message': 'Cannot update body of a pending review.'}

	        review.edit(body=new_body)
	        return {'status': 'success', 'review_url': review.html_url, 'review_id': review.id, 'updated_body': review.body}
	    except Exception as e:
	        # Chain the cause so the original traceback is preserved.
	        logging.error(f"Error updating review body: {e}")
	        raise RuntimeError(f"Failed to update review body: {e}") from e

	# --- LLM and Parsing Functions ---

	def generate_code_review_markdown(code_diff: str, file_contents: Dict[str, str]) -> str:
	    """
	    Generates a detailed, human-readable code review in Markdown format from the LLM.

	    The prompt asks for a fixed set of top-level sections so that the output can
	    afterwards be parsed into GitHub PR comments grouped by file.

	    Args:
	        code_diff (str): The string representation of the code diff.
	        file_contents (Dict[str, str]): Maps file paths to their full content.
	            May be empty or None, in which case a placeholder sentence is sent.

	    Returns:
	        str: The Markdown review. If the LLM call raises, a string starting with
	        "Error: Could not generate code review." is returned instead of raising.
	    """
	    # Prepare full contents context: one labelled, fenced block per file.
	    if file_contents:
	        full_contents_str = "".join(
	            f"--- Full Content of {filename} ---\n```python\n{content}\n```\n\n"
	            for filename, content in file_contents.items()
	        )
	    else:
	        full_contents_str = "No full file contents provided for additional context."

	    # Construct the Prompt Template
	    prompt = ChatPromptTemplate.from_messages(
	        [
	            ("system",
	             "You are an expert Senior Software Engineer and a meticulous code reviewer.\n"
	             "Your task is to review the provided code changes in a Pull Request.\n"
	             "Analyze the `code_diff` for potential bugs, performance issues, security vulnerabilities, code style violations, maintainability concerns, and missing tests or documentation.\n"
	             "Refer to the `full_file_contents` for additional context if the diff alone is insufficient to understand the changes or their implications.\n"
	             "Provide a comprehensive, actionable, and constructive review.\n"
	             "Format your review clearly using Markdown. Structure it with the following top-level sections:\n"
	             "1. Overall Impression: A brief summary of the PR's purpose and overall quality.\n"
	             "2. Specific Observations and Suggestions: Detailed feedback, grouped by file.\n"
	             " - Within each file's section, group related comments, ideally by function or logical block.\n"
	             " - For each observation/suggestion, include relevant line numbers from the new file for context (e.g., 'Line X-Y:').\n"
	             "3. Potential Issues and Edge Cases: Discuss any missed scenarios or potential problems.\n"
	             "4. Security Implications: Highlight any security concerns.\n"
	             "5. Adherence to Best Practices (PEP 8): Comment on style and best practice compliance.\n"
	             "6. Performance Considerations: Discuss performance aspects.\n"
	             "7. Unit Testing Suggestions: Recommend additional tests.\n"
	             "8. Docstring/Comment Improvements: Suggest documentation enhancements.\n"
	             "9. Clarity and Conciseness: Feedback on code readability.\n"
	             "10. Summary: A concise conclusion and recommended action (e.g., 'Approve', 'Request Changes', 'Comment').\n\n"
	             "For code suggestions, use GitHub's Markdown code block with 'suggestion' annotation, like this:\n"
	             "```suggestion\n"
	             "your_suggested_code_here\n"
	             "```\n"
	             "Ensure file paths are correctly formatted (e.g., `src/utils/data_processor.py`)."
	             ),
	            ("human",
	             "Here are the code changes (diff):\n"
	             "```diff\n"
	             "{code_diff}\n"
	             "```\n\n"
	             "Here are the full contents of the changed files (for additional context, use only if necessary to understand the diff):\n"
	             "{full_contents_context}\n\n"
	             "Please provide your structured code review in Markdown."
	             ),
	        ]
	    )

	    # Create the Chain: the prompt's output is piped into the chat model.
	    review_chain = prompt | llm

	    # Invoke the Chain
	    try:
	        response = review_chain.invoke({
	            "code_diff": code_diff,
	            "full_contents_context": full_contents_str
	        })
	        # Chat models return a message object; the text lives in `.content`.
	        return response.content
	    except Exception as e:
	        # Deliberate best effort: the failure is reported as review text so the
	        # caller always receives a string.
	        print(f"Error generating code review: {e}")
	        return (
	            f"Error: Could not generate code review. {e}\n\n"
	            f"Please check the LLM API call or token limits."
	        )

	# Helper to extract suggestion block and clean message (No change needed)
	def _extract_suggestion(text: str) -> Tuple[Optional[str], str]:
	"""Helper to extract suggestion block and clean message."""
	suggestion_match = re.search(r"```suggestion\n([\s\S]*?)\n```", text, re.MULTILINE)
	suggestion_code = suggestion_match.group(1).strip() if suggestion_match else None

	# Remove suggestion from the main message
	cleaned_message = re.sub(r"```suggestion[\s\S]*?```", "", text).strip()
	return suggestion_code, cleaned_message

	# Helper to parse bullet-point comments (No change needed, already uses ParsedComment)
	def _parse_bullet_comments(text_block: str) -> List[ParsedComment]:
	    """
	    Parse bullet-point comments from a given text block.

	    A comment starts at a line whose first non-space character is '-' or '*'
	    and runs until the next such line or the end of the text. Text before the
	    first bullet is ignored. The bullet marker stays part of the message.

	    Args:
	        text_block (str): Markdown text containing zero or more bullet items.

	    Returns:
	        List[ParsedComment]: One entry per non-empty bullet item, with any
	        ```suggestion block moved from the message into `suggestion`.
	    """
	    # NOTE(review): any line starting with '-' or '*' opens a new item, including
	    # lines inside a fenced code block or lines starting with '**bold**'.
	    bullet_pattern = re.compile(
	        r"(^ *[-*]\s*[\s\S]*?)(?=\n *[-*]\s*|\Z)",
	        re.MULTILINE | re.DOTALL
	    )

	    comments = []
	    for cm in bullet_pattern.finditer(text_block):
	        full_comment_text = cm.group(1).strip()
	        if not full_comment_text:
	            continue
	        suggestion_code, cleaned_message = _extract_suggestion(full_comment_text)
	        comments.append(ParsedComment(message=cleaned_message, suggestion=suggestion_code))
	    return comments

	def parse_llm_review_markdown(markdown_review: str) -> LLMReviewOutput:
	    """
	    Parses the LLM-generated Markdown review into a structured LLMReviewOutput model.

	    Section headers are located first; each section's content is the text
	    between its header and the next one. Sections are then routed by title:
	    "Overall Impression", "Specific Observations and Suggestions" (split into
	    per-file bullet comments), "Summary" (also scanned for an approval status),
	    and everything else as a general section.

	    Args:
	        markdown_review (str): The full Markdown string generated by the LLM.

	    Returns:
	        LLMReviewOutput: Structured review data. When no section header is
	        recognised, the whole text is returned as the overall impression with a
	        summary stating that the format was not recognised.
	    """
	    # Treat a missing review like an empty one instead of failing in the regex.
	    markdown_review = markdown_review or ""

	    temp_structured_data: Dict[str, Any] = {
	        'overall_impression': None,
	        'file_reviews': [],
	        'general_sections': [],
	        'summary': None,
	        'approval_status': 'Comment'
	    }

	    # --- 1. Find all major section headers and their positions ---
	    # Recognised on a line of their own: "## Section Title:" and
	    # "1. Section Title:" (the latter optionally wrapped in ** bold markers).
	    # Group 1 holds the title of the first form, group 3 of the second.
	    section_header_pattern = re.compile(
	        r"^(?:##[ \t]+([\w \t/()]+):|(\d+)\.[ \t]+(?:\*\*)?([\w \t/()]+)(?::\*\*|\*\*:|:))[ \t]*$",
	        re.MULTILINE
	    )

	    sections = []
	    for match in section_header_pattern.finditer(markdown_review):
	        title = match.group(1) or match.group(3)
	        if title:
	            sections.append({
	                'title': title.strip(),
	                'content_start': match.end(),
	                'header_start': match.start()
	            })

	    if not sections:
	        if markdown_review.strip():
	            temp_structured_data['summary'] = "Could not parse the review markdown. The format was not recognized."
	            temp_structured_data['overall_impression'] = markdown_review
	        return LLMReviewOutput(**temp_structured_data)

	    # Handles "- `file.py`:", "### file.py" and "File: file.py" lines, each
	    # optionally bold; exactly one of the three groups captures the path.
	    file_header_line_pattern = re.compile(
	        r"^\s*(?:-\s+(?:\*\*)?`?([\w\/\.\-_]+\.\w+)`?:(?:\*\*)?"
	        r"|###\s*`?([\w\/\.\-_]+\.\w+)`?"
	        r"|(?:\*\*)?File:\s*`?([\w\/\.\-_]+\.\w+)`?(?:\*\*)?)\s*$",
	        re.MULTILINE
	    )
	    approval_pattern = re.compile(
	        r"^\s*(?:\*\*)?(?:Action|Recommended Action|Status):(?:\*\*)?\s*(Approve|Request Changes|Comment)",
	        re.IGNORECASE | re.MULTILINE
	    )

	    # --- 2. Process each identified section by extracting content between headers ---
	    for i, current_section in enumerate(sections):
	        title = current_section['title']

	        # Content ends where the next header begins, or at the end of the text.
	        if i + 1 < len(sections):
	            content_end = sections[i + 1]['header_start']
	        else:
	            content_end = len(markdown_review)

	        content = markdown_review[current_section['content_start']:content_end].strip()

	        if "Overall Impression" in title:
	            temp_structured_data['overall_impression'] = content

	        elif "Specific Observations and Suggestions" in title:
	            file_matches = list(file_header_line_pattern.finditer(content))

	            for j, match in enumerate(file_matches):
	                file_name = next((g for g in match.groups() if g is not None), None)
	                if not file_name:
	                    continue

	                file_name = file_name.strip().replace('`', '')

	                # A file's block runs from its header to the next file header.
	                start_idx = match.end()
	                end_idx = file_matches[j + 1].start() if j + 1 < len(file_matches) else len(content)
	                file_content_block = content[start_idx:end_idx].strip()

	                parsed_comments = _parse_bullet_comments(file_content_block)

	                # Files without any parsed bullet comment are left out.
	                if parsed_comments:
	                    temp_structured_data['file_reviews'].append(FileReviewComments(
	                        file_path=file_name,
	                        sections={"General_File_Comments": parsed_comments}
	                    ))

	        elif "Summary" in title:
	            temp_structured_data['summary'] = content
	            # Extract approval status from the summary; the default stays
	            # 'Comment' when no "Action:/Recommended Action:/Status:" line is found.
	            approval_match = approval_pattern.search(content)
	            if approval_match:
	                # NOTE(review): capitalize() yields "Request changes" (lower-case
	                # 'c'); confirm that is the spelling LLMReviewOutput expects.
	                temp_structured_data['approval_status'] = approval_match.group(1).strip().capitalize()

	        else:  # Any other section is treated as a general section
	            if content:
	                temp_structured_data['general_sections'].append(ParsedReviewSection(
	                    title=title,
	                    content=content
	                ))

	    # --- 3. Final fallbacks and cleanup ---
	    if not temp_structured_data['summary']:
	        temp_structured_data['summary'] = "Automated review completed."

	    return LLMReviewOutput(**temp_structured_data)

	# --- Graph Nodes ---

	def code_retriever_node(state: PRReviewState):
	    """
	    Graph node: fetch the PR's diff and changed-file contents into the state.

	    Reads `repo_name` and `pr_id` from the state and calls
	    fetch_pr_code_changes(). On success the returned copy has
	    review_status "code_fetched" with `code_diff` and `file_contents` filled
	    in. If the fetch reports an error, the copy has review_status "error" and
	    the message stored in `last_error`.
	    """
	    repo_name = state.repo_name
	    pull_req_id = state.pr_id

	    print(f"code_retriever_node started")
	    print(f"repo_name :{repo_name}-------- pull_req_id:{pull_req_id}")

	    # The head commit SHA is returned by the fetch but not stored in the state here.
	    diff, contents, _head_commit_sha, error = fetch_pr_code_changes(repo_name, pull_req_id)

	    if error:
	        # Do not report "code_fetched" when nothing was fetched.
	        print(f"code_retriever_node failed: {error}")
	        return state.model_copy(update={
	            "review_status": "error",
	            "code_diff": diff,
	            "file_contents": contents,
	            "last_error": f"Failed to fetch PR code changes: {error}"
	        })

	    # Nodes return an updated copy of the state rather than mutating it.
	    updated_state = state.model_copy(update={
	        "review_status": "code_fetched",
	        "code_diff": diff,
	        "file_contents": contents
	    })
	    return updated_state

	def code_reviewer_node(state:PRReviewState):
	    """
	    Graph node: ask the LLM for a Markdown review of the fetched changes.

	    Passes the state's `code_diff` and `file_contents` to
	    generate_code_review_markdown(), echoes the raw result to stdout, and
	    returns a state copy with review_status "code_reviewed" and the text
	    stored in `llm_markdown_review`.
	    """
	    diff_text = state.code_diff
	    contents_by_file = state.file_contents

	    print(f"code_reviewer_node started")

	    raw_review = generate_code_review_markdown(diff_text, contents_by_file)

	    # Dump the unparsed LLM answer between two rulers to ease debugging.
	    ruler = "=" * 50
	    print("\n" + ruler)
	    print("--- RAW LLM MARKDOWN OUTPUT ---")
	    print(raw_review)
	    print(ruler + "\n")

	    # Nodes hand back a modified copy; the incoming state is left untouched.
	    changes = {
	        "review_status": "code_reviewed",
	        "llm_markdown_review": raw_review,
	    }
	    return state.model_copy(update=changes)

	def feedback_formatter_node(state: PRReviewState):
	    """
	    Graph node: turn the LLM's Markdown review into structured review data.

	    Feeds the state's `llm_markdown_review` to parse_llm_review_markdown() and
	    returns a state copy with review_status "review_parsed" and the result
	    stored in `parsed_llm_review_data`.
	    """
	    print(f"feedback_formatter_node started")

	    structured_review = parse_llm_review_markdown(state.llm_markdown_review)

	    # Nodes hand back a modified copy; the incoming state is left untouched.
	    changes = {
	        "review_status": "review_parsed",
	        "parsed_llm_review_data": structured_review,
	    }
	    return state.model_copy(update=changes)

	def post_code_review_node(state: PRReviewState) -> PRReviewState:
	    """
	    Graph node: publish the parsed LLM review on the GitHub Pull Request.

	    Calls post_review_comments_on_github() with the state's repository, PR id
	    and parsed review. On success the returned copy has review_status
	    "initial_review_posted" plus the review id, URL and main comment body, and
	    `last_error` cleared. Any exception while posting is caught and reported
	    through review_status "error" and `last_error`.

	    Raises:
	        ValueError: If the state has no `parsed_llm_review_data`.
	    """
	    print("--- NODE: post_code_review_node ---")
	    review_data = state.parsed_llm_review_data
	    if not review_data:
	        raise ValueError("Cannot post pending review: parsed_llm_review_data is missing.")

	    target_repo = state.repo_name
	    target_pr = state.pr_id

	    try:
	        result = post_review_comments_on_github(
	            repo_name=target_repo,
	            pr_id=target_pr,
	            parsed_review_data=review_data,
	            github_token=git_hub_token,
	        )

	        print(f"result from post_pending_review_node():result = {result}")
	        success_update = {
	            "review_status": "initial_review_posted",
	            "original_review_id": result['review_id'],
	            "original_review_url": result['review_url'],
	            "main_comment_body": result['main_comment_body'],
	            "last_error": None,  # a successful post supersedes earlier errors
	        }
	        return state.model_copy(update=success_update)
	    except Exception as e:
	        # Failures are recorded in the state instead of propagating.
	        print(f"Error posting pending review: {e}")
	        failure_update = {
	            "review_status": "error",
	            "last_error": f"Failed to post pending review: {e}",
	        }
	        return state.model_copy(update=failure_update)


	def update_review_body_based_on_human_input_node(state: PRReviewState) -> PRReviewState:
	    """
	    Graph node: prepend the human reviewer's decision to the posted review body.

	    Behaviour, in order:
	      - Raises ValueError when the state has no `main_comment_body`.
	      - Returns the state unchanged when `require_human_approval` is falsy, or
	        when `human_approval_status` is neither True nor False.
	      - Otherwise builds a new body ("approved" or "Reject" header, optional
	        human feedback, then the original bot review) and writes it to the
	        review identified by `original_review_id`.

	    Returns:
	        PRReviewState: On success a copy with review_status "review_submitted",
	        the final review id/URL and the updated body; on failure a copy with
	        review_status "error" and the reason in `last_error`.
	    """
	    print("--- NODE: update_review_body_based_on_human_input_node ---")
	    if not state.main_comment_body:
	        raise ValueError("Cannot update submitted review body: main_comment_body is missing.")

	    if not state.require_human_approval:
	        print("require_human_approval is False so exiting this function")
	        logging.info("require_human_approval is False so exiting this function")
	        return state

	    repo_name = state.repo_name
	    pr_id = state.pr_id
	    original_review_id = state.original_review_id
	    main_comment_body = state.main_comment_body
	    feedback = state.human_feedback_message

	    # `is True` / `is False` on purpose: an undecided status (e.g. None) must
	    # fall through to the final branch and leave the review untouched.
	    if state.human_approval_status is True:
	        if feedback is not None:
	            updated_review_body = f"Human Decision: approved\n---\n\nHuman Feedback: {feedback}\n\n Please go ahead and incorporate these Automated Bots review comments\n\n{main_comment_body}"
	        else:
	            updated_review_body = f"Human Decision: approved\n---\n\n Please go ahead and incorporate these Automated Bots review comments\n\n{main_comment_body}"
	    elif state.human_approval_status is False:
	        if feedback is not None:
	            updated_review_body = f"Human Decision: Reject\n---\n\nHuman Feedback: {feedback}\n\n Please IGNORE these Automated Bots review comments and wait for new review comments from your team\n\n{main_comment_body}"
	        else:
	            updated_review_body = f"Human Decision: Reject\n---\n\nPlease IGNORE these Automated Bots review comments and wait for new review comments from your team\n\n{main_comment_body}"
	    else:
	        return state

	    try:
	        # Replace the body of the review that was posted earlier.
	        result = update_submitted_review_body(
	            repo_name=repo_name,
	            pr_id=pr_id,
	            review_id=original_review_id,
	            new_body=updated_review_body,
	            github_token=git_hub_token
	        )

	        print(f"result from update_submitted_review_body_node():result = {result}")
	        return state.model_copy(update={
	            "review_status": "review_submitted",
	            "final_review_id": result['review_id'],
	            "final_review_url": result['review_url'],
	            "main_comment_body": result['updated_body'],
	            "last_error": None  # Clear previous errors
	        })
	    except Exception as e:
	        # Failures (including a result dict without the expected keys) are
	        # recorded in the state instead of propagating.
	        print(f"Error updating review body: {e}")
	        return state.model_copy(update={
	            "review_status": "error",
	            "last_error": f"Failed to update review body: {e}"
	        })