Spaces:

nihalaninihal
/

Gitu

Runtime error

App Files Files Community

Gitu / github_ai_agent.py

nihalaninihal

Update github_ai_agent.py

7da0953 verified 11 months ago

raw

history blame contribute delete

47.7 kB

	# github_ai_agent.py - Improved version with parallel processing and error handling

	import concurrent.futures
	import datetime
	import json
	import os
	import re
	import threading
	import time
	from collections import Counter, defaultdict, deque
	from functools import lru_cache
	from itertools import combinations
	from typing import Any, Dict, List, Optional, Tuple, Union

	# External libraries
	import faiss
	import google.generativeai as genai
	import networkx as nx
	import numpy as np
	from github import Github, GithubException
	from sentence_transformers import SentenceTransformer

	# Project modules
	from gemini_integration import GeminiClient
	from visualization_module import RepositoryVisualizer


	# Configuration
	class Config:
	    """Runtime settings for the GitHub AI Agent.

	    API credentials are read from the ``GEMINI_API_KEY`` and
	    ``GITHUB_ACCESS_TOKEN`` environment variables when the object is created;
	    every other setting starts from the defaults assigned in ``__init__``.

	    Args:
	        **overrides: Optional keyword arguments naming an existing setting
	            (for example ``max_files_to_load=25``) whose default should be
	            replaced. ``Config()`` with no arguments behaves as before.

	    Raises:
	        TypeError: If an override names a setting that does not exist, so a
	            misspelled option fails loudly instead of being ignored.
	    """

	    def __init__(self, **overrides):
	        # Credentials (None when the environment variable is not set)
	        self.gemini_api_key = os.environ.get("GEMINI_API_KEY")
	        self.github_token = os.environ.get("GITHUB_ACCESS_TOKEN")

	        # Model selection
	        self.embedding_model_name = "all-MiniLM-L6-v2"
	        self.gemini_model = "gemini-2.0-pro-exp-02-05"

	        # Limits
	        self.max_files_to_load = 100  # Safety limit for large repos
	        self.max_token_length = 64000  # Gemini Pro context limit
	        self.enable_advanced_metrics = True
	        self.visualization_node_limit = 150

	        # Caching
	        self.cache_enabled = True
	        self.cache_ttl = 3600  # Cache time to live in seconds

	        # File extensions to analyze. Kept as lists because callers
	        # concatenate them with ``+``.
	        self.code_extensions = [
	            '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.c', '.cpp', '.cs',
	            '.go', '.rb', '.php', '.swift', '.kt', '.rs', '.hs', '.scala', '.ml',
	        ]
	        self.doc_extensions = [
	            '.md', '.txt', '.rst', '.html', '.xml', '.json', '.yaml', '.yml',
	        ]

	        # Apply caller-supplied overrides last so they win over the defaults.
	        for option, value in overrides.items():
	            if not hasattr(self, option):
	                raise TypeError(f"Unknown configuration option: {option!r}")
	            setattr(self, option, value)


	# GitHub Repository Management
	class GitHubManager:
	    """Fetches repository metadata, files, contributors, commits and issues via PyGithub.

	    Every ``load_*`` method is best-effort: failures are printed and an empty
	    result is returned instead of raising. Loaded data is kept on the instance
	    (``repo_data``, ``file_contents``, ``contributors_data``, ``commit_history``,
	    ``issues_data``) and is reset whenever a new repository is loaded.
	    """

	    # Accepts "https://github.com/<owner>/<repo>" as well as the SSH form
	    # "git@github.com:<owner>/<repo>". The repo segment stops at "/", "?", "#"
	    # or whitespace so query strings and fragments never end up in the name.
	    _REPO_URL_PATTERN = re.compile(r'github\.com[/:]([^/\s]+)/([^/\s?#]+)')

	    def __init__(self, config: "Config"):
	        """Create the PyGithub client (authenticated when a token is configured)."""
	        self.config = config
	        self.g = Github(config.github_token) if config.github_token else Github()
	        self.current_repo = None
	        self.repo_data = {}
	        self.file_contents = {}
	        self.contributors_data = {}
	        self.commit_history = []
	        self.issues_data = []
	        self.file_cache = {}  # Cache for loaded files, keyed by repository path
	        # load_contributors() and load_commits() may run on different threads
	        # and both refresh the per-contributor statistics; this lock makes the
	        # refresh atomic so the last one to finish sees complete data.
	        self._stats_lock = threading.Lock()

	    @staticmethod
	    def _to_naive(value):
	        """Return ``value`` without tzinfo; ``None`` and naive values pass through."""
	        if getattr(value, 'tzinfo', None):
	            return value.replace(tzinfo=None)
	        return value

	    def load_repository(self, repo_url: str) -> bool:
	        """Load basic metadata for the repository behind ``repo_url``.

	        Args:
	            repo_url: A GitHub URL (HTTPS or SSH form).

	        Returns:
	            True on success; False when the URL cannot be parsed or the
	            GitHub request fails (the error is printed).
	        """
	        try:
	            repo_name = self._extract_repo_name(repo_url)
	            if not repo_name:
	                return False

	            repo = self.g.get_repo(repo_name)
	            repo_data = {
	                'name': repo.name,
	                'full_name': repo.full_name,
	                'description': repo.description,
	                'stars': repo.stargazers_count,
	                'forks': repo.forks_count,
	                'watchers': repo.watchers_count,
	                'open_issues': repo.open_issues_count,
	                'created_at': repo.created_at,
	                'updated_at': repo.updated_at,
	                'default_branch': repo.default_branch,
	                'language': repo.language,
	                'topics': repo.get_topics(),
	                'license': repo.license.name if repo.license else None,
	            }
	        except Exception as e:
	            print(f"Error loading repository: {e}")
	            return False

	        # Commit the new state only after every request succeeded, and drop
	        # everything that belonged to the previously loaded repository. The
	        # file cache is keyed by path only, so keeping it would serve another
	        # repository's content for same-named files such as "README.md".
	        self.current_repo = repo
	        self.repo_data = repo_data
	        self.file_contents = {}
	        self.contributors_data = {}
	        self.commit_history = []
	        self.issues_data = []
	        self.file_cache = {}
	        return True

	    def _extract_repo_name(self, repo_url: str) -> Optional[str]:
	        """Extract ``"owner/repo"`` from a GitHub URL.

	        Only a trailing ``.git`` suffix is removed, so names that merely
	        contain ".git" (for example ``user.github.io``) are preserved.

	        Returns:
	            The ``owner/repo`` string, or None when the URL does not match.
	        """
	        match = self._REPO_URL_PATTERN.search(repo_url)
	        if not match:
	            return None

	        owner, repo = match.groups()
	        if repo.endswith('.git'):
	            repo = repo[:-len('.git')]
	        if not repo:
	            return None
	        return f"{owner}/{repo}"

	    def load_files(self) -> Dict[str, Dict]:
	        """Walk the repository tree and load every file with a configured extension.

	        At most ``config.max_files_to_load`` files are selected; the tree is
	        walked breadth-first starting from the repository root.

	        Returns:
	            Mapping of file path to the dict built by ``_process_file``; empty
	            when no repository is loaded or the walk fails.
	        """
	        if not self.current_repo:
	            return {}

	        try:
	            self.file_contents = {}
	            # One set lookup per file instead of concatenating the two lists
	            # on every iteration.
	            wanted = set(self.config.code_extensions) | set(self.config.doc_extensions)
	            limit = self.config.max_files_to_load

	            # First pass - collect the files to download
	            pending = deque(self.current_repo.get_contents(""))
	            selected = []
	            while pending and len(selected) < limit:
	                entry = pending.popleft()

	                # Directories are not loaded themselves; queue their children
	                if entry.type == "dir":
	                    try:
	                        pending.extend(self.current_repo.get_contents(entry.path))
	                    except Exception as e:
	                        print(f"Error accessing directory {entry.path}: {e}")
	                    continue

	                if os.path.splitext(entry.path)[1] in wanted:
	                    selected.append(entry)

	            # Second pass - download in parallel. executor.map yields results
	            # in submission order, so the result dict has a stable order.
	            loaded = {}
	            with concurrent.futures.ThreadPoolExecutor() as executor:
	                outcomes = executor.map(self._process_file, selected)
	                for entry, outcome in zip(selected, outcomes):
	                    if outcome:
	                        loaded[entry.path] = outcome

	            self.file_contents = loaded
	            return self.file_contents
	        except Exception as e:
	            print(f"Error loading files: {e}")
	            return {}

	    def _process_file(self, file_content) -> Optional[Dict]:
	        """Decode a single repository file (runs on a worker thread).

	        Returns:
	            A dict with ``content``, ``type`` ('code' or 'document'), ``size``
	            and ``ext``; None for unsupported extensions, files that are not
	            valid UTF-8, or any other failure (which is printed).
	        """
	        try:
	            path = file_content.path
	            cached = self.file_cache.get(path)
	            if cached is not None:
	                return cached

	            ext = os.path.splitext(path)[1]
	            is_code = ext in self.config.code_extensions
	            if not is_code and ext not in self.config.doc_extensions:
	                return None

	            try:
	                text = file_content.decoded_content.decode('utf-8')
	            except UnicodeDecodeError:
	                # Skip binary files
	                return None

	            result = {
	                'content': text,
	                'type': 'code' if is_code else 'document',
	                'size': file_content.size,
	                'ext': ext
	            }
	            self.file_cache[path] = result
	            return result
	        except Exception as e:
	            print(f"Error processing file {file_content.path}: {e}")
	            return None

	    def load_contributors(self) -> List[Dict]:
	        """Load the repository's contributors.

	        Returns:
	            List of contributor dicts (see ``_process_contributor``); empty
	            when no repository is loaded or the request fails.
	        """
	        if not self.current_repo:
	            return []

	        try:
	            self.contributors_data = {}
	            contributor_list = list(self.current_repo.get_contributors())

	            loaded = {}
	            with concurrent.futures.ThreadPoolExecutor() as executor:
	                for record in executor.map(self._process_contributor, contributor_list):
	                    if record:
	                        loaded[record['login']] = record

	            # Publish the finished mapping in one assignment, then fill in the
	            # commit-derived statistics in case commits were loaded first.
	            self.contributors_data = loaded
	            self._update_contributor_file_stats()
	            return list(self.contributors_data.values())
	        except Exception as e:
	            print(f"Error loading contributors: {e}")
	            return []

	    def _process_contributor(self, contributor) -> Optional[Dict]:
	        """Convert one contributor object to a plain dict (None on failure)."""
	        try:
	            return {
	                'login': contributor.login,
	                'id': contributor.id,
	                'contributions': contributor.contributions,
	                'avatar_url': contributor.avatar_url,
	                'html_url': contributor.html_url,
	                'type': contributor.type,
	                # Filled in by _update_contributor_file_stats()
	                'files_modified': [],
	                'commit_messages': [],
	                'activity_dates': []
	            }
	        except Exception as e:
	            print(f"Error processing contributor {getattr(contributor, 'login', 'Unknown')}: {e}")
	            return None

	    def load_commits(self, limit: int = 100) -> List[Dict]:
	        """Load up to ``limit`` commits together with their changed files.

	        Returns:
	            Commit dicts in the order GitHub returned them; empty when no
	            repository is loaded or the request fails.
	        """
	        if not self.current_repo:
	            return []

	        try:
	            self.commit_history = []
	            commits_list = list(self.current_repo.get_commits()[:limit])

	            # executor.map keeps the API order, unlike as_completed()
	            with concurrent.futures.ThreadPoolExecutor() as executor:
	                processed = list(executor.map(self._process_commit, commits_list))

	            self.commit_history = [record for record in processed if record]

	            # Derive per-contributor statistics from the finished history
	            self._update_contributor_file_stats()
	            return self.commit_history
	        except Exception as e:
	            print(f"Error loading commits: {e}")
	            return []

	    def _process_commit(self, commit) -> Optional[Dict]:
	        """Convert one commit to a plain dict (runs on a worker thread).

	        This only reads from ``commit``; contributor statistics are derived
	        later from the complete history so worker threads never mutate shared
	        state.

	        Returns:
	            Dict with ``sha``, ``author`` (login or 'Unknown'), timezone-naive
	            ``date``, ``message`` and ``files``; None on failure.
	        """
	        try:
	            commit_data = {
	                'sha': commit.sha,
	                'author': commit.author.login if commit.author else 'Unknown',
	                'date': self._to_naive(commit.commit.author.date),
	                'message': commit.commit.message,
	                'files': []
	            }

	            # The file list is optional: keep the commit even if it fails
	            try:
	                for changed in commit.files:
	                    commit_data['files'].append({
	                        'filename': changed.filename,
	                        'additions': changed.additions,
	                        'deletions': changed.deletions,
	                        'changes': changed.changes,
	                        'status': changed.status
	                    })
	            except Exception as e:
	                print(f"Error processing files for commit {commit.sha}: {e}")

	            return commit_data
	        except Exception as e:
	            print(f"Error processing commit {commit.sha}: {e}")
	            return None

	    def _update_contributor_file_stats(self):
	        """Rebuild each contributor's commit-derived statistics.

	        For every known contributor this sets ``files_modified`` to the ten
	        most frequently touched files as ``{'filename', 'count'}`` dicts, and
	        ``commit_messages`` / ``activity_dates`` to one entry per commit.
	        Everything is recomputed from ``commit_history``, so calling this
	        repeatedly (or in either order relative to loading contributors) gives
	        the same result.
	        """
	        with self._stats_lock:
	            touched = defaultdict(Counter)
	            messages = defaultdict(list)
	            dates = defaultdict(list)

	            for commit in self.commit_history:
	                author = commit.get('author')
	                touched[author].update(
	                    changed['filename'] for changed in commit.get('files', [])
	                )
	                messages[author].append(commit.get('message'))
	                dates[author].append(commit.get('date'))

	            for login, contributor in self.contributors_data.items():
	                contributor['files_modified'] = [
	                    {'filename': filename, 'count': count}
	                    for filename, count in touched.get(login, Counter()).most_common(10)
	                ]
	                contributor['commit_messages'] = list(messages.get(login, []))
	                contributor['activity_dates'] = list(dates.get(login, []))

	    def load_issues(self, limit: int = 30) -> List[Dict]:
	        """Load up to ``limit`` issues of any state, each with up to 10 comments.

	        NOTE: GitHub's issues endpoint also returns pull requests, so the
	        result may contain them - confirm whether callers want them filtered.

	        Returns:
	            Issue dicts in the order GitHub returned them; empty when no
	            repository is loaded or the request fails.
	        """
	        if not self.current_repo:
	            return []

	        try:
	            self.issues_data = []
	            issues_list = list(self.current_repo.get_issues(state='all')[:limit])

	            with concurrent.futures.ThreadPoolExecutor() as executor:
	                processed = list(executor.map(self._process_issue, issues_list))

	            self.issues_data = [record for record in processed if record]
	            return self.issues_data
	        except Exception as e:
	            print(f"Error loading issues: {e}")
	            return []

	    def _process_issue(self, issue) -> Optional[Dict]:
	        """Convert one issue (and its first 10 comments) to a plain dict.

	        All datetimes are stored timezone-naive. Returns None on failure.
	        """
	        try:
	            issue_data = {
	                'number': issue.number,
	                'title': issue.title,
	                'body': issue.body,
	                'user': issue.user.login if issue.user else 'Unknown',
	                'state': issue.state,
	                'created_at': self._to_naive(issue.created_at),
	                'updated_at': self._to_naive(issue.updated_at),
	                'closed_at': self._to_naive(issue.closed_at),
	                'labels': [label.name for label in issue.labels],
	                'comments': []
	            }

	            # Comments are optional: keep the issue even if they fail to load
	            try:
	                for comment in issue.get_comments()[:10]:
	                    issue_data['comments'].append({
	                        'user': comment.user.login if comment.user else 'Unknown',
	                        'body': comment.body,
	                        'created_at': self._to_naive(comment.created_at)
	                    })
	            except Exception as e:
	                print(f"Error loading comments for issue #{issue.number}: {e}")

	            return issue_data
	        except Exception as e:
	            print(f"Error processing issue #{issue.number}: {e}")
	            return None


	# Knowledge Base and Vector Storage
	class KnowledgeBase:
	    """Vector index, knowledge graph and derived insights for one repository.

	    ``initialize_vector_storage`` embeds file contents into a FAISS index and
	    seeds the graph with file nodes, ``build_knowledge_graph`` adds
	    contributors and co-change edges, ``search_similar_files`` queries the
	    index and ``extract_insights`` computes summary statistics.
	    """

	    # Files longer than this are truncated before embedding
	    _MAX_EMBED_CHARS = 10000
	    # Number of texts passed to the embedding model per call
	    _EMBED_BATCH_SIZE = 32
	    # Length of the content preview returned with each search hit
	    _PREVIEW_CHARS = 1000
	    # How long extract_insights() reuses its previous result for equal inputs
	    _INSIGHTS_CACHE_SECONDS = 600
	    # Fingerprint of the inputs that produced insights_cache (None = no cache)
	    _insights_cache_key = None

	    def __init__(self, config: "Config"):
	        """Load the sentence-embedding model named in ``config``."""
	        self.config = config
	        self.embeddings = {}
	        self.embedding_model = SentenceTransformer(config.embedding_model_name)
	        self.index = None
	        self.knowledge_graph = nx.Graph()
	        self.insights = {}
	        self.insights_cache = {}
	        self.cache_timestamp = None
	        self._insights_cache_key = None

	    @staticmethod
	    def _strip_tz(value):
	        """Return ``value`` without tzinfo; ``None`` and naive values pass through."""
	        if getattr(value, 'tzinfo', None):
	            return value.replace(tzinfo=None)
	        return value

	    def _invalidate_caches(self) -> None:
	        """Forget cached search results and insights after the data changed."""
	        self.insights_cache = {}
	        self.cache_timestamp = None
	        self._insights_cache_key = None
	        # The lru_cache on search_similar_files is keyed only on
	        # (self, query, top_k), so it must be cleared whenever the index is
	        # rebuilt or it keeps answering from the previous repository.
	        self.search_similar_files.cache_clear()

	    def initialize_vector_storage(self, file_contents: Dict[str, Dict]) -> None:
	        """Rebuild the embedding index and the file nodes of the knowledge graph.

	        Args:
	            file_contents: Mapping of file path to a dict with at least a
	                ``content`` string; ``type``, ``size`` and ``ext`` are copied
	                onto the graph node when present.

	        Errors are printed, not raised. All previous state (including the old
	        index) is discarded first, so a failure never leaves a stale index.
	        """
	        try:
	            # Clear existing data
	            self.embeddings = {}
	            self.index = None
	            self.knowledge_graph = nx.Graph()
	            self._invalidate_caches()

	            ids = []
	            texts = []
	            for path, file_data in file_contents.items():
	                content = file_data['content']
	                # Truncate very large files to avoid embedding issues
	                if len(content) > self._MAX_EMBED_CHARS:
	                    content = content[:self._MAX_EMBED_CHARS] + "..."
	                ids.append(path)
	                texts.append(content)

	            # Add nodes to knowledge graph
	            for path, file_data in file_contents.items():
	                self.knowledge_graph.add_node(
	                    path,
	                    type='file',
	                    file_type=file_data.get('type', 'unknown'),
	                    size=file_data.get('size', 0),
	                    extension=file_data.get('ext', '')
	                )

	            if not texts:
	                return

	            # Encode in batches to avoid memory issues
	            step = self._EMBED_BATCH_SIZE
	            chunks = [
	                self.embedding_model.encode(texts[start:start + step])
	                for start in range(0, len(texts), step)
	            ]
	            file_embeddings = np.vstack(chunks).astype('float32')

	            # Build the FAISS index; row i of the index corresponds to ids[i]
	            index = faiss.IndexFlatL2(file_embeddings.shape[1])
	            index.add(file_embeddings)
	            self.index = index

	            # Insertion order of self.embeddings mirrors the index row order;
	            # search_similar_files relies on that to map rows back to paths.
	            for row, file_id in enumerate(ids):
	                self.embeddings[file_id] = {
	                    'embedding': file_embeddings[row],
	                    'content': texts[row]
	                }
	        except Exception as e:
	            print(f"Error initializing vector storage: {e}")

	    def build_knowledge_graph(self, commits: List[Dict], contributors: Dict) -> "nx.Graph":
	        """Add contributors and co-change relations to the knowledge graph.

	        Args:
	            commits: Commit dicts whose ``files`` entries carry ``filename``.
	            contributors: Mapping of login to contributor dict with
	                ``contributions`` and ``files_modified``.

	        Returns:
	            The updated graph, or a new empty graph when an error occurs
	            (the error is printed).
	        """
	        try:
	            graph = self.knowledge_graph
	            self._invalidate_caches()

	            # Add contributor nodes
	            for login, data in contributors.items():
	                graph.add_node(
	                    login,
	                    type='contributor',
	                    contributions=data['contributions']
	                )

	            # Connect contributors to the files they modified
	            for login, data in contributors.items():
	                for file_data in data['files_modified']:
	                    if isinstance(file_data, dict):
	                        filename = file_data['filename']
	                        count = file_data['count']
	                    else:
	                        # Not yet aggregated: a bare filename counts once
	                        filename, count = file_data, 1

	                    # Only add edges if file exists in the graph
	                    if filename not in graph:
	                        continue
	                    if graph.has_edge(login, filename):
	                        graph[login][filename]['weight'] += count
	                    else:
	                        graph.add_edge(login, filename, weight=count)

	            # Count how often two known files change in the same commit.
	            # Filtering before pairing keeps huge commits from generating
	            # pairs that would be discarded anyway; sorting gives every pair
	            # one canonical orientation.
	            file_co_occurrence = Counter()
	            for commit in commits:
	                tracked = sorted({
	                    changed['filename'] for changed in commit['files']
	                    if changed['filename'] in graph
	                })
	                file_co_occurrence.update(combinations(tracked, 2))

	            # Add edges for file co-occurrence
	            for (file1, file2), count in file_co_occurrence.items():
	                if count < 2:  # Only link files that co-occur at least twice
	                    continue
	                if graph.has_edge(file1, file2):
	                    graph[file1][file2]['weight'] += count
	                else:
	                    graph.add_edge(file1, file2, weight=count, type='co-occurrence')

	            return graph
	        except Exception as e:
	            print(f"Error building knowledge graph: {e}")
	            return nx.Graph()

	    @lru_cache(maxsize=32)
	    def search_similar_files(self, query: str, top_k: int = 5) -> List[Dict]:
	        """Return the indexed files closest to ``query``.

	        Results are memoised per ``(query, top_k)`` until the index is
	        rebuilt; the returned list is shared between callers and should be
	        treated as read-only.

	        Args:
	            query: Natural-language search text.
	            top_k: Maximum number of hits.

	        Returns:
	            Up to ``top_k`` dicts with ``file``, ``distance`` (L2 distance
	            reported by the index) and ``content`` (preview of at most 1000
	            characters). Empty when nothing is indexed or an error occurs.
	        """
	        try:
	            if self.index is None or not self.embeddings:
	                return []

	            all_ids = list(self.embeddings.keys())
	            # Never ask for more neighbours than there are vectors
	            k = min(top_k, len(all_ids))
	            if k <= 0:
	                return []

	            # Encode query
	            query_embedding = np.asarray(self.embedding_model.encode([query])).astype('float32')

	            # Search in FAISS
	            distances, indices = self.index.search(query_embedding, k)

	            results = []
	            for distance, idx in zip(distances[0], indices[0]):
	                # FAISS pads missing neighbours with label -1; a plain upper
	                # bound check would let -1 silently select the last file.
	                if not 0 <= idx < len(all_ids):
	                    continue
	                file_id = all_ids[idx]
	                content = self.embeddings[file_id]['content']
	                if len(content) > self._PREVIEW_CHARS:
	                    content = content[:self._PREVIEW_CHARS] + "..."
	                results.append({
	                    'file': file_id,
	                    'distance': float(distance),
	                    'content': content
	                })

	            return results
	        except Exception as e:
	            print(f"Error searching similar files: {e}")
	            return []

	    def extract_insights(self, repo_data: Dict, commits: List[Dict], contributors: Dict, issues: List[Dict]) -> Dict:
	        """Compute summary statistics about the repository.

	        The previous result is reused for up to 10 minutes, but only when it
	        was computed for the same repository and the same number of commits,
	        contributors and issues.

	        Args:
	            repo_data: Repository metadata dict (``name``, ``stars``, ...).
	            commits: Commit dicts with ``date`` and ``files``.
	            contributors: Mapping of login to contributor dict.
	            issues: Issue dicts with ``state``, ``labels`` and timestamps.

	        Returns:
	            Dict with the sections ``basic_stats``, ``activity``,
	            ``contributors``, ``code`` and ``issues``; ``{}`` when an error
	            occurs (the traceback is printed).
	        """
	        try:
	            current_time = time.time()
	            cache_key = (
	                repo_data.get('full_name') or repo_data.get('name'),
	                len(commits or []),
	                len(contributors or {}),
	                len(issues or []),
	            )
	            if (self.insights_cache
	                    and self.cache_timestamp is not None
	                    and self._insights_cache_key == cache_key
	                    and current_time - self.cache_timestamp < self._INSIGHTS_CACHE_SECONDS):
	                return self.insights_cache

	            insights = {
	                'basic_stats': {},
	                'activity': {},
	                'contributors': {},
	                'code': {},
	                'issues': {}
	            }

	            # Basic statistics (missing keys become None instead of aborting)
	            insights['basic_stats'] = {
	                'name': repo_data.get('name'),
	                'description': repo_data.get('description'),
	                'stars': repo_data.get('stars'),
	                'forks': repo_data.get('forks'),
	                'age_days': None,  # Calculated below when created_at is known
	                'primary_language': repo_data.get('language'),
	                'topics': repo_data.get('topics')
	            }

	            created_at = self._strip_tz(repo_data.get('created_at'))
	            if created_at:
	                # GitHub timestamps are UTC, so compare against UTC "now"
	                # rather than local wall-clock time.
	                now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
	                insights['basic_stats']['age_days'] = (now - created_at).days

	            # Activity insights
	            if commits:
	                # Normalise to timezone-naive so the dates are comparable
	                commit_dates = sorted(
	                    self._strip_tz(commit.get('date'))
	                    for commit in commits if commit.get('date')
	                )

	                if commit_dates:
	                    first_commit = commit_dates[0]
	                    last_commit = commit_dates[-1]
	                    days_span = (last_commit - first_commit).days + 1

	                    insights['activity'] = {
	                        'total_commits': len(commits),
	                        'first_commit': first_commit,
	                        'last_commit': last_commit,
	                        'days_span': days_span,
	                        'commits_per_day': round(len(commits) / max(days_span, 1), 2),
	                    }

	                    day_counts = Counter(stamp.date() for stamp in commit_dates)
	                    insights['activity']['most_active_day'] = day_counts.most_common(1)[0][0]

	                    # The 12 busiest months, ordered by commit count
	                    month_counts = Counter(stamp.strftime('%Y-%m') for stamp in commit_dates)
	                    insights['activity']['monthly_activity'] = [
	                        {'month': month, 'commits': count}
	                        for month, count in month_counts.most_common(12)
	                    ]

	            # Contributor insights
	            if contributors:
	                ranked = sorted(
	                    contributors.values(), key=lambda c: c['contributions'], reverse=True
	                )

	                insights['contributors'] = {
	                    'total_contributors': len(contributors),
	                    'top_contributors': [
	                        {
	                            'login': person['login'],
	                            'contributions': person['contributions'],
	                            'top_files': [
	                                entry['filename'] if isinstance(entry, dict) else entry
	                                for entry in (person['files_modified'] or [])[:5]
	                            ]
	                        } for person in ranked[:10]
	                    ]
	                }

	                # Bus factor (simplified): the smallest number of top
	                # contributors who together own more than half of all
	                # contributions. Uses the full ranking, and stays 0 when
	                # nobody has any contributions.
	                total_contributions = sum(person['contributions'] for person in ranked)
	                bus_factor = 0
	                if total_contributions > 0:
	                    running_sum = 0
	                    for person in ranked:
	                        running_sum += person['contributions']
	                        bus_factor += 1
	                        if running_sum / total_contributions > 0.5:
	                            break

	                insights['contributors']['bus_factor'] = bus_factor

	            # Code insights (only when the graph has nodes)
	            graph = self.knowledge_graph
	            if graph:
	                # Most connected files
	                file_nodes = [
	                    (node, degree) for node, degree in graph.degree()
	                    if graph.nodes[node].get('type') == 'file'
	                ]
	                top_files = sorted(file_nodes, key=lambda pair: pair[1], reverse=True)[:10]
	                insights['code']['central_files'] = [
	                    {'filename': filename, 'connections': degree}
	                    for filename, degree in top_files
	                ]

	                # Most frequently modified files from commits
	                file_modifications = Counter(
	                    changed['filename']
	                    for commit in (commits or [])
	                    for changed in commit['files']
	                )
	                insights['code']['frequently_modified_files'] = [
	                    {'filename': filename, 'modifications': count}
	                    for filename, count in file_modifications.most_common(10)
	                ]

	                # File types distribution
	                file_types = Counter(
	                    os.path.splitext(node)[1] for node in graph.nodes()
	                    if '.' in node and graph.nodes[node].get('type') == 'file'
	                )
	                insights['code']['file_types'] = [
	                    {'extension': ext, 'count': count}
	                    for ext, count in file_types.most_common()
	                ]

	            # Issue insights
	            if issues:
	                open_issues = [issue for issue in issues if issue['state'] == 'open']
	                closed_issues = [issue for issue in issues if issue['state'] == 'closed']

	                insights['issues'] = {
	                    'total_issues': len(issues),
	                    'open_issues': len(open_issues),
	                    'closed_issues': len(closed_issues),
	                    'resolution_rate': round(len(closed_issues) / max(len(issues), 1), 2)
	                }

	                # Average whole days between opening and closing
	                close_times = [
	                    (self._strip_tz(issue['closed_at']) - self._strip_tz(issue['created_at'])).days
	                    for issue in closed_issues
	                    if issue['created_at'] and issue['closed_at']
	                ]
	                if close_times:
	                    insights['issues']['avg_days_to_close'] = round(
	                        sum(close_times) / len(close_times), 1
	                    )

	                # Top issue labels
	                label_counts = Counter(
	                    label for issue in issues for label in issue['labels']
	                )
	                insights['issues']['top_labels'] = [
	                    {'label': label, 'count': count}
	                    for label, count in label_counts.most_common(5)
	                ]

	            # Update cache
	            self.insights_cache = insights
	            self.cache_timestamp = current_time
	            self._insights_cache_key = cache_key
	            self.insights = insights

	            return insights
	        except Exception as e:
	            import traceback
	            print(f"Error extracting insights: {e}")
	            print(traceback.format_exc())
	            return {}


	# Main GitHub AI Agent Class
	class GitHubAIAgent:
	    """Facade that loads a GitHub repository and answers questions about it.

	    Typical use: call ``set_api_keys`` once, ``load_repository`` for each
	    repository, then ``answer_query`` / ``analyze_code`` /
	    ``find_collaborators`` / ``get_repository_insights`` /
	    ``get_visualizations``. Every public method returns a dict containing
	    ``success`` and ``message`` instead of raising.
	    """

	    def __init__(self):
	        """Create an agent with default configuration and no clients yet."""
	        self.config = Config()
	        # Created by set_api_keys()
	        self.github_manager = None
	        self.knowledge_base = None
	        self.gemini_client = None
	        self.visualization_manager = None

	        self.repository_loaded = False
	        self.repository_url = ""
	        self.repository_analysis = {}
	        self.visualizations = {}

	        # Initialize caches
	        self.file_cache = {}
	        self.contributor_cache = {}
	        self.commit_cache = {}
	        self.issue_cache = {}
	        self.query_cache = {}

	    def set_api_keys(self, gemini_api_key: str, github_token: str = None) -> None:
	        """Store the API keys and (re)create all clients.

	        Args:
	            gemini_api_key: Key for the Gemini API.
	            github_token: Optional GitHub access token. When omitted, a token
	                already picked up from ``GITHUB_ACCESS_TOKEN`` is kept rather
	                than being replaced with ``None``.
	        """
	        os.environ["GEMINI_API_KEY"] = gemini_api_key
	        self.config.gemini_api_key = gemini_api_key
	        if github_token:
	            os.environ["GITHUB_ACCESS_TOKEN"] = github_token
	            self.config.github_token = github_token

	        # Initialize clients
	        self.github_manager = GitHubManager(self.config)
	        self.knowledge_base = KnowledgeBase(self.config)
	        self.gemini_client = GeminiClient(self.config.gemini_api_key, self.config.gemini_model)
	        self.visualization_manager = RepositoryVisualizer(self.config)

	    def load_repository(self, repository_url: str) -> Dict:
	        """Load and analyze a GitHub repository.

	        Args:
	            repository_url: URL of the repository to load.

	        Returns:
	            Dict with ``success``, ``message``, ``repo_data``, ``file_count``
	            and ``contributor_count``.
	        """
	        result = {
	            'success': False,
	            'message': '',
	            'repo_data': {},
	            'file_count': 0,
	            'contributor_count': 0
	        }

	        clients = (
	            self.github_manager, self.knowledge_base,
	            self.gemini_client, self.visualization_manager,
	        )
	        if any(client is None for client in clients):
	            result['message'] = "API keys not set. Call set_api_keys() before loading a repository."
	            return result

	        try:
	            # Reset state; cached answers belong to the previous repository
	            self.repository_loaded = False
	            self.repository_url = ""
	            self.repository_analysis = {}
	            self.visualizations = {}
	            self.query_cache.clear()

	            # Load repository basic info
	            print(f"Loading repository: {repository_url}")
	            if not self.github_manager.load_repository(repository_url):
	                result['message'] = "Failed to load repository. Check the URL and your GitHub access token."
	                return result

	            # Store repository URL
	            self.repository_url = repository_url

	            def load_contributors_then_commits():
	                # Commit processing attributes files to known contributors,
	                # so contributors must be complete before commits start.
	                loaded_contributors = self.github_manager.load_contributors()
	                loaded_commits = self.github_manager.load_commits()
	                return loaded_contributors, loaded_commits

	            # Files, issues and the contributor/commit chain are independent
	            # of each other and are fetched in parallel.
	            with concurrent.futures.ThreadPoolExecutor() as executor:
	                files_future = executor.submit(self.github_manager.load_files)
	                history_future = executor.submit(load_contributors_then_commits)
	                issues_future = executor.submit(self.github_manager.load_issues)

	                files = files_future.result()
	                contributors, commits = history_future.result()
	                issues = issues_future.result()

	            result['file_count'] = len(files)
	            result['contributor_count'] = len(contributors)

	            # Initialize vector storage and build knowledge graph
	            # (These are kept sequential as they depend on previous steps)
	            print("Building knowledge base")
	            self.knowledge_base.initialize_vector_storage(files)
	            knowledge_graph = self.knowledge_base.build_knowledge_graph(
	                commits, self.github_manager.contributors_data
	            )

	            # Extract repository insights
	            print("Extracting repository insights")
	            insights = self.knowledge_base.extract_insights(
	                self.github_manager.repo_data,
	                commits,
	                self.github_manager.contributors_data,
	                issues
	            )

	            def analyze_with_gemini():
	                print("Analyzing repository with Gemini")
	                return self.gemini_client.analyze_repository(
	                    self.github_manager.repo_data,
	                    files,
	                    commits,
	                    self.github_manager.contributors_data,
	                    insights
	                )

	            def create_visualizations():
	                print("Creating repository visualizations")
	                visualizer = self.visualization_manager
	                repo_graph_path = visualizer.create_repository_graph(knowledge_graph)
	                activity_chart_path = visualizer.create_commit_activity_chart(commits)
	                contributor_network_path = visualizer.create_contributor_network(
	                    self.github_manager.contributors_data, commits
	                )
	                dependency_graph_path = visualizer.create_file_dependency_graph(files)

	                return {
	                    'repository_graph': repo_graph_path,
	                    'activity_chart': activity_chart_path,
	                    'contributor_network': contributor_network_path,
	                    'dependency_graph': dependency_graph_path,
	                }

	            # Run Gemini analysis and visualization generation in parallel
	            with concurrent.futures.ThreadPoolExecutor() as executor:
	                analysis_future = executor.submit(analyze_with_gemini)
	                viz_future = executor.submit(create_visualizations)

	                self.repository_analysis = analysis_future.result()
	                self.visualizations = viz_future.result()

	            # Update result
	            result['success'] = True
	            result['message'] = f"Successfully loaded and analyzed repository: {self.github_manager.repo_data['full_name']}"
	            result['repo_data'] = self.github_manager.repo_data

	            self.repository_loaded = True

	            return result
	        except Exception as e:
	            import traceback
	            print(f"Error loading repository: {str(e)}")
	            print(traceback.format_exc())
	            result['message'] = f"Error loading repository: {str(e)}"
	            return result

	    def answer_query(self, query: str) -> Dict:
	        """Answer a natural language query about the loaded repository.

	        Successful answers are cached per query for ``config.cache_ttl``
	        seconds when ``config.cache_enabled`` is true. Failures, including
	        "No repository loaded", are never cached, so retrying after the
	        problem is fixed works. Use ``clear_caches`` to drop cached answers.

	        Returns:
	            Dict with ``success``, ``message`` and ``answer``, plus
	            ``relevant_files`` on success.
	        """
	        if not self.repository_loaded:
	            return {
	                'success': False,
	                'message': "No repository loaded. Please load a repository first.",
	                'answer': ""
	            }

	        # Check cache if enabled
	        if self.config.cache_enabled:
	            cached = self.query_cache.get(query)
	            if cached and time.time() - cached['timestamp'] < self.config.cache_ttl:
	                return cached['result']

	        try:
	            # Search for relevant files
	            similar_files = self.knowledge_base.search_similar_files(query)

	            # Get answer from Gemini
	            answer = self.gemini_client.answer_query(
	                query,
	                self.github_manager.repo_data,
	                similar_files,
	                self.knowledge_base.insights
	            )

	            result = {
	                'success': True,
	                'message': "Query answered successfully",
	                'answer': answer,
	                'relevant_files': [hit['file'] for hit in similar_files]
	            }

	            # Update cache
	            if self.config.cache_enabled:
	                self.query_cache[query] = {
	                    'result': result,
	                    'timestamp': time.time()
	                }

	            return result
	        except Exception as e:
	            return {
	                'success': False,
	                'message': f"Error answering query: {str(e)}",
	                'answer': ""
	            }

	    def analyze_code(self, file_path: str = "", code_snippet: str = "", language: str = "") -> Dict:
	        """Analyze a repository file or an ad-hoc code snippet with Gemini.

	        Args:
	            file_path: Path of a loaded repository file; takes precedence
	                over ``code_snippet`` and derives the language from the
	                file extension.
	            code_snippet: Source text to analyze when no path is given.
	            language: Language of ``code_snippet``.

	        Returns:
	            Dict with ``success``, ``message`` and ``analysis``.
	        """
	        if not file_path and not code_snippet:
	            return {
	                'success': False,
	                'message': "Please provide a file path or code snippet",
	                'analysis': ""
	            }

	        try:
	            if file_path:
	                if not self.repository_loaded:
	                    return {
	                        'success': False,
	                        'message': "No repository loaded. Please load a repository first.",
	                        'analysis': ""
	                    }

	                loaded_files = self.github_manager.file_contents
	                if file_path not in loaded_files:
	                    return {
	                        'success': False,
	                        'message': f"File not found: {file_path}",
	                        'analysis': ""
	                    }

	                code = loaded_files[file_path]['content']
	                language = os.path.splitext(file_path)[1].lstrip('.')
	            else:
	                code = code_snippet

	            # Analyze code with Gemini
	            analysis = self.gemini_client.analyze_code_snippet(code, language)

	            return {
	                'success': True,
	                'message': "Code analyzed successfully",
	                'analysis': analysis
	            }
	        except Exception as e:
	            return {
	                'success': False,
	                'message': f"Error analyzing code: {str(e)}",
	                'analysis': ""
	            }

	    def find_collaborators(self, requirements: str) -> Dict:
	        """Ask Gemini for contributors matching ``requirements``.

	        Returns:
	            Dict with ``success``, ``message`` and ``collaborators``.
	        """
	        if not self.repository_loaded:
	            return {
	                'success': False,
	                'message': "No repository loaded. Please load a repository first.",
	                'collaborators': []
	            }

	        try:
	            collaborators = self.gemini_client.identify_potential_collaborators(
	                self.github_manager.contributors_data,
	                self.knowledge_base.insights,
	                requirements
	            )

	            return {
	                'success': True,
	                'message': "Potential collaborators identified",
	                'collaborators': collaborators
	            }
	        except Exception as e:
	            return {
	                'success': False,
	                'message': f"Error finding collaborators: {str(e)}",
	                'collaborators': []
	            }

	    def get_repository_insights(self) -> Dict:
	        """Return the computed insights and the Gemini analysis.

	        Returns:
	            Dict with ``success``, ``message`` and ``insights``, plus
	            ``analysis`` on success.
	        """
	        if not self.repository_loaded:
	            return {
	                'success': False,
	                'message': "No repository loaded. Please load a repository first.",
	                'insights': {}
	            }

	        try:
	            return {
	                'success': True,
	                'message': "Repository insights retrieved",
	                'insights': self.knowledge_base.insights,
	                'analysis': self.repository_analysis
	            }
	        except Exception as e:
	            return {
	                'success': False,
	                'message': f"Error getting repository insights: {str(e)}",
	                'insights': {}
	            }

	    def get_visualizations(self) -> Dict:
	        """Return the visualizations created by ``load_repository``.

	        Returns:
	            Dict with ``success``, ``message`` and ``visualizations``.
	        """
	        if not self.repository_loaded:
	            return {
	                'success': False,
	                'message': "No repository loaded. Please load a repository first.",
	                'visualizations': {}
	            }

	        return {
	            'success': True,
	            'message': "Repository visualizations retrieved",
	            'visualizations': self.visualizations
	        }

	    def clear_caches(self) -> None:
	        """Clear all caches held by the agent and the knowledge base."""
	        for cache in (
	            self.file_cache,
	            self.contributor_cache,
	            self.commit_cache,
	            self.issue_cache,
	            self.query_cache,
	        ):
	            cache.clear()

	        # The knowledge base may not exist yet (set_api_keys not called), and
	        # its search method only exposes cache_clear while it is memoised.
	        search = getattr(self.knowledge_base, 'search_similar_files', None)
	        reset_search_cache = getattr(search, 'cache_clear', None)
	        if callable(reset_search_cache):
	            reset_search_cache()