# Sonu Prasad - updated langchain (commit c711e54)
"""
AI Core Module for GitHub Companion
Handles:
- Document embedding with ChromaDB
- Conversational RAG chain creation
- Context-aware query processing
Uses stable LangChain imports compatible with latest versions.
"""
import os
import tempfile
import pathlib
import logging
from typing import List
# LangChain imports - using stable paths for latest versions
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from shared import analysis_jobs, update_session, get_session
# Configure logging
# Module-wide logging setup: timestamped records at INFO level.
# NOTE(review): basicConfig at import time affects the root logger for the
# whole process — assumes this module is loaded before any other logging
# configuration; confirm against the application entry point.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Cache directory for embeddings model
# Lives under the OS temp dir so repeated runs reuse the downloaded
# sentence-transformers weights instead of re-fetching them.
CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers")
def format_docs(docs):
    """Concatenate the page contents of *docs*, separated by blank lines."""
    pieces = [document.page_content for document in docs]
    return "\n\n".join(pieces)
def create_conversational_chain(file_paths: List[str], session_id: str):
    """
    Create a conversational RAG chain from the provided files.

    Loads the given text files, splits them into overlapping chunks, embeds
    them into a session-scoped ChromaDB store, and wires up an LCEL
    retrieval chain backed by Gemini. The vectorstore and retriever are
    stashed in the session so the background embedder can extend them later.

    Args:
        file_paths: List of file paths to embed for initial context
        session_id: Unique session identifier

    Returns:
        A runnable chain or None if creation fails
    """
    try:
        # Lazy %-style args: the message is only formatted if the record is emitted.
        logger.info("Creating conversational chain for session %s", session_id)
        chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)

        # Load documents; unreadable (e.g. binary) files are skipped rather
        # than aborting the whole chain.
        documents = []
        if file_paths:
            for file_path in file_paths:
                try:
                    loader = TextLoader(file_path, encoding='utf-8')
                    documents.extend(loader.load())
                    logger.debug("Loaded file: %s", file_path)
                except Exception as e:
                    logger.warning("Skipping file %s: %s", file_path, e)
                    continue

        # Fallback so the vector store is never built from an empty list.
        if not documents:
            documents = [Document(page_content="No text files were provided for initial analysis.")]
            logger.warning("No documents loaded, using fallback.")

        # Split documents into chunks sized for the embedding model.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)
        logger.info("Split into %d text chunks", len(texts))

        # Local embedding model; weights cached on disk to avoid re-downloading.
        embeddings = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2",
            cache_folder=CACHE_DIR
        )

        # Session-scoped persistent vector store.
        db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
        logger.info("Created ChromaDB at %s", chroma_db_path)

        # Retriever returns the top-5 most similar chunks per query.
        retriever = db.as_retriever(search_kwargs={"k": 5})

        # Create LLM
        llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.7)

        # System prompt template
        prompt = ChatPromptTemplate.from_template(
            """You are an expert software developer assistant. Your goal is to help users
understand a GitHub repository. Use the following pieces of retrieved context
to answer the question. If you don't know the answer, just say that you don't know.
Keep your answers concise and informative. When providing code snippets, use markdown formatting.
Context:
{context}
Question: {question}
Answer:"""
        )

        # LCEL pipeline: retrieve -> format -> prompt -> LLM -> plain string.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Keep the store/retriever alive in the session for the background
        # embedding task (embed_entire_repository).
        update_session(session_id, "vectorstore", db)
        update_session(session_id, "retriever", retriever)
        logger.info("βœ… Conversational chain created for session %s", session_id)
        return rag_chain
    except Exception as e:
        # Broad catch at the boundary: callers treat None as "chain unavailable".
        logger.error("❌ Error creating conversational chain: %s", e)
        return None
def embed_entire_repository(session_id: str, all_file_paths: List[str]):
    """
    Background task to embed all text files in the repository.

    Appends chunks from every readable file to the session's existing
    vectorstore (created by create_conversational_chain), then flags the
    session as fully embedded.

    Args:
        session_id: Unique session identifier
        all_file_paths: List of all text file paths to embed
    """
    try:
        logger.info("Starting background embedding for session %s (%d files)",
                    session_id, len(all_file_paths))
        job = get_session(session_id)
        if not job or "vectorstore" not in job:
            # The foreground chain must be created first; nothing to extend.
            logger.error("No vectorstore found for session %s", session_id)
            return
        vectorstore = job["vectorstore"]

        # Load all documents; unreadable files are skipped best-effort, and
        # logged for consistency with create_conversational_chain.
        documents = []
        for file_path in all_file_paths:
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                documents.extend(loader.load())
            except Exception as e:
                logger.debug("Skipping file %s: %s", file_path, e)
                continue

        if documents:
            # Same chunking parameters as the initial embedding pass.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
            texts = text_splitter.split_documents(documents)
            vectorstore.add_documents(texts)
            logger.info("Added %d chunks to vectorstore", len(texts))

        # Mark complete even when no documents loaded: the job itself finished.
        update_session(session_id, "embedding_complete", True)
        logger.info("βœ… Background embedding complete for session %s", session_id)
    except Exception as e:
        # Background task: never propagate, just record the failure.
        logger.error("❌ Error in background embedding for session %s: %s", session_id, e)
def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: List[str], repo_path: str) -> str:
    """
    Query the RAG chain with additional context from pinned files.

    Builds a prompt prefix from the user's pinned files, invokes the chain,
    and appends the exchange to chat_history in place.

    Args:
        rag_chain: The runnable chain
        chat_history: List of previous chat messages (mutated: the new
            Human/AI message pair is appended on success)
        query: The user's query
        pinned_files: List of file paths the user has pinned for context
        repo_path: Path to the repository root
    Returns:
        The AI's response as a string, or an error message on failure
    """
    try:
        # Build context from pinned files; join once instead of quadratic +=.
        parts = []
        if pinned_files:
            parts.append("The user has pinned the following files for primary context. Prioritize information from these files:\n\n")
            repo_root = pathlib.Path(repo_path).resolve()
            for file in pinned_files:
                file_p = (repo_root / file).resolve()
                # Security: pinned names are user-supplied — reject paths that
                # resolve outside the repository root (e.g. "../../etc/passwd").
                if repo_root not in file_p.parents:
                    continue
                if not file_p.is_file():
                    continue
                parts.append(f"--- START OF FILE: {file} ---\n")
                try:
                    # Limit file content to prevent token overflow
                    parts.append(file_p.read_text(encoding="utf-8")[:4000])
                except Exception:
                    parts.append("(Could not read file content)")
                parts.append(f"\n--- END OF FILE: {file} ---\n\n")
        context_str = "".join(parts)

        # Build final query with pinned context
        final_query = f"{context_str}Based on the context, answer the question: {query}"

        # Invoke the chain - LCEL chains are simpler to invoke
        answer = rag_chain.invoke(final_query)

        # Update the caller's chat history in place so the session log stays current.
        chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])
        return answer
    except Exception as e:
        logger.error("Error during query invocation: %s", e)
        return f"An error occurred while processing your request: {str(e)}"