# Sonu Prasad - updated langchain (commit c711e54)
"""
AI Core Module for GitHub Companion
Handles:
- Document embedding with ChromaDB
- Conversational RAG chain creation
- Context-aware query processing
Uses stable LangChain imports compatible with latest versions.
"""
import os
import tempfile
import pathlib
import logging
from typing import List
# LangChain imports - using stable paths for latest versions
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from shared import analysis_jobs, update_session, get_session
# Configure logging
# Module-wide logging setup: timestamped records at INFO level.
# NOTE(review): basicConfig at import time affects the root logger for the
# whole process — assumes this module is loaded before any other logging
# configuration; confirm against the application entry point.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Cache directory for embeddings model
# Lives under the OS temp dir so repeated runs reuse the downloaded
# sentence-transformers weights instead of re-fetching them.
CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers")
def format_docs(docs):
    """Concatenate the page contents of *docs*, separated by blank lines."""
    pieces = [document.page_content for document in docs]
    return "\n\n".join(pieces)
def create_conversational_chain(file_paths: List[str], session_id: str):
    """
    Create a conversational RAG chain from the provided files.

    Loads the given text files, splits them into overlapping chunks, embeds
    them into a session-scoped ChromaDB store, and wires up an LCEL
    retrieval chain backed by Gemini. The vectorstore and retriever are
    stashed in the session so the background embedder can extend them later.

    Args:
        file_paths: List of file paths to embed for initial context
        session_id: Unique session identifier

    Returns:
        A runnable chain or None if creation fails
    """
    try:
        # Lazy %-style args: the message is only formatted if the record is emitted.
        logger.info("Creating conversational chain for session %s", session_id)
        chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)

        # Load documents; unreadable (e.g. binary) files are skipped rather
        # than aborting the whole chain.
        documents = []
        if file_paths:
            for file_path in file_paths:
                try:
                    loader = TextLoader(file_path, encoding='utf-8')
                    documents.extend(loader.load())
                    logger.debug("Loaded file: %s", file_path)
                except Exception as e:
                    logger.warning("Skipping file %s: %s", file_path, e)
                    continue

        # Fallback so the vector store is never built from an empty list.
        if not documents:
            documents = [Document(page_content="No text files were provided for initial analysis.")]
            logger.warning("No documents loaded, using fallback.")

        # Split documents into chunks sized for the embedding model.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)
        logger.info("Split into %d text chunks", len(texts))

        # Local embedding model; weights cached on disk to avoid re-downloading.
        embeddings = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2",
            cache_folder=CACHE_DIR
        )

        # Session-scoped persistent vector store.
        db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
        logger.info("Created ChromaDB at %s", chroma_db_path)

        # Retriever returns the top-5 most similar chunks per query.
        retriever = db.as_retriever(search_kwargs={"k": 5})

        # Create LLM
        llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.7)

        # System prompt template
        prompt = ChatPromptTemplate.from_template(
            """You are an expert software developer assistant. Your goal is to help users
understand a GitHub repository. Use the following pieces of retrieved context
to answer the question. If you don't know the answer, just say that you don't know.
Keep your answers concise and informative. When providing code snippets, use markdown formatting.
Context:
{context}
Question: {question}
Answer:"""
        )

        # LCEL pipeline: retrieve -> format -> prompt -> LLM -> plain string.
        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Keep the store/retriever alive in the session for the background
        # embedding task (embed_entire_repository).
        update_session(session_id, "vectorstore", db)
        update_session(session_id, "retriever", retriever)
        logger.info("βœ… Conversational chain created for session %s", session_id)
        return rag_chain
    except Exception as e:
        # Broad catch at the boundary: callers treat None as "chain unavailable".
        logger.error("❌ Error creating conversational chain: %s", e)
        return None
def embed_entire_repository(session_id: str, all_file_paths: List[str]):
    """
    Background task to embed all text files in the repository.

    Appends chunks from every readable file to the session's existing
    vectorstore (created by create_conversational_chain), then flags the
    session as fully embedded.

    Args:
        session_id: Unique session identifier
        all_file_paths: List of all text file paths to embed
    """
    try:
        logger.info("Starting background embedding for session %s (%d files)",
                    session_id, len(all_file_paths))
        job = get_session(session_id)
        if not job or "vectorstore" not in job:
            # The foreground chain must be created first; nothing to extend.
            logger.error("No vectorstore found for session %s", session_id)
            return
        vectorstore = job["vectorstore"]

        # Load all documents; unreadable files are skipped best-effort, and
        # logged for consistency with create_conversational_chain.
        documents = []
        for file_path in all_file_paths:
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                documents.extend(loader.load())
            except Exception as e:
                logger.debug("Skipping file %s: %s", file_path, e)
                continue

        if documents:
            # Same chunking parameters as the initial embedding pass.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
            texts = text_splitter.split_documents(documents)
            vectorstore.add_documents(texts)
            logger.info("Added %d chunks to vectorstore", len(texts))

        # Mark complete even when no documents loaded: the job itself finished.
        update_session(session_id, "embedding_complete", True)
        logger.info("βœ… Background embedding complete for session %s", session_id)
    except Exception as e:
        # Background task: never propagate, just record the failure.
        logger.error("❌ Error in background embedding for session %s: %s", session_id, e)
def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: List[str], repo_path: str) -> str:
    """
    Query the RAG chain with additional context from pinned files.

    Builds a prompt prefix from the user's pinned files, invokes the chain,
    and appends the exchange to chat_history in place.

    Args:
        rag_chain: The runnable chain
        chat_history: List of previous chat messages (mutated: the new
            Human/AI message pair is appended on success)
        query: The user's query
        pinned_files: List of file paths the user has pinned for context
        repo_path: Path to the repository root
    Returns:
        The AI's response as a string, or an error message on failure
    """
    try:
        # Build context from pinned files; join once instead of quadratic +=.
        parts = []
        if pinned_files:
            parts.append("The user has pinned the following files for primary context. Prioritize information from these files:\n\n")
            repo_root = pathlib.Path(repo_path).resolve()
            for file in pinned_files:
                file_p = (repo_root / file).resolve()
                # Security: pinned names are user-supplied — reject paths that
                # resolve outside the repository root (e.g. "../../etc/passwd").
                if repo_root not in file_p.parents:
                    continue
                if not file_p.is_file():
                    continue
                parts.append(f"--- START OF FILE: {file} ---\n")
                try:
                    # Limit file content to prevent token overflow
                    parts.append(file_p.read_text(encoding="utf-8")[:4000])
                except Exception:
                    parts.append("(Could not read file content)")
                parts.append(f"\n--- END OF FILE: {file} ---\n\n")
        context_str = "".join(parts)

        # Build final query with pinned context
        final_query = f"{context_str}Based on the context, answer the question: {query}"

        # Invoke the chain - LCEL chains are simpler to invoke
        answer = rag_chain.invoke(final_query)

        # Update the caller's chat history in place so the session log stays current.
        chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])
        return answer
    except Exception as e:
        logger.error("Error during query invocation: %s", e)
        return f"An error occurred while processing your request: {str(e)}"