""" AI Core Module for GitHub Companion Handles: - Document embedding with ChromaDB - Conversational RAG chain creation - Context-aware query processing Uses stable LangChain imports compatible with latest versions. """ import os import tempfile import pathlib import logging from typing import List # LangChain imports - using stable paths for latest versions from langchain_community.document_loaders import TextLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_community.embeddings import SentenceTransformerEmbeddings from langchain_community.vectorstores import Chroma from langchain_google_genai import GoogleGenerativeAI from langchain_core.prompts import ChatPromptTemplate from langchain_core.documents import Document from langchain_core.messages import AIMessage, HumanMessage from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnablePassthrough from shared import analysis_jobs, update_session, get_session # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Cache directory for embeddings model CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers") def format_docs(docs): """Format retrieved documents into a single string.""" return "\n\n".join(doc.page_content for doc in docs) def create_conversational_chain(file_paths: List[str], session_id: str): """ Create a conversational RAG chain from the provided files. Args: file_paths: List of file paths to embed for initial context session_id: Unique session identifier Returns: A runnable chain or None if creation fails """ try: logger.info(f"Creating conversational chain for session {session_id}") chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id) # Load documents documents = [] if file_paths: for file_path in file_paths: try: loader = TextLoader(file_path, encoding='utf-8') documents.extend(loader.load()) logger.debug(f"Loaded file: {file_path}") except Exception as e: logger.warning(f"Skipping file {file_path}: {e}") continue # Fallback if no documents if not documents: documents = [Document(page_content="No text files were provided for initial analysis.")] logger.warning("No documents loaded, using fallback.") # Split documents text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200) texts = text_splitter.split_documents(documents) logger.info(f"Split into {len(texts)} text chunks") # Create embeddings embeddings = SentenceTransformerEmbeddings( model_name="all-MiniLM-L6-v2", cache_folder=CACHE_DIR ) # Create vector store db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path) logger.info(f"Created ChromaDB at {chroma_db_path}") # Create retriever retriever = db.as_retriever(search_kwargs={"k": 5}) # Create LLM llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.7) # System prompt template prompt = ChatPromptTemplate.from_template( """You are an expert software developer assistant. Your goal is to help users understand a GitHub repository. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Keep your answers concise and informative. When providing code snippets, use markdown formatting. Context: {context} Question: {question} Answer:""" ) # Create chain using LCEL (LangChain Expression Language) rag_chain = ( {"context": retriever | format_docs, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser() ) # Store vectorstore and retriever in session update_session(session_id, "vectorstore", db) update_session(session_id, "retriever", retriever) logger.info(f"✅ Conversational chain created for session {session_id}") return rag_chain except Exception as e: logger.error(f"❌ Error creating conversational chain: {e}") return None def embed_entire_repository(session_id: str, all_file_paths: List[str]): """ Background task to embed all text files in the repository. Args: session_id: Unique session identifier all_file_paths: List of all text file paths to embed """ try: logger.info(f"Starting background embedding for session {session_id} ({len(all_file_paths)} files)") job = get_session(session_id) if not job or "vectorstore" not in job: logger.error(f"No vectorstore found for session {session_id}") return vectorstore = job["vectorstore"] # Load all documents documents = [] for file_path in all_file_paths: try: loader = TextLoader(file_path, encoding='utf-8') documents.extend(loader.load()) except Exception: continue if documents: text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200) texts = text_splitter.split_documents(documents) vectorstore.add_documents(texts) logger.info(f"Added {len(texts)} chunks to vectorstore") update_session(session_id, "embedding_complete", True) logger.info(f"✅ Background embedding complete for session {session_id}") except Exception as e: logger.error(f"❌ Error in background embedding for session {session_id}: {e}") def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: List[str], repo_path: str) -> str: """ Query the RAG chain with additional context from pinned files. Args: rag_chain: The runnable chain chat_history: List of previous chat messages query: The user's query pinned_files: List of file paths the user has pinned for context repo_path: Path to the repository root Returns: The AI's response as a string """ try: # Build context from pinned files context_str = "" if pinned_files: context_str += "The user has pinned the following files for primary context. Prioritize information from these files:\n\n" for file in pinned_files: file_p = (pathlib.Path(repo_path) / file).resolve() if file_p.is_file(): context_str += f"--- START OF FILE: {file} ---\n" try: # Limit file content to prevent token overflow context_str += file_p.read_text(encoding="utf-8")[:4000] except Exception: context_str += "(Could not read file content)" context_str += f"\n--- END OF FILE: {file} ---\n\n" # Build final query with pinned context final_query = f"{context_str}Based on the context, answer the question: {query}" # Invoke the chain - LCEL chains are simpler to invoke answer = rag_chain.invoke(final_query) # Update chat history chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)]) return answer except Exception as e: logger.error(f"Error during query invocation: {e}") return f"An error occurred while processing your request: {str(e)}"