| | """ |
| | AI Core Module for GitHub Companion |
| | |
| | Handles: |
| | - Document embedding with ChromaDB |
| | - Conversational RAG chain creation |
| | - Context-aware query processing |
| | |
| | Uses stable LangChain imports compatible with latest versions. |
| | """ |
| |
|
| | import os |
| | import tempfile |
| | import pathlib |
| | import logging |
| | from typing import List |
| |
|
| | |
| | from langchain_community.document_loaders import TextLoader |
| | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| | from langchain_community.embeddings import SentenceTransformerEmbeddings |
| | from langchain_community.vectorstores import Chroma |
| | from langchain_google_genai import GoogleGenerativeAI |
| | from langchain_core.prompts import ChatPromptTemplate |
| | from langchain_core.documents import Document |
| | from langchain_core.messages import AIMessage, HumanMessage |
| | from langchain_core.output_parsers import StrOutputParser |
| | from langchain_core.runnables import RunnablePassthrough |
| |
|
| | from shared import analysis_jobs, update_session, get_session |
| |
|
| | |
| | logging.basicConfig( |
| | level=logging.INFO, |
| | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' |
| | ) |
| | logger = logging.getLogger(__name__) |
| |
|
| | |
| | CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers") |
| |
|
| |
|
def format_docs(docs):
    """Join the page contents of the retrieved documents, blank-line separated."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)
| |
|
| |
|
def create_conversational_chain(file_paths: List[str], session_id: str):
    """
    Create a conversational RAG chain from the provided files.

    Loads each file as UTF-8 text (skipping unreadable ones), splits the
    documents into overlapping chunks, embeds them into a session-scoped
    ChromaDB store, and wires retriever -> prompt -> Gemini -> string parser
    into an LCEL pipeline. The vectorstore and retriever handles are saved
    on the session so the background embedder can extend the index later.

    Args:
        file_paths: List of file paths to embed for initial context.
        session_id: Unique session identifier; also scopes the on-disk
            Chroma cache directory so sessions don't collide.

    Returns:
        A runnable chain, or None if creation fails.
    """
    try:
        logger.info(f"Creating conversational chain for session {session_id}")

        # Per-session persist directory under the system temp dir.
        chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)

        # Best-effort loading: unreadable or non-text files are skipped, not fatal.
        documents = []
        if file_paths:
            for file_path in file_paths:
                try:
                    loader = TextLoader(file_path, encoding='utf-8')
                    documents.extend(loader.load())
                    logger.debug(f"Loaded file: {file_path}")
                except Exception as e:
                    logger.warning(f"Skipping file {file_path}: {e}")
                    continue

        # Chroma.from_documents requires at least one document, so fall back
        # to a placeholder when nothing could be loaded.
        if not documents:
            documents = [Document(page_content="No text files were provided for initial analysis.")]
            logger.warning("No documents loaded, using fallback.")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
        texts = text_splitter.split_documents(documents)
        logger.info(f"Split into {len(texts)} text chunks")

        # Local sentence-transformer embeddings, cached under CACHE_DIR to
        # avoid re-downloading the model per session.
        embeddings = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2",
            cache_folder=CACHE_DIR
        )

        db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
        logger.info(f"Created ChromaDB at {chroma_db_path}")

        retriever = db.as_retriever(search_kwargs={"k": 5})

        llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.7)

        prompt = ChatPromptTemplate.from_template(
            """You are an expert software developer assistant. Your goal is to help users
understand a GitHub repository. Use the following pieces of retrieved context
to answer the question. If you don't know the answer, just say that you don't know.
Keep your answers concise and informative. When providing code snippets, use markdown formatting.

Context:
{context}

Question: {question}

Answer:"""
        )

        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Expose the store/retriever so embed_entire_repository can add to it.
        update_session(session_id, "vectorstore", db)
        update_session(session_id, "retriever", retriever)

        # NOTE(review): the original log strings contained mojibake (garbled
        # emoji) that physically split a string literal; replaced with ASCII.
        logger.info(f"Conversational chain created for session {session_id}")
        return rag_chain

    except Exception as e:
        logger.error(f"Error creating conversational chain: {e}")
        return None
| |
|
| |
|
def embed_entire_repository(session_id: str, all_file_paths: List[str]):
    """
    Background task to embed all text files in the repository.

    Loads each file (best-effort), splits into chunks, and appends the
    chunks to the session's existing vectorstore. Marks the session with
    ``embedding_complete = True`` when done. Returns nothing; all outcomes
    are reported through logging and session state.

    Args:
        session_id: Unique session identifier.
        all_file_paths: List of all text file paths to embed.
    """
    try:
        logger.info(f"Starting background embedding for session {session_id} ({len(all_file_paths)} files)")

        # The vectorstore must have been created by create_conversational_chain first.
        job = get_session(session_id)
        if not job or "vectorstore" not in job:
            logger.error(f"No vectorstore found for session {session_id}")
            return

        vectorstore = job["vectorstore"]

        # Best-effort: unreadable/binary files are skipped, but leave a trace
        # instead of swallowing the failure silently.
        documents = []
        for file_path in all_file_paths:
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                documents.extend(loader.load())
            except Exception as e:
                logger.debug(f"Skipping file {file_path}: {e}")
                continue

        if documents:
            # Same chunking parameters as the initial embedding pass.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
            texts = text_splitter.split_documents(documents)
            vectorstore.add_documents(texts)
            logger.info(f"Added {len(texts)} chunks to vectorstore")

        update_session(session_id, "embedding_complete", True)
        # NOTE(review): original log string contained mojibake that split the
        # literal across two lines; replaced with ASCII.
        logger.info(f"Background embedding complete for session {session_id}")

    except Exception as e:
        logger.error(f"Error in background embedding for session {session_id}: {e}")
| |
|
| |
|
def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: List[str], repo_path: str) -> str:
    """
    Query the RAG chain with additional context from pinned files.

    Pinned file contents (truncated to 4000 chars each) are prepended to the
    user's question before invoking the chain. On success the exchange is
    appended to ``chat_history`` as HumanMessage/AIMessage pairs.

    NOTE(review): ``chat_history`` is appended to but never passed into the
    chain itself (the chain is invoked with only the query string), so prior
    turns do not influence the answer — confirm whether that is intended.

    Args:
        rag_chain: The runnable chain.
        chat_history: List of previous chat messages (mutated in place).
        query: The user's query.
        pinned_files: User-supplied relative file paths to prioritize as context.
        repo_path: Path to the repository root.

    Returns:
        The AI's response as a string, or an error message on failure.
    """
    try:
        repo_root = pathlib.Path(repo_path).resolve()

        context_str = ""
        if pinned_files:
            context_str += "The user has pinned the following files for primary context. Prioritize information from these files:\n\n"
            for file in pinned_files:
                file_p = (repo_root / file).resolve()
                # Security: pinned names are user-supplied; reject anything that
                # resolves outside the repository root (path traversal).
                try:
                    file_p.relative_to(repo_root)
                except ValueError:
                    logger.warning(f"Skipping pinned file outside repository: {file}")
                    continue
                if file_p.is_file():
                    context_str += f"--- START OF FILE: {file} ---\n"
                    try:
                        # Cap each file at 4000 chars to bound prompt size.
                        context_str += file_p.read_text(encoding="utf-8")[:4000]
                    except Exception:
                        context_str += "(Could not read file content)"
                    context_str += f"\n--- END OF FILE: {file} ---\n\n"

        final_query = f"{context_str}Based on the context, answer the question: {query}"

        answer = rag_chain.invoke(final_query)

        # Record the turn only after a successful invocation.
        chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])

        return answer

    except Exception as e:
        logger.error(f"Error during query invocation: {e}")
        return f"An error occurred while processing your request: {str(e)}"
| |
|