Sonu Prasad committed on
Commit
4c6e0cc
·
1 Parent(s): 825dc14

Optimize backend for production

Browse files
Dockerfile CHANGED
@@ -1,5 +1,6 @@
1
  FROM python:3.11-slim
2
 
 
3
  RUN apt-get update && apt-get install -y --no-install-recommends \
4
  libmagic1 \
5
  git \
@@ -10,15 +11,22 @@ WORKDIR /code
10
  # Set environment variables for cache directories
11
  ENV HF_HOME=/tmp/huggingface_cache
12
  ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface_cache
 
13
 
 
14
  COPY ./requirements.txt /code/requirements.txt
15
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
16
 
17
- # Pre-download model with proper cache directory
18
  RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface_cache/sentence_transformers')"
19
 
 
 
20
  COPY ./ai_core.py /code/ai_core.py
21
  COPY ./main.py /code/main.py
22
 
 
23
  EXPOSE 7860
 
 
24
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.11-slim
2
 
3
+ # Install system dependencies
4
  RUN apt-get update && apt-get install -y --no-install-recommends \
5
  libmagic1 \
6
  git \
 
11
  # Set environment variables for cache directories
12
  ENV HF_HOME=/tmp/huggingface_cache
13
  ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface_cache
14
+ ENV TRANSFORMERS_CACHE=/tmp/huggingface_cache
15
 
16
+ # Copy requirements first for better caching
17
  COPY ./requirements.txt /code/requirements.txt
18
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
19
 
20
+ # Pre-download the embedding model
21
  RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface_cache/sentence_transformers')"
22
 
23
+ # Copy application code
24
+ COPY ./shared.py /code/shared.py
25
  COPY ./ai_core.py /code/ai_core.py
26
  COPY ./main.py /code/main.py
27
 
28
+ # Expose port for Hugging Face Spaces
29
  EXPOSE 7860
30
+
31
+ # Run with uvicorn (single worker to avoid multiprocessing issues in HF Spaces)
32
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/ai_core.cpython-314.pyc ADDED
Binary file (9.8 kB). View file
 
__pycache__/main.cpython-314.pyc ADDED
Binary file (19.1 kB). View file
 
__pycache__/shared.cpython-314.pyc ADDED
Binary file (3.59 kB). View file
 
ai_core.py CHANGED
@@ -1,6 +1,18 @@
 
 
 
 
 
 
 
 
 
1
  import os
2
  import tempfile
3
  import pathlib
 
 
 
4
  from langchain_community.document_loaders import TextLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.embeddings import SentenceTransformerEmbeddings
@@ -12,38 +24,74 @@ from langchain_core.prompts import ChatPromptTemplate
12
  from langchain.docstore.document import Document
13
  from langchain_core.messages import AIMessage, HumanMessage
14
 
 
 
 
 
 
 
 
 
 
 
15
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers")
16
 
17
- def create_conversational_chain(file_paths: list[str], session_id: str):
18
- from main import analysis_jobs
 
 
19
 
 
 
 
 
 
 
 
20
  try:
 
 
21
  chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)
22
 
 
23
  documents = []
24
  if file_paths:
25
  for file_path in file_paths:
26
  try:
27
  loader = TextLoader(file_path, encoding='utf-8')
28
  documents.extend(loader.load())
 
29
  except Exception as e:
30
- print(f"Skipping file {file_path} due to error: {e}")
31
  continue
32
 
 
33
  if not documents:
34
  documents = [Document(page_content="No text files were provided for initial analysis.")]
 
35
 
 
36
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
37
  texts = text_splitter.split_documents(documents)
 
38
 
39
- embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 
 
 
 
40
 
 
41
  db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
 
42
 
 
43
  retriever = db.as_retriever(search_kwargs={"k": 5})
44
 
45
- llm = GoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)
 
46
 
 
47
  system_prompt = (
48
  "You are an expert software developer assistant. Your goal is to help users "
49
  "understand a GitHub repository. Use the following pieces of retrieved context "
@@ -58,45 +106,77 @@ def create_conversational_chain(file_paths: list[str], session_id: str):
58
  ("human", "{input}"),
59
  ])
60
 
 
61
  question_answer_chain = create_stuff_documents_chain(llm, prompt)
62
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
63
 
64
- # Store vectorstore in analysis_jobs directly
65
- analysis_jobs[session_id]["vectorstore"] = db
66
 
 
67
  return rag_chain
68
 
69
  except Exception as e:
70
- print(f"Error creating conversational chain: {e}")
71
  return None
72
 
73
- def embed_entire_repository(session_id: str, all_file_paths: list[str]):
74
- from main import analysis_jobs
 
 
75
 
 
 
 
 
76
  try:
77
- if session_id in analysis_jobs and "vectorstore" in analysis_jobs[session_id]:
78
- vectorstore = analysis_jobs[session_id]["vectorstore"]
79
-
80
- documents = []
81
- for file_path in all_file_paths:
82
- try:
83
- loader = TextLoader(file_path, encoding='utf-8')
84
- documents.extend(loader.load())
85
- except Exception:
86
- continue
87
-
88
- if documents:
89
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
90
- texts = text_splitter.split_documents(documents)
91
- vectorstore.add_documents(texts)
92
 
93
- analysis_jobs[session_id]["embedding_complete"] = True
94
- print(f"Background embedding complete for session {session_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  except Exception as e:
96
- print(f"Error in background embedding for session {session_id}: {e}")
 
97
 
98
- def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: list[str], repo_path: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  try:
 
100
  context_str = ""
101
  if pinned_files:
102
  context_str += "The user has pinned the following files for primary context. Prioritize information from these files:\n\n"
@@ -105,21 +185,25 @@ def query_with_context(rag_chain, chat_history: list, query: str, pinned_files:
105
  if file_p.is_file():
106
  context_str += f"--- START OF FILE: {file} ---\n"
107
  try:
 
108
  context_str += file_p.read_text(encoding="utf-8")[:4000]
109
  except Exception:
110
  context_str += "(Could not read file content)"
111
  context_str += f"\n--- END OF FILE: {file} ---\n\n"
112
 
 
113
  final_query = f"{context_str}Based on the context and our history, answer the question: {query}"
114
 
 
115
  response = rag_chain.invoke({"input": final_query, "chat_history": chat_history})
116
 
117
  answer = response.get("answer", "I could not find an answer.")
118
 
119
- # Add to chat history
120
  chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])
121
 
122
  return answer
 
123
  except Exception as e:
124
- print(f"Error during query invocation: {e}")
125
  return f"An error occurred while processing your request: {str(e)}"
 
1
+ """
2
+ AI Core Module for GitHub Companion
3
+
4
+ Handles:
5
+ - Document embedding with ChromaDB
6
+ - Conversational RAG chain creation
7
+ - Context-aware query processing
8
+ """
9
+
10
  import os
11
  import tempfile
12
  import pathlib
13
+ import logging
14
+ from typing import List
15
+
16
  from langchain_community.document_loaders import TextLoader
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.embeddings import SentenceTransformerEmbeddings
 
24
  from langchain.docstore.document import Document
25
  from langchain_core.messages import AIMessage, HumanMessage
26
 
27
+ from shared import analysis_jobs, update_session, get_session
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Cache directory for embeddings model
37
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers")
38
 
39
+
40
+ def create_conversational_chain(file_paths: List[str], session_id: str):
41
+ """
42
+ Create a conversational RAG chain from the provided files.
43
 
44
+ Args:
45
+ file_paths: List of file paths to embed for initial context
46
+ session_id: Unique session identifier
47
+
48
+ Returns:
49
+ A LangChain retrieval chain or None if creation fails
50
+ """
51
  try:
52
+ logger.info(f"Creating conversational chain for session {session_id}")
53
+
54
  chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)
55
 
56
+ # Load documents
57
  documents = []
58
  if file_paths:
59
  for file_path in file_paths:
60
  try:
61
  loader = TextLoader(file_path, encoding='utf-8')
62
  documents.extend(loader.load())
63
+ logger.debug(f"Loaded file: {file_path}")
64
  except Exception as e:
65
+ logger.warning(f"Skipping file {file_path}: {e}")
66
  continue
67
 
68
+ # Fallback if no documents
69
  if not documents:
70
  documents = [Document(page_content="No text files were provided for initial analysis.")]
71
+ logger.warning("No documents loaded, using fallback.")
72
 
73
+ # Split documents
74
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
75
  texts = text_splitter.split_documents(documents)
76
+ logger.info(f"Split into {len(texts)} text chunks")
77
 
78
+ # Create embeddings
79
+ embeddings = SentenceTransformerEmbeddings(
80
+ model_name="all-MiniLM-L6-v2",
81
+ cache_folder=CACHE_DIR
82
+ )
83
 
84
+ # Create vector store
85
  db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
86
+ logger.info(f"Created ChromaDB at {chroma_db_path}")
87
 
88
+ # Create retriever
89
  retriever = db.as_retriever(search_kwargs={"k": 5})
90
 
91
+ # Create LLM
92
+ llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.7)
93
 
94
+ # System prompt
95
  system_prompt = (
96
  "You are an expert software developer assistant. Your goal is to help users "
97
  "understand a GitHub repository. Use the following pieces of retrieved context "
 
106
  ("human", "{input}"),
107
  ])
108
 
109
+ # Create chains
110
  question_answer_chain = create_stuff_documents_chain(llm, prompt)
111
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
112
 
113
+ # Store vectorstore in session
114
+ update_session(session_id, "vectorstore", db)
115
 
116
+ logger.info(f"✅ Conversational chain created for session {session_id}")
117
  return rag_chain
118
 
119
  except Exception as e:
120
+ logger.error(f"Error creating conversational chain: {e}")
121
  return None
122
 
123
+
124
def embed_entire_repository(session_id: str, all_file_paths: List[str]):
    """
    Background task to embed all text files in the repository.

    Loads every readable file, splits it into chunks, and appends the
    chunks to the session's existing Chroma vectorstore, then flags the
    session as embedding_complete so /status can report "completed".

    Args:
        session_id: Unique session identifier
        all_file_paths: List of all text file paths to embed
    """
    try:
        # Lazy %-style logging args: evaluated only if the record is
        # emitted, per logging best practice (f-strings format eagerly).
        logger.info("Starting background embedding for session %s (%d files)",
                    session_id, len(all_file_paths))

        job = get_session(session_id)
        if not job or "vectorstore" not in job:
            logger.error("No vectorstore found for session %s", session_id)
            return

        vectorstore = job["vectorstore"]

        # Load all documents; unreadable files are skipped best-effort.
        documents = []
        for file_path in all_file_paths:
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                documents.extend(loader.load())
            except Exception:
                continue

        if documents:
            # Same chunking parameters as the initial chain so retrieval
            # behaves uniformly across early- and late-embedded documents.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
            texts = text_splitter.split_documents(documents)
            vectorstore.add_documents(texts)
            logger.info("Added %d chunks to vectorstore", len(texts))

        update_session(session_id, "embedding_complete", True)
        logger.info("✅ Background embedding complete for session %s", session_id)

    except Exception as e:
        # logger.exception records the traceback, which logger.error drops.
        logger.exception("Error in background embedding for session %s: %s", session_id, e)
162
+
163
 
164
+ def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: List[str], repo_path: str) -> str:
165
+ """
166
+ Query the RAG chain with additional context from pinned files.
167
+
168
+ Args:
169
+ rag_chain: The LangChain retrieval chain
170
+ chat_history: List of previous chat messages
171
+ query: The user's query
172
+ pinned_files: List of file paths the user has pinned for context
173
+ repo_path: Path to the repository root
174
+
175
+ Returns:
176
+ The AI's response as a string
177
+ """
178
  try:
179
+ # Build context from pinned files
180
  context_str = ""
181
  if pinned_files:
182
  context_str += "The user has pinned the following files for primary context. Prioritize information from these files:\n\n"
 
185
  if file_p.is_file():
186
  context_str += f"--- START OF FILE: {file} ---\n"
187
  try:
188
+ # Limit file content to prevent token overflow
189
  context_str += file_p.read_text(encoding="utf-8")[:4000]
190
  except Exception:
191
  context_str += "(Could not read file content)"
192
  context_str += f"\n--- END OF FILE: {file} ---\n\n"
193
 
194
+ # Build final query
195
  final_query = f"{context_str}Based on the context and our history, answer the question: {query}"
196
 
197
+ # Invoke the chain
198
  response = rag_chain.invoke({"input": final_query, "chat_history": chat_history})
199
 
200
  answer = response.get("answer", "I could not find an answer.")
201
 
202
+ # Update chat history
203
  chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])
204
 
205
  return answer
206
+
207
  except Exception as e:
208
+ logger.error(f"Error during query invocation: {e}")
209
  return f"An error occurred while processing your request: {str(e)}"
main.py CHANGED
@@ -1,245 +1,348 @@
 
 
 
 
 
 
 
1
  import os
2
  import uuid
3
  import shutil
4
  import pathlib
5
  import tempfile
6
- from typing import List, Dict, Any, Literal
 
 
 
7
  from fastapi import FastAPI, BackgroundTasks, HTTPException, Query
8
  from fastapi.responses import FileResponse, JSONResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
- from pydantic import BaseModel, Field
11
  from git import Repo
 
12
  from langchain_core.messages import AIMessage, HumanMessage
13
  from ai_core import create_conversational_chain, query_with_context, embed_entire_repository
 
 
 
 
 
14
 
15
  SESSIONS_BASE_DIR = pathlib.Path(tempfile.gettempdir()) / "repo_sessions"
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  app = FastAPI(
18
- title="Github Companion API",
19
  description="API for high-performance analysis and contextual chat with GitHub repositories.",
20
- version="4.1.0"
 
21
  )
22
 
23
- origins = ["*"]
24
  app.add_middleware(
25
  CORSMiddleware,
26
- allow_origins=origins,
27
  allow_credentials=True,
28
  allow_methods=["*"],
29
  allow_headers=["*"],
30
  )
31
 
32
- analysis_jobs: Dict[str, Dict[str, Any]] = {}
33
 
34
- def is_text_file(file_path):
35
- try:
36
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
37
- f.read(512)
38
- return True
39
- except Exception:
40
- return False
41
 
42
  class RepoRequest(BaseModel):
43
  repo_url: str
44
 
 
45
  class AnalysisResponse(BaseModel):
46
  session_id: str
47
 
 
48
  class StatusResponse(BaseModel):
49
  session_id: str
50
  status: Literal["pending", "cloning", "summarizing", "embedding_background", "completed", "failed"]
51
  message: str | None = None
52
 
 
53
  class FileDetail(BaseModel):
54
  path: str
55
  size_bytes: int
56
 
 
57
  class AnalysisResult(BaseModel):
58
  repo_url: str
59
  directory_structure: List[FileDetail]
60
  initial_summary: str
61
 
 
62
  class FileContentResponse(BaseModel):
63
  path: str
64
  content: str
65
 
 
66
  class ChatRequest(BaseModel):
67
  query: str
68
  pinned_files: List[str] = []
69
 
 
70
  class ChatResponse(BaseModel):
71
  answer: str
72
 
 
73
  class ModifiedFile(BaseModel):
74
  path: str
75
  content: str
76
 
 
77
  class DownloadRequest(BaseModel):
78
  modified_files: List[ModifiedFile]
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def initial_analysis_task(session_id: str, repo_url: str, background_tasks: BackgroundTasks):
 
 
 
 
81
  session_repo_path = SESSIONS_BASE_DIR / session_id
82
-
83
  try:
 
84
  if session_repo_path.exists():
85
  shutil.rmtree(session_repo_path)
86
-
87
  SESSIONS_BASE_DIR.mkdir(exist_ok=True)
88
- analysis_jobs[session_id] = {"status": "cloning"}
89
-
 
90
  Repo.clone_from(repo_url, str(session_repo_path), depth=1)
91
- analysis_jobs[session_id]["repo_path"] = str(session_repo_path)
92
-
93
  repo_name = repo_url.split('/')[-1].replace('.git', '')
94
- analysis_jobs[session_id]["repo_name"] = repo_name
95
-
96
- ignore_patterns = ['.git', '.gitignore', '__pycache__', 'node_modules', 'dist', 'build']
97
-
 
98
  all_file_details = []
99
  key_file_paths_for_summary = []
100
  all_text_file_paths_for_embedding = []
101
- summary_candidate_names = ["readme.md", "package.json", "pyproject.toml", "requirements.txt", "pom.xml", "build.gradle"]
102
-
 
103
  for root, dirs, files in os.walk(str(session_repo_path), topdown=True):
104
  dirs[:] = [d for d in dirs if d not in ignore_patterns]
105
  for name in files:
106
- if name in ignore_patterns:
107
  continue
108
  file_path = os.path.join(root, name)
109
  if not os.path.islink(file_path):
110
- # Fixed relative path calculation using pathlib
111
  try:
112
  relative_path = pathlib.Path(file_path).relative_to(session_repo_path).as_posix()
113
  except ValueError:
114
- # Fallback to os.path.relpath with proper escaping
115
  relative_path = os.path.relpath(file_path, str(session_repo_path)).replace("\\", "/")
116
-
117
  file_size = os.path.getsize(file_path)
118
  all_file_details.append(FileDetail(path=relative_path, size_bytes=file_size))
119
-
120
  if is_text_file(file_path):
121
  all_text_file_paths_for_embedding.append(file_path)
122
  if name.lower() in summary_candidate_names:
123
  key_file_paths_for_summary.append(file_path)
124
-
125
- analysis_jobs[session_id]["status"] = "summarizing"
126
-
127
- # Initialize chat history as a simple list
128
- analysis_jobs[session_id]["chat_history"] = []
129
-
 
130
  rag_chain = create_conversational_chain(key_file_paths_for_summary, session_id)
131
-
132
  if not rag_chain:
133
  raise Exception("Failed to create initial AI chain.")
134
-
135
- analysis_jobs[session_id]["rag_chain"] = rag_chain
136
-
 
 
 
137
  summary_query = "Based on the provided files (like README, package.json, etc.), what is the primary purpose of this software project? Provide a concise, one-paragraph summary."
138
- initial_summary = query_with_context(rag_chain, analysis_jobs[session_id]["chat_history"], summary_query, [], str(session_repo_path))
139
-
 
140
  result = AnalysisResult(
141
  repo_url=repo_url,
142
  directory_structure=sorted(all_file_details, key=lambda x: x.path),
143
  initial_summary=initial_summary
144
  )
145
- analysis_jobs[session_id]["result"] = result
146
- analysis_jobs[session_id]["status"] = "embedding_background"
147
-
 
148
  background_tasks.add_task(embed_entire_repository, session_id, all_text_file_paths_for_embedding)
149
-
150
  except Exception as e:
151
- analysis_jobs[session_id]["status"] = "failed"
152
- analysis_jobs[session_id]["message"] = str(e)
 
 
 
 
 
 
153
 
154
  @app.get("/")
155
  def read_root():
156
- return JSONResponse(content={"message": "Github Companion Backend is Running"})
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  @app.post("/analyze", response_model=AnalysisResponse, status_code=202)
159
  def submit_analysis(request: RepoRequest, background_tasks: BackgroundTasks):
 
160
  session_id = str(uuid.uuid4())
161
- analysis_jobs[session_id] = {"status": "pending"}
162
  background_tasks.add_task(initial_analysis_task, session_id, request.repo_url, background_tasks)
163
  return AnalysisResponse(session_id=session_id)
164
 
 
165
  @app.get("/status/{session_id}", response_model=StatusResponse)
166
  def get_analysis_status(session_id: str):
167
- job = analysis_jobs.get(session_id)
 
168
  if not job:
169
  raise HTTPException(status_code=404, detail="Session ID not found.")
170
-
171
- if job["status"] == "embedding_background" and analysis_jobs[session_id].get("embedding_complete"):
 
 
172
  job["status"] = "completed"
173
-
174
  return StatusResponse(session_id=session_id, status=job["status"], message=job.get("message"))
175
 
 
176
  @app.get("/result/{session_id}", response_model=AnalysisResult)
177
  def get_analysis_result(session_id: str):
178
- job = analysis_jobs.get(session_id)
 
179
  if not job or job.get("status") not in ["embedding_background", "completed"]:
180
  raise HTTPException(status_code=400, detail="Job not found or not ready.")
181
  return job["result"]
182
 
 
183
  @app.get("/file-content/{session_id}", response_model=FileContentResponse)
184
  def get_file_content(session_id: str, file_path: str = Query(..., alias="path")):
185
- job = analysis_jobs.get(session_id)
 
186
  if not job or "repo_path" not in job:
187
  raise HTTPException(status_code=404, detail="Session not found.")
188
-
189
  repo_base_path = pathlib.Path(job["repo_path"]).resolve()
190
  requested_file_path = (repo_base_path / file_path).resolve()
191
-
 
192
  if not requested_file_path.is_relative_to(repo_base_path):
193
  raise HTTPException(status_code=403, detail="Access denied.")
194
-
195
  if not requested_file_path.is_file():
196
  raise HTTPException(status_code=404, detail="File not found.")
197
-
198
  try:
199
  content = requested_file_path.read_text(encoding="utf-8")
200
  return FileContentResponse(path=file_path, content=content)
201
  except Exception as e:
202
  raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}")
203
 
 
204
  @app.post("/chat/{session_id}", response_model=ChatResponse)
205
  def chat_with_repo(session_id: str, request: ChatRequest):
206
- job = analysis_jobs.get(session_id)
 
207
  if not job or "rag_chain" not in job:
208
  raise HTTPException(status_code=404, detail="Chat session not ready.")
209
-
210
  rag_chain = job["rag_chain"]
211
  chat_history = job.get("chat_history", [])
212
  repo_path = job["repo_path"]
213
-
214
  answer = query_with_context(rag_chain, chat_history, request.query, request.pinned_files, repo_path)
215
-
216
  return ChatResponse(answer=answer)
217
 
 
218
  @app.post("/download-zip/{session_id}")
219
  async def download_zip(session_id: str, request: DownloadRequest, background_tasks: BackgroundTasks):
220
- job = analysis_jobs.get(session_id)
 
221
  if not job or "repo_path" not in job:
222
  raise HTTPException(status_code=404, detail="Session not found.")
223
-
224
  repo_base_path = pathlib.Path(job["repo_path"]).resolve()
225
  repo_name = job.get("repo_name", session_id)
226
  temp_zip_dir = pathlib.Path(tempfile.gettempdir()) / "temp_zips"
227
-
 
228
  for modified_file in request.modified_files:
229
  file_to_update = (repo_base_path / modified_file.path).resolve()
230
-
231
  if not file_to_update.is_relative_to(repo_base_path):
232
  continue
233
-
234
  file_to_update.parent.mkdir(parents=True, exist_ok=True)
235
  file_to_update.write_text(modified_file.content, encoding="utf-8")
236
-
 
237
  temp_zip_dir.mkdir(exist_ok=True)
238
  zip_path_base = temp_zip_dir / f"{repo_name}-{session_id}"
239
  zip_path_final = shutil.make_archive(str(zip_path_base), 'zip', str(repo_base_path))
240
-
 
241
  background_tasks.add_task(os.remove, zip_path_final)
242
-
243
  return FileResponse(
244
  path=zip_path_final,
245
  media_type='application/zip',
 
1
+ """
2
+ GitHub Companion API - Main FastAPI Application
3
+
4
+ A high-performance API for analyzing and chatting with GitHub repositories.
5
+ Optimized for Hugging Face Spaces deployment with multi-user support.
6
+ """
7
+
8
  import os
9
  import uuid
10
  import shutil
11
  import pathlib
12
  import tempfile
13
+ import asyncio
14
+ from typing import List, Literal
15
+ from contextlib import asynccontextmanager
16
+
17
  from fastapi import FastAPI, BackgroundTasks, HTTPException, Query
18
  from fastapi.responses import FileResponse, JSONResponse
19
  from fastapi.middleware.cors import CORSMiddleware
20
+ from pydantic import BaseModel
21
  from git import Repo
22
+
23
  from langchain_core.messages import AIMessage, HumanMessage
24
  from ai_core import create_conversational_chain, query_with_context, embed_entire_repository
25
+ from shared import analysis_jobs, get_session, set_session, update_session
26
+
27
+ # ============================================================================
28
+ # Configuration
29
+ # ============================================================================
30
 
31
  SESSIONS_BASE_DIR = pathlib.Path(tempfile.gettempdir()) / "repo_sessions"
32
 
33
+
34
+ # ============================================================================
35
+ # Lifespan Context Manager (Startup/Shutdown)
36
+ # ============================================================================
37
+
38
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown hook wired into the FastAPI app via `lifespan=`.

    Startup (before yield): ensures the session workspace directory exists.
    Shutdown (after yield): no teardown work yet — only logs.
    """
    # Startup: Ensure directories exist
    SESSIONS_BASE_DIR.mkdir(exist_ok=True)
    print(f"✅ GitHub Companion API started. Sessions dir: {SESSIONS_BASE_DIR}")
    yield  # the application serves requests while suspended here
    # Shutdown: Cleanup could be added here if needed
    print("🛑 GitHub Companion API shutting down.")
47
+
48
+
49
# ============================================================================
# FastAPI App Initialization
# ============================================================================

app = FastAPI(
    title="GitHub Companion API",
    description="API for high-performance analysis and contextual chat with GitHub repositories.",
    version="5.0.0",
    lifespan=lifespan
)

# CORS Configuration (allows all origins for Hugging Face Spaces).
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# a known footgun — the CORS spec forbids a wildcard origin when credentials
# are sent, so browsers may reject credentialed requests. Confirm whether the
# frontend actually needs credentials, or pin the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
68
 
 
69
 
70
+ # ============================================================================
71
+ # Pydantic Models
72
+ # ============================================================================
 
 
 
 
73
 
74
class RepoRequest(BaseModel):
    # Clone URL of the GitHub repository to analyze.
    repo_url: str


class AnalysisResponse(BaseModel):
    # Opaque UUID identifying the newly created analysis session.
    session_id: str


class StatusResponse(BaseModel):
    session_id: str
    # Lifecycle: pending -> cloning -> summarizing -> embedding_background
    # -> completed, with "failed" possible at any stage
    # (transitions driven by initial_analysis_task).
    status: Literal["pending", "cloning", "summarizing", "embedding_background", "completed", "failed"]
    # Human-readable error detail; populated when status is "failed".
    message: str | None = None


class FileDetail(BaseModel):
    # Path relative to the repository root (POSIX-style separators).
    path: str
    size_bytes: int


class AnalysisResult(BaseModel):
    repo_url: str
    # Every non-symlink file found in the clone, sorted by path.
    directory_structure: List[FileDetail]
    # One-paragraph LLM-generated summary of the project's purpose.
    initial_summary: str


class FileContentResponse(BaseModel):
    path: str
    content: str


class ChatRequest(BaseModel):
    query: str
    # Repo-relative paths the user pinned; their contents are injected into
    # the prompt as priority context by query_with_context.
    pinned_files: List[str] = []


class ChatResponse(BaseModel):
    answer: str


class ModifiedFile(BaseModel):
    path: str
    content: str


class DownloadRequest(BaseModel):
    # Edited files to write back into the clone before zipping it for download.
    modified_files: List[ModifiedFile]
120
 
121
+
122
+ # ============================================================================
123
+ # Utility Functions
124
+ # ============================================================================
125
+
126
def is_text_file(file_path: str) -> bool:
    """Check whether a file can be decoded as UTF-8 text.

    Reads only the first 512 characters as a cheap probe. Uses strict
    decoding: the previous errors='ignore' suppressed UnicodeDecodeError,
    so every readable file — binaries included — was classified as text
    and queued for embedding.

    Args:
        file_path: Path of the file to probe.

    Returns:
        True if the sampled prefix decodes as UTF-8, False on a decoding
        failure or any OS-level error (missing file, permissions, ...).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            f.read(512)
        return True
    except (UnicodeDecodeError, OSError):
        return False
134
+
135
+
136
def initial_analysis_task(session_id: str, repo_url: str, background_tasks: BackgroundTasks):
    """
    Background task to clone and analyze a repository.

    Pipeline: shallow clone -> index files -> build a RAG chain from key
    manifest files -> generate an initial summary -> schedule full-repo
    embedding. Progress is published to the shared session store via
    update_session so /status can report it; on any failure the session
    is marked "failed" with the error message.

    Defined as a sync function, so FastAPI runs it in a thread pool
    rather than on the event loop.

    Args:
        session_id: Unique session identifier created by /analyze.
        repo_url: Clone URL of the GitHub repository.
        background_tasks: The request's BackgroundTasks, used to chain the
            full-repo embedding after this task.
    """
    session_repo_path = SESSIONS_BASE_DIR / session_id

    try:
        # Cleanup if exists (a retried session id reuses the same directory)
        if session_repo_path.exists():
            shutil.rmtree(session_repo_path)

        SESSIONS_BASE_DIR.mkdir(exist_ok=True)
        update_session(session_id, "status", "cloning")

        # Clone repository (shallow clone for speed)
        Repo.clone_from(repo_url, str(session_repo_path), depth=1)
        update_session(session_id, "repo_path", str(session_repo_path))

        repo_name = repo_url.split('/')[-1].replace('.git', '')
        update_session(session_id, "repo_name", repo_name)

        # Directory/file names excluded from indexing and embedding
        ignore_patterns = {'.git', '.gitignore', '__pycache__', 'node_modules', 'dist', 'build', '.venv', 'venv'}

        all_file_details = []
        key_file_paths_for_summary = []
        all_text_file_paths_for_embedding = []
        # Manifest-style files that best describe a project's purpose;
        # matched against name.lower() below.
        summary_candidate_names = {"readme.md", "package.json", "pyproject.toml", "requirements.txt", "pom.xml", "build.gradle", "cargo.toml"}

        # Walk the repository (pruning ignored dirs in place via dirs[:])
        for root, dirs, files in os.walk(str(session_repo_path), topdown=True):
            dirs[:] = [d for d in dirs if d not in ignore_patterns]
            for name in files:
                if name in ignore_patterns:
                    continue
                file_path = os.path.join(root, name)
                if not os.path.islink(file_path):
                    try:
                        relative_path = pathlib.Path(file_path).relative_to(session_repo_path).as_posix()
                    except ValueError:
                        # Fallback when pathlib cannot relativize the path
                        relative_path = os.path.relpath(file_path, str(session_repo_path)).replace("\\", "/")

                    file_size = os.path.getsize(file_path)
                    all_file_details.append(FileDetail(path=relative_path, size_bytes=file_size))

                    if is_text_file(file_path):
                        all_text_file_paths_for_embedding.append(file_path)
                        if name.lower() in summary_candidate_names:
                            key_file_paths_for_summary.append(file_path)

        update_session(session_id, "status", "summarizing")

        # Initialize chat history
        update_session(session_id, "chat_history", [])

        # Create RAG chain with key files
        rag_chain = create_conversational_chain(key_file_paths_for_summary, session_id)

        if not rag_chain:
            raise Exception("Failed to create initial AI chain.")

        update_session(session_id, "rag_chain", rag_chain)

        # Generate initial summary.
        # NOTE(review): query_with_context mutates chat_history in place; if
        # get_session returns a *copy* rather than the live session dict, the
        # summary exchange is silently dropped from the stored history —
        # confirm get_session's semantics in shared.py.
        job = get_session(session_id)
        chat_history = job.get("chat_history", [])
        summary_query = "Based on the provided files (like README, package.json, etc.), what is the primary purpose of this software project? Provide a concise, one-paragraph summary."
        initial_summary = query_with_context(rag_chain, chat_history, summary_query, [], str(session_repo_path))

        # Store result
        result = AnalysisResult(
            repo_url=repo_url,
            directory_structure=sorted(all_file_details, key=lambda x: x.path),
            initial_summary=initial_summary
        )
        update_session(session_id, "result", result)
        update_session(session_id, "status", "embedding_background")

        # Start background embedding.
        # NOTE(review): this function itself runs from BackgroundTasks;
        # appending another task mid-run relies on Starlette picking up tasks
        # added to the list while it is being executed — verify the embedding
        # task actually runs in this deployment.
        background_tasks.add_task(embed_entire_repository, session_id, all_text_file_paths_for_embedding)

    except Exception as e:
        update_session(session_id, "status", "failed")
        update_session(session_id, "message", str(e))
        print(f"❌ Analysis failed for session {session_id}: {e}")
222
+
223
+
224
+ # ============================================================================
225
+ # API Endpoints
226
+ # ============================================================================
227
 
228
@app.get("/")
def read_root():
    """Root endpoint exposing basic service metadata."""
    info = {
        "message": "GitHub Companion Backend is Running",
        "version": "5.0.0",
        "docs": "/docs",
    }
    return JSONResponse(content=info)
236
+
237
+
238
@app.get("/health")
def health_check():
    """Liveness probe used by Hugging Face Spaces."""
    body = {"status": "healthy"}
    return JSONResponse(content=body)
242
+
243
 
244
@app.post("/analyze", response_model=AnalysisResponse, status_code=202)
def submit_analysis(request: RepoRequest, background_tasks: BackgroundTasks):
    """Queue a repository for background analysis and return its session id."""
    new_session = str(uuid.uuid4())
    # Register the session before scheduling work so /status can find it immediately.
    set_session(new_session, {"status": "pending"})
    background_tasks.add_task(initial_analysis_task, new_session, request.repo_url, background_tasks)
    return AnalysisResponse(session_id=new_session)
251
 
252
+
253
@app.get("/status/{session_id}", response_model=StatusResponse)
def get_analysis_status(session_id: str):
    """Report the current state of an analysis job."""
    job = get_session(session_id)
    if not job:
        raise HTTPException(status_code=404, detail="Session ID not found.")

    # Promote the job to "completed" once background embedding has finished.
    embedding_done = (
        job.get("status") == "embedding_background" and job.get("embedding_complete")
    )
    if embedding_done:
        update_session(session_id, "status", "completed")
        job["status"] = "completed"

    return StatusResponse(session_id=session_id, status=job["status"], message=job.get("message"))
266
 
267
+
268
@app.get("/result/{session_id}", response_model=AnalysisResult)
def get_analysis_result(session_id: str):
    """Return the stored analysis result once the job is ready."""
    ready_states = ("embedding_background", "completed")
    job = get_session(session_id)
    if not job or job.get("status") not in ready_states:
        raise HTTPException(status_code=400, detail="Job not found or not ready.")
    return job["result"]
275
 
276
+
277
@app.get("/file-content/{session_id}", response_model=FileContentResponse)
def get_file_content(session_id: str, file_path: str = Query(..., alias="path")):
    """Return the UTF-8 content of a single file inside the cloned repository.

    Raises:
        HTTPException 404: unknown session, or the file does not exist.
        HTTPException 403: the requested path escapes the repository root.
        HTTPException 500: the file exists but cannot be read as UTF-8 text.
    """
    job = get_session(session_id)
    if not job or "repo_path" not in job:
        raise HTTPException(status_code=404, detail="Session not found.")

    repo_base_path = pathlib.Path(job["repo_path"]).resolve()
    requested_file_path = (repo_base_path / file_path).resolve()

    # Security: resolve() collapses "..", so this containment check blocks
    # path-traversal attempts.
    if not requested_file_path.is_relative_to(repo_base_path):
        raise HTTPException(status_code=403, detail="Access denied.")

    if not requested_file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found.")

    try:
        content = requested_file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Narrowed from a blanket `except Exception` so programming errors are
        # not masked as read failures; chain the cause for debugging.
        raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") from e
    return FileContentResponse(path=file_path, content=content)
299
 
300
+
301
@app.post("/chat/{session_id}", response_model=ChatResponse)
def chat_with_repo(session_id: str, request: ChatRequest):
    """Answer a user question about the repository via the RAG chain."""
    job = get_session(session_id)
    if not job or "rag_chain" not in job:
        raise HTTPException(status_code=404, detail="Chat session not ready.")

    answer = query_with_context(
        job["rag_chain"],
        job.get("chat_history", []),
        request.query,
        request.pinned_files,
        job["repo_path"],
    )
    return ChatResponse(answer=answer)
315
 
316
+
317
  @app.post("/download-zip/{session_id}")
318
  async def download_zip(session_id: str, request: DownloadRequest, background_tasks: BackgroundTasks):
319
+ """Download the repository as a ZIP file with any modifications applied."""
320
+ job = get_session(session_id)
321
  if not job or "repo_path" not in job:
322
  raise HTTPException(status_code=404, detail="Session not found.")
323
+
324
  repo_base_path = pathlib.Path(job["repo_path"]).resolve()
325
  repo_name = job.get("repo_name", session_id)
326
  temp_zip_dir = pathlib.Path(tempfile.gettempdir()) / "temp_zips"
327
+
328
+ # Apply modifications
329
  for modified_file in request.modified_files:
330
  file_to_update = (repo_base_path / modified_file.path).resolve()
331
+
332
  if not file_to_update.is_relative_to(repo_base_path):
333
  continue
334
+
335
  file_to_update.parent.mkdir(parents=True, exist_ok=True)
336
  file_to_update.write_text(modified_file.content, encoding="utf-8")
337
+
338
+ # Create ZIP
339
  temp_zip_dir.mkdir(exist_ok=True)
340
  zip_path_base = temp_zip_dir / f"{repo_name}-{session_id}"
341
  zip_path_final = shutil.make_archive(str(zip_path_base), 'zip', str(repo_base_path))
342
+
343
+ # Cleanup ZIP after download
344
  background_tasks.add_task(os.remove, zip_path_final)
345
+
346
  return FileResponse(
347
  path=zip_path_final,
348
  media_type='application/zip',
requirements.txt CHANGED
@@ -1,14 +1,17 @@
1
- fastapi>=0.111.0
2
- uvicorn[standard]>=0.29.0
3
- GitPython>=3.1.43
4
- langchain>=0.2.5
5
- langchain-core>=0.2.9
6
- langchain-community>=0.2.4
7
- langchain-google-genai>=1.0.5
8
- chromadb>=0.5.0
9
- sentence-transformers>=2.7.0
10
- unstructured>=0.14.4
11
- python-magic>=0.4.27
12
- torch
13
- transformers
14
- huggingface-hub>=0.23.0
 
 
 
 
1
+ fastapi==0.115.14
2
+ uvicorn[standard]==0.35.0
3
+ GitPython==3.1.44
4
+
5
+ langchain==0.3.24
6
+ langchain-core==0.3.66
7
+ langchain-community==0.4.1
8
+ langchain-google-genai==2.0.8
9
+
10
+ chromadb==0.6.3
11
+ sentence-transformers==4.1.0
12
+ unstructured==0.16.17
13
+ python-magic==0.4.27
14
+
15
+ torch==2.7.0
16
+ transformers==4.53.0
17
+ huggingface-hub==0.33.1
shared.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared state module for GitHub Companion Backend.
3
+
4
+ This module provides thread-safe shared state for managing analysis sessions
5
+ across multiple concurrent users. It uses threading locks to ensure safe
6
+ access to the shared dictionary in a multi-threaded environment.
7
+ """
8
+
9
+ import threading
10
+ from typing import Dict, Any
11
+
12
+ # Thread-safe lock for accessing analysis_jobs
13
+ _lock = threading.Lock()
14
+
15
+ # Global dictionary to store analysis job states
16
+ # Each session_id maps to a dictionary containing:
17
+ # - status: The current job status
18
+ # - repo_path: Path to cloned repository
19
+ # - rag_chain: The LangChain retrieval chain
20
+ # - vectorstore: ChromaDB vectorstore
21
+ # - chat_history: List of chat messages
22
+ # - result: Analysis result data
23
+ analysis_jobs: Dict[str, Dict[str, Any]] = {}
24
+
25
+
26
+ def get_session(session_id: str) -> Dict[str, Any] | None:
27
+ """Thread-safe getter for a session."""
28
+ with _lock:
29
+ return analysis_jobs.get(session_id)
30
+
31
+
32
+ def set_session(session_id: str, data: Dict[str, Any]) -> None:
33
+ """Thread-safe setter for a session."""
34
+ with _lock:
35
+ analysis_jobs[session_id] = data
36
+
37
+
38
+ def update_session(session_id: str, key: str, value: Any) -> None:
39
+ """Thread-safe update for a specific key in a session."""
40
+ with _lock:
41
+ if session_id in analysis_jobs:
42
+ analysis_jobs[session_id][key] = value
43
+
44
+
45
+ def delete_session(session_id: str) -> bool:
46
+ """Thread-safe deletion of a session. Returns True if deleted."""
47
+ with _lock:
48
+ if session_id in analysis_jobs:
49
+ del analysis_jobs[session_id]
50
+ return True
51
+ return False
52
+
53
+
54
+ def session_exists(session_id: str) -> bool:
55
+ """Check if a session exists."""
56
+ with _lock:
57
+ return session_id in analysis_jobs