Sonu Prasad committed on
Commit
4c6e0cc
·
1 Parent(s): 825dc14

Optimize backend for production

Browse files
Dockerfile CHANGED
@@ -1,5 +1,6 @@
1
  FROM python:3.11-slim
2
 
 
3
  RUN apt-get update && apt-get install -y --no-install-recommends \
4
  libmagic1 \
5
  git \
@@ -10,15 +11,22 @@ WORKDIR /code
10
  # Set environment variables for cache directories
11
  ENV HF_HOME=/tmp/huggingface_cache
12
  ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface_cache
 
13
 
 
14
  COPY ./requirements.txt /code/requirements.txt
15
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
16
 
17
- # Pre-download model with proper cache directory
18
  RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface_cache/sentence_transformers')"
19
 
 
 
20
  COPY ./ai_core.py /code/ai_core.py
21
  COPY ./main.py /code/main.py
22
 
 
23
  EXPOSE 7860
 
 
24
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.11-slim
2
 
3
+ # Install system dependencies
4
  RUN apt-get update && apt-get install -y --no-install-recommends \
5
  libmagic1 \
6
  git \
 
11
  # Set environment variables for cache directories
12
  ENV HF_HOME=/tmp/huggingface_cache
13
  ENV SENTENCE_TRANSFORMERS_HOME=/tmp/huggingface_cache
14
+ ENV TRANSFORMERS_CACHE=/tmp/huggingface_cache
15
 
16
+ # Copy requirements first for better caching
17
  COPY ./requirements.txt /code/requirements.txt
18
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
19
 
20
+ # Pre-download the embedding model
21
  RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/huggingface_cache/sentence_transformers')"
22
 
23
+ # Copy application code
24
+ COPY ./shared.py /code/shared.py
25
  COPY ./ai_core.py /code/ai_core.py
26
  COPY ./main.py /code/main.py
27
 
28
+ # Expose port for Hugging Face Spaces
29
  EXPOSE 7860
30
+
31
+ # Run with uvicorn (single worker to avoid multiprocessing issues in HF Spaces)
32
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/ai_core.cpython-314.pyc ADDED
Binary file (9.8 kB). View file
 
__pycache__/main.cpython-314.pyc ADDED
Binary file (19.1 kB). View file
 
__pycache__/shared.cpython-314.pyc ADDED
Binary file (3.59 kB). View file
 
ai_core.py CHANGED
@@ -1,6 +1,18 @@
 
 
 
 
 
 
 
 
 
1
  import os
2
  import tempfile
3
  import pathlib
 
 
 
4
  from langchain_community.document_loaders import TextLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain_community.embeddings import SentenceTransformerEmbeddings
@@ -12,38 +24,74 @@ from langchain_core.prompts import ChatPromptTemplate
12
  from langchain.docstore.document import Document
13
  from langchain_core.messages import AIMessage, HumanMessage
14
 
 
 
 
 
 
 
 
 
 
 
15
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers")
16
 
17
- def create_conversational_chain(file_paths: list[str], session_id: str):
18
- from main import analysis_jobs
 
 
19
 
 
 
 
 
 
 
 
20
  try:
 
 
21
  chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)
22
 
 
23
  documents = []
24
  if file_paths:
25
  for file_path in file_paths:
26
  try:
27
  loader = TextLoader(file_path, encoding='utf-8')
28
  documents.extend(loader.load())
 
29
  except Exception as e:
30
- print(f"Skipping file {file_path} due to error: {e}")
31
  continue
32
 
 
33
  if not documents:
34
  documents = [Document(page_content="No text files were provided for initial analysis.")]
 
35
 
 
36
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
37
  texts = text_splitter.split_documents(documents)
 
38
 
39
- embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", cache_folder=CACHE_DIR)
 
 
 
 
40
 
 
41
  db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
 
42
 
 
43
  retriever = db.as_retriever(search_kwargs={"k": 5})
44
 
45
- llm = GoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)
 
46
 
 
47
  system_prompt = (
48
  "You are an expert software developer assistant. Your goal is to help users "
49
  "understand a GitHub repository. Use the following pieces of retrieved context "
@@ -58,45 +106,77 @@ def create_conversational_chain(file_paths: list[str], session_id: str):
58
  ("human", "{input}"),
59
  ])
60
 
 
61
  question_answer_chain = create_stuff_documents_chain(llm, prompt)
62
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
63
 
64
- # Store vectorstore in analysis_jobs directly
65
- analysis_jobs[session_id]["vectorstore"] = db
66
 
 
67
  return rag_chain
68
 
69
  except Exception as e:
70
- print(f"Error creating conversational chain: {e}")
71
  return None
72
 
73
- def embed_entire_repository(session_id: str, all_file_paths: list[str]):
74
- from main import analysis_jobs
 
 
75
 
 
 
 
 
76
  try:
77
- if session_id in analysis_jobs and "vectorstore" in analysis_jobs[session_id]:
78
- vectorstore = analysis_jobs[session_id]["vectorstore"]
79
-
80
- documents = []
81
- for file_path in all_file_paths:
82
- try:
83
- loader = TextLoader(file_path, encoding='utf-8')
84
- documents.extend(loader.load())
85
- except Exception:
86
- continue
87
-
88
- if documents:
89
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
90
- texts = text_splitter.split_documents(documents)
91
- vectorstore.add_documents(texts)
92
 
93
- analysis_jobs[session_id]["embedding_complete"] = True
94
- print(f"Background embedding complete for session {session_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  except Exception as e:
96
- print(f"Error in background embedding for session {session_id}: {e}")
 
97
 
98
- def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: list[str], repo_path: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  try:
 
100
  context_str = ""
101
  if pinned_files:
102
  context_str += "The user has pinned the following files for primary context. Prioritize information from these files:\n\n"
@@ -105,21 +185,25 @@ def query_with_context(rag_chain, chat_history: list, query: str, pinned_files:
105
  if file_p.is_file():
106
  context_str += f"--- START OF FILE: {file} ---\n"
107
  try:
 
108
  context_str += file_p.read_text(encoding="utf-8")[:4000]
109
  except Exception:
110
  context_str += "(Could not read file content)"
111
  context_str += f"\n--- END OF FILE: {file} ---\n\n"
112
 
 
113
  final_query = f"{context_str}Based on the context and our history, answer the question: {query}"
114
 
 
115
  response = rag_chain.invoke({"input": final_query, "chat_history": chat_history})
116
 
117
  answer = response.get("answer", "I could not find an answer.")
118
 
119
- # Add to chat history
120
  chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])
121
 
122
  return answer
 
123
  except Exception as e:
124
- print(f"Error during query invocation: {e}")
125
  return f"An error occurred while processing your request: {str(e)}"
 
1
+ """
2
+ AI Core Module for GitHub Companion
3
+
4
+ Handles:
5
+ - Document embedding with ChromaDB
6
+ - Conversational RAG chain creation
7
+ - Context-aware query processing
8
+ """
9
+
10
  import os
11
  import tempfile
12
  import pathlib
13
+ import logging
14
+ from typing import List
15
+
16
  from langchain_community.document_loaders import TextLoader
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
18
  from langchain_community.embeddings import SentenceTransformerEmbeddings
 
24
  from langchain.docstore.document import Document
25
  from langchain_core.messages import AIMessage, HumanMessage
26
 
27
+ from shared import analysis_jobs, update_session, get_session
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Cache directory for embeddings model
37
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "huggingface_cache", "sentence_transformers")
38
 
39
+
40
+ def create_conversational_chain(file_paths: List[str], session_id: str):
41
+ """
42
+ Create a conversational RAG chain from the provided files.
43
 
44
+ Args:
45
+ file_paths: List of file paths to embed for initial context
46
+ session_id: Unique session identifier
47
+
48
+ Returns:
49
+ A LangChain retrieval chain or None if creation fails
50
+ """
51
  try:
52
+ logger.info(f"Creating conversational chain for session {session_id}")
53
+
54
  chroma_db_path = os.path.join(tempfile.gettempdir(), "chroma_db_cache", session_id)
55
 
56
+ # Load documents
57
  documents = []
58
  if file_paths:
59
  for file_path in file_paths:
60
  try:
61
  loader = TextLoader(file_path, encoding='utf-8')
62
  documents.extend(loader.load())
63
+ logger.debug(f"Loaded file: {file_path}")
64
  except Exception as e:
65
+ logger.warning(f"Skipping file {file_path}: {e}")
66
  continue
67
 
68
+ # Fallback if no documents
69
  if not documents:
70
  documents = [Document(page_content="No text files were provided for initial analysis.")]
71
+ logger.warning("No documents loaded, using fallback.")
72
 
73
+ # Split documents
74
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
75
  texts = text_splitter.split_documents(documents)
76
+ logger.info(f"Split into {len(texts)} text chunks")
77
 
78
+ # Create embeddings
79
+ embeddings = SentenceTransformerEmbeddings(
80
+ model_name="all-MiniLM-L6-v2",
81
+ cache_folder=CACHE_DIR
82
+ )
83
 
84
+ # Create vector store
85
  db = Chroma.from_documents(texts, embeddings, persist_directory=chroma_db_path)
86
+ logger.info(f"Created ChromaDB at {chroma_db_path}")
87
 
88
+ # Create retriever
89
  retriever = db.as_retriever(search_kwargs={"k": 5})
90
 
91
+ # Create LLM
92
+ llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.7)
93
 
94
+ # System prompt
95
  system_prompt = (
96
  "You are an expert software developer assistant. Your goal is to help users "
97
  "understand a GitHub repository. Use the following pieces of retrieved context "
 
106
  ("human", "{input}"),
107
  ])
108
 
109
+ # Create chains
110
  question_answer_chain = create_stuff_documents_chain(llm, prompt)
111
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
112
 
113
+ # Store vectorstore in session
114
+ update_session(session_id, "vectorstore", db)
115
 
116
+ logger.info(f"✅ Conversational chain created for session {session_id}")
117
  return rag_chain
118
 
119
  except Exception as e:
120
+ logger.error(f"Error creating conversational chain: {e}")
121
  return None
122
 
123
+
124
def embed_entire_repository(session_id: str, all_file_paths: List[str]):
    """
    Background task to embed all text files in the repository.

    Loads every readable file, splits it into chunks, and appends the
    chunks to the session's existing Chroma vectorstore, then flags the
    session as embedding_complete so /status can report "completed".

    Args:
        session_id: Unique session identifier
        all_file_paths: List of all text file paths to embed
    """
    try:
        # Lazy %-style logging args: evaluated only if the record is
        # emitted, per logging best practice (f-strings format eagerly).
        logger.info("Starting background embedding for session %s (%d files)",
                    session_id, len(all_file_paths))

        job = get_session(session_id)
        if not job or "vectorstore" not in job:
            logger.error("No vectorstore found for session %s", session_id)
            return

        vectorstore = job["vectorstore"]

        # Load all documents; unreadable files are skipped best-effort.
        documents = []
        for file_path in all_file_paths:
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                documents.extend(loader.load())
            except Exception:
                continue

        if documents:
            # Same chunking parameters as the initial chain so retrieval
            # behaves uniformly across early- and late-embedded documents.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
            texts = text_splitter.split_documents(documents)
            vectorstore.add_documents(texts)
            logger.info("Added %d chunks to vectorstore", len(texts))

        update_session(session_id, "embedding_complete", True)
        logger.info("✅ Background embedding complete for session %s", session_id)

    except Exception as e:
        # logger.exception records the traceback, which logger.error drops.
        logger.exception("Error in background embedding for session %s: %s", session_id, e)
162
+
163
 
164
+ def query_with_context(rag_chain, chat_history: list, query: str, pinned_files: List[str], repo_path: str) -> str:
165
+ """
166
+ Query the RAG chain with additional context from pinned files.
167
+
168
+ Args:
169
+ rag_chain: The LangChain retrieval chain
170
+ chat_history: List of previous chat messages
171
+ query: The user's query
172
+ pinned_files: List of file paths the user has pinned for context
173
+ repo_path: Path to the repository root
174
+
175
+ Returns:
176
+ The AI's response as a string
177
+ """
178
  try:
179
+ # Build context from pinned files
180
  context_str = ""
181
  if pinned_files:
182
  context_str += "The user has pinned the following files for primary context. Prioritize information from these files:\n\n"
 
185
  if file_p.is_file():
186
  context_str += f"--- START OF FILE: {file} ---\n"
187
  try:
188
+ # Limit file content to prevent token overflow
189
  context_str += file_p.read_text(encoding="utf-8")[:4000]
190
  except Exception:
191
  context_str += "(Could not read file content)"
192
  context_str += f"\n--- END OF FILE: {file} ---\n\n"
193
 
194
+ # Build final query
195
  final_query = f"{context_str}Based on the context and our history, answer the question: {query}"
196
 
197
+ # Invoke the chain
198
  response = rag_chain.invoke({"input": final_query, "chat_history": chat_history})
199
 
200
  answer = response.get("answer", "I could not find an answer.")
201
 
202
+ # Update chat history
203
  chat_history.extend([HumanMessage(content=query), AIMessage(content=answer)])
204
 
205
  return answer
206
+
207
  except Exception as e:
208
+ logger.error(f"Error during query invocation: {e}")
209
  return f"An error occurred while processing your request: {str(e)}"
main.py CHANGED
@@ -1,245 +1,348 @@
 
 
 
 
 
 
 
1
  import os
2
  import uuid
3
  import shutil
4
  import pathlib
5
  import tempfile
6
- from typing import List, Dict, Any, Literal
 
 
 
7
  from fastapi import FastAPI, BackgroundTasks, HTTPException, Query
8
  from fastapi.responses import FileResponse, JSONResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
- from pydantic import BaseModel, Field
11
  from git import Repo
 
12
  from langchain_core.messages import AIMessage, HumanMessage
13
  from ai_core import create_conversational_chain, query_with_context, embed_entire_repository
 
 
 
 
 
14
 
15
  SESSIONS_BASE_DIR = pathlib.Path(tempfile.gettempdir()) / "repo_sessions"
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  app = FastAPI(
18
- title="Github Companion API",
19
  description="API for high-performance analysis and contextual chat with GitHub repositories.",
20
- version="4.1.0"
 
21
  )
22
 
23
- origins = ["*"]
24
  app.add_middleware(
25
  CORSMiddleware,
26
- allow_origins=origins,
27
  allow_credentials=True,
28
  allow_methods=["*"],
29
  allow_headers=["*"],
30
  )
31
 
32
- analysis_jobs: Dict[str, Dict[str, Any]] = {}
33
 
34
- def is_text_file(file_path):
35
- try:
36
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
37
- f.read(512)
38
- return True
39
- except Exception:
40
- return False
41
 
42
  class RepoRequest(BaseModel):
43
  repo_url: str
44
 
 
45
  class AnalysisResponse(BaseModel):
46
  session_id: str
47
 
 
48
  class StatusResponse(BaseModel):
49
  session_id: str
50
  status: Literal["pending", "cloning", "summarizing", "embedding_background", "completed", "failed"]
51
  message: str | None = None
52
 
 
53
  class FileDetail(BaseModel):
54
  path: str
55
  size_bytes: int
56
 
 
57
  class AnalysisResult(BaseModel):
58
  repo_url: str
59
  directory_structure: List[FileDetail]
60
  initial_summary: str
61
 
 
62
  class FileContentResponse(BaseModel):
63
  path: str
64
  content: str
65
 
 
66
  class ChatRequest(BaseModel):
67
  query: str
68
  pinned_files: List[str] = []
69
 
 
70
  class ChatResponse(BaseModel):
71
  answer: str
72
 
 
73
  class ModifiedFile(BaseModel):
74
  path: str
75
  content: str
76
 
 
77
  class DownloadRequest(BaseModel):
78
  modified_files: List[ModifiedFile]
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def initial_analysis_task(session_id: str, repo_url: str, background_tasks: BackgroundTasks):
 
 
 
 
81
  session_repo_path = SESSIONS_BASE_DIR / session_id
82
-
83
  try:
 
84
  if session_repo_path.exists():
85
  shutil.rmtree(session_repo_path)
86
-
87
  SESSIONS_BASE_DIR.mkdir(exist_ok=True)
88
- analysis_jobs[session_id] = {"status": "cloning"}
89
-
 
90
  Repo.clone_from(repo_url, str(session_repo_path), depth=1)
91
- analysis_jobs[session_id]["repo_path"] = str(session_repo_path)
92
-
93
  repo_name = repo_url.split('/')[-1].replace('.git', '')
94
- analysis_jobs[session_id]["repo_name"] = repo_name
95
-
96
- ignore_patterns = ['.git', '.gitignore', '__pycache__', 'node_modules', 'dist', 'build']
97
-
 
98
  all_file_details = []
99
  key_file_paths_for_summary = []
100
  all_text_file_paths_for_embedding = []
101
- summary_candidate_names = ["readme.md", "package.json", "pyproject.toml", "requirements.txt", "pom.xml", "build.gradle"]
102
-
 
103
  for root, dirs, files in os.walk(str(session_repo_path), topdown=True):
104
  dirs[:] = [d for d in dirs if d not in ignore_patterns]
105
  for name in files:
106
- if name in ignore_patterns:
107
  continue
108
  file_path = os.path.join(root, name)
109
  if not os.path.islink(file_path):
110
- # Fixed relative path calculation using pathlib
111
  try:
112
  relative_path = pathlib.Path(file_path).relative_to(session_repo_path).as_posix()
113
  except ValueError:
114
- # Fallback to os.path.relpath with proper escaping
115
  relative_path = os.path.relpath(file_path, str(session_repo_path)).replace("\\", "/")
116
-
117
  file_size = os.path.getsize(file_path)
118
  all_file_details.append(FileDetail(path=relative_path, size_bytes=file_size))
119
-
120
  if is_text_file(file_path):
121
  all_text_file_paths_for_embedding.append(file_path)
122
  if name.lower() in summary_candidate_names:
123
  key_file_paths_for_summary.append(file_path)
124
-
125
- analysis_jobs[session_id]["status"] = "summarizing"
126
-
127
- # Initialize chat history as a simple list
128
- analysis_jobs[session_id]["chat_history"] = []
129
-
 
130
  rag_chain = create_conversational_chain(key_file_paths_for_summary, session_id)
131
-
132
  if not rag_chain:
133
  raise Exception("Failed to create initial AI chain.")
134
-
135
- analysis_jobs[session_id]["rag_chain"] = rag_chain
136
-
 
 
 
137
  summary_query = "Based on the provided files (like README, package.json, etc.), what is the primary purpose of this software project? Provide a concise, one-paragraph summary."
138
- initial_summary = query_with_context(rag_chain, analysis_jobs[session_id]["chat_history"], summary_query, [], str(session_repo_path))
139
-
 
140
  result = AnalysisResult(
141
  repo_url=repo_url,
142
  directory_structure=sorted(all_file_details, key=lambda x: x.path),
143
  initial_summary=initial_summary
144
  )
145
- analysis_jobs[session_id]["result"] = result
146
- analysis_jobs[session_id]["status"] = "embedding_background"
147
-
 
148
  background_tasks.add_task(embed_entire_repository, session_id, all_text_file_paths_for_embedding)
149
-
150
  except Exception as e:
151
- analysis_jobs[session_id]["status"] = "failed"
152
- analysis_jobs[session_id]["message"] = str(e)
 
 
 
 
 
 
153
 
154
  @app.get("/")
155
  def read_root():
156
- return JSONResponse(content={"message": "Github Companion Backend is Running"})
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  @app.post("/analyze", response_model=AnalysisResponse, status_code=202)
159
  def submit_analysis(request: RepoRequest, background_tasks: BackgroundTasks):
 
160
  session_id = str(uuid.uuid4())
161
- analysis_jobs[session_id] = {"status": "pending"}
162
  background_tasks.add_task(initial_analysis_task, session_id, request.repo_url, background_tasks)
163
  return AnalysisResponse(session_id=session_id)
164
 
 
165
  @app.get("/status/{session_id}", response_model=StatusResponse)
166
  def get_analysis_status(session_id: str):
167
- job = analysis_jobs.get(session_id)
 
168
  if not job:
169
  raise HTTPException(status_code=404, detail="Session ID not found.")
170
-
171
- if job["status"] == "embedding_background" and analysis_jobs[session_id].get("embedding_complete"):
 
 
172
  job["status"] = "completed"
173
-
174
  return StatusResponse(session_id=session_id, status=job["status"], message=job.get("message"))
175
 
 
176
  @app.get("/result/{session_id}", response_model=AnalysisResult)
177
  def get_analysis_result(session_id: str):
178
- job = analysis_jobs.get(session_id)
 
179
  if not job or job.get("status") not in ["embedding_background", "completed"]:
180
  raise HTTPException(status_code=400, detail="Job not found or not ready.")
181
  return job["result"]
182
 
 
183
  @app.get("/file-content/{session_id}", response_model=FileContentResponse)
184
  def get_file_content(session_id: str, file_path: str = Query(..., alias="path")):
185
- job = analysis_jobs.get(session_id)
 
186
  if not job or "repo_path" not in job:
187
  raise HTTPException(status_code=404, detail="Session not found.")
188
-
189
  repo_base_path = pathlib.Path(job["repo_path"]).resolve()
190
  requested_file_path = (repo_base_path / file_path).resolve()
191
-
 
192
  if not requested_file_path.is_relative_to(repo_base_path):
193
  raise HTTPException(status_code=403, detail="Access denied.")
194
-
195
  if not requested_file_path.is_file():
196
  raise HTTPException(status_code=404, detail="File not found.")
197
-
198
  try:
199
  content = requested_file_path.read_text(encoding="utf-8")
200
  return FileContentResponse(path=file_path, content=content)
201
  except Exception as e:
202
  raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}")
203
 
 
204
  @app.post("/chat/{session_id}", response_model=ChatResponse)
205
  def chat_with_repo(session_id: str, request: ChatRequest):
206
- job = analysis_jobs.get(session_id)
 
207
  if not job or "rag_chain" not in job:
208
  raise HTTPException(status_code=404, detail="Chat session not ready.")
209
-
210
  rag_chain = job["rag_chain"]
211
  chat_history = job.get("chat_history", [])
212
  repo_path = job["repo_path"]
213
-
214
  answer = query_with_context(rag_chain, chat_history, request.query, request.pinned_files, repo_path)
215
-
216
  return ChatResponse(answer=answer)
217
 
 
218
  @app.post("/download-zip/{session_id}")
219
  async def download_zip(session_id: str, request: DownloadRequest, background_tasks: BackgroundTasks):
220
- job = analysis_jobs.get(session_id)
 
221
  if not job or "repo_path" not in job:
222
  raise HTTPException(status_code=404, detail="Session not found.")
223
-
224
  repo_base_path = pathlib.Path(job["repo_path"]).resolve()
225
  repo_name = job.get("repo_name", session_id)
226
  temp_zip_dir = pathlib.Path(tempfile.gettempdir()) / "temp_zips"
227
-
 
228
  for modified_file in request.modified_files:
229
  file_to_update = (repo_base_path / modified_file.path).resolve()
230
-
231
  if not file_to_update.is_relative_to(repo_base_path):
232
  continue
233
-
234
  file_to_update.parent.mkdir(parents=True, exist_ok=True)
235
  file_to_update.write_text(modified_file.content, encoding="utf-8")
236
-
 
237
  temp_zip_dir.mkdir(exist_ok=True)
238
  zip_path_base = temp_zip_dir / f"{repo_name}-{session_id}"
239
  zip_path_final = shutil.make_archive(str(zip_path_base), 'zip', str(repo_base_path))
240
-
 
241
  background_tasks.add_task(os.remove, zip_path_final)
242
-
243
  return FileResponse(
244
  path=zip_path_final,
245
  media_type='application/zip',
 
1
+ """
2
+ GitHub Companion API - Main FastAPI Application
3
+
4
+ A high-performance API for analyzing and chatting with GitHub repositories.
5
+ Optimized for Hugging Face Spaces deployment with multi-user support.
6
+ """
7
+
8
  import os
9
  import uuid
10
  import shutil
11
  import pathlib
12
  import tempfile
13
+ import asyncio
14
+ from typing import List, Literal
15
+ from contextlib import asynccontextmanager
16
+
17
  from fastapi import FastAPI, BackgroundTasks, HTTPException, Query
18
  from fastapi.responses import FileResponse, JSONResponse
19
  from fastapi.middleware.cors import CORSMiddleware
20
+ from pydantic import BaseModel
21
  from git import Repo
22
+
23
  from langchain_core.messages import AIMessage, HumanMessage
24
  from ai_core import create_conversational_chain, query_with_context, embed_entire_repository
25
+ from shared import analysis_jobs, get_session, set_session, update_session
26
+
27
+ # ============================================================================
28
+ # Configuration
29
+ # ============================================================================
30
 
31
  SESSIONS_BASE_DIR = pathlib.Path(tempfile.gettempdir()) / "repo_sessions"
32
 
33
+
34
+ # ============================================================================
35
+ # Lifespan Context Manager (Startup/Shutdown)
36
+ # ============================================================================
37
+
38
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown hook wired into the FastAPI app via `lifespan=`.

    Startup (before yield): ensures the session workspace directory exists.
    Shutdown (after yield): no teardown work yet — only logs.
    """
    # Startup: Ensure directories exist
    SESSIONS_BASE_DIR.mkdir(exist_ok=True)
    print(f"✅ GitHub Companion API started. Sessions dir: {SESSIONS_BASE_DIR}")
    yield  # the application serves requests while suspended here
    # Shutdown: Cleanup could be added here if needed
    print("🛑 GitHub Companion API shutting down.")
47
+
48
+
49
# ============================================================================
# FastAPI App Initialization
# ============================================================================

app = FastAPI(
    title="GitHub Companion API",
    description="API for high-performance analysis and contextual chat with GitHub repositories.",
    version="5.0.0",
    lifespan=lifespan
)

# CORS Configuration (allows all origins for Hugging Face Spaces).
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# a known footgun — the CORS spec forbids a wildcard origin when credentials
# are sent, so browsers may reject credentialed requests. Confirm whether the
# frontend actually needs credentials, or pin the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
68
 
 
69
 
70
+ # ============================================================================
71
+ # Pydantic Models
72
+ # ============================================================================
 
 
 
 
73
 
74
class RepoRequest(BaseModel):
    # Clone URL of the GitHub repository to analyze.
    repo_url: str


class AnalysisResponse(BaseModel):
    # Opaque UUID identifying the newly created analysis session.
    session_id: str


class StatusResponse(BaseModel):
    session_id: str
    # Lifecycle: pending -> cloning -> summarizing -> embedding_background
    # -> completed, with "failed" possible at any stage
    # (transitions driven by initial_analysis_task).
    status: Literal["pending", "cloning", "summarizing", "embedding_background", "completed", "failed"]
    # Human-readable error detail; populated when status is "failed".
    message: str | None = None


class FileDetail(BaseModel):
    # Path relative to the repository root (POSIX-style separators).
    path: str
    size_bytes: int


class AnalysisResult(BaseModel):
    repo_url: str
    # Every non-symlink file found in the clone, sorted by path.
    directory_structure: List[FileDetail]
    # One-paragraph LLM-generated summary of the project's purpose.
    initial_summary: str


class FileContentResponse(BaseModel):
    path: str
    content: str


class ChatRequest(BaseModel):
    query: str
    # Repo-relative paths the user pinned; their contents are injected into
    # the prompt as priority context by query_with_context.
    pinned_files: List[str] = []


class ChatResponse(BaseModel):
    answer: str


class ModifiedFile(BaseModel):
    path: str
    content: str


class DownloadRequest(BaseModel):
    # Edited files to write back into the clone before zipping it for download.
    modified_files: List[ModifiedFile]
120
 
121
+
122
+ # ============================================================================
123
+ # Utility Functions
124
+ # ============================================================================
125
+
126
def is_text_file(file_path: str) -> bool:
    """Check whether a file can be decoded as UTF-8 text.

    Reads only the first 512 characters as a cheap probe. Uses strict
    decoding: the previous errors='ignore' suppressed UnicodeDecodeError,
    so every readable file — binaries included — was classified as text
    and queued for embedding.

    Args:
        file_path: Path of the file to probe.

    Returns:
        True if the sampled prefix decodes as UTF-8, False on a decoding
        failure or any OS-level error (missing file, permissions, ...).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            f.read(512)
        return True
    except (UnicodeDecodeError, OSError):
        return False
134
+
135
+
136
def initial_analysis_task(session_id: str, repo_url: str, background_tasks: BackgroundTasks):
    """
    Background task to clone and analyze a repository.

    Pipeline: shallow clone -> index files -> build a RAG chain from key
    manifest files -> generate an initial summary -> schedule full-repo
    embedding. Progress is published to the shared session store via
    update_session so /status can report it; on any failure the session
    is marked "failed" with the error message.

    Defined as a sync function, so FastAPI runs it in a thread pool
    rather than on the event loop.

    Args:
        session_id: Unique session identifier created by /analyze.
        repo_url: Clone URL of the GitHub repository.
        background_tasks: The request's BackgroundTasks, used to chain the
            full-repo embedding after this task.
    """
    session_repo_path = SESSIONS_BASE_DIR / session_id

    try:
        # Cleanup if exists (a retried session id reuses the same directory)
        if session_repo_path.exists():
            shutil.rmtree(session_repo_path)

        SESSIONS_BASE_DIR.mkdir(exist_ok=True)
        update_session(session_id, "status", "cloning")

        # Clone repository (shallow clone for speed)
        Repo.clone_from(repo_url, str(session_repo_path), depth=1)
        update_session(session_id, "repo_path", str(session_repo_path))

        repo_name = repo_url.split('/')[-1].replace('.git', '')
        update_session(session_id, "repo_name", repo_name)

        # Directory/file names excluded from indexing and embedding
        ignore_patterns = {'.git', '.gitignore', '__pycache__', 'node_modules', 'dist', 'build', '.venv', 'venv'}

        all_file_details = []
        key_file_paths_for_summary = []
        all_text_file_paths_for_embedding = []
        # Manifest-style files that best describe a project's purpose;
        # matched against name.lower() below.
        summary_candidate_names = {"readme.md", "package.json", "pyproject.toml", "requirements.txt", "pom.xml", "build.gradle", "cargo.toml"}

        # Walk the repository (pruning ignored dirs in place via dirs[:])
        for root, dirs, files in os.walk(str(session_repo_path), topdown=True):
            dirs[:] = [d for d in dirs if d not in ignore_patterns]
            for name in files:
                if name in ignore_patterns:
                    continue
                file_path = os.path.join(root, name)
                if not os.path.islink(file_path):
                    try:
                        relative_path = pathlib.Path(file_path).relative_to(session_repo_path).as_posix()
                    except ValueError:
                        # Fallback when pathlib cannot relativize the path
                        relative_path = os.path.relpath(file_path, str(session_repo_path)).replace("\\", "/")

                    file_size = os.path.getsize(file_path)
                    all_file_details.append(FileDetail(path=relative_path, size_bytes=file_size))

                    if is_text_file(file_path):
                        all_text_file_paths_for_embedding.append(file_path)
                        if name.lower() in summary_candidate_names:
                            key_file_paths_for_summary.append(file_path)

        update_session(session_id, "status", "summarizing")

        # Initialize chat history
        update_session(session_id, "chat_history", [])

        # Create RAG chain with key files
        rag_chain = create_conversational_chain(key_file_paths_for_summary, session_id)

        if not rag_chain:
            raise Exception("Failed to create initial AI chain.")

        update_session(session_id, "rag_chain", rag_chain)

        # Generate initial summary.
        # NOTE(review): query_with_context mutates chat_history in place; if
        # get_session returns a *copy* rather than the live session dict, the
        # summary exchange is silently dropped from the stored history —
        # confirm get_session's semantics in shared.py.
        job = get_session(session_id)
        chat_history = job.get("chat_history", [])
        summary_query = "Based on the provided files (like README, package.json, etc.), what is the primary purpose of this software project? Provide a concise, one-paragraph summary."
        initial_summary = query_with_context(rag_chain, chat_history, summary_query, [], str(session_repo_path))

        # Store result
        result = AnalysisResult(
            repo_url=repo_url,
            directory_structure=sorted(all_file_details, key=lambda x: x.path),
            initial_summary=initial_summary
        )
        update_session(session_id, "result", result)
        update_session(session_id, "status", "embedding_background")

        # Start background embedding.
        # NOTE(review): this function itself runs from BackgroundTasks;
        # appending another task mid-run relies on Starlette picking up tasks
        # added to the list while it is being executed — verify the embedding
        # task actually runs in this deployment.
        background_tasks.add_task(embed_entire_repository, session_id, all_text_file_paths_for_embedding)

    except Exception as e:
        update_session(session_id, "status", "failed")
        update_session(session_id, "message", str(e))
        print(f"❌ Analysis failed for session {session_id}: {e}")
222
+
223
+
224
+ # ============================================================================
225
+ # API Endpoints
226
+ # ============================================================================
227
 
228
@app.get("/")
def read_root():
    """Root endpoint exposing basic service metadata."""
    info = {
        "message": "GitHub Companion Backend is Running",
        "version": "5.0.0",
        "docs": "/docs",
    }
    return JSONResponse(content=info)
236
+
237
+
238
@app.get("/health")
def health_check():
    """Liveness probe used by Hugging Face Spaces."""
    body = {"status": "healthy"}
    return JSONResponse(content=body)
242
+
243
 
244
@app.post("/analyze", response_model=AnalysisResponse, status_code=202)
def submit_analysis(request: RepoRequest, background_tasks: BackgroundTasks):
    """Queue a repository for background analysis and return its session id."""
    new_session = str(uuid.uuid4())
    # Register the session before scheduling work so /status can find it immediately.
    set_session(new_session, {"status": "pending"})
    background_tasks.add_task(initial_analysis_task, new_session, request.repo_url, background_tasks)
    return AnalysisResponse(session_id=new_session)
251
 
252
+
253
@app.get("/status/{session_id}", response_model=StatusResponse)
def get_analysis_status(session_id: str):
    """Report the current state of an analysis job."""
    job = get_session(session_id)
    if not job:
        raise HTTPException(status_code=404, detail="Session ID not found.")

    # Promote the job to "completed" once background embedding has finished.
    embedding_done = (
        job.get("status") == "embedding_background" and job.get("embedding_complete")
    )
    if embedding_done:
        update_session(session_id, "status", "completed")
        job["status"] = "completed"

    return StatusResponse(session_id=session_id, status=job["status"], message=job.get("message"))
266
 
267
+
268
@app.get("/result/{session_id}", response_model=AnalysisResult)
def get_analysis_result(session_id: str):
    """Return the stored analysis result once the job is ready."""
    ready_states = ("embedding_background", "completed")
    job = get_session(session_id)
    if not job or job.get("status") not in ready_states:
        raise HTTPException(status_code=400, detail="Job not found or not ready.")
    return job["result"]
275
 
276
+
277
@app.get("/file-content/{session_id}", response_model=FileContentResponse)
def get_file_content(session_id: str, file_path: str = Query(..., alias="path")):
    """Return the UTF-8 content of a single file inside the cloned repository.

    Raises:
        HTTPException 404: unknown session, or the file does not exist.
        HTTPException 403: the requested path escapes the repository root.
        HTTPException 500: the file exists but cannot be read as UTF-8 text.
    """
    job = get_session(session_id)
    if not job or "repo_path" not in job:
        raise HTTPException(status_code=404, detail="Session not found.")

    repo_base_path = pathlib.Path(job["repo_path"]).resolve()
    requested_file_path = (repo_base_path / file_path).resolve()

    # Security: resolve() collapses "..", so this containment check blocks
    # path-traversal attempts.
    if not requested_file_path.is_relative_to(repo_base_path):
        raise HTTPException(status_code=403, detail="Access denied.")

    if not requested_file_path.is_file():
        raise HTTPException(status_code=404, detail="File not found.")

    try:
        content = requested_file_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        # Narrowed from a blanket `except Exception` so programming errors are
        # not masked as read failures; chain the cause for debugging.
        raise HTTPException(status_code=500, detail=f"Error reading file: {str(e)}") from e
    return FileContentResponse(path=file_path, content=content)
299
 
300
+
301
@app.post("/chat/{session_id}", response_model=ChatResponse)
def chat_with_repo(session_id: str, request: ChatRequest):
    """Answer a user question about the repository via the RAG chain."""
    job = get_session(session_id)
    if not job or "rag_chain" not in job:
        raise HTTPException(status_code=404, detail="Chat session not ready.")

    answer = query_with_context(
        job["rag_chain"],
        job.get("chat_history", []),
        request.query,
        request.pinned_files,
        job["repo_path"],
    )
    return ChatResponse(answer=answer)
315
 
316
+
317
  @app.post("/download-zip/{session_id}")
318
  async def download_zip(session_id: str, request: DownloadRequest, background_tasks: BackgroundTasks):
319
+ """Download the repository as a ZIP file with any modifications applied."""
320
+ job = get_session(session_id)
321
  if not job or "repo_path" not in job:
322
  raise HTTPException(status_code=404, detail="Session not found.")
323
+
324
  repo_base_path = pathlib.Path(job["repo_path"]).resolve()
325
  repo_name = job.get("repo_name", session_id)
326
  temp_zip_dir = pathlib.Path(tempfile.gettempdir()) / "temp_zips"
327
+
328
+ # Apply modifications
329
  for modified_file in request.modified_files:
330
  file_to_update = (repo_base_path / modified_file.path).resolve()
331
+
332
  if not file_to_update.is_relative_to(repo_base_path):
333
  continue
334
+
335
  file_to_update.parent.mkdir(parents=True, exist_ok=True)
336
  file_to_update.write_text(modified_file.content, encoding="utf-8")
337
+
338
+ # Create ZIP
339
  temp_zip_dir.mkdir(exist_ok=True)
340
  zip_path_base = temp_zip_dir / f"{repo_name}-{session_id}"
341
  zip_path_final = shutil.make_archive(str(zip_path_base), 'zip', str(repo_base_path))
342
+
343
+ # Cleanup ZIP after download
344
  background_tasks.add_task(os.remove, zip_path_final)
345
+
346
  return FileResponse(
347
  path=zip_path_final,
348
  media_type='application/zip',
requirements.txt CHANGED
@@ -1,14 +1,17 @@
1
- fastapi>=0.111.0
2
- uvicorn[standard]>=0.29.0
3
- GitPython>=3.1.43
4
- langchain>=0.2.5
5
- langchain-core>=0.2.9
6
- langchain-community>=0.2.4
7
- langchain-google-genai>=1.0.5
8
- chromadb>=0.5.0
9
- sentence-transformers>=2.7.0
10
- unstructured>=0.14.4
11
- python-magic>=0.4.27
12
- torch
13
- transformers
14
- huggingface-hub>=0.23.0
 
 
 
 
1
+ fastapi==0.115.14
2
+ uvicorn[standard]==0.35.0
3
+ GitPython==3.1.44
4
+
5
+ langchain==0.3.24
6
+ langchain-core==0.3.66
7
+ langchain-community==0.4.1
8
+ langchain-google-genai==2.0.8
9
+
10
+ chromadb==0.6.3
11
+ sentence-transformers==4.1.0
12
+ unstructured==0.16.17
13
+ python-magic==0.4.27
14
+
15
+ torch==2.7.0
16
+ transformers==4.53.0
17
+ huggingface-hub==0.33.1
shared.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared state module for GitHub Companion Backend.
3
+
4
+ This module provides thread-safe shared state for managing analysis sessions
5
+ across multiple concurrent users. It uses threading locks to ensure safe
6
+ access to the shared dictionary in a multi-threaded environment.
7
+ """
8
+
9
+ import threading
10
+ from typing import Dict, Any
11
+
12
+ # Thread-safe lock for accessing analysis_jobs
13
+ _lock = threading.Lock()
14
+
15
+ # Global dictionary to store analysis job states
16
+ # Each session_id maps to a dictionary containing:
17
+ # - status: The current job status
18
+ # - repo_path: Path to cloned repository
19
+ # - rag_chain: The LangChain retrieval chain
20
+ # - vectorstore: ChromaDB vectorstore
21
+ # - chat_history: List of chat messages
22
+ # - result: Analysis result data
23
+ analysis_jobs: Dict[str, Dict[str, Any]] = {}
24
+
25
+
26
+ def get_session(session_id: str) -> Dict[str, Any] | None:
27
+ """Thread-safe getter for a session."""
28
+ with _lock:
29
+ return analysis_jobs.get(session_id)
30
+
31
+
32
+ def set_session(session_id: str, data: Dict[str, Any]) -> None:
33
+ """Thread-safe setter for a session."""
34
+ with _lock:
35
+ analysis_jobs[session_id] = data
36
+
37
+
38
+ def update_session(session_id: str, key: str, value: Any) -> None:
39
+ """Thread-safe update for a specific key in a session."""
40
+ with _lock:
41
+ if session_id in analysis_jobs:
42
+ analysis_jobs[session_id][key] = value
43
+
44
+
45
+ def delete_session(session_id: str) -> bool:
46
+ """Thread-safe deletion of a session. Returns True if deleted."""
47
+ with _lock:
48
+ if session_id in analysis_jobs:
49
+ del analysis_jobs[session_id]
50
+ return True
51
+ return False
52
+
53
+
54
+ def session_exists(session_id: str) -> bool:
55
+ """Check if a session exists."""
56
+ with _lock:
57
+ return session_id in analysis_jobs