Spaces:
Running
Running
| """ | |
| ChromaDB Repo Indexer | |
| ====================== | |
| Indexes repository source code into ChromaDB for semantic search. | |
| Each repo gets its own ChromaDB collection, keyed by the repo's full name. | |
| How indexing works: | |
| 1. Receive file contents from GitHub API | |
| 2. Chunk each file into ~60-line blocks | |
| 3. Embed each chunk using sentence-transformers | |
| 4. Upsert into ChromaDB collection for this repo | |
| ChromaDB is an open-source vector database that: | |
| - Runs embedded in the Python process (no separate server needed) | |
| - Stores vectors + metadata + documents together | |
| - Supports fast approximate nearest neighbor (ANN) search | |
| - Can persist to disk or run entirely in-memory | |
| We use in-memory mode on Render (ephemeral storage) — the index is rebuilt | |
| on each PR review. This is acceptable because indexing the changed files | |
| takes <1 second for typical PRs. | |
| """ | |
| from __future__ import annotations | |
| import chromadb | |
| import structlog | |
| from app.config import settings | |
| from app.context.embedder import chunk_code, embed_texts | |
| logger = structlog.get_logger() | |
| # Singleton ChromaDB client (in-memory) | |
| _chroma_client: chromadb.ClientAPI | None = None | |
| def _get_chroma_client() -> chromadb.ClientAPI: | |
| """Get or create the ChromaDB client.""" | |
| global _chroma_client | |
| if _chroma_client is None: | |
| _chroma_client = chromadb.Client() # In-memory, no persistence | |
| return _chroma_client | |
| def _collection_name(repo_full_name: str) -> str: | |
| """Generate a valid ChromaDB collection name from a repo name.""" | |
| # ChromaDB requires alphanumeric + underscores, 3-63 chars | |
| name = repo_full_name.replace("/", "_").replace("-", "_") | |
| return f"repo_{name}"[:63] | |
| async def index_repo_files( | |
| repo_full_name: str, file_contents: dict[str, str] | |
| ) -> str: | |
| """ | |
| Index repository files into ChromaDB for RAG retrieval. | |
| This is called during each PR review to ensure the vector store | |
| has the latest file contents. We upsert (insert or update) so | |
| re-indexing the same file just overwrites the old vectors. | |
| Args: | |
| repo_full_name: "owner/repo" — used as collection name | |
| file_contents: dict of {filepath: source_code} | |
| Returns: | |
| Collection name (for retrieval) | |
| """ | |
| client = _get_chroma_client() | |
| collection_name = _collection_name(repo_full_name) | |
| # Get or create a collection for this repo | |
| collection = client.get_or_create_collection( | |
| name=collection_name, | |
| metadata={"repo": repo_full_name}, | |
| ) | |
| # Chunk all files | |
| all_chunks = [] | |
| for filepath, content in file_contents.items(): | |
| # Skip very large files (binary, generated code, etc.) | |
| if len(content) > 100_000: | |
| continue | |
| chunks = chunk_code(content, filepath) | |
| all_chunks.extend(chunks) | |
| if not all_chunks: | |
| logger.info("No chunks to index", repo=repo_full_name) | |
| return collection_name | |
| # Limit total chunks (Render memory constraint) | |
| max_chunks = settings.max_repo_files_index | |
| if len(all_chunks) > max_chunks: | |
| all_chunks = all_chunks[:max_chunks] | |
| # Embed all chunks | |
| texts = [chunk["text"] for chunk in all_chunks] | |
| embeddings = embed_texts(texts) | |
| if not embeddings: | |
| logger.warning("Embedding failed — RAG context unavailable") | |
| return collection_name | |
| # Upsert into ChromaDB | |
| ids = [f"{chunk['filepath']}:{chunk['start_line']}" for chunk in all_chunks] | |
| metadatas = [ | |
| {"filepath": chunk["filepath"], "start_line": chunk["start_line"], "end_line": chunk["end_line"]} | |
| for chunk in all_chunks | |
| ] | |
| collection.upsert( | |
| ids=ids, | |
| embeddings=embeddings, | |
| documents=texts, | |
| metadatas=metadatas, | |
| ) | |
| logger.info( | |
| "Indexed repo files", | |
| repo=repo_full_name, | |
| chunks=len(all_chunks), | |
| collection=collection_name, | |
| ) | |
| return collection_name | |