Spaces:
Running
Running
File size: 3,895 Bytes
4b445f6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | """
ChromaDB Repo Indexer
======================
Indexes repository source code into ChromaDB for semantic search.
Each repo gets its own ChromaDB collection, keyed by the repo's full name.
How indexing works:
1. Receive file contents from GitHub API
2. Chunk each file into ~60-line blocks
3. Embed each chunk using sentence-transformers
4. Upsert into ChromaDB collection for this repo
ChromaDB is an open-source vector database that:
- Runs embedded in the Python process (no separate server needed)
- Stores vectors + metadata + documents together
- Supports fast approximate nearest neighbor (ANN) search
- Can persist to disk or run entirely in-memory
We use in-memory mode on Render (ephemeral storage) — the index is rebuilt
on each PR review. This is acceptable because indexing the changed files
takes <1 second for typical PRs.
"""
from __future__ import annotations
import chromadb
import structlog
from app.config import settings
from app.context.embedder import chunk_code, embed_texts
logger = structlog.get_logger()
# Process-wide ChromaDB client (in-memory), created lazily on first use
_chroma_client: chromadb.ClientAPI | None = None


def _get_chroma_client() -> chromadb.ClientAPI:
    """Return the shared ChromaDB client, building it on the first call."""
    global _chroma_client
    if _chroma_client is not None:
        return _chroma_client
    # In-memory client, no persistence
    _chroma_client = chromadb.Client()
    return _chroma_client
def _collection_name(repo_full_name: str) -> str:
"""Generate a valid ChromaDB collection name from a repo name."""
# ChromaDB requires alphanumeric + underscores, 3-63 chars
name = repo_full_name.replace("/", "_").replace("-", "_")
return f"repo_{name}"[:63]
async def index_repo_files(
    repo_full_name: str, file_contents: dict[str, str]
) -> str:
    """
    Index repository files into ChromaDB for RAG retrieval.

    This is called during each PR review to ensure the vector store
    has the latest file contents. Chunks are keyed by
    "<filepath>:<start_line>", so before upserting we drop every chunk
    previously stored for the files being re-indexed; otherwise chunks
    from an older, longer version of a file would linger in the shared
    in-memory collection and be returned by later searches.

    Args:
        repo_full_name: "owner/repo" — used to derive the collection name
        file_contents: dict of {filepath: source_code}

    Returns:
        Collection name (for retrieval). It is returned even when nothing
        was indexed (no chunks, or embedding produced no vectors).
    """
    # NOTE(review): this coroutine never awaits, so chunking, embedding and
    # the ChromaDB calls all run inline on the caller's event loop.
    client = _get_chroma_client()
    collection_name = _collection_name(repo_full_name)

    # Get or create a collection for this repo
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"repo": repo_full_name},
    )

    # Chunk all files
    all_chunks = []
    for filepath, content in file_contents.items():
        # Skip very large files (binary, generated code, etc.);
        # the limit is in characters, not bytes.
        if len(content) > 100_000:
            continue
        all_chunks.extend(chunk_code(content, filepath))

    if not all_chunks:
        logger.info("No chunks to index", repo=repo_full_name)
        return collection_name

    # Limit total chunks (Render memory constraint)
    max_chunks = settings.max_repo_files_index
    if len(all_chunks) > max_chunks:
        # Make the truncation visible instead of silently dropping chunks.
        logger.warning(
            "Chunk limit exceeded — truncating index",
            repo=repo_full_name,
            total_chunks=len(all_chunks),
            max_chunks=max_chunks,
        )
        all_chunks = all_chunks[:max_chunks]

    # Embed all chunks
    texts = [chunk["text"] for chunk in all_chunks]
    embeddings = embed_texts(texts)
    if not embeddings:
        logger.warning("Embedding failed — RAG context unavailable")
        return collection_name

    ids = [f"{chunk['filepath']}:{chunk['start_line']}" for chunk in all_chunks]
    metadatas = [
        {
            "filepath": chunk["filepath"],
            "start_line": chunk["start_line"],
            "end_line": chunk["end_line"],
        }
        for chunk in all_chunks
    ]

    # Remove stale chunks for the files we are about to write. Done only
    # after embedding succeeded so a failed embed never empties the index.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    for filepath in dict.fromkeys(chunk["filepath"] for chunk in all_chunks):
        collection.delete(where={"filepath": filepath})

    # Upsert into ChromaDB
    collection.upsert(
        ids=ids,
        embeddings=embeddings,
        documents=texts,
        metadatas=metadatas,
    )

    logger.info(
        "Indexed repo files",
        repo=repo_full_name,
        chunks=len(all_chunks),
        collection=collection_name,
    )
    return collection_name
|