Spaces:

NinjainPJs
/

ninja-code-guard

Running

App Files Files Community

ninja-code-guard / app /context /indexer.py

NinjainPJs

initial - commit

4b445f6 3 months ago

raw

history blame contribute delete

3.9 kB

	"""
	ChromaDB Repo Indexer
	======================

	Indexes repository source code into ChromaDB for semantic search.
	Each repo gets its own ChromaDB collection, keyed by the repo's full name.

	How indexing works:
	1. Receive file contents from GitHub API
	2. Chunk each file into ~60-line blocks
	3. Embed each chunk using sentence-transformers
	4. Upsert into ChromaDB collection for this repo

	ChromaDB is an open-source vector database that:
	- Runs embedded in the Python process (no separate server needed)
	- Stores vectors + metadata + documents together
	- Supports fast approximate nearest neighbor (ANN) search
	- Can persist to disk or run entirely in-memory

	We use in-memory mode on Render (ephemeral storage) — the index is rebuilt
	on each PR review. This is acceptable because indexing the changed files
	takes <1 second for typical PRs.
	"""

	from __future__ import annotations

	import hashlib
	import re

	import chromadb
	import structlog

	from app.config import settings
	from app.context.embedder import chunk_code, embed_texts

	# Module-level structlog logger used by the indexing functions in this file.
	logger = structlog.get_logger()

	# Singleton ChromaDB client (in-memory). It stays None until
	# _get_chroma_client() creates it on first use.
	_chroma_client: chromadb.ClientAPI | None = None


	def _get_chroma_client() -> chromadb.ClientAPI:
	    """Return the shared ChromaDB client, creating it on first use.

	    The client is stored in the module-level ``_chroma_client`` so every
	    caller in this process works against the same in-memory store.
	    """
	    global _chroma_client
	    if _chroma_client is None:
	        # In-memory, no persistence
	        _chroma_client = chromadb.Client()
	    return _chroma_client


	def _collection_name(repo_full_name: str) -> str:
	    """Generate a valid ChromaDB collection name from a repo name.

	    ChromaDB's documented naming rules: 3-63 characters, starting and ending
	    with an alphanumeric character, otherwise only alphanumerics,
	    underscores, hyphens or periods, and no two consecutive periods.

	    Args:
	        repo_full_name: Repository identifier, normally "owner/repo".

	    Returns:
	        ``"repo_" + sanitized name`` when that is already a valid collection
	        name (same result as before for ordinary repo names). Otherwise a
	        truncated prefix followed by ``_`` and an 8-hex-digit digest of the
	        full repo name, so the result is always valid and two different long
	        repo names do not end up sharing one collection.
	    """
	    max_len = 63
	    # Anything outside [A-Za-z0-9._] (including "/" and "-") becomes "_".
	    sanitized = re.sub(r"[^A-Za-z0-9._]", "_", repo_full_name)
	    name = f"repo_{sanitized}"

	    # The "repo_" prefix guarantees a valid first character and a length of
	    # at least 5, so only the upper bound, the last character and ".." can
	    # still make the name invalid.
	    if len(name) <= max_len and name[-1].isalnum() and ".." not in name:
	        return name

	    # Fallback: blind truncation could end on "_" or collide with another
	    # repo sharing the same prefix, so append a short stable digest instead.
	    digest = hashlib.sha256(repo_full_name.encode("utf-8")).hexdigest()[:8]
	    prefix = name[: max_len - len(digest) - 1].replace(".", "_")
	    return f"{prefix}_{digest}"


	async def index_repo_files(
	    repo_full_name: str, file_contents: dict[str, str]
	) -> str:
	    """
	    Index repository files into ChromaDB for RAG retrieval.

	    This is called during each PR review to ensure the vector store
	    has the latest file contents. Chunks already stored for a file that is
	    being re-indexed are deleted before the fresh ones are upserted, so a
	    file that became shorter does not leave stale trailing chunks behind.

	    Args:
	        repo_full_name: "owner/repo" — used to derive the collection name
	        file_contents: dict of {filepath: source_code}

	    Returns:
	        Collection name (for retrieval). It is returned even when nothing
	        was written (no chunks produced, or embedding returned nothing).
	    """
	    client = _get_chroma_client()
	    collection_name = _collection_name(repo_full_name)

	    # Get or create a collection for this repo
	    collection = client.get_or_create_collection(
	        name=collection_name,
	        metadata={"repo": repo_full_name},
	    )

	    # Chunk all files
	    all_chunks = []
	    for filepath, content in file_contents.items():
	        # Skip very large files (binary, generated code, etc.)
	        if len(content) > 100_000:
	            continue
	        all_chunks.extend(chunk_code(content, filepath))

	    if not all_chunks:
	        logger.info("No chunks to index", repo=repo_full_name)
	        return collection_name

	    # Limit total chunks (Render memory constraint)
	    max_chunks = settings.max_repo_files_index
	    if len(all_chunks) > max_chunks:
	        all_chunks = all_chunks[:max_chunks]

	    # Embed all chunks
	    # NOTE(review): embed_texts is called synchronously inside this
	    # coroutine; if it is CPU-heavy it will hold the event loop — confirm.
	    texts = [chunk["text"] for chunk in all_chunks]
	    embeddings = embed_texts(texts)

	    if not embeddings:
	        logger.warning("Embedding failed — RAG context unavailable")
	        return collection_name

	    # IDs are "<filepath>:<start_line>", so upsert only overwrites chunks
	    # that start on the same line. Remove everything previously stored for
	    # these files first; this happens only after embedding succeeded so a
	    # failed run never empties the existing index.
	    for indexed_path in {chunk["filepath"] for chunk in all_chunks}:
	        collection.delete(where={"filepath": indexed_path})

	    # Upsert into ChromaDB
	    ids = [f"{chunk['filepath']}:{chunk['start_line']}" for chunk in all_chunks]
	    metadatas = [
	        {
	            "filepath": chunk["filepath"],
	            "start_line": chunk["start_line"],
	            "end_line": chunk["end_line"],
	        }
	        for chunk in all_chunks
	    ]

	    collection.upsert(
	        ids=ids,
	        embeddings=embeddings,
	        documents=texts,
	        metadatas=metadatas,
	    )

	    logger.info(
	        "Indexed repo files",
	        repo=repo_full_name,
	        chunks=len(all_chunks),
	        collection=collection_name,
	    )

	    return collection_name