File size: 3,895 Bytes
4b445f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
ChromaDB Repo Indexer
======================

Indexes repository source code into ChromaDB for semantic search.
Each repo gets its own ChromaDB collection, keyed by the repo's full name.

How indexing works:
1. Receive file contents from GitHub API
2. Chunk each file into ~60-line blocks
3. Embed each chunk using sentence-transformers
4. Upsert into ChromaDB collection for this repo

ChromaDB is an open-source vector database that:
- Runs embedded in the Python process (no separate server needed)
- Stores vectors + metadata + documents together
- Supports fast approximate nearest neighbor (ANN) search
- Can persist to disk or run entirely in-memory

We use in-memory mode on Render (ephemeral storage) — the index lives only
as long as the process and is refreshed (upserted) on each PR review. This
is acceptable because indexing the changed files takes <1 second for
typical PRs.
"""

from __future__ import annotations

import chromadb
import structlog

from app.config import settings
from app.context.embedder import chunk_code, embed_texts

# Module-level structlog logger shared by the functions in this module.
logger = structlog.get_logger()

# Singleton ChromaDB client (in-memory). Stays None until
# _get_chroma_client() creates it on first use.
_chroma_client: chromadb.ClientAPI | None = None


def _get_chroma_client() -> chromadb.ClientAPI:
    """Return the shared ChromaDB client, creating it on the first call."""
    global _chroma_client
    # Fast path: a client was already created by an earlier call.
    if _chroma_client is not None:
        return _chroma_client
    # First use: build the in-memory client (no persistence) and cache it
    # in the module-level singleton for subsequent calls.
    _chroma_client = chromadb.Client()
    return _chroma_client


def _collection_name(repo_full_name: str) -> str:
    """Generate a valid ChromaDB collection name from a repo name."""
    # ChromaDB requires alphanumeric + underscores, 3-63 chars
    name = repo_full_name.replace("/", "_").replace("-", "_")
    return f"repo_{name}"[:63]


async def index_repo_files(
    repo_full_name: str, file_contents: dict[str, str]
) -> str:
    """
    Index repository files into ChromaDB for RAG retrieval.

    This is called during each PR review so the vector store reflects the
    latest file contents. Chunks are upserted under the ID
    "<filepath>:<start_line>". Chunks previously stored for a file that is
    being re-indexed are deleted first, so chunks left over from an older
    version of the file (e.g. one that was longer) do not linger in the
    collection.

    Args:
        repo_full_name: "owner/repo" — used to derive the collection name
        file_contents: dict of {filepath: source_code}

    Returns:
        Collection name (for retrieval). It is returned even when nothing
        was indexed (no chunks produced, or embedding yielded no vectors).
    """
    client = _get_chroma_client()
    collection_name = _collection_name(repo_full_name)

    # Get or create a collection for this repo
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"repo": repo_full_name},
    )

    # Chunk all files
    all_chunks = []
    for filepath, content in file_contents.items():
        # Skip very large files (binary, generated code, etc.).
        # Note: the limit is measured in characters, not bytes.
        if len(content) > 100_000:
            continue
        all_chunks.extend(chunk_code(content, filepath))

    if not all_chunks:
        logger.info("No chunks to index", repo=repo_full_name)
        return collection_name

    # Limit total chunks (Render memory constraint). Log when chunks are
    # dropped so a partially indexed repo is visible in the logs.
    max_chunks = settings.max_repo_files_index
    if len(all_chunks) > max_chunks:
        logger.warning(
            "Chunk limit reached — truncating index",
            repo=repo_full_name,
            total_chunks=len(all_chunks),
            max_chunks=max_chunks,
        )
        all_chunks = all_chunks[:max_chunks]

    # Embed all chunks.
    # NOTE(review): embed_texts is called synchronously inside this
    # coroutine — confirm it is fast enough not to stall the event loop.
    texts = [chunk["text"] for chunk in all_chunks]
    embeddings = embed_texts(texts)

    # Explicit None/length check instead of truthiness, so an array-like
    # return value does not raise on an ambiguous truth test.
    if embeddings is None or len(embeddings) == 0:
        logger.warning("Embedding failed — RAG context unavailable")
        return collection_name

    # Drop chunks left over from earlier versions of the files we are about
    # to re-index. Upsert alone only overwrites IDs that still exist, so a
    # file that shrank would otherwise keep stale chunks. This runs only
    # after embedding succeeded, so a failed embed never empties the index.
    for stale_path in dict.fromkeys(chunk["filepath"] for chunk in all_chunks):
        collection.delete(where={"filepath": stale_path})

    # Upsert into ChromaDB
    ids = [f"{chunk['filepath']}:{chunk['start_line']}" for chunk in all_chunks]
    metadatas = [
        {
            "filepath": chunk["filepath"],
            "start_line": chunk["start_line"],
            "end_line": chunk["end_line"],
        }
        for chunk in all_chunks
    ]

    collection.upsert(
        ids=ids,
        embeddings=embeddings,
        documents=texts,
        metadatas=metadatas,
    )

    logger.info(
        "Indexed repo files",
        repo=repo_full_name,
        chunks=len(all_chunks),
        collection=collection_name,
    )

    return collection_name