kamp0010 commited on
Commit
42abbab
Β·
verified Β·
1 Parent(s): a872f7a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +319 -332
main.py CHANGED
@@ -1,302 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import ast
3
  import re
4
- import pickle
 
5
  import pathlib
6
  import asyncio
 
7
  from concurrent.futures import ThreadPoolExecutor
8
  from contextlib import asynccontextmanager
 
9
  from typing import Annotated
10
 
11
  os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
12
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
- os.environ["HF_HUB_VERBOSITY"] = "error"
 
 
14
 
15
- import torch
16
  import numpy as np
17
- import faiss
18
-
19
- # ── Compatibility patches ──────────────────────────────────────────────────────
20
- # jina-bert-v2 (trust_remote_code) was written against transformers 4.x.
21
- # Transformers 5.x removed / broke three things the model relies on.
22
- # All patches are no-ops when the symbol already exists.
23
- #
24
- # 1. find_pruneable_heads_and_indices β€” removed from pytorch_utils
25
- # 2. PretrainedConfig.is_decoder etc β€” no longer set as instance defaults
26
- # 3. PreTrainedModel.get_head_mask β€” removed from modeling_utils in T5
27
-
28
- # ── patch 1: pytorch_utils ────────────────────────────────────────────────────
29
- import transformers.pytorch_utils as _pt_utils
30
- if not hasattr(_pt_utils, "find_pruneable_heads_and_indices"):
31
- def _find_pruneable_heads_and_indices(
32
- heads, n_heads: int, head_size: int, already_pruned_heads
33
- ):
34
- mask = torch.ones(n_heads, head_size)
35
- heads = set(heads) - already_pruned_heads
36
- for head in heads:
37
- head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
38
- mask[head] = 0
39
- mask = mask.view(-1).contiguous().eq(1)
40
- index = torch.arange(len(mask))[mask].long()
41
- return heads, index
42
- _pt_utils.find_pruneable_heads_and_indices = _find_pruneable_heads_and_indices
43
-
44
- # ── patch 2: PretrainedConfig legacy defaults ─────────────────────────────────
45
- import transformers.configuration_utils as _cfg_utils
46
- _PC = _cfg_utils.PretrainedConfig
47
- if not hasattr(_PC, "_jina_compat_patched"):
48
- _LEGACY_CFG_DEFAULTS = {
49
- "is_decoder": False,
50
- "add_cross_attention": False,
51
- "cross_attention_hidden_size": None,
52
- "use_cache": True,
53
- }
54
- def _pc_getattr(self, key: str):
55
- if key in _LEGACY_CFG_DEFAULTS:
56
- return _LEGACY_CFG_DEFAULTS[key]
57
- raise AttributeError(
58
- f"'{type(self).__name__}' object has no attribute '{key}'"
59
- )
60
- _PC.__getattr__ = _pc_getattr
61
- _PC._jina_compat_patched = True
62
-
63
- # ── patch 3: PreTrainedModel.get_head_mask ────────────────────────────────────
64
- import transformers.modeling_utils as _mod_utils
65
- _PTM = _mod_utils.PreTrainedModel
66
- if not hasattr(_PTM, "get_head_mask"):
67
- def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
68
- if head_mask.dim() == 1:
69
- head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
70
- head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
71
- elif head_mask.dim() == 2:
72
- head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
73
- assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}"
74
- head_mask = head_mask.to(dtype=self.dtype)
75
- return head_mask
76
-
77
- def _get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False):
78
- if head_mask is not None:
79
- head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
80
- if is_attention_chunked:
81
- head_mask = head_mask.unsqueeze(-1)
82
- else:
83
- head_mask = [None] * num_hidden_layers
84
- return head_mask
85
-
86
- if not hasattr(_PTM, "_convert_head_mask_to_5d"):
87
- _PTM._convert_head_mask_to_5d = _convert_head_mask_to_5d
88
- _PTM.get_head_mask = _get_head_mask
89
- # ──────────────────────────────────────────────────────────────────────────────
90
-
91
  from fastapi import FastAPI, HTTPException, UploadFile, File, Form
92
  from fastapi.middleware.cors import CORSMiddleware
93
  from pydantic import BaseModel, Field
94
  from sentence_transformers import SentenceTransformer
95
 
96
 
97
- # ─────────────────────────── Constants ───────────────────────────────────────
98
- DIM = 768 # jina-embeddings-v2-base-code output dimension
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
 
100
  def _resolve_store_dir() -> pathlib.Path:
101
- """
102
- Try /data/indexes (HF Spaces persistent volume).
103
- Fall back to ~/.cache/code-search/indexes if /data is not writable
104
- (local dev, or volume not yet mounted with correct permissions).
105
- """
106
- primary = pathlib.Path("/data/indexes")
107
  try:
108
  primary.mkdir(parents=True, exist_ok=True)
109
  probe = primary / ".write_probe"
110
- probe.touch()
111
- probe.unlink()
112
  return primary
113
  except OSError:
114
- fallback = pathlib.Path.home() / ".cache" / "code-search" / "indexes"
115
  fallback.mkdir(parents=True, exist_ok=True)
116
- print(f"Warning: /data/indexes not writable β€” using fallback: {fallback}")
117
  return fallback
118
 
119
  STORE_DIR = _resolve_store_dir()
120
 
121
- LANGUAGE_MAP = {
122
- ".py": "python",
123
- ".js": "javascript",
124
- ".ts": "typescript",
125
- ".tsx": "typescript",
126
- ".jsx": "javascript",
127
- ".go": "go",
128
- ".rs": "rust",
129
- ".java": "java",
130
- ".cpp": "cpp",
131
- ".c": "c",
132
- ".cs": "csharp",
133
- ".rb": "ruby",
134
- ".php": "php",
135
- ".md": "markdown",
136
- ".txt": "text",
137
- }
138
-
139
 
140
- # ─────────────────────────── Global state ────────────────────────────────────
141
- models: dict = {}
142
- # store[doc_id] = {"chunks": list[str], "index": faiss.Index}
143
- store: dict[str, dict] = {}
144
- _executor = ThreadPoolExecutor(max_workers=2)
145
-
146
-
147
- # ─────────────────────────── Lifespan ────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  @asynccontextmanager
149
  async def lifespan(app: FastAPI):
150
- print("Loading jina-embeddings-v2-base-code…")
 
 
 
 
 
151
  model = SentenceTransformer(
152
- "jinaai/jina-embeddings-v2-base-code", trust_remote_code=True
 
 
 
 
 
 
 
 
 
153
  )
154
  model.max_seq_length = 8192
155
- # Cast to float16 β€” cuts model VRAM/RAM from ~550 MB to ~275 MB.
156
- # SentenceTransformer wraps a nn.Module; half() applies recursively.
157
- model.half()
158
- model.eval()
159
  models["model"] = model
160
- print("Model ready.")
161
-
162
- # Restore persisted indexes from /data
163
- if STORE_DIR.exists():
164
- for faiss_path in STORE_DIR.glob("*.faiss"):
165
- doc_id = faiss_path.stem
166
- meta_path = STORE_DIR / f"{doc_id}.meta.pkl"
167
- if not meta_path.exists():
168
- continue
169
- try:
170
- index = faiss.read_index(str(faiss_path))
171
- with open(meta_path, "rb") as f:
172
- meta = pickle.load(f)
173
- store[doc_id] = {"chunks": meta["chunks"], "index": index}
174
- print(f"Restored index: {doc_id} ({index.ntotal} vectors)")
175
- except Exception as e:
176
- print(f"Warning: could not restore {doc_id}: {e}")
177
-
178
  yield
179
  models.clear()
180
 
181
 
182
- # ─────────────────────────── App ─────────────────────────────────────────────
183
- MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_MB", "50")) * 1024 * 1024 # default 50 MB
184
-
185
  app = FastAPI(
186
  title="Code Search API",
187
- description=(
188
- "Upload source files and search them semantically using "
189
- "jinaai/jina-embeddings-v2-base-code + FAISS ANN search."
190
- ),
191
- version="2.0.0",
192
  lifespan=lifespan,
193
  )
194
-
195
  app.add_middleware(
196
  CORSMiddleware,
197
- allow_origins=["*"],
198
- allow_methods=["*"],
199
- allow_headers=["*"],
200
  )
201
 
202
 
203
- # ─────────────────────────── Embedding helpers ────────────────────────────────
204
- ENCODE_BATCH_SIZE = int(os.getenv("ENCODE_BATCH_SIZE", "8")) # lower = less RAM peak
205
-
206
- def encode(texts: list[str]) -> np.ndarray:
207
  """
208
- Synchronous encode with micro-batching + explicit GC.
209
- - float16 model weights halve static RAM.
210
- - Small batch size (8) keeps activation RAM low during forward pass.
211
- - gc.collect() + torch.cuda.empty_cache() after each batch release
212
- intermediate tensors immediately instead of waiting for GC.
213
  """
214
- import gc
215
- all_embeddings = []
216
  for i in range(0, len(texts), ENCODE_BATCH_SIZE):
217
  batch = texts[i : i + ENCODE_BATCH_SIZE]
218
- with torch.no_grad():
219
- embs = models["model"].encode(
220
- batch,
221
- show_progress_bar=False,
222
- convert_to_numpy=True,
223
- normalize_embeddings=False, # we normalise in FAISS
224
- )
225
- all_embeddings.append(np.array(embs, dtype=np.float32))
226
- # Free activations between batches
227
  gc.collect()
228
- if torch.cuda.is_available():
229
- torch.cuda.empty_cache()
230
- return np.vstack(all_embeddings)
231
 
232
 
233
- async def encode_async(texts: list[str]) -> np.ndarray:
234
- """Non-blocking wrapper β€” frees the event loop during model inference."""
235
  loop = asyncio.get_event_loop()
236
- return await loop.run_in_executor(_executor, encode, texts)
 
 
 
 
 
 
 
 
 
 
 
237
 
 
 
238
 
239
- # ─────────────────────────── FAISS helpers ───────────────────────────────────
240
- def build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
 
 
 
 
 
 
 
 
 
241
  """
242
- Use HNSW for datasets up to ~500k vectors:
243
- - ~2x less RAM than IndexFlatIP (stores graph links, not raw vectors twice)
244
- - O(log n) search vs O(n) flat scan
245
- - M=32 is a good balance of speed/recall; raise to 64 for higher recall
246
- Fall back to IndexFlatIP for tiny datasets where HNSW overhead isn't worth it.
 
 
 
247
  """
248
- import gc
249
- faiss.normalize_L2(embeddings) # in-place β€” no copy
250
- n = len(embeddings)
251
- if n >= 100:
252
- index = faiss.IndexHNSWFlat(DIM, 32, faiss.METRIC_INNER_PRODUCT)
253
- index.hnsw.efConstruction = 200
254
- index.hnsw.efSearch = 64
255
- else:
256
- index = faiss.IndexFlatIP(DIM)
257
- index.add(embeddings)
258
- del embeddings
259
- gc.collect()
260
- return index
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
 
 
262
 
263
- def search_index(query: str, doc_id: str, top_k: int) -> list[dict]:
264
- q = encode([query])
265
- faiss.normalize_L2(q)
266
- scores, indices = store[doc_id]["index"].search(q, top_k)
267
- chunks = store[doc_id]["chunks"]
268
- return [
269
- {
270
- "rank": i + 1,
271
- "score": round(float(scores[0][i]), 4),
272
- "text": chunks[indices[0][i]],
273
- }
274
- for i in range(len(indices[0]))
275
- if indices[0][i] >= 0 # FAISS returns -1 for empty slots
276
- ]
277
 
 
 
278
 
279
- # ─────────────────────────── Persistence helpers ─────────────────────────────
280
- def persist_index(doc_id: str, chunks: list[str], index: faiss.Index) -> None:
281
- STORE_DIR.mkdir(parents=True, exist_ok=True)
282
- faiss.write_index(index, str(STORE_DIR / f"{doc_id}.faiss"))
283
- with open(STORE_DIR / f"{doc_id}.meta.pkl", "wb") as f:
284
- pickle.dump({"chunks": chunks, "doc_id": doc_id}, f)
285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
- def delete_persisted(doc_id: str) -> None:
288
- (STORE_DIR / f"{doc_id}.faiss").unlink(missing_ok=True)
289
- (STORE_DIR / f"{doc_id}.meta.pkl").unlink(missing_ok=True)
 
 
 
 
 
290
 
291
 
292
- # ─────────────────────────── Chunking helpers ────────────────────────────────
293
  def detect_language(filename: str) -> str:
294
- ext = os.path.splitext(filename)[-1].lower()
295
- return LANGUAGE_MAP.get(ext, "text")
296
 
297
 
298
  def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
299
- """Sentence-window chunker for prose / markdown."""
300
  sentences = re.split(r'(?<=[.!?])\s+', text.strip())
301
  sentences = [s.strip() for s in sentences if s.strip()]
302
  chunks, i = [], 0
@@ -307,7 +347,6 @@ def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
307
 
308
 
309
  def chunk_fallback(source: str, max_lines: int = 40, overlap: int = 5) -> list[str]:
310
- """Fixed line-window chunking with overlap β€” last resort."""
311
  lines = source.splitlines()
312
  chunks = []
313
  i = 0
@@ -318,16 +357,13 @@ def chunk_fallback(source: str, max_lines: int = 40, overlap: int = 5) -> list[s
318
 
319
 
320
  def chunk_python(source: str, filepath: str = "") -> list[str]:
321
- """AST-based chunker β€” extracts functions and classes."""
322
  try:
323
  tree = ast.parse(source)
324
  lines = source.splitlines()
325
  chunks = []
326
  for node in ast.walk(tree):
327
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
328
- start = node.lineno - 1
329
- end = node.end_lineno
330
- snippet = "\n".join(lines[start:end])
331
  prefix = f"# {filepath}\n" if filepath else ""
332
  chunks.append(f"{prefix}{snippet}")
333
  return chunks if chunks else chunk_fallback(source)
@@ -336,14 +372,9 @@ def chunk_python(source: str, filepath: str = "") -> list[str]:
336
 
337
 
338
  def chunk_generic(source: str, filepath: str = "") -> list[str]:
339
- """
340
- Regex chunker for JS, TS, Go, Rust, Java, C++, etc.
341
- Splits on function / class declaration boundaries.
342
- """
343
  pattern = re.compile(
344
  r'(?:^|\n)(?='
345
- r'(?:export\s+)?'
346
- r'(?:async\s+)?'
347
  r'(?:function|class|const\s+\w+\s*=\s*(?:async\s+)?(?:\(|function)|'
348
  r'(?:public|private|protected|static|\s)*(?:fn|func|def)\s+\w+)'
349
  r')',
@@ -356,7 +387,6 @@ def chunk_generic(source: str, filepath: str = "") -> list[str]:
356
 
357
 
358
  def chunk_code(source: str, filename: str = "") -> list[str]:
359
- """Master dispatcher β€” routes to the best chunker for the file type."""
360
  lang = detect_language(filename)
361
  if lang == "python":
362
  return chunk_python(source, filepath=filename)
@@ -366,58 +396,36 @@ def chunk_code(source: str, filename: str = "") -> list[str]:
366
  return chunk_generic(source, filepath=filename)
367
 
368
 
369
- # ─────────────────────────── Schemas ─────────────────────────────────────────
370
  class IndexResponse(BaseModel):
371
- doc_id: str
372
- chunks_indexed: int
373
- message: str
374
-
375
 
376
  class SearchRequest(BaseModel):
377
- doc_id: str = Field(..., description="ID returned by /index")
378
- query: str = Field(..., description="Natural language or code query")
379
- top_k: int = Field(5, ge=1, le=20)
380
-
381
 
382
  class SearchResult(BaseModel):
383
- rank: int
384
- score: float
385
- text: str
386
-
387
 
388
  class SearchResponse(BaseModel):
389
- doc_id: str
390
- query: str
391
- results: list[SearchResult]
392
-
393
 
394
  class EmbedRequest(BaseModel):
395
- texts: list[str] = Field(..., description="List of strings to embed")
396
-
397
 
398
  class EmbedResponse(BaseModel):
399
- embeddings: list[list[float]]
400
- dimensions: int
401
-
402
 
403
  class FileEntry(BaseModel):
404
- filename: str
405
- content: str # raw file content as string
406
-
407
 
408
  class BatchIndexRequest(BaseModel):
409
- doc_id: str # one doc_id for the whole project / repo
410
- files: list[FileEntry]
411
- replace: bool = True # if True, replaces existing index for this doc_id
412
-
413
 
414
  class BatchIndexResponse(BaseModel):
415
- doc_id: str
416
- files_indexed: int
417
- chunks_indexed: int
418
 
419
 
420
- # ─────────────────────────── Routes ──────────────────────────────────────────
421
  @app.get("/", tags=["health"])
422
  def root():
423
  return {"status": "ok", "docs": "/docs"}
@@ -425,42 +433,34 @@ def root():
425
 
426
  @app.get("/health", tags=["health"])
427
  def health():
428
- return {"status": "ok", "models_loaded": bool(models)}
 
429
 
430
 
431
  @app.post("/index", response_model=IndexResponse, tags=["search"])
432
  async def index_document(
433
- file: Annotated[UploadFile, File(description="Source file to index")],
434
- doc_id: Annotated[str, Form(description="Unique ID (defaults to filename)")] = "",
435
  ):
436
- """
437
- Upload a source file and embed it with code-aware chunking.
438
- Returns the doc_id to use in /search.
439
- """
440
  if not models:
441
- raise HTTPException(503, "Model not loaded yet β€” please retry in a few seconds.")
442
 
443
- content = await file.read()
444
  if len(content) > MAX_UPLOAD_BYTES:
445
- raise HTTPException(
446
- 413,
447
- f"File too large ({len(content) / 1024 / 1024:.1f} MB). "
448
- f"Max allowed: {MAX_UPLOAD_BYTES // 1024 // 1024} MB. "
449
- "Use /index/batch for large codebases.",
450
- )
451
- source = content.decode("utf-8", errors="replace")
452
- filename = file.filename or "unknown"
453
  resolved_id = doc_id.strip() or os.path.splitext(filename)[0]
454
 
455
  chunks = chunk_code(source, filename=filename)
456
  if not chunks:
457
- raise HTTPException(400, "Document produced no chunks. Check the file contents.")
458
 
459
- embeddings = await encode_async(chunks)
460
- index = build_faiss_index(embeddings.astype("float32"))
461
- store[resolved_id] = {"chunks": chunks, "index": index}
462
- persist_index(resolved_id, chunks, index)
463
- import gc; gc.collect() # free encoding intermediates before responding
464
 
465
  return IndexResponse(
466
  doc_id=resolved_id,
@@ -471,36 +471,23 @@ async def index_document(
471
 
472
  @app.post("/index/batch", response_model=BatchIndexResponse, tags=["search"])
473
  async def index_batch(req: BatchIndexRequest):
474
- """
475
- Index an entire codebase in one HTTP call.
476
- Ideal for IDE integrations β€” send all files, get one searchable doc_id back.
477
- """
478
  if not models:
479
  raise HTTPException(503, "Model not loaded yet.")
480
 
481
- if req.replace and req.doc_id in store:
482
- del store[req.doc_id]
483
- delete_persisted(req.doc_id)
484
-
485
  all_chunks: list[str] = []
486
  for entry in req.files:
487
  all_chunks.extend(chunk_code(entry.content, filename=entry.filename))
488
 
489
  if not all_chunks:
490
  raise HTTPException(400, "No chunks produced from provided files.")
491
-
492
- MAX_CHUNKS = int(os.getenv("MAX_CHUNKS", "10000")) # ~3 GB RAM at 10k chunks; raise carefully
493
  if len(all_chunks) > MAX_CHUNKS:
494
- raise HTTPException(
495
- 413,
496
- f"Too many chunks ({len(all_chunks):,}). Max: {MAX_CHUNKS:,}. "
497
- "Split your project into smaller doc_id groups.",
498
- )
499
 
500
- embeddings = await encode_async(all_chunks)
501
- index = build_faiss_index(embeddings.astype("float32"))
502
- store[req.doc_id] = {"chunks": all_chunks, "index": index}
503
- persist_index(req.doc_id, all_chunks, index)
504
 
505
  return BatchIndexResponse(
506
  doc_id=req.doc_id,
@@ -511,11 +498,13 @@ async def index_batch(req: BatchIndexRequest):
511
 
512
  @app.post("/search", response_model=SearchResponse, tags=["search"])
513
  async def search_document(req: SearchRequest):
514
- """Search a previously indexed document or codebase by doc_id."""
515
- if req.doc_id not in store:
516
  raise HTTPException(404, f"doc_id '{req.doc_id}' not found. Call /index first.")
517
 
518
- results = search_index(req.query, req.doc_id, req.top_k)
 
 
 
519
  return SearchResponse(
520
  doc_id=req.doc_id,
521
  query=req.query,
@@ -525,35 +514,33 @@ async def search_document(req: SearchRequest):
525
 
526
  @app.post("/embed", response_model=EmbedResponse, tags=["embeddings"])
527
  async def embed_texts(req: EmbedRequest):
528
- """Embed arbitrary texts. Returns raw float embeddings."""
529
  if not models:
530
  raise HTTPException(503, "Model not loaded yet.")
531
  if len(req.texts) > 64:
532
  raise HTTPException(400, "Maximum 64 texts per request.")
533
 
534
- embs = await encode_async(req.texts)
535
- return EmbedResponse(
536
- embeddings=embs.tolist(),
537
- dimensions=embs.shape[1],
538
- )
539
 
540
 
541
  @app.get("/documents", tags=["search"])
542
  def list_documents():
543
- """List all currently indexed document IDs."""
544
- return {
545
- "documents": [
546
- {"doc_id": k, "chunks": len(v["chunks"])}
547
- for k, v in store.items()
548
- ]
549
- }
 
 
 
550
 
551
 
552
  @app.delete("/documents/{doc_id}", tags=["search"])
553
  def delete_document(doc_id: str):
554
- """Remove a document from the in-memory index and from disk."""
555
- if doc_id not in store:
556
  raise HTTPException(404, f"doc_id '{doc_id}' not found.")
557
- del store[doc_id]
558
- delete_persisted(doc_id)
559
  return {"deleted": doc_id}
 
1
+ """
2
+ Code Search API β€” v3.0
3
+ ────────────────────────────────────────────────────────────────────────────
4
+ Key architecture changes from v2:
5
+
6
+ β€’ Model : ONNX fp16 via sentence-transformers backend="onnx"
7
+ β†’ ONNX Runtime replaces PyTorch for every forward pass.
8
+ β†’ Pre-built onnx/model_fp16.onnx from the HF repo is used
9
+ directly β€” no export step, no trust_remote_code issues.
10
+ β†’ All three transformers-compatibility patches removed.
11
+
12
+ β€’ Storage : LanceDB (disk-backed, columnar, mmap)
13
+ β†’ Vectors live on disk, not in Python RAM.
14
+ β†’ Chunks stored alongside vectors in the same table β€”
15
+ no separate pickle files.
16
+ β†’ FAISS removed entirely.
17
+
18
+ β€’ Indexing: Streaming pipeline
19
+ β†’ Chunks are produced, encoded in micro-batches, and written
20
+ to LanceDB immediately. The full embeddings array is never
21
+ held in RAM.
22
+
23
+ β€’ Retrieval: On-demand table loading + LRU cache
24
+ β†’ Tables are opened from disk per request.
25
+ β†’ An LRU cache (default: 5 tables, TTL: 10 min) keeps
26
+ recently used handles warm without pinning everything.
27
+
28
+ β€’ RAM budget (approximate, CPU-only HF Space):
29
+ Model weights (fp16 ONNX) ~275 MB
30
+ Encoding peak (batch=8) ~100 MB transient
31
+ LanceDB per query ~10-50 MB transient
32
+ Python overhead ~150 MB
33
+ ─────────────────────────────────────
34
+ Total steady-state ~425 MB (vs ~16 GB before)
35
+ """
36
+
37
  import os
38
  import ast
39
  import re
40
+ import gc
41
+ import time
42
  import pathlib
43
  import asyncio
44
+ from collections import OrderedDict
45
  from concurrent.futures import ThreadPoolExecutor
46
  from contextlib import asynccontextmanager
47
+ from threading import Lock
48
  from typing import Annotated
49
 
50
  os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
51
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
52
+ os.environ["HF_HUB_VERBOSITY"] = "error"
53
+ # Tell ONNX Runtime to use a modest thread count so it doesn't spike RSS
54
+ os.environ.setdefault("OMP_NUM_THREADS", "2")
55
 
 
56
  import numpy as np
57
+ import lancedb
58
+ import pyarrow as pa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  from fastapi import FastAPI, HTTPException, UploadFile, File, Form
60
  from fastapi.middleware.cors import CORSMiddleware
61
  from pydantic import BaseModel, Field
62
  from sentence_transformers import SentenceTransformer
63
 
64
 
65
# ─────────────────────────── Constants ────────────────────────────────────────
DIM = 768  # output dimension of jina-embeddings-v2-base-code
ENCODE_BATCH_SIZE = int(os.getenv("ENCODE_BATCH_SIZE", "8"))  # texts per encode micro-batch
MAX_UPLOAD_BYTES = int(os.getenv("MAX_UPLOAD_MB", "50")) * 1024 * 1024  # single-upload size cap
MAX_CHUNKS = int(os.getenv("MAX_CHUNKS", "10000"))  # hard cap on chunks per batch-index call
LRU_MAXSIZE = int(os.getenv("LRU_TABLE_CACHE", "5"))  # max LanceDB table handles kept warm
LRU_TTL = int(os.getenv("LRU_TTL_SECONDS", "600"))  # 10 min

# Extension → language key used to route files to the right chunker.
LANGUAGE_MAP = {
    ".py": "python", ".js": "javascript", ".ts": "typescript",
    ".tsx": "typescript", ".jsx": "javascript", ".go": "go",
    ".rs": "rust", ".java": "java", ".cpp": "cpp",
    ".c": "c", ".cs": "csharp", ".rb": "ruby",
    ".php": "php", ".md": "markdown", ".txt": "text",
}

# LanceDB schema — one row per chunk (id, raw text, fixed-size fp32 vector)
_SCHEMA = pa.schema([
    pa.field("chunk_id", pa.int32()),
    pa.field("text", pa.large_utf8()),
    pa.field("vector", pa.list_(pa.float32(), DIM)),
])
87
+
88
 
89
# ─────────────────────────── Storage directory ────────────────────────────────
def _resolve_store_dir() -> pathlib.Path:
    """Pick the LanceDB root: /data/lancedb when writable, else a home cache dir.

    A touch/unlink probe is used in addition to mkdir because mkdir with
    exist_ok=True can succeed on a pre-existing directory where creating a
    file would still fail (e.g. a read-only mount).
    """
    preferred = pathlib.Path("/data/lancedb")
    try:
        preferred.mkdir(parents=True, exist_ok=True)
        marker = preferred / ".write_probe"
        marker.touch()
        marker.unlink()
    except OSError:
        fallback = pathlib.Path.home() / ".cache" / "code-search" / "lancedb"
        fallback.mkdir(parents=True, exist_ok=True)
        print(f"Warning: /data/lancedb not writable — using fallback: {fallback}")
        return fallback
    return preferred
102
 
103
  STORE_DIR = _resolve_store_dir()
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
# ─────────────────────────── LRU table-handle cache ───────────────────────────
class _LRUTableCache:
    """
    Bounded, TTL-based cache of open LanceDB table handles.

    At most ``maxsize`` handles are retained; an entry counts as stale once
    ``ttl`` seconds elapse without it being touched via get()/set().
    Opening a LanceDB table is cheap (no vectors loaded into RAM), so this
    cache mainly limits open file-descriptor churn.  Every operation holds
    an internal lock, so instances may be shared across worker threads.
    """

    def __init__(self, maxsize: int = 5, ttl: int = 600):
        self._maxsize = maxsize
        self._ttl = ttl
        self._entries: OrderedDict = OrderedDict()  # key -> (timestamp, handle)
        self._lock = Lock()

    def get(self, key: str):
        """Return the cached handle for *key*, or None if absent or expired."""
        with self._lock:
            record = self._entries.get(key)
            if record is None:
                return None
            stamped_at, handle = record
            if time.monotonic() - stamped_at > self._ttl:
                # Stale — drop it so the caller re-opens a fresh handle.
                del self._entries[key]
                return None
            # Touch: refresh both recency order and the activity timestamp.
            self._entries.move_to_end(key)
            self._entries[key] = (time.monotonic(), handle)
            return handle

    def set(self, key: str, tbl) -> None:
        """Insert or refresh *key*, evicting least-recently-used overflow."""
        with self._lock:
            if key in self._entries:
                self._entries.move_to_end(key)
            self._entries[key] = (time.monotonic(), tbl)
            while len(self._entries) > self._maxsize:
                self._entries.popitem(last=False)

    def evict(self, key: str) -> None:
        """Forget *key* if present; silent no-op otherwise."""
        with self._lock:
            self._entries.pop(key, None)

    def keys(self):
        """List currently non-expired keys (stale entries are not purged)."""
        with self._lock:
            now = time.monotonic()
            return [k for k, (stamped_at, _) in self._entries.items()
                    if now - stamped_at <= self._ttl]
150
+
151
+ _table_cache = _LRUTableCache(maxsize=LRU_MAXSIZE, ttl=LRU_TTL)
152
+
153
+
154
+ # ─────────────────────────── Global state ─────────────────────────────────────
155
+ models: dict = {}
156
+ _executor = ThreadPoolExecutor(max_workers=2)
157
+
158
+
159
# ─────────────────────────── Lifespan ─────────────────────────────────────────
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: load the embedding model once, publish it in the module-level
    # `models` dict; shutdown: drop the reference so it can be collected.
    print("Loading jina-embeddings-v2-base-code (ONNX fp16)…")
    # backend="onnx" tells sentence-transformers to use ONNX Runtime instead
    # of PyTorch for the forward pass. file_name points to the pre-built
    # fp16 ONNX graph that ships with the model on HuggingFace Hub.
    # This completely bypasses the custom trust_remote_code PyTorch modeling
    # code — no compat patches needed, no PyTorch GPU/RAM usage for inference.
    model = SentenceTransformer(
        "jinaai/jina-embeddings-v2-base-code",
        backend="onnx",
        model_kwargs={
            "file_name": "onnx/model_fp16.onnx",
            "provider": "CPUExecutionProvider",
            # Keep ORT's intra-op thread pool small to limit RSS spikes;
            # mirrors the OMP_NUM_THREADS default set at import time.
            "provider_options": [{
                "intra_op_num_threads": int(os.getenv("OMP_NUM_THREADS", "2")),
            }],
        },
        trust_remote_code=True,
    )
    model.max_seq_length = 8192  # jina v2 code model accepts up to 8k tokens
    models["model"] = model
    print(f"Model ready [backend={model.backend}]")
    yield
    models.clear()
185
 
186
 
187
# ─────────────────────────── App ──────────────────────────────────────────────
app = FastAPI(
    title="Code Search API",
    description="Semantic code search — jina-embeddings-v2-base-code ONNX fp16 + LanceDB",
    version="3.0.0",
    lifespan=lifespan,
)
# Fully permissive CORS: the API is intended to be callable from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_methods=["*"], allow_headers=["*"],
)
198
 
199
 
200
# ─────────────────────────── Encoding ─────────────────────────────────────────
def _encode_sync(texts: list[str]) -> np.ndarray:
    """
    Synchronous encode via ONNX Runtime.

    Processes ENCODE_BATCH_SIZE texts at a time, collecting float32 parts and
    garbage-collecting between micro-batches to keep transient RSS low.
    Returns a float32 array of shape (len(texts), DIM).
    Note: no torch.no_grad() needed — ONNX Runtime has no autograd.
    """
    if not texts:
        # np.vstack([]) raises ValueError on an empty list — return an empty
        # (0, DIM) array instead so callers handle "nothing to encode" cleanly.
        return np.empty((0, DIM), dtype=np.float32)
    parts = []
    for start in range(0, len(texts), ENCODE_BATCH_SIZE):
        batch = texts[start : start + ENCODE_BATCH_SIZE]
        embs = models["model"].encode(
            batch,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False,
        )
        parts.append(np.asarray(embs, dtype=np.float32))
        gc.collect()  # release encoder intermediates between micro-batches
    return np.vstack(parts)
 
 
220
 
221
 
222
async def _encode_async(texts: list[str]) -> np.ndarray:
    """Run _encode_sync in the worker pool so the event loop stays free.

    Uses asyncio.get_running_loop(): get_event_loop() is deprecated inside
    coroutines since Python 3.10 and scheduled for removal; inside a running
    coroutine the two are equivalent, so behavior is unchanged.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(_executor, _encode_sync, texts)
225
+
226
+
227
def _normalize(embs: np.ndarray) -> np.ndarray:
    """L2-normalise each row; all-zero rows stay zero (no division by zero)."""
    row_norms = np.linalg.norm(embs, axis=1, keepdims=True)
    safe_norms = np.clip(row_norms, 1e-9, None)
    return embs / safe_norms
230
+
231
+
232
# ─────────────────────────── LanceDB helpers ──────────────────────────────────
def _db() -> lancedb.DBConnection:
    """Open a LanceDB connection rooted at the persistent store directory."""
    store_uri = str(STORE_DIR)
    return lancedb.connect(store_uri)
235
+
236
 
237
def _table_exists(doc_id: str) -> bool:
    """Whether a LanceDB table named *doc_id* already exists on disk."""
    existing = _db().table_names()
    return doc_id in existing
239
 
240
+
241
def _open_table(doc_id: str):
    """Return the table handle for *doc_id*, preferring the warm LRU cache."""
    cached = _table_cache.get(doc_id)
    if cached is not None:
        return cached
    opened = _db().open_table(doc_id)
    _table_cache.set(doc_id, opened)
    return opened
248
+
249
+
250
async def _build_table_streaming(doc_id: str, chunks: list[str]) -> None:
    """
    Streaming index build — the heart of the memory optimisation.

    Instead of: chunk_all → encode_all → build_index (full array in RAM)
    We do:      for each micro-batch → encode → write to LanceDB → free

    Peak RAM = one micro-batch of embeddings (8 × 768 × 4 bytes ≈ 24 KB).
    LanceDB stores vectors as a memory-mapped Lance file on disk; only
    the pages touched during a query are paged into RAM at search time.
    """
    db = _db()
    # Drop stale table if it exists (and any cached handle pointing at it)
    if doc_id in db.table_names():
        db.drop_table(doc_id)
        _table_cache.evict(doc_id)

    tbl = None
    for i in range(0, len(chunks), ENCODE_BATCH_SIZE):
        batch = chunks[i : i + ENCODE_BATCH_SIZE]
        embs = await _encode_async(batch)
        # Pre-normalise so the "dot" metric at search time equals cosine.
        embs = _normalize(embs)

        # One row per chunk: global chunk_id, raw text, fixed-width vector.
        records = [
            {
                "chunk_id": i + j,
                "text": text,
                "vector": vec.tolist(),
            }
            for j, (text, vec) in enumerate(zip(batch, embs))
        ]

        if tbl is None:
            # First micro-batch creates the table with the fixed schema.
            tbl = db.create_table(doc_id, data=records,
                                  schema=_SCHEMA, mode="overwrite")
        else:
            tbl.add(records)

        # Release this batch's embeddings before encoding the next one.
        del embs, records
        gc.collect()

    # Create ANN vector index for tables large enough to benefit
    if tbl is not None and len(chunks) >= 256:
        try:
            tbl.create_index(
                metric="dot",  # vectors are pre-normalised
                vector_column_name="vector",
                # Partition count scales with table size, capped at 256.
                num_partitions=max(1, min(256, len(chunks) // 40)),
                num_sub_vectors=96,  # 768 dims / 96 = 8 dims per PQ sub-vector
            )
        except Exception as e:
            # Best-effort: brute-force search still works without the index.
            print(f"Warning: ANN index creation skipped for '{doc_id}': {e}")

    if tbl is not None:
        _table_cache.set(doc_id, tbl)
305
 
 
 
 
 
 
 
306
 
307
def _search_table(doc_id: str, query: str, top_k: int) -> list[dict]:
    """
    On-demand vector search over the *doc_id* table.

    The table handle comes from the LRU cache when warm, otherwise from
    disk; only the Lance pages holding the nearest vectors are paged into
    RAM.  Returns up to *top_k* dicts with 1-based rank, rounded score,
    and the matching chunk text.
    """
    query_vec = _normalize(_encode_sync([query]))[0]

    table = _open_table(doc_id)
    request = table.search(query_vec.tolist(), vector_column_name="vector")
    hits = request.metric("dot").limit(top_k).to_list()

    ranked = []
    for position, hit in enumerate(hits, start=1):
        # LanceDB reports the metric value as "_distance"; fall back
        # defensively to "score" / 0.0 if the key is absent.
        raw_score = hit.get("_distance", hit.get("score", 0.0))
        ranked.append({
            "rank": position,
            "score": round(float(raw_score), 4),
            "text": hit["text"],
        })
    return ranked
332
 
333
 
334
+ # ─────────────────────────── Chunking ─────────────────────────────────────────
335
def detect_language(filename: str) -> str:
    """Map a filename's extension to a language key, defaulting to "text"."""
    _, extension = os.path.splitext(filename)
    return LANGUAGE_MAP.get(extension.lower(), "text")
 
337
 
338
 
339
  def chunk_text(text: str, chunk_size: int = 3, overlap: int = 1) -> list[str]:
 
340
  sentences = re.split(r'(?<=[.!?])\s+', text.strip())
341
  sentences = [s.strip() for s in sentences if s.strip()]
342
  chunks, i = [], 0
 
347
 
348
 
349
  def chunk_fallback(source: str, max_lines: int = 40, overlap: int = 5) -> list[str]:
 
350
  lines = source.splitlines()
351
  chunks = []
352
  i = 0
 
357
 
358
 
359
  def chunk_python(source: str, filepath: str = "") -> list[str]:
 
360
  try:
361
  tree = ast.parse(source)
362
  lines = source.splitlines()
363
  chunks = []
364
  for node in ast.walk(tree):
365
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
366
+ snippet = "\n".join(lines[node.lineno - 1 : node.end_lineno])
 
 
367
  prefix = f"# {filepath}\n" if filepath else ""
368
  chunks.append(f"{prefix}{snippet}")
369
  return chunks if chunks else chunk_fallback(source)
 
372
 
373
 
374
  def chunk_generic(source: str, filepath: str = "") -> list[str]:
 
 
 
 
375
  pattern = re.compile(
376
  r'(?:^|\n)(?='
377
+ r'(?:export\s+)?(?:async\s+)?'
 
378
  r'(?:function|class|const\s+\w+\s*=\s*(?:async\s+)?(?:\(|function)|'
379
  r'(?:public|private|protected|static|\s)*(?:fn|func|def)\s+\w+)'
380
  r')',
 
387
 
388
 
389
  def chunk_code(source: str, filename: str = "") -> list[str]:
 
390
  lang = detect_language(filename)
391
  if lang == "python":
392
  return chunk_python(source, filepath=filename)
 
396
  return chunk_generic(source, filepath=filename)
397
 
398
 
399
+ # ─────────────────────────── Schemas ──────────────────────────────────────────
400
class IndexResponse(BaseModel):
    """Response payload for a single-file /index call."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    doc_id: str
    chunks_indexed: int
    message: str
 
 
 
402
 
403
class SearchRequest(BaseModel):
    """Parameters for a /search call."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    doc_id: str = Field(...)
    query: str = Field(...)
    top_k: int = Field(5, ge=1, le=20)
 
 
 
405
 
406
class SearchResult(BaseModel):
    """A single ranked hit returned by /search."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    rank: int
    score: float
    text: str
 
 
 
408
 
409
class SearchResponse(BaseModel):
    """Response payload for a /search call."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    doc_id: str
    query: str
    results: list[SearchResult]
 
 
 
411
 
412
class EmbedRequest(BaseModel):
    """Texts to embed via /embed."""

    texts: list[str] = Field(...)
 
414
 
415
class EmbedResponse(BaseModel):
    """Embedding vectors and their dimensionality returned by /embed."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    embeddings: list[list[float]]
    dimensions: int
 
 
417
 
418
class FileEntry(BaseModel):
    """One source file submitted to /index/batch."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    filename: str
    content: str
 
 
420
 
421
class BatchIndexRequest(BaseModel):
    """Parameters for an /index/batch call."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    doc_id: str
    files: list[FileEntry]
    replace: bool = True
 
 
 
423
 
424
class BatchIndexResponse(BaseModel):
    """Response payload for an /index/batch call."""

    # PEP 8: one field per line instead of semicolon-joined statements.
    doc_id: str
    files_indexed: int
    chunks_indexed: int
 
 
426
 
427
 
428
+ # ─────────────────────────── Routes ───────────────────────────────────────────
429
@app.get("/", tags=["health"])
def root():
    """Liveness probe; points callers at the interactive API docs."""
    payload = {"status": "ok", "docs": "/docs"}
    return payload
 
433
 
434
@app.get("/health", tags=["health"])
def health():
    """Readiness probe: reports whether the embedding model is loaded."""
    loaded = bool(models)
    backend = models["model"].backend if models else None
    return {"status": "ok", "models_loaded": loaded, "backend": backend}
438
 
439
 
440
  @app.post("/index", response_model=IndexResponse, tags=["search"])
441
  async def index_document(
442
+ file: Annotated[UploadFile, File(description="Source file to index")],
443
+ doc_id: Annotated[str, Form(description="Unique ID (defaults to filename)")] = "",
444
  ):
 
 
 
 
445
  if not models:
446
+ raise HTTPException(503, "Model not loaded yet.")
447
 
448
+ content = await file.read()
449
  if len(content) > MAX_UPLOAD_BYTES:
450
+ raise HTTPException(413,
451
+ f"File too large ({len(content)/1024/1024:.1f} MB). "
452
+ f"Max: {MAX_UPLOAD_BYTES//1024//1024} MB.")
453
+
454
+ source = content.decode("utf-8", errors="replace")
455
+ filename = file.filename or "unknown"
 
 
456
  resolved_id = doc_id.strip() or os.path.splitext(filename)[0]
457
 
458
  chunks = chunk_code(source, filename=filename)
459
  if not chunks:
460
+ raise HTTPException(400, "Document produced no chunks.")
461
 
462
+ await _build_table_streaming(resolved_id, chunks)
463
+ gc.collect()
 
 
 
464
 
465
  return IndexResponse(
466
  doc_id=resolved_id,
 
471
 
472
  @app.post("/index/batch", response_model=BatchIndexResponse, tags=["search"])
473
  async def index_batch(req: BatchIndexRequest):
 
 
 
 
474
  if not models:
475
  raise HTTPException(503, "Model not loaded yet.")
476
 
477
+ # Collect all chunks first (just strings β€” negligible RAM)
 
 
 
478
  all_chunks: list[str] = []
479
  for entry in req.files:
480
  all_chunks.extend(chunk_code(entry.content, filename=entry.filename))
481
 
482
  if not all_chunks:
483
  raise HTTPException(400, "No chunks produced from provided files.")
 
 
484
  if len(all_chunks) > MAX_CHUNKS:
485
+ raise HTTPException(413,
486
+ f"Too many chunks ({len(all_chunks):,}). Max: {MAX_CHUNKS:,}.")
 
 
 
487
 
488
+ # Streaming build β€” never holds full embeddings array
489
+ await _build_table_streaming(req.doc_id, all_chunks)
490
+ gc.collect()
 
491
 
492
  return BatchIndexResponse(
493
  doc_id=req.doc_id,
 
498
 
499
  @app.post("/search", response_model=SearchResponse, tags=["search"])
500
  async def search_document(req: SearchRequest):
501
+ if not _table_exists(req.doc_id):
 
502
  raise HTTPException(404, f"doc_id '{req.doc_id}' not found. Call /index first.")
503
 
504
+ loop = asyncio.get_event_loop()
505
+ results = await loop.run_in_executor(
506
+ _executor, _search_table, req.doc_id, req.query, req.top_k
507
+ )
508
  return SearchResponse(
509
  doc_id=req.doc_id,
510
  query=req.query,
 
514
 
515
@app.post("/embed", response_model=EmbedResponse, tags=["embeddings"])
async def embed_texts(req: EmbedRequest):
    """
    Embed up to 64 texts and return their raw vectors.

    Raises:
        HTTPException(503): embedding model not loaded yet.
        HTTPException(400): empty input, or more than 64 texts.
    """
    if not models:
        raise HTTPException(503, "Model not loaded yet.")
    if not req.texts:
        # An empty batch would produce an empty array and fail on
        # embs.shape[1] below — reject with a client error, not a 500.
        raise HTTPException(400, "At least one text is required.")
    if len(req.texts) > 64:
        raise HTTPException(400, "Maximum 64 texts per request.")

    embs = await _encode_async(req.texts)
    return EmbedResponse(embeddings=embs.tolist(), dimensions=embs.shape[1])
 
 
 
524
 
525
 
526
@app.get("/documents", tags=["search"])
def list_documents():
    """List every indexed document with its chunk count (-1 if unreadable)."""
    db = _db()

    def _describe(name: str) -> dict:
        # A table can be corrupt or mid-write; report -1 rather than fail.
        try:
            return {"doc_id": name, "chunks": db.open_table(name).count_rows()}
        except Exception:
            return {"doc_id": name, "chunks": -1}

    return {"documents": [_describe(name) for name in db.table_names()]}
538
 
539
 
540
@app.delete("/documents/{doc_id}", tags=["search"])
def delete_document(doc_id: str):
    """Drop a document's table and purge its cached handle."""
    if not _table_exists(doc_id):
        raise HTTPException(404, f"doc_id '{doc_id}' not found.")

    database = _db()
    database.drop_table(doc_id)
    _table_cache.evict(doc_id)
    return {"deleted": doc_id}
  return {"deleted": doc_id}