# Source: auto-swe-agent-ui/indexing/vector_store.py
# Uploaded by: DevilBits
# Last commit: "fix: enforce safe empty bounds for tracking data charts and match dataframe list alignments"
# Commit hash: 6085b61
"""FAISS-based vector store for code chunk retrieval.
Uses inner-product (cosine similarity on L2-normalised vectors).
Falls back to brute-force numpy search when FAISS is unavailable.
"""
from __future__ import annotations
import pickle
import warnings
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
from indexing.parser import CodeChunk
# Default on-disk location of the FAISS index file (next to this module).
DEFAULT_INDEX_PATH = str(Path(__file__).parent / "code_index.faiss")


class CodeVectorStore:
    """FAISS index + metadata for code chunk similarity search.

    Two files are used for persistence: ``index_path`` holds the FAISS index
    and ``index_path`` with a ``.pkl`` suffix holds the pickled chunk metadata
    (plus the raw embedding matrix when running in numpy fallback mode).
    """

    def __init__(self, index_path: str = DEFAULT_INDEX_PATH):
        """Create an empty store bound to *index_path*; nothing is read from disk."""
        self.index_path = Path(index_path)
        self.metadata_path = self.index_path.with_suffix(".pkl")
        # Initial guess only; build() and load() overwrite it with the real width.
        self.dimension = 384  # all-MiniLM-L6-v2 output dim
        self._index = None
        self.metadata: List[CodeChunk] = []
        self._use_fallback = False
        # Embedding matrix used by the brute-force path; None while FAISS is in use.
        self._fallback_embeddings: Optional[np.ndarray] = None

    def build(self, chunks: List[CodeChunk], embeddings: np.ndarray) -> None:
        """Build the index from *chunks* and their *embeddings*, then save it.

        Args:
            chunks: Chunk metadata; ``chunks[i]`` describes row ``i``.
            embeddings: 2-D array with one row per chunk.

        Raises:
            ValueError: If *embeddings* is not 2-D or its row count differs
                from ``len(chunks)``.
        """
        # np.array always copies, so in-place normalisation below never
        # touches the caller's array; order="C" gives FAISS a contiguous buffer.
        vectors = np.array(embeddings, dtype=np.float32, order="C")
        if vectors.size == 0:
            vectors = vectors.reshape(0, self.dimension)
        if vectors.ndim != 2:
            raise ValueError(
                f"embeddings must be 2-D (n_chunks, dim), got shape {vectors.shape}"
            )
        if vectors.shape[0] != len(chunks):
            raise ValueError(
                f"got {len(chunks)} chunks but {vectors.shape[0]} embedding rows"
            )

        # Validation passed: only now mutate the store's state.
        self.metadata = list(chunks)
        if vectors.shape[0]:
            self.dimension = int(vectors.shape[1])

        try:
            import faiss
        except ImportError:
            warnings.warn("faiss not available — using brute-force numpy search")
            self._index = None
            self._use_fallback = True
            self._fallback_embeddings = vectors
        else:
            index = faiss.IndexFlatIP(self.dimension)
            if vectors.shape[0]:
                faiss.normalize_L2(vectors)
                index.add(vectors)
            self._index = index
            self._use_fallback = False
            # Drop any matrix left over from an earlier fallback build/load.
            self._fallback_embeddings = None
        self.save()

    def search(
        self, query_embedding: np.ndarray, k: int = 5
    ) -> List[Tuple[CodeChunk, float]]:
        """Return up to *k* ``(chunk, cosine_similarity)`` matches, best first.

        If nothing is held in memory yet, the store first tries to load
        itself from disk. Returns ``[]`` when *k* is not positive or the
        store is empty.
        """
        if k <= 0:
            return []
        # Lazy load must happen before the emptiness check, otherwise a fresh
        # instance would never pick up a persisted index.
        if self._index is None and not self._use_fallback:
            self.load()
        if not self.metadata:
            return []

        query = np.array(query_embedding, dtype=np.float32, order="C").reshape(1, -1)
        if self._use_fallback or self._index is None:
            return self._fallback_search(query, k)

        import faiss

        faiss.normalize_L2(query)
        distances, indices = self._index.search(query, k)
        # The bounds check skips ids that do not map to a metadata entry
        # (including negative ids).
        return [
            (self.metadata[idx], float(score))
            for idx, score in zip(indices[0], distances[0])
            if 0 <= idx < len(self.metadata)
        ]

    def _fallback_search(
        self, query: np.ndarray, k: int
    ) -> List[Tuple[CodeChunk, float]]:
        """Brute-force cosine similarity when FAISS is unavailable."""
        embeddings = getattr(self, "_fallback_embeddings", None)
        if k <= 0 or embeddings is None or len(embeddings) == 0:
            return []

        # The small epsilon keeps all-zero vectors from dividing by zero.
        query_unit = query / (np.linalg.norm(query) + 1e-12)
        row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        scores = ((embeddings / (row_norms + 1e-12)) @ query_unit.T).flatten()

        # Stable sort makes the order of tied scores deterministic.
        ranked = np.argsort(-scores, kind="stable")[: min(k, len(scores))]
        return [
            (self.metadata[idx], float(scores[idx]))
            for idx in ranked
            if idx < len(self.metadata)  # mirror the FAISS path's bounds check
        ]

    def save(self) -> None:
        """Persist index and metadata to disk.

        A failure to write the FAISS index is reported as a warning rather
        than raised; the metadata pickle is always written.
        """
        self.index_path.parent.mkdir(parents=True, exist_ok=True)

        if self._use_fallback:
            # An index file from an earlier FAISS build would otherwise be
            # preferred by load() over the embeddings saved below.
            try:
                self.index_path.unlink()
            except FileNotFoundError:
                pass
        elif self._index is not None:
            try:
                import faiss

                faiss.write_index(self._index, str(self.index_path))
            except Exception as exc:  # best effort: metadata is still saved
                warnings.warn(
                    f"could not write FAISS index to {self.index_path}: {exc}"
                )

        # Always save metadata and fallback embeddings
        payload = {
            "metadata": self.metadata,
            "fallback_embeddings": getattr(self, "_fallback_embeddings", None),
        }
        self.metadata_path.write_bytes(pickle.dumps(payload))

    def load(self) -> bool:
        """Load index and metadata from disk. Returns True on success.

        Returns False when neither file exists or the metadata file cannot
        be read. If the FAISS index cannot be read, the result is True only
        when some metadata was loaded.
        """
        has_index = self.index_path.exists()
        has_metadata = self.metadata_path.exists()
        if not has_index and not has_metadata:
            return False

        # Load metadata
        if has_metadata:
            try:
                # NOTE(security): unpickling runs arbitrary code from the file;
                # only load metadata files that come from a trusted source.
                payload = pickle.loads(self.metadata_path.read_bytes())
                metadata = payload.get("metadata", [])
                fallback = payload.get("fallback_embeddings")
            except Exception as exc:
                warnings.warn(
                    f"could not read metadata from {self.metadata_path}: {exc}"
                )
                return False
            self.metadata = metadata
            if fallback is not None:
                self._fallback_embeddings = fallback
                self._use_fallback = True
                if getattr(fallback, "ndim", 0) == 2 and fallback.shape[1]:
                    self.dimension = int(fallback.shape[1])

        # Load FAISS index
        if has_index:
            try:
                import faiss

                index = faiss.read_index(str(self.index_path))
            except Exception as exc:
                warnings.warn(
                    f"could not read FAISS index from {self.index_path}: {exc}"
                )
            else:
                self._index = index
                self.dimension = int(getattr(index, "d", self.dimension))
                self._use_fallback = False
                return True
        return bool(self.metadata)

    def index_exists(self) -> bool:
        """Return True when both the FAISS index file and the metadata file exist."""
        return self.index_path.exists() and self.metadata_path.exists()