"""Code embedding using sentence-transformers. Embeds code chunks into 384-dim vectors using all-MiniLM-L6-v2. Falls back to a simple TF-IDF-like bag-of-words embedding if sentence-transformers is unavailable (e.g. on first run before download completes). """ from __future__ import annotations import re import warnings from typing import List, Optional import numpy as np from indexing.parser import CodeChunk class CodeEmbedder: """Wraps a sentence-transformer model for code embedding.""" def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"): self.model_name = model_name self._model = None self._fallback_vocab: dict[str, int] = {} self._use_fallback = False def _load_model(self): if self._model is not None: return try: from sentence_transformers import SentenceTransformer self._model = SentenceTransformer(self.model_name) except (ImportError, OSError, Exception) as exc: warnings.warn( f"sentence-transformers not available ({exc}). " f"Using fallback bag-of-words embeddings." ) self._use_fallback = True self._build_fallback_vocab() def _build_fallback_vocab(self): """Build a simple vocabulary for fallback embeddings.""" common_tokens = ( "def class import from return if else for while try except " "with as async await yield lambda self super init str int " "float bool list dict set tuple none true false raise pass " "break continue and or not in is assert global nonlocal " "del print len range open read write get set add append " "pop remove clear copy sort reverse find index split join " "replace strip format startswith endswith encode decode " ) self._fallback_vocab = {t: i for i, t in enumerate(common_tokens.split())} # Extend with common programming terms for i, c in enumerate("abcdefghijklmnopqrstuvwxyz_"): self._fallback_vocab.setdefault(c, len(self._fallback_vocab)) def _fallback_encode(self, texts: List[str]) -> np.ndarray: """Simple bag-of-words fallback embedding (384-dim).""" embeddings = np.zeros((len(texts), 384), dtype=np.float32) for i, text in enumerate(texts): tokens = re.findall(r"\w+", text.lower()) for token in tokens: idx = self._fallback_vocab.get(token, hash(token) % 384) if idx < 384: embeddings[i, idx] += 1.0 # Normalize norm = np.linalg.norm(embeddings[i]) if norm > 0: embeddings[i] /= norm return embeddings def embed(self, texts: List[str]) -> np.ndarray: """Embed a list of strings into vectors (384-dim float32).""" self._load_model() if self._use_fallback: return self._fallback_encode(texts) return self._model.encode(texts, show_progress_bar=False) def embed_chunk(self, chunk: CodeChunk) -> np.ndarray: """Create a rich text representation of a code chunk and embed it.""" text = ( f"{chunk.name}\n" f"{chunk.signature}\n" f"{chunk.docstring}\n" f"{chunk.body_preview}" ) return self.embed([text])[0]