File size: 3,439 Bytes
6085b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""Code embedding using sentence-transformers.

Embeds code chunks into 384-dim vectors using all-MiniLM-L6-v2.
Falls back to a simple bag-of-words (L2-normalized token-count) embedding if
sentence-transformers is unavailable (e.g. on first run before download completes).
"""

from __future__ import annotations

import re
import warnings
from typing import List, Optional

import numpy as np

from indexing.parser import CodeChunk


class CodeEmbedder:
    """Embed code text with a sentence-transformer, with a bag-of-words fallback.

    The model is loaded lazily on the first ``embed`` call. If loading fails
    for any reason, the instance switches to a hashed bag-of-words embedding
    for the rest of its lifetime, so every vector it produces after that
    point comes from the same scheme.
    """

    # Width of the fallback vectors; chosen to match the 384-dim output of the
    # default all-MiniLM-L6-v2 model.
    _FALLBACK_DIM = 384

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        """Store the model name; nothing is loaded until the first embed."""
        self.model_name = model_name
        self._model = None
        self._fallback_vocab: dict[str, int] = {}
        self._use_fallback = False

    def _load_model(self) -> None:
        """Load the sentence-transformer once, or switch to the fallback.

        A no-op once either the model is loaded or the fallback is active.
        Without the second condition every ``embed`` call would retry the
        import/load after a failure and rebuild the vocabulary each time.
        """
        if self._model is not None or self._use_fallback:
            return
        try:
            from sentence_transformers import SentenceTransformer

            self._model = SentenceTransformer(self.model_name)
        except Exception as exc:
            # Deliberately broad: a missing package, missing weights or any
            # other load error all degrade to the fallback embedding.
            warnings.warn(
                f"sentence-transformers not available ({exc}). "
                f"Using fallback bag-of-words embeddings."
            )
            self._use_fallback = True
            self._build_fallback_vocab()

    def _build_fallback_vocab(self) -> None:
        """Build a simple vocabulary for fallback embeddings.

        Each known token gets its own vector slot; tokens outside the
        vocabulary are hashed into a slot by ``_fallback_encode``.
        """
        common_tokens = (
            "def class import from return if else for while try except "
            "with as async await yield lambda self super init str int "
            "float bool list dict set tuple none true false raise pass "
            "break continue and or not in is assert global nonlocal "
            "del print len range open read write get set add append "
            "pop remove clear copy sort reverse find index split join "
            "replace strip format startswith endswith encode decode "
        )
        # The token list contains a duplicate ("set"). De-duplicate before
        # numbering so indices are contiguous and the single-letter entries
        # appended below cannot collide with an existing slot.
        unique_tokens = dict.fromkeys(common_tokens.split())
        self._fallback_vocab = {t: i for i, t in enumerate(unique_tokens)}
        # Extend with single-character tokens (letters and underscore).
        for c in "abcdefghijklmnopqrstuvwxyz_":
            self._fallback_vocab.setdefault(c, len(self._fallback_vocab))

    def _fallback_encode(self, texts: List[str]) -> np.ndarray:
        """Simple bag-of-words fallback embedding (384-dim).

        Each text is lower-cased and split into ``\\w+`` tokens. Vocabulary
        tokens increment their own slot, all other tokens are hashed into a
        slot, and each non-empty row is L2-normalized.

        Returns:
            A ``(len(texts), 384)`` float32 array; rows for texts without any
            token are all zeros.
        """
        # Local import keeps this block self-contained. The builtin hash() is
        # salted per process for str, so CRC32 is used to keep out-of-vocab
        # buckets identical across runs.
        import zlib

        dim = self._FALLBACK_DIM
        vocab = self._fallback_vocab
        embeddings = np.zeros((len(texts), dim), dtype=np.float32)
        for i, text in enumerate(texts):
            for token in re.findall(r"\w+", text.lower()):
                idx = vocab.get(token)
                if idx is None:
                    idx = zlib.crc32(token.encode("utf-8", "surrogatepass")) % dim
                # Defensive: ignore vocabulary indices beyond the vector width.
                if idx < dim:
                    embeddings[i, idx] += 1.0
            # Normalize (skip all-zero rows to avoid dividing by zero)
            norm = np.linalg.norm(embeddings[i])
            if norm > 0:
                embeddings[i] /= norm
        return embeddings

    def embed(self, texts: List[str]) -> np.ndarray:
        """Embed a list of strings into vectors (384-dim float32).

        Triggers the lazy model load on first use and routes to the fallback
        encoder when the model could not be loaded.
        """
        self._load_model()
        if self._use_fallback:
            return self._fallback_encode(texts)
        return self._model.encode(texts, show_progress_bar=False)

    def embed_chunk(self, chunk: CodeChunk) -> np.ndarray:
        """Create a rich text representation of a code chunk and embed it.

        The chunk's name, signature, docstring and body preview are joined
        with newlines and embedded as a single text; the one resulting vector
        is returned.
        """
        text = (
            f"{chunk.name}\n"
            f"{chunk.signature}\n"
            f"{chunk.docstring}\n"
            f"{chunk.body_preview}"
        )
        return self.embed([text])[0]