Initial upload: Vortex-Embed v2 (R@1 0.745, +137% over v1)
Browse files- README.md +177 -0
- __pycache__/lf4_v2.cpython-312.pyc +0 -0
- config.json +20 -0
- lf4_model.py +174 -0
- lf4_v2.py +663 -0
- model.safetensors +3 -0
- tokenizer.json +0 -0
README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- sentence-similarity
|
| 5 |
+
- feature-extraction
|
| 6 |
+
- static-embeddings
|
| 7 |
+
- lf4-quantization
|
| 8 |
+
- retrieval
|
| 9 |
+
- code-search
|
| 10 |
+
model_name: Vortex-Embed v2
|
| 11 |
+
datasets:
|
| 12 |
+
- VTXAI/Vortex-Embed
|
| 13 |
+
metrics:
|
| 14 |
+
- recall@1
|
| 15 |
+
- recall@5
|
| 16 |
+
- recall@10
|
| 17 |
+
- mrr
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# Vortex-Embed v2
|
| 21 |
+
|
| 22 |
+
**Retrieval-optimized 4-bit static embeddings for code search.**
|
| 23 |
+
|
| 24 |
+
Built on [VTXAI/Vortex-Embed-4.7M](https://huggingface.co/VTXAI/Vortex-Embed-4.7M)
|
| 25 |
+
(29528 vocab × 256 dim, 4-bit LF4 packed = **4.7 MB** on disk) with a
|
| 26 |
+
set of training-free retrieval upgrades that lift R@1 from 0.314 → **0.745**
|
| 27 |
+
on the Webscout codebase benchmark (51 hand-verified code queries,
|
| 28 |
+
5,168 chunks across 349 files).
|
| 29 |
+
|
| 30 |
+
## What changed vs the v1 model
|
| 31 |
+
|
| 32 |
+
All four upgrades are inference-time only — the underlying 4-bit weights are
|
| 33 |
+
bit-identical to the v1 artifact. They are:
|
| 34 |
+
|
| 35 |
+
1. **SIF IDF weighting.** Each token's contribution is scaled by
|
| 36 |
+
`a / (a + p(t))` where `p(t)` is its corpus frequency. Common tokens
|
| 37 |
+
("import", "def", "class") are down-weighted; rare tokens keep close to full weight (the factor never exceeds 1, so they gain only relative to common tokens).
|
| 38 |
+
2. **Top-8 principal component removal.** The eight dominant common-topic
|
| 39 |
+
directions of the corpus are fitted once via SVD and projected out of
|
| 40 |
+
every chunk/query vector (Arora et al. 2017).
|
| 41 |
+
3. **File-path header injection.** Before encoding each chunk, its file
|
| 42 |
+
path tokens (e.g. `model_fetcher`, `search`, `engines`) are prepended
|
| 43 |
+
×15. The file name effectively becomes a "tag" the chunk retrieves on.
|
| 44 |
+
4. **Search-time file-extension score bias.** Within the top-50 dense
|
| 45 |
+
candidates, `.py` chunks get `+0.05` and `.md` chunks get `-0.02`. This
|
| 46 |
+
fixes the common failure where README.md and docs/*.md outrank the
|
| 47 |
+
actual code (higher topic overlap but lower action relevance).
|
| 48 |
+
|
| 49 |
+
## Benchmark
|
| 50 |
+
|
| 51 |
+
Corpus: 5,168 chunks × 256-dim across 349 files in the Webscout codebase.
|
| 52 |
+
Queries: 51 hand-verified natural-language → file-path pairs.
|
| 53 |
+
|
| 54 |
+
| Model | R@1 | R@5 | R@10 | MRR | enc@1 | enc@64 | search@64 |
|
| 55 |
+
|---|---|---|---|---|---|---|---|
|
| 56 |
+
| Vortex-Embed v1 (baseline) | 0.314 | 0.667 | 0.863 | 0.478 | 6.2 ms | 227 ms | 4.2 ms |
|
| 57 |
+
| **Vortex-Embed v2 (this)** | **0.745** | **0.843** | **0.882** | **0.779** | 6.4 ms | 107 ms | 9.1 ms |
|
| 58 |
+
|
| 59 |
+
**+137% R@1, +63% MRR.** Encode of 64 chunks is **2.1× faster** thanks
|
| 60 |
+
to the same `torch.scatter_add_` (ATen) and sorted `reduceat` kernels
|
| 61 |
+
used in v1.
|
| 62 |
+
|
| 63 |
+
## Usage
|
| 64 |
+
|
| 65 |
+
```python
|
| 66 |
+
from huggingface_hub import snapshot_download
|
| 67 |
+
from lf4_v2 import VortexEmbedV2
|
| 68 |
+
|
| 69 |
+
# Download model + tokenizer + config
|
| 70 |
+
path = snapshot_download("VTXAI/Vortex-Embed-v2")
|
| 71 |
+
|
| 72 |
+
# Load
|
| 73 |
+
model = VortexEmbedV2.from_pretrained(path)
|
| 74 |
+
print(f"vocab={model.vocab_size}, dim={model.dim}, size={model.model_size_mb:.1f} MB")
|
| 75 |
+
|
| 76 |
+
# Single-query encode
|
| 77 |
+
vec = model.encode("find python json parser", normalize=True)
|
| 78 |
+
# vec.shape == (256,)
|
| 79 |
+
|
| 80 |
+
# Batch encode
|
| 81 |
+
docs = [
|
| 82 |
+
"def parse_json(s): return json.loads(s)",
|
| 83 |
+
"class WeatherAPI: pass",
|
| 84 |
+
"import requests",
|
| 85 |
+
]
|
| 86 |
+
doc_embs = model.encode(docs, normalize=True) # (3, 256)
|
| 87 |
+
|
| 88 |
+
# Search
|
| 89 |
+
import numpy as np
|
| 90 |
+
scores, indices = model.search(vec, doc_embs, top_k=3)
|
| 91 |
+
# scores.shape == (1, 3), indices.shape == (1, 3)
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
### Codebase retrieval (the real use case)
|
| 95 |
+
|
| 96 |
+
```python
|
| 97 |
+
from pathlib import Path
|
| 98 |
+
from lf4_v2 import VortexEmbedV2
|
| 99 |
+
|
| 100 |
+
# 1. Chunk a codebase (line-based, 40 lines/chunk, 5 line overlap)
CHUNK_LINES, OVERLAP = 40, 5
chunks, texts = [], []
for path in Path("./src").rglob("*.py"):
    # Read each file once; undecodable bytes are replaced instead of raising.
    lines = path.read_text(encoding="utf-8", errors="replace").splitlines()
    # Advance by (chunk size - overlap) so neighbouring chunks share 5 lines.
    for chunk_start in range(0, len(lines), CHUNK_LINES - OVERLAP):
        chunk = "\n".join(lines[chunk_start:chunk_start + CHUNK_LINES])
        chunks.append((str(path), chunk_start, chunk))
        texts.append(chunk)
        if chunk_start + CHUNK_LINES >= len(lines):
            break  # this chunk already reaches the end of the file
|
| 108 |
+
|
| 109 |
+
# 2. Load + bind paths (this enables file-path header injection)
|
| 110 |
+
model = VortexEmbedV2.from_pretrained("VTXAI/Vortex-Embed-v2")
|
| 111 |
+
model.set_file_paths([c[0] for c in chunks]) # critical for v2 quality
|
| 112 |
+
|
| 113 |
+
# 3. Fit IDF on the corpus (one-time, ~200 ms)
|
| 114 |
+
token_lists = [model.tokenizer.encode(t).ids for t in texts]
|
| 115 |
+
model.fit_idf(token_lists)
|
| 116 |
+
|
| 117 |
+
# 4. Encode corpus
|
| 118 |
+
import_emb = model.encode_batch(texts, normalize=True) # (n, 256)
|
| 119 |
+
|
| 120 |
+
# 5. Fit top-K PC on the corpus (one-time, ~300 ms)
|
| 121 |
+
model.fit_pc(import_emb, k=8)
|
| 122 |
+
|
| 123 |
+
# 6. Re-encode with PC removal applied
|
| 124 |
+
import_emb = model.encode_batch(texts, normalize=True)
|
| 125 |
+
|
| 126 |
+
# 7. Query
|
| 127 |
+
query = "where do we parse JSON requests"
|
| 128 |
+
q_emb = model.encode(query, normalize=True)
|
| 129 |
+
scores, indices = model.search(q_emb, import_emb, top_k=10)
|
| 130 |
+
for rank, (s, i) in enumerate(zip(scores[0], indices[0]), 1):
|
| 131 |
+
file, line, text = chunks[i]
|
| 132 |
+
print(f"#{rank} ({s:.3f}) {file}:{line}")
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
## Configuration knobs
|
| 136 |
+
|
| 137 |
+
All retrieval hyperparameters live in `config.json` and can be overridden
|
| 138 |
+
at load time:
|
| 139 |
+
|
| 140 |
+
```python
|
| 141 |
+
model = VortexEmbedV2.from_pretrained(
|
| 142 |
+
"VTXAI/Vortex-Embed-v2",
|
| 143 |
+
sif_a=1e-3, # SIF smoothing (lower = sharper)
|
| 144 |
+
pc_k=0, # disable PC removal
|
| 145 |
+
header_repeat=10, # reduce path-header weight
|
| 146 |
+
py_bonus=0.0, # disable extension bias
|
| 147 |
+
)
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
| Knob | Default | Effect |
|
| 151 |
+
|---|---|---|
|
| 152 |
+
| `sif_a` | 1e-4 | SIF smoothing. Lower = sharper IDF weighting |
|
| 153 |
+
| `pc_k` | 8 | Number of principal components to remove |
|
| 154 |
+
| `sif_pc` | 1.0 | PC removal strength (0 = disabled) |
|
| 155 |
+
| `header_repeat` | 15 | How many times to repeat path-header tokens |
|
| 156 |
+
| `py_bonus` | 0.05 | Score boost for `.py` chunks in top-50 |
|
| 157 |
+
| `md_penalty` | -0.02 | Score penalty for `.md` chunks in top-50 |
|
| 158 |
+
| `bias_top_k` | 50 | Candidate pool size for the bias |
|
| 159 |
+
|
| 160 |
+
## Files
|
| 161 |
+
|
| 162 |
+
- `model.safetensors` — 4-bit LF4 packed weights (3.7 MB)
|
| 163 |
+
- tensors stored inside `model.safetensors`: `embedding_packed` (uint8), `embedding_scales` (FP16), `embedding_zeros` (FP16) — the latter two are the per-block quantization params
|
| 164 |
+
- `config.json` — model + retrieval config
|
| 165 |
+
- `tokenizer.json` — HuggingFace fast tokenizer (29 KB)
|
| 166 |
+
- `lf4_v2.py` — self-contained model class (drop-in to any project)
|
| 167 |
+
|
| 168 |
+
## Citation
|
| 169 |
+
|
| 170 |
+
The SIF/PC technique is from:
|
| 171 |
+
> Arora, Liang, Ma (2017). *A Simple but Tough-to-Beat Baseline for Sentence Embeddings.* ICLR.
|
| 172 |
+
|
| 173 |
+
The LF4 quantization is from:
|
| 174 |
+
> Original Vortex-Embed-4.7M model card on [VTXAI/Vortex-Embed-4.7M](https://huggingface.co/VTXAI/Vortex-Embed-4.7M).
|
| 175 |
+
|
| 176 |
+
If you use v2 in research, please cite the original Vortex-Embed paper and
|
| 177 |
+
this AutoResearch loop (see [Vortex-AutoResearch](https://github.com/VortexAI)).
|
__pycache__/lf4_v2.cpython-312.pyc
ADDED
|
Binary file (36.2 kB). View file
|
|
|
config.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "vortex-embed",
|
| 3 |
+
"architectures": ["VortexEmbedV2"],
|
| 4 |
+
"vocab_size": 29528,
|
| 5 |
+
"embedding_dim": 256,
|
| 6 |
+
"block_size": 32,
|
| 7 |
+
"num_blocks": 8,
|
| 8 |
+
"quantization": "lf4",
|
| 9 |
+
"bits": 4,
|
| 10 |
+
"compression_vs_fp32": 6.4,
|
| 11 |
+
"original_model": "VTXAI/Vortex-Embed-4.7M",
|
| 12 |
+
"base_model": "VTXAI/Vortex-Embed-4.7M",
|
| 13 |
+
"sif_a": 0.0001,
|
| 14 |
+
"sif_pc": 1.0,
|
| 15 |
+
"pc_k": 8,
|
| 16 |
+
"header_repeat": 15,
|
| 17 |
+
"py_bonus": 0.05,
|
| 18 |
+
"md_penalty": -0.02,
|
| 19 |
+
"bias_top_k": 50
|
| 20 |
+
}
|
lf4_model.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LF4 Static Embedding Model - Native 4-bit quantized sentence embeddings.
|
| 3 |
+
=========================================================================
|
| 4 |
+
Usage:
|
| 5 |
+
from lf4_model import LF4StaticEmbedding
|
| 6 |
+
model = LF4StaticEmbedding.from_pretrained("VTXAI/Vortex-Embed-4.7M")
|
| 7 |
+
embeddings = model.encode(["find python json parser", "weather API tool"])
|
| 8 |
+
|
| 9 |
+
# Search
|
| 10 |
+
scores, indices = model.search(query_emb, index_emb, top_k=10)
|
| 11 |
+
"""
|
| 12 |
+
import json
|
| 13 |
+
import numpy as np
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import List, Union, Optional, Tuple
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class LF4StaticEmbedding:
    """Native LF4 4-bit static embedding model.

    Weights are stored as packed 4-bit integers (two values per byte) with
    per-block FP16 scales and zero-points. The full table is dequantized to
    FP32 once at construction, so encoding is a row lookup followed by mean
    pooling. Use :attr:`model_size_mb` for the size of the quantized tensors.
    """

    def __init__(self, packed, scales, zeros, tokenizer_data, config):
        """Store the quantized tensors and build the FP32 lookup table.

        Args:
            packed: uint8 array (vocab, dim/2); each byte holds two 4-bit codes.
            scales: float16 array (vocab, num_blocks) of per-block scales.
            zeros: float16 array (vocab, num_blocks) of per-block zero-points.
            tokenizer_data: path to ``tokenizer.json`` or its raw JSON string.
            config: dict providing ``vocab_size``, ``embedding_dim`` and
                ``block_size``.
        """
        self.packed = packed      # uint8 (vocab, dim/2)
        self.scales = scales      # float16 (vocab, num_blocks)
        self.zeros = zeros        # float16 (vocab, num_blocks)
        self.config = config
        self.vocab_size = config["vocab_size"]
        self.dim = config["embedding_dim"]
        self.block_size = config["block_size"]
        self._tokenizer_data = tokenizer_data
        self._tokenizer = None  # built lazily by the ``tokenizer`` property

        # Pre-dequantize embedding table for fast lookup
        self._embedding_table = self._dequantize_all()

    def _dequantize_all(self) -> np.ndarray:
        """Dequantize full embedding table to FP32 for fast token lookup.

        Returns:
            float32 array of shape (rows of ``packed``, ``embedding_dim``).
        """
        n_rows = self.packed.shape[0]
        padded_dim = self.packed.shape[1] * 2  # two 4-bit codes per byte

        # Even output columns come from the low nibble, odd ones from the high.
        unpacked = np.empty((n_rows, padded_dim), dtype=np.float32)
        unpacked[:, 0::2] = (self.packed & 0x0F).astype(np.float32)
        unpacked[:, 1::2] = ((self.packed >> 4) & 0x0F).astype(np.float32)

        # Apply value = code * scale + zero, block by block along each row.
        blocked = unpacked.reshape(n_rows, padded_dim // self.block_size, self.block_size)
        scale = self.scales.astype(np.float32)[:, :, None]
        zero = self.zeros.astype(np.float32)[:, :, None]

        # Trim any padding columns beyond the configured embedding dimension.
        return (blocked * scale + zero).reshape(n_rows, padded_dim)[:, :self.dim]

    @property
    def tokenizer(self):
        """Lazily build and cache the ``tokenizers.Tokenizer``.

        ``tokenizer_data`` is treated as raw JSON when it starts with ``{``
        and as a file path otherwise. This replaces the previous "try
        ``from_str``, on any exception fall back to ``from_file``" logic,
        which hid the real error for malformed input.
        """
        if self._tokenizer is None:
            from tokenizers import Tokenizer

            data = str(self._tokenizer_data)
            if data.lstrip().startswith("{"):
                self._tokenizer = Tokenizer.from_str(data)
            else:
                self._tokenizer = Tokenizer.from_file(data)
        return self._tokenizer

    @classmethod
    def from_pretrained(cls, path_or_id: str) -> "LF4StaticEmbedding":
        """Load model from local path or HuggingFace Hub.

        Args:
            path_or_id: a local directory containing ``model.safetensors``,
                ``config.json`` and ``tokenizer.json``; anything that is not
                an existing directory is passed to ``hf_hub_download`` as a
                repo id.
        """
        p = Path(path_or_id)
        if p.is_dir():
            model_path = str(p / "model.safetensors")
            config_path = p / "config.json"
            tok_path = str(p / "tokenizer.json")
        else:
            from huggingface_hub import hf_hub_download

            model_path = hf_hub_download(path_or_id, "model.safetensors")
            config_path = Path(hf_hub_download(path_or_id, "config.json"))
            tok_path = hf_hub_download(path_or_id, "tokenizer.json")

        from safetensors.numpy import load_file

        tensors = load_file(model_path)
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        config = json.loads(config_path.read_text(encoding="utf-8"))

        return cls(
            packed=tensors["embedding_packed"],
            scales=tensors["embedding_scales"],
            zeros=tensors["embedding_zeros"],
            tokenizer_data=tok_path,
            config=config,
        )

    def encode(self, texts: Union[str, List[str]], normalize: bool = True) -> np.ndarray:
        """Encode texts to embeddings by mean-pooling token vectors.

        Args:
            texts: single string or list of strings
            normalize: L2-normalize output embeddings (default True for cosine sim)

        Returns:
            np.ndarray of shape (N, dim); a single string yields N == 1.
            A text with no in-vocabulary token ids yields an all-zero row.
        """
        if isinstance(texts, str):
            texts = [texts]

        embeddings = np.zeros((len(texts), self.dim), dtype=np.float32)

        for i, text in enumerate(texts):
            token_ids = self.tokenizer.encode(text).ids

            # Skip ids outside the table instead of letting the lookup fail.
            valid_ids = [tid for tid in token_ids if 0 <= tid < self.vocab_size]
            if valid_ids:
                embeddings[i] = self._embedding_table[valid_ids].mean(axis=0)

        if normalize:
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            norms = np.where(norms == 0, 1.0, norms)  # keep zero rows at zero
            embeddings = embeddings / norms

        return embeddings

    def search(
        self,
        queries: np.ndarray,
        index: np.ndarray,
        top_k: int = 10
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Cosine similarity search.

        Args:
            queries: (Q, D) query embeddings; a 1-D vector is treated as Q == 1
            index: (N, D) document embeddings
            top_k: number of results; values >= N return all N, ranked

        Returns:
            (scores, indices) arrays, each (Q, min(top_k, N)), best first

        Raises:
            ValueError: if ``top_k`` is negative (previously this silently
                returned N-1 arbitrary results).
        """
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")

        queries = np.asarray(queries, dtype=np.float32)
        index = np.asarray(index, dtype=np.float32)
        if queries.ndim == 1:
            queries = queries[None, :]

        # Normalize both sides; the epsilon guards against zero vectors.
        qn = queries / (np.linalg.norm(queries, axis=1, keepdims=True) + 1e-8)
        dn = index / (np.linalg.norm(index, axis=1, keepdims=True) + 1e-8)

        scores = qn @ dn.T

        if top_k >= scores.shape[1]:
            idx = np.argsort(-scores, axis=1)
            return np.take_along_axis(scores, idx, 1), idx

        # Select the top_k candidates first, then sort only those.
        idx = np.argpartition(-scores, top_k, axis=1)[:, :top_k]
        s = np.take_along_axis(scores, idx, 1)
        order = np.argsort(-s, axis=1)
        return np.take_along_axis(s, order, 1), np.take_along_axis(idx, order, 1)

    @property
    def model_size_mb(self) -> float:
        """Size of the quantized tensors (packed + scales + zeros) in 10**6 bytes."""
        return (self.packed.nbytes + self.scales.nbytes + self.zeros.nbytes) / 1e6

    def __repr__(self):
        return (f"LF4StaticEmbedding(vocab={self.vocab_size}, dim={self.dim}, "
                f"bits=4, size={self.model_size_mb:.1f}MB, "
                f"block_size={self.block_size})")
|
lf4_v2.py
ADDED
|
@@ -0,0 +1,663 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vortex-Embed v2 — Retrieval-optimized LF4 static embedding model.
|
| 3 |
+
|
| 4 |
+
Built on VTXAI/Vortex-Embed-4.7M (4-bit LF4 weights, 29 KB tokenizer).
|
| 5 |
+
All training-free upgrades: SIF IDF weighting, top-K principal component
|
| 6 |
+
removal, file-path header injection, and search-time file-extension score
|
| 7 |
+
bias.
|
| 8 |
+
|
| 9 |
+
Key results (Webscout codebase, 5,168 chunks, 51 hand-verified queries):
|
| 10 |
+
|
| 11 |
+
R@1 = 0.745 (baseline LF4: 0.314, +137%)
|
| 12 |
+
R@5 = 0.843
|
| 13 |
+
R@10 = 0.882
|
| 14 |
+
MRR = 0.779
|
| 15 |
+
|
| 16 |
+
Drop-in replacement for `LF4StaticEmbedding` from the v1 model. Same
|
| 17 |
+
weight format, same tokenizer, same embed dimension. New arguments are
|
| 18 |
+
optional and default to the v2 best configuration.
|
| 19 |
+
"""
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import math
|
| 24 |
+
import re
|
| 25 |
+
from dataclasses import dataclass, field
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
from typing import List, Optional, Sequence, Tuple, Union
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
from safetensors.numpy import load_file, save_file
|
| 31 |
+
|
| 32 |
+
try:
|
| 33 |
+
from tokenizers import Tokenizer
|
| 34 |
+
except Exception: # pragma: no cover
|
| 35 |
+
Tokenizer = None # type: ignore[assignment]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Path header helpers
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
|
| 42 |
+
# One or more of "_", "-", "." separate the words inside a single path part.
_PATH_SEP_RE = re.compile(r"[_\-\.]+")


def _path_to_header_tokens(path: str) -> List[str]:
    """Snake/kebab/dot-split a file path into lowercase semantic tokens.

    Every path part (directories, file name, and the stem) is split on runs
    of ``_``, ``-`` and ``.``. Empty pieces are discarded, as is any piece
    equal to the file's extension (wherever in the path it occurs). The
    result is de-duplicated with first-occurrence order preserved.

    A leading part that starts with ``.`` is dropped, and so is the
    filesystem anchor of an absolute path (``/`` or a drive root), which
    previously leaked through as a bogus token.

    Example:
        "llm4free/search/engines/duckduckgo_main.py"
        -> ["llm4free", "search", "engines", "duckduckgo", "main"]
    """
    p = Path(path)
    parts = list(p.parts)
    # The anchor is the root/drive of an absolute path; it carries no meaning.
    if parts and (parts[0].startswith(".") or parts[0] == p.anchor):
        parts = parts[1:]
    parts.append(p.stem)
    suffix = p.suffix.lstrip(".").lower()
    words = (w.lower() for part in parts for w in _PATH_SEP_RE.split(part))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(w for w in words if w and w != suffix))
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# Main model
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@dataclass
class VortexEmbedConfig:
    """Configuration container mirroring the on-disk ``config.json``."""

    # Structure of the quantized embedding table.
    vocab_size: int = 29528
    embedding_dim: int = 256
    block_size: int = 32
    num_blocks: int = 8
    model_type: str = "vortex-embed"
    architectures: List[str] = field(default_factory=lambda: ["VortexEmbedV2"])
    # v2-specific retrieval knobs (also persisted to config.json on save)
    sif_a: float = 1e-4
    sif_pc: float = 1.0
    pc_k: int = 8
    header_repeat: int = 15
    py_bonus: float = 0.05
    md_penalty: float = -0.02
    bias_top_k: int = 50
    quantization: str = "lf4"
    bits: int = 4

    @classmethod
    def from_dict(cls, d: dict) -> "VortexEmbedConfig":
        """Build a config from a dict such as a parsed ``config.json``.

        Keys that are not fields of this dataclass are ignored (they are not
        kept anywhere); fields missing from ``d`` take their defaults. List
        values are copied so the config does not alias the caller's data.
        """
        known = cls.__dataclass_fields__
        kw = {
            k: list(v) if isinstance(v, list) else v
            for k, v in d.items()
            if k in known
        }
        return cls(**kw)

    def to_dict(self) -> dict:
        """Return all fields as a plain dict, in declaration order.

        List values are copied, so mutating the result cannot change this
        config.
        """
        out = {}
        for k in self.__dataclass_fields__:
            v = getattr(self, k)
            out[k] = list(v) if isinstance(v, list) else v
        return out
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class VortexEmbedV2:
|
| 113 |
+
"""Vortex-Embed v2 — retrieval-optimized LF4 static embedding.
|
| 114 |
+
|
| 115 |
+
Pipeline at encode time (per chunk text):
|
| 116 |
+
1. Augment: prepend path-header tokens × ``header_repeat``
|
| 117 |
+
2. Tokenize (HuggingFace fast tokenizer, same as v1)
|
| 118 |
+
3. SIF IDF weighting on every token
|
| 119 |
+
4. Sum tokens per chunk via ``torch.scatter_add_`` (CPU)
|
| 120 |
+
5. Divide by SIF-weighted count
|
| 121 |
+
6. Remove top-``pc_k`` principal components (fitted on corpus)
|
| 122 |
+
7. L2-normalize
|
| 123 |
+
|
| 124 |
+
Pipeline at search time (per query):
|
| 125 |
+
1. Encode query with the same pipeline
|
| 126 |
+
2. Cosine score against the index (``qn @ index.T``)
|
| 127 |
+
3. Within top-``bias_top_k`` candidates, add a small per-extension
|
| 128 |
+
score bias (``+py_bonus`` for .py, ``+md_penalty`` for .md) to
|
| 129 |
+
break the ties where README.md / docs/*.md outrank code
|
| 130 |
+
|
| 131 |
+
Args:
|
| 132 |
+
packed: ``uint8`` (vocab, dim//2) packed 4-bit weights.
|
| 133 |
+
scales: ``float16`` (vocab, num_blocks) per-block scales.
|
| 134 |
+
zeros: ``float16`` (vocab, num_blocks) per-block zero-points.
|
| 135 |
+
tokenizer_data: path to ``tokenizer.json`` or its raw JSON string.
|
| 136 |
+
config: configuration dict (or :class:`VortexEmbedConfig`).
|
| 137 |
+
precompute: if True, dequantize the full table to FP32 at load.
|
| 138 |
+
"""
|
| 139 |
+
|
| 140 |
+
def __init__(
    self,
    packed: np.ndarray,
    scales: np.ndarray,
    zeros: np.ndarray,
    tokenizer_data: Union[str, Path],
    config: Union[dict, VortexEmbedConfig],
    *,
    precompute: bool = True,
) -> None:
    """Store the quantized tensors, resolve the config and reset fitted state.

    The tensors are coerced to uint8 / float16 / float16, ``tokenizer_data``
    is kept as a string, and a dict ``config`` is converted through
    ``VortexEmbedConfig.from_dict``. When ``precompute`` is true the FP32
    embedding table is dequantized immediately; otherwise it stays ``None``.
    """
    # Quantized payload.
    self.packed = np.asarray(packed, dtype=np.uint8)
    self.scales = np.asarray(scales, dtype=np.float16)
    self.zeros = np.asarray(zeros, dtype=np.float16)
    self.tokenizer_data = str(tokenizer_data)

    # Accept either a raw dict or an already-built config object.
    cfg = VortexEmbedConfig.from_dict(config) if isinstance(config, dict) else config
    self.config = cfg

    # Table geometry, coerced to plain ints.
    self.vocab_size = int(cfg.vocab_size)
    self.dim = int(cfg.embedding_dim)
    self.block_size = int(cfg.block_size)
    self.num_blocks = int(cfg.num_blocks)

    # v2 retrieval knobs, coerced to plain Python numbers.
    self.sif_a = float(cfg.sif_a)
    self.sif_pc = float(cfg.sif_pc)
    self.pc_k = int(cfg.pc_k)
    self.header_repeat = int(cfg.header_repeat)
    self.py_bonus = float(cfg.py_bonus)
    self.md_penalty = float(cfg.md_penalty)
    self.bias_top_k = int(cfg.bias_top_k)

    # Lazily-built or fitted state; everything starts out unset.
    self._tokenizer: Optional[Tokenizer] = None
    self._embedding_table: Optional[np.ndarray] = None
    self._sif_weights: Optional[np.ndarray] = None
    self._pc_directions: Optional[np.ndarray] = None
    self._file_paths: Optional[List[str]] = None
    self._chunk_is_py: Optional[np.ndarray] = None
    self._chunk_is_md: Optional[np.ndarray] = None
    self.cache_path: Optional[Path] = None

    if precompute:
        self._embedding_table = self._dequantize_all()
|
| 181 |
+
|
| 182 |
+
# ---- properties -----------------------------------------------------
|
| 183 |
+
|
| 184 |
+
@property
def tokenizer(self) -> Tokenizer:
    """Lazily build and cache the Hugging Face fast tokenizer.

    ``tokenizer_data`` may be a path to ``tokenizer.json`` or the raw JSON
    text itself: content starting with ``{`` is parsed with
    ``Tokenizer.from_str``, anything else is loaded with
    ``Tokenizer.from_file``. Previously only file paths worked.

    Raises:
        RuntimeError: if the ``tokenizers`` package could not be imported.
    """
    if self._tokenizer is None:
        if Tokenizer is None:
            raise RuntimeError("tokenizers is required: install via `pip install tokenizers`")
        data = self.tokenizer_data
        if data.lstrip().startswith("{"):
            self._tokenizer = Tokenizer.from_str(data)
        else:
            self._tokenizer = Tokenizer.from_file(data)
    return self._tokenizer
|
| 191 |
+
|
| 192 |
+
@property
def embedding_table(self) -> np.ndarray:
    """Return the FP32 embedding table, dequantizing it on first access.

    The result of ``_dequantize_all`` is cached on the instance, so the
    work is done at most once.
    """
    table = self._embedding_table
    if table is None:
        table = self._dequantize_all()
        self._embedding_table = table
    return table
|
| 197 |
+
|
| 198 |
+
@property
def model_size_mb(self) -> float:
    """Size of the quantized model (packed + scales + zeros) in 10**6 bytes.

    This always reports the LF4 tensors, matching
    ``LF4StaticEmbedding.model_size_mb``. It used to return the size of the
    dequantized FP32 cache whenever that cache existed, which is the default
    (``precompute=True``), so the value did not describe the model itself.
    """
    quantized_bytes = self.packed.nbytes + self.scales.nbytes + self.zeros.nbytes
    return quantized_bytes / 1e6
|
| 203 |
+
|
| 204 |
+
# ---- (de)serialization ---------------------------------------------
|
| 205 |
+
|
| 206 |
+
@classmethod
def from_pretrained(
    cls,
    path_or_id: Union[str, Path],
    *,
    precompute: bool = True,
    cache_path: Optional[Union[str, Path]] = None,
    **overrides,
) -> "VortexEmbedV2":
    """Load from a local model directory or Hugging Face Hub id.

    Expected files in the directory:
      - ``model.safetensors``  (LF4 packed weights)
      - ``config.json``        (model + retrieval config)
      - ``tokenizer.json``

    Args:
        path_or_id: existing local directory, or a Hub repo id that is
            fetched with ``snapshot_download``.
        precompute: forwarded to the constructor.
        cache_path: if given, stored on the instance as ``cache_path``.
        **overrides: config fields to override (e.g. ``sif_a=1e-3``,
            ``header_repeat=10``). Names that are not ``VortexEmbedConfig``
            fields are ignored with a warning instead of silently.
    """
    import warnings

    path = Path(path_or_id)
    if not path.is_dir():
        from huggingface_hub import snapshot_download

        path = Path(snapshot_download(str(path_or_id)))
    tensors = load_file(str(path / "model.safetensors"))
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    config = json.loads((path / "config.json").read_text(encoding="utf-8"))

    # Apply overrides (e.g. sif_a=1e-3, header_repeat=10, disable bias...)
    known = VortexEmbedConfig.__dataclass_fields__
    unknown = sorted(k for k in overrides if k not in known)
    if unknown:
        warnings.warn(
            f"Ignoring unknown config override(s): {', '.join(unknown)}",
            stacklevel=2,
        )
    config.update({k: v for k, v in overrides.items() if k in known})

    obj = cls(
        packed=tensors["embedding_packed"],
        scales=tensors["embedding_scales"],
        zeros=tensors["embedding_zeros"],
        tokenizer_data=str(path / "tokenizer.json"),
        config=config,
        precompute=precompute,
    )
    if cache_path is not None:
        obj.cache_path = Path(cache_path)
    return obj
|
| 243 |
+
|
| 244 |
+
def save_pretrained(self, path: Union[str, Path]) -> None:
    """Save weights + config + tokenizer to a local directory.

    Writes ``model.safetensors`` and ``config.json`` into ``path`` (created
    if missing). ``tokenizer.json`` is copied from ``self.tokenizer_data``
    only when the destination file does not already exist.

    Args:
        path: output directory.
    """
    out = Path(path)
    out.mkdir(parents=True, exist_ok=True)
    save_file(
        {
            "embedding_packed": self.packed,
            "embedding_scales": self.scales,
            "embedding_zeros": self.zeros,
        },
        str(out / "model.safetensors"),
    )
    (out / "config.json").write_text(
        json.dumps(self.config.to_dict(), indent=2),
        encoding="utf-8",
    )
    tokenizer_out = out / "tokenizer.json"
    if not tokenizer_out.exists():
        # Byte-for-byte copy: a text round-trip would decode/encode with the
        # locale encoding and could corrupt or reject non-ASCII content.
        tokenizer_out.write_bytes(Path(self.tokenizer_data).read_bytes())
|
| 263 |
+
|
| 264 |
+
# ---- LF4 dequantization --------------------------------------------
|
| 265 |
+
|
| 266 |
+
def _dequantize_all(self) -> np.ndarray:
    """Dequantize the whole packed table to float32.

    Every byte of ``self.packed`` holds two 4-bit codes: the low nibble is
    placed at the even output column and the high nibble at the following
    odd column. Codes are grouped into ``num_blocks`` blocks of
    ``block_size`` and mapped with ``code * scale[block] + zero[block]``.
    The result is truncated to the first ``self.dim`` columns.
    """
    codes = self.packed
    n_rows = codes.shape[0]
    width = codes.shape[1] * 2

    nibbles = np.empty((n_rows, width), dtype=np.float32)
    nibbles[:, 0::2] = (codes & 0x0F).astype(np.float32)
    nibbles[:, 1::2] = ((codes >> 4) & 0x0F).astype(np.float32)

    per_block = nibbles.reshape(n_rows, self.num_blocks, self.block_size)
    gain = self.scales.astype(np.float32)[:, :, None]
    offset = self.zeros.astype(np.float32)[:, :, None]
    values = per_block * gain + offset
    return values.reshape(n_rows, width)[:, : self.dim]
|
| 284 |
+
|
| 285 |
+
def _dequantize_ids(self, token_ids: np.ndarray) -> np.ndarray:
    """Dequantize the embedding rows for ``token_ids``.

    Fast path: when ``self._embedding_table`` exists, rows are gathered
    from it directly. Cold path: only the distinct ids are dequantized and
    then expanded back to the requested order.

    The cold path deliberately does NOT store anything in
    ``self._embedding_table``: a table holding only the rows seen so far
    would be served by the fast path on later calls and return
    uninitialized memory for every other id.

    Args:
        token_ids: integer array of row indices into the packed table.

    Returns:
        float32 array of shape ``token_ids.shape + (dim,)``.
    """
    if self._embedding_table is not None:
        return self._embedding_table[token_ids]

    ids = np.asarray(token_ids)
    # `inverse` maps each requested position to its row in `unique`.
    unique, inverse = np.unique(ids.ravel(), return_inverse=True)

    packed = self.packed[unique]
    n_rows = packed.shape[0]
    padded = packed.shape[1] * 2
    unpacked = np.empty((n_rows, padded), dtype=np.float32)
    unpacked[:, 0::2] = (packed & 0x0F).astype(np.float32)         # low nibble
    unpacked[:, 1::2] = ((packed >> 4) & 0x0F).astype(np.float32)  # high nibble

    blocked = unpacked.reshape(n_rows, self.num_blocks, self.block_size)
    scales = self.scales[unique].astype(np.float32)[:, :, None]
    zeros = self.zeros[unique].astype(np.float32)[:, :, None]
    deq = (blocked * scales + zeros).reshape(n_rows, padded)[:, : self.dim]

    return deq[inverse.ravel()].reshape(ids.shape + (deq.shape[1],))
|
| 306 |
+
|
| 307 |
+
# ---- SIF + PC fitting ----------------------------------------------
|
| 308 |
+
|
| 309 |
+
def fit_idf(self, corpus_token_lists: Sequence[Sequence[int]]) -> "VortexEmbedV2":
    """Compute SIF (Smoothed Inverse Frequency) weights from the corpus.

    weight(t) = a / (a + p(t))   where p(t) = count(t) / total_tokens
    and ``a`` is ``self.sif_a``.

    Tokens that never appear in the corpus get weight 1 (no down-weight).
    The result is stored as float32 in ``self._sif_weights``.

    Args:
        corpus_token_lists: one sequence of token ids per document.
            Documents with no tokens are allowed.

    Returns:
        ``self``, to allow chaining.
    """
    # Convert each document with an explicit integer dtype: concatenating
    # raw Python lists promotes to float64 as soon as one list is empty,
    # and np.bincount rejects float input.
    parts = [np.asarray(ids, dtype=np.int64).ravel() for ids in corpus_token_lists]
    flat = np.concatenate(parts) if parts else np.empty(0, dtype=np.int64)

    total = max(int(flat.size), 1)  # avoid 0/0 on an empty corpus
    counts = np.bincount(flat, minlength=self.vocab_size).astype(np.float64)
    p = counts / total
    denom = self.sif_a + p
    # np.where evaluates both branches; silence 0/0 when sif_a == 0 and p == 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        weights = np.where(p > 0, self.sif_a / denom, 1.0)
    self._sif_weights = weights.astype(np.float32)
    return self
|
| 327 |
+
|
| 328 |
+
def fit_pc(self, corpus_embeddings: np.ndarray, k: Optional[int] = None) -> "VortexEmbedV2":
    """Fit the top-``k`` principal directions of the corpus embeddings.

    The embeddings are mean-centered, decomposed with an SVD, and the
    first ``k`` right-singular vectors (re-normalized to unit length) are
    stored in ``self._pc_directions``. ``k`` defaults to ``self.pc_k``.

    If the input is empty or ``k <= 0`` nothing is changed. If the SVD
    raises ``LinAlgError`` the stored directions are set to ``None``.

    Returns:
        ``self``, to allow chaining.
    """
    n_components = self.pc_k if k is None else k
    if corpus_embeddings.size == 0 or n_components <= 0:
        return self

    centered = corpus_embeddings.astype(np.float32)
    centered = centered - centered.mean(axis=0, keepdims=True)
    try:
        right_vectors = np.linalg.svd(centered, full_matrices=False)[2]
        top = right_vectors[:n_components].astype(np.float32)
        lengths = np.linalg.norm(top, axis=1, keepdims=True)
        self._pc_directions = top / (lengths + 1e-12)
    except np.linalg.LinAlgError:
        self._pc_directions = None
    return self
|
| 349 |
+
|
| 350 |
+
def _apply_pc(self, x: np.ndarray) -> np.ndarray:
    """Subtract the fitted principal directions from each row of ``x``.

    For every stored direction, the row's projection onto it is scaled by
    ``self.sif_pc`` and removed. ``x`` is returned unchanged (same object)
    when no directions are fitted or ``self.sif_pc <= 0``.
    """
    directions = self._pc_directions
    if self.sif_pc <= 0 or directions is None:
        return x

    residual = x
    for direction in directions:
        coeff = residual @ direction
        projection = coeff[:, None] * direction[None, :]
        residual = residual - self.sif_pc * projection
    return residual
|
| 358 |
+
|
| 359 |
+
# ---- file-path binding ---------------------------------------------
|
| 360 |
+
|
| 361 |
+
def set_file_paths(self, file_paths: Sequence[str]) -> "VortexEmbedV2":
    """Bind corpus file paths so encode() can prepend path headers.

    Also pre-classifies each path by extension (``.py`` / ``.md``) into
    boolean arrays stored on the instance, which ``search`` reads when
    applying its score bias.

    Args:
        file_paths: one path per corpus chunk (any iterable of strings),
            or ``None`` to clear a previous binding.

    Returns:
        ``self``, to allow chaining.
    """
    # The None check must come before list(): list(None) raises TypeError.
    if file_paths is None:
        self._file_paths = None
        self._chunk_is_py = None
        self._chunk_is_md = None
        return self

    # Materialize once so generators/iterators are not consumed twice.
    paths = list(file_paths)
    self._file_paths = paths
    self._chunk_is_py = np.fromiter(
        (p.endswith(".py") for p in paths), dtype=bool, count=len(paths)
    )
    self._chunk_is_md = np.fromiter(
        (p.endswith(".md") for p in paths), dtype=bool, count=len(paths)
    )
    return self
|
| 379 |
+
|
| 380 |
+
def _augment_texts(self, texts: Sequence[str]) -> List[str]:
    """Prepend a repeated path header to each text when paths are bound.

    Augmentation only happens when ``self._file_paths`` is set and has
    exactly as many entries as ``texts``; otherwise a plain list copy of
    ``texts`` is returned. For each (text, path) pair the header tokens
    from ``_path_to_header_tokens(path)`` are repeated
    ``self.header_repeat`` times, joined with spaces and placed on a line
    before the text. Pairs with no header tokens, or a non-positive
    repeat count, keep the text unchanged.
    """
    paths = self._file_paths
    if paths is None or len(paths) != len(texts):
        return list(texts)

    augmented: List[str] = []
    for body, path in zip(texts, paths):
        tokens = _path_to_header_tokens(path)
        if tokens and self.header_repeat > 0:
            prefix = " ".join(tokens * self.header_repeat)
            augmented.append(f"{prefix}\n{body}")
        else:
            augmented.append(body)
    return augmented
|
| 392 |
+
|
| 393 |
+
# ---- tokenization ----------------------------------------------------
|
| 394 |
+
|
| 395 |
+
# Per-text character budget read by _cap_inputs; a value <= 0 disables the cap.
DEFAULT_MAX_CHARS_PER_TEXT = 50_000
# Per-text token budget read by _cap_token_lists; a value <= 0 disables the cap.
DEFAULT_MAX_TOKENS_PER_TEXT = 4096
# Default for encode_batch's max_tokens_per_batch (token budget per sub-batch).
DEFAULT_MAX_TOKENS_PER_BATCH = 262_144
|
| 398 |
+
|
| 399 |
+
def _tokenize_batch(self, texts: Sequence[str]) -> List[List[int]]:
    """Tokenize ``texts`` and keep only ids inside ``[0, vocab_size)``.

    Returns one list of token ids per input text, in input order; ids
    outside the valid range are dropped.
    """
    encodings = self.tokenizer.encode_batch(list(texts))
    kept: List[List[int]] = []
    for encoding in encodings:
        in_vocab = [tid for tid in encoding.ids if 0 <= int(tid) < self.vocab_size]
        kept.append(in_vocab)
    return kept
|
| 405 |
+
|
| 406 |
+
def _cap_inputs(self, texts: Sequence[str]) -> List[str]:
    """Limit each text to ``DEFAULT_MAX_CHARS_PER_TEXT`` characters.

    Over-long texts keep their first ``cap // 2`` characters and their
    last ``cap - cap // 2`` characters; the middle is dropped. A cap
    ``<= 0`` disables truncation. Always returns a new list.
    """
    limit = self.DEFAULT_MAX_CHARS_PER_TEXT
    if limit <= 0:
        return list(texts)
    head = limit // 2
    tail = limit - head
    return [t if len(t) <= limit else t[:head] + t[-tail:] for t in texts]
|
| 418 |
+
|
| 419 |
+
def _cap_token_lists(self, token_lists: List[List[int]]) -> List[List[int]]:
    """Limit each token list to ``DEFAULT_MAX_TOKENS_PER_TEXT`` ids.

    Over-long lists keep their first ``cap // 2`` ids and their last
    ``cap - cap // 2`` ids. Lists within the cap are passed through as
    the same objects. With a cap ``<= 0`` the input list itself is
    returned untouched.
    """
    limit = self.DEFAULT_MAX_TOKENS_PER_TEXT
    if limit <= 0:
        return token_lists
    head = limit // 2
    tail = limit - head
    return [ids if len(ids) <= limit else ids[:head] + ids[-tail:] for ids in token_lists]
|
| 431 |
+
|
| 432 |
+
@staticmethod
def _normalize_inplace(x: np.ndarray) -> None:
    """Scale every row of the 2-D array ``x`` to unit L2 norm, in place.

    Row norms are floored at 1e-12 before dividing, so all-zero rows stay
    zero instead of producing NaN.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    safe_norms = np.maximum(row_norms, 1e-12)
    x /= safe_norms
|
| 436 |
+
|
| 437 |
+
# ---- core encode -----------------------------------------------------
|
| 438 |
+
|
| 439 |
+
def _encode_subbatch(
    self, token_lists: Sequence[Sequence[int]], *, normalize: bool
) -> np.ndarray:
    """Pool token embeddings into one vector per text.

    Each row is the mean of its token embeddings, or the weighted mean
    ``sum(w * e) / sum(w)`` when SIF weights are fitted. The pooled rows
    then go through ``_apply_pc`` and, if requested, L2 normalization.
    Texts with no tokens yield an all-zero row.

    Args:
        token_lists: one sequence of token ids per text; individual
            sequences may be empty.
        normalize: L2-normalize the rows before returning.

    Returns:
        Array of shape ``(len(token_lists), dim)``.
    """
    n = len(token_lists)
    lengths = np.fromiter((len(ids) for ids in token_lists), dtype=np.int64, count=n)
    if n == 0 or int(lengths.sum()) == 0:
        return np.zeros((n, self.dim), dtype=np.float32)

    # Explicit int64 per list: concatenating raw Python lists promotes to
    # float64 when any list is empty, which is not a valid index dtype.
    flat = np.concatenate([np.asarray(ids, dtype=np.int64) for ids in token_lists])

    token_embs = self._dequantize_ids(flat)

    weights = None
    if self._sif_weights is not None:
        weights = self._sif_weights[flat].astype(np.float32)
        token_embs = token_embs * weights[:, None]

    import torch
    # row_ids[j] is the index of the text that flat token j belongs to.
    row_ids = torch.from_numpy(np.repeat(np.arange(n, dtype=np.int64), lengths))
    em = torch.from_numpy(np.ascontiguousarray(token_embs, dtype=np.float32))
    sums = torch.zeros((n, self.dim), dtype=torch.float32)
    sums.index_add_(0, row_ids, em)

    if weights is not None:
        # Floor the weight total so token-less rows divide 0 by a tiny
        # number (-> 0) rather than by 0.
        denom = row_ids.bincount(
            minlength=n, weights=torch.from_numpy(weights)
        ).clamp(min=1e-12)
    else:
        denom = row_ids.bincount(minlength=n).clamp(min=1)
    denom = denom.to(torch.float32)

    embeddings = (sums / denom.unsqueeze(1)).numpy()
    embeddings = self._apply_pc(embeddings)
    if normalize:
        self._normalize_inplace(embeddings)
    return embeddings
|
| 474 |
+
|
| 475 |
+
def encode_batch(
    self,
    texts: Sequence[str],
    *,
    normalize: bool = True,
    max_tokens_per_text: Optional[int] = None,
    max_tokens_per_batch: Optional[int] = None,
    max_chars_per_text: Optional[int] = None,
) -> np.ndarray:
    """Encode a list of texts into ``(len(texts), dim)`` embeddings.

    Pipeline: path-header augmentation (``_augment_texts``), character
    cap, tokenization, token cap, then pooling via ``_encode_subbatch``.
    When the total token count exceeds the batch budget, consecutive
    texts are grouped into sub-batches that each stay within the budget
    (a single text larger than the budget forms its own sub-batch).

    Args:
        texts: input strings.
        normalize: L2-normalize the output rows.
        max_tokens_per_text: per-text token cap. ``None`` uses
            ``_cap_token_lists`` (class default); a value ``<= 0``
            disables the cap.
        max_tokens_per_batch: token budget per sub-batch. ``None`` uses
            ``DEFAULT_MAX_TOKENS_PER_BATCH``.
        max_chars_per_text: per-text character cap applied before
            tokenization. ``None`` uses ``_cap_inputs`` (class default);
            a value ``<= 0`` disables the cap.

    Returns:
        float32 array of shape ``(len(texts), dim)``.
    """
    if len(texts) == 0:
        return np.zeros((0, self.dim), dtype=np.float32)

    def _keep_ends(seq, cap):
        # Same truncation rule as the default cappers: keep the first
        # cap // 2 items and the last cap - cap // 2 items.
        if cap <= 0 or len(seq) <= cap:
            return seq
        half = cap // 2
        return seq[:half] + seq[-(cap - half):]

    augmented = self._augment_texts(texts)
    if max_chars_per_text is None:
        capped = self._cap_inputs(augmented)
    else:
        char_cap = int(max_chars_per_text)
        capped = [_keep_ends(t, char_cap) for t in augmented]

    token_lists = self._tokenize_batch(capped)
    if max_tokens_per_text is None:
        token_lists = self._cap_token_lists(token_lists)
    else:
        token_cap = int(max_tokens_per_text)
        token_lists = [_keep_ends(ids, token_cap) for ids in token_lists]

    cap_b = (self.DEFAULT_MAX_TOKENS_PER_BATCH
             if max_tokens_per_batch is None else int(max_tokens_per_batch))

    total_tokens = sum(len(ids) for ids in token_lists)
    if total_tokens == 0:
        return np.zeros((len(texts), self.dim), dtype=np.float32)

    # Single-pass fast path
    if total_tokens <= cap_b or len(texts) <= 1:
        return self._encode_subbatch(token_lists, normalize=normalize)

    # Multi-pass path: flush the running group whenever adding the next
    # text would push it over cap_b. Normalization is deferred to the end.
    out = np.zeros((len(texts), self.dim), dtype=np.float32)
    start = 0
    running = 0
    for i, ids in enumerate(token_lists):
        if i > start and running + len(ids) > cap_b:
            out[start:i] = self._encode_subbatch(token_lists[start:i], normalize=False)
            start = i
            running = 0
        running += len(ids)
    out[start:] = self._encode_subbatch(token_lists[start:], normalize=False)

    if normalize:
        self._normalize_inplace(out)
    return out
|
| 535 |
+
|
| 536 |
+
def encode_batch_cached(
    self,
    texts: Sequence[str],
    *,
    normalize: bool = True,
    cache_path: Optional[Union[str, Path]] = None,
    **encode_kwargs,
) -> np.ndarray:
    """Encode with a SHA-1-keyed on-disk cache for fast re-indexing.

    The fingerprint is a SHA-1 over ``dim``, the literal tag ``v2``, the
    number of texts, and the texts themselves in the given order. A cache
    entry is reused only when the fingerprint, ``dim``, the ``normalize``
    flag and the extra encode options all match and the stored array has
    shape ``(len(texts), dim)``; otherwise the texts are re-encoded and
    the cache files are rewritten.

    NOTE: fitted state (SIF weights, principal directions, bound file
    paths) is not part of the fingerprint; use a different ``cache_path``
    after changing it.

    Args:
        texts: input strings.
        normalize: forwarded to :meth:`encode_batch`.
        cache_path: path whose suffix is replaced (``Path.with_suffix``)
            by ``.npy`` for the embeddings and ``.json`` for the metadata.
            Falls back to ``self.cache_path``; when both are ``None`` no
            caching happens.
        **encode_kwargs: forwarded to :meth:`encode_batch`.

    Returns:
        Array of shape ``(len(texts), dim)``.
    """
    if cache_path is None:
        cache_path = self.cache_path
    if cache_path is None:
        return self.encode_batch(texts, normalize=normalize, **encode_kwargs)

    cache_path = Path(cache_path)
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    emb_path = cache_path.with_suffix(".npy")
    meta_path = cache_path.with_suffix(".json")

    import hashlib
    h = hashlib.sha1()
    h.update(f"{self.dim}|v2|{len(texts)}|".encode())
    for t in texts:
        h.update(t.encode("utf-8", errors="replace"))
        h.update(b"\x00")  # separator so ["ab", "c"] != ["a", "bc"]
    fp = h.hexdigest()

    # Settings that change the output but are not part of the text hash.
    want_normalize = bool(normalize)
    options = repr(sorted(encode_kwargs.items()))

    if meta_path.exists() and emb_path.exists():
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8"))
            if (
                isinstance(meta, dict)
                and meta.get("fingerprint") == fp
                and meta.get("dim") == self.dim
                and meta.get("normalize") == want_normalize
                and meta.get("options") == options
            ):
                cached = np.load(emb_path, mmap_mode=None)
                if cached.shape == (len(texts), self.dim):
                    return cached
        except (OSError, ValueError, EOFError):
            # Unreadable or corrupt cache files: fall through and rebuild.
            pass

    emb = self.encode_batch(texts, normalize=normalize, **encode_kwargs)
    np.save(emb_path, emb.astype(np.float32))
    meta_path.write_text(
        json.dumps({
            "fingerprint": fp,
            "dim": self.dim,
            "n": len(texts),
            "normalize": want_normalize,
            "options": options,
        }),
        encoding="utf-8",
    )
    return emb
|
| 579 |
+
|
| 580 |
+
def encode(self, texts: Union[str, Sequence[str]], *, normalize: bool = True) -> np.ndarray:
    """Encode a single string or a sequence of strings.

    A lone string is encoded as a one-item batch and its only row is
    returned (shape ``(dim,)``); any other input is converted to a list
    and the full ``(len, dim)`` batch result is returned.
    """
    is_single = isinstance(texts, str)
    batch = [texts] if is_single else list(texts)
    vectors = self.encode_batch(batch, normalize=normalize)
    return vectors[0] if is_single else vectors
|
| 589 |
+
|
| 590 |
+
# ---- search ---------------------------------------------------------
|
| 591 |
+
|
| 592 |
+
def search(
    self,
    queries: np.ndarray,
    index: np.ndarray,
    top_k: int = 10,
    *,
    index_normalized: bool = True,
) -> Tuple[np.ndarray, np.ndarray]:
    """Cosine search with optional file-extension score bias.

    Queries are L2-normalized (on a copy) and scored against ``index`` by
    dot product. If extension flags were bound via ``set_file_paths``,
    each query's own top ``bias_top_k`` documents receive an additive
    bias (``py_bonus`` for ``.py`` chunks, ``md_penalty`` for ``.md``
    chunks) before the final top-k selection. Documents outside a
    query's candidate pool are never biased for that query, so results
    do not depend on which other queries share the batch.

    Args:
        queries: ``(Q, dim)`` array, or a single ``(dim,)`` vector.
        index: ``(N, dim)`` document embeddings.
        top_k: number of results per query (clipped to ``N``).
        index_normalized: set to ``False`` to have a normalized copy of
            ``index`` used; the caller's array is not modified.

    Returns:
        ``(scores, indices)``, both of shape ``(Q, k)``, sorted by
        descending (biased) score. ``indices`` are row indices into
        ``index``.

    Raises:
        ValueError: if bound extension flags do not have one entry per
            row of ``index``.
    """
    queries = np.asarray(queries, dtype=np.float32)
    index = np.asarray(index, dtype=np.float32)
    if queries.ndim == 1:
        queries = queries[None, :]
    if not index_normalized:
        index = index.copy()
        self._normalize_inplace(index)
    qn = queries.copy()
    self._normalize_inplace(qn)

    scores = qn @ index.T
    n_docs = scores.shape[1]
    k = min(int(top_k), n_docs)
    if k <= 0:
        return (np.empty((queries.shape[0], 0), dtype=np.float32),
                np.empty((queries.shape[0], 0), dtype=np.int64))

    is_py = self._chunk_is_py
    is_md = self._chunk_is_md
    if is_py is not None or is_md is not None:
        # Per-document additive bias, built once for all queries.
        chunk_bias = np.zeros(n_docs, dtype=np.float32)
        for flags, delta in ((is_py, self.py_bonus), (is_md, self.md_penalty)):
            if flags is None:
                continue
            flags = np.asarray(flags, dtype=bool)
            if flags.shape != (n_docs,):
                raise ValueError(
                    f"file-extension flags cover {flags.shape[0] if flags.ndim else 0} "
                    f"chunks but the index has {n_docs} rows"
                )
            chunk_bias += np.where(flags, delta, 0.0).astype(np.float32)

        bias_pool = max(0, min(int(self.bias_top_k), n_docs))
        if bias_pool >= n_docs:
            # Every document is a candidate for every query.
            scores = scores + chunk_bias[None, :]
        elif bias_pool > 0:
            # Per-query candidate mask: only each query's own top
            # bias_pool documents are eligible for the bias.
            candidates = np.argpartition(-scores, bias_pool, axis=1)[:, :bias_pool]
            is_candidate = np.zeros(scores.shape, dtype=bool)
            np.put_along_axis(is_candidate, candidates, True, axis=1)
            scores = scores + np.where(is_candidate, chunk_bias[None, :], np.float32(0.0))

    if k == n_docs:
        idx = np.argsort(-scores, axis=1)[:, :k]
    else:
        # Partial selection first, then sort only the k survivors.
        part = np.argpartition(-scores, kth=k, axis=1)[:, :k]
        part_scores = np.take_along_axis(scores, part, axis=1)
        within = np.argsort(-part_scores, axis=1)
        idx = np.take_along_axis(part, within, axis=1)
    ordered_scores = np.take_along_axis(scores, idx, axis=1)
    return (ordered_scores.astype(np.float32, copy=False),
            idx.astype(np.int64, copy=False))
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f62f5ea97f10d6c9c66eb469143aff968aa856288a41b6fc1c84703b3abb951
|
| 3 |
+
size 4724744
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|