AmrYassinIsFree committed on
Commit
173f28e
·
1 Parent(s): a1ad6c7
Files changed (10) hide show
  1. README.md +68 -1
  2. bench.py +75 -0
  3. corpus.py +13 -0
  4. evals/__init__.py +5 -0
  5. evals/memory.py +27 -0
  6. evals/quality.py +17 -0
  7. evals/speed.py +29 -0
  8. models.py +23 -0
  9. report.py +34 -0
  10. requirements.txt +5 -0
README.md CHANGED
@@ -1,2 +1,69 @@
1
  # embedding-bench
2
- Compare text embedding models across retrieval performance, inference speed, and memory footprint.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # embedding-bench
2
+
3
+ Compare text embedding models across retrieval quality, inference speed, and memory footprint. Everything runs locally — no external API calls.
4
+
5
+ ## Models
6
+
7
+ | Key | Model | Role |
8
+ |-----|-------|------|
9
+ | `mpnet` | `sentence-transformers/all-mpnet-base-v2` | Baseline |
10
+ | `bge-small` | `BAAI/bge-small-en-v1.5` | |
11
+
12
+ ## Setup
13
+
14
+ ```bash
15
+ python3 -m venv .venv
16
+ source .venv/bin/activate
17
+ pip install -r requirements.txt
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ ```bash
23
+ # Full benchmark (quality + speed + memory)
24
+ python bench.py
25
+
26
+ # Specific models
27
+ python bench.py --models mpnet bge-small
28
+
29
+ # Skip expensive evals
30
+ python bench.py --skip-quality
31
+ python bench.py --skip-memory
32
+
33
+ # Tune corpus size, batch size, and number of timed runs
34
+ python bench.py --corpus-size 500 --batch-size 32 --num-runs 5
35
+ ```
36
+
37
+ ## Metrics
38
+
39
+ | Dimension | Metric | Method |
40
+ |-----------|--------|--------|
41
+ | Quality | Spearman rho | STS Benchmark test set (1,379 pairs) |
42
+ | Speed | Median encode time | Wall-clock over N runs with warmup |
43
+ | Memory | RSS delta (sampled after model load + encode) | Isolated subprocess via `psutil` |
44
+
45
+ ## Adding a model
46
+
47
+ Edit `models.py` and add an entry to `REGISTRY`:
48
+
49
+ ```python
50
+ "e5-small": ModelConfig(
51
+ name="e5-small-v2",
52
+ model_id="intfloat/e5-small-v2",
53
+ ),
54
+ ```
55
+
56
+ ## Project structure
57
+
58
+ ```
59
+ embedding-bench/
60
+ ├── bench.py # CLI entry point
61
+ ├── models.py # Model registry
62
+ ├── corpus.py # Sentence corpus builder
63
+ ├── report.py # Table formatting
64
+ ├── evals/
65
+ │ ├── quality.py # STS Benchmark evaluation
66
+ │ ├── speed.py # Latency measurement
67
+ │ └── memory.py # Memory measurement
68
+ └── requirements.txt
69
+ ```
bench.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+ from corpus import build_corpus
8
+ from evals import evaluate_memory, evaluate_quality, evaluate_speed
9
+ from models import REGISTRY
10
+ from report import print_report
11
+
12
+
13
def _positive_int(text: str) -> int:
    """Parse *text* as an int for argparse and reject values below 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {value}")
    return value


def main(argv: list[str] | None = None) -> None:
    """Run the benchmark CLI.

    Parses the command line, runs the selected evaluations (quality, speed,
    memory) for every requested model in ``REGISTRY``, and prints a comparison
    table via ``print_report``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Notes:
        ``--corpus-size``, ``--batch-size`` and ``--num-runs`` must be positive
        integers; invalid values are rejected by argparse before any model or
        dataset is loaded.
    """
    parser = argparse.ArgumentParser(
        prog="embedding-bench",
        description="Compare embedding models on quality, speed, and memory.",
    )
    parser.add_argument(
        "--models",
        nargs="+",
        default=list(REGISTRY.keys()),
        choices=list(REGISTRY.keys()),
        help="Models to benchmark (default: all)",
    )
    # Validate sizes up front so a bad value fails fast instead of after a
    # model has already been loaded.
    parser.add_argument("--corpus-size", type=_positive_int, default=1000)
    parser.add_argument("--batch-size", type=_positive_int, default=64)
    parser.add_argument("--num-runs", type=_positive_int, default=3)
    parser.add_argument("--skip-quality", action="store_true")
    parser.add_argument("--skip-speed", action="store_true")
    parser.add_argument("--skip-memory", action="store_true")

    args = parser.parse_args(argv)

    configs = [REGISTRY[k] for k in args.models]
    # First selected model flagged as baseline, if any; used for the legend.
    baseline_name = next((c.name for c in configs if c.is_baseline), None)

    # The corpus is only needed by the speed and memory evaluations.
    corpus: list[str] | None = None
    if not args.skip_speed or not args.skip_memory:
        print(f"Preparing corpus ({args.corpus_size} sentences)...")
        corpus = build_corpus(args.corpus_size)

    results = []
    for cfg in configs:
        print(f"\n{'='*50}")
        print(f"Benchmarking: {cfg.name}")
        print(f"{'='*50}")

        result: dict = {"name": cfg.name, "is_baseline": cfg.is_baseline}

        if not args.skip_quality:
            print(" Evaluating quality (STS Benchmark)...")
            model = SentenceTransformer(cfg.model_id)
            result["quality"] = evaluate_quality(model)
            print(f" Quality: {result['quality']:.4f}")
            del model

        if not args.skip_speed and corpus is not None:
            print(f" Evaluating speed ({args.num_runs} runs, {args.corpus_size} sentences)...")
            # A fresh model instance is loaded for the timing run.
            model = SentenceTransformer(cfg.model_id)
            result["speed"] = evaluate_speed(model, corpus, num_runs=args.num_runs, batch_size=args.batch_size)
            print(f" Speed: {result['speed']['sentences_per_second']} sent/s")
            del model

        if not args.skip_memory and corpus is not None:
            print(" Evaluating memory (isolated subprocess)...")
            result["memory_mb"] = evaluate_memory(cfg.model_id, corpus, batch_size=args.batch_size)
            print(f" Memory: {result['memory_mb']} MB")

        results.append(result)

    print_report(results, baseline_name=baseline_name)


if __name__ == "__main__":
    main()
corpus.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datasets import load_dataset
4
+
5
+
6
def build_corpus(size: int) -> list[str]:
    """Build a corpus of real sentences from the STS Benchmark dataset.

    Loads the ``test`` split of ``mteb/stsbenchmark-sts``, concatenates its
    ``sentence1`` and ``sentence2`` columns, and repeats that list as often as
    needed to reach ``size`` entries.

    Args:
        size: Number of sentences to return.

    Returns:
        Exactly ``size`` sentences, or an empty list when ``size`` is not
        positive.

    Raises:
        ValueError: If the dataset yields no sentences, since no amount of
            repetition could then reach ``size``.
    """
    if size <= 0:
        return []

    dataset = load_dataset("mteb/stsbenchmark-sts", split="test")
    sentences = list(dataset["sentence1"]) + list(dataset["sentence2"])
    if not sentences:
        # Repeating an empty list can never reach `size`; fail loudly rather
        # than loop forever.
        raise ValueError("STS Benchmark dataset returned no sentences")

    # Ceiling division: the smallest repeat count whose length covers `size`.
    repeats = -(-size // len(sentences))
    return (sentences * repeats)[:size]
evals/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from evals.quality import evaluate_quality
2
+ from evals.speed import evaluate_speed
3
+ from evals.memory import evaluate_memory
4
+
5
+ __all__ = ["evaluate_quality", "evaluate_speed", "evaluate_memory"]
evals/memory.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing
4
+ import os
5
+
6
+
7
def _measure(model_id: str, sentences: list[str], batch_size: int, queue: multiprocessing.Queue) -> None:
    """Load a model, encode ``sentences``, and report the RSS growth in bytes.

    Used as the target of a child process. The resident set size of the
    current process is read once before the model is loaded and once after
    encoding finishes; the difference is put on ``queue``.

    Note: the second reading is a single sample taken after ``encode``
    returns, so it is not a true high-water mark of memory use.
    """
    # Imported here so the cost is paid inside the measured process only.
    import psutil
    from sentence_transformers import SentenceTransformer

    this_process = psutil.Process(os.getpid())
    rss_before = this_process.memory_info().rss

    # Keep the model referenced until after the second RSS reading.
    model = SentenceTransformer(model_id)
    model.encode(sentences, batch_size=batch_size, show_progress_bar=False)

    rss_after = this_process.memory_info().rss
    queue.put(rss_after - rss_before)
17
+
18
+
19
def evaluate_memory(model_id: str, sentences: list[str], batch_size: int = 64) -> float:
    """Return memory delta in MB, measured in an isolated subprocess.

    Starts ``_measure`` in a freshly spawned process, waits for the byte delta
    it puts on a queue, and converts that to mebibytes rounded to one decimal.

    Args:
        model_id: Model identifier forwarded to the child process.
        sentences: Sentences the child encodes.
        batch_size: Encoding batch size forwarded to the child.

    Returns:
        RSS delta reported by the child, in MB (bytes / 1024**2).

    Raises:
        RuntimeError: If the child exits without reporting a result (for
            example because loading the model failed).
    """
    from queue import Empty  # local import: only this function needs it

    ctx = multiprocessing.get_context("spawn")
    q = ctx.Queue()
    p = ctx.Process(target=_measure, args=(model_id, sentences, batch_size, q))
    p.start()

    # Drain the queue BEFORE joining, and poll with a timeout so that a child
    # which died without enqueuing anything cannot block us forever.
    bytes_delta = None
    while bytes_delta is None:
        try:
            bytes_delta = q.get(timeout=1.0)
        except Empty:
            if not p.is_alive():
                # The child may have exited right after enqueuing; give the
                # result one last chance to arrive before giving up.
                try:
                    bytes_delta = q.get(timeout=1.0)
                except Empty:
                    break
    p.join()

    if bytes_delta is None:
        raise RuntimeError(
            f"memory measurement for {model_id!r} failed: "
            f"subprocess exited with code {p.exitcode} without reporting a result"
        )
    return round(bytes_delta / (1024 * 1024), 1)
evals/quality.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datasets import load_dataset
4
+ from sentence_transformers import SentenceTransformer
5
+ from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
6
+
7
+
8
def evaluate_quality(model: SentenceTransformer) -> float:
    """Return Spearman correlation on the STS Benchmark test set.

    Loads the ``test`` split of ``mteb/stsbenchmark-sts``, rescales the gold
    scores by dividing by 5.0, and runs ``EmbeddingSimilarityEvaluator`` on the
    sentence pairs.

    Args:
        model: The embedding model to evaluate.

    Returns:
        The ``spearman_cosine`` entry when the evaluator returns a mapping of
        metrics; otherwise the evaluator's single numeric result.
    """
    dataset = load_dataset("mteb/stsbenchmark-sts", split="test")
    sentences1 = list(dataset["sentence1"])
    sentences2 = list(dataset["sentence2"])
    scores = [s / 5.0 for s in dataset["score"]]

    evaluator = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
    results = evaluator(model)

    # Newer sentence-transformers releases return a dict of named metrics,
    # while older ones (still permitted by the `>=2.2.0` requirement) return a
    # bare float. Indexing that float would raise TypeError.
    # NOTE(review): on the bare-float path the value is the evaluator's "main"
    # score, which may not be strictly the cosine Spearman - confirm.
    if isinstance(results, dict):
        return float(results["spearman_cosine"])
    return float(results)
evals/speed.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import statistics
4
+ import time
5
+
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+
9
def evaluate_speed(
    model: SentenceTransformer,
    sentences: list[str],
    num_runs: int = 3,
    batch_size: int = 64,
) -> dict[str, float]:
    """Measure encoding latency. Returns median time and throughput.

    Performs one untimed warmup ``encode`` call, then times ``num_runs``
    further calls with ``time.perf_counter``.

    Args:
        model: Object exposing ``encode(sentences, batch_size=..., show_progress_bar=...)``.
        sentences: Sentences encoded on every run.
        num_runs: Number of timed runs; must be at least 1.
        batch_size: Batch size passed through to ``encode``.

    Returns:
        ``{"median_seconds": ..., "sentences_per_second": ...}`` where the
        median is rounded to 4 decimals and the throughput to 1 decimal.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    # Validate before the warmup so a bad argument does no work at all.
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Warmup run: excluded from the timings.
    model.encode(sentences, batch_size=batch_size, show_progress_bar=False)

    times: list[float] = []
    for _ in range(num_runs):
        start = time.perf_counter()
        model.encode(sentences, batch_size=batch_size, show_progress_bar=False)
        times.append(time.perf_counter() - start)

    median_time = statistics.median(times)
    # Guard against a zero reading from the clock instead of dividing by zero.
    throughput = len(sentences) / median_time if median_time > 0 else float("inf")
    return {
        "median_seconds": round(median_time, 4),
        "sentences_per_second": round(throughput, 1),
    }
models.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
@dataclass
class ModelConfig:
    """Describes one benchmarkable embedding model."""

    # Short display name.
    name: str
    # Identifier used to load the model.
    model_id: str
    # True for the model marked as the baseline; defaults to False.
    is_baseline: bool = False
11
+
12
+
13
# Maps each model key to its configuration; insertion order is preserved.
REGISTRY: dict[str, ModelConfig] = {
    # The only entry flagged as baseline.
    "mpnet": ModelConfig(
        "all-mpnet-base-v2",
        "sentence-transformers/all-mpnet-base-v2",
        is_baseline=True,
    ),
    "bge-small": ModelConfig(
        "bge-small-en-v1.5",
        "BAAI/bge-small-en-v1.5",
    ),
}
report.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Optional
4
+
5
+ from tabulate import tabulate
6
+
7
+
8
def print_report(results: list[dict[str, Any]], baseline_name: Optional[str] = None) -> None:
    """Print a formatted comparison table to stdout.

    Each entry in ``results`` must carry a ``"name"`` and may carry
    ``"is_baseline"``, ``"quality"``, ``"speed"`` (a dict with
    ``"sentences_per_second"`` and ``"median_seconds"``) and ``"memory_mb"``.
    Missing measurements are shown as an em dash. When ``baseline_name`` is
    truthy, a legend line explaining the ``[B]`` marker follows the table.
    """
    missing = "—"
    table: list[list[Any]] = []

    for entry in results:
        # Baseline rows get a marker appended to their display name.
        label = entry["name"] + (" [B]" if entry.get("is_baseline") else "")

        quality = entry.get("quality")
        speed = entry.get("speed")
        memory = entry.get("memory_mb")

        quality_cell = missing if quality is None else f"{quality:.4f}"
        throughput_cell = f"{speed['sentences_per_second']}" if speed else missing
        median_cell = f"{speed['median_seconds']}" if speed else missing
        memory_cell = missing if memory is None else f"{memory}"

        table.append([label, quality_cell, throughput_cell, median_cell, memory_cell])

    column_titles = ["Model", "Quality (STS)", "Speed (sent/s)", "Median Time (s)", "Memory (MB)"]

    print()
    print(tabulate(table, headers=column_titles, tablefmt="simple"))
    if baseline_name:
        print(f"\n[B] = baseline ({baseline_name})")
    print()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sentence-transformers>=2.2.0
2
+ torch
3
+ datasets
4
+ psutil
5
+ tabulate