# backend/src/embeddings/benchmark_bge.py
import os
import time

import psutil
import torch
from sentence_transformers import SentenceTransformer


def benchmark_bge():
    print("🚀 Starting BGE-M3 Efficiency Benchmark...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"💻 Device: {device}")

    # Load the model once and report how long the load takes.
    print("📥 Loading BAAI/bge-m3...")
    start_load = time.time()
    model = SentenceTransformer("BAAI/bge-m3", device=device)
    print(f"⏱️ Load Time: {time.time() - start_load:.2f}s")

    # Resident set size of this process after the model is in memory.
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    print(f"📊 Memory Usage (RAM): {mem_info.rss / 1024 / 1024:.2f} MB")
    # Four distinct sentences repeated to form a 100-sentence workload.
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is transforming the recruitment industry.",
        "Candidate has 5 years of experience in Python and FastAPI.",
        "Looking for a Senior Software Engineer with cloud expertise.",
    ] * 25  # 100 sentences

    batch_sizes = [1, 4, 8, 16, 32]

    print("\n--- Latency vs Batch Size ---")
    print(f"{'Batch Size':<12} | {'Time (s)':<10} | {'Sec/Sent':<10} | {'Throughput (sent/s)':<20}")
    print("-" * 65)
    for bs in batch_sizes:
        # Warmup pass so one-off costs (CUDA kernel launches, caches)
        # don't skew the measurement.
        model.encode(sentences[:bs], batch_size=bs, show_progress_bar=False)

        # Timed pass over the full workload.
        start_time = time.time()
        model.encode(sentences, batch_size=bs, show_progress_bar=False)
        total_time = time.time() - start_time

        sec_per_sent = total_time / len(sentences)
        throughput = len(sentences) / total_time
        print(f"{bs:<12} | {total_time:<10.3f} | {sec_per_sent:<10.4f} | {throughput:<20.2f}")
print("\nβœ… Benchmark Complete.")
if __name__ == "__main__":
benchmark_bge()