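"""50-query benchmark for the hybrid CPU/GPU retrieval engine.

For each query, a CPU thread runs a vector search over pre-seeded memory
(OVHybridManager) while the main thread tokenizes; the two then sync and the
LLM generates an answer from the retrieved context. Results are written to
hybrid_benchmark_results.json.
"""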
import threading
import time
import sys
import os
import torch
import json
import platform
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
# Load Hybrid Engine
sys.path.append(os.getcwd())
from ov_hybrid_manager import OVHybridManager
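# NOTE: ov_hybrid_manager is a local module expected next to this script; the
# code below assumes it exposes load_memory(vectors, metadata) and
# search(query_vector) -> (index, score).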
MODEL_NAME = "ibm-granite/granite-3.0-2b-instruct"
# 10 hand-written fact queries with their expected answers (padded to 50 below)
QUERIES = [
    ("What is the speed of light?", "299,792,458 m/s"),
    ("Who wrote Romeo and Juliet?", "William Shakespeare"),
    ("Capital of France?", "Paris"),
    ("Boiling point of water?", "100 degrees Celsius"),
    ("What is 2+2?", "4"),
    ("Largest planet?", "Jupiter"),
    ("Smallest planet?", "Mercury"),
    ("Fastest animal?", "Peregrine Falcon"),
    ("Tallest mountain?", "Mount Everest"),
    ("Chemical symbol for Gold?", "Au"),
]
# Pad to 50 queries with synthetic fill facts
for i in range(40):
    QUERIES.append((f"What is fact #{i}?", f"Fact #{i} is validated."))
def run_50_benchmark():
    print(f"🚀 Starting 50-Query Hybrid Benchmark on {platform.processor()}...")

    # 1. Setup: embedder for retrieval, hybrid memory engine, and the LLM
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    mem_engine = OVHybridManager()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
    model.eval()
    # 2. Seed Memory: embed every ground-truth answer once and load it
    print("🧠 Seeding Memory...")
    facts = [q[1] for q in QUERIES]
    vecs = embedder.encode(facts)
    meta = [{"centrality": 1.0, "recency": 1.0, "weight": 1.0} for _ in facts]
    mem_engine.load_memory(vecs, meta)
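    # The centrality/recency/weight fields are uniform placeholders here;
    # presumably OVHybridManager can use them to weight or re-rank hits.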
    results = []
    print("\n⚡ RUNNING 50 PARALLEL QUERIES...")
    for i, (query, truth) in enumerate(QUERIES):
        q_vec = embedder.encode(query)
        shared_ctx = {"text": None, "t_cpu": 0.0, "t_gpu_wait": 0.0}
        # CPU thread: vector search against the seeded memory
        def cpu_task():
            t0 = time.time()
            idx, score = mem_engine.search(q_vec)
            shared_ctx["text"] = facts[idx]
            shared_ctx["t_cpu"] = (time.time() - t0) * 1000  # ms
        # GPU path: tokenize while the CPU search runs, then sync and generate
        def gpu_task():
            t0 = time.time()
            # Kick off the CPU search in parallel
            t_cpu = threading.Thread(target=cpu_task)
            t_cpu.start()
            # Simulate pre-fill: tokenize the raw query while the search runs
            _ = tokenizer(query, return_tensors="pt")
            time.sleep(0.005)  # tiny simulated overhead
            t_cpu.join()  # sync point: wait for the retrieved context
            shared_ctx["t_gpu_wait"] = (time.time() - t0) * 1000  # ms
            # Generate with the retrieved context prepended to the prompt
            prompt = f"Context: {shared_ctx['text']}\nQ: {query}\nA:"
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=10)
            return tokenizer.decode(out[0], skip_special_tokens=True)
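        # Despite the banner above, the 50 queries themselves run sequentially;
        # the parallelism is within each query (search overlapped with tokenize).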
        res_text = gpu_task()
        results.append({
            "id": i,
            "query": query,
            "truth_found": shared_ctx["text"],
            "model_output": res_text,
            "cpu_time_ms": shared_ctx["t_cpu"],
            "gpu_wait_ms": shared_ctx["t_gpu_wait"],  # total wait before generation could start
            "status": "SUCCESS" if shared_ctx["text"] == truth else "FAIL",
        })
        print(f"[{i+1}/50] CPU: {shared_ctx['t_cpu']:.2f}ms | Truth: {shared_ctx['text'][:20]}...")
    # 3. Save report with hardware info for reproducibility
    report = {
        "hardware": {
            "system": platform.system(),
            "processor": platform.processor(),
            "node": platform.node(),
        },
        "model": MODEL_NAME,
        "results": results,
    }
    with open("hybrid_benchmark_results.json", "w") as f:
        json.dump(report, f, indent=2)
    print("\n✅ Benchmark Complete. Results saved to hybrid_benchmark_results.json.")
if __name__ == "__main__":
    run_50_benchmark()