"""50-query benchmark for the hybrid CPU/GPU memory pipeline."""

import json
import os
import platform
import sys
import threading
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Make the local hybrid-memory module importable when the script is run
# from the repository root.
sys.path.append(os.getcwd())
from ov_hybrid_manager import OVHybridManager
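
# NOTE: OVHybridManager's interface is assumed from its call sites below:
# load_memory(vectors, metadata) seeds the store, and search(query_vec)
# returns an (index, score) pair. Adjust here if the local module differs.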


MODEL_NAME = "ibm-granite/granite-3.0-2b-instruct"


# Ten hand-written (query, expected answer) pairs.
QUERIES = [
    ("What is the speed of light?", "299,792,458 m/s"),
    ("Who wrote Romeo and Juliet?", "William Shakespeare"),
    ("Capital of France?", "Paris"),
    ("Boiling point of water?", "100 degrees Celsius"),
    ("What is 2+2?", "4"),
    ("Largest planet?", "Jupiter"),
    ("Smallest planet?", "Mercury"),
    ("Fastest animal?", "Peregrine Falcon"),
    ("Tallest mountain?", "Mount Everest"),
    ("Chemical symbol for Gold?", "Au"),
]

# Pad the set to exactly 50 queries with synthetic facts.
for i in range(40):
    QUERIES.append((f"What is fact #{i}?", f"Fact #{i} is validated."))


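# Benchmark flow: seed the memory store with all 50 ground-truth answers,
# then, per query, run the vector search on a worker thread concurrently
# with tokenizer prep on the main thread before generating a short answer.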
def run_50_benchmark():
    print(f"🚀 Starting 50-Query Hybrid Benchmark on {platform.processor()}...")

    # Retrieval and generation components.
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    mem_engine = OVHybridManager()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # float32 keeps CPU inference numerically stable; prefer float16 or
    # bfloat16 if this is moved onto a GPU.
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
    model.eval()

| print("🧠 Seeding Memory...") |
| facts = [q[1] for q in QUERIES] |
| vecs = embedder.encode(facts) |
| meta = [{"centrality": 1.0, "recency": 1.0, "weight": 1.0} for _ in facts] |
| mem_engine.load_memory(vecs, meta) |
| |
| results = [] |
| |
| print("\n⚡ RUNNING 50 PARALLEL QUERIES...") |
| |
    for i, (query, truth) in enumerate(QUERIES):
        q_vec = embedder.encode(query)
        shared_ctx = {"text": None, "t_cpu": 0, "t_gpu_wait": 0}

        def cpu_task():
            # CPU lane: vector search over the seeded memory.
            t0 = time.perf_counter()
            idx, _score = mem_engine.search(q_vec)
            shared_ctx["text"] = facts[idx]
            shared_ctx["t_cpu"] = (time.perf_counter() - t0) * 1000

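        # Overlap pattern: the vector search runs on a worker thread while
        # the main thread does tokenizer prep, so retrieval latency hides
        # behind generation setup; t_gpu_wait records how long the main
        # thread was blocked before the retrieved context became available.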
        def gpu_task():
            t0 = time.perf_counter()
            t_cpu = threading.Thread(target=cpu_task)
            t_cpu.start()

            # Stand-in prep work that overlaps with the memory search:
            # tokenize the raw query and yield briefly to the worker thread.
            _ = tokenizer(query, return_tensors="pt")
            time.sleep(0.005)

            t_cpu.join()
            shared_ctx["t_gpu_wait"] = (time.perf_counter() - t0) * 1000

            # Generate a short answer conditioned on the retrieved context.
            prompt = f"Context: {shared_ctx['text']}\nQ: {query}\nA:"
            inputs = tokenizer(prompt, return_tensors="pt")
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=10)
            return tokenizer.decode(out[0], skip_special_tokens=True)

        res_text = gpu_task()

        results.append({
            "id": i,
            "query": query,
            "truth_found": shared_ctx["text"],
            "model_output": res_text,
            "cpu_time_ms": shared_ctx["t_cpu"],
            "gpu_wait_ms": shared_ctx["t_gpu_wait"],
            "status": "SUCCESS" if shared_ctx["text"] == truth else "FAIL",
        })

        print(f"[{i+1}/50] CPU: {shared_ctx['t_cpu']:.2f}ms | Truth: {shared_ctx['text'][:20]}...")

    report = {
        "hardware": {
            "system": platform.system(),
            "processor": platform.processor(),
            "node": platform.node(),
        },
        "model": MODEL_NAME,
        "results": results,
    }

    with open("hybrid_benchmark_results.json", "w") as f:
        json.dump(report, f, indent=2)

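    # Quick retrieval-accuracy summary (an added convenience, not part of
    # the JSON report schema).
    passed = sum(1 for r in results if r["status"] == "SUCCESS")
    print(f"\n📊 Retrieval accuracy: {passed}/{len(results)}")
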
| print(f"\n✅ Benchmark Complete. Results saved.") |
|
|
| if __name__ == "__main__": |
| run_50_benchmark() |