| """ |
| Needle-in-a-Haystack test for TurboQuant. |
| |
| Hides a specific fact in a long document and checks if the model can retrieve it. |
| This is the paper's flagship benchmark (0.997 recall at 4x compression). |
| """ |
|
|
| import sys |
| sys.path.insert(0, "/home/azureuser/turboquant") |
|
|
| import torch |
| import gc |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
| from turboquant.cache import TurboQuantCache |
|
|
# The fact hidden in the long context; retrieving this exact string is what
# the benchmark measures.
NEEDLE = "The secret code for the treasure chest is BLUE-DRAGON-42."


# Filler paragraph that is repeated as many times as needed to pad the
# context out to the target token length.
HAYSTACK_UNIT = (
    "The history of artificial intelligence began in antiquity, with myths and stories of "
    "artificial beings endowed with intelligence by master craftsmen. Classical philosophers "
    "attempted to describe the process of human thinking as the mechanical manipulation of "
    "symbols. This work culminated in the invention of the programmable digital computer in "
    "the 1940s. Alan Turing proposed that machines could simulate any conceivable act of "
    "mathematical reasoning. The field of AI research was founded at a workshop at Dartmouth "
    "College in 1956. Early AI programs solved algebra problems, proved theorems, and learned "
    "to speak English. By the mid-1960s, research was heavily funded by the Department of "
    "Defense. In the 1970s, AI faced criticism and funding cuts known as the AI winter. "
    "Expert systems were developed in the 1980s, and neural networks regained popularity. "
    "Deep learning breakthroughs in the 2010s led to dramatic advances in computer vision "
    "and natural language processing. Today, AI powers search engines, recommendation systems, "
    "autonomous vehicles, and language models that can generate human-like text. "
)


# Retrieval question appended after the haystack; the answer is the NEEDLE.
QUESTION = "What is the secret code for the treasure chest?"
|
|
|
|
def build_prompt(context_tokens, tokenizer, needle_position=0.5, *,
                 needle=None, haystack_unit=None, question=None):
    """Build a prompt with a needle hidden in a haystack at the given position.

    Args:
        context_tokens: Target total prompt length in tokens. A ~50-token
            budget is reserved for the question/instruction suffix.
        tokenizer: Object providing ``encode(str) -> list`` and
            ``decode(list) -> str`` (e.g. a HF tokenizer).
        needle_position: Fractional depth in [0, 1] at which the needle is
            spliced into the haystack (0 = start, 1 = end).
        needle: Fact to hide; defaults to the module-level NEEDLE.
        haystack_unit: Filler text repeated to fill the budget; defaults to
            the module-level HAYSTACK_UNIT.
        question: Retrieval question appended at the end; defaults to the
            module-level QUESTION.

    Returns:
        The full prompt string: haystack with the needle inserted, followed
        by the question.
    """
    if needle is None:
        needle = NEEDLE
    if haystack_unit is None:
        haystack_unit = HAYSTACK_UNIT
    if question is None:
        question = QUESTION

    haystack_tokens = tokenizer.encode(haystack_unit)
    needle_tokens = tokenizer.encode(needle)
    # Reserve ~50 tokens for the question/formatting. Clamp at 0: the
    # original could go negative here, and a negative slice bound below
    # would then keep almost the whole haystack instead of none of it.
    target_hay_tokens = max(context_tokens - len(needle_tokens) - 50, 0)

    # Repeat the filler enough times to cover the token budget (guard
    # against an empty haystack encoding to avoid division by zero).
    n_repeats = target_hay_tokens // max(len(haystack_tokens), 1) + 1
    full_haystack = haystack_unit * n_repeats

    # Re-encode the repeated text and truncate to the exact budget.
    hay_encoded = tokenizer.encode(full_haystack)[:target_hay_tokens]

    # Splice the needle in at the requested fractional depth (token level).
    insert_idx = int(len(hay_encoded) * needle_position)
    combined = hay_encoded[:insert_idx] + needle_tokens + hay_encoded[insert_idx:]
    combined_text = tokenizer.decode(combined)

    prompt = f"{combined_text}\n\nBased on the text above, answer this question: {question}"
    return prompt
|
|
|
|
def test_needle(model, tokenizer, context_length, needle_position=0.5, use_turboquant=False, skip_layers=None):
    """Run one needle test and check if the model retrieves the answer.

    Args:
        model: Causal LM with ``.device``, ``.config`` and ``.generate``.
        tokenizer: Matching tokenizer (callable, with ``decode``).
        context_length: Target prompt length in tokens (also used as the
            truncation limit when tokenizing).
        needle_position: Fractional depth of the needle in the haystack.
        use_turboquant: If True, generate with a TurboQuantCache instead of
            the default transformers KV cache.
        skip_layers: Layer indices TurboQuant should leave unquantized
            (only used when ``use_turboquant`` is True).

    Returns:
        Dict with the actual prompt length, needle position, a ``found``
        bool, and the first 200 chars of the generated answer.
    """
    prompt = build_prompt(context_length, tokenizer, needle_position)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_length).to(model.device)
    actual_len = inputs.input_ids.shape[1]

    # Build a fresh cache per run so no quantized state leaks between tests.
    if use_turboquant:
        cache = TurboQuantCache(model.config, nbits=4, residual_length=128,
                                device="cuda", skip_layers=skip_layers or set())
    else:
        cache = None  # let transformers allocate its default KV cache

    with torch.no_grad():
        output = model.generate(
            **inputs, max_new_tokens=50, do_sample=False,
            past_key_values=cache,
        )
    # Decode only the newly generated tokens (everything past the prompt).
    answer = tokenizer.decode(output[0][actual_len:], skip_special_tokens=True)

    # Accept the exact code, or all three components appearing separately
    # (the model may reformat the hyphenation). Parenthesized explicitly:
    # the original relied on `and` binding tighter than `or`.
    found = ("BLUE-DRAGON-42" in answer) or (
        "BLUE" in answer and "DRAGON" in answer and "42" in answer
    )

    # Drop the cache reference before returning so the caller's
    # gc.collect()/torch.cuda.empty_cache() can reclaim its GPU buffers.
    del cache
    return {
        "context_length": actual_len,
        "needle_position": needle_position,
        "found": found,
        "answer": answer[:200],
    }
|
|
|
|
def main():
    """Run the needle-retrieval grid, comparing the default KV cache against TurboQuant."""
    model_id = "Qwen/Qwen2.5-7B-Instruct"
    print(f"Loading {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # 4-bit NF4 weight quantization (bitsandbytes) with bfloat16 compute;
    # device_map="auto" shards across available GPUs.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", trust_remote_code=True, dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4",
        ),
    )
    print(f"Loaded: {torch.cuda.memory_allocated()/1024**3:.1f} GB")

    # Layers TurboQuant should leave unquantized — presumably the ones the
    # calibration pass finds quantization-sensitive; see TurboQuantCache.
    skip = TurboQuantCache.calibrate_skip_layers(model, tokenizer)
    print(f"Skip layers: {skip}")

    # Test grid: context lengths x needle depths (fractions of the haystack).
    context_lengths = [1024, 2048, 4096, 8192, 16384]
    positions = [0.25, 0.5, 0.75]

    print(f"\n{'Context':>8} {'Position':>8} | {'Default':>10} {'TurboQuant':>12} | {'Match':>6}")
    print("-" * 60)

    # Running totals of successful retrievals (bools sum as 0/1).
    total_default = 0
    total_tq = 0
    total_tests = 0

    for ctx in context_lengths:
        for pos in positions:
            # Baseline: default (unquantized) KV cache.
            r_default = test_needle(model, tokenizer, ctx, pos, use_turboquant=False)
            gc.collect(); torch.cuda.empty_cache()

            # Same prompt with the TurboQuant 4-bit KV cache.
            r_tq = test_needle(model, tokenizer, ctx, pos, use_turboquant=True, skip_layers=skip)
            gc.collect(); torch.cuda.empty_cache()

            # "Match" means both configurations agreed (both found or both missed).
            match = r_default["found"] == r_tq["found"]
            total_default += r_default["found"]
            total_tq += r_tq["found"]
            total_tests += 1

            d_str = "FOUND" if r_default["found"] else "MISS"
            t_str = "FOUND" if r_tq["found"] else "MISS"
            m_str = "=" if match else "DIFF"

            print(f"{r_default['context_length']:>8} {pos:>8.2f} | {d_str:>10} {t_str:>12} | {m_str:>6}")

            # On a TurboQuant miss, show a snippet of the wrong answer for debugging.
            if not r_tq["found"]:
                print(f" TQ answer: {r_tq['answer'][:80]}")

    print(f"\nResults: Default {total_default}/{total_tests}, TurboQuant {total_tq}/{total_tests}")
    print(f"Default recall: {100*total_default/total_tests:.1f}%")
    print(f"TurboQuant recall: {100*total_tq/total_tests:.1f}%")
|
|
|
|
# Only run the benchmark when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|