# NOTE(review): removed paste artifacts ("Spaces:" / "Runtime error" lines)
# that preceded the script — they were not part of the source code.
| #!/usr/bin/env python3 | |
| """ | |
| Evaluate RLM Needle-in-Haystack Model | |
| Compare Base vs Trained performance | |
| """ | |
| import os | |
| import re | |
| import json | |
| import random | |
| import string | |
| from datetime import datetime | |
| import torch | |
| from datasets import Dataset | |
| from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer | |
| from peft import PeftModel | |
# === CONFIG ===
BASE_MODEL = "Qwen/Qwen3-0.6B-Base"                 # frozen pretrained checkpoint
TRAINED_MODEL = "mindchain/qwen3-0.6b-rlm-needle"   # PEFT adapter repo (loaded on top of base)
NUM_TEST_SAMPLES = 20  # Quick eval

SEP = "=" * 70

# Banner: record exactly which checkpoints and sample count this run uses.
print(SEP)
print("π RLM Model Evaluation - Needle in Haystack")
print(SEP)
print(f"Base Model: {BASE_MODEL}")
print(f"Trained Model: {TRAINED_MODEL}")
print(f"Test Samples: {NUM_TEST_SAMPLES}")
print(SEP)
# === GENERATE TEST DATA (different from training) ===
def generate_test_data(num_samples=20, seed=123):
    """Build reproducible needle-in-haystack evaluation samples.

    Each sample hides one "<prefix> <needle>." sentence inside ~500 words of
    random lowercase gibberish and asks the model to report only the value.

    Args:
        num_samples: number of samples to generate.
        seed: RNG seed; fixed so the eval set is reproducible. (seed=123 is
            deliberately different from the training-data seed.)

    Returns:
        List of dicts with keys "prompt" (full task text) and "needle"
        (ground-truth value to recover).
    """
    # Private RNG instance: same sequence as random.seed(seed) + module-level
    # calls, but without clobbering the shared global `random` state.
    rng = random.Random(seed)
    needles = [
        ("The secret code is", "ALPHA9"),
        ("The magic number is", "99"),
        ("The password is", "omega2026"),
        ("The answer is", "23"),
        ("The key value is", "beta-gamma-3"),
        ("The hidden word is", "ephemeral"),
        ("The special ID is", "ID-999888"),
        ("The unique code is", "TIGER-42"),
        ("The mystery number is", "271828"),
        ("The secret phrase is", "crimson dawn"),
    ]
    samples = []
    for _ in range(num_samples):
        prefix, needle = rng.choice(needles)
        # Haystack: 500 random lowercase words (shorter than training, for speed).
        words = []
        for _ in range(500):
            word_len = rng.randint(3, 10)
            words.append(''.join(rng.choices(string.ascii_lowercase, k=word_len)))
        haystack = ' '.join(words)
        # Drop the needle sentence somewhere in the middle half of the text so
        # it is never trivially at the very start or end.
        insert_pos = rng.randint(len(haystack) // 4, 3 * len(haystack) // 4)
        context = haystack[:insert_pos] + f" {prefix} {needle}. " + haystack[insert_pos:]
        prompt = f"""Find the hidden information in this text.
The text contains a secret piece of information. Find it and report ONLY the value.
Text:
{context}
What is the hidden value?"""
        samples.append({
            "prompt": prompt,
            "needle": needle,
        })
    return samples
# Build the held-out eval set (seeded differently from the training data
# inside generate_test_data, so there is no train/test overlap).
print("\nπ Generating test data...")
test_data = generate_test_data(NUM_TEST_SAMPLES)
print(f"β {len(test_data)} test samples")
# === LOAD MODELS ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nDevice: {device}")
# 4-bit NF4 quantization keeps the 0.6B model cheap to load and evaluate.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # matmuls run in fp16 over 4-bit weights
    bnb_4bit_use_double_quant=True,  # second quantization pass over quant constants
)
# Load Base Model
print(f"\nπ¦ Loading base model: {BASE_MODEL}")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place layers on the available device(s)
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print("β Base model loaded")
# Load Trained Model (Base + Adapters)
# NOTE(review): PeftModel.from_pretrained injects the LoRA layers into
# `base_model`'s module tree IN PLACE, so after this line `base_model` and
# `trained_model` share modules — any later "base" evaluation is only
# adapter-free if the adapters are explicitly disabled. Verify downstream.
print(f"\nπ¦ Loading trained model: {TRAINED_MODEL}")
trained_model = PeftModel.from_pretrained(base_model, TRAINED_MODEL)
print("β Trained model loaded")
# === EVALUATION FUNCTION ===
# Punctuation/quotes that commonly wrap or trail the answer token.
_TRIM_CHARS = '.,;:!?"\'()'

def extract_needle(text):
    """Extract the predicted needle value from raw model output.

    Takes the last whitespace-separated token of the stripped generation and
    trims surrounding punctuation/quotes/brackets. Returns "" for empty or
    whitespace-only output.

    Note: this recovers only single-token answers; a multi-word needle such
    as "crimson dawn" is reduced to its last word, so callers must not rely
    on it alone for multi-word ground truths.
    """
    words = text.strip().split()
    if not words:
        return ""
    # '.' is part of _TRIM_CHARS, so the original trailing rstrip('.') was
    # redundant and has been dropped; '(' is added for symmetry with ')'.
    return words[-1].strip(_TRIM_CHARS)
def evaluate_model(model, tokenizer, samples, name="Model"):
    """Greedy-decode each sample and score needle-recovery accuracy.

    Args:
        model: causal LM exposing ``.generate()`` and ``.device`` (HF / PEFT).
        tokenizer: matching tokenizer.
        samples: list of ``{"prompt": str, "needle": str}`` dicts.
        name: label used in progress output.

    Returns:
        ``(accuracy, results)`` where ``results`` is a per-sample list of
        ``{"sample", "pred", "truth", "correct"}`` dicts.
    """
    print(f"\nπ Evaluating {name}...")
    # Base-model tokenizers frequently ship without a pad token; fall back to
    # EOS so generate() does not warn or fail on pad_token_id=None.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    correct = 0
    results = []
    for i, sample in enumerate(samples):
        prompt = sample["prompt"]
        truth = sample["needle"]
        # Deterministic greedy generation so the eval is reproducible.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=32,
                do_sample=False,
                pad_token_id=pad_id,
            )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Char-slice off the prompt; assumes decode round-trips the prompt
        # text verbatim — TODO confirm for this tokenizer when truncation hits.
        response = generated[len(prompt):].strip() if len(generated) > len(prompt) else generated
        # Extract prediction
        pred = extract_needle(response)
        # Case-insensitive compare. extract_needle keeps only the last word,
        # which made multi-word needles ("crimson dawn") unscoreable — so we
        # also accept the full truth appearing anywhere in the response.
        is_correct = (
            pred.lower() == truth.lower()
            or truth.lower() in response.lower()
        )
        if is_correct:
            correct += 1
        results.append({
            "sample": i + 1,
            "pred": pred,
            "truth": truth,
            "correct": is_correct,
        })
        if i < 3:  # Show the first 3 samples as a quick sanity check
            status = "β " if is_correct else "β"
            print(f" [{i+1}] {status} pred='{pred}' truth='{truth}'")
    # Guard against an empty sample list instead of raising ZeroDivisionError.
    accuracy = correct / len(samples) if samples else 0.0
    print(f"\nπ {name} Results:")
    print(f" Correct: {correct}/{len(samples)}")
    print(f" Accuracy: {accuracy*100:.1f}%")
    return accuracy, results
# === RUN EVALUATION ===
print("\n" + "="*70)
print("π Running Evaluation")
print("="*70)
# BUGFIX: PeftModel.from_pretrained mutated `base_model` in place (the LoRA
# layers were injected into its module tree), and the base pass ran AFTER the
# adapters were loaded — so both passes were actually measuring the trained
# model. Run the "base" pass with adapters explicitly disabled.
with trained_model.disable_adapter():
    base_acc, base_results = evaluate_model(base_model, tokenizer, test_data, "Base Model")
trained_acc, trained_results = evaluate_model(trained_model, tokenizer, test_data, "Trained Model")
# === SUMMARY ===
# Report both accuracies and a signed delta, then a one-line verdict.
bar = "=" * 70
delta = trained_acc - base_acc
print("\n" + bar)
print("π EVALUATION SUMMARY")
print(bar)
print(f"Base Model Accuracy: {base_acc*100:.1f}%")
print(f"Trained Model Accuracy: {trained_acc*100:.1f}%")
print(f"Improvement: {delta*100:+.1f}%")
if delta > 0:
    print("\nβ Training was successful! Model improved.")
elif delta == 0:
    print("\nβ οΈ No improvement detected.")
else:
    print("\nβ Model got worse after training.")
# Save results
# Persist the full evaluation record (config + per-sample outcomes) as JSON.
eval_results = dict(
    timestamp=datetime.now().isoformat(),
    base_model=BASE_MODEL,
    trained_model=TRAINED_MODEL,
    num_samples=NUM_TEST_SAMPLES,
    base_accuracy=base_acc,
    trained_accuracy=trained_acc,
    improvement=trained_acc - base_acc,
    base_results=base_results,
    trained_results=trained_results,
)
with open("eval_results.json", "w") as fh:
    json.dump(eval_results, fh, indent=2)
print(f"\nπΎ Results saved to eval_results.json")
print("="*70)