#!/usr/bin/env python3 """ Evaluate RLM Needle-in-Haystack Model Compare Base vs Trained performance """ import os import re import json import random import string from datetime import datetime import torch from datasets import Dataset from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer from peft import PeftModel # === CONFIG === BASE_MODEL = "Qwen/Qwen3-0.6B-Base" TRAINED_MODEL = "mindchain/qwen3-0.6b-rlm-needle" NUM_TEST_SAMPLES = 20 # Quick eval print("="*70) print("šŸ“Š RLM Model Evaluation - Needle in Haystack") print("="*70) print(f"Base Model: {BASE_MODEL}") print(f"Trained Model: {TRAINED_MODEL}") print(f"Test Samples: {NUM_TEST_SAMPLES}") print("="*70) # === GENERATE TEST DATA (different from training) === def generate_test_data(num_samples=20, seed=123): random.seed(seed) needles = [ ("The secret code is", "ALPHA9"), ("The magic number is", "99"), ("The password is", "omega2026"), ("The answer is", "23"), ("The key value is", "beta-gamma-3"), ("The hidden word is", "ephemeral"), ("The special ID is", "ID-999888"), ("The unique code is", "TIGER-42"), ("The mystery number is", "271828"), ("The secret phrase is", "crimson dawn"), ] samples = [] for i in range(num_samples): prefix, needle = random.choice(needles) # Generate haystack words = [] for _ in range(500): # Shorter for eval word_len = random.randint(3, 10) word = ''.join(random.choices(string.ascii_lowercase, k=word_len)) words.append(word) haystack = ' '.join(words) # Insert needle insert_pos = random.randint(len(haystack) // 4, 3 * len(haystack) // 4) context = haystack[:insert_pos] + f" {prefix} {needle}. " + haystack[insert_pos:] prompt = f"""Find the hidden information in this text. The text contains a secret piece of information. Find it and report ONLY the value. Text: {context} What is the hidden value?""" samples.append({ "prompt": prompt, "needle": needle, }) return samples print("\nšŸ“Š Generating test data...") test_data = generate_test_data(NUM_TEST_SAMPLES) print(f"āœ… {len(test_data)} test samples") # === LOAD MODELS === device = "cuda" if torch.cuda.is_available() else "cpu" print(f"\nDevice: {device}") quantization_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, ) # Load Base Model print(f"\nšŸ“¦ Loading base model: {BASE_MODEL}") base_model = AutoModelForCausalLM.from_pretrained( BASE_MODEL, quantization_config=quantization_config, device_map="auto", trust_remote_code=True, ) tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True) print("āœ… Base model loaded") # Load Trained Model (Base + Adapters) print(f"\nšŸ“¦ Loading trained model: {TRAINED_MODEL}") trained_model = PeftModel.from_pretrained(base_model, TRAINED_MODEL) print("āœ… Trained model loaded") # === EVALUATION FUNCTION === def extract_needle(text): """Extract needle from model output""" text = text.strip() # Get last word/token words = text.split() if words: return words[-1].strip('.,;:!?"\')').rstrip('.') return "" def evaluate_model(model, tokenizer, samples, name="Model"): print(f"\nšŸ“Š Evaluating {name}...") correct = 0 results = [] for i, sample in enumerate(samples): prompt = sample["prompt"] truth = sample["needle"] # Generate inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=32, do_sample=False, pad_token_id=tokenizer.pad_token_id, ) generated = tokenizer.decode(outputs[0], skip_special_tokens=True) response = generated[len(prompt):].strip() if len(generated) > len(prompt) else generated # Extract prediction pred = extract_needle(response) # Compare (case-insensitive) is_correct = pred.lower() == truth.lower() if is_correct: correct += 1 results.append({ "sample": i+1, "pred": pred, "truth": truth, "correct": is_correct }) if i < 3: # Show first 3 status = "āœ…" if is_correct else "āŒ" print(f" [{i+1}] {status} pred='{pred}' truth='{truth}'") accuracy = correct / len(samples) print(f"\nšŸ“Š {name} Results:") print(f" Correct: {correct}/{len(samples)}") print(f" Accuracy: {accuracy*100:.1f}%") return accuracy, results # === RUN EVALUATION === print("\n" + "="*70) print("šŸš€ Running Evaluation") print("="*70) base_acc, base_results = evaluate_model(base_model, tokenizer, test_data, "Base Model") trained_acc, trained_results = evaluate_model(trained_model, tokenizer, test_data, "Trained Model") # === SUMMARY === print("\n" + "="*70) print("šŸ“Š EVALUATION SUMMARY") print("="*70) print(f"Base Model Accuracy: {base_acc*100:.1f}%") print(f"Trained Model Accuracy: {trained_acc*100:.1f}%") print(f"Improvement: {(trained_acc - base_acc)*100:+.1f}%") if trained_acc > base_acc: print("\nāœ… Training was successful! Model improved.") elif trained_acc == base_acc: print("\nāš ļø No improvement detected.") else: print("\nāŒ Model got worse after training.") # Save results eval_results = { "timestamp": datetime.now().isoformat(), "base_model": BASE_MODEL, "trained_model": TRAINED_MODEL, "num_samples": NUM_TEST_SAMPLES, "base_accuracy": base_acc, "trained_accuracy": trained_acc, "improvement": trained_acc - base_acc, "base_results": base_results, "trained_results": trained_results, } with open("eval_results.json", "w") as f: json.dump(eval_results, f, indent=2) print(f"\nšŸ’¾ Results saved to eval_results.json") print("="*70)