#!/usr/bin/env python3
"""
Evaluate RLM Needle-in-Haystack Model
Compare Base vs Trained performance
"""
import os
import re
import json
import random
import string
from datetime import datetime
import torch
from datasets import Dataset
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# === CONFIG ===
BASE_MODEL = "Qwen/Qwen3-0.6B-Base"                # frozen base checkpoint
TRAINED_MODEL = "mindchain/qwen3-0.6b-rlm-needle"  # LoRA adapters on the Hub
NUM_TEST_SAMPLES = 20  # quick eval; raise for a tighter accuracy estimate

# Banner so the run log is self-describing.
print("=" * 70)
print("🔬 RLM Model Evaluation - Needle in Haystack")
print("=" * 70)
print(f"Base Model: {BASE_MODEL}")
print(f"Trained Model: {TRAINED_MODEL}")
print(f"Test Samples: {NUM_TEST_SAMPLES}")
print("=" * 70)
# === GENERATE TEST DATA (different from training) ===
# === GENERATE TEST DATA (different from training) ===
def generate_test_data(num_samples=20, seed=123, haystack_words=500):
    """Build held-out needle-in-haystack evaluation samples.

    Each sample embeds one (prefix, needle) pair at a random position inside
    a haystack of random lowercase words, then wraps the text in a retrieval
    prompt asking the model to report only the hidden value.

    Args:
        num_samples: number of samples to generate.
        seed: RNG seed; differs from the training seed so eval data is held out.
        haystack_words: filler words per sample (default keeps the original
            shorter-for-eval setting).

    Returns:
        list of dicts with keys "prompt" (full task text) and "needle"
        (ground-truth value the model must report).
    """
    random.seed(seed)
    needles = [
        ("The secret code is", "ALPHA9"),
        ("The magic number is", "99"),
        ("The password is", "omega2026"),
        ("The answer is", "23"),
        ("The key value is", "beta-gamma-3"),
        ("The hidden word is", "ephemeral"),
        ("The special ID is", "ID-999888"),
        ("The unique code is", "TIGER-42"),
        ("The mystery number is", "271828"),
        ("The secret phrase is", "crimson dawn"),
    ]
    samples = []
    for _ in range(num_samples):
        prefix, needle = random.choice(needles)
        # Random lowercase words of length 3-10 form the haystack.
        words = []
        for _ in range(haystack_words):
            word_len = random.randint(3, 10)
            words.append(''.join(random.choices(string.ascii_lowercase, k=word_len)))
        haystack = ' '.join(words)
        # Insert the needle somewhere in the middle half of the haystack so it
        # is never trivially at the very start or end.
        insert_pos = random.randint(len(haystack) // 4, 3 * len(haystack) // 4)
        context = haystack[:insert_pos] + f" {prefix} {needle}. " + haystack[insert_pos:]
        prompt = f"""Find the hidden information in this text.
The text contains a secret piece of information. Find it and report ONLY the value.
Text:
{context}
What is the hidden value?"""
        samples.append({
            "prompt": prompt,
            "needle": needle,
        })
    return samples
# Build the held-out eval set (extraction had split this print mid-string).
print("\n📊 Generating test data...")
test_data = generate_test_data(NUM_TEST_SAMPLES)
print(f"✅ {len(test_data)} test samples")

# === LOAD MODELS ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nDevice: {device}")
# 4-bit NF4 quantization keeps the 0.6B model small enough for modest GPUs.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load Base Model
print(f"\n📦 Loading base model: {BASE_MODEL}")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
print("✅ Base model loaded")

# Load Trained Model (Base + Adapters)
# NOTE(review): PeftModel.from_pretrained injects the LoRA modules into
# base_model IN PLACE, so after this line "base_model" is no longer a pristine
# base model. The base evaluation below must disable the adapters (or load a
# separate copy) to be meaningful.
print(f"\n📦 Loading trained model: {TRAINED_MODEL}")
trained_model = PeftModel.from_pretrained(base_model, TRAINED_MODEL)
print("✅ Trained model loaded")
# === EVALUATION FUNCTION ===
def extract_needle(text):
    """Extract the predicted needle value from raw model output.

    Heuristic: take the last whitespace-separated token and trim surrounding
    punctuation/quotes, since the prompt asks for the value alone. Returns ""
    for empty/whitespace-only output.
    """
    words = text.split()  # split() already ignores leading/trailing whitespace
    if not words:
        return ""
    # Same punctuation set as before; strip() already removes trailing '.',
    # so the old redundant .rstrip('.') is dropped (behavior unchanged).
    return words[-1].strip('.,;:!?"\')')
def evaluate_model(model, tokenizer, samples, name="Model"):
    """Greedy-decode each sample and score needle-retrieval accuracy.

    Args:
        model: causal LM exposing .generate() and .device.
        tokenizer: tokenizer matching the model.
        samples: list of {"prompt", "needle"} dicts.
        name: label used in progress output.

    Returns:
        (accuracy, results): accuracy in [0, 1] and a per-sample list of
        {"sample", "pred", "truth", "correct"} dicts.
    """
    print(f"\n🔍 Evaluating {name}...")
    correct = 0
    results = []
    # Base models (e.g. Qwen base checkpoints) may define no pad token; fall
    # back to EOS so generate() does not warn or fail on padding.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    for i, sample in enumerate(samples):
        prompt = sample["prompt"]
        truth = sample["needle"]
        # Greedy generation (do_sample=False) keeps the eval deterministic.
        # NOTE(review): truncation at 2048 tokens may clip a late-positioned
        # needle out of the context — confirm the haystack fits.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=32,
                do_sample=False,
                pad_token_id=pad_id,
            )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Drop the echoed prompt; keep only the newly generated continuation.
        response = generated[len(prompt):].strip() if len(generated) > len(prompt) else generated
        pred = extract_needle(response)
        # Case-insensitive match. Multi-word needles (e.g. "crimson dawn") can
        # never equal a single trailing token, so also accept the exact truth
        # appearing anywhere in the generated response.
        is_correct = pred.lower() == truth.lower() or truth.lower() in response.lower()
        if is_correct:
            correct += 1
        results.append({
            "sample": i + 1,
            "pred": pred,
            "truth": truth,
            "correct": is_correct,
        })
        if i < 3:  # show the first few predictions as a sanity check
            status = "✅" if is_correct else "❌"
            print(f" [{i+1}] {status} pred='{pred}' truth='{truth}'")
    # Guard against an empty sample list rather than raising ZeroDivisionError.
    accuracy = correct / len(samples) if samples else 0.0
    print(f"\n📊 {name} Results:")
    print(f" Correct: {correct}/{len(samples)}")
    print(f" Accuracy: {accuracy*100:.1f}%")
    return accuracy, results
# === RUN EVALUATION ===
print("\n" + "="*70)
print("🚀 Running Evaluation")
print("="*70)
# PeftModel.from_pretrained injected the LoRA layers into base_model in place,
# so "base_model" alone is NOT adapter-free. Temporarily disable the adapters
# to measure true base-model performance; otherwise both runs score the same
# adapted model and the comparison is meaningless.
with trained_model.disable_adapter():
    base_acc, base_results = evaluate_model(base_model, tokenizer, test_data, "Base Model")
trained_acc, trained_results = evaluate_model(trained_model, tokenizer, test_data, "Trained Model")
# === SUMMARY ===
print("\n" + "="*70)
print("📊 EVALUATION SUMMARY")
print("="*70)
print(f"Base Model Accuracy: {base_acc*100:.1f}%")
print(f"Trained Model Accuracy: {trained_acc*100:.1f}%")
print(f"Improvement: {(trained_acc - base_acc)*100:+.1f}%")
# Verdict line (the extraction had split the first message mid-string).
if trained_acc > base_acc:
    print("\n✅ Training was successful! Model improved.")
elif trained_acc == base_acc:
    print("\n⚠️ No improvement detected.")
else:
    print("\n❌ Model got worse after training.")
# Save results for later comparison across runs.
eval_results = {
    "timestamp": datetime.now().isoformat(),
    "base_model": BASE_MODEL,
    "trained_model": TRAINED_MODEL,
    "num_samples": NUM_TEST_SAMPLES,
    "base_accuracy": base_acc,
    "trained_accuracy": trained_acc,
    "improvement": trained_acc - base_acc,
    "base_results": base_results,
    "trained_results": trained_results,
}
with open("eval_results.json", "w") as f:
    json.dump(eval_results, f, indent=2)
print("\n💾 Results saved to eval_results.json")  # f-prefix dropped: no placeholders
print("="*70)
|