# Qwen3-0.6B-Looped / test_loop_generation.py
# Uploaded by coolpoodle — training / testing / eval file
# (commit b871d11, verified)
#!/usr/bin/env python3
"""Test generation with Loop Attention (use_cache=False)."""
import sys
import torch
sys.path.insert(0, '/content')
from modeling_qwen_loop import Qwen3LoopForCausalLM
from transformers import AutoTokenizer
# Paths to the base model and the trained loop-gate checkpoint.
MODEL_PATH = "/content/Qwen3-0.6B"
GATE_PATH = "/content/Qwen3-0.6B-looped/checkpoints/gate_projections_epoch_3.pt"

print("\n1. Loading model...")
model = Qwen3LoopForCausalLM.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("2. Loading trained gates...")
# NOTE(review): torch.load unpickles arbitrary objects — only load
# checkpoints you trust (consider weights_only=True if the file is
# tensors-only; confirm against the training script that wrote it).
gate_state = torch.load(GATE_PATH, map_location=device)
# Restore each trained gate projection into its attention layer.
# Keys are assumed shaped like "layers.{idx}....{weight|bias}", so
# parts[1] is the layer index — TODO confirm with the training script.
for key, value in gate_state.items():
    parts = key.split('.')
    layer_idx = int(parts[1])
    param_name = parts[-1]
    gate = model.model.layers[layer_idx].self_attn.gate
    if param_name == 'weight':
        gate.weight.data = value.to(device)
    elif param_name == 'bias':
        gate.bias.data = value.to(device)
print(" Gates loaded!")
# Tokenizer for the base model; reuse EOS as the padding token since the
# checkpoint ships without a dedicated pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token

# Inference only — switch off training-time behavior such as dropout.
model.eval()

# A small mixed bag of prompts: factual, code, open-ended, instructional.
prompts = [
    "The capital of France is",
    "def fibonacci(n):",
    "In the year 2050,",
    "The quick brown fox",
    "Explain quantum computing in simple terms:",
]
# Sample a short continuation for each prompt. use_cache=False because
# Loop Attention (per the module docstring) is tested without the KV cache.
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(
            input_ids=inputs.input_ids,
            # Pass the mask explicitly: pad_token == eos_token, so
            # generate() cannot infer it from the ids and warns.
            attention_mask=inputs.attention_mask,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            use_cache=False,
            pad_token_id=tokenizer.eos_token_id,
        )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"Output: {text}")