#!/usr/bin/env python3
"""Test generation with Loop Attention (use_cache=False)."""

import sys

import torch

sys.path.insert(0, '/content')
from modeling_qwen_loop import Qwen3LoopForCausalLM  # local module under /content
from transformers import AutoTokenizer

MODEL_PATH = "/content/Qwen3-0.6B"
GATE_PATH = "/content/Qwen3-0.6B-looped/checkpoints/gate_projections_epoch_3.pt"

print("\n1. Loading model...")
model = Qwen3LoopForCausalLM.from_pretrained(MODEL_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print("2. Loading trained gates...")
# Copy the trained gate projections into each layer's attention module.
# Checkpoint keys are expected to look like 'layers.<layer_idx>. ... .<weight|bias>'.
gate_state = torch.load(GATE_PATH, map_location=device)
for key, value in gate_state.items():
    parts = key.split('.')
    layer_idx = int(parts[1])
    param_name = parts[-1]
    if param_name == 'weight':
        model.model.layers[layer_idx].self_attn.gate.weight.data = value.to(device)
    elif param_name == 'bias':
        model.model.layers[layer_idx].self_attn.gate.bias.data = value.to(device)
print("   Gates loaded!")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token  # Qwen3 has no dedicated pad token; reuse EOS

model.eval()

prompts = [
    "The capital of France is",
    "def fibonacci(n):",
    "In the year 2050,",
    "The quick brown fox",
    "Explain quantum computing in simple terms:",
]

for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,  # pass explicitly rather than letting generate() infer it
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            use_cache=False,  # Loop Attention recomputes the full sequence each step; KV cache stays off
            pad_token_id=tokenizer.eos_token_id,
        )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    print(f"\nPrompt: {prompt}")
    print(f"Output: {text}")