"""
Simple one-off loader to verify Qwen model availability and a basic generation.

Run:
    /usr/bin/python3 scripts/test_qwen_load.py --model Qwen/Qwen2.5-1.5B-Instruct --prompt "Test prompt"
"""
from __future__ import annotations

import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def main():
    parser = argparse.ArgumentParser(description="Test loading Qwen and run a short generation.")
    parser.add_argument("--model", default="Qwen/Qwen2.5-1.5B-Instruct")
    parser.add_argument("--prompt", default="Give me a short introduction to large language models.")
    args = parser.parse_args()
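
    # torch_dtype="auto" reads the checkpoint's preferred dtype from its config;
    # device_map="auto" uses accelerate to place weights on the available devices.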
print(f"Loading model: {args.model}")
model = AutoModelForCausalLM.from_pretrained(
args.model,
torch_dtype="auto",
device_map="auto",
)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
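
    # Render the conversation with the model's chat template; add_generation_prompt
    # appends the assistant header so the model continues as the assistant.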
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": args.prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
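
    # Inference only: torch.no_grad() skips autograd bookkeeping and saves memory.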
print("Generating...")
with torch.no_grad():
output_ids = model.generate(
**inputs,
max_new_tokens=128,
temperature=0.1,
do_sample=False,
top_p=0.9,
)
    # Strip the prompt tokens so only the newly generated text is decoded.
    gen_ids = [out[len(inp):] for inp, out in zip(inputs["input_ids"], output_ids)]
    response = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)[0]
print("Response:")
print(response)
if __name__ == "__main__":
main()