# Source: remiai3 — "Upload 7 files" (commit 9c4eaff, verified)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Path to the locally downloaded model weights and tokenizer files.
model_path = "./qwen2.5_1.5b_model"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Qwen tokenizers may not define a pad token; padding=True below requires one,
# so fall back to the EOS token (standard practice for causal LMs).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model for CPU inference.
# NOTE: float16 is a poor fit for CPU — many CPU kernels lack half-precision
# implementations and the rest run slowly; float32 is the safe CPU dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cpu",
    torch_dtype=torch.float32,  # full precision for reliable CPU execution
    trust_remote_code=True
)
model.eval()  # inference mode: disables dropout and similar training behavior

# Test input
prompt = "Solve the equation: 2x + 5 = 15"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=50).to("cpu")

# Generate output with attention mask.
# max_new_tokens bounds only the *generated* text; the old max_length=200
# counted the prompt tokens too, silently shrinking the answer budget.
with torch.no_grad():  # no gradients needed for generation — saves memory
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # explicitly pass attention mask
        max_new_tokens=200,
        num_return_sequences=1,
        do_sample=True,  # enable sampling
        top_k=50,        # top-k sampling
        top_p=0.9,       # nucleus (top-p) sampling
        pad_token_id=tokenizer.pad_token_id,  # silence missing-pad-token warning
    )

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Model response:", response)