"""Run a short CPU inference demo with a locally downloaded Qwen2.5-1.5B model."""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Path to the downloaded model weights/config.
MODEL_PATH = "./qwen2.5_1.5b_model"


def main() -> None:
    """Load tokenizer + model on CPU, generate a response, and print it."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # NOTE: use float32 on CPU. float16 CPU inference is very slow and many
    # CPU ops lack half-precision kernels (can error or emit NaNs); fp16 only
    # saves memory reliably on GPU.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="cpu",
        torch_dtype=torch.float32,
        trust_remote_code=True,
    )
    model.eval()  # disable dropout etc. for deterministic-mode inference

    prompt = "Solve the equation: 2x + 5 = 15"
    # No max_length/truncation here: the prompt is short, and silently
    # truncating it to 50 tokens would corrupt longer prompts.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to("cpu")

    # inference_mode avoids building the autograd graph, cutting memory use.
    with torch.inference_mode():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # avoid pad/attention warnings
            # max_new_tokens bounds only the generated text; max_length would
            # also count the prompt and silently shrink the completion.
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,  # sampling instead of greedy decoding
            top_k=50,        # top-k filtering
            top_p=0.9,       # nucleus (top-p) filtering
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Model response:", response)


if __name__ == "__main__":
    main()