"""Run a short CPU inference demo with a locally downloaded Qwen2.5-1.5B model."""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Path to the downloaded model weights/config.
MODEL_PATH = "./qwen2.5_1.5b_model"


def main() -> None:
    """Load tokenizer + model on CPU, generate a response, and print it."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # NOTE: use float32 on CPU. float16 CPU inference is very slow and many
    # CPU ops lack half-precision kernels (can error or emit NaNs); fp16 only
    # saves memory reliably on GPU.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="cpu",
        torch_dtype=torch.float32,
        trust_remote_code=True,
    )
    model.eval()  # disable dropout etc. for deterministic-mode inference

    prompt = "Solve the equation: 2x + 5 = 15"
    # No max_length/truncation here: the prompt is short, and silently
    # truncating it to 50 tokens would corrupt longer prompts.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to("cpu")

    # inference_mode avoids building the autograd graph, cutting memory use.
    with torch.inference_mode():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # avoid pad/attention warnings
            # max_new_tokens bounds only the generated text; max_length would
            # also count the prompt and silently shrink the completion.
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,  # sampling instead of greedy decoding
            top_k=50,        # top-k filtering
            top_p=0.9,       # nucleus (top-p) filtering
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Model response:", response)


if __name__ == "__main__":
    main()