# Source: remiai3 — "Upload 7 files" (commit 9c4eaff, verified)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Path to the locally downloaded model weights and tokenizer files.
model_path = "./qwen2.5_1.5b_model"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Qwen tokenizers may not define a pad token; padding=True below requires one,
# so fall back to the EOS token (standard practice for causal LMs).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model for CPU inference.
# NOTE: float16 is a poor fit for CPU — many CPU kernels lack half-precision
# implementations and the rest run slowly; float32 is the safe CPU dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cpu",
    torch_dtype=torch.float32,  # full precision for reliable CPU execution
    trust_remote_code=True
)
model.eval()  # inference mode: disables dropout and similar training behavior

# Test input
prompt = "Solve the equation: 2x + 5 = 15"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=50).to("cpu")

# Generate output with attention mask.
# max_new_tokens bounds only the *generated* text; the old max_length=200
# counted the prompt tokens too, silently shrinking the answer budget.
with torch.no_grad():  # no gradients needed for generation — saves memory
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # explicitly pass attention mask
        max_new_tokens=200,
        num_return_sequences=1,
        do_sample=True,  # enable sampling
        top_k=50,        # top-k sampling
        top_p=0.9,       # nucleus (top-p) sampling
        pad_token_id=tokenizer.pad_token_id,  # silence missing-pad-token warning
    )

# Decode and print the response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Model response:", response)