Run on a Google Colab CPU runtime.

#1
by rakmik - opened

# Load the AQLM PV 2-bit quantized Llama-3.2-1B-Instruct model on CPU and
# stream a short completion for a fixed prompt.
from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
import transformers
import torch

# trust_remote_code is required for the AQLM quantized checkpoint; float32
# because CPU inference does not benefit from half precision here.
quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8", trust_remote_code=True, torch_dtype=torch.float32,
).cpu()
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8")

# Keep the FULL tokenizer output (input_ids AND attention_mask) instead of
# extracting ["input_ids"] only — dropping the mask is what triggered the
# "attention mask and the pad token id were not set" warning in the run below.
inputs = tokenizer(["Napoleon Bonaparte is "], return_tensors="pt").to("cpu")

streamer = TextStreamer(tokenizer)
# pad_token_id is set explicitly so generate() does not have to fall back to
# eos_token_id with a warning for open-ended generation.
_ = quantized_model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=40,
    pad_token_id=tokenizer.eos_token_id,
)

34s
# Second timed run of the same generation. Pass the whole tokenizer output
# (which includes attention_mask) and an explicit pad_token_id so this run
# does not emit the attention-mask / pad-token warnings shown in the output.
inputs = tokenizer(["Napoleon Bonaparte is "], return_tensors="pt").to("cpu")

streamer = TextStreamer(tokenizer)
_ = quantized_model.generate(
    **inputs, streamer=streamer, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id
)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:128001 for open-end generation.
<|begin_of_text|>Napoleon Bonaparte is 1st of 3 sons of José Bonaparte, a French general and politician. He was born in 1769 in Paris, France. He was educated at the École de Paris

Sign up or log in to comment