# OpenMOSE's picture
# Upload folder using huggingface_hub
# 411272a
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Optional 8-bit quantization. This is a model-loading option: if enabled, pass
# it to AutoModelForCausalLM.from_pretrained, not to the tokenizer.
#quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Local checkpoint directory used for both the tokenizer and the model.
MODEL_PATH = "/workspace/output/glm4_7_30b/hf_temp_07i/"
#MODEL_PATH = "/workspace/llm/GLM-4.7-Flash/"

# Single-turn conversation used as the smoke-test prompt.
messages = [{"role": "user", "content": "who is rick astley?"}]

# Load the tokenizer only. Dtype, device placement and quantization settings
# apply to model weights, so they are deliberately not passed here.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
)
# Print tokenizer diagnostics *before* rendering the prompt, so they are still
# visible if apply_chat_template fails (e.g. because no chat template is set).
print(type(tokenizer))
print("chat_template is None?", tokenizer.chat_template is None)
print("chat_template head:\n", (tokenizer.chat_template or "")[:400])

# Render the conversation through the chat template and tokenize it into
# PyTorch tensors. `enable_thinking` is an extra keyword handed to the
# template; its effect depends on the checkpoint's template.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    enable_thinking=False,
    return_tensors="pt",
)
print(inputs)
print('---------------------------')
# input_ids is a (batch, sequence) tensor; decode the single prompt row rather
# than the whole batch so decode() receives a flat sequence of token ids.
print(tokenizer.decode(inputs['input_ids'][0]))
# Uncomment to stop after inspecting the rendered prompt.
#exit()
# Load the causal language model from the same checkpoint directory, using
# bfloat16 weights and automatic device placement.
model_load_kwargs = {
    "pretrained_model_name_or_path": MODEL_PATH,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(**model_load_kwargs)

# Put the prompt tensors on the device the model reports.
inputs = inputs.to(model.device)

# Sample up to 256 new tokens with the KV cache enabled.
generation_kwargs = {"max_new_tokens": 256, "use_cache": True, "do_sample": True}
generated_ids = model.generate(**inputs, **generation_kwargs)

# The generated sequence starts with the prompt; drop those tokens so only
# the newly produced continuation is decoded.
prompt_length = inputs.input_ids.shape[1]
completion_ids = generated_ids[0][prompt_length:]
output_text = tokenizer.decode(completion_ids)

print('--------------------------------------------------------------------------------------')
print(output_text)