import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

MODEL_PATH = "/workspace/output/glm4_7_30b/hf_temp_07i/"
# MODEL_PATH = "/workspace/llm/GLM-4.7-Flash/"

messages = [{"role": "user", "content": "who is rick astley?"}]

# torch_dtype / device_map / quantization_config are model-loading kwargs, not
# tokenizer kwargs, so they are omitted here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    enable_thinking=False,
    return_tensors="pt",
)

# Sanity checks: confirm the tokenizer class and that a chat template is present.
print(type(tokenizer))
print("chat_template is None?", tokenizer.chat_template is None)
print("chat_template head:\n", (tokenizer.chat_template or "")[:400])
print(inputs)
print('---------------------------')
# input_ids is a 2D tensor (batch of 1), so decode the first row.
print(tokenizer.decode(inputs["input_ids"][0]))
# exit()

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    # quantization_config=quantization_config,  # enable together with the BitsAndBytesConfig above
)

inputs = inputs.to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=256, use_cache=True, do_sample=True)

# Decode only the newly generated tokens, skipping the prompt.
output_text = tokenizer.decode(generated_ids[0][inputs.input_ids.shape[1]:])
print('--------------------------------------------------------------------------------------')
print(output_text)
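
# --- Optional 8-bit variant (sketch) ---
# A minimal sketch of the quantized loading path hinted at by the commented-out
# BitsAndBytesConfig above; it assumes the bitsandbytes package is installed and
# a CUDA device is available. The tokenizer and generate() calls stay the same.
#
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_PATH,
#     quantization_config=quantization_config,  # weights are loaded in int8
#     device_map="auto",
#     trust_remote_code=True,
# )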