#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://github.com/jingyaogong/minimind/blob/master/eval_llm.py
"""
import argparse
import os
from pathlib import Path
import platform
import time

if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path, temp_directory
else:
    project_path = Path(os.path.abspath("../../"))
    temp_directory = Path("/root/autodl-tmp/OpenMiniMind/temp")

import torch
from modelscope import AutoTokenizer, AutoModelForCausalLM
from transformers import TextStreamer


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained_model_name_or_path",
        default="qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed",
        type=str
    )
    parser.add_argument(
        "--model_cache_dir",
        default=(temp_directory / "hub_models").as_posix(),
        type=str
    )
    parser.add_argument(
        "--max_new_tokens",
        default=8192,  # e.g. 8192, 128
        type=int,
        help="maximum number of new tokens to generate (note: not the model's actual long-context capability)"
    )
    parser.add_argument("--top_p", default=0.85, type=float,
                        help="nucleus sampling threshold (0-1)")
    parser.add_argument("--temperature", default=0.85, type=float,
                        help="sampling temperature controlling randomness (0-1; higher is more random)")
    parser.add_argument(
        "--show_speed",
        default=1,  # 1 or 0
        type=int,
        help="print decode speed (tokens/s)"
    )
    args = parser.parse_args()
    return args


def main():
    args = get_args()
    os.environ["MODELSCOPE_CACHE"] = args.model_cache_dir

    # Pick the compute device; MPS is deliberately bypassed in favor of CPU here.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        # device = "mps"
        device = "cpu"
    else:
        device = "cpu"
    print(f"device: {device}")

    model = AutoModelForCausalLM.from_pretrained(
        args.pretrained_model_name_or_path,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        cache_dir=args.model_cache_dir,
        trust_remote_code=True,
    )
    # Fall back to the EOS token for padding if the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = model.eval().to(device)
    # print(tokenizer)
    # print(model)

    # Built-in test prompts (Chinese), consumed in automatic-test mode.
    prompts = [
        "你有什么特长?",
        "为什么天空是蓝色的",
        "请用Python写一个计算斐波那契数列的函数",
        '解释一下"光合作用"的基本过程',
        "如果明天下雨,我应该如何出门",
        "比较一下猫和狗作为宠物的优缺点",
        "解释什么是机器学习",
        "推荐一些中国的美食"
    ]

    input_mode = int(input("[0] automatic test\n[1] manual input\n"))

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # conversation = list()
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"}
    ]
    while True:
        if input_mode == 0:
            if len(prompts) == 0:
                break
            user_input = prompts.pop(0)
            print(f"💬: {user_input}")
        else:
            user_input = input("💬: ")
            user_input = str(user_input).strip()

        conversation.append({"role": "user", "content": user_input})

        # Render the running conversation through the model's chat template, then tokenize.
        prompt_text = tokenizer.apply_chat_template(
            conversation=conversation,
            tokenize=False,
            add_generation_prompt=True
        )
        inputs = tokenizer(
            prompt_text,
            return_tensors="pt",
            truncation=True
        )
        inputs = inputs.to(device)
        # print(inputs)

        print("🤖: ", end="")
        st = time.time()
        generated_ids = model.generate(
            inputs=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=args.max_new_tokens,
            do_sample=True,
            streamer=streamer,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            top_p=args.top_p,
            temperature=args.temperature,
            repetition_penalty=3.0,  # note: unusually aggressive; typical values are ~1.0-1.3
        )
        # Keep only the newly generated tokens (strip the prompt prefix before decoding).
        response = tokenizer.decode(generated_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        conversation.append({"role": "assistant", "content": response})

        gen_tokens = len(generated_ids[0]) - len(inputs["input_ids"][0])
        if args.show_speed:
            print(f"\n[Speed]: {gen_tokens / (time.time() - st):.2f} tokens/s\n\n")
        else:
            print("\n\n")
    return


if __name__ == "__main__":
    main()
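
# Example invocation (a sketch: flag values are the argparse defaults defined
# above except for a smaller --max_new_tokens, and the script filename is
# assumed from the repo referenced in the module docstring):
#
#   python3 eval_llm.py \
#       --pretrained_model_name_or_path qgyd2021/Qwen2.5-0.5B-ultrachat-sft-deepspeed \
#       --max_new_tokens 512 \
#       --top_p 0.85 \
#       --temperature 0.85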