"""Profile the CPU aten-operator calls made while generating one chat reply
with Llama-3.3-70B-Instruct, then dump per-operator statistics to a text file.

Requires: torch, transformers, modelscope (for the model download).
"""

import torch
import transformers
from modelscope import snapshot_download

# Download the model weights (or reuse a cached copy); returns the local path.
model_id = snapshot_download("LLM-Research/Llama-3.3-70B-Instruct")

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",  # shard across available devices automatically
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        # torch.profiler.ProfilerActivity.CUDA,  # CPU alone is enough to capture aten function calls
    ],
    # record_shapes=True,
    # with_stack=True,
) as p:
    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )
    print(outputs[0]["generated_text"][-1])

# Aggregate profiler events by operator and render them as a table,
# most-frequently-called operators first.
table_str = p.key_averages().table(
    sort_by="count",
    row_limit=-1,  # -1 = no limit: emit a row for every operator
    max_src_column_width=100,
    max_name_column_width=100,  # cap column widths so the table stays readable
)

# Fix: pass encoding explicitly — the default text encoding is
# platform-dependent and could mangle the profiler table on some systems.
with open("Llama-3.3-70B-Instruct.txt", "wt", encoding="utf-8") as f:
    f.write(table_str)