"""Profile the CPU aten-operator calls made while generating one chat reply
with Llama-3.3-70B-Instruct, then dump per-operator statistics to a text file.

Requires: torch, transformers, modelscope (for the model download).
"""

import torch
import transformers
from modelscope import snapshot_download

# Download the model weights (or reuse a cached copy); returns the local path.
model_id = snapshot_download("LLM-Research/Llama-3.3-70B-Instruct")

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",  # shard across available devices automatically
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        # torch.profiler.ProfilerActivity.CUDA,  # CPU alone is enough to capture aten function calls
    ],
    # record_shapes=True,
    # with_stack=True,
) as p:
    outputs = pipeline(
        messages,
        max_new_tokens=256,
    )
    print(outputs[0]["generated_text"][-1])

# Aggregate profiler events by operator and render them as a table,
# most-frequently-called operators first.
table_str = p.key_averages().table(
    sort_by="count",
    row_limit=-1,  # -1 = no limit: emit a row for every operator
    max_src_column_width=100,
    max_name_column_width=100,  # cap column widths so the table stays readable
)

# Fix: pass encoding explicitly — the default text encoding is
# platform-dependent and could mangle the profiler table on some systems.
with open("Llama-3.3-70B-Instruct.txt", "wt", encoding="utf-8") as f:
    f.write(table_str)