File size: 2,758 Bytes
1a6e584
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import onnx
import onnxruntime as ort
import numpy as np
import os
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer
from typing import List, Tuple
from axengine import InferenceSession
from ml_dtypes import bfloat16
from utils.infer_func import InferManager
import argparse
from PIL import Image
from torchvision.transforms import Resize, ToTensor, Normalize, Compose
from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD


if __name__ == "__main__":

    # CLI entry point: tokenize a single question, look up its input embeddings
    # from a pre-exported embedding table, then run prefill + decode on the
    # compiled axmodel via InferManager.
    parser = argparse.ArgumentParser(description="Model configuration parameters")
    parser.add_argument("--hf_model", type=str, default="./smolvlm3_tokenizer/",
                        help="Path to HuggingFace model")
    parser.add_argument("--axmodel_path", type=str, default="./smollm3_axmodel/",
                        help="Path to save compiled axmodel of llama model")
    parser.add_argument("--disable-think", action="store_true", default=False,
                        help="Disable thinking.")
    parser.add_argument("-q", "--question", type=str, default="Give me a brief explanation of gravity in simple terms.",
                        help="Your question that you want to ask the model.")
    args = parser.parse_args()

    hf_model_path = args.hf_model
    axmodel_path = args.axmodel_path
    prompt = args.question

    # Embedding table exported alongside the axmodel; rows are indexed by token id.
    embeds = np.load(os.path.join(axmodel_path, "model.embed_tokens.weight.npy"))

    # Load the tokenizer and model config.
    tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
    cfg = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)

    # NOTE(review): the original torch model load is intentionally disabled —
    # inference runs entirely through the compiled axmodel below.
    # model = AutoModelForCausalLM.from_pretrained(
    #     hf_model_path,
    # ).to(device)

    # Build the chat messages. With thinking enabled (the default) only the
    # user turn is sent; --disable-think prepends the "/no_think" system turn.
    if not args.disable_think:
        messages = [
            {"role": "user", "content": prompt}
        ]
    else:
        messages = [
            {"role": "system", "content": "/no_think"},
            {"role": "user", "content": prompt}
        ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Tokenize on CPU: the tensors are only used to extract token ids, so
    # moving them to CUDA (as the original code did) was a wasted round-trip.
    model_inputs = tokenizer([text], return_tensors="pt")
    input_ids = model_inputs.input_ids

    token_ids = input_ids[0].numpy().tolist()
    token_len = len(token_ids)
    # Gather per-token embeddings; axmodel expects bfloat16 activations.
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

    imer = InferManager(cfg, axmodel_path)

    # Prefill consumes the prompt embeddings; decode then generates tokens,
    # looking up embeddings from `embeds` one step at a time.
    token_ids = imer.prefill(tokenizer, token_ids, prefill_data, slice_len=128)
    imer.decode(tokenizer, token_ids, embeds, slice_len=128)
    print("\n")