happyme531 committed on
Commit
f5d111e
·
verified ·
1 Parent(s): 726c079

Upload 13 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ gen_head.rknn filter=lfs diff=lfs merge=lfs -text
37
+ gen_img_embeds.rknn filter=lfs diff=lfs merge=lfs -text
38
+ image_decode.rknn filter=lfs diff=lfs merge=lfs -text
39
+ lm_head.rknn filter=lfs diff=lfs merge=lfs -text
40
+ vision_encoder.rknn filter=lfs diff=lfs merge=lfs -text
convert_gen_head.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: gen_head
"""Convert the gen_head ONNX model into an RKNN model for the RK3588 NPU."""

import sys

from rknn.api import RKNN

def main():
    """Run the conversion pipeline: config -> load -> build -> export.

    Exits the process with the failing API call's return code on any error.
    """
    # Create the RKNN instance; verbose logging helps debug conversion issues.
    rknn = RKNN(verbose=True)

    # Input ONNX model path.
    ONNX_MODEL = "gen_head.onnx"
    # Output RKNN model path.
    RKNN_MODEL = "gen_head.rknn"

    # Configure for the RK3588 target; the input shape is fixed,
    # so dynamic-input support is disabled.
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        sys.exit(ret)  # sys.exit is reliable in scripts, unlike builtins.exit

    # Load the ONNX model with an explicit input name and shape
    # (batch=1, seq_len=1, hidden=2048).
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['hidden_states'],
                         input_size_list=[[1, 1, 2048]])
    if ret != 0:
        print('Load model failed!')
        sys.exit(ret)

    # Build without quantization to preserve float accuracy.
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        sys.exit(ret)

    # Export the converted model to disk.
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        sys.exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
convert_gen_img_embeds.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: gen_img_embeds
"""Convert the gen_img_embeds ONNX model into an RKNN model for the RK3588 NPU."""

import sys

from rknn.api import RKNN

def main():
    """Run the conversion pipeline: config -> load -> build -> export.

    Exits the process with the failing API call's return code on any error.
    """
    # Create the RKNN instance; verbose logging helps debug conversion issues.
    rknn = RKNN(verbose=True)

    # Input ONNX model path.
    ONNX_MODEL = "gen_img_embeds.onnx"
    # Output RKNN model path.
    RKNN_MODEL = "gen_img_embeds.rknn"

    # Configure for the RK3588 target; the input shape is fixed,
    # so dynamic-input support is disabled.
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        sys.exit(ret)  # sys.exit is reliable in scripts, unlike builtins.exit

    # Load the ONNX model with an explicit input name and shape
    # (batch=1, seq_len=1 token id).
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['image_ids'],
                         input_size_list=[[1, 1]])
    if ret != 0:
        print('Load model failed!')
        sys.exit(ret)

    # Build without quantization to preserve float accuracy.
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        sys.exit(ret)

    # Export the converted model to disk.
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        sys.exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
convert_image_decode.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: image_decode
"""Convert the image_decode (VQVAE decoder) ONNX model into an RKNN model for the RK3588 NPU."""

import sys

from rknn.api import RKNN

def main():
    """Run the conversion pipeline: config -> load -> build -> export.

    Exits the process with the failing API call's return code on any error.
    """
    # Create the RKNN instance; verbose logging helps debug conversion issues.
    rknn = RKNN(verbose=True)

    # Input ONNX model path.
    ONNX_MODEL = "image_decode.onnx"
    # Output RKNN model path.
    RKNN_MODEL = "image_decode.rknn"

    # Configure for the RK3588 target; the input shape is fixed,
    # so dynamic-input support is disabled.
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        sys.exit(ret)  # sys.exit is reliable in scripts, unlike builtins.exit

    # Load the ONNX model with an explicit input name and shape
    # (batch=1, 576 generated image tokens).
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['generated_tokens'],
                         input_size_list=[[1, 576]])
    if ret != 0:
        print('Load model failed!')
        sys.exit(ret)

    # Build without quantization to preserve float accuracy.
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        sys.exit(ret)

    # Export the converted model to disk.
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        sys.exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
convert_lm_head.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: lm_head
"""Convert the lm_head ONNX model into an RKNN model for the RK3588 NPU."""

import sys

from rknn.api import RKNN

def main():
    """Run the conversion pipeline: config -> load -> build -> export.

    Exits the process with the failing API call's return code on any error.
    """
    # Create the RKNN instance; verbose logging helps debug conversion issues.
    rknn = RKNN(verbose=True)

    # Input ONNX model path.
    ONNX_MODEL = "lm_head.onnx"
    # Output RKNN model path.
    RKNN_MODEL = "lm_head.rknn"

    # Configure for the RK3588 target; the input shape is fixed,
    # so dynamic-input support is disabled.
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        sys.exit(ret)  # sys.exit is reliable in scripts, unlike builtins.exit

    # Load the ONNX model with an explicit input name and shape
    # (batch=1, seq_len=1, hidden=2048).
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['hidden_states'],
                         input_size_list=[[1, 1, 2048]])
    if ret != 0:
        print('Load model failed!')
        sys.exit(ret)

    # Build without quantization to preserve float accuracy.
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        sys.exit(ret)

    # Export the converted model to disk.
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        sys.exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
convert_vision_encoder.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# ztu_somemodelruntime_rknn2: prepare_inputs_embeds
"""Convert the prepare_inputs_embeds ONNX model into the vision_encoder RKNN model for the RK3588 NPU.

The ONNX graph is truncated at an explicit output node so that only the
vision-encoder + aligner portion is converted.
"""

import sys

from rknn.api import RKNN

def main():
    """Run the conversion pipeline: config -> load -> build -> export.

    Exits the process with the failing API call's return code on any error.
    """
    # Create the RKNN instance; verbose logging helps debug conversion issues.
    rknn = RKNN(verbose=True)

    # Input ONNX model path.
    ONNX_MODEL = "prepare_inputs_embeds.onnx"
    # Output RKNN model path (intentionally named differently from the ONNX file).
    RKNN_MODEL = "vision_encoder.rknn"

    # Configure for the RK3588 target; the input shape is fixed,
    # so dynamic-input support is disabled.
    print("--> Config model")
    ret = rknn.config(target_platform="rk3588",
                      dynamic_input=None)
    if ret != 0:
        print('Config model failed!')
        sys.exit(ret)  # sys.exit is reliable in scripts, unlike builtins.exit

    # Load the ONNX model with an explicit input name/shape
    # (batch=1, num_images=1, 3x384x384 pixels) and cut the graph at the
    # aligner's final Add node so only the encoder part is converted.
    print("--> Loading model")
    ret = rknn.load_onnx(model=ONNX_MODEL,
                         inputs=['pixel_values'],
                         input_size_list=[[1, 1, 3, 384, 384]],
                         outputs=['/aligner/layers/layers.2/Add_output_0'])
    if ret != 0:
        print('Load model failed!')
        sys.exit(ret)

    # Build without quantization to preserve float accuracy.
    print("--> Building model")
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        print('Build model failed!')
        sys.exit(ret)

    # Export the converted model to disk.
    print("--> Export RKNN model")
    ret = rknn.export_rknn(RKNN_MODEL)
    if ret != 0:
        print('Export RKNN model failed!')
        sys.exit(ret)

    print(f'Done! The converted RKNN model has been saved to: {RKNN_MODEL}')
    rknn.release()

if __name__ == '__main__':
    main()
embed_tokens.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b65c83b08bd82cfb4ef6009755244316374f593970625761957278f12920c6
3
+ size 838861039
gen_head.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97742dd3a76eb491cc6311b80b05ecb9d9547963223c095539993064e0605b64
3
+ size 75616889
gen_img_embeds.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99b8e29ad96ee5cb254a5cfe3401f73df6c872b9a1f4138d4bf5209a9b5d860d
3
+ size 8741750
image_decode.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f8d30304794961107bd904db7953103da7d2535aad3534f51d0722ad87e1ec1
3
+ size 247330434
lm_head.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af8830d6b3d889abfd2a1ba2801a98cce88eb49ab637a1fb84bd989a34071aaa
3
+ size 419496186
rkllm-convert.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convert the local Hugging Face checkpoint into an RKLLM model for the RK3588 NPU."""

import sys

from rkllm.api import RKLLM

# Load the checkpoint from the current directory (no LoRA adapter), on CPU.
modelpath = '.'
llm = RKLLM()

ret = llm.load_huggingface(model=modelpath, model_lora=None, device='cpu')
if ret != 0:
    print('Load model failed!')
    sys.exit(ret)  # sys.exit is reliable in scripts, unlike builtins.exit

# Build without quantization; the quantization-related arguments
# (dtype/algorithm/extra params) are kept so quantization can be
# re-enabled by flipping do_quantization.
qparams = None
ret = llm.build(do_quantization=False, optimization_level=1, quantized_dtype='w8a8',
                quantized_algorithm='normal', target_platform='rk3588', num_npu_core=3, extra_qparams=qparams)

if ret != 0:
    print('Build model failed!')
    sys.exit(ret)

# Export rkllm model
ret = llm.export_rkllm("./language_model.rkllm")
if ret != 0:
    print('Export model failed!')
    sys.exit(ret)
run_rkllm.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Multimodal text/image generation pipeline on the RK3588 NPU.

The vision encoder, generation/LM heads and the VQVAE image decoder run as
.rknn/.onnx models through ``ztu_somemodelruntime_rknn2`` (an ORT-like
wrapper) and onnxruntime, while the transformer itself runs through the
RKLLM C binding.  ``mode`` selects text-to-image ("t2i") or
image+text-to-text ("it2t") generation.

NOTE(review): indentation below was reconstructed from a mangled paste;
the placement of ``rkllm_runtime.clear_kv_cache(False)`` after the
generation loop should be confirmed against the original file.
"""
import faulthandler
faulthandler.enable()

import os
os.environ["RKLLM_LOG_LEVEL"] = "1"  # must be set before the RKLLM binding initializes
import numpy as np
import onnxruntime as real_ort
import ztu_somemodelruntime_rknn2 as ort  # ORT-compatible wrapper that can run .rknn models
from tokenizers import Tokenizer
import cv2
import tqdm
import time
import ctypes

from rkllm_binding import *

model_path = "."
onnx_model_path = f"{model_path}"
tokenizer = Tokenizer.from_file(f"{model_path}/tokenizer.json")
# np.random.seed(0)

# image = None
# prompt = "A stunning princess from kabul in red, white traditional clothing, blue eyes, brown hair"
# mode = "t2i"  # text-to-image; the reverse is it2t -> image/text to text

image = "./test.jpg"
prompt = "仔细描述这张图片。"
mode = "it2t"

# Sampling temperature (variable name is misspelled but used consistently).
tempature = 0.7

# Global holder for the RKLLM inference result; written by the callback
# below and polled by run_rkllm_inference().
rkllm_result_data = {
    'hidden_states': None,
    'finished': False,
    'error': False
}

def rkllm_callback(result_ptr, userdata_ptr, state_enum):
    """RKLLM inference callback.

    Invoked by the RKLLM runtime with a result pointer and a run state.
    On RKLLM_RUN_NORMAL it copies the last hidden layer out of the C
    buffer into ``rkllm_result_data['hidden_states']`` and marks the run
    finished; error/finish states set the corresponding flags.
    Returning 1 appears to signal the runtime to stop early — TODO confirm
    against the rkllm_binding documentation.
    """
    global rkllm_result_data

    try:
        state = LLMCallState(state_enum)
        # print(f"回调状态: {state.name}")

        if state == LLMCallState.RKLLM_RUN_FINISH:
            rkllm_result_data['finished'] = True
            print("RKLLM 推理完成")
            return
        elif state == LLMCallState.RKLLM_RUN_ERROR:
            rkllm_result_data['error'] = True
            rkllm_result_data['error_msg'] = "RKLLM 推理出错"
            rkllm_result_data['finished'] = True
            print("错误: RKLLM 推理出错")

        # Guard against a NULL result pointer before dereferencing.
        if not result_ptr:
            print("警告: result_ptr 为空指针")
            return

        result = result_ptr.contents
        # print(result.perf)
        if state == LLMCallState.RKLLM_RUN_NORMAL:
            # Extract the last-hidden-layer result, if present.
            if result.last_hidden_layer.hidden_states and result.last_hidden_layer.embd_size > 0:
                # Convert the C float array into a numpy array.
                hidden_size = result.last_hidden_layer.embd_size
                num_tokens = result.last_hidden_layer.num_tokens

                # print(f"Hidden layer info: num_tokens={num_tokens}, embd_size={hidden_size}")

                # Build a numpy view over the C pointer, then copy so the
                # data survives after the runtime reclaims its buffer.
                hidden_array = np.ctypeslib.as_array(
                    result.last_hidden_layer.hidden_states,
                    shape=(num_tokens, hidden_size)
                ).copy()  # copy to avoid use-after-free on the C buffer

                rkllm_result_data['hidden_states'] = hidden_array
                # print(f"成功获取 hidden states,形状: {hidden_array.shape}")
                rkllm_result_data['finished'] = True
                return 1
            else:
                print("警告: 没有获取到有效的 hidden states")

        return 1
    except Exception as e:
        # Never let an exception escape into the C runtime; record it and
        # unblock the waiting poll loop instead.
        print(f"回调函数异常: {e}")
        rkllm_result_data['error'] = True
        rkllm_result_data['error_msg'] = str(e)
        rkllm_result_data['finished'] = True

# 1. Load the models

# Vision encoder
# <- pixel_values: float32[batch_size,num_images,3,384,384]
# -> inputs_embeds: float32[batch_size*num_images,576,2048]
vision_encoder = ort.InferenceSession(f"{onnx_model_path}/vision_encoder.rknn")

# Initialize the RKLLM language model.
print("初始化 RKLLM 语言模型...")
rkllm_runtime = RKLLMRuntime()
rkllm_params = rkllm_runtime.create_default_param()
rkllm_params.model_path = f"{model_path}/language_model.rkllm".encode('utf-8')
rkllm_params.max_context_len = 1024
rkllm_params.max_new_tokens = 5
# rkllm_params.temperature = tempature
rkllm_params.skip_special_token = 0
rkllm_params.extend_param.base_domain_id = 1
rkllm_runtime.init(rkllm_params, rkllm_callback)

# LM Head
# <- hidden_states: float32[batch_size,sequence_length,2048]
# -> logits: float32[batch_size,sequence_length,102400]
lm_head = ort.InferenceSession(f"{onnx_model_path}/lm_head.onnx")
# Image-generation head
# <- hidden_states: float32[batch_size,sequence_length,2048]
# -> logits: float32[batch_size,sequence_length,16384]
gen_head = ort.InferenceSession(f"{onnx_model_path}/gen_head.onnx")
# Image-generation embedding table
# <- image_ids: int64[batch_size,sequence_length]
# -> inputs_embeds: float32[batch_size,sequence_length,2048]
gen_img_embeds = ort.InferenceSession(f"{onnx_model_path}/gen_img_embeds.onnx")
# Text embedding table (runs on real onnxruntime, not the RKNN wrapper)
# <- input_ids: int64[batch_size,sequence_length]
# -> inputs_embeds: float32[batch_size,sequence_length,2048]
text_embeds = real_ort.InferenceSession(f"{onnx_model_path}/embed_tokens.onnx")
# VQVAE decoder (576 tokens -> one 384x384 image)
# <- generated_tokens: int64[batch_size,sequence_length]
# -> decoded_image: float32[batch_size,3,384,384]
image_decode = ort.InferenceSession(f"{onnx_model_path}/image_decode.onnx")

# 2. Preprocess the inputs
# The tokenizer already prepends <|begin▁of▁sentence|>; do not add it here!
if mode == "t2i":
    input_str = f"""<|User|>: {prompt}

<|Assistant|>:<begin_of_image>"""
else:
    input_str = f"""You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.

<|User|>: <image_placeholder>
{prompt}

<|Assistant|>:"""

# 3. Build the embeddings

# Expand <image_placeholder> into 576 copies (one per image patch token).
input_str = input_str.replace("<image_placeholder>", "<image_placeholder>" * 576)
input = tokenizer.encode(input_str)
input_ids = np.array([input.ids], dtype=np.int64)
input_len = len(input.ids)
attention_mask = np.array([input.attention_mask], dtype=np.int64)
images_seq_mask = np.array([[1 if id == 100581 else 0 for id in input.ids]], dtype=np.bool_)  # 100581 == <image_placeholder>; why does the placeholder have two ids? — TODO confirm
position_ids = np.expand_dims(np.arange(input_len), axis=0)
# Image preprocessing
if image:
    img = cv2.imread(image)
    if img is None:
        raise ValueError(f"无法读取图片: {image}")
    # Convert BGR to RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Resize to the target size: 384x384.
    target_size = 384
    img = cv2.resize(img, (target_size, target_size), interpolation=cv2.INTER_LINEAR)
    # Convert to float32 and rescale pixel values into [0,1].
    img = img.astype(np.float32) * 0.00392156862745098  # 0.00392156 = 1/255
    # Normalize per config: (img - image_mean) / image_std with image_mean = image_std = [0.5, 0.5, 0.5].
    img = (img - np.array([0.5, 0.5, 0.5], dtype=np.float32)) / np.array([0.5, 0.5, 0.5], dtype=np.float32)
    # Non-square images could be padded with background_color; skipped here
    # since we resize directly to a square.
    # Reshape to [batch_size, num_images, channels, height, width]:
    # first HWC -> CHW,
    img = img.transpose(2, 0, 1)  # gives [3, 384, 384]
    pixel_values = np.expand_dims(np.expand_dims(img, axis=0), axis=1)  # [1, 1, 3, 384, 384]
    images_emb_mask = np.ones((1, 1, 576), dtype=np.bool_)
else:
    pixel_values = np.zeros((0, 0, 3, 384, 384), dtype=np.float32)
    images_emb_mask = np.zeros((1, 0, 576), dtype=np.bool_)

# Manually assemble the input embeddings:
# 1. Text embeddings first.
text_inputs_embeds = text_embeds.run(None, {"input_ids": input_ids})[0]  # [1, input_len, 2048]

# 2. If an image is present, splice the vision embeddings into the text ones.
if image:
    # Run the vision encoder.
    vision_embeds = vision_encoder.run(None, {"pixel_values": pixel_values})[0]  # [1, 576, 2048]

    # Locate every <image_placeholder> token (True entries of images_seq_mask).
    image_token_positions = np.where(images_seq_mask[0])[0]  # indices of all True entries

    # Overwrite each placeholder position with the matching vision embedding.
    for idx, pos in enumerate(image_token_positions):
        if idx < vision_embeds.shape[1]:  # stay within vision_embeds bounds
            text_inputs_embeds[0, pos, :] = vision_embeds[0, idx, :]

inputs_embeds = text_inputs_embeds

# 4. Language-model inference (via RKLLM)
# Collected generated token ids (image tokens in t2i mode, text otherwise).
generated_tokens = []

# Reusable C-binding objects for the generation loop.
rkllm_input = RKLLMInput()
rkllm_input.input_type = RKLLMInputType.RKLLM_INPUT_EMBED
embed_input = RKLLMEmbedInput()
infer_params = RKLLMInferParam()
infer_params.mode = RKLLMInferMode.RKLLM_INFER_GET_LAST_HIDDEN_LAYER
infer_params.keep_history = 1

def run_rkllm_inference(inputs_embeds: np.ndarray) -> np.ndarray:
    """Run one RKLLM inference step: embeddings in, hidden states out.

    Blocks (polling) until the callback marks the run finished.
    Raises RuntimeError if the callback reported an error.
    """
    global rkllm_result_data

    # Reset the shared result holder for this run.
    rkllm_result_data = {
        'hidden_states': None,
        'finished': False,
        'error': False
    }

    # Marshal the embedding data into a C float array.
    embed_flat = inputs_embeds.flatten().astype(np.float32)
    embed_c_array = (ctypes.c_float * len(embed_flat))(*embed_flat)
    embed_input.embed = embed_c_array
    embed_input.n_tokens = inputs_embeds.shape[1]  # sequence length

    rkllm_input._union_data.embed_input = embed_input

    # Kick off inference; results arrive via rkllm_callback.
    rkllm_runtime.run(rkllm_input, infer_params)

    # Poll until the callback signals completion.
    while not rkllm_result_data['finished']:
        time.sleep(0.001)  # brief wait to avoid busy-spinning

    if rkllm_result_data['error']:
        raise RuntimeError("RKLLM 推理出错")

    return rkllm_result_data['hidden_states']

# Generation loop: up to 576 tokens (one full image in t2i mode).
with tqdm.tqdm(range(576)) as pbar:
    for i in pbar:
        # One RKLLM forward pass.
        hidden_states = run_rkllm_inference(inputs_embeds)

        if hidden_states is None:
            raise RuntimeError("RKLLM 未返回有效的 hidden states")

        # Reshape to [batch_size, sequence_length, hidden_size] if needed.
        if len(hidden_states.shape) == 2:
            # [num_tokens, hidden_size] -> add the batch dimension.
            hidden_states = hidden_states.reshape(1, hidden_states.shape[0], hidden_states.shape[1])

        # Keep only the last token's hidden state.
        hs = hidden_states[:, -1:, :]  # shape: [1, 1, 2048]

        # Run the appropriate head to get this step's logits.
        logits = (gen_head if mode == "t2i" else lm_head).run(None, {"hidden_states": hs})[0]
        logits = logits[:, -1, :]  # shape: [1, vocab_size]

        # Temperature sampling: scale logits and sample (greedy is not usable here).
        logits = logits / tempature
        # Numerically stable softmax.
        exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))[0]
        probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
        # Multinomial sampling (float64 + renormalize so probs sum to 1 exactly).
        probs = probs.astype(np.float64)
        probs /= probs.sum()
        next_token = int(np.random.multinomial(1, probs).argmax())
        pbar.set_postfix(next_token=tokenizer.decode([next_token]))
        generated_tokens.append(next_token)
        if next_token == 100001:  # eos
            break

        # Embed the new token for the next step.
        if mode == "t2i":
            new_embed = gen_img_embeds.run(None, {"image_ids": np.array([[next_token]], dtype=np.int64)})[0]
        else:
            new_embed = text_embeds.run(None, {"input_ids": np.array([[next_token]], dtype=np.int64)})[0]

        # Only the new embedding is fed next step; keep_history=1 retains the
        # KV cache, so re-feeding the full prefix is unnecessary.
        # inputs_embeds = np.concatenate([inputs_embeds, new_embed], axis=1)
        inputs_embeds = new_embed

rkllm_runtime.clear_kv_cache(False)

# 5. Decode to an image or to text
if mode == "t2i":
    # Stack the 576 generated tokens and run the VQVAE decoder.
    generated_tokens_array = np.array([generated_tokens], dtype=np.int64)  # shape: [1, 576]
    decoded_image = image_decode.run(None, {"generated_tokens": generated_tokens_array})[0]  # output shape: [1, 3, 384, 384]
    decoded_image = np.clip((decoded_image + 1) / 2 * 255, 0, 255)
    # Post-process: CHW -> HWC and save as PNG with cv2.
    decoded_image = np.squeeze(decoded_image, axis=0)  # [3, 384, 384]
    decoded_image = np.transpose(decoded_image, (1, 2, 0))  # [384, 384, 3]
    cv2.imwrite("generated.png", cv2.cvtColor(decoded_image, cv2.COLOR_RGB2BGR))
    print("(generated.png)")
else:
    decoded_text = tokenizer.decode(generated_tokens)
    print(f"{decoded_text}")

# Clean up native resources.
print("清理 RKLLM 资源...")
rkllm_runtime.destroy()
vision_encoder.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fad8869233fe5c212fd035a51dc3dc21b32b08822f8cf28a23426288eb278c9
3
+ size 642104989