|
|
import cv2 |
|
|
from transformers import AutoTokenizer, AutoConfig |
|
|
import numpy as np |
|
|
from ml_dtypes import bfloat16 |
|
|
from axengine import InferenceSession |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
def img_preprocess(img, input_size):
    """Convert one BGR image into a normalized NCHW float32 batch of size 1.

    The image is converted from BGR to RGB, resized to a square of
    ``input_size`` pixels, scaled by 1/255 and standardized per channel with
    the mean/std constants below.

    Args:
        img: Image array accepted by ``cv2.cvtColor`` with ``COLOR_BGR2RGB``.
        input_size: Side length (pixels) of the square output.

    Returns:
        Array of shape ``(1, 3, input_size, input_size)``.
    """
    channel_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    channel_std = np.array((0.229, 0.224, 0.225), dtype=np.float32)

    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    rgb = cv2.resize(rgb, (input_size, input_size))

    scaled = rgb.astype(np.float32) / 255.0
    standardized = (scaled - channel_mean) / channel_std

    # HWC -> CHW; reshape (rather than adding an axis to the transposed view)
    # so the result is a freshly laid-out array with a leading batch axis.
    channels_first = standardized.transpose(2, 0, 1)
    return channels_first.reshape(1, 3, input_size, input_size)
|
|
|
|
|
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the grid from ``target_ratios`` whose aspect ratio is closest.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` candidates, scanned in order.
        width: Source image width.
        height: Source image height.
        image_size: Side length of one tile.

    Returns:
        The selected candidate, or ``(1, 1)`` when ``target_ratios`` is empty.
        On an exact tie in distance, a later candidate replaces the current
        one only if the source area exceeds half of that candidate's total
        tiled area.
    """
    source_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            smallest_gap = gap
            chosen = candidate
            continue

        # Tie-break: prefer the later candidate only when the image is large
        # enough to fill more than half of that candidate's pixel area.
        if gap == smallest_gap and source_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate

    return chosen
|
|
|
|
|
def dynamic_preprocess(image: np.ndarray, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image to the best-fitting tile grid and cut it into tiles.

    Args:
        image: Image array; only ``image.shape[:2]`` (height, width) is read
            before it is handed to ``cv2.resize``.
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Side length (pixels) of each square tile.
        use_thumbnail: When true and more than one tile was produced, append a
            whole-image thumbnail resized to ``image_size`` x ``image_size``.

    Returns:
        List of tile arrays in row-major order, optionally followed by the
        thumbnail.
    """
    orig_height, orig_width = image.shape[:2]
    aspect_ratio = orig_width / orig_height

    # Every (cols, rows) grid whose tile count lies in [min_num, max_num].
    candidate_grids = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(
        aspect_ratio, candidate_grids, orig_width, orig_height, image_size)
    cols, rows = best_grid[0], best_grid[1]
    tile_count = cols * rows

    # Resize so the image is an exact cols x rows mosaic of square tiles.
    resized_img = cv2.resize(image, (image_size * cols, image_size * rows))

    processed_images = []
    for tile_index in range(tile_count):
        row, col = divmod(tile_index, cols)
        top = row * image_size
        left = col * image_size
        processed_images.append(
            resized_img[top:top + image_size, left:left + image_size])
    assert len(processed_images) == tile_count

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(cv2.resize(image, (image_size, image_size)))

    return processed_images
|
|
|
|
|
def pre_process(image, input_size=448, max_num=12):
    """Tile an image and return all normalized tiles stacked along axis 0.

    Args:
        image: Source image passed to ``dynamic_preprocess``.
        input_size: Tile side length, forwarded to both helpers.
        max_num: Upper bound on the number of tiles (thumbnail excluded).

    Returns:
        Concatenation along axis 0 of the per-tile ``img_preprocess`` outputs.
    """
    tiles = dynamic_preprocess(
        image, image_size=input_size, use_thumbnail=True, max_num=max_num)

    normalized_tiles = []
    for tile in tiles:
        normalized_tiles.append(img_preprocess(tile, input_size))

    return np.concatenate(normalized_tiles, axis=0)
|
|
|
|
|
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Compute one frame index per segment, taken near each segment's middle.

    Args:
        bound: Optional ``(start, end)`` pair, multiplied by ``fps`` to obtain
            frame positions; a falsy value selects the whole clip.
        fps: Frames per second used to convert ``bound`` into frame indices.
        max_frame: Largest frame index that may be selected as the range end.
        first_idx: Smallest frame index that may be selected as the range start.
        num_segments: Number of equally sized segments (and returned indices).

    Returns:
        ``np.ndarray`` holding ``num_segments`` integer frame indices.
    """
    # Without an explicit bound use a window wide enough to be clamped to
    # [first_idx, max_frame] below.
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)

    range_start = max(first_idx, round(start * fps))
    range_end = min(round(end * fps), max_frame)
    segment_span = float(range_end - range_start) / num_segments
    half_span = segment_span / 2

    picks = []
    for segment in range(num_segments):
        picks.append(int(range_start + half_span + np.round(segment_span * segment)))
    return np.array(picks)
|
|
|
|
|
|
|
|
def load_video_opencv(video_path, bound=None, num_segments=32):
    """Sample frames from a video file with OpenCV.

    Frame positions come from ``get_index``; each one is sought and decoded
    individually. Frames that fail to decode are reported on stdout and
    skipped, so the result may contain fewer than ``num_segments`` frames.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        bound: Optional ``(start, end)`` pair forwarded to ``get_index``.
        num_segments: Number of frames to sample.

    Returns:
        List of frames as returned by ``cap.read()``.

    Raises:
        IOError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture handle is released even when opening
    # fails, index computation raises, or decoding raises mid-way.
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        max_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

        frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)

        images_list = []
        for frame_index in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                print(f"⚠ Failed to read frame {frame_index}")
                continue
            images_list.append(frame)

        return images_list
    finally:
        cap.release()
|
|
|
|
|
|
|
|
def is_video_file(path):
    """Return True when ``path`` ends with a known video extension (case-insensitive)."""
    video_suffixes = (".mp4", ".avi", ".mov", ".mkv", ".webm")
    lowered = str(path).lower()
    return any(lowered.endswith(suffix) for suffix in video_suffixes)
|
|
|
|
|
def is_image_file(path):
    """Return True when ``path`` ends with a known image extension (case-insensitive)."""
    image_suffixes = (".jpg", ".png", ".jpeg", ".webp")
    lowered = str(path).lower()
    return any(lowered.endswith(suffix) for suffix in image_suffixes)
|
|
|
|
|
def load_image(path):
    """Read an image with ``cv2.imread``.

    Args:
        path: Image location; converted with ``str()`` before reading.

    Returns:
        The array returned by ``cv2.imread``.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None``.
    """
    loaded = cv2.imread(str(path))
    if loaded is not None:
        return loaded
    raise ValueError(f"Image {path} not found or cannot be read.")
|
|
|
|
|
def post_process(data, topk=1, topp=0.9, temperature=0.6):
    """Sample the next token id from a logits array.

    The ``topk`` largest logits are selected, divided by ``temperature``,
    turned into probabilities with softmax, truncated with nucleus (top-p)
    filtering and finally sampled with ``np.random.multinomial``.

    Args:
        data: Array of logits; it is cast to float32 and flattened.
        topk: Number of highest-scoring candidates to keep.
        topp: Cumulative-probability threshold for nucleus filtering.
        temperature: Divisor applied to the candidate logits.

    Returns:
        Tuple ``(next_token, candidate_index, candidate_soft)``: the sampled
        index into the flattened logits, the candidate indices, and their
        final probabilities.
    """

    def softmax(logits: np.ndarray) -> np.ndarray:
        # Subtract the maximum before exponentiating for numerical stability.
        exponentials = np.exp(logits - logits.max())
        return (exponentials / np.sum(exponentials)).astype(np.float64)

    def top_p(probs: np.ndarray, p: float) -> np.ndarray:
        # Walk candidates from most to least likely; once the accumulated mass
        # has reached p, zero every remaining candidate, then renormalize.
        kept = probs.copy()
        accumulated = 0
        for position in np.argsort(probs)[::-1]:
            if accumulated >= p:
                kept[position] = 0
            accumulated += kept[position]
        return kept / accumulated

    logits = data.astype(np.float32).flatten()

    # Unordered indices of the topk largest logits.
    candidate_index = np.argpartition(logits, -topk)[-topk:]
    candidate_logits = logits[candidate_index]
    candidate_logits /= temperature

    candidate_soft = top_p(softmax(candidate_logits), topp)
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()

    drawn = np.random.multinomial(1, candidate_soft).argmax()
    return candidate_index[drawn], candidate_index, candidate_soft
|
|
|
|
|
|
|
|
class LLM:
    """Vision-language generation driver built on axengine sessions.

    The constructor loads one ``InferenceSession`` per decoder layer, one for
    the final "post" model, one for the vision encoder, plus the token
    embedding table. ``generate`` then performs image encoding, chunked
    prefill and token-by-token decoding, yielding decoded text pieces.
    """

    # Number of <IMG_CONTEXT> placeholders written per image; the same number
    # of embedding rows is overwritten with vision features in ``generate``.
    _IMG_CONTEXT_TOKENS = 256
    # Token id searched for to locate each image span; the vision features are
    # written starting one position after it.
    # NOTE(review): presumably the id of "<img>" for this tokenizer - confirm.
    _IMG_START_TOKEN_ID = 151665
    # Prompts with this many tokens or more are rejected in ``generate``.
    _MAX_PROMPT_TOKENS = 2048 + 128
    # Number of buffered tokens that triggers a decoded chunk being yielded.
    _FLUSH_EVERY = 3

    def __init__(self, hf_model_path, axmodel_path, vit_axmodel_path):
        """Load tokenizer, config, model sessions and the embedding table.

        Args:
            hf_model_path: Location passed to ``AutoConfig`` / ``AutoTokenizer``.
            axmodel_path: Directory containing ``qwen2_p128_l{i}_together.axmodel``
                for every layer, ``qwen2_post.axmodel`` and
                ``model.embed_tokens.weight.npy``.
            vit_axmodel_path: Path of the vision-encoder model file.
        """
        self.hf_model_path = hf_model_path
        # Selects the prompt layout in ``prompt_encode`` ("image" or "video").
        self.tag = "image"

        config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(hf_model_path, trust_remote_code=True, use_fast=False)
        self.cfg = config.llm_config

        self.prefill_slice_len = 128
        self.kv_cache_len = 2559

        self.prefill_decoder_sessins = []
        for i in tqdm(range(self.cfg.num_hidden_layers), desc="Init InferenceSession"):
            session = InferenceSession(
                f"{axmodel_path}/qwen2_p128_l{i}_together.axmodel"
            )
            self.prefill_decoder_sessins.append(session)

        self.post_process_session = InferenceSession(
            f"{axmodel_path}/qwen2_post.axmodel"
        )
        print("model load done!")

        # Width of one cached K/V row: head_dim * number of key/value heads.
        self.kv_dim = self.cfg.hidden_size // self.cfg.num_attention_heads * self.cfg.num_key_value_heads

        self.vit_session = InferenceSession(vit_axmodel_path)

        self.embeds = np.load(f"{axmodel_path}/model.embed_tokens.weight.npy")

        self.stop = False

    def stop_generate(self):
        """Ask a running ``generate`` call to stop at its next checkpoint."""
        self.stop = True

    def image_encode(self, images_list):
        """Preprocess images and run the vision encoder on each of them.

        Args:
            images_list: Iterable of images, or ``None``.

        Returns:
            List with one copied encoder output per image; empty when there
            are no images.
        """
        pixel_values_list = []
        vit_output_list = []
        if images_list is not None:
            for img in images_list:
                pixel_values = pre_process(img, input_size=448, max_num=1)
                pixel_values_list.append(pixel_values)
            print(f"输入图像数: {len(pixel_values_list)}")
            print("preprocess image done!")

        for pixel_values in pixel_values_list:
            vit_output = self.vit_session.run(None, {"image": pixel_values})[0]
            vit_output_list.append(vit_output.copy())

        # Only report a shape when at least one output exists; indexing [0]
        # unconditionally raised IndexError for an empty input.
        if vit_output_list:
            print(f"vit_output.shape is {vit_output_list[0].shape}, vit feature extract done!")

        return vit_output_list

    def prompt_encode(self, question, num_of_images) -> list:
        """Build the chat prompt and tokenize it.

        A single user turn is produced that contains the question and one
        ``<img>...</img>`` placeholder span per image, followed by the
        assistant header. With ``self.tag == "video"`` the spans are labelled
        ``Frame{n}:`` and precede the question; otherwise they follow it.
        With zero images the turn contains only the question.

        Args:
            question: User text.
            num_of_images: Number of image placeholder spans to insert.

        Returns:
            Token ids returned by ``self.tokenizer.encode``.
        """
        image_span = "<img>" + "<IMG_CONTEXT>" * self._IMG_CONTEXT_TOKENS + "</img>\n"

        parts = ["<|im_start|>system\n你是书生·万象, 英文名是InternVL, 是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型.<|im_end|>\n"]
        if self.tag == "video":
            parts.append("<|im_start|>user")
            for idx in range(num_of_images):
                parts.append(f"\nFrame{idx+1}: " + image_span)
            parts.append(f"\n{question}<|im_end|>\n<|im_start|>assistant\n")
        else:
            parts.append("<|im_start|>user\n" + question)
            for _ in range(num_of_images):
                parts.append("\n" + image_span)
            parts.append("<|im_end|>\n<|im_start|>assistant\n")
        prompt = "".join(parts)

        token_ids = self.tokenizer.encode(prompt)
        print(f"prompt is {prompt}, \ntoken_len is {len(token_ids)}")
        return token_ids

    def _load_sources(self, sources, video_segments):
        """Turn the ``sources`` argument of ``generate`` into a list of images."""
        if isinstance(sources, str) and is_video_file(sources):
            return load_video_opencv(sources, num_segments=video_segments)

        if (isinstance(sources, list) and len(sources) == 1
                and isinstance(sources[0], str) and is_video_file(sources[0])):
            return load_video_opencv(sources[0], num_segments=video_segments)

        if isinstance(sources, str) and is_image_file(sources):
            return [load_image(sources)]

        if isinstance(sources, np.ndarray):
            return [sources]

        if isinstance(sources, list):
            images_list = []
            for img in sources:
                if isinstance(img, str):
                    images_list.append(load_image(img))
                elif isinstance(img, np.ndarray):
                    images_list.append(img)
                else:
                    raise ValueError(f"Unsupported image type: {type(img)}")
            return images_list

        raise ValueError("Unsupported input format for 'sources'.")

    def _flush_tokens(self, token_ids_cached):
        """Decode and clear the buffered tokens, dropping U+FFFD characters."""
        msg = self.tokenizer.decode(token_ids_cached)
        token_ids_cached.clear()
        return msg.replace("\ufffd", "")

    def generate(self, sources, prompt, video_segments=8):
        """Generate a reply for ``prompt`` about ``sources``, yielding text chunks.

        Args:
            sources: A video path, an image path, an image array, or a list of
                image paths / arrays (a one-element list holding a video path
                is treated as a video).
            prompt: User question.
            video_segments: Number of frames sampled when ``sources`` is a video.

        Yields:
            Decoded text, emitted every few tokens and once more at the end.

        Raises:
            ValueError: For unsupported ``sources``.
            AssertionError: If the tokenized prompt is too long.
        """
        self.stop = False

        images_list = self._load_sources(sources, video_segments)
        vit_output_list = self.image_encode(images_list)
        token_ids = self.prompt_encode(prompt, len(vit_output_list))

        token_len = len(token_ids)
        assert token_len < self._MAX_PROMPT_TOKENS, f"输入 prompt({token_len}) 超过最大限度!"

        num_layers = self.cfg.num_hidden_layers
        hidden_size = self.cfg.hidden_size
        k_caches = [
            np.zeros((1, self.kv_cache_len, self.kv_dim), dtype=bfloat16)
            for _ in range(num_layers)
        ]
        v_caches = [
            np.zeros((1, self.kv_cache_len, self.kv_dim), dtype=bfloat16)
            for _ in range(num_layers)
        ]

        # Look up text embeddings, then overwrite each image placeholder span
        # with the corresponding vision-encoder output.
        image_start_indices = np.where(np.array(token_ids) == self._IMG_START_TOKEN_ID)[0].tolist()
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        for idx, image_start_index in enumerate(image_start_indices):
            image_insert_index = image_start_index + 1
            prefill_data[image_insert_index : image_insert_index + self._IMG_CONTEXT_TOKENS] = (
                vit_output_list[idx][0, :, :]
            )

        print("prefill token_len: ", token_len)

        # ---- prefill: feed the prompt in fixed-size slices ----
        prefill_slice_len = self.prefill_slice_len
        # Ceil division: a prompt whose length is an exact multiple of the
        # slice length must not get an extra, empty slice - its padded rows
        # would otherwise be what the post model reads below.
        num_slices = (token_len + prefill_slice_len - 1) // prefill_slice_len

        data = None
        for slice_index in tqdm(range(num_slices), desc="prefill"):
            slice_start = slice_index * prefill_slice_len
            slice_end = slice_start + prefill_slice_len
            # Number of real (non-padding) tokens in this slice.
            remain_len = min(prefill_slice_len, token_len - slice_start)

            indices = np.arange(slice_start, slice_end, dtype=np.uint32).reshape((1, prefill_slice_len))

            # -65536 blocks a position, 0 exposes it; row i of a real token
            # may see every position up to and including itself.
            mask = np.full((1, prefill_slice_len, slice_end), -65536, dtype=np.float32)
            for row in range(remain_len):
                mask[:, row, : slice_start + row + 1] = 0
            mask = mask.astype(bfloat16)

            data = np.zeros((1, prefill_slice_len, hidden_size)).astype(bfloat16)
            data[0, :remain_len, :] = prefill_data[slice_start : slice_start + remain_len]

            for i in range(num_layers):
                if slice_index:
                    k_input = k_caches[i][:, 0:slice_start, :]
                    v_input = v_caches[i][:, 0:slice_start, :]
                else:
                    # First slice has no history; a 1-row zero placeholder is fed.
                    k_input = np.zeros((1, 1, hidden_size), dtype=bfloat16)
                    v_input = np.zeros((1, 1, hidden_size), dtype=bfloat16)
                input_feed = {
                    "K_cache": k_input,
                    "V_cache": v_input,
                    "indices": indices,
                    "input": data,
                    "mask": mask,
                }
                outputs = self.prefill_decoder_sessins[i].run(None, input_feed, shape_group=slice_index + 1)
                k_caches[i][:, slice_start : slice_start + remain_len, :] = outputs[0][:, :remain_len, :]
                v_caches[i][:, slice_start : slice_start + remain_len, :] = outputs[1][:, :remain_len, :]
                data = outputs[2]

            if self.stop:
                return

        # Hidden state of the last real prompt token inside the final slice.
        last_row = token_len - (num_slices - 1) * prefill_slice_len - 1
        post_out = self.post_process_session.run(
            None,
            {"input": data[:, last_row, None, :]},
        )[0]
        next_token, _, _ = post_process(post_out)
        token_ids.append(next_token)

        eos_token_id = self.tokenizer.eos_token_id
        # The very first sampled token may already be end-of-sequence.
        if next_token == eos_token_id:
            return

        token_ids_cached = [next_token]

        # ---- decode: one token per step ----
        mask = np.zeros((1, 1, self.kv_cache_len + 1), dtype=np.float32).astype(bfloat16)
        mask[:, :, :self.kv_cache_len] -= 65536
        mask[:, :, :token_len] = 0

        for start_indice in range(token_len, self.kv_cache_len):
            next_token = token_ids[start_indice]
            indices = np.array([start_indice], np.uint32).reshape((1, 1))
            data = self.embeds[next_token, :].reshape((1, 1, hidden_size)).astype(bfloat16)
            for i in range(num_layers):
                input_feed = {
                    "K_cache": k_caches[i],
                    "V_cache": v_caches[i],
                    "indices": indices,
                    "input": data,
                    "mask": mask,
                }
                outputs = self.prefill_decoder_sessins[i].run(None, input_feed, shape_group=0)
                k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
                v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
                data = outputs[2]
            # The position just written becomes visible to later steps.
            mask[..., start_indice] = 0

            post_out = self.post_process_session.run(None, {"input": data})[0]
            next_token, _, _ = post_process(post_out)
            token_ids.append(next_token)

            if next_token == eos_token_id:
                if token_ids_cached:
                    yield self._flush_tokens(token_ids_cached)
                break

            token_ids_cached.append(next_token)
            if len(token_ids_cached) >= self._FLUSH_EVERY:
                yield self._flush_tokens(token_ids_cached)

            if self.stop:
                return

        # KV cache exhausted without EOS: emit whatever is still buffered.
        if token_ids_cached:
            yield self._flush_tokens(token_ids_cached)
|
|
|