InternVL3-2B / llm.py

qqc1989

Upload 18 files

6bced81 verified 6 months ago

18.5 kB

	import cv2
	from transformers import AutoTokenizer, AutoConfig
	import numpy as np
	from ml_dtypes import bfloat16
	from axengine import InferenceSession
	from tqdm import tqdm
	# from decord import VideoReader

	def img_preprocess(img, input_size):
	IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
	IMAGENET_STD = np.array((0.229, 0.224, 0.225), dtype=np.float32)
	img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
	img = cv2.resize(img, (input_size, input_size))
	img = img.astype(np.float32) / 255.0
	img = (img - IMAGENET_MEAN) / IMAGENET_STD
	img = img.transpose(2, 0, 1).reshape(1, 3, input_size, input_size)
	return img

	def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
	best_ratio_diff = float('inf')
	best_ratio = (1, 1)
	area = width * height
	for ratio in target_ratios:
	target_aspect_ratio = ratio[0] / ratio[1]
	ratio_diff = abs(aspect_ratio - target_aspect_ratio)
	if ratio_diff < best_ratio_diff:
	best_ratio_diff = ratio_diff
	best_ratio = ratio
	elif ratio_diff == best_ratio_diff:
	if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
	best_ratio = ratio
	return best_ratio

	def dynamic_preprocess(image:np.array, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
	orig_height, orig_width, = image.shape[:2]
	aspect_ratio = orig_width / orig_height

	# calculate the existing image aspect ratio
	target_ratios = set(
	(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
	i * j <= max_num and i * j >= min_num)
	target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

	# find the closest aspect ratio to the target
	target_aspect_ratio = find_closest_aspect_ratio(
	aspect_ratio, target_ratios, orig_width, orig_height, image_size)

	# calculate the target width and height
	target_width = image_size * target_aspect_ratio[0]
	target_height = image_size * target_aspect_ratio[1]
	blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

	# resize the image
	# resized_img = image.resize((target_width, target_height))
	resized_img = cv2.resize(image, (target_width, target_height))
	processed_images = []
	for i in range(blocks):
	box = (
	(i % (target_width // image_size)) * image_size,
	(i // (target_width // image_size)) * image_size,
	((i % (target_width // image_size)) + 1) * image_size,
	((i // (target_width // image_size)) + 1) * image_size
	)
	# split the image
	# split_img = resized_img.crop(box)
	split_img = resized_img[box[1]:box[3], box[0]:box[2]]
	processed_images.append(split_img)
	assert len(processed_images) == blocks
	if use_thumbnail and len(processed_images) != 1:
	# thumbnail_img = image.resize((image_size, image_size))
	thumbnail_img = cv2.resize(image, (image_size, image_size))
	processed_images.append(thumbnail_img)
	return processed_images

	def pre_process(image, input_size=448, max_num=12):
	images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
	pixel_values = [img_preprocess(image, input_size) for image in images]
	pixel_values = np.concatenate(pixel_values, axis=0)
	return pixel_values

	def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
	if bound:
	start, end = bound[0], bound[1]
	else:
	start, end = -100000, 100000
	start_idx = max(first_idx, round(start * fps))
	end_idx = min(round(end * fps), max_frame)
	seg_size = float(end_idx - start_idx) / num_segments
	frame_indices = np.array([
	int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
	for idx in range(num_segments)
	])
	return frame_indices


	def load_video_opencv(video_path, bound=None, num_segments=32):
	cap = cv2.VideoCapture(video_path)
	if not cap.isOpened():
	raise IOError(f"Cannot open video: {video_path}")

	fps = cap.get(cv2.CAP_PROP_FPS)
	max_frame = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

	frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)

	images_list = []
	for frame_index in frame_indices:
	cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
	ret, frame = cap.read()
	if not ret:
	print(f"⚠ Failed to read frame {frame_index}")
	continue
	images_list.append(frame)

	cap.release()
	return images_list


	def is_video_file(path):
	return str(path).lower().endswith((".mp4", ".avi", ".mov", ".mkv", ".webm"))

	def is_image_file(path):
	return str(path).lower().endswith((".jpg", ".png", ".jpeg", ".webp"))

	def load_image(path):
	image = cv2.imread(str(path))
	if image is None:
	raise ValueError(f"Image {path} not found or cannot be read.")
	return image

	def post_process(data, topk=1, topp=0.9, temperature=0.6):
	def top_p(l: np.ndarray, p: float) -> np.ndarray:
	index = np.argsort(l)
	res = l.copy()
	sum_p = 0
	for i in index[::-1]:
	if sum_p >= p:
	res[i] = 0
	sum_p += res[i]
	return res / sum_p

	def softmax(l: np.ndarray) -> np.ndarray:
	l_max = l - l.max()
	l_exp = np.exp(l_max)
	res = l_exp / np.sum(l_exp)
	return res.astype(np.float64)

	r = data.astype(np.float32)
	r = r.flatten()
	# topk
	candidate_index = np.argpartition(r, -topk)[-topk:]
	candidate_value = r[candidate_index]
	# temperature
	candidate_value /= temperature
	# softmax
	candidate_soft = softmax(candidate_value)
	# topp
	candidate_soft = top_p(candidate_soft, topp)
	candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
	pos = np.random.multinomial(1, candidate_soft).argmax()
	next_token = candidate_index[pos]
	return next_token, candidate_index, candidate_soft


	class LLM:

	def __init__(self, hf_model_path, axmodel_path, vit_axmodel_path ):
	self.hf_model_path = hf_model_path
	self.tag = "image"

	config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)
	self.tokenizer = AutoTokenizer.from_pretrained(hf_model_path, trust_remote_code=True, use_fast=False)
	self.cfg = config.llm_config

	self.prefill_slice_len=128
	self.kv_cache_len=2559

	self.prefill_decoder_sessins = []
	for i in tqdm(range(self.cfg.num_hidden_layers), desc="Init InferenceSession"):
	session = InferenceSession(
	f"{axmodel_path}/qwen2_p128_l{i}_together.axmodel"
	)
	self.prefill_decoder_sessins.append(session)

	self.post_process_session = InferenceSession(
	f"{axmodel_path}/qwen2_post.axmodel"
	)
	print("model load done!")

	self.kv_dim = self.cfg.hidden_size // self.cfg.num_attention_heads * self.cfg.num_key_value_heads


	self.vit_session = InferenceSession(vit_axmodel_path)

	self.embeds = np.load(f"{axmodel_path}/model.embed_tokens.weight.npy")

	self.stop = False

	def stop_generate(self):
	self.stop = True

	def image_encode(self, images_list):
	pixel_values_list = []
	vit_output_list = []
	if images_list is not None:
	for img in images_list:
	pixel_values = pre_process(img, input_size=448, max_num=1)
	pixel_values_list.append(pixel_values)
	print(f"输入图像数: {len(pixel_values_list)}")
	print("preprocess image done!")

	# extract img feature by vit


	for idx, pixel_values in enumerate(pixel_values_list):
	vit_output = self.vit_session.run(None, {"image": pixel_values})[0]
	vit_output_list.append(vit_output.copy()) # 避免 vit 输出结果使用同一块内存

	print(f"vit_output.shape is {vit_output_list[0].shape}, vit feature extract done!")

	return vit_output_list

	def prompt_encode(self, question, num_of_images) -> list:
	prompt = "<\|im_start\|>system\n你是书生·万象, 英文名是InternVL, 是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型.<\|im_end\|>\n"
	# question = args.question

	if num_of_images > 0:
	for idx in range(num_of_images):
	if self.tag == "video":
	prompt += "<\|im_start\|>user"
	prompt += f"\nFrame{idx+1}: <img>" + "<IMG_CONTEXT>" * 256 + "</img>\n"
	prompt += f"\n{question}<\|im_end\|>\n<\|im_start\|>assistant\n"
	else:
	prompt += "<\|im_start\|>user\n" + question
	prompt += "\n<img>" + "<IMG_CONTEXT>" * 256 + "</img>\n"
	prompt += "<\|im_end\|>\n<\|im_start\|>assistant\n"

	token_ids = self.tokenizer.encode(prompt)
	print(f"prompt is {prompt}, \ntoken_len is {len(token_ids)}")
	return token_ids


	def generate(self, sources, prompt, video_segments=8):
	self.stop = False
	images_list = []

	# 1. Handle single video path string
	if isinstance(sources, str) and is_video_file(sources):
	images_list = load_video_opencv(sources, num_segments=video_segments)

	# 2. Handle [video_path] list
	elif isinstance(sources, list) and len(sources) == 1 and isinstance(sources[0], str) and is_video_file(sources[0]):
	images_list = load_video_opencv(sources[0], num_segments=video_segments)

	# 3. Handle single image path
	elif isinstance(sources, str) and is_image_file(sources):
	images_list = [load_image(sources)]

	# 4. Handle single image as np.ndarray
	elif isinstance(sources, np.ndarray):
	images_list = [sources]

	# 5. Handle list of images or paths
	elif isinstance(sources, list):
	for img in sources:
	if isinstance(img, str):
	images_list.append(load_image(img))
	elif isinstance(img, np.ndarray):
	images_list.append(img)
	else:
	raise ValueError(f"Unsupported image type: {type(img)}")
	else:
	raise ValueError("Unsupported input format for 'sources'.")

	vit_output_list = self.image_encode(images_list)

	token_ids = self.prompt_encode(prompt, len(vit_output_list))

	k_caches = [
	np.zeros((1, self.kv_cache_len, self.kv_dim), dtype=bfloat16)
	for _ in range(self.cfg.num_hidden_layers)
	]
	v_caches = [
	np.zeros((1, self.kv_cache_len, self.kv_dim), dtype=bfloat16)
	for _ in range(self.cfg.num_hidden_layers)
	]

	# 图像理解
	image_start_indices = np.where(np.array(token_ids) == 151665)[0].tolist() # <img> tag

	prefill_data = np.take(self.embeds, token_ids, axis=0)
	prefill_data = prefill_data.astype(bfloat16)
	token_len = len(token_ids)

	assert token_len < 2048 + 128, f"输入 prompt({token_len}) 超过最大限度!"
	for idx, image_start_index in enumerate(image_start_indices):
	image_insert_index = image_start_index + 1
	prefill_data[image_insert_index : image_insert_index + 256] = vit_output_list[idx][0, :, :]
	##################################
	print("prefill token_len: ", token_len)

	"""
	prefill
	"""
	prefill_slice_len = self.prefill_slice_len
	# slice_indexs = [0, 1, 2, 3, 4, 5, 6, 7, 8]
	slice_indexs = [
	e for e in range(token_len // prefill_slice_len + 1)
	]
	# print(f"slice_indexs is {slice_indexs}")
	prefill_len = prefill_slice_len * slice_indexs[-1] if slice_indexs[-1] != 0 else prefill_slice_len # 这里的 128 就是 prefill_slice_len

	if prefill_len > 0:
	for slice_index in tqdm(slice_indexs, desc="prefill"):
	indices = np.array(
	list(
	range(
	slice_index * prefill_slice_len,
	(slice_index + 1) * prefill_slice_len,
	)
	),
	np.uint32,
	).reshape((1, prefill_slice_len))

	mask = (
	np.zeros((1, prefill_slice_len, prefill_slice_len * (slice_index + 1)))
	- 65536
	)
	data = np.zeros((1, prefill_slice_len, self.cfg.hidden_size)).astype(bfloat16)
	for i, t in enumerate(
	range(
	slice_index * prefill_slice_len,
	(slice_index + 1) * prefill_slice_len,
	)
	):
	if t < len(token_ids):
	mask[:, i, : slice_index * prefill_slice_len + i + 1] = 0
	data[:, i : i + 1, :] = (
	prefill_data[t]
	.reshape((1, 1, self.cfg.hidden_size))
	.astype(bfloat16)
	)

	if slice_index == slice_indexs[-1]:
	remain_len = token_len - slice_index * prefill_slice_len
	else:
	remain_len = prefill_slice_len
	mask = mask.astype(bfloat16)
	for i in range(self.cfg.num_hidden_layers):
	input_feed = {
	"K_cache": (
	k_caches[i][:, 0 : prefill_slice_len * slice_index, :]
	if slice_index
	else np.zeros((1, 1, self.cfg.hidden_size), dtype=bfloat16)
	),
	"V_cache": (
	v_caches[i][:, 0 : prefill_slice_len * slice_index, :]
	if slice_index
	else np.zeros((1, 1, self.cfg.hidden_size), dtype=bfloat16)
	),
	"indices": indices,
	"input": data,
	"mask": mask,
	}
	outputs = self.prefill_decoder_sessins[i].run(None, input_feed, shape_group=slice_index + 1)
	k_caches[i][
	:,
	slice_index
	* prefill_slice_len : slice_index
	* prefill_slice_len + remain_len,
	:,
	] = outputs[0][:, :remain_len, :]
	v_caches[i][
	:,
	slice_index
	* prefill_slice_len : slice_index
	* prefill_slice_len + remain_len,
	:,
	] = outputs[1][:, :remain_len, :]
	data = outputs[2]

	if self.stop:
	return

	# print("slice prefill done", slice_index)
	post_out = self.post_process_session.run(
	None,
	{
	"input": data[
	:, token_len - (len(slice_indexs) - 1) * prefill_slice_len - 1, None, :
	]
	}
	)[0]
	next_token, posssible_tokens, possible_soft = post_process(post_out)
	posibles = [self.tokenizer.decode([t]) for t in posssible_tokens]
	posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
	token_ids.append(next_token)

	# set to decoder
	token_ids_cached = []
	token_ids_cached.append(next_token)

	mask = np.zeros((1, 1, self.kv_cache_len + 1), dtype=np.float32).astype(bfloat16)
	mask[:, :, :self.kv_cache_len] -= 65536
	if prefill_len > 0:
	mask[:, :, :token_len] = 0

	for start_indice in range(self.kv_cache_len):
	if prefill_len > 0 and start_indice < token_len:
	continue

	next_token = token_ids[start_indice]
	indices = np.array([start_indice], np.uint32).reshape((1, 1))
	data = self.embeds[next_token, :].reshape((1, 1, self.cfg.hidden_size)).astype(bfloat16)
	for i in range(self.cfg.num_hidden_layers):
	input_feed = {
	"K_cache": k_caches[i],
	"V_cache": v_caches[i],
	"indices": indices,
	"input": data,
	"mask": mask,
	}
	outputs = self.prefill_decoder_sessins[i].run(None, input_feed, shape_group=0)
	k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
	v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
	data = outputs[2]
	mask[..., start_indice] = 0
	if start_indice < token_len - 1:
	pass
	else:
	post_out = self.post_process_session.run(None, {"input": data})[0]
	next_token, posssible_tokens, possible_soft = post_process(post_out)
	token_ids.append(next_token)

	if next_token == self.tokenizer.eos_token_id and next_token > token_len:
	if len(token_ids_cached) > 0:
	msg = self.tokenizer.decode(token_ids_cached)
	token_ids_cached.clear()
	if "\ufffd" in msg:
	msg = msg.replace("\ufffd", "")
	# print(msg, end="", flush=True)
	yield msg
	break

	token_ids_cached.append(next_token)

	if len(token_ids_cached) >= 3:
	msg = self.tokenizer.decode(token_ids_cached)
	token_ids_cached.clear()
	if "\ufffd" in msg:
	msg = msg.replace("\ufffd", "")
	# print(msg, end="", flush=True)
	yield msg


	if self.stop:
	return