Spaces:

MashiroLn
/

custom_toolbox

Running

App Files Files Community

custom_toolbox / apps /text_tools.py

MashiroLn

Upload folder using huggingface_hub

20092ea verified 17 days ago

raw

history blame

14 kB

	import gradio as gr
	import math
	import json
	import os
	from transformers import AutoTokenizer

	# Prefer the official helper from qwen_vl_utils; fall back to a local
	# re-implementation so the app still works when that package is missing.
	try:
	    from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
	except ImportError:
	    def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=1280 * 1280):
	        """
	        Local stand-in for qwen_vl_utils.vision_process.smart_resize.

	        Snaps both sides to a multiple of ``factor`` and rescales (keeping the
	        aspect ratio approximately) so the pixel area lands inside
	        ``[min_pixels, max_pixels]``.

	        Args:
	            height: Original height in pixels (must be > 0).
	            width: Original width in pixels (must be > 0).
	            factor: Grid size both output sides are aligned to.
	            min_pixels: Lower bound on the output area.
	            max_pixels: Upper bound on the output area.

	        Returns:
	            Tuple ``(resized_height, resized_width)``.

	        Raises:
	            ValueError: If ``height`` or ``width`` is not positive.

	        Note:
	            For aspect ratios above 200 this fallback relaxes ``factor`` to 1
	            instead of rejecting the input.
	        """
	        if height <= 0 or width <= 0:
	            # Fail with a clear message instead of a ZeroDivisionError below.
	            raise ValueError(f"height and width must be positive, got {height}x{width}")

	        if max(height, width) / min(height, width) > 200:
	            factor = 1  # For extreme aspect ratios

	        # Clamp to at least one grid cell: without this a side shorter than
	        # factor / 2 rounds to 0 and the min_pixels branch is taken by mistake.
	        h_bar = max(factor, round(height / factor) * factor)
	        w_bar = max(factor, round(width / factor) * factor)

	        if h_bar * w_bar > max_pixels:
	            # Too large: shrink, rounding down so the area stays under the cap.
	            beta = math.sqrt((height * width) / max_pixels)
	            h_bar = math.floor(height / beta / factor) * factor
	            w_bar = math.floor(width / beta / factor) * factor
	        elif h_bar * w_bar < min_pixels:
	            # Too small: enlarge, rounding up so the area reaches the floor.
	            beta = math.sqrt(min_pixels / (height * width))
	            h_bar = math.ceil(height * beta / factor) * factor
	            w_bar = math.ceil(width * beta / factor) * factor

	        return h_bar, w_bar

	# --- Tokenizer Loading Logic ---
	# Successfully loaded tokenizers, keyed by the model name shown in the UI.
	TOKENIZERS = {}

	def get_tokenizer(model_name):
	    """
	    Return the tokenizer for ``model_name``, loading it on first use.

	    Successful loads are stored in ``TOKENIZERS`` and reused. Unknown model
	    names and load failures both yield ``None``; a failure is printed and is
	    not cached, so the next call tries again.
	    """
	    try:
	        return TOKENIZERS[model_name]
	    except KeyError:
	        pass

	    # UI model name -> Hugging Face repo that provides its tokenizer.
	    repo_by_model = {
	        # Qwen2-VL uses Qwen2 tokenizer
	        "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
	        # Llava-1.6 based on Vicuna/Llama-2
	        "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf",
	    }
	    repo_id = repo_by_model.get(model_name)
	    if repo_id is None:
	        return None

	    try:
	        loaded = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
	    except Exception as e:
	        print(f"Error loading tokenizer for {model_name}: {e}")
	        return None

	    TOKENIZERS[model_name] = loaded
	    return loaded

	# --- Token Calculation Logic ---

	def calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, max_pixels):
	    """
	    Estimate the prompt token count for Qwen2-VL / Qwen2.5-VL.

	    Text is tokenized with ``tokenizer`` when one is given; otherwise a rough
	    ``len(text) // 2`` estimate is used. Each image / video frame is resized
	    with ``qwen_smart_resize`` (28 px grid) and cut into 14x14 patches. The
	    vision tower merges every 2x2 patch neighbourhood into one LLM token and
	    consumes video frames in temporal pairs, so:

	        image tokens = (H / 14) * (W / 14) / 4
	        video tokens = ceil(frames / 2) * (H / 14) * (W / 14) / 4

	    Args:
	        text: Prompt text.
	        image_groups: Iterable of dicts with 'count', 'width', 'height'.
	            Groups whose count is <= 0 are skipped.
	        videos: Iterable of dicts with 'frames', 'width', 'height'.
	            Videos whose frame count is <= 0 are skipped.
	        tokenizer: Object exposing ``encode(text)``, or a falsy value to use
	            the length-based estimate.
	        max_pixels: Upper bound on the resized area of one image / frame.

	    Returns:
	        Tuple ``(total_tokens, text_token_ids, breakdown, media_details)``.
	        ``text_token_ids`` is empty when no tokenizer was used.
	    """
	    patch_size = 14          # ViT patch edge in pixels
	    merge_size = 2           # merge_size x merge_size patches become one LLM token
	    temporal_patch_size = 2  # video frames are consumed in groups of this size
	    resize_factor = patch_size * merge_size  # resized sides align to 28 px

	    def frame_tokens(height, width):
	        """Resize one frame and return (new_h, new_w, LLM tokens for that frame)."""
	        new_h, new_w = qwen_smart_resize(
	            height, width, factor=resize_factor, min_pixels=56 * 56, max_pixels=max_pixels
	        )
	        patches = (new_h // patch_size) * (new_w // patch_size)
	        return new_h, new_w, patches // (merge_size * merge_size)

	    # 1. Text Tokens (Real Calculation)
	    text_tokens_ids = []
	    if tokenizer:
	        text_tokens_ids = tokenizer.encode(text)
	        text_tokens_count = len(text_tokens_ids)
	    else:
	        # Fallback: crude estimate when no tokenizer could be loaded
	        text_tokens_count = len(text) // 2

	    # 2. Image Tokens
	    image_tokens_count = 0
	    image_details = []
	    for group in image_groups:
	        width, height = group['width'], group['height']
	        count = int(group['count'])
	        if count <= 0:
	            continue

	        new_h, new_w, img_tokens = frame_tokens(height, width)
	        group_tokens = img_tokens * count
	        image_tokens_count += group_tokens

	        image_details.append({
	            "count": count,
	            "original_size": [width, height],
	            "resized_size": [new_w, new_h],
	            "tokens_per_image": img_tokens,
	            "total_tokens": group_tokens
	        })

	    # 3. Video Tokens
	    video_tokens_count = 0
	    video_details = []
	    for vid in videos:
	        frames = int(vid['frames'])
	        if frames <= 0:
	            # Mirrors the image-count guard above: nothing to count.
	            continue
	        width, height = vid['width'], vid['height']

	        new_h, new_w, per_step_tokens = frame_tokens(height, width)
	        # An odd trailing frame still occupies a full temporal step.
	        temporal_steps = (frames + temporal_patch_size - 1) // temporal_patch_size
	        vid_total = temporal_steps * per_step_tokens
	        video_tokens_count += vid_total

	        video_details.append({
	            "original_size": [width, height],
	            "resized_size": [new_w, new_h],
	            "frames": frames,
	            "tokens": vid_total
	        })

	    total_tokens = text_tokens_count + image_tokens_count + video_tokens_count

	    breakdown = {
	        "text_tokens": text_tokens_count,
	        "image_tokens": image_tokens_count,
	        "video_tokens": video_tokens_count
	    }

	    media_details = {
	        "images": image_details,
	        "videos": video_details
	    }

	    return total_tokens, text_tokens_ids, breakdown, media_details

	def calculate_llava_next_tokens(text, image_groups, tokenizer, max_pixels):
	    """
	    Estimate the prompt token count for Llava-1.6 (Next).

	    Text is tokenized with ``tokenizer`` when one is given, otherwise estimated
	    as ``len(text) // 2``. Each image is first shrunk to fit ``max_pixels``
	    (when that limit is positive), then covered with 336 px tiles; it costs
	    ``(tiles + 1) * 576`` tokens.

	    Returns:
	        Tuple ``(total_tokens, text_token_ids, breakdown, media_details)``;
	        the video entries are always zero / empty for this model.
	    """
	    tile_edge = 336
	    tokens_per_tile = 576

	    # 1. Text Tokens
	    if tokenizer:
	        text_tokens_ids = tokenizer.encode(text)
	        n_text = len(text_tokens_ids)
	    else:
	        text_tokens_ids = []
	        n_text = len(text) // 2

	    # 2. Image Tokens
	    n_image = 0
	    per_group = []
	    for entry in image_groups:
	        src_w, src_h = entry['width'], entry['height']
	        n_copies = int(entry['count'])
	        if n_copies <= 0:
	            continue

	        # Shrink to the pixel budget first, if one is set and exceeded.
	        fit_w, fit_h = src_w, src_h
	        if max_pixels > 0 and (src_w * src_h > max_pixels):
	            shrink = math.sqrt(max_pixels / (src_w * src_h))
	            fit_w = int(src_w * shrink)
	            fit_h = int(src_h * shrink)

	        tiles_x = math.ceil(fit_w / tile_edge)
	        tiles_y = math.ceil(fit_h / tile_edge)
	        # One extra tile on top of the grid, as in the documented formula.
	        per_image = (tiles_x * tiles_y + 1) * tokens_per_tile
	        group_total = per_image * n_copies
	        n_image += group_total

	        per_group.append({
	            "count": n_copies,
	            "original_size": [src_w, src_h],
	            "resized_size": [fit_w, fit_h],
	            "grid_patches": f"{tiles_x}x{tiles_y}",
	            "tokens_per_image": per_image,
	            "total_tokens": group_total
	        })

	    breakdown = {
	        "text_tokens": n_text,
	        "image_tokens": n_image,
	        "video_tokens": 0
	    }
	    media_details = {
	        "images": per_group,
	        "videos": []
	    }

	    return n_text + n_image, text_tokens_ids, breakdown, media_details

	# --- Actual UI Logic ---

	def _parse_image_groups(args):
	    """Turn the flat (count, width, height, ...) argument tail into image-group dicts."""
	    groups = []
	    # Walk complete triplets only; a trailing partial triplet is ignored.
	    for idx in range(0, len(args) - 2, 3):
	        try:
	            count, width, height = (int(v) for v in args[idx:idx + 3])
	        except (TypeError, ValueError, OverflowError):
	            # Cleared or non-numeric UI field: skip this group, keep the rest.
	            continue
	        # Non-positive sizes cannot be resized, so such groups are dropped too.
	        if count > 0 and width > 0 and height > 0:
	            groups.append({'count': count, 'width': width, 'height': height})
	    return groups


	def _parse_videos(vid_count, vid_frames, vid_w, vid_h):
	    """Build one video spec per requested video; return [] when the fields are unusable."""
	    try:
	        n_videos = int(vid_count)
	    except (TypeError, ValueError, OverflowError):
	        return []
	    if n_videos <= 0:
	        return []

	    try:
	        frames, width, height = int(vid_frames), int(vid_w), int(vid_h)
	    except (TypeError, ValueError, OverflowError):
	        return []
	    if frames <= 0 or width <= 0 or height <= 0:
	        return []

	    return [{'width': width, 'height': height, 'frames': frames} for _ in range(n_videos)]


	def run_calculation(text, model, img_max_pixels, vid_count, vid_frames, vid_w, vid_h, *args):
	    """
	    Gradio callback: compute the token estimate for the selected model.

	    Args:
	        text: Prompt text (``None`` is treated as an empty string).
	        model: Model name as shown in the dropdown.
	        img_max_pixels: Pixel budget forwarded to the per-model calculator.
	        vid_count, vid_frames, vid_w, vid_h: Video settings; unusable values
	            result in no videos being counted.
	        *args: Flat image-group values ``c1, w1, h1, c2, w2, h2, ...``.

	    Returns:
	        Tuple ``(result, token_file_path)``. ``result`` is a JSON-serialisable
	        dict with the totals and per-media details. ``token_file_path`` is the
	        path of a per-request ``token_analysis.json`` listing every text
	        token, or ``None`` when no tokenizer output is available.
	    """
	    # Local import: only this function needs it.
	    import tempfile

	    text = text or ""
	    image_groups = _parse_image_groups(args)
	    videos = _parse_videos(vid_count, vid_frames, vid_w, vid_h)

	    # Get Tokenizer
	    tokenizer = get_tokenizer(model)

	    # Determine real model ID
	    model_id_map = {
	        "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
	        "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
	    }
	    real_model_id = model_id_map.get(model, model)

	    # Defaults cover the "unknown model" case.
	    text_tokens_ids = []
	    breakdown = {}
	    media_details = {}
	    tokens = 0

	    if model == "Qwen2.5-VL / Qwen2-VL":
	        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(
	            text, image_groups, videos, tokenizer, img_max_pixels
	        )
	    elif model == "Llava-1.6 (Next)":
	        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(
	            text, image_groups, tokenizer, img_max_pixels
	        )

	    # Generate Token Analysis File
	    token_file_path = None
	    if tokenizer and text_tokens_ids:
	        # Decode each id on its own so the file shows the per-token split.
	        token_data = [
	            {"id": tid, "token": tokenizer.decode([tid])}
	            for tid in text_tokens_ids
	        ]

	        # A fresh directory per request keeps concurrent users from
	        # overwriting each other's file while preserving the download name.
	        out_dir = tempfile.mkdtemp(prefix="token_analysis_")
	        token_file_path = os.path.join(out_dir, "token_analysis.json")
	        with open(token_file_path, "w", encoding="utf-8") as f:
	            json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)

	    # Construct final JSON result
	    result = {
	        "model_id": real_model_id,
	        "tokenizer_loaded": tokenizer is not None,
	        "total_tokens": tokens,
	        "breakdown": breakdown,
	        "text_stats": {
	            "char_count": len(text)
	        },
	        "media_details": media_details
	    }

	    return result, token_file_path

	def create_ui():
	    """
	    Build the token-calculator widgets and wire up their events.

	    Creates the model/text inputs, up to four image groups (groups 2-4 start
	    hidden and are revealed one per click of the add button), the video
	    settings, and the JSON / file outputs, then connects the calculate button
	    to ``run_calculation``. No ``gr.Blocks`` is opened here, so this is
	    presumably called inside a caller-provided Blocks context.
	    """
	    with gr.Row():
	        with gr.Column(scale=1):
	            model_select = gr.Dropdown(
	                choices=["Qwen2.5-VL / Qwen2-VL", "Llava-1.6 (Next)"],
	                value="Qwen2.5-VL / Qwen2-VL",
	                label="选择模型"
	            )
	            text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")

	            with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
	                # Group 1 (Always visible)
	                with gr.Row():
	                    img_c_1 = gr.Number(value=1, label="图片数量 (Group 1)", precision=0)
	                    img_w_1 = gr.Number(value=1080, label="宽 (px)")
	                    img_h_1 = gr.Number(value=1920, label="高 (px)")

	                # Group 2 (Hidden by default)
	                with gr.Row(visible=False) as group_2:
	                    img_c_2 = gr.Number(value=0, label="图片数量 (Group 2)", precision=0)
	                    img_w_2 = gr.Number(value=1024, label="宽 (px)")
	                    img_h_2 = gr.Number(value=1024, label="高 (px)")

	                # Group 3 (Hidden by default)
	                with gr.Row(visible=False) as group_3:
	                    img_c_3 = gr.Number(value=0, label="图片数量 (Group 3)", precision=0)
	                    img_w_3 = gr.Number(value=1024, label="宽 (px)")
	                    img_h_3 = gr.Number(value=1024, label="高 (px)")

	                # Group 4 (Hidden by default)
	                with gr.Row(visible=False) as group_4:
	                    img_c_4 = gr.Number(value=0, label="图片数量 (Group 4)", precision=0)
	                    img_w_4 = gr.Number(value=1024, label="宽 (px)")
	                    img_h_4 = gr.Number(value=1024, label="高 (px)")

	                add_group_btn = gr.Button("➕ 增加一组图片 (Add Group)", size="sm")

	                # State to track visible groups
	                visible_groups = gr.State(1)

	                def add_group(curr_count):
	                    """Reveal the next hidden image group and set its count to 1."""
	                    next_count = min(curr_count + 1, 4)
	                    # Only a group that becomes visible on this click is
	                    # initialised; clicking at the cap must not reset group 4.
	                    newly_shown = next_count if next_count > curr_count else None
	                    group_ids = (2, 3, 4)

	                    # Row visibility: everything up to next_count is shown.
	                    row_updates = [gr.update(visible=(idx <= next_count)) for idx in group_ids]
	                    # The count lives on the Number inside the row, so its value
	                    # must be sent to that component, not to the Row.
	                    count_updates = [
	                        gr.update(value=1) if idx == newly_shown else gr.update()
	                        for idx in group_ids
	                    ]
	                    return (next_count, *row_updates, *count_updates)

	                add_group_btn.click(
	                    add_group,
	                    [visible_groups],
	                    [
	                        visible_groups,
	                        group_2, group_3, group_4,
	                        img_c_2, img_c_3, img_c_4
	                    ]
	                )

	                with gr.Row():
	                    img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)

	            with gr.Accordion("🎥 视频设置 (Videos)", open=False):
	                with gr.Row():
	                    vid_count = gr.Number(value=0, label="视频数量", precision=0)
	                    vid_frames = gr.Number(value=16, label="总帧数/视频", precision=0)
	                    vid_w = gr.Number(value=512, label="宽 (px)")
	                    vid_h = gr.Number(value=512, label="高 (px)")

	            btn = gr.Button("🚀 计算 Token", variant="primary")

	        with gr.Column(scale=1):
	            out_json = gr.JSON(label="计算结果")
	            out_file = gr.File(label="下载 Token 分析 (JSON)")
	            gr.Markdown("""
	            ### 说明
	            * 真实 Tokenizer: 首次运行时会自动下载 `transformers` 模型配置，可能需要几秒钟。
	            * Qwen2-VL: 基于 `H/14 * W/14` 计算，自动对齐到 28px 网格。
	            * Llava-1.6: 基于 `(Patches + 1) * 576` 计算，Patch 大小为 336px。
	            """)

	    # Input order must match run_calculation's signature: the seven fixed
	    # arguments first, then (count, width, height) for each image group.
	    btn.click(
	        run_calculation,
	        [
	            text_input, model_select, img_max_pixels, vid_count, vid_frames, vid_w, vid_h,
	            img_c_1, img_w_1, img_h_1,
	            img_c_2, img_w_2, img_h_2,
	            img_c_3, img_w_3, img_h_3,
	            img_c_4, img_w_4, img_h_4
	        ],
	        [out_json, out_file]
	    )