# custom_toolbox/apps/text_tools.py
import gradio as gr
import math
import json
import os
from transformers import AutoTokenizer
# Try to import qwen_vl_utils; otherwise fall back to a local copy of the resize logic
try:
    from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
except ImportError:
    # Local copy adapted from qwen_vl_utils.vision_process.smart_resize.
    # Note: the upstream implementation rejects aspect ratios above 200 with a ValueError;
    # this copy relaxes the rounding factor instead so the tool still returns a size.
    def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=1280 * 1280):
        """
        Rescale (height, width) so that both sides are multiples of `factor`
        and the total pixel count lies within [min_pixels, max_pixels].
        Adapted from qwen_vl_utils.vision_process.smart_resize.
        """
        if max(height, width) / min(height, width) > 200:
            factor = 1  # For extreme aspect ratios
        h_bar = round(height / factor) * factor
        w_bar = round(width / factor) * factor
        if h_bar * w_bar > max_pixels:
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = math.floor(height / beta / factor) * factor
            w_bar = math.floor(width / beta / factor) * factor
        elif h_bar * w_bar < min_pixels:
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor
        return h_bar, w_bar
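# Quick sanity check of the resize behaviour (values computed by hand from the formula
# above with the default factor/min/max arguments, not verified against the upstream library):
#   qwen_smart_resize(1080, 1920)  ->  (952, 1680)
# i.e. a 1920x1080 image exceeds max_pixels after rounding, so it is scaled down and
# snapped to the 28px grid, which later maps to (1680 / 14) * (952 / 14) = 8160 ViT patches.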
# --- Tokenizer Loading Logic ---
TOKENIZERS = {}

def get_tokenizer(model_name):
    if model_name in TOKENIZERS:
        return TOKENIZERS[model_name]
    try:
        if model_name == "Qwen2.5-VL / Qwen2-VL":
            # Qwen2-VL uses the Qwen2 tokenizer
            tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
        elif model_name == "Llava-1.6 (Next)":
            # Llava-1.6 variant based on Vicuna / Llama-2
            tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf", trust_remote_code=True)
        else:
            return None
        TOKENIZERS[model_name] = tokenizer
        return tokenizer
    except Exception as e:
        print(f"Error loading tokenizer for {model_name}: {e}")
        return None
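# Illustrative usage (hypothetical snippet): the first call downloads the tokenizer from
# the Hugging Face Hub, later calls are served from the in-memory TOKENIZERS cache.
#   tok = get_tokenizer("Qwen2.5-VL / Qwen2-VL")
#   n_text_tokens = len(tok.encode("Hello world")) if tok else 0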
# --- Token Calculation Logic ---
def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
    """
    Qwen2-VL / Qwen2.5-VL token estimation.
    Counts real text tokens via the tokenizer and estimates visual tokens
    from the smart-resized image/video resolutions.
    """
    text_tokens_count = 0
    image_tokens_count = 0
    video_tokens_count = 0

    # 1. Text tokens (real tokenizer count)
    text_tokens_ids = []
    if tokenizer:
        text_tokens_ids = tokenizer.encode(text)
        text_tokens_count = len(text_tokens_ids)
    else:
        # Fallback heuristic when the tokenizer could not be loaded: ~2 characters per token
        text_tokens_count = len(text) // 2

    # 2. Image tokens
    image_details = []
    for img in images:
        width, height = img['width'], img['height']
        # Apply the official Qwen smart resize (28px grid, bounded pixel count)
        new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=max_pixels)
        # Count 14x14 ViT patches. Note: Qwen2-VL merges 2x2 patches before the LLM,
        # so the LLM-side visual token count is roughly this value divided by 4.
        grid_w = new_w // 14
        grid_h = new_h // 14
        img_tokens = grid_h * grid_w
        image_tokens_count += img_tokens
        image_details.append({
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "tokens": img_tokens
        })

    # 3. Video tokens
    video_details = []
    for vid in videos:
        frames = vid['frames']
        width, height = vid['width'], vid['height']
        # Per-frame processing mirrors the image path
        new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=max_pixels)
        grid_w = new_w // 14
        grid_h = new_h // 14
        frame_tokens = grid_h * grid_w
        vid_total = frames * frame_tokens
        video_tokens_count += vid_total
        video_details.append({
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "frames": frames,
            "tokens": vid_total
        })

    total_tokens = text_tokens_count + image_tokens_count + video_tokens_count
    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": video_tokens_count
    }
    media_details = {
        "images": image_details,
        "videos": video_details
    }
    return total_tokens, text_tokens_ids, breakdown, media_details
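# Worked example (values follow from the estimate above, not measured against the real
# processor): a single 1024x1024 image with max_pixels=1280*1280 is snapped to 1036x1036
# by smart_resize, giving (1036 // 14) ** 2 = 74 * 74 = 5476 patch tokens, so
#   calculate_qwen2_vl_tokens("", [{'width': 1024, 'height': 1024}], [], None, 1280 * 1280)[0]
# would return 5476 for the image alone.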
def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
    """
    Llava-1.6 (Next) token estimation.
    Approximates the AnyRes scheme as one 576-token feature map per 336px tile
    plus one 576-token base image; the real processor also depends on the configured
    grid pinpoints, unpadding and newline tokens, so treat this as a rough estimate.
    """
    text_tokens_count = 0
    image_tokens_count = 0

    # 1. Text tokens
    text_tokens_ids = []
    if tokenizer:
        text_tokens_ids = tokenizer.encode(text)
        text_tokens_count = len(text_tokens_ids)
    else:
        # Fallback heuristic: ~2 characters per token
        text_tokens_count = len(text) // 2

    # 2. Image tokens
    image_details = []
    for img in images:
        width, height = img['width'], img['height']
        # If max_pixels is specified, scale the image down first
        if max_pixels > 0 and (width * height > max_pixels):
            scale_factor = math.sqrt(max_pixels / (width * height))
            width = int(width * scale_factor)
            height = int(height * scale_factor)
        scale_res = 336
        patch_x = math.ceil(width / scale_res)
        patch_y = math.ceil(height / scale_res)
        num_patches = patch_x * patch_y
        # Each 336px tile contributes 576 tokens (24x24); +1 accounts for the base image
        img_tokens = (num_patches + 1) * 576
        image_tokens_count += img_tokens
        image_details.append({
            "original_size": [img['width'], img['height']],
            "resized_size": [width, height],
            "grid_patches": f"{patch_x}x{patch_y}",
            "tokens": img_tokens
        })

    total_tokens = text_tokens_count + image_tokens_count
    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": 0
    }
    media_details = {
        "images": image_details,
        "videos": []
    }
    return total_tokens, text_tokens_ids, breakdown, media_details
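# Worked example (follows from the approximation above, not from the real Llava processor):
# a 1024x1024 image with max_pixels=1280*1280 is not resized, ceil(1024 / 336) = 4 tiles
# per side -> 16 tiles, so (16 + 1) * 576 = 9792 image tokens.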
# --- UI Callback Logic ---
def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
    # Build synthetic media descriptions from the UI inputs
    images = [{'width': int(img_w), 'height': int(img_h)} for _ in range(int(img_count))]
    videos = [{'width': int(vid_w), 'height': int(vid_h), 'frames': int(vid_frames)} for _ in range(int(vid_count))]

    # Get the tokenizer
    tokenizer = get_tokenizer(model)

    # Resolve the real model ID
    model_id_map = {
        "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
        "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
    }
    real_model_id = model_id_map.get(model, model)

    text_tokens_ids = []
    breakdown = {}
    media_details = {}
    tokens = 0
    if model == "Qwen2.5-VL / Qwen2-VL":
        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer, img_max_pixels)
    elif model == "Llava-1.6 (Next)":
        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer, img_max_pixels)
    else:
        tokens = 0

    # Generate the per-token analysis file
    token_file_path = None
    if tokenizer and text_tokens_ids:
        token_data = []
        # Decode each token id back to its string form
        for tid in text_tokens_ids:
            token_str = tokenizer.decode([tid])
            token_data.append({"id": tid, "token": token_str})
        token_file_path = "token_analysis.json"
        with open(token_file_path, "w", encoding="utf-8") as f:
            json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)

    # Construct the final JSON result
    result = {
        "model_id": real_model_id,
        "tokenizer_loaded": tokenizer is not None,
        "total_tokens": tokens,
        "breakdown": breakdown,
        "text_stats": {
            "char_count": len(text)
        },
        "media_details": media_details
    }
    return result, token_file_path
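# Illustrative direct call (hypothetical inputs), handy when debugging without the UI:
#   result, path = run_calculation("Describe the image.", "Qwen2.5-VL / Qwen2-VL",
#                                  1, 1024, 1024, 1280 * 1280, 0, 16, 512, 512)
# `result` is the JSON-serialisable dict built above and `path` points to
# token_analysis.json (or is None when the tokenizer is unavailable).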
def create_ui():
    with gr.Row():
        with gr.Column(scale=1):
            model_select = gr.Dropdown(
                choices=["Qwen2.5-VL / Qwen2-VL", "Llava-1.6 (Next)"],
                value="Qwen2.5-VL / Qwen2-VL",
                label="Select Model"
            )
            text_input = gr.Textbox(lines=5, label="Input Text", placeholder="Enter a prompt...")
            with gr.Accordion("🖼️ Image Settings", open=True):
                with gr.Row():
                    img_count = gr.Number(value=1, label="Number of Images", precision=0)
                    img_w = gr.Number(value=1024, label="Width (px)")
                    img_h = gr.Number(value=1024, label="Height (px)")
                with gr.Row():
                    img_max_pixels = gr.Number(value=1280 * 1280, label="Max Pixels (upper pixel limit)", precision=0)
            with gr.Accordion("🎥 Video Settings", open=False):
                with gr.Row():
                    vid_count = gr.Number(value=0, label="Number of Videos", precision=0)
                    vid_frames = gr.Number(value=16, label="Total Frames per Video", precision=0)
                    vid_w = gr.Number(value=512, label="Width (px)")
                    vid_h = gr.Number(value=512, label="Height (px)")
            btn = gr.Button("🚀 Calculate Tokens", variant="primary")
        with gr.Column(scale=1):
            out_json = gr.JSON(label="Result")
            out_file = gr.File(label="Download Token Analysis (JSON)")
            gr.Markdown("""
### Notes
* **Real tokenizer**: the first run downloads the model config via `transformers`, which may take a few seconds.
* **Qwen2-VL**: computed as `H/14 * W/14` patches, with sizes automatically snapped to a 28px grid.
* **Llava-1.6**: computed as `(Patches + 1) * 576`, with a 336px patch size.
""")
    btn.click(
        run_calculation,
        [text_input, model_select, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
        [out_json, out_file]
    )
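
# Standalone launcher (assumption: in the full toolbox, create_ui() is invoked from a
# parent gr.Blocks/Tab context; this guard only exists so the app can be tested on its own).
if __name__ == "__main__":
    with gr.Blocks(title="VLM Token Calculator") as demo:
        create_ui()
    demo.launch()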