"""Estimate prompt token usage for Qwen2-VL / Qwen2.5-VL and Llava-1.6.

Text is tokenized with the real Hugging Face tokenizer when it can be loaded;
otherwise a rough character-based estimate is used. Image and video costs are
derived from pixel dimensions only. ``create_ui`` builds a small Gradio form
that drives ``run_calculation``.
"""

import json
import math
import os


class _MissingDependency:
    """Stand-in for an optional third-party package that is not installed.

    Accessing any public attribute raises ``ImportError`` naming the package,
    so the failure surfaces where the package is actually needed rather than
    when this module is imported.
    """

    def __init__(self, package):
        self._package = package

    def __getattr__(self, attr):
        # Keep introspection (dunder / private lookups) well-behaved.
        if attr.startswith("_"):
            raise AttributeError(attr)
        raise ImportError(
            f"'{self._package}' is required for this feature but is not installed"
        )


# The pure calculators below need neither package, so both imports are
# optional; behaviour is unchanged whenever the packages are installed.
try:
    import gradio as gr
except ImportError:
    gr = _MissingDependency("gradio")

try:
    from transformers import AutoTokenizer
except ImportError:
    AutoTokenizer = _MissingDependency("transformers")

# Try to import qwen_vl_utils, otherwise use the built-in implementation copy.
try:
    from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
except ImportError:

    def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56,
                          max_pixels=1280 * 1280):
        """Return ``(h_bar, w_bar)``: the size rescaled onto the ``factor`` grid.

        Local copy of ``smart_resize`` from ``qwen_vl_utils.vision_process``.
        Both sides are rounded to a multiple of ``factor``; if the resulting
        area falls outside ``[min_pixels, max_pixels]`` the image is rescaled
        (keeping its aspect ratio) and snapped to the grid again.
        """
        if max(height, width) / min(height, width) > 200:
            factor = 1  # For extreme aspect ratios
        h_bar = round(height / factor) * factor
        w_bar = round(width / factor) * factor
        if h_bar * w_bar > max_pixels:
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = math.floor(height / beta / factor) * factor
            w_bar = math.floor(width / beta / factor) * factor
        elif h_bar * w_bar < min_pixels:
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor
        return h_bar, w_bar


# --- Model table ---
QWEN_MODEL = "Qwen2.5-VL / Qwen2-VL"
LLAVA_MODEL = "Llava-1.6 (Next)"

# UI model name -> Hugging Face repository providing the tokenizer.
MODEL_IDS = {
    QWEN_MODEL: "Qwen/Qwen2.5-VL-7B-Instruct",
    LLAVA_MODEL: "llava-hf/llava-v1.6-vicuna-7b-hf",
}

# --- Qwen2-VL vision constants ---
QWEN_RESIZE_FACTOR = 28        # smart-resize grid (patch size * spatial merge)
QWEN_MIN_PIXELS = 56 * 56
QWEN_DEFAULT_MAX_PIXELS = 1280 * 1280
QWEN_PATCH_SIZE = 14           # edge of one ViT patch in pixels
QWEN_SPATIAL_MERGE = 2         # 2x2 neighbouring patches become one LLM token
QWEN_TEMPORAL_PATCH = 2        # two video frames share one temporal patch

# --- Llava-1.6 vision constants ---
LLAVA_TILE_SIZE = 336          # edge of one tile in pixels
LLAVA_TOKENS_PER_TILE = 576

# --- Tokenizer Loading Logic ---
TOKENIZERS = {}


def get_tokenizer(model_name):
    """Return the cached tokenizer for ``model_name``, loading it on first use.

    Returns ``None`` for an unknown model name or when loading fails (the
    error is printed); callers then fall back to an estimated text count.
    Failed loads are not cached, so the next call retries.
    """
    if model_name in TOKENIZERS:
        return TOKENIZERS[model_name]

    model_id = MODEL_IDS.get(model_name)
    if model_id is None:
        return None

    try:
        # NOTE(review): trust_remote_code=True allows code from the model
        # repository to run locally; confirm it is really needed here.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    except Exception as e:  # boundary: any load failure degrades to the fallback
        print(f"Error loading tokenizer for {model_name}: {e}")
        return None

    TOKENIZERS[model_name] = tokenizer
    return tokenizer


# --- Token Calculation Logic ---
def _count_text_tokens(text, tokenizer):
    """Return ``(token_ids, count)``; without a tokenizer estimate ``len(text) // 2``."""
    if tokenizer is not None:
        token_ids = tokenizer.encode(text)
        return token_ids, len(token_ids)
    return [], len(text) // 2


def _require_positive_size(width, height, kind):
    """Raise ``ValueError`` unless both dimensions are positive."""
    if width <= 0 or height <= 0:
        raise ValueError(
            f"{kind} width and height must be positive, got {width}x{height}"
        )


def _qwen_frame_cost(width, height, max_pixels):
    """Return ``(new_w, new_h, tokens)`` for a single Qwen image or video frame."""
    new_h, new_w = qwen_smart_resize(
        height,
        width,
        factor=QWEN_RESIZE_FACTOR,
        min_pixels=QWEN_MIN_PIXELS,
        max_pixels=max_pixels,
    )
    patches = (new_h // QWEN_PATCH_SIZE) * (new_w // QWEN_PATCH_SIZE)
    # Each 2x2 group of patches is merged into one token before the LLM.
    return new_w, new_h, patches // (QWEN_SPATIAL_MERGE ** 2)


def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
    """Compute the token cost of a Qwen2-VL / Qwen2.5-VL prompt.

    Args:
        text: Prompt text.
        images: Dicts with ``'width'`` and ``'height'`` in pixels.
        videos: Dicts with ``'width'``, ``'height'`` and ``'frames'``.
        tokenizer: Object exposing ``encode``; ``None`` selects the rough
            ``len(text) // 2`` estimate.
        max_pixels: Upper pixel bound for smart-resize. A missing or
            non-positive value falls back to ``QWEN_DEFAULT_MAX_PIXELS``.

    Returns:
        ``(total_tokens, text_token_ids, breakdown, media_details)``.

    Raises:
        ValueError: If an image or video has a non-positive dimension.
    """
    if not max_pixels or max_pixels <= 0:
        max_pixels = QWEN_DEFAULT_MAX_PIXELS

    # 1. Text tokens (real tokenizer when available)
    text_tokens_ids, text_tokens_count = _count_text_tokens(text, tokenizer)

    # 2. Image tokens
    image_tokens_count = 0
    image_details = []
    for img in images:
        width, height = img['width'], img['height']
        _require_positive_size(width, height, "Image")
        new_w, new_h, img_tokens = _qwen_frame_cost(width, height, max_pixels)
        image_tokens_count += img_tokens
        image_details.append({
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "tokens": img_tokens,
        })

    # 3. Video tokens: frames are resized like images, then consumed in
    #    temporal patches of two frames each.
    # NOTE(review): the same max_pixels is applied to video frames as to
    # images; confirm against the limits the real video pipeline uses.
    video_tokens_count = 0
    video_details = []
    for vid in videos:
        frames = vid['frames']
        width, height = vid['width'], vid['height']
        _require_positive_size(width, height, "Video")
        new_w, new_h, frame_tokens = _qwen_frame_cost(width, height, max_pixels)
        temporal_steps = max(0, math.ceil(frames / QWEN_TEMPORAL_PATCH))
        vid_total = temporal_steps * frame_tokens
        video_tokens_count += vid_total
        video_details.append({
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "frames": frames,
            "tokens": vid_total,
        })

    total_tokens = text_tokens_count + image_tokens_count + video_tokens_count
    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": video_tokens_count,
    }
    media_details = {
        "images": image_details,
        "videos": video_details,
    }
    return total_tokens, text_tokens_ids, breakdown, media_details


def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
    """Estimate the token cost of a Llava-1.6 (Next) prompt.

    Each image is optionally downscaled so its area does not exceed
    ``max_pixels`` (only when ``max_pixels`` is positive), split into 336 px
    tiles, and charged ``(tiles + 1) * 576`` tokens.

    NOTE(review): this is a simplified tiling estimate; the real processor
    may pick from a fixed set of grid resolutions, so actual counts can
    differ. Confirm before relying on exact numbers.

    Args:
        text: Prompt text.
        images: Dicts with ``'width'`` and ``'height'`` in pixels.
        tokenizer: Object exposing ``encode``; ``None`` selects the rough
            ``len(text) // 2`` estimate.
        max_pixels: Optional area limit; ``None`` or ``<= 0`` disables it.

    Returns:
        ``(total_tokens, text_token_ids, breakdown, media_details)``; the
        video entries are always zero / empty.

    Raises:
        ValueError: If an image has a non-positive dimension.
    """
    # 1. Text tokens
    text_tokens_ids, text_tokens_count = _count_text_tokens(text, tokenizer)

    # 2. Image tokens
    image_tokens_count = 0
    image_details = []
    for img in images:
        width, height = img['width'], img['height']
        _require_positive_size(width, height, "Image")

        # If max_pixels is specified, shrink to fit the area budget first.
        if max_pixels and max_pixels > 0 and width * height > max_pixels:
            scale_factor = math.sqrt(max_pixels / (width * height))
            width = int(width * scale_factor)
            height = int(height * scale_factor)

        patch_x = math.ceil(width / LLAVA_TILE_SIZE)
        patch_y = math.ceil(height / LLAVA_TILE_SIZE)
        # "+ 1" as in the displayed formula "(Patches + 1) * 576".
        img_tokens = (patch_x * patch_y + 1) * LLAVA_TOKENS_PER_TILE
        image_tokens_count += img_tokens
        image_details.append({
            "original_size": [img['width'], img['height']],
            "resized_size": [width, height],
            "grid_patches": f"{patch_x}x{patch_y}",
            "tokens": img_tokens,
        })

    total_tokens = text_tokens_count + image_tokens_count
    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": 0,
    }
    media_details = {
        "images": image_details,
        "videos": [],
    }
    return total_tokens, text_tokens_ids, breakdown, media_details


# --- Actual UI Logic ---
def _to_int(value, default=0):
    """Coerce a ``gr.Number`` value (possibly ``None`` or float) to ``int``."""
    return default if value is None else int(value)


def _write_token_analysis(text, token_ids, tokenizer):
    """Dump a per-token id/string listing to JSON and return the file path."""
    token_data = [
        {"id": tid, "token": tokenizer.decode([tid])} for tid in token_ids
    ]
    # NOTE: fixed path in the working directory; concurrent requests
    # overwrite each other's file.
    token_file_path = "token_analysis.json"
    with open(token_file_path, "w", encoding="utf-8") as f:
        json.dump({"text": text, "tokens": token_data}, f,
                  ensure_ascii=False, indent=2)
    return token_file_path


def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels,
                    vid_count, vid_frames, vid_w, vid_h):
    """Gradio callback: compute token usage for the form's current values.

    Numeric inputs may arrive as ``None`` (cleared field) or floats; they are
    coerced to integers, with ``None`` treated as 0.

    Returns:
        ``(result, token_file_path)`` where ``result`` is a JSON-serialisable
        summary and ``token_file_path`` is the per-token analysis file, or
        ``None`` when no tokenizer was loaded or the text produced no ids.

    Raises:
        ValueError: If media is requested with a non-positive dimension.
    """
    text = text or ""
    max_pixels = _to_int(img_max_pixels)

    # Construct virtual media descriptions from the form values.
    image_size = {'width': _to_int(img_w), 'height': _to_int(img_h)}
    images = [dict(image_size) for _ in range(_to_int(img_count))]
    video_spec = {
        'width': _to_int(vid_w),
        'height': _to_int(vid_h),
        'frames': _to_int(vid_frames),
    }
    videos = [dict(video_spec) for _ in range(_to_int(vid_count))]

    tokenizer = get_tokenizer(model)
    # Unknown model names are echoed back unchanged.
    real_model_id = MODEL_IDS.get(model, model)

    tokens = 0
    text_tokens_ids = []
    breakdown = {}
    media_details = {}
    if model == QWEN_MODEL:
        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(
            text, images, videos, tokenizer, max_pixels)
    elif model == LLAVA_MODEL:
        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(
            text, images, tokenizer, max_pixels)

    # Generate the token analysis file only when real token ids exist.
    token_file_path = None
    if tokenizer is not None and text_tokens_ids:
        token_file_path = _write_token_analysis(text, text_tokens_ids, tokenizer)

    result = {
        "model_id": real_model_id,
        "tokenizer_loaded": tokenizer is not None,
        "total_tokens": tokens,
        "breakdown": breakdown,
        "text_stats": {
            "char_count": len(text)
        },
        "media_details": media_details,
    }
    return result, token_file_path


def create_ui():
    """Build the calculator form and wire the button to ``run_calculation``.

    Presumably called by the surrounding app inside an active Gradio Blocks
    context (the caller is not part of this section).
    """
    with gr.Row():
        with gr.Column(scale=1):
            model_select = gr.Dropdown(
                choices=[QWEN_MODEL, LLAVA_MODEL],
                value=QWEN_MODEL,
                label="选择模型"
            )
            text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")

            with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
                with gr.Row():
                    img_count = gr.Number(value=1, label="图片数量", precision=0)
                    img_w = gr.Number(value=1024, label="宽 (px)")
                    img_h = gr.Number(value=1024, label="高 (px)")
                with gr.Row():
                    img_max_pixels = gr.Number(value=1280 * 1280, label="Max Pixels (最大像素限制)", precision=0)

            with gr.Accordion("🎥 视频设置 (Videos)", open=False):
                with gr.Row():
                    vid_count = gr.Number(value=0, label="视频数量", precision=0)
                    vid_frames = gr.Number(value=16, label="总帧数/视频", precision=0)
                    vid_w = gr.Number(value=512, label="宽 (px)")
                    vid_h = gr.Number(value=512, label="高 (px)")

            btn = gr.Button("🚀 计算 Token", variant="primary")

        with gr.Column(scale=1):
            out_json = gr.JSON(label="计算结果")
            out_file = gr.File(label="下载 Token 分析 (JSON)")

    gr.Markdown("""
    ### 说明
    * **真实 Tokenizer**: 首次运行时会自动下载 `transformers` 模型配置,可能需要几秒钟。
    * **Qwen2-VL**: 基于 `H/28 * W/28` 计算 (14px Patch, 2x2 合并),自动对齐到 28px 网格;视频每 2 帧计为 1 组。
    * **Llava-1.6**: 基于 `(Patches + 1) * 576` 计算,Patch 大小为 336px。
    """)

    btn.click(
        run_calculation,
        [text_input, model_select, img_count, img_w, img_h, img_max_pixels,
         vid_count, vid_frames, vid_w, vid_h],
        [out_json, out_file]
    )