# HuggingFace Spaces page artifact (app status header) — not part of the program.
import json
import math
import os
import tempfile

import gradio as gr
from transformers import AutoTokenizer
# Try to import qwen_vl_utils, otherwise use the built-in official implementation copy
try:
    from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
except ImportError:
    # Copy of qwen_vl_utils.vision_process.smart_resize.
    def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=1280 * 1280):
        """Rescale (height, width) the way Qwen-VL preprocessing does.

        The result satisfies:
          * both dimensions are multiples of ``factor`` (and at least ``factor``),
          * the total pixel area lies within [min_pixels, max_pixels],
          * the aspect ratio is preserved as closely as possible.

        Returns (h_bar, w_bar) — the resized height and width.
        """
        if max(height, width) / min(height, width) > 200:
            # The official code raises ValueError for extreme aspect ratios;
            # this calculator degrades gracefully by dropping grid alignment.
            factor = 1
        # Round to the nearest multiple of `factor`, never below one full factor
        # (the max(...) clamp matches the official implementation and prevents
        # a 0-sized dimension for very small inputs).
        h_bar = max(factor, round(height / factor) * factor)
        w_bar = max(factor, round(width / factor) * factor)
        if h_bar * w_bar > max_pixels:
            # Shrink proportionally so the aligned area fits under max_pixels.
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = max(factor, math.floor(height / beta / factor) * factor)
            w_bar = max(factor, math.floor(width / beta / factor) * factor)
        elif h_bar * w_bar < min_pixels:
            # Grow proportionally so the aligned area reaches min_pixels.
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor
        return h_bar, w_bar
# --- Tokenizer Loading Logic ---
TOKENIZERS = {}

# Hub repository providing the real tokenizer for each supported model family.
_TOKENIZER_REPOS = {
    # Qwen2-VL uses the Qwen2 tokenizer
    "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
    # Llava-1.6 is based on Vicuna/Llama-2
    "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf",
}


def get_tokenizer(model_name):
    """Return a cached HF tokenizer for ``model_name``.

    Returns None when the model is unsupported or the tokenizer fails to
    download/load; callers fall back to a heuristic character count.
    """
    cached = TOKENIZERS.get(model_name)
    if cached is not None:
        return cached
    repo = _TOKENIZER_REPOS.get(model_name)
    if repo is None:
        return None
    try:
        tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    except Exception as e:
        # Best-effort: log and degrade instead of breaking the UI callback.
        print(f"Error loading tokenizer for {model_name}: {e}")
        return None
    TOKENIZERS[model_name] = tokenizer
    return tokenizer
| # --- Token Calculation Logic --- | |
def calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, max_pixels):
    """
    Qwen2-VL / Qwen2.5-VL Token Calculation Formula.

    Text is counted with the real tokenizer when available (otherwise a rough
    ``len(text) // 2`` fallback). Each image/video frame is smart-resized to a
    28px-aligned grid and costs ``(H/14) * (W/14)`` patch tokens.

    NOTE(review): this reports ViT patch counts; the Qwen2-VL LLM additionally
    merges 2x2 adjacent patches (a further /4) before the language model —
    confirm which count this tool is meant to display.

    Returns (total_tokens, text_token_ids, breakdown_dict, media_details_dict).
    """
    # 1. Text tokens (real tokenizer when available, heuristic otherwise).
    text_tokens_ids = tokenizer.encode(text) if tokenizer else []
    text_tokens_count = len(text_tokens_ids) if tokenizer else len(text) // 2

    # 2. Image tokens.
    image_tokens_count = 0
    image_details = []
    for group in image_groups:
        count = int(group['count'])
        if count <= 0:
            continue
        width, height = group['width'], group['height']
        # Apply Qwen's official smart-resize before gridding into 14px patches.
        new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=max_pixels)
        per_image = (new_h // 14) * (new_w // 14)
        subtotal = per_image * count
        image_tokens_count += subtotal
        image_details.append({
            "count": count,
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "tokens_per_image": per_image,
            "total_tokens": subtotal,
        })

    # 3. Video tokens — each frame is priced like a single image.
    video_tokens_count = 0
    video_details = []
    for vid in videos:
        frames = vid['frames']
        width, height = vid['width'], vid['height']
        new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=max_pixels)
        per_frame = (new_h // 14) * (new_w // 14)
        subtotal = frames * per_frame
        video_tokens_count += subtotal
        video_details.append({
            "original_size": [width, height],
            "resized_size": [new_w, new_h],
            "frames": frames,
            "tokens": subtotal,
        })

    total_tokens = text_tokens_count + image_tokens_count + video_tokens_count
    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": video_tokens_count,
    }
    media_details = {"images": image_details, "videos": video_details}
    return total_tokens, text_tokens_ids, breakdown, media_details
def calculate_llava_next_tokens(text, image_groups, tokenizer, max_pixels):
    """
    Llava-1.6 (Next) Token Calculation Formula.

    Text is counted with the real tokenizer when available (otherwise a rough
    ``len(text) // 2`` fallback). Each image is proportionally downscaled to
    fit ``max_pixels`` (when positive), tiled into 336px patches, and costs
    ``(patches + 1) * 576`` tokens — the ``+1`` being the base low-res view.

    NOTE(review): real LLaVA-Next selects a resolution from grid_pinpoints and
    unpads features; this ceil-based tiling is an approximation.

    Returns (total_tokens, text_token_ids, breakdown_dict, media_details_dict).
    """
    # 1. Text tokens.
    text_tokens_ids = tokenizer.encode(text) if tokenizer else []
    text_tokens_count = len(text_tokens_ids) if tokenizer else len(text) // 2

    # 2. Image tokens.
    image_tokens_count = 0
    image_details = []
    for group in image_groups:
        count = int(group['count'])
        if count <= 0:
            continue
        width, height = group['width'], group['height']
        # Downscale proportionally only when a positive pixel budget is exceeded.
        if max_pixels > 0 and width * height > max_pixels:
            scale = math.sqrt(max_pixels / (width * height))
            width, height = int(width * scale), int(height * scale)
        # Tile into 336px patches (partial tiles still count as a full patch).
        patch_x = math.ceil(width / 336)
        patch_y = math.ceil(height / 336)
        per_image = (patch_x * patch_y + 1) * 576
        subtotal = per_image * count
        image_tokens_count += subtotal
        image_details.append({
            "count": count,
            "original_size": [group['width'], group['height']],
            "resized_size": [width, height],
            "grid_patches": f"{patch_x}x{patch_y}",
            "tokens_per_image": per_image,
            "total_tokens": subtotal,
        })

    total_tokens = text_tokens_count + image_tokens_count
    breakdown = {
        "text_tokens": text_tokens_count,
        "image_tokens": image_tokens_count,
        "video_tokens": 0,
    }
    media_details = {"images": image_details, "videos": []}
    return total_tokens, text_tokens_ids, breakdown, media_details
| # --- Actual UI Logic --- | |
def run_calculation(text, model, img_max_pixels, vid_count, vid_frames, vid_w, vid_h, *args):
    """
    Gradio callback: compute the token-usage report for the selected model.

    ``*args`` is a flat sequence of image-group triplets coming from the
    dynamically revealed UI rows: c1, w1, h1, c2, w2, h2, ...

    Returns (result_dict, token_analysis_file_path_or_None).
    """
    # Parse the variable number of image-group arguments (triplets: count, w, h).
    image_groups = []
    for i in range(0, len(args) - 2, 3):
        try:
            c, w, h = int(args[i]), int(args[i + 1]), int(args[i + 2])
        except (TypeError, ValueError):
            # Ignore cleared/invalid number fields instead of failing the request.
            continue
        if c > 0:
            image_groups.append({'count': c, 'width': w, 'height': h})

    # gr.Number reports None when a field is cleared; treat that as 0.
    n_videos = int(vid_count or 0)
    n_frames = int(vid_frames or 0)
    videos = [{'width': vid_w, 'height': vid_h, 'frames': n_frames} for _ in range(n_videos)]

    # Get Tokenizer (None => heuristic text counting in the calculators).
    tokenizer = get_tokenizer(model)

    # Determine the real model ID for display.
    model_id_map = {
        "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
        "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
    }
    real_model_id = model_id_map.get(model, model)

    text_tokens_ids = []
    breakdown = {}
    media_details = {}
    tokens = 0
    if model == "Qwen2.5-VL / Qwen2-VL":
        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, img_max_pixels)
    elif model == "Llava-1.6 (Next)":
        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, image_groups, tokenizer, img_max_pixels)

    # Generate the downloadable per-token analysis file.
    token_file_path = None
    if tokenizer and text_tokens_ids:
        token_data = [{"id": tid, "token": tokenizer.decode([tid])} for tid in text_tokens_ids]
        # Write to a unique temp file: the previous fixed cwd filename
        # ("token_analysis.json") was clobbered by concurrent sessions
        # sharing the same server process.
        fd, token_file_path = tempfile.mkstemp(prefix="token_analysis_", suffix=".json")
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)

    # Construct the final JSON result shown in the UI.
    result = {
        "model_id": real_model_id,
        "tokenizer_loaded": tokenizer is not None,
        "total_tokens": tokens,
        "breakdown": breakdown,
        "text_stats": {
            "char_count": len(text)
        },
        "media_details": media_details
    }
    return result, token_file_path
def create_ui():
    """Build the token-stats UI.

    NOTE(review): creates Gradio components directly at call time, so this
    must be invoked inside an enclosing ``gr.Blocks`` context elsewhere in
    the app; it neither creates nor launches a Blocks instance itself.
    """
    gr.Markdown("""
    ## 📝 Token Stats (文本/多模态 Token 统计)
    Estimate token usage for text, images, and videos using various model tokenizers (e.g., Qwen2.5-VL, LLaVa).
    估算文本、图片和视频在不同模型(如 Qwen2.5-VL, LLaVa)下的 Token 用量。
    """)
    with gr.Row():
        # Left column: all inputs (model, text, image groups, videos).
        with gr.Column(scale=1):
            model_select = gr.Dropdown(
                choices=["Qwen2.5-VL / Qwen2-VL", "Llava-1.6 (Next)"],
                value="Qwen2.5-VL / Qwen2-VL",
                label="选择模型"
            )
            text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
            with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
                # Group 1 (Always visible)
                with gr.Row():
                    img_c_1 = gr.Number(value=1, label="图片数量 (Group 1)", precision=0)
                    img_w_1 = gr.Number(value=1080, label="宽 (px)")
                    img_h_1 = gr.Number(value=1920, label="高 (px)")
                # Group 2 (Hidden by default)
                with gr.Row(visible=False) as group_2:
                    img_c_2 = gr.Number(value=0, label="图片数量 (Group 2)", precision=0)
                    img_w_2 = gr.Number(value=1024, label="宽 (px)")
                    img_h_2 = gr.Number(value=1024, label="高 (px)")
                # Group 3 (Hidden by default)
                with gr.Row(visible=False) as group_3:
                    img_c_3 = gr.Number(value=0, label="图片数量 (Group 3)", precision=0)
                    img_w_3 = gr.Number(value=1024, label="宽 (px)")
                    img_h_3 = gr.Number(value=1024, label="高 (px)")
                # Group 4 (Hidden by default)
                with gr.Row(visible=False) as group_4:
                    img_c_4 = gr.Number(value=0, label="图片数量 (Group 4)", precision=0)
                    img_w_4 = gr.Number(value=1024, label="宽 (px)")
                    img_h_4 = gr.Number(value=1024, label="高 (px)")
                add_group_btn = gr.Button("➕ 增加一组图片 (Add Group)", size="sm")
                # State to track how many group rows are currently visible (1..4).
                visible_groups = gr.State(1)

                def add_group(curr_count):
                    # Reveal the next hidden group row, capped at 4 groups total.
                    next_count = min(curr_count + 1, 4)

                    # Helper to create the visibility update for one group row.
                    def get_update(group_idx):
                        if next_count == group_idx:
                            # Just revealed, set count to 1
                            return gr.update(visible=True, value=1)
                        elif next_count > group_idx:
                            # Already visible, keep as is (don't reset value)
                            return gr.update(visible=True)
                        else:
                            # Still hidden
                            return gr.update(visible=False)
                    # Outputs map 1:1 to [visible_groups, group_2, group_3, group_4].
                    return (
                        next_count,
                        get_update(2),
                        get_update(3),
                        get_update(4)
                    )
                add_group_btn.click(
                    add_group,
                    [visible_groups],
                    [visible_groups, group_2, group_3, group_4]
                )
            with gr.Row():
                img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
            with gr.Accordion("🎥 视频设置 (Videos)", open=False):
                with gr.Row():
                    vid_count = gr.Number(value=0, label="视频数量", precision=0)
                    vid_frames = gr.Number(value=16, label="总帧数/视频", precision=0)
                    vid_w = gr.Number(value=512, label="宽 (px)")
                    vid_h = gr.Number(value=512, label="高 (px)")
            btn = gr.Button("🚀 计算 Token", variant="primary")
        # Right column: JSON report, download link, and usage notes.
        with gr.Column(scale=1):
            out_json = gr.JSON(label="计算结果")
            out_file = gr.File(label="下载 Token 分析 (JSON)")
            gr.Markdown("""
            ### 说明
            * **真实 Tokenizer**: 首次运行时会自动下载 `transformers` 模型配置,可能需要几秒钟。
            * **Qwen2-VL**: 基于 `H/14 * W/14` 计算,自动对齐到 28px 网格。
            * **Llava-1.6**: 基于 `(Patches + 1) * 576` 计算,Patch 大小为 336px。
            """)
    # Wire the compute button: image-group inputs must stay in (count, w, h)
    # triplet order because run_calculation parses *args positionally.
    btn.click(
        run_calculation,
        [
            text_input, model_select, img_max_pixels, vid_count, vid_frames, vid_w, vid_h,
            img_c_1, img_w_1, img_h_1,
            img_c_2, img_w_2, img_h_2,
            img_c_3, img_w_3, img_h_3,
            img_c_4, img_w_4, img_h_4
        ],
        [out_json, out_file]
    )