Spaces:

MashiroLn
/

custom_toolbox

Running

App Files Files Community

MashiroLn committed 18 days ago

Commit

6ee5c32

verified ·

1 Parent(s): 7442a76

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

apps/text_tools.py +71 -36
requirements.txt +2 -1

apps/text_tools.py CHANGED Viewed

@@ -4,11 +4,33 @@ import json
 import os
 from transformers import AutoTokenizer
-# --- Tokenizer 加载逻辑 ---
-# 为了避免每次请求都重新加载，我们可以尝试缓存 tokenizer
-# 但在 HF Spaces 中，内存有限，且模型可能很大。
-# 对于 Qwen2.5-VL，我们可以使用 Qwen/Qwen2.5-VL-7B-Instruct 的 tokenizer
-# 对于 Llava，通常使用 Llama-2 或 Vicuna 的 tokenizer
 TOKENIZERS = {}
 def get_tokenizer(model_name):
@@ -17,12 +39,10 @@ def get_tokenizer(model_name):
     try:
         if model_name == "Qwen2.5-VL / Qwen2-VL":
-            # Qwen2-VL 使用 Qwen2 的 tokenizer
-            # 注意：这里需要联网下载 tokenizer.json，HF Spaces 通常允许
             tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
         elif model_name == "Llava-1.6 (Next)":
-            # Llava-1.6 基于 Vicuna/Llama-2，这里用 Llama-2 tokenizer 近似，或者直接用 llava-hf
-            # 为了通用性，我们使用 llava-hf/llava-v1.6-vicuna-7b-hf
             tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf", trust_remote_code=True)
         else:
             return None
@@ -33,17 +53,17 @@ def get_tokenizer(model_name):
         print(f"Error loading tokenizer for {model_name}: {e}")
         return None
-# --- Token 计算逻辑 ---
-def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
     """
-    Qwen2-VL / Qwen2.5-VL Token 计算公式
     """
     text_tokens_count = 0
     image_tokens_count = 0
     video_tokens_count = 0
-    # 1. 文本 Token (真实计算)
     text_tokens_ids = []
     if tokenizer:
         text_tokens_ids = tokenizer.encode(text)
@@ -52,12 +72,14 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
         # Fallback
         text_tokens_count = len(text) // 2
-    # 2. 图片 Token
     image_details = []
     for img in images:
         width, height = img['width'], img['height']
-        new_w = int(round(width / 28.0) * 28)
-        new_h = int(round(height / 28.0) * 28)
         grid_w = new_w // 14
         grid_h = new_h // 14
         img_tokens = grid_h * grid_w
@@ -69,13 +91,15 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
             "tokens": img_tokens
         })
-    # 3. 视频 Token
     video_details = []
     for vid in videos:
         frames = vid['frames']
         width, height = vid['width'], vid['height']
-        new_w = int(round(width / 28.0) * 28)
-        new_h = int(round(height / 28.0) * 28)
         grid_w = new_w // 14
         grid_h = new_h // 14
         frame_tokens = grid_h * grid_w
@@ -104,14 +128,14 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
     return total_tokens, text_tokens_ids, breakdown, media_details
-def calculate_llava_next_tokens(text, images, tokenizer):
     """
-    Llava-1.6 (Next) Token 计算公式
     """
     text_tokens_count = 0
     image_tokens_count = 0
-    # 1. 文本 Token
     text_tokens_ids = []
     if tokenizer:
         text_tokens_ids = tokenizer.encode(text)
@@ -119,10 +143,18 @@ def calculate_llava_next_tokens(text, images, tokenizer):
     else:
         text_tokens_count = len(text) // 2
-    # 2. 图片 Token
     image_details = []
     for img in images:
         width, height = img['width'], img['height']
         scale_res = 336
         patch_x = math.ceil(width / scale_res)
         patch_y = math.ceil(height / scale_res)
@@ -131,8 +163,9 @@ def calculate_llava_next_tokens(text, images, tokenizer):
         image_tokens_count += img_tokens
         image_details.append({
-            "original_size": [width, height],
-            "resized_size": ["Dynamic Grid", f"{patch_x}x{patch_y} patches"],
             "tokens": img_tokens
         })
@@ -151,17 +184,17 @@ def calculate_llava_next_tokens(text, images, tokenizer):
     return total_tokens, text_tokens_ids, breakdown, media_details
-# --- 实际 UI 逻辑 ---
-def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames, vid_w, vid_h):
-    # 构造虚拟数据
     images = [{'width': img_w, 'height': img_h} for _ in range(int(img_count))]
     videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
-    # 获取 Tokenizer
     tokenizer = get_tokenizer(model)
-    # 确定真实模型 ID
     model_id_map = {
         "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
         "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
@@ -174,17 +207,17 @@ def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames,
     tokens = 0
     if model == "Qwen2.5-VL / Qwen2-VL":
-        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer)
     elif model == "Llava-1.6 (Next)":
-        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer)
     else:
         tokens = 0
-    # 生成 Token 对应文件
     token_file_path = None
     if tokenizer and text_tokens_ids:
         token_data = []
-        # 解码每个 token id 对应的 string
         for tid in text_tokens_ids:
             token_str = tokenizer.decode([tid])
             token_data.append({"id": tid, "token": token_str})
@@ -193,7 +226,7 @@ def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames,
         with open(token_file_path, "w", encoding="utf-8") as f:
             json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)
-    # 构造最终返回的 JSON
     result = {
         "model_id": real_model_id,
         "tokenizer_loaded": tokenizer is not None,
@@ -222,6 +255,8 @@ def create_ui():
                     img_count = gr.Number(value=1, label="图片数量", precision=0)
                     img_w = gr.Number(value=1024, label="宽 (px)")
                     img_h = gr.Number(value=1024, label="高 (px)")
             with gr.Accordion("🎥 视频设置 (Videos)", open=False):
                 with gr.Row():
@@ -244,6 +279,6 @@ def create_ui():
     btn.click(
         run_calculation,
-        [text_input, model_select, img_count, img_w, img_h, vid_count, vid_frames, vid_w, vid_h],
         [out_json, out_file]
     )

 import os
 from transformers import AutoTokenizer
# Prefer the smart_resize helper shipped with qwen_vl_utils; when that package
# is not installed, define a local stand-in with the same call signature.
try:
    from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
except ImportError:
    def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=1280 * 1280):
        """Compute the (height, width) an image is resized to before patching.

        Local fallback modelled on ``qwen_vl_utils.vision_process.smart_resize``.
        Both sides are snapped to a multiple of ``factor``; if the resulting
        area falls outside ``[min_pixels, max_pixels]`` the original size is
        rescaled by a common ratio and snapped again (floor when shrinking,
        ceil when enlarging).

        Args:
            height: Original image height in pixels (must be non-zero).
            width: Original image width in pixels (must be non-zero).
            factor: Granularity both output sides are snapped to.
            min_pixels: Lower bound on the output area.
            max_pixels: Upper bound on the output area (must be non-zero when
                the snapped area exceeds it, since it is used as a divisor).

        Returns:
            Tuple ``(h_bar, w_bar)`` with the resized height and width. Each
            side is at least ``factor``.

        Raises:
            ZeroDivisionError: If ``height`` or ``width`` is 0, or if
                ``max_pixels`` is 0 while the snapped area is positive.
        """
        if max(height, width) / min(height, width) > 200:
            # Local best-effort behaviour: for extreme aspect ratios fall back
            # to 1-pixel granularity instead of failing.
            # NOTE(review): the upstream helper is believed to reject such
            # ratios with an error rather than resize -- confirm before
            # relying on these numbers.
            factor = 1

        def _snap(value, rounder):
            # Snap to a multiple of `factor`, never below one `factor`: a side
            # of 0 would yield an empty patch grid (0 tokens) for callers that
            # compute `side // 14`.
            return max(factor, rounder(value / factor) * factor)

        h_bar = _snap(height, round)
        w_bar = _snap(width, round)
        if h_bar * w_bar > max_pixels:
            # Too large: shrink both sides by the same ratio, rounding down.
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = _snap(height / beta, math.floor)
            w_bar = _snap(width / beta, math.floor)
        elif h_bar * w_bar < min_pixels:
            # Too small: enlarge both sides by the same ratio, rounding up.
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = _snap(height * beta, math.ceil)
            w_bar = _snap(width * beta, math.ceil)
        return h_bar, w_bar
+# --- Tokenizer Loading Logic ---
 TOKENIZERS = {}
 def get_tokenizer(model_name):
     try:
         if model_name == "Qwen2.5-VL / Qwen2-VL":
+            # Qwen2-VL uses Qwen2 tokenizer
             tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
         elif model_name == "Llava-1.6 (Next)":
+            # Llava-1.6 based on Vicuna/Llama-2
             tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf", trust_remote_code=True)
         else:
             return None
         print(f"Error loading tokenizer for {model_name}: {e}")
         return None
+# --- Token Calculation Logic ---
+def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
     """
+    Qwen2-VL / Qwen2.5-VL Token Calculation Formula
     """
     text_tokens_count = 0
     image_tokens_count = 0
     video_tokens_count = 0
+    # 1. Text Tokens (Real Calculation)
     text_tokens_ids = []
     if tokenizer:
         text_tokens_ids = tokenizer.encode(text)
         # Fallback
         text_tokens_count = len(text) // 2
+    # 2. Image Tokens
     image_details = []
     for img in images:
         width, height = img['width'], img['height']
+        # Apply Qwen Official Smart Resize
+        new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
         grid_w = new_w // 14
         grid_h = new_h // 14
         img_tokens = grid_h * grid_w
             "tokens": img_tokens
         })
+    # 3. Video Tokens
     video_details = []
     for vid in videos:
         frames = vid['frames']
         width, height = vid['width'], vid['height']
+        # Video processing logic is similar to images
+        new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
         grid_w = new_w // 14
         grid_h = new_h // 14
         frame_tokens = grid_h * grid_w
     return total_tokens, text_tokens_ids, breakdown, media_details
+def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
     """
+    Llava-1.6 (Next) Token Calculation Formula
     """
     text_tokens_count = 0
     image_tokens_count = 0
+    # 1. Text Tokens
     text_tokens_ids = []
     if tokenizer:
         text_tokens_ids = tokenizer.encode(text)
     else:
         text_tokens_count = len(text) // 2
+    # 2. Image Tokens
     image_details = []
     for img in images:
         width, height = img['width'], img['height']
+        # Llava-Next Logic:
+        # If max_pixels is specified, resize first
+        if max_pixels > 0 and (width * height > max_pixels):
+            scale_factor = math.sqrt(max_pixels / (width * height))
+            width = int(width * scale_factor)
+            height = int(height * scale_factor)
         scale_res = 336
         patch_x = math.ceil(width / scale_res)
         patch_y = math.ceil(height / scale_res)
         image_tokens_count += img_tokens
         image_details.append({
+            "original_size": [img['width'], img['height']],
+            "resized_size": [width, height],
+            "grid_patches": f"{patch_x}x{patch_y}",
             "tokens": img_tokens
         })
     return total_tokens, text_tokens_ids, breakdown, media_details
+# --- Actual UI Logic ---
+def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
+    # Construct virtual data
     images = [{'width': img_w, 'height': img_h} for _ in range(int(img_count))]
     videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
+    # Get Tokenizer
     tokenizer = get_tokenizer(model)
+    # Determine real model ID
     model_id_map = {
         "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
         "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
     tokens = 0
     if model == "Qwen2.5-VL / Qwen2-VL":
+        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer, img_max_pixels)
     elif model == "Llava-1.6 (Next)":
+        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer, img_max_pixels)
     else:
         tokens = 0
+    # Generate Token Analysis File
     token_file_path = None
     if tokenizer and text_tokens_ids:
         token_data = []
+        # Decode each token id
         for tid in text_tokens_ids:
             token_str = tokenizer.decode([tid])
             token_data.append({"id": tid, "token": token_str})
         with open(token_file_path, "w", encoding="utf-8") as f:
             json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)
+    # Construct final JSON result
     result = {
         "model_id": real_model_id,
         "tokenizer_loaded": tokenizer is not None,
                     img_count = gr.Number(value=1, label="图片数量", precision=0)
                     img_w = gr.Number(value=1024, label="宽 (px)")
                     img_h = gr.Number(value=1024, label="高 (px)")
+                with gr.Row():
+                    img_max_pixels = gr.Number(value=1280*1280, label="Max Pixels (最大像素限制)", precision=0)
             with gr.Accordion("🎥 视频设置 (Videos)", open=False):
                 with gr.Row():
     btn.click(
         run_calculation,
+        [text_input, model_select, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
         [out_json, out_file]
     )

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ Pillow
 img2pdf
 huggingface_hub
 transformers
-tiktoken

 img2pdf
 huggingface_hub
 transformers
+tiktoken
+qwen-vl-utils