Spaces:

MashiroLn
/

custom_toolbox

Running

App Files Files Community

MashiroLn committed 18 days ago

Commit

f78b115

verified ·

1 Parent(s): c0f4653

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

apps/text_tools.py +55 -21

apps/text_tools.py CHANGED Viewed

@@ -55,7 +55,7 @@ def get_tokenizer(model_name):
 # --- Token Calculation Logic ---
-def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
     """
     Qwen2-VL / Qwen2.5-VL Token Calculation Formula
     """
@@ -74,8 +74,11 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
     # 2. Image Tokens
     image_details = []
-    for img in images:
-        width, height = img['width'], img['height']
         # Apply Qwen Official Smart Resize
         new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
@@ -84,11 +87,15 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
         grid_h = new_h // 14
         img_tokens = grid_h * grid_w
-        image_tokens_count += img_tokens
         image_details.append({
             "original_size": [width, height],
             "resized_size": [new_w, new_h],
-            "tokens": img_tokens
         })
     # 3. Video Tokens
@@ -128,7 +135,7 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
     return total_tokens, text_tokens_ids, breakdown, media_details
-def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
     """
     Llava-1.6 (Next) Token Calculation Formula
     """
@@ -145,8 +152,11 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
     # 2. Image Tokens
     image_details = []
-    for img in images:
-        width, height = img['width'], img['height']
         # Llava-Next Logic:
         # If max_pixels is specified, resize first
@@ -161,12 +171,16 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
         num_patches = patch_x * patch_y
         img_tokens = (num_patches + 1) * 576
-        image_tokens_count += img_tokens
         image_details.append({
-            "original_size": [img['width'], img['height']],
             "resized_size": [width, height],
             "grid_patches": f"{patch_x}x{patch_y}",
-            "tokens": img_tokens
         })
     total_tokens = text_tokens_count + image_tokens_count
@@ -186,9 +200,25 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
 # --- Actual UI Logic ---
-def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
-    # Construct virtual data
-    images = [{'width': img_w, 'height': img_h} for _ in range(int(img_count))]
     videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
     # Get Tokenizer
@@ -207,9 +237,9 @@ def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_co
     tokens = 0
     if model == "Qwen2.5-VL / Qwen2-VL":
-        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer, img_max_pixels)
     elif model == "Llava-1.6 (Next)":
-        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer, img_max_pixels)
     else:
         tokens = 0
@@ -251,10 +281,14 @@ def create_ui():
             text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
             with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
-                with gr.Row():
-                    img_count = gr.Number(value=0, label="图片数量", precision=0)
-                    img_w = gr.Number(value=1080, label="宽 (px)")
-                    img_h = gr.Number(value=1920, label="高 (px)")
                 with gr.Row():
                     img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
@@ -279,6 +313,6 @@ def create_ui():
     btn.click(
         run_calculation,
-        [text_input, model_select, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
         [out_json, out_file]
     )

 # --- Token Calculation Logic ---
+def calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, max_pixels):
     """
     Qwen2-VL / Qwen2.5-VL Token Calculation Formula
     """
     # 2. Image Tokens
     image_details = []
+    for group in image_groups:
+        width, height = group['width'], group['height']
+        count = int(group['count'])
+        if count <= 0:
+            continue
         # Apply Qwen Official Smart Resize
         new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
         grid_h = new_h // 14
         img_tokens = grid_h * grid_w
+        group_tokens = img_tokens * count
+        image_tokens_count += group_tokens
         image_details.append({
+            "count": count,
             "original_size": [width, height],
             "resized_size": [new_w, new_h],
+            "tokens_per_image": img_tokens,
+            "total_tokens": group_tokens
         })
     # 3. Video Tokens
     return total_tokens, text_tokens_ids, breakdown, media_details
+def calculate_llava_next_tokens(text, image_groups, tokenizer, max_pixels):
     """
     Llava-1.6 (Next) Token Calculation Formula
     """
     # 2. Image Tokens
     image_details = []
+    for group in image_groups:
+        width, height = group['width'], group['height']
+        count = int(group['count'])
+        if count <= 0:
+            continue
         # Llava-Next Logic:
         # If max_pixels is specified, resize first
         num_patches = patch_x * patch_y
         img_tokens = (num_patches + 1) * 576
+        group_tokens = img_tokens * count
+        image_tokens_count += group_tokens
         image_details.append({
+            "count": count,
+            "original_size": [group['width'], group['height']],
             "resized_size": [width, height],
             "grid_patches": f"{patch_x}x{patch_y}",
+            "tokens_per_image": img_tokens,
+            "total_tokens": group_tokens
         })
     total_tokens = text_tokens_count + image_tokens_count
 # --- Actual UI Logic ---
+def run_calculation(text, model, img_data, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
+    # Construct image groups from Dataframe
+    image_groups = []
+    if img_data is not None:
+        # Each row is expected to be [count, width, height].
+        # NOTE(review): this loop assumes img_data arrives as a list of lists,
+        # which requires gr.Dataframe(type="array"). Gradio's default type is
+        # "pandas", and iterating a DataFrame yields column labels — confirm.
+        for row in img_data:
+            try:
+                c, w, h = row
+                c = int(c)
+                w = int(w)
+                h = int(h)
+                if c > 0:
+                    image_groups.append({'count': c, 'width': w, 'height': h})
+            except Exception:
+                pass
     videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
     # Get Tokenizer
     tokens = 0
     if model == "Qwen2.5-VL / Qwen2-VL":
+        tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, img_max_pixels)
     elif model == "Llava-1.6 (Next)":
+        tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, image_groups, tokenizer, img_max_pixels)
     else:
         tokens = 0
             text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
             with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
+                img_data = gr.Dataframe(
+                    headers=["数量", "宽 (px)", "高 (px)"],
+                    datatype=["number", "number", "number"],
+                    value=[[0, 1080, 1920]],
+                    label="图片列表 (可添加多行)",
+                    col_count=(3, "fixed"),
+                    interactive=True
+                )
                 with gr.Row():
                     img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
     btn.click(
         run_calculation,
+        [text_input, model_select, img_data, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
         [out_json, out_file]
     )