MashiroLn committed on
Commit
f78b115
·
verified ·
1 Parent(s): c0f4653

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. apps/text_tools.py +55 -21
apps/text_tools.py CHANGED
@@ -55,7 +55,7 @@ def get_tokenizer(model_name):
55
 
56
  # --- Token Calculation Logic ---
57
 
58
- def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
59
  """
60
  Qwen2-VL / Qwen2.5-VL Token Calculation Formula
61
  """
@@ -74,8 +74,11 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
74
 
75
  # 2. Image Tokens
76
  image_details = []
77
- for img in images:
78
- width, height = img['width'], img['height']
 
 
 
79
 
80
  # Apply Qwen Official Smart Resize
81
  new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
@@ -84,11 +87,15 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
84
  grid_h = new_h // 14
85
  img_tokens = grid_h * grid_w
86
 
87
- image_tokens_count += img_tokens
 
 
88
  image_details.append({
 
89
  "original_size": [width, height],
90
  "resized_size": [new_w, new_h],
91
- "tokens": img_tokens
 
92
  })
93
 
94
  # 3. Video Tokens
@@ -128,7 +135,7 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
128
 
129
  return total_tokens, text_tokens_ids, breakdown, media_details
130
 
131
- def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
132
  """
133
  Llava-1.6 (Next) Token Calculation Formula
134
  """
@@ -145,8 +152,11 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
145
 
146
  # 2. Image Tokens
147
  image_details = []
148
- for img in images:
149
- width, height = img['width'], img['height']
 
 
 
150
 
151
  # Llava-Next Logic:
152
  # If max_pixels is specified, resize first
@@ -161,12 +171,16 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
161
  num_patches = patch_x * patch_y
162
  img_tokens = (num_patches + 1) * 576
163
 
164
- image_tokens_count += img_tokens
 
 
165
  image_details.append({
166
- "original_size": [img['width'], img['height']],
 
167
  "resized_size": [width, height],
168
  "grid_patches": f"{patch_x}x{patch_y}",
169
- "tokens": img_tokens
 
170
  })
171
 
172
  total_tokens = text_tokens_count + image_tokens_count
@@ -186,9 +200,25 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
186
 
187
  # --- Actual UI Logic ---
188
 
189
- def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
190
- # Construct virtual data
191
- images = [{'width': img_w, 'height': img_h} for _ in range(int(img_count))]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
193
 
194
  # Get Tokenizer
@@ -207,9 +237,9 @@ def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_co
207
  tokens = 0
208
 
209
  if model == "Qwen2.5-VL / Qwen2-VL":
210
- tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer, img_max_pixels)
211
  elif model == "Llava-1.6 (Next)":
212
- tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer, img_max_pixels)
213
  else:
214
  tokens = 0
215
 
@@ -251,10 +281,14 @@ def create_ui():
251
  text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
252
 
253
  with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
254
- with gr.Row():
255
- img_count = gr.Number(value=0, label="图片数量", precision=0)
256
- img_w = gr.Number(value=1080, label="宽 (px)")
257
- img_h = gr.Number(value=1920, label="高 (px)")
 
 
 
 
258
  with gr.Row():
259
  img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
260
 
@@ -279,6 +313,6 @@ def create_ui():
279
 
280
  btn.click(
281
  run_calculation,
282
- [text_input, model_select, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
283
  [out_json, out_file]
284
  )
 
55
 
56
  # --- Token Calculation Logic ---
57
 
58
+ def calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, max_pixels):
59
  """
60
  Qwen2-VL / Qwen2.5-VL Token Calculation Formula
61
  """
 
74
 
75
  # 2. Image Tokens
76
  image_details = []
77
+ for group in image_groups:
78
+ width, height = group['width'], group['height']
79
+ count = int(group['count'])
80
+ if count <= 0:
81
+ continue
82
 
83
  # Apply Qwen Official Smart Resize
84
  new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
 
87
  grid_h = new_h // 14
88
  img_tokens = grid_h * grid_w
89
 
90
+ group_tokens = img_tokens * count
91
+ image_tokens_count += group_tokens
92
+
93
  image_details.append({
94
+ "count": count,
95
  "original_size": [width, height],
96
  "resized_size": [new_w, new_h],
97
+ "tokens_per_image": img_tokens,
98
+ "total_tokens": group_tokens
99
  })
100
 
101
  # 3. Video Tokens
 
135
 
136
  return total_tokens, text_tokens_ids, breakdown, media_details
137
 
138
+ def calculate_llava_next_tokens(text, image_groups, tokenizer, max_pixels):
139
  """
140
  Llava-1.6 (Next) Token Calculation Formula
141
  """
 
152
 
153
  # 2. Image Tokens
154
  image_details = []
155
+ for group in image_groups:
156
+ width, height = group['width'], group['height']
157
+ count = int(group['count'])
158
+ if count <= 0:
159
+ continue
160
 
161
  # Llava-Next Logic:
162
  # If max_pixels is specified, resize first
 
171
  num_patches = patch_x * patch_y
172
  img_tokens = (num_patches + 1) * 576
173
 
174
+ group_tokens = img_tokens * count
175
+ image_tokens_count += group_tokens
176
+
177
  image_details.append({
178
+ "count": count,
179
+ "original_size": [group['width'], group['height']],
180
  "resized_size": [width, height],
181
  "grid_patches": f"{patch_x}x{patch_y}",
182
+ "tokens_per_image": img_tokens,
183
+ "total_tokens": group_tokens
184
  })
185
 
186
  total_tokens = text_tokens_count + image_tokens_count
 
200
 
201
  # --- Actual UI Logic ---
202
 
203
+ def run_calculation(text, model, img_data, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
204
+ # Construct image groups from Dataframe
205
+ image_groups = []
206
+ if img_data is not None:
207
+ # Gradio's Dataframe passes its value as a list of row lists by
208
+ # default, so iterate the rows directly. Rows that fail the int
209
+ # conversion below are skipped (try/except) rather than aborting
210
+ # the whole calculation.
211
+ for row in img_data:
212
+ try:
213
+ c, w, h = row
214
+ c = int(c)
215
+ w = int(w)
216
+ h = int(h)
217
+ if c > 0:
218
+ image_groups.append({'count': c, 'width': w, 'height': h})
219
+ except Exception:
220
+ pass
221
+
222
  videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
223
 
224
  # Get Tokenizer
 
237
  tokens = 0
238
 
239
  if model == "Qwen2.5-VL / Qwen2-VL":
240
+ tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, img_max_pixels)
241
  elif model == "Llava-1.6 (Next)":
242
+ tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, image_groups, tokenizer, img_max_pixels)
243
  else:
244
  tokens = 0
245
 
 
281
  text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
282
 
283
  with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
284
+ img_data = gr.Dataframe(
285
+ headers=["数量", "宽 (px)", "高 (px)"],
286
+ datatype=["number", "number", "number"],
287
+ value=[[0, 1080, 1920]],
288
+ label="图片列表 (可添加多行)",
289
+ col_count=(3, "fixed"),
290
+ interactive=True
291
+ )
292
  with gr.Row():
293
  img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
294
 
 
313
 
314
  btn.click(
315
  run_calculation,
316
+ [text_input, model_select, img_data, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
317
  [out_json, out_file]
318
  )