MashiroLn committed on
Commit
6ee5c32
·
verified ·
1 Parent(s): 7442a76

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. apps/text_tools.py +71 -36
  2. requirements.txt +2 -1
apps/text_tools.py CHANGED
@@ -4,11 +4,33 @@ import json
4
  import os
5
  from transformers import AutoTokenizer
6
 
7
- # --- Tokenizer 加载逻辑 ---
8
- # 为了避免每次请求都重新加载,我们可以尝试缓存 tokenizer
9
- # 但在 HF Spaces 中,内存有限,且模型可能很大。
10
- # 对于 Qwen2.5-VL,我们可以使用 Qwen/Qwen2.5-VL-7B-Instruct 的 tokenizer
11
- # 对于 Llava,通常使用 Llama-2 Vicuna 的 tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  TOKENIZERS = {}
13
 
14
  def get_tokenizer(model_name):
@@ -17,12 +39,10 @@ def get_tokenizer(model_name):
17
 
18
  try:
19
  if model_name == "Qwen2.5-VL / Qwen2-VL":
20
- # Qwen2-VL 使用 Qwen2 tokenizer
21
- # 注意:这里需要联网下载 tokenizer.json,HF Spaces 通常允许
22
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
23
  elif model_name == "Llava-1.6 (Next)":
24
- # Llava-1.6 基于 Vicuna/Llama-2,这里用 Llama-2 tokenizer 近似,或者直接用 llava-hf
25
- # 为了通用性,我们使用 llava-hf/llava-v1.6-vicuna-7b-hf
26
  tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf", trust_remote_code=True)
27
  else:
28
  return None
@@ -33,17 +53,17 @@ def get_tokenizer(model_name):
33
  print(f"Error loading tokenizer for {model_name}: {e}")
34
  return None
35
 
36
- # --- Token 计算逻辑 ---
37
 
38
- def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
39
  """
40
- Qwen2-VL / Qwen2.5-VL Token 计算公式
41
  """
42
  text_tokens_count = 0
43
  image_tokens_count = 0
44
  video_tokens_count = 0
45
 
46
- # 1. 文本 Token (真实计算)
47
  text_tokens_ids = []
48
  if tokenizer:
49
  text_tokens_ids = tokenizer.encode(text)
@@ -52,12 +72,14 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
52
  # Fallback
53
  text_tokens_count = len(text) // 2
54
 
55
- # 2. 图片 Token
56
  image_details = []
57
  for img in images:
58
  width, height = img['width'], img['height']
59
- new_w = int(round(width / 28.0) * 28)
60
- new_h = int(round(height / 28.0) * 28)
 
 
61
  grid_w = new_w // 14
62
  grid_h = new_h // 14
63
  img_tokens = grid_h * grid_w
@@ -69,13 +91,15 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
69
  "tokens": img_tokens
70
  })
71
 
72
- # 3. 视频 Token
73
  video_details = []
74
  for vid in videos:
75
  frames = vid['frames']
76
  width, height = vid['width'], vid['height']
77
- new_w = int(round(width / 28.0) * 28)
78
- new_h = int(round(height / 28.0) * 28)
 
 
79
  grid_w = new_w // 14
80
  grid_h = new_h // 14
81
  frame_tokens = grid_h * grid_w
@@ -104,14 +128,14 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
104
 
105
  return total_tokens, text_tokens_ids, breakdown, media_details
106
 
107
- def calculate_llava_next_tokens(text, images, tokenizer):
108
  """
109
- Llava-1.6 (Next) Token 计算公式
110
  """
111
  text_tokens_count = 0
112
  image_tokens_count = 0
113
 
114
- # 1. 文本 Token
115
  text_tokens_ids = []
116
  if tokenizer:
117
  text_tokens_ids = tokenizer.encode(text)
@@ -119,10 +143,18 @@ def calculate_llava_next_tokens(text, images, tokenizer):
119
  else:
120
  text_tokens_count = len(text) // 2
121
 
122
- # 2. 图片 Token
123
  image_details = []
124
  for img in images:
125
  width, height = img['width'], img['height']
 
 
 
 
 
 
 
 
126
  scale_res = 336
127
  patch_x = math.ceil(width / scale_res)
128
  patch_y = math.ceil(height / scale_res)
@@ -131,8 +163,9 @@ def calculate_llava_next_tokens(text, images, tokenizer):
131
 
132
  image_tokens_count += img_tokens
133
  image_details.append({
134
- "original_size": [width, height],
135
- "resized_size": ["Dynamic Grid", f"{patch_x}x{patch_y} patches"],
 
136
  "tokens": img_tokens
137
  })
138
 
@@ -151,17 +184,17 @@ def calculate_llava_next_tokens(text, images, tokenizer):
151
 
152
  return total_tokens, text_tokens_ids, breakdown, media_details
153
 
154
- # --- 实际 UI 逻辑 ---
155
 
156
- def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames, vid_w, vid_h):
157
- # 构造虚拟数据
158
  images = [{'width': img_w, 'height': img_h} for _ in range(int(img_count))]
159
  videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
160
 
161
- # 获取 Tokenizer
162
  tokenizer = get_tokenizer(model)
163
 
164
- # 确定真实模型 ID
165
  model_id_map = {
166
  "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
167
  "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
@@ -174,17 +207,17 @@ def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames,
174
  tokens = 0
175
 
176
  if model == "Qwen2.5-VL / Qwen2-VL":
177
- tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer)
178
  elif model == "Llava-1.6 (Next)":
179
- tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer)
180
  else:
181
  tokens = 0
182
 
183
- # 生成 Token 对应文件
184
  token_file_path = None
185
  if tokenizer and text_tokens_ids:
186
  token_data = []
187
- # 解码每个 token id 对应的 string
188
  for tid in text_tokens_ids:
189
  token_str = tokenizer.decode([tid])
190
  token_data.append({"id": tid, "token": token_str})
@@ -193,7 +226,7 @@ def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames,
193
  with open(token_file_path, "w", encoding="utf-8") as f:
194
  json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)
195
 
196
- # 构造最终返回的 JSON
197
  result = {
198
  "model_id": real_model_id,
199
  "tokenizer_loaded": tokenizer is not None,
@@ -222,6 +255,8 @@ def create_ui():
222
  img_count = gr.Number(value=1, label="图片数量", precision=0)
223
  img_w = gr.Number(value=1024, label="宽 (px)")
224
  img_h = gr.Number(value=1024, label="高 (px)")
 
 
225
 
226
  with gr.Accordion("🎥 视频设置 (Videos)", open=False):
227
  with gr.Row():
@@ -244,6 +279,6 @@ def create_ui():
244
 
245
  btn.click(
246
  run_calculation,
247
- [text_input, model_select, img_count, img_w, img_h, vid_count, vid_frames, vid_w, vid_h],
248
  [out_json, out_file]
249
  )
 
4
  import os
5
  from transformers import AutoTokenizer
6
 
7
+ # Try to import qwen_vl_utils, otherwise use the built-in official implementation copy
8
+ try:
9
+ from qwen_vl_utils.vision_process import smart_resize as qwen_smart_resize
10
+ except ImportError:
11
+ # Qwen-VL-Utils official implementation copy
12
def qwen_smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=1280 * 1280):
    """
    Compute the resized (height, width) used for Qwen2-VL style vision inputs.

    Local fallback used when `qwen_vl_utils.vision_process.smart_resize`
    cannot be imported. Both sides are snapped to a multiple of `factor`,
    and the total pixel count is pushed into [`min_pixels`, `max_pixels`]
    while keeping the aspect ratio approximately unchanged.

    Args:
        height: Original height in pixels (must be > 0).
        width: Original width in pixels (must be > 0).
        factor: Both returned sides are multiples of this value.
        min_pixels: Lower bound on `h_bar * w_bar`.
        max_pixels: Upper bound on `h_bar * w_bar`.

    Returns:
        Tuple `(h_bar, w_bar)` with the resized height and width.
    """
    if max(height, width) / min(height, width) > 200:
        # Behaviour specific to this file: extreme aspect ratios fall back to
        # 1-pixel granularity rather than being rejected.
        factor = 1

    # Clamp to at least one `factor`: without this, a side shorter than
    # factor / 2 rounds to 0, the product becomes 0, and the image is wrongly
    # routed through the `min_pixels` upscaling branch below.
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)

    if h_bar * w_bar > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down so the
        # result stays within `max_pixels`.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        # Too small: grow both sides by the same ratio, rounding up so the
        # result reaches `min_pixels`.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor

    return h_bar, w_bar
32
+
33
+ # --- Tokenizer Loading Logic ---
34
  TOKENIZERS = {}
35
 
36
  def get_tokenizer(model_name):
 
39
 
40
  try:
41
  if model_name == "Qwen2.5-VL / Qwen2-VL":
42
+ # Qwen2-VL uses Qwen2 tokenizer
 
43
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", trust_remote_code=True)
44
  elif model_name == "Llava-1.6 (Next)":
45
+ # Llava-1.6 based on Vicuna/Llama-2
 
46
  tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf", trust_remote_code=True)
47
  else:
48
  return None
 
53
  print(f"Error loading tokenizer for {model_name}: {e}")
54
  return None
55
 
56
+ # --- Token Calculation Logic ---
57
 
58
+ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
59
  """
60
+ Qwen2-VL / Qwen2.5-VL Token Calculation Formula
61
  """
62
  text_tokens_count = 0
63
  image_tokens_count = 0
64
  video_tokens_count = 0
65
 
66
+ # 1. Text Tokens (Real Calculation)
67
  text_tokens_ids = []
68
  if tokenizer:
69
  text_tokens_ids = tokenizer.encode(text)
 
72
  # Fallback
73
  text_tokens_count = len(text) // 2
74
 
75
+ # 2. Image Tokens
76
  image_details = []
77
  for img in images:
78
  width, height = img['width'], img['height']
79
+
80
+ # Apply Qwen Official Smart Resize
81
+ new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
82
+
83
  grid_w = new_w // 14
84
  grid_h = new_h // 14
85
  img_tokens = grid_h * grid_w
 
91
  "tokens": img_tokens
92
  })
93
 
94
+ # 3. Video Tokens
95
  video_details = []
96
  for vid in videos:
97
  frames = vid['frames']
98
  width, height = vid['width'], vid['height']
99
+
100
+ # Video processing logic is similar to images
101
+ new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
102
+
103
  grid_w = new_w // 14
104
  grid_h = new_h // 14
105
  frame_tokens = grid_h * grid_w
 
128
 
129
  return total_tokens, text_tokens_ids, breakdown, media_details
130
 
131
+ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
132
  """
133
+ Llava-1.6 (Next) Token Calculation Formula
134
  """
135
  text_tokens_count = 0
136
  image_tokens_count = 0
137
 
138
+ # 1. Text Tokens
139
  text_tokens_ids = []
140
  if tokenizer:
141
  text_tokens_ids = tokenizer.encode(text)
 
143
  else:
144
  text_tokens_count = len(text) // 2
145
 
146
+ # 2. Image Tokens
147
  image_details = []
148
  for img in images:
149
  width, height = img['width'], img['height']
150
+
151
+ # Llava-Next Logic:
152
+ # If max_pixels is specified, resize first
153
+ if max_pixels > 0 and (width * height > max_pixels):
154
+ scale_factor = math.sqrt(max_pixels / (width * height))
155
+ width = int(width * scale_factor)
156
+ height = int(height * scale_factor)
157
+
158
  scale_res = 336
159
  patch_x = math.ceil(width / scale_res)
160
  patch_y = math.ceil(height / scale_res)
 
163
 
164
  image_tokens_count += img_tokens
165
  image_details.append({
166
+ "original_size": [img['width'], img['height']],
167
+ "resized_size": [width, height],
168
+ "grid_patches": f"{patch_x}x{patch_y}",
169
  "tokens": img_tokens
170
  })
171
 
 
184
 
185
  return total_tokens, text_tokens_ids, breakdown, media_details
186
 
187
+ # --- Actual UI Logic ---
188
 
189
+ def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
190
+ # Construct virtual data
191
  images = [{'width': img_w, 'height': img_h} for _ in range(int(img_count))]
192
  videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
193
 
194
+ # Get Tokenizer
195
  tokenizer = get_tokenizer(model)
196
 
197
+ # Determine real model ID
198
  model_id_map = {
199
  "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
200
  "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf"
 
207
  tokens = 0
208
 
209
  if model == "Qwen2.5-VL / Qwen2-VL":
210
+ tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, images, videos, tokenizer, img_max_pixels)
211
  elif model == "Llava-1.6 (Next)":
212
+ tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, images, tokenizer, img_max_pixels)
213
  else:
214
  tokens = 0
215
 
216
+ # Generate Token Analysis File
217
  token_file_path = None
218
  if tokenizer and text_tokens_ids:
219
  token_data = []
220
+ # Decode each token id
221
  for tid in text_tokens_ids:
222
  token_str = tokenizer.decode([tid])
223
  token_data.append({"id": tid, "token": token_str})
 
226
  with open(token_file_path, "w", encoding="utf-8") as f:
227
  json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)
228
 
229
+ # Construct final JSON result
230
  result = {
231
  "model_id": real_model_id,
232
  "tokenizer_loaded": tokenizer is not None,
 
255
  img_count = gr.Number(value=1, label="图片数量", precision=0)
256
  img_w = gr.Number(value=1024, label="宽 (px)")
257
  img_h = gr.Number(value=1024, label="高 (px)")
258
+ with gr.Row():
259
+ img_max_pixels = gr.Number(value=1280*1280, label="Max Pixels (最大像素限制)", precision=0)
260
 
261
  with gr.Accordion("🎥 视频设置 (Videos)", open=False):
262
  with gr.Row():
 
279
 
280
  btn.click(
281
  run_calculation,
282
+ [text_input, model_select, img_count, img_w, img_h, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
283
  [out_json, out_file]
284
  )
requirements.txt CHANGED
@@ -3,4 +3,5 @@ Pillow
3
  img2pdf
4
  huggingface_hub
5
  transformers
6
- tiktoken
 
 
3
  img2pdf
4
  huggingface_hub
5
  transformers
6
+ tiktoken
7
+ qwen-vl-utils