Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- apps/text_tools.py +55 -21
apps/text_tools.py
CHANGED
|
@@ -55,7 +55,7 @@ def get_tokenizer(model_name):
|
|
| 55 |
|
| 56 |
# --- Token Calculation Logic ---
|
| 57 |
|
| 58 |
-
def calculate_qwen2_vl_tokens(text,
|
| 59 |
"""
|
| 60 |
Qwen2-VL / Qwen2.5-VL Token Calculation Formula
|
| 61 |
"""
|
|
@@ -74,8 +74,11 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
|
|
| 74 |
|
| 75 |
# 2. Image Tokens
|
| 76 |
image_details = []
|
| 77 |
-
for
|
| 78 |
-
width, height =
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
# Apply Qwen Official Smart Resize
|
| 81 |
new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
|
|
@@ -84,11 +87,15 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
|
|
| 84 |
grid_h = new_h // 14
|
| 85 |
img_tokens = grid_h * grid_w
|
| 86 |
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
image_details.append({
|
|
|
|
| 89 |
"original_size": [width, height],
|
| 90 |
"resized_size": [new_w, new_h],
|
| 91 |
-
"
|
|
|
|
| 92 |
})
|
| 93 |
|
| 94 |
# 3. Video Tokens
|
|
@@ -128,7 +135,7 @@ def calculate_qwen2_vl_tokens(text, images, videos, tokenizer, max_pixels):
|
|
| 128 |
|
| 129 |
return total_tokens, text_tokens_ids, breakdown, media_details
|
| 130 |
|
| 131 |
-
def calculate_llava_next_tokens(text,
|
| 132 |
"""
|
| 133 |
Llava-1.6 (Next) Token Calculation Formula
|
| 134 |
"""
|
|
@@ -145,8 +152,11 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
|
|
| 145 |
|
| 146 |
# 2. Image Tokens
|
| 147 |
image_details = []
|
| 148 |
-
for
|
| 149 |
-
width, height =
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
# Llava-Next Logic:
|
| 152 |
# If max_pixels is specified, resize first
|
|
@@ -161,12 +171,16 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
|
|
| 161 |
num_patches = patch_x * patch_y
|
| 162 |
img_tokens = (num_patches + 1) * 576
|
| 163 |
|
| 164 |
-
|
|
|
|
|
|
|
| 165 |
image_details.append({
|
| 166 |
-
"
|
|
|
|
| 167 |
"resized_size": [width, height],
|
| 168 |
"grid_patches": f"{patch_x}x{patch_y}",
|
| 169 |
-
"
|
|
|
|
| 170 |
})
|
| 171 |
|
| 172 |
total_tokens = text_tokens_count + image_tokens_count
|
|
@@ -186,9 +200,25 @@ def calculate_llava_next_tokens(text, images, tokenizer, max_pixels):
|
|
| 186 |
|
| 187 |
# --- Actual UI Logic ---
|
| 188 |
|
| 189 |
-
def run_calculation(text, model,
|
| 190 |
-
# Construct
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
|
| 193 |
|
| 194 |
# Get Tokenizer
|
|
@@ -207,9 +237,9 @@ def run_calculation(text, model, img_count, img_w, img_h, img_max_pixels, vid_co
|
|
| 207 |
tokens = 0
|
| 208 |
|
| 209 |
if model == "Qwen2.5-VL / Qwen2-VL":
|
| 210 |
-
tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text,
|
| 211 |
elif model == "Llava-1.6 (Next)":
|
| 212 |
-
tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text,
|
| 213 |
else:
|
| 214 |
tokens = 0
|
| 215 |
|
|
@@ -251,10 +281,14 @@ def create_ui():
|
|
| 251 |
text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
|
| 252 |
|
| 253 |
with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
with gr.Row():
|
| 259 |
img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
|
| 260 |
|
|
@@ -279,6 +313,6 @@ def create_ui():
|
|
| 279 |
|
| 280 |
btn.click(
|
| 281 |
run_calculation,
|
| 282 |
-
[text_input, model_select,
|
| 283 |
[out_json, out_file]
|
| 284 |
)
|
|
|
|
| 55 |
|
| 56 |
# --- Token Calculation Logic ---
|
| 57 |
|
| 58 |
+
def calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, max_pixels):
|
| 59 |
"""
|
| 60 |
Qwen2-VL / Qwen2.5-VL Token Calculation Formula
|
| 61 |
"""
|
|
|
|
| 74 |
|
| 75 |
# 2. Image Tokens
|
| 76 |
image_details = []
|
| 77 |
+
for group in image_groups:
|
| 78 |
+
width, height = group['width'], group['height']
|
| 79 |
+
count = int(group['count'])
|
| 80 |
+
if count <= 0:
|
| 81 |
+
continue
|
| 82 |
|
| 83 |
# Apply Qwen Official Smart Resize
|
| 84 |
new_h, new_w = qwen_smart_resize(height, width, factor=28, min_pixels=56*56, max_pixels=max_pixels)
|
|
|
|
| 87 |
grid_h = new_h // 14
|
| 88 |
img_tokens = grid_h * grid_w
|
| 89 |
|
| 90 |
+
group_tokens = img_tokens * count
|
| 91 |
+
image_tokens_count += group_tokens
|
| 92 |
+
|
| 93 |
image_details.append({
|
| 94 |
+
"count": count,
|
| 95 |
"original_size": [width, height],
|
| 96 |
"resized_size": [new_w, new_h],
|
| 97 |
+
"tokens_per_image": img_tokens,
|
| 98 |
+
"total_tokens": group_tokens
|
| 99 |
})
|
| 100 |
|
| 101 |
# 3. Video Tokens
|
|
|
|
| 135 |
|
| 136 |
return total_tokens, text_tokens_ids, breakdown, media_details
|
| 137 |
|
| 138 |
+
def calculate_llava_next_tokens(text, image_groups, tokenizer, max_pixels):
|
| 139 |
"""
|
| 140 |
Llava-1.6 (Next) Token Calculation Formula
|
| 141 |
"""
|
|
|
|
| 152 |
|
| 153 |
# 2. Image Tokens
|
| 154 |
image_details = []
|
| 155 |
+
for group in image_groups:
|
| 156 |
+
width, height = group['width'], group['height']
|
| 157 |
+
count = int(group['count'])
|
| 158 |
+
if count <= 0:
|
| 159 |
+
continue
|
| 160 |
|
| 161 |
# Llava-Next Logic:
|
| 162 |
# If max_pixels is specified, resize first
|
|
|
|
| 171 |
num_patches = patch_x * patch_y
|
| 172 |
img_tokens = (num_patches + 1) * 576
|
| 173 |
|
| 174 |
+
group_tokens = img_tokens * count
|
| 175 |
+
image_tokens_count += group_tokens
|
| 176 |
+
|
| 177 |
image_details.append({
|
| 178 |
+
"count": count,
|
| 179 |
+
"original_size": [group['width'], group['height']],
|
| 180 |
"resized_size": [width, height],
|
| 181 |
"grid_patches": f"{patch_x}x{patch_y}",
|
| 182 |
+
"tokens_per_image": img_tokens,
|
| 183 |
+
"total_tokens": group_tokens
|
| 184 |
})
|
| 185 |
|
| 186 |
total_tokens = text_tokens_count + image_tokens_count
|
|
|
|
| 200 |
|
| 201 |
# --- Actual UI Logic ---
|
| 202 |
|
| 203 |
+
def run_calculation(text, model, img_data, img_max_pixels, vid_count, vid_frames, vid_w, vid_h):
|
| 204 |
+
# Construct image groups from Dataframe
|
| 205 |
+
image_groups = []
|
| 206 |
+
if img_data is not None:
|
| 207 |
+
# Handle different Dataframe formats (list of lists or pandas)
|
| 208 |
+
# Gradio usually returns list of lists if type='array' (default?)
|
| 209 |
+
# Let's assume list of lists for now, or handle pandas if needed.
|
| 210 |
+
# But wait, Gradio Dataframe `value` is list of lists.
|
| 211 |
+
for row in img_data:
|
| 212 |
+
try:
|
| 213 |
+
c, w, h = row
|
| 214 |
+
c = int(c)
|
| 215 |
+
w = int(w)
|
| 216 |
+
h = int(h)
|
| 217 |
+
if c > 0:
|
| 218 |
+
image_groups.append({'count': c, 'width': w, 'height': h})
|
| 219 |
+
except Exception:
|
| 220 |
+
pass
|
| 221 |
+
|
| 222 |
videos = [{'width': vid_w, 'height': vid_h, 'frames': int(vid_frames)} for _ in range(int(vid_count))]
|
| 223 |
|
| 224 |
# Get Tokenizer
|
|
|
|
| 237 |
tokens = 0
|
| 238 |
|
| 239 |
if model == "Qwen2.5-VL / Qwen2-VL":
|
| 240 |
+
tokens, text_tokens_ids, breakdown, media_details = calculate_qwen2_vl_tokens(text, image_groups, videos, tokenizer, img_max_pixels)
|
| 241 |
elif model == "Llava-1.6 (Next)":
|
| 242 |
+
tokens, text_tokens_ids, breakdown, media_details = calculate_llava_next_tokens(text, image_groups, tokenizer, img_max_pixels)
|
| 243 |
else:
|
| 244 |
tokens = 0
|
| 245 |
|
|
|
|
| 281 |
text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")
|
| 282 |
|
| 283 |
with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
|
| 284 |
+
img_data = gr.Dataframe(
|
| 285 |
+
headers=["数量", "宽 (px)", "高 (px)"],
|
| 286 |
+
datatype=["number", "number", "number"],
|
| 287 |
+
value=[[0, 1080, 1920]],
|
| 288 |
+
label="图片列表 (可添加多行)",
|
| 289 |
+
col_count=(3, "fixed"),
|
| 290 |
+
interactive=True
|
| 291 |
+
)
|
| 292 |
with gr.Row():
|
| 293 |
img_max_pixels = gr.Number(value=512*512, label="Max Pixels (最大像素限制)", precision=0)
|
| 294 |
|
|
|
|
| 313 |
|
| 314 |
btn.click(
|
| 315 |
run_calculation,
|
| 316 |
+
[text_input, model_select, img_data, img_max_pixels, vid_count, vid_frames, vid_w, vid_h],
|
| 317 |
[out_json, out_file]
|
| 318 |
)
|