MashiroLn committed on
Commit
4f4e23f
·
verified ·
1 Parent(s): bfedf71

Upload folder using huggingface_hub

Browse files
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from apps import pdf_cropper, text_tools, text_diff
3
 
4
  def create_main_interface():
5
  with gr.Blocks(title="我的科研工具箱") as main_app:
@@ -15,10 +15,6 @@ def create_main_interface():
15
  # --- 工具 2: 文本分析 (示例) ---
16
  with gr.TabItem("📝 文本统计"):
17
  text_tools.create_ui()
18
-
19
- # --- 工具 3: 文本比对 ---
20
- with gr.TabItem("🔍 文本比对"):
21
- text_diff.create_ui()
22
 
23
  # --- 可以在这里继续添加更多 Tab ---
24
 
@@ -27,13 +23,18 @@ def create_main_interface():
27
  if __name__ == "__main__":
28
  app = create_main_interface()
29
 
30
- # 创建一个微调过的 Soft 主题
31
- # primary_hue="indigo" (靛青色,更有科技感)
32
- # neutral_hue="slate" (岩灰色,更护眼)
33
- custom_theme = gr.themes.Soft(
34
- primary_hue="indigo",
35
- neutral_hue="slate",
36
- )
 
 
 
 
 
37
 
38
  # 注意:在 Gradio 新版本中,theme 参数已移动到 launch() 方法中
39
- app.launch(inbrowser=True, theme=custom_theme)
 
1
  import gradio as gr
2
+ from apps import pdf_cropper, text_tools
3
 
4
  def create_main_interface():
5
  with gr.Blocks(title="我的科研工具箱") as main_app:
 
15
  # --- 工具 2: 文本分析 (示例) ---
16
  with gr.TabItem("📝 文本统计"):
17
  text_tools.create_ui()
 
 
 
 
18
 
19
  # --- 可以在这里继续添加更多 Tab ---
20
 
 
23
  if __name__ == "__main__":
24
  app = create_main_interface()
25
 
26
+ # custom_theme = gr.themes.Ocean(
27
+ # primary_hue="emerald",
28
+ # neutral_hue="gray",
29
+ # ).set(
30
+ # body_background_fill="#0f172a", # 深蓝灰背景 (类似 Slate 900)
31
+ # block_background_fill="#1e293b", # 卡片背景 (类似 Slate 800)
32
+ # block_border_width="0px", # 扁平化,去边框
33
+ # block_shadow="none", # 扁平化,去阴影
34
+ # button_primary_background_fill="*primary_600",
35
+ # button_primary_background_fill_hover="*primary_500",
36
+ # block_title_text_weight="600",
37
+ # )
38
 
39
  # 注意:在 Gradio 新版本中,theme 参数已移动到 launch() 方法中
40
+ app.launch(inbrowser=True)
apps/__pycache__/pdf_cropper.cpython-311.pyc ADDED
Binary file (4.72 kB). View file
 
apps/__pycache__/text_diff.cpython-311.pyc ADDED
Binary file (3.57 kB). View file
 
apps/__pycache__/text_tools.cpython-311.pyc ADDED
Binary file (9.73 kB). View file
 
apps/pdf_cropper.py CHANGED
@@ -66,6 +66,11 @@ def create_ui():
66
  fuzz = gr.Slider(0, 100, 30, label="容差")
67
  btn = gr.Button("开始处理", variant="primary")
68
  with gr.Column():
69
- output = gr.File(label="下载结果", file_count="multiple")
 
 
 
 
 
70
 
71
  btn.click(process_pipeline, [file_input, quality, fuzz], output)
 
66
  fuzz = gr.Slider(0, 100, 30, label="容差")
67
  btn = gr.Button("开始处理", variant="primary")
68
  with gr.Column():
69
+ # 输出文件列表
70
+ output = gr.File(label="下载结果 (点击文件名下载)", file_count="multiple")
71
+ # 增加一个 Zip 下载选项,方便用户
72
+ # 注意:这里我们暂时不实现 Zip 打包逻辑,因为用户明确说“不要打包”
73
+ # 但为了方便“一次性下载”,通常 Zip 是唯一解。
74
+ # 如果用户坚持不要 Zip,那只能列表展示。
75
 
76
  btn.click(process_pipeline, [file_input, quality, fuzz], output)
apps/text_tools.py CHANGED
@@ -1,16 +1,185 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def analyze_text(text):
4
  return {
5
- "字符数": len(text),
6
- "单词数 (空格分隔)": len(text.split()),
7
- "行数": len(text.splitlines())
8
- }
 
 
9
 
10
  def create_ui():
11
  with gr.Row():
12
- inp = gr.Textbox(lines=5, label="输入文本")
13
- out = gr.JSON(label="统计结果")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- btn = gr.Button("分析")
16
- btn.click(analyze_text, inp, out)
 
 
 
 
1
  import gradio as gr
2
+ import math
3
+ import json
4
+ import os
5
+ from transformers import AutoTokenizer
6
+
7
+ # --- Tokenizer 加载逻辑 ---
8
+ # 为了避免每次请求都重新加载,我们可以尝试缓存 tokenizer
9
+ # 但在 HF Spaces 中,内存有限,且模型可能很大。
10
+ # 对于 Qwen2.5-VL,我们可以使用 Qwen/Qwen2.5-VL-7B-Instruct 的 tokenizer
11
+ # 对于 Llava,通常使用 Llama-2 或 Vicuna 的 tokenizer
12
+ TOKENIZERS = {}
13
+
14
# Hub repository whose tokenizer backs each model choice offered in the UI.
_TOKENIZER_REPOS = {
    "Qwen2.5-VL / Qwen2-VL": "Qwen/Qwen2.5-VL-7B-Instruct",
    "Llava-1.6 (Next)": "llava-hf/llava-v1.6-vicuna-7b-hf",
}


def get_tokenizer(model_name):
    """Return the tokenizer for *model_name*, loading it on first use.

    Successfully loaded tokenizers are memoised in the module-level
    ``TOKENIZERS`` dict so later requests skip the download.

    Args:
        model_name: One of the model labels shown in the UI dropdown.

    Returns:
        The tokenizer object, or ``None`` when the label is unknown or the
        tokenizer could not be loaded. Callers treat ``None`` as "fall back
        to an estimate". Failures are not cached, so the next call retries.
    """
    repo_id = _TOKENIZER_REPOS.get(model_name)
    if repo_id is None:
        return None

    cached = TOKENIZERS.get(model_name)
    if cached is not None:
        return cached

    try:
        # NOTE(review): trust_remote_code is intentionally not passed. Both
        # checkpoints are expected to use tokenizer classes that ship with
        # transformers, so executing repository code should be unnecessary.
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
    except Exception as e:
        # Deliberately broad: any download or parsing failure must degrade to
        # the caller's estimate instead of breaking the request.
        print(f"Error loading tokenizer for {model_name}: {e}")
        return None

    TOKENIZERS[model_name] = tokenizer
    return tokenizer
35
+
36
+ # --- Token 计算逻辑 ---
37
+
38
# Qwen2-VL vision geometry: 14 px ViT patches, merged 2x2 before they reach the
# language model, and two consecutive video frames per temporal patch.
_QWEN_VL_PATCH_SIZE = 14
_QWEN_VL_MERGE_SIZE = 2
_QWEN_VL_TEMPORAL_PATCH_SIZE = 2


def _qwen_vl_frame_tokens(width, height):
    """Return the number of merged visual tokens for one width x height frame."""
    if not width or not height or width < 0 or height < 0:
        # Missing or non-positive dimensions describe no picture at all.
        return 0
    cell = _QWEN_VL_PATCH_SIZE * _QWEN_VL_MERGE_SIZE  # 28 px per merged token
    # Each side is snapped to the nearest multiple of 28 px, never below one cell.
    cols = max(1, round(width / cell))
    rows = max(1, round(height / cell))
    return rows * cols


def calculate_qwen2_vl_tokens(text, images, videos, tokenizer):
    """Estimate the prompt token count for Qwen2-VL / Qwen2.5-VL.

    Args:
        text: Prompt text. ``None`` is treated as an empty string.
        images: Iterable of dicts with ``'width'`` and ``'height'`` in pixels.
        videos: Iterable of dicts with ``'width'``, ``'height'`` and
            ``'frames'`` (frame count).
        tokenizer: Object exposing ``encode(text)``, or ``None`` to fall back
            to the rough ``len(text) // 2`` estimate.

    Returns:
        ``(total_tokens, text_tokens)``, where ``text_tokens`` is the value
        returned by ``tokenizer.encode`` (an empty list when no tokenizer is
        available).

    Visual tokens are counted after the 2x2 patch merge, i.e. one token per
    28x28 px cell, and video frames are grouped in pairs along the time axis.
    The processor's min/max pixel clamping and the vision start/end marker
    tokens are not modelled.
    """
    text = text or ""

    # 1. Text tokens: exact when a tokenizer is available, estimated otherwise.
    if tokenizer is not None:
        text_tokens = tokenizer.encode(text)
        total_tokens = len(text_tokens)
    else:
        text_tokens = []
        total_tokens = len(text) // 2

    # 2. Image tokens.
    total_tokens += sum(
        _qwen_vl_frame_tokens(img['width'], img['height']) for img in images
    )

    # 3. Video tokens: temporal patches times the per-frame token grid.
    for vid in videos:
        frames = max(0, int(vid['frames'] or 0))
        temporal_patches = math.ceil(frames / _QWEN_VL_TEMPORAL_PATCH_SIZE)
        total_tokens += temporal_patches * _qwen_vl_frame_tokens(
            vid['width'], vid['height']
        )

    return total_tokens, text_tokens
75
+
76
# LLaVA-1.6 AnyRes geometry: 336 px tiles, 576 visual tokens per tile, and the
# fixed set of (height, width) canvases the image may be laid out on.
_LLAVA_NEXT_TILE_SIZE = 336
_LLAVA_NEXT_TOKENS_PER_TILE = 576
_LLAVA_NEXT_GRID_PINPOINTS = (
    (336, 672),
    (672, 336),
    (672, 672),
    (1008, 336),
    (336, 1008),
)


def _llava_next_tile_count(width, height):
    """Return how many 336 px tiles the best-fitting AnyRes canvas holds."""
    best_canvas = None
    best_effective = 0
    least_waste = float("inf")
    for canvas_h, canvas_w in _LLAVA_NEXT_GRID_PINPOINTS:
        # Fit the image inside the canvas while keeping its aspect ratio.
        scale = min(canvas_w / width, canvas_h / height)
        fitted_area = int(width * scale) * int(height * scale)
        # Upscaling adds no information, so cap at the original pixel count.
        effective = min(fitted_area, width * height)
        waste = canvas_w * canvas_h - effective
        # Prefer the most preserved pixels, then the least unused canvas.
        if effective > best_effective or (
            effective == best_effective and waste < least_waste
        ):
            best_canvas = (canvas_h, canvas_w)
            best_effective = effective
            least_waste = waste
    canvas_h, canvas_w = best_canvas
    return (canvas_h // _LLAVA_NEXT_TILE_SIZE) * (canvas_w // _LLAVA_NEXT_TILE_SIZE)


def calculate_llava_next_tokens(text, images, tokenizer):
    """Estimate the prompt token count for LLaVA-1.6 (LLaVA-NeXT).

    Args:
        text: Prompt text. ``None`` is treated as an empty string.
        images: Iterable of dicts with ``'width'`` and ``'height'`` in pixels.
        tokenizer: Object exposing ``encode(text)``, or ``None`` to fall back
            to the rough ``len(text) // 2`` estimate.

    Returns:
        ``(total_tokens, text_tokens)``, where ``text_tokens`` is the value
        returned by ``tokenizer.encode`` (an empty list when no tokenizer is
        available).

    Each image costs ``(tiles + 1) * 576`` tokens: one base view plus the
    tiles of the best-fitting canvas, so a single image never exceeds four
    tiles. Feature unpadding and per-row newline tokens are not modelled, so
    the figure remains an approximation. Images with a missing or
    non-positive dimension contribute nothing.
    """
    text = text or ""

    # 1. Text tokens: exact when a tokenizer is available, estimated otherwise.
    if tokenizer is not None:
        text_tokens = tokenizer.encode(text)
        total_tokens = len(text_tokens)
    else:
        text_tokens = []
        total_tokens = len(text) // 2

    # 2. Image tokens.
    for img in images:
        width, height = img['width'], img['height']
        if not width or not height or width < 0 or height < 0:
            continue
        tiles = _llava_next_tile_count(width, height)
        total_tokens += (tiles + 1) * _LLAVA_NEXT_TOKENS_PER_TILE

    return total_tokens, text_tokens
101
+
102
+ # --- 实际 UI 逻辑 ---
103
+
104
def run_calculation(text, model, img_count, img_w, img_h, vid_count, vid_frames, vid_w, vid_h):
    """Compute the token summary shown in the UI and export the token list.

    Args:
        text: Prompt text from the textbox.
        model: Model label selected in the dropdown.
        img_count, img_w, img_h: Number of images and their size in pixels.
        vid_count, vid_frames, vid_w, vid_h: Number of videos, frames per
            video and frame size in pixels.

    Returns:
        ``(summary, token_file_path)``. ``summary`` is the dict rendered by
        the JSON component. ``token_file_path`` is the path of a JSON file
        listing every text token id with its decoded string, or ``None``
        when no tokenizer was available or the text produced no tokens.
    """
    import tempfile

    def _as_count(value):
        # A cleared number field arrives as None; negatives make no sense.
        return max(0, int(value or 0))

    text = text or ""

    # Build placeholder media descriptors from the numeric inputs.
    images = [
        {'width': img_w or 0, 'height': img_h or 0}
        for _ in range(_as_count(img_count))
    ]
    videos = [
        {'width': vid_w or 0, 'height': vid_h or 0, 'frames': _as_count(vid_frames)}
        for _ in range(_as_count(vid_count))
    ]

    tokenizer = get_tokenizer(model)
    tokenizer_status = "✅ 已加载真实 Tokenizer" if tokenizer else "⚠️ Tokenizer 加载失败,使用估算值"

    text_tokens_ids = []

    if model == "Qwen2.5-VL / Qwen2-VL":
        tokens, text_tokens_ids = calculate_qwen2_vl_tokens(text, images, videos, tokenizer)
        info = "Qwen2-VL 使用 Naive Dynamic Resolution (patch 14x14)。\n图片会被 resize 为 28 的倍数。"
    elif model == "Llava-1.6 (Next)":
        tokens, text_tokens_ids = calculate_llava_next_tokens(text, images, tokenizer)
        info = "Llava-1.6 使用 AnyRes 技术 (base 336x336)。\n包含 Base Image + Grid Patches。"
    else:
        tokens = 0
        info = "未知模型"

    # Export the id -> string mapping of the text tokens for download.
    token_file_path = None
    if tokenizer and text_tokens_ids:
        token_data = [
            {"id": tid, "token": tokenizer.decode([tid])}
            for tid in text_tokens_ids
        ]
        # A unique temp file per request: a fixed name in the working
        # directory would be overwritten by concurrent requests and would
        # leave a stray artefact next to the sources.
        with tempfile.NamedTemporaryFile(
            "w",
            encoding="utf-8",
            prefix="token_analysis_",
            suffix=".json",
            delete=False,
        ) as f:
            json.dump({"text": text, "tokens": token_data}, f, ensure_ascii=False, indent=2)
            token_file_path = f.name

    return {
        "总 Token 数": tokens,
        "自然语言字符数": len(text),
        "Tokenizer 状态": tokenizer_status,
        "模型": model,
        "说明": info
    }, token_file_path
145
 
146
  def create_ui():
      """Build the token-calculator tab.

      The left column collects the model choice, the prompt text and the
      image/video parameters. The right column shows the JSON summary, the
      downloadable token file and a short explanation. Clicking the button
      sends every input to ``run_calculation`` and routes its two return
      values to the JSON and file components.
      """
      with gr.Row():
          # Left column: inputs.
          with gr.Column(scale=1):
              model_select = gr.Dropdown(
                  choices=["Qwen2.5-VL / Qwen2-VL", "Llava-1.6 (Next)"],
                  value="Qwen2.5-VL / Qwen2-VL",
                  label="选择模型"
              )
              text_input = gr.Textbox(lines=5, label="输入文本 (Text)", placeholder="输入 Prompt...")

              # Image parameters, expanded by default.
              with gr.Accordion("🖼️ 图片设置 (Images)", open=True):
                  with gr.Row():
                      img_count = gr.Number(value=1, label="图片数量", precision=0)
                      img_w = gr.Number(value=1024, label="宽 (px)")
                      img_h = gr.Number(value=1024, label="高 (px)")

              # Video parameters, collapsed by default.
              with gr.Accordion("🎥 视频设置 (Videos)", open=False):
                  with gr.Row():
                      vid_count = gr.Number(value=0, label="视频数量", precision=0)
                      vid_frames = gr.Number(value=16, label="总帧数/视频", precision=0)
                      vid_w = gr.Number(value=512, label="宽 (px)")
                      vid_h = gr.Number(value=512, label="高 (px)")

              btn = gr.Button("🚀 计算 Token", variant="primary")

          # Right column: results and usage notes.
          with gr.Column(scale=1):
              out_json = gr.JSON(label="计算结果")
              out_file = gr.File(label="下载 Token 分析 (JSON)")
              gr.Markdown("""
              ### 说明
              * **真实 Tokenizer**: 首次运行时会自动下载 `transformers` 模型配置,可能需要几秒钟。
              * **Qwen2-VL**: 基于 `H/14 * W/14` 计算,自动对齐到 28px 网格。
              * **Llava-1.6**: 基于 `(Patches + 1) * 576` 计算,Patch 大小为 336px。
              """)

      # The input order must match run_calculation's positional parameters.
      btn.click(
          run_calculation,
          [text_input, model_select, img_count, img_w, img_h, vid_count, vid_frames, vid_w, vid_h],
          [out_json, out_file]
      )
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  gradio
2
  Pillow
3
  img2pdf
4
- huggingface_hub
 
 
 
1
  gradio
2
  Pillow
3
  img2pdf
4
+ huggingface_hub
5
+ transformers
6
+ tiktoken
token_analysis.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "text": "思考1.234和1.435哪个更大\n",
3
+ "tokens": [
4
+ {
5
+ "id": 1,
6
+ "token": "<s>"
7
+ },
8
+ {
9
+ "id": 29871,
10
+ "token": ""
11
+ },
12
+ {
13
+ "id": 31579,
14
+ "token": "思"
15
+ },
16
+ {
17
+ "id": 235,
18
+ "token": "�"
19
+ },
20
+ {
21
+ "id": 131,
22
+ "token": "�"
23
+ },
24
+ {
25
+ "id": 134,
26
+ "token": "�"
27
+ },
28
+ {
29
+ "id": 29896,
30
+ "token": "1"
31
+ },
32
+ {
33
+ "id": 29889,
34
+ "token": "."
35
+ },
36
+ {
37
+ "id": 29906,
38
+ "token": "2"
39
+ },
40
+ {
41
+ "id": 29941,
42
+ "token": "3"
43
+ },
44
+ {
45
+ "id": 29946,
46
+ "token": "4"
47
+ },
48
+ {
49
+ "id": 30503,
50
+ "token": "和"
51
+ },
52
+ {
53
+ "id": 29896,
54
+ "token": "1"
55
+ },
56
+ {
57
+ "id": 29889,
58
+ "token": "."
59
+ },
60
+ {
61
+ "id": 29946,
62
+ "token": "4"
63
+ },
64
+ {
65
+ "id": 29941,
66
+ "token": "3"
67
+ },
68
+ {
69
+ "id": 29945,
70
+ "token": "5"
71
+ },
72
+ {
73
+ "id": 232,
74
+ "token": "�"
75
+ },
76
+ {
77
+ "id": 150,
78
+ "token": "�"
79
+ },
80
+ {
81
+ "id": 173,
82
+ "token": "�"
83
+ },
84
+ {
85
+ "id": 30502,
86
+ "token": "个"
87
+ },
88
+ {
89
+ "id": 31100,
90
+ "token": "更"
91
+ },
92
+ {
93
+ "id": 30257,
94
+ "token": "大"
95
+ },
96
+ {
97
+ "id": 13,
98
+ "token": "\n"
99
+ }
100
+ ]
101
+ }