zerozeyi commited on
Commit
a29195e
·
1 Parent(s): 07ac0e8
Files changed (4) hide show
  1. README.md +4 -3
  2. app.py +249 -0
  3. conv_for_infer.py +39 -0
  4. requirements.txt +4 -0
README.md CHANGED
@@ -1,14 +1,15 @@
1
  ---
2
  title: FireRed OCR
3
- emoji: 🏢
4
- colorFrom: green
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 6.8.0
8
  python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: FireRed OCR
3
+ emoji: 🏆
4
+ colorFrom: yellow
5
+ colorTo: red
6
  sdk: gradio
7
  sdk_version: 6.8.0
8
  python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
+ short_description: FireRed-OCR for Document Recognition
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE(review): `import spaces` comes before torch/transformers — presumably
# required by the Hugging Face ZeroGPU runtime; confirm against Spaces docs.
import spaces

import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

# Hugging Face model repository holding the OCR checkpoint.
MODEL_DIR = "FireRedTeam/FireRed-OCR"

print("🔥 Loading FireRed-OCR model...")

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True
).to(device)

processor = AutoProcessor.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True
)

# Inference only: disable dropout / training-mode layers.
model.eval()

# NOTE(review): these imports are deliberately placed after model loading in
# the original source; kept in place to avoid changing import side-effect order.
import gradio as gr
import markdown
from PIL import Image
import os
from datetime import datetime
import tempfile
import shutil
from pathlib import Path
from conv_for_infer import generate_conv
import base64

# Directory where per-image markdown results are written.
MARKDOWN_OUTPUT = "md_output"
@spaces.GPU
def process_images(image_paths):
    """Run OCR on each uploaded image and return markdown output.

    Args:
        image_paths: list of image file paths from the ``gr.File`` component,
            or None/empty when nothing was uploaded.

    Returns:
        Tuple of ``(raw_markdown, latex_preview, last_md_file)`` where
        ``last_md_file`` is the path of the most recently written ``.md``
        file (or None if no file was written).
    """
    if not image_paths:
        return "<p style='color:red;'>Please upload image.</p>", None, None

    # Use the module-level constant instead of repeating the literal path.
    os.makedirs(MARKDOWN_OUTPUT, exist_ok=True)

    all_text = ""
    # Guard against NameError in the return statement when every image
    # fails before markdown_file is assigned inside the loop.
    markdown_file = None

    for image_path in image_paths:
        try:
            basename = os.path.splitext(os.path.basename(image_path))[0]
            markdown_file = os.path.join(MARKDOWN_OUTPUT, f"{basename}.md")

            # Build the chat-style conversation pairing image + OCR prompt.
            messages = generate_conv({"image_path": image_path})

            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=1024
                )

            # Drop the prompt tokens so only newly generated text remains.
            generated_ids_trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, outputs)
            ]

            text = processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]

            # Persist the per-image markdown so it can be downloaded later.
            with open(markdown_file, "w", encoding="utf-8") as f:
                f.write(text)

            all_text += text + "\n\n"

        except Exception as e:
            # Record the failure inline and keep processing remaining images.
            all_text += f"\n\n**Error processing {image_path}: {str(e)}**\n\n"

    # Turn code fences into $$ so gr.Markdown renders them as LaTeX blocks.
    latex_text = all_text.replace("```markdown", "$$")
    latex_text = latex_text.replace("```", "$$")

    return all_text.strip(), latex_text, markdown_file
93
+
def download_markdown(md_file_path):
    """Hand the markdown file path to the download widget.

    Returns the path unchanged when it points at an existing file,
    otherwise None so the ``gr.File`` component shows nothing.
    """
    if not md_file_path:
        return None
    return md_file_path if os.path.exists(md_file_path) else None
101
+
def clear_files():
    """Reset the UI: blank the upload, code preview, rendered view, download."""
    return (None,) * 4
107
+
def image_to_base64(img_path):
    """Return the file at *img_path* encoded as a base64 ASCII string."""
    raw = Path(img_path).read_bytes()
    return base64.b64encode(raw).decode("utf-8")
111
+
def preview_images(files):
    """Build an HTML strip of thumbnails for up to the first 5 uploads.

    Args:
        files: list of image file paths from ``gr.File``, or None.

    Returns:
        HTML string with inline base64 thumbnails, or None when nothing
        was uploaded.
    """
    if not files:
        return None

    preview_html = "<div style='display: flex; flex-wrap: wrap; gap: 10px;'>"
    for i, file in enumerate(files[:5]):  # only preview the first 5 images
        try:
            img = Image.open(file)
            # Shrink in place to thumbnail size.
            img.thumbnail((150, 150))

            # JPEG cannot store alpha; flatten RGBA/palette images so
            # .save(..., "JPEG") does not raise on transparent PNGs.
            if img.mode not in ("RGB", "L"):
                img = img.convert("RGB")

            # Write the thumbnail to a uniquely named temp file.
            thumb_dir = tempfile.gettempdir()
            thumb_path = os.path.join(thumb_dir, f"thumb_{i}_{datetime.now().timestamp()}.jpg")
            img.save(thumb_path, "JPEG")

            preview_html += f"""
            <div style="border: 1px solid #ddd; padding: 5px; border-radius: 5px;">
                <img src="data:image/png;base64,{image_to_base64(thumb_path)}" style="max-width: 150px; max-height: 150px;">
                <p style="text-align: center; margin: 5px 0;">Image {i+1}</p>
            </div>
            """
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # propagate; unreadable files are skipped, not fatal.
            pass

    preview_html += "</div>"
    if len(files) > 5:
        preview_html += f"<p>... and {len(files) - 5} more images</p>"

    return preview_html
146
+
# Build the Gradio interface (original comments were in Chinese; translated).
with gr.Blocks(title="FireRed-OCR") as demo:
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 20px;">
        <h1 style="display: inline-block;">🔍 FireRed-OCR</h1>
        <p style="font-size: 14px; color: #666;"><i>Upload Image → Generate Recognition Markdown</i></p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Left column: input area
            gr.Markdown("### 📤 Upload & Select")

            # Image upload component (multiple files, passed as filepaths)
            image_input = gr.File(
                label="Upload Image",
                file_count="multiple",
                file_types=["image"],
                type="filepath"
            )

            # Thumbnail preview of the uploaded images
            image_preview = gr.HTML(label="Image Preview")

            with gr.Row():
                run_btn = gr.Button("🚀 Generate Markdown", variant="primary", size="lg", scale=2)
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)

        with gr.Column(scale=1):
            # Right column: preview and download area
            gr.Markdown("### 👀 Preview & Download")

            preview_output = gr.Code(
                label="Markdown Code Preview",
                language="markdown",
                value=">Click「Generate Markdown」Button for Previewing",
                interactive=False
            )

            preview_img_output = gr.Markdown(
                label="Markdown Preview",
                latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},  # Display equations
                    {"left": "$", "right": "$", "display": False}  # Inline equations
                ]
            )

            # Download widget for the generated markdown file
            download_btn = gr.File(
                label="📥 Click to Download Markdown File",
                interactive=False,
                visible=True
            )

    # State holder carrying the generated .md file path between events
    md_file_state = gr.State()

    # Event wiring
    def update_preview(files):
        # Refresh the thumbnail strip whenever the upload list changes.
        if files:
            return preview_images(files)
        return "<p>No image available</p>"

    image_input.change(
        fn=update_preview,
        inputs=[image_input],
        outputs=[image_preview]
    )

    # Run OCR, then feed the resulting file path into the download widget.
    run_btn.click(
        fn=process_images,
        # inputs=[image_input, markdown_input],
        inputs=[image_input],
        outputs=[preview_output, preview_img_output, md_file_state]
    ).then(
        fn=download_markdown,
        inputs=[md_file_state],
        outputs=[download_btn]
    )

    # Clear everything, then restore the placeholder preview text.
    clear_btn.click(
        fn=clear_files,
        inputs=[],
        # outputs=[image_input, markdown_input, preview_output, download_btn]
        outputs=[image_input, preview_output, preview_img_output, download_btn]
    ).then(
        fn=lambda: "<p>No image available</p>",
        inputs=[],
        outputs=[image_preview]
    )

    # Footer
    gr.Markdown("""
    ---
    <p style="text-align: center; color: #666;">✨ Convert Images to Standard Markdown Easily ✨</p>
    """)

# Configure and launch the app
if __name__ == "__main__":
    demo.queue().launch(
        ssr_mode=False
    )
conv_for_infer.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
def generate_conv(data_dict):
    """Build a single-turn chat conversation for the OCR model.

    Args:
        data_dict: dict carrying the key ``"image_path"`` — path of the
            image to transcribe.

    Returns:
        A list with one user message whose content pairs the image with
        the fixed OCR instruction prompt, in the format consumed by
        ``processor.apply_chat_template``.
    """
    # NOTE(review): several LaTeX delimiters in this prompt look garbled
    # (e.g. ",(" where "\(" was probably intended) — verify against the
    # originally intended prompt text before editing.
    PROMPT = '''You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:

1. Text Processing:
- Accurately recognize all text content in the PDF image without guessing or inferring.
- Convert the recognized text into Markdown format.
- Maintain the original document structure, including headings, paragraphs, lists, etc.

2. Mathematical Formula Processing:
- Convert all mathematical formulas to LaTeX format.
- Enclose inline formulas with,(,). For example: This is an inline formula,( E = mc^2,)
- Enclose block formulas with,\[,\]. For example:,[,frac{-b,pm,sqrt{b^2 - 4ac}}{2a},]

3. Table Processing:
- Convert tables to HTML format.
- Wrap the entire table with <table> and </table>.

4. Figure Handling:
- Ignore figures content in the PDF image. Do not attempt to describe or convert images.

5. Output Format:
- Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
- For complex layouts, try to maintain the original document's structure and format as closely as possible.

Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
'''
    image_path = data_dict["image_path"]
    user_conv = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": PROMPT},
            ],
        },
    ]
    return user_conv
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ markdown==3.10.2
2
+ transformers==4.57.3
3
+ pillow==12.0.0
4
+ torchvision