GiantPandas committed on
Commit
16dedd7
·
verified ·
1 Parent(s): 18fc128

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +371 -0
app.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ import json
5
+ import time
6
+ import copy
7
+ import base64
8
+ import asyncio
9
+ import tempfile
10
+ import subprocess
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ import zipfile
14
+
15
+ import numpy as np
16
+ import gradio as gr
17
+ from PIL import Image
18
+ from pdf2image import convert_from_path
19
+ from loguru import logger
20
+ from openai import OpenAI, AsyncOpenAI
21
+ from gradio_pdf import PDF
22
+
23
+ import uuid
24
+ import tqdm
25
+
26
+ import requests
27
+
28
+
29
# Ready-made instructions offered in the "Prompt" dropdown of the UI.
# Order matters: the UI uses the first entry as the dropdown's initial value.
preset_prompts = [
    'Please convert the document into Markdown format.',
    'Generate a clean and structured Markdown version of the document.',
    'Transform this content into Markdown with proper headings and bullet points.',
    'Convert the text to Markdown, preserving structure and formatting.',
    'Reformat this document as Markdown with clear sections and lists.',
]
36
+
37
+
38
def send_pdf_to_parse(file_path, server_ip="47.117.17.202", port=9999, route="/upload", api_key=None, timeout=60):
    """Upload a PDF to the parsing server as a multipart POST.

    Args:
        file_path: Path of the PDF on disk; its base name is sent as the
            upload's file name.
        server_ip: Host name or IP address of the server.
        port: TCP port of the server.
        route: URL path of the upload endpoint (must start with "/").
        api_key: Optional token; when given it is sent as
            ``Authorization: Bearer <api_key>``.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.post``. Pass ``None`` to wait indefinitely (the
            behaviour before this parameter existed).

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers that care should inspect it.

    Raises:
        OSError: If *file_path* cannot be opened.
        requests.RequestException: On connection failure or timeout.
    """
    url = f"http://{server_ip}:{port}{route}"
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    # The file must stay open while requests streams it, so post inside `with`.
    with open(file_path, "rb") as f:
        files = {"file": (os.path.basename(file_path), f, "application/pdf")}
        return requests.post(url, files=files, headers=headers, timeout=timeout)
48
+
49
+
50
def extract_makrdown(text):
    """Return the body of the first ```markdown fenced block in *text*.

    The model is expected to wrap its answer in a ```markdown ... ``` fence.
    Three cases are handled:

    * Closed fence: the text between the opening fence and the first
      closing fence is returned, stripped.
    * Opening fence without a closing one (a partially streamed answer, or
      an answer cut off by the token limit): everything after the opening
      fence is returned, stripped, so the raw fence never reaches the
      renderer.
    * No ```markdown fence at all: *text* is returned unchanged.
    """
    # Lazy body, terminated by the first closing fence or, failing that, by
    # the end of the string.
    m = re.search(r'```markdown\s*([\s\S]*?)(?:```|\Z)', text)
    if m is None:
        return text
    return m.group(1).strip()
56
+
57
# Connection settings for the OpenAI-compatible inference endpoint.
# NOTE(review): "EMPTY" looks like a placeholder key for a server that does
# not validate credentials — confirm before pointing this at another backend.
openai_api_base = "http://47.117.17.202:9999/v1"
openai_api_key = "EMPTY"

# Module-wide asynchronous client built from the two settings above.
client = AsyncOpenAI(base_url=openai_api_base, api_key=openai_api_key)
65
+
66
+
67
async def request(messages):
    """Stream a chat completion for *messages*, yielding text as it arrives.

    Args:
        messages: Chat messages in the OpenAI chat-completions format.

    Yields:
        Each non-empty text fragment of the model's answer, in order.

    The stream is consumed until a chunk reports a ``finish_reason``; that
    reason is printed for diagnostics. Text carried by the finishing chunk
    is yielded before the loop stops.
    """
    stream = await client.chat.completions.create(
        messages=messages,
        model="Qwen2_5VL",
        max_completion_tokens=4096,
        stream=True,
        temperature=0.0,
        top_p=0.95,
    )

    async for chunk in stream:
        # A chunk may carry no choices at all (presumably usage-only or
        # keep-alive chunks, depending on the server); indexing it would
        # raise IndexError, so skip it.
        if not chunk.choices:
            continue
        choice = chunk.choices[0]

        content = choice.delta.content
        if content:
            yield content

        if choice.finish_reason is not None:
            print(f"end reason = {choice.finish_reason}")
            break
90
+
91
+
92
def images_to_pdf(img_paths, pdf_path):
    """Combine one or more image files into a single PDF, one image per page.

    Args:
        img_paths: A single path (``str`` or ``Path``) or an iterable of
            paths, in page order.
        pdf_path: Destination of the PDF; missing parent directories are
            created.

    Returns:
        *pdf_path* as a ``Path``.

    Raises:
        ValueError: If *img_paths* is empty.
        FileNotFoundError: If any input path is not an existing file. All
            inputs are checked before any image is opened.
    """
    if isinstance(img_paths, (str, Path)):
        img_paths = [img_paths]
    if not img_paths:
        raise ValueError("img_paths is empty")

    # Validate everything first so a missing later file cannot leave earlier
    # images open.
    sources = [Path(p) for p in img_paths]
    for src in sources:
        if not src.is_file():
            raise FileNotFoundError(src)

    opened = []  # file-backed images that must be closed when we are done
    try:
        frames = []
        for src in sources:
            img = Image.open(src)
            opened.append(img)
            # RGBA and palette images are flattened to RGB before being
            # written to the PDF.
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
            frames.append(img)

        pdf_path = Path(pdf_path)
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        frames[0].save(
            pdf_path,
            save_all=True,
            append_images=frames[1:],
            resolution=300.0,
        )
    finally:
        for img in opened:
            img.close()
    return pdf_path
117
+
118
+
119
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw = handle.read()
    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")
122
+
123
def build_message(image_path, prompt):
    """Build the chat messages that ask the model to process one image.

    Args:
        image_path: Path of the image file to embed as a base64 data URL.
        prompt: Instruction text sent alongside the image.

    Returns:
        A two-element message list: a fixed system message followed by a
        user message whose content is the image and then the prompt text.
    """
    import mimetypes

    # Label the data URL with the file's real image type (PDF pages are
    # rendered to PNG). Fall back to the previously hard-coded JPEG label
    # when the extension is unknown or not an image type.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            },
        },
        {"type": "text", "text": prompt},
    ]

    return [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": content},
    ]
143
+
144
+
145
+
146
def download_markdown_file(md_text):
    """Write *md_text* to a new file under ``downloads/`` and return its path.

    Args:
        md_text: Markdown text to save. ``None`` is treated as empty text so
            that a cleared text box produces an empty file instead of a
            ``TypeError``.

    Returns:
        The path of the written file as a string, of the form
        ``downloads/markdown_<8 hex chars>.md`` (relative to the current
        working directory).
    """
    filename = f"markdown_{uuid.uuid4().hex[:8]}.md"
    filepath = Path("downloads") / filename
    filepath.parent.mkdir(parents=True, exist_ok=True)
    filepath.write_text(md_text or "", encoding="utf-8")
    return str(filepath)
153
+
154
+
155
async def doc_parser(doc_path, prompt):
    """Convert a PDF or image to Markdown, streaming the result page by page.

    A PDF is rendered to one PNG per page (300 dpi) in a temporary
    directory; any other file is sent to the model as a single image. Pages
    are processed sequentially.

    Args:
        doc_path: Path of the uploaded document.
        prompt: Instruction text sent to the model with every page.

    Yields:
        ``(markdown, raw)`` tuples. ``markdown`` is the extracted Markdown of
        all pages so far joined with ``"\\n---\\n"``; ``raw`` is the
        unprocessed model output of all pages so far joined with blank
        lines. A tuple is yielded after every streamed fragment, and once
        more at the end with the complete document.

    Raises:
        gr.Error: If no document was provided.
        FileNotFoundError: If *doc_path* is not an existing file.
    """
    if not doc_path:
        # Parse was clicked with no upload; show a readable message instead
        # of a TypeError from Path(None).
        raise gr.Error("Please upload a PDF or image before parsing.")

    doc_path = Path(doc_path)
    if not doc_path.is_file():
        raise FileNotFoundError(doc_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        if doc_path.suffix.lower() == ".pdf":
            image_paths = []
            for idx, page_image in enumerate(convert_from_path(doc_path, dpi=300), start=1):
                img_path = tmpdir / f"page_{idx}.png"
                page_image.save(img_path, "PNG")
                image_paths.append(img_path)
        else:
            image_paths = [doc_path]

        all_pages = []
        all_pages_raw = []
        for image_path in image_paths:
            page_raw = ""
            # Build each request only when it is needed, so just one page's
            # base64 payload is held at a time.
            async for chunk in request(build_message(image_path, prompt)):
                page_raw += chunk
                # Keep finished pages on screen while the current one streams.
                yield (
                    "\n---\n".join([*all_pages, extract_makrdown(page_raw)]),
                    "\n\n".join([*all_pages_raw, page_raw]),
                )
            all_pages.append(extract_makrdown(page_raw))
            all_pages_raw.append(page_raw)

        logger.debug("parsed {} page(s) from {}", len(all_pages), doc_path)
        yield "\n---\n".join(all_pages), "\n\n".join(all_pages_raw)
189
+
190
+
191
def compress_directory_to_zip(directory_path, output_zip_path):
    """Zip every file under *directory_path* into *output_zip_path*.

    Entries are stored with paths relative to *directory_path* and
    compressed with DEFLATE. If the archive itself lies inside the
    directory, it is skipped so it does not add itself.

    Args:
        directory_path: Root directory whose files are archived.
        output_zip_path: Path of the zip file to create (overwritten if it
            exists).

    Returns:
        0 on success, -1 if any exception occurred (the exception is
        logged, not raised).
    """
    try:
        archive_real = os.path.realpath(output_zip_path)
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _dirs, files in os.walk(directory_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # The archive already exists on disk by now, so the
                    # walk can list it.
                    if os.path.realpath(file_path) == archive_real:
                        continue
                    arcname = os.path.relpath(file_path, directory_path)
                    zipf.write(file_path, arcname)
        return 0
    except Exception as e:
        # Status-code contract: report failure to the caller instead of
        # raising.
        logger.exception(e)
        return -1
208
+
209
# Math delimiters handed to the Markdown renderer, as (left, right, display)
# triples expanded into the dict form the component takes. The original
# ordering is kept, with '$$' listed ahead of '$'.
latex_delimiters = [
    {'left': left, 'right': right, 'display': display}
    for left, right, display in (
        ('$$', '$$', True),
        ('$', '$', False),
        ('\\(', '\\)', False),
        ('\\[', '\\]', True),
    )
]
215
+
216
def check_prompt(prompt):
    """Return *prompt* unchanged, or raise ``gr.Error`` if it is blank.

    A prompt counts as blank when it is falsy (``None``, ``""``) or contains
    only whitespace.
    """
    has_text = bool(prompt) and prompt.strip() != ""
    if has_text:
        return prompt
    raise gr.Error("Please select or enter a prompt before parsing.")
220
+
221
def to_file(image_path):
    """Map an example thumbnail path to the document that should be loaded.

    The "Academic_Papers.png" thumbnail stands for a PDF example, so a path
    ending in that name has the name swapped for "Academic_Papers.pdf".
    Every other path is returned as is.
    """
    thumbnail_name = "Academic_Papers.png"
    if not image_path.endswith(thumbnail_name):
        return image_path
    return image_path.replace(thumbnail_name, "Academic_Papers.pdf")
227
+
228
def process_file(file_path):
    """Return the path of a PDF version of the uploaded file.

    * ``None`` (nothing uploaded) returns ``None``.
    * A PDF is uploaded to the parsing server via ``send_pdf_to_parse`` and
      its own path is returned.
    * Anything else is treated as an image and converted with
      ``images_to_pdf`` to a PDF next to it (same name, ``.pdf`` suffix).

    The extension check is case-insensitive, matching ``doc_parser``, so
    ``SCAN.PDF`` is handled as a PDF rather than sent to the image converter.

    Args:
        file_path: Path of the uploaded file, or ``None``.

    Returns:
        The PDF path as a string, or ``None``.
    """
    if file_path is None:
        return None

    if str(file_path).lower().endswith(".pdf"):
        send_pdf_to_parse(file_path)
        return str(file_path)

    pdf_path = Path(file_path).with_suffix(".pdf")
    images_to_pdf(file_path, pdf_path)
    return str(pdf_path)
241
+
242
+
243
def render_pdf_viewer(pdf_file_path):
    """Return an HTML ``<iframe>`` that points the hosted pdf.js viewer at a file.

    Args:
        pdf_file_path: Path of the file to preview, or ``None``/empty when
            the file input has been cleared.

    Returns:
        The iframe markup, or an empty string when there is nothing to show.
    """
    from urllib.parse import quote

    if not pdf_file_path:
        # The file input emits None when cleared; render nothing rather
        # than raise.
        return ""

    # Percent-encode the path so spaces, quotes and angle brackets in an
    # uploaded file name cannot break out of the src attribute.
    file_ref = quote(str(pdf_file_path), safe="/")
    return f"""
    <iframe
        src="https://mozilla.github.io/pdf.js/web/viewer.html?file=/file={file_ref}"
        width="100%" height="1000px"
        style="border: none;">
    </iframe>
    """
263
+
264
+
265
if __name__ == '__main__':
    # NOTE(review): the widget nesting below was reconstructed from a
    # flattened diff view — confirm the layout against the running app.

    # Example assets are resolved relative to this file so the app does not
    # depend on the current working directory.
    example_root = os.path.join(os.path.dirname(__file__), 'examples')
    example_names = ["Financial_Reports.png", "Books.png", "Magazines.png", "Academic_Papers.png"]
    example_labels = ["Financial Reports(IMG)", "Books(IMG)", "Magazines(IMG)", "Academic Papers(PDF)"]
    example_paths = [os.path.join(example_root, name) for name in example_names]

    with gr.Blocks() as demo:
        with gr.Row():
            # Left panel: upload, prompt selection, actions and preview.
            with gr.Column(variant='panel', scale=5):
                file = gr.File(
                    label='Please upload a PDF or image',
                    file_types=['.pdf', '.png', '.jpeg', '.jpg'],
                    type="filepath",
                )
                prompts = gr.Dropdown(
                    choices=preset_prompts,
                    label="Prompt",
                    info="Enter or select prompts...",
                    value=preset_prompts[0],
                    multiselect=False,
                    interactive=True,
                    allow_custom_value=True,
                )

                with gr.Row():
                    change_bu = gr.Button('Parse')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = gr.HTML()

            # Right panel: examples, download and the parsed output.
            with gr.Column(variant='panel', scale=5):
                with gr.Accordion("Examples", open=True):
                    with gr.Row():
                        for example_path, label in zip(example_paths, example_labels):
                            with gr.Column(scale=1, min_width=120):
                                gr.Image(
                                    value=example_path,
                                    width=120,
                                    height=90,
                                    show_label=False,
                                    show_download_button=False,
                                )
                                # Clicking an example loads it into the
                                # upload box; to_file swaps the PDF example's
                                # thumbnail for the PDF itself.
                                gr.Button(label).click(
                                    fn=to_file,
                                    inputs=gr.State(example_path),
                                    outputs=file,
                                )

                download_btn = gr.Button("⬇️ Generate download link", size="sm")
                output_file = gr.File(
                    label='Parse result',
                    interactive=False,
                    elem_id="down-file-box",
                    visible=False,
                )

                gr.HTML("""
                <style>
                #down-file-box {
                max-height: 300px;
                }
                </style>
                """)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(
                            label='Markdown rendering',
                            height=1100,
                            show_copy_button=True,
                            latex_delimiters=latex_delimiters,
                            line_breaks=True,
                        )
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)

        # Refresh the preview whenever the selected file changes.
        file.change(
            fn=render_pdf_viewer,
            inputs=file,
            outputs=pdf_show,
        )

        # Parse: validate the prompt, hide any stale download, then stream
        # the model output into both result views.
        change_bu.click(
            fn=check_prompt,
            inputs=prompts,
            outputs=prompts,
        ).then(
            lambda f: gr.update(visible=False),
            inputs=output_file,
            outputs=output_file,
        ).then(
            fn=doc_parser,
            inputs=[file, prompts],
            outputs=[md, md_text],
        )

        clear_bu.add([file, md, pdf_show, md_text])

        # Download: write the Markdown text to a file, then reveal the file
        # widget.
        download_btn.click(
            fn=download_markdown_file,
            inputs=md_text,
            outputs=output_file,
        ).then(
            lambda f: gr.update(visible=True),
            inputs=output_file,
            outputs=output_file,
        )

    demo.launch(server_name='0.0.0.0', share=True)