prithivMLmods committed on
Commit
b03b8b6
·
verified ·
1 Parent(s): 9dd5c39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -146
app.py CHANGED
@@ -15,15 +15,16 @@ from pathlib import Path
15
  import torch
16
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
17
  from transformers.image_utils import load_image
18
-
19
- from pdf2image import convert_from_path
20
  import html2text
21
  import markdown
22
  import tempfile
23
 
 
24
  pdf_suffixes = [".pdf"]
25
  image_suffixes = [".png", ".jpeg", ".jpg"]
26
 
 
27
  latex_delimiters_type_a = [
28
  {'left': '$$', 'right': '$$', 'display': True},
29
  {'left': '$', 'right': '$', 'display': False},
@@ -34,6 +35,7 @@ latex_delimiters_type_b = [
34
  ]
35
  latex_delimiters_type_all = latex_delimiters_type_a + latex_delimiters_type_b
36
 
 
37
  device = "cuda" if torch.cuda.is_available() else "cpu"
38
  MODEL_ID = "Logics-MLLM/Logics-Parsing"
39
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
@@ -45,6 +47,9 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
45
 
46
  @spaces.GPU
47
  def parse_page(image: Image.Image) -> str:
 
 
 
48
  messages = [
49
  {
50
  "role": "user",
@@ -80,23 +85,22 @@ def parse_page(image: Image.Image) -> str:
80
  return output_text
81
 
82
 
83
def images_bytes_to_pdf_bytes(image_bytes):
    """Convert the raw bytes of a single image into the bytes of a PDF.

    The image is decoded with Pillow, converted to RGB, and saved into an
    in-memory buffer using Pillow's PDF writer.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.

    Returns:
        The encoded PDF document as ``bytes``.
    """
    # Both the decoded image and the in-memory buffer are managed by `with`,
    # so they are released even when decoding or PDF encoding raises.
    with Image.open(BytesIO(image_bytes)) as source_image, BytesIO() as pdf_buffer:
        # Only one image is written; save_all=True is kept from the original call.
        source_image.convert("RGB").save(pdf_buffer, format="PDF", save_all=True)
        return pdf_buffer.getvalue()
97
 
98
 
99
- def read_fn(path):
 
 
 
100
  if not isinstance(path, Path):
101
  path = Path(path)
102
  with open(str(path), "rb") as input_file:
@@ -109,110 +113,63 @@ def read_fn(path):
109
  raise Exception(f"Unknown file suffix: {path.suffix}")
110
 
111
 
112
def safe_stem(file_path):
    """Return the file name without its final suffix, made filesystem-friendly.

    Every character that is neither a word character (letters, digits,
    underscore) nor a dot is replaced with an underscore.
    """
    base_name = Path(file_path).stem
    unsafe_chars = re.compile(r'[^\w.]')
    return unsafe_chars.sub('_', base_name)
116
 
117
 
118
def _scrub_filename_part(text: str) -> str:
    """Replace path-hostile characters with '_' and drop control characters."""
    text = re.sub(r'[\\/:\*\?"<>\|\s]+', '_', text)
    return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text)


def sanitize_filename(filename: str, max_prefix_len: int = 15) -> str:
    """Build a safe, collision-resistant file name from ``filename``.

    The result has the form ``<prefix>_<hash><ext>`` where ``prefix`` is the
    cleaned stem truncated to ``max_prefix_len`` characters, ``hash`` is the
    first six hex digits of the MD5 of the original ``filename``, and ``ext``
    is the lower-cased, cleaned extension (text after the last dot).

    Args:
        filename: Original file name; it also seeds the hash.
        max_prefix_len: Maximum number of stem characters kept.

    Returns:
        The sanitized name. If it would exceed 250 UTF-8 bytes, the fallback
        ``doc_<hash8>.pdf`` is returned instead.
    """
    stem, dot, raw_ext = filename.rpartition('.')
    if not dot:
        # No dot at all: rpartition puts everything in the last slot.
        stem, raw_ext = filename, ''

    # The extension is cleaned with the same rules as the stem so that
    # separators, whitespace or control characters after the last dot can
    # never reach the result (e.g. "a.b/c" must not yield a name with "/").
    clean_ext = _scrub_filename_part(raw_ext).lower()
    # An empty extension ("file.") is dropped so the name never ends in a dot.
    ext = f'.{clean_ext}' if clean_ext else ''

    prefix = _scrub_filename_part(stem)[:max_prefix_len]

    # MD5 is used only as a short, stable fingerprint, not for security.
    digest = hashlib.md5(filename.encode('utf-8')).hexdigest()

    # A leading dot can only come from a stem made of dots; strip it.
    # The name is never empty here because it always contains "_<hash>".
    safe_name = f"{prefix}_{digest[:6]}{ext}".lstrip('.')

    if len(safe_name.encode('utf-8')) > 250:
        # Fall back to a short fixed-shape name.
        safe_name = f"doc_{digest[:8]}.pdf"

    return safe_name
157
-
158
-
159
def to_pdf(file_path):
    """Write a PDF version of ``file_path`` beside it and return the new path.

    The bytes come from ``read_fn`` and the output name is
    ``<safe_stem(file_path)>.pdf`` in the same directory as the input.
    Returns ``None`` when no path is given.
    """
    if file_path is not None:
        payload = read_fn(file_path)
        target_name = f'{safe_stem(file_path)}.pdf'
        target_path = os.path.join(os.path.dirname(file_path), target_name)

        with open(target_path, 'wb') as sink:
            sink.write(payload)

        return target_path

    return None
176
 
177
 
178
def _coerce_cli_value(raw: str):
    """Convert a raw CLI token to bool, float or int, falling back to the string."""
    lowered = raw.lower()
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    # Tokens containing a dot are tried as floats, everything else as ints.
    caster = float if '.' in raw else int
    try:
        return caster(raw)
    except ValueError:
        return raw


def arg_parse(ctx: 'click.Context') -> dict:
    """Turn the extra (unknown) CLI arguments in ``ctx.args`` into keyword arguments.

    Recognised forms:
      * ``--some-name value`` gives ``{'some_name': <coerced value>}``
      * ``--some-name=value`` gives ``{'some_name': <coerced value>}``
      * ``--some-name`` with no value gives ``{'some_name': True}``

    Values are coerced with ``_coerce_cli_value``. Tokens that do not start
    with ``--`` and are not consumed as a value are ignored.

    Args:
        ctx: Object exposing the leftover argument list as ``ctx.args``.

    Returns:
        Mapping of parameter name (dashes replaced by underscores) to value.
    """
    extra_kwargs = {}
    args = list(ctx.args)
    index = 0
    while index < len(args):
        token = args[index]
        index += 1
        if not token.startswith('--'):
            continue

        name, has_inline_value, inline_value = token[2:].partition('=')
        # Only the option name is normalised; the value is left untouched.
        name = name.replace('-', '_')

        if has_inline_value:
            extra_kwargs[name] = _coerce_cli_value(inline_value)
        elif index < len(args) and not args[index].startswith('--'):
            # The following token is this option's value; consume it.
            extra_kwargs[name] = _coerce_cli_value(args[index])
            index += 1
        else:
            # Bare flag: next token is another option or the list ended.
            extra_kwargs[name] = True
    return extra_kwargs
213
-
214
-
215
- async def pdf_parse(file_path, request: gr.Request):
216
  if file_path is None:
217
  logger.warning("file_path is None")
218
  return (
@@ -223,7 +180,9 @@ async def pdf_parse(file_path, request: gr.Request):
223
  None,
224
  "Error: No file provided"
225
  )
226
- logger.info(f'file_path: {file_path}')
 
 
227
  tmp_pdf_path = to_pdf(file_path)
228
  if tmp_pdf_path is None:
229
  return (
@@ -234,25 +193,36 @@ async def pdf_parse(file_path, request: gr.Request):
234
  None,
235
  "Error: Failed to process file"
236
  )
 
237
  start_time = time.time()
238
  try:
239
- pages = convert_from_path(tmp_pdf_path, dpi=200)
 
 
240
  html_parts = []
241
  for i, page in enumerate(pages):
242
  logger.info(f"Parsing page {i+1}/{len(pages)}")
243
  html = parse_page(page)
244
  html_parts.append(f'<div class="page-{i+1}">{html}</div>')
 
245
  full_html = '\n'.join(html_parts)
246
  parsing_time = time.time() - start_time
 
 
247
  mmd = html2text.html2text(full_html)
248
  mmd_html = markdown.markdown(mmd)
249
  qwen_html = full_html
 
 
250
  with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
251
  f.write(mmd)
252
  md_path = f.name
 
253
  input_path = tmp_pdf_path
254
- cost_time = f'Queue waiting time: 0, Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'
 
255
  return mmd_html, mmd, qwen_html, md_path, input_path, cost_time
 
256
  except Exception as e:
257
  logger.error(f"Parsing failed: {e}")
258
  return (
@@ -267,26 +237,10 @@ async def pdf_parse(file_path, request: gr.Request):
267
 
268
  @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
269
  @click.pass_context
270
- @click.option(
271
- '--latex-delimiters-type',
272
- 'latex_delimiters_type',
273
- type=click.Choice(['a', 'b', 'all']),
274
- help="Set the type of LaTeX delimiters to use in Markdown rendering:"
275
- "'a' for type '$', 'b' for type '()[]', 'all' for both types.",
276
- default='all',
277
- )
278
- def main(ctx, latex_delimiters_type, **kwargs):
279
- kwargs.update(arg_parse(ctx))
280
- if latex_delimiters_type == 'a':
281
- latex_delimiters = latex_delimiters_type_a
282
- elif latex_delimiters_type == 'b':
283
- latex_delimiters = latex_delimiters_type_b
284
- elif latex_delimiters_type == 'all':
285
- latex_delimiters = latex_delimiters_type_all
286
- else:
287
- raise ValueError(f"Invalid latex delimiters type: {latex_delimiters_type}.")
288
-
289
- suffixes = pdf_suffixes + image_suffixes
290
  with gr.Blocks(head='''
291
  <meta name="data-spm" content="label" />
292
  <meta name="aplus-core" content="aplus.js" />
@@ -315,51 +269,56 @@ def main(ctx, latex_delimiters_type, **kwargs):
315
  })(window, document, 'script', 'aplus_queue');
316
  </script>
317
  ''') as demo:
 
 
318
  with gr.Row():
319
  with gr.Column(variant='panel', scale=5):
320
  with gr.Row():
321
  input_file = gr.File(label='Please upload a PDF or image (Max 20 pages for conversion)',
322
- file_types=suffixes)
323
  with gr.Row():
324
- change_bu = gr.Button('Convert')
325
  clear_bu = gr.ClearButton(value='Clear')
326
  pdf_show = PDF(label='PDF Preview', interactive=False, visible=True, height=800)
327
 
328
- example_root = os.path.join(os.getcwd(), 'parsing/examples')
329
- print(example_root)
330
- logger.info(f'example_root: {example_root}')
331
- if os.path.exists(example_root):
332
- with gr.Accordion('Examples:'):
333
- gr.Examples(
334
- examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if
335
- _.endswith(tuple(suffixes))],
336
- inputs=input_file
337
- )
338
 
339
  with gr.Column(variant='panel', scale=5):
340
- output_file = gr.File(label='Conversion Result', interactive=False)
341
- cost_time = gr.Text(label='Time Cost')
342
  with gr.Tabs():
343
- with gr.Tab('MMD Rendering'):
344
  mmd_html = gr.HTML(label='MMD Rendering')
345
- # with gr.Tab('mmd html text'):
346
- # mmd_html_text = gr.TextArea(lines=45, show_copy_button=True)
347
- with gr.Tab('MMD'):
348
- mmd = gr.TextArea(lines=45, show_copy_button=True)
349
- with gr.Tab('Qwen HTML'):
350
- raw_html = gr.TextArea(lines=45, show_copy_button=True)
351
-
352
- clear_bu.add([input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time])
353
- cna = gr.Textbox(visible=False)
354
- input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show)
 
 
355
  change_bu.click(
356
  fn=pdf_parse,
357
  inputs=[input_file],
358
  outputs=[mmd_html, mmd, raw_html, output_file, pdf_show, cost_time],
359
- concurrency_limit=15
 
360
  )
361
 
362
- demo.launch()
363
 
364
 
365
  if __name__ == '__main__':
 
15
  import torch
16
  from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
17
  from transformers.image_utils import load_image
18
+ import fitz # PyMuPDF library for PDF processing
 
19
  import html2text
20
  import markdown
21
  import tempfile
22
 
23
+ # Define supported file suffixes
24
  pdf_suffixes = [".pdf"]
25
  image_suffixes = [".png", ".jpeg", ".jpg"]
26
 
27
+ # LaTeX delimiter configurations
28
  latex_delimiters_type_a = [
29
  {'left': '$$', 'right': '$$', 'display': True},
30
  {'left': '$', 'right': '$', 'display': False},
 
35
  ]
36
  latex_delimiters_type_all = latex_delimiters_type_a + latex_delimiters_type_b
37
 
38
+ # --- Model and Processor Initialization ---
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
  MODEL_ID = "Logics-MLLM/Logics-Parsing"
41
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
47
 
48
  @spaces.GPU
49
  def parse_page(image: Image.Image) -> str:
50
+ """
51
+ Parses a single document page image using the Qwen2.5-VL model.
52
+ """
53
  messages = [
54
  {
55
  "role": "user",
 
85
  return output_text
86
 
87
 
88
def images_bytes_to_pdf_bytes(image_bytes: bytes) -> bytes:
    """Convert the raw bytes of a single image into the bytes of a PDF.

    The image is decoded with Pillow, converted to RGB, and saved into an
    in-memory buffer using Pillow's PDF writer.

    Args:
        image_bytes: Encoded image data in any format Pillow can open.

    Returns:
        The encoded PDF document.
    """
    # Both the decoded image and the in-memory buffer are managed by `with`,
    # so they are released even when decoding or PDF encoding raises.
    with Image.open(BytesIO(image_bytes)) as source_image, BytesIO() as pdf_buffer:
        source_image.convert("RGB").save(pdf_buffer, format="PDF", save_all=True)
        return pdf_buffer.getvalue()
98
 
99
 
100
+ def read_fn(path: str or Path) -> bytes:
101
+ """
102
+ Reads a file and returns its content in bytes. Converts images to PDF bytes.
103
+ """
104
  if not isinstance(path, Path):
105
  path = Path(path)
106
  with open(str(path), "rb") as input_file:
 
113
  raise Exception(f"Unknown file suffix: {path.suffix}")
114
 
115
 
116
def safe_stem(file_path: str) -> str:
    """Return the file name without its final suffix, made filesystem-friendly.

    Every character that is neither a word character (letters, digits,
    underscore) nor a dot is replaced with an underscore.
    """
    base_name = Path(file_path).stem
    unsafe_chars = re.compile(r'[^\w.]')
    return unsafe_chars.sub('_', base_name)
122
 
123
 
124
# The return annotation is a string so it needs no extra import; the previous
# `str or None` was evaluated as an expression and collapsed to plain `str`.
def to_pdf(file_path: str) -> "str | None":
    """Write a PDF version of ``file_path`` beside it and return the new path.

    The bytes come from ``read_fn`` (whose docstring states that images are
    converted to PDF bytes). The output is written to
    ``<safe_stem(file_path)>.pdf`` in the same directory as the input.

    Args:
        file_path: Path of the uploaded PDF or image, or ``None``.

    Returns:
        Path of the written PDF, or ``None`` when ``file_path`` is ``None``.
    """
    if file_path is None:
        return None

    pdf_bytes = read_fn(file_path)

    unique_filename = f'{safe_stem(file_path)}.pdf'
    tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)

    with open(tmp_file_path, 'wb') as tmp_pdf_file:
        tmp_pdf_file.write(pdf_bytes)

    return tmp_file_path
140
 
141
 
142
def convert_pdf_to_images_fitz(pdf_path: str, dpi: int = 200) -> list:
    """Render every page of a PDF to a PIL image using PyMuPDF (fitz).

    Args:
        pdf_path: Path of the PDF file to rasterise.
        dpi: Target resolution; pages are scaled by ``dpi / 72``.

    Returns:
        A list with one PIL image per page, in page order.

    Raises:
        Exception: Any error raised while opening or rendering the document
            is logged and re-raised unchanged.
    """
    logger.info(f"Converting PDF '{pdf_path}' to images with PyMuPDF.")
    try:
        # The context manager closes the document even if a page fails to
        # render; previously close() was skipped on any exception.
        with fitz.open(pdf_path) as pdf_document:
            # Zoom factor derived from the requested DPI (base is 72).
            zoom = dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)

            images = []
            for page in pdf_document:
                pix = page.get_pixmap(matrix=mat)
                # PNG bytes give Pillow a lossless encoding to decode.
                img_data = pix.tobytes("png")
                images.append(Image.open(BytesIO(img_data)))
        logger.info(f"Successfully converted {len(images)} pages.")
    except Exception as e:
        logger.error(f"Failed to convert PDF using PyMuPDF: {e}")
        raise
    return images
167
+
168
+
169
+ async def pdf_parse(file_path: str, request: gr.Request):
170
+ """
171
+ Main parsing function that orchestrates the PDF processing pipeline.
172
+ """
 
 
 
 
 
 
 
173
  if file_path is None:
174
  logger.warning("file_path is None")
175
  return (
 
180
  None,
181
  "Error: No file provided"
182
  )
183
+ logger.info(f'Processing file: {file_path}')
184
+
185
+ # Ensure file is in PDF format
186
  tmp_pdf_path = to_pdf(file_path)
187
  if tmp_pdf_path is None:
188
  return (
 
193
  None,
194
  "Error: Failed to process file"
195
  )
196
+
197
  start_time = time.time()
198
  try:
199
+ # ** FIX: Use PyMuPDF (fitz) instead of pdf2image to avoid Poppler dependency **
200
+ pages = convert_pdf_to_images_fitz(tmp_pdf_path, dpi=200)
201
+
202
  html_parts = []
203
  for i, page in enumerate(pages):
204
  logger.info(f"Parsing page {i+1}/{len(pages)}")
205
  html = parse_page(page)
206
  html_parts.append(f'<div class="page-{i+1}">{html}</div>')
207
+
208
  full_html = '\n'.join(html_parts)
209
  parsing_time = time.time() - start_time
210
+
211
+ # Convert generated HTML to Markdown
212
  mmd = html2text.html2text(full_html)
213
  mmd_html = markdown.markdown(mmd)
214
  qwen_html = full_html
215
+
216
+ # Create a temporary markdown file for download
217
  with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
218
  f.write(mmd)
219
  md_path = f.name
220
+
221
  input_path = tmp_pdf_path
222
+ cost_time = f'Parsing time: {parsing_time:.2f}s, Total time: {parsing_time:.2f}s'
223
+
224
  return mmd_html, mmd, qwen_html, md_path, input_path, cost_time
225
+
226
  except Exception as e:
227
  logger.error(f"Parsing failed: {e}")
228
  return (
 
237
 
238
  @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
239
  @click.pass_context
240
+ def main(ctx, **kwargs):
241
+ """
242
+ Sets up and launches the Gradio user interface.
243
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  with gr.Blocks(head='''
245
  <meta name="data-spm" content="label" />
246
  <meta name="aplus-core" content="aplus.js" />
 
269
  })(window, document, 'script', 'aplus_queue');
270
  </script>
271
  ''') as demo:
272
+ gr.Markdown("# 📄 Logics-Parsing Document Analysis")
273
+ gr.Markdown("Upload a PDF or image file to parse its content into structured Markdown and HTML formats.")
274
  with gr.Row():
275
  with gr.Column(variant='panel', scale=5):
276
  with gr.Row():
277
  input_file = gr.File(label='Please upload a PDF or image (Max 20 pages for conversion)',
278
+ file_types=pdf_suffixes + image_suffixes)
279
  with gr.Row():
280
+ change_bu = gr.Button('Convert', variant='primary')
281
  clear_bu = gr.ClearButton(value='Clear')
282
  pdf_show = PDF(label='PDF Preview', interactive=False, visible=True, height=800)
283
 
284
+ example_root = 'parsing/examples'
285
+ logger.info(f'Looking for examples in: {example_root}')
286
+ if os.path.exists(example_root) and os.path.isdir(example_root):
287
+ example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
288
+ if example_files:
289
+ with gr.Accordion('Examples:', open=True):
290
+ gr.Examples(
291
+ examples=example_files,
292
+ inputs=input_file
293
+ )
294
 
295
  with gr.Column(variant='panel', scale=5):
296
+ output_file = gr.File(label='Download Markdown Result', interactive=False)
297
+ cost_time = gr.Text(label='Time Cost', interactive=False)
298
  with gr.Tabs():
299
+ with gr.Tab('Markdown Rendering'):
300
  mmd_html = gr.HTML(label='MMD Rendering')
301
+ with gr.Tab('Markdown Source'):
302
+ mmd = gr.TextArea(lines=45, show_copy_button=True, label="Markdown Source")
303
+ with gr.Tab('Generated HTML'):
304
+ raw_html = gr.TextArea(lines=45, show_copy_button=True, label="Generated HTML")
305
+
306
+ # Define component list for clearing
307
+ components_to_clear = [input_file, pdf_show, mmd, raw_html, output_file, mmd_html, cost_time]
308
+
309
+ clear_bu.add(components_to_clear)
310
+
311
+ input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, show_progress="full")
312
+
313
  change_bu.click(
314
  fn=pdf_parse,
315
  inputs=[input_file],
316
  outputs=[mmd_html, mmd, raw_html, output_file, pdf_show, cost_time],
317
+ concurrency_limit=15,
318
+ show_progress="full"
319
  )
320
 
321
+ demo.launch(debug=True)
322
 
323
 
324
  if __name__ == '__main__':