AloneDancer committed on
Commit
8c80b4b
·
verified ·
1 Parent(s): 06ae99f

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +2 -0
  2. app.py +65 -0
  3. assets/Tesa.png +3 -0
  4. assets/tapes 5782_EN.pdf +3 -0
  5. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/tapes[[:space:]]5782_EN.pdf filter=lfs diff=lfs merge=lfs -text
37
+ assets/Tesa.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
4
+ from magic_pdf.pipe.UNIPipe import UNIPipe
5
+ from magic_pdf.tools.common import parse_pdf_by_unipipe
6
+
7
def convert_pdf(pdf_file, enable_formula, enable_table, max_pages=100):
    """Convert an uploaded PDF to Markdown and save the result under ``output/``.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. Either
            an object exposing the local path as ``.name`` or a plain path
            string is accepted. ``None`` (nothing uploaded) is reported as an
            error.
        enable_formula: Stored under ``"enable_formula_recognition"`` in the
            options dict passed to ``UNIPipe``.
        enable_table: Stored under ``"enable_table_recognition"`` in the
            options dict passed to ``UNIPipe``.
        max_pages: Forwarded to ``parse_pdf_by_unipipe`` as ``max_pages``.

    Returns:
        ``(markdown_text, output_path)`` on success, or
        ``("Error: <reason>", None)`` when no file was given or any step
        fails. Errors are returned rather than raised so the UI can display
        them in the Markdown output component.
    """
    # Nothing uploaded yet: say so plainly instead of leaking an
    # AttributeError message into the UI.
    if pdf_file is None:
        return "Error: no PDF file was provided", None

    try:
        # Resolve the path of the uploaded file; fall back to the value
        # itself when it is already a plain path string.
        file_path = getattr(pdf_file, "name", pdf_file)
        parent_path = os.path.dirname(file_path)

        # Set up the MinerU pipeline.
        image_writer = DiskReaderWriter(parent_path)
        jso_useful_key = {
            "model_list": [],
            "enable_formula_recognition": enable_formula,
            "enable_table_recognition": enable_table,
        }
        pipe = UNIPipe(file_path, jso_useful_key, image_writer=image_writer)

        # Run the PDF analysis.
        pipe.pipe_classify()
        pipe.pipe_parse()

        # Convert to Markdown.
        # NOTE(review): the attribute name below contains non-ASCII characters
        # ("ユニ") and looks garbled; it is kept verbatim because the intended
        # UNIPipe method cannot be determined from this file. Confirm against
        # the MinerU API before relying on this call.
        md_content = parse_pdf_by_unipipe(file_path, pipe.pipe_mkユニ, max_pages=max_pages)

        # Persist the Markdown so it can be offered as a download.
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, f"{os.path.basename(file_path)}.md")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(md_content)

        return md_content, output_file
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error: {str(e)}", None
39
+
40
# Build the Gradio interface.
with gr.Blocks(title="MinerU PDF to Markdown Converter") as iface:
    gr.Markdown("# MinerU PDF to Markdown Converter")
    gr.Markdown("Upload a PDF file, configure options, and convert to Markdown.")

    with gr.Row():
        # Left column: the upload widget and the conversion options.
        with gr.Column():
            pdf_input = gr.File(
                label="Upload PDF File",
                file_types=[".pdf"],
            )
            formula_checkbox = gr.Checkbox(
                label="Enable formula recognition",
                value=False,
            )
            table_checkbox = gr.Checkbox(
                label="Enable table recognition",
                value=False,
            )
            max_pages = gr.Number(
                label="Max Pages to Process",
                value=100,
                precision=0,
            )
            convert_button = gr.Button("Convert to Markdown")

        # Right column: the rendered Markdown and the downloadable file.
        with gr.Column():
            markdown_output = gr.Markdown(label="Markdown Output")
            download_button = gr.File(label="Download Markdown File")

    # Wire the button to the converter; the input order matches the
    # positional parameters of convert_pdf.
    conversion_inputs = [pdf_input, formula_checkbox, table_checkbox, max_pages]
    conversion_outputs = [markdown_output, download_button]
    convert_button.click(
        fn=convert_pdf,
        inputs=conversion_inputs,
        outputs=conversion_outputs,
    )

# Start the Gradio app.
iface.launch()
assets/Tesa.png ADDED

Git LFS Details

  • SHA256: 6dd12bd84b58ba94a4afb02f8fdaec1a9135401afa7932f4cd57dc28502c6fb7
  • Pointer size: 132 Bytes
  • Size of remote file: 1.25 MB
assets/tapes 5782_EN.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31b46992626dd2500a7ffdc1157ed6906b29ccad669fb3c59c568d27cdc4bc54
3
+ size 321979
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ mineru[core]
2
+ gradio==4.44.0
3
+ torch==2.4.1
4
+ numpy==1.26.4
5
+ PyMuPDF==1.24.10