Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,17 +8,8 @@ from pathlib import Path
|
|
| 8 |
import re
|
| 9 |
|
| 10 |
# os.system('pip install -U magic-pdf==0.8.1')
|
| 11 |
-
# os.system('pip install -U magic_pdf-0.9.0a9-py3-none-any.whl')
|
| 12 |
os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
|
| 13 |
|
| 14 |
-
# from huggingface_hub import snapshot_download
|
| 15 |
-
# model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
|
| 16 |
-
# layoutreader_model_dir = snapshot_download('hantian/layoutreader')
|
| 17 |
-
|
| 18 |
-
# os.system('cp magic-pdf.template.json ~/magic-pdf.json')
|
| 19 |
-
# os.system(f"sed -i 's|/tmp/models|{model_dir}/models|g' /home/user/magic-pdf.json")
|
| 20 |
-
# os.system(f"sed -i 's|/tmp/layoutreader|{layoutreader_model_dir}|g' /home/user/magic-pdf.json")
|
| 21 |
-
|
| 22 |
os.system('wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py')
|
| 23 |
os.system('python download_models_hf.py')
|
| 24 |
os.system("sed -i 's|cpu|cuda|g' /home/user/magic-pdf.json")
|
|
@@ -36,8 +27,6 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
|
| 36 |
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
| 37 |
from magic_pdf.tools.common import do_parse, prepare_env
|
| 38 |
|
| 39 |
-
# import spaces
|
| 40 |
-
|
| 41 |
|
| 42 |
def read_fn(path):
|
| 43 |
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
|
@@ -131,28 +120,13 @@ def to_markdown(file_path, end_pages, ocr):
|
|
| 131 |
# 返回转换后的PDF路径
|
| 132 |
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
|
| 133 |
|
| 134 |
-
# return md_content, txt_content, archive_zip_path, show_pdf(new_pdf_path)
|
| 135 |
return md_content, txt_content, archive_zip_path, new_pdf_path
|
| 136 |
|
| 137 |
|
| 138 |
-
# def show_pdf(file_path):
|
| 139 |
-
# with open(file_path, "rb") as f:
|
| 140 |
-
# base64_pdf = base64.b64encode(f.read()).decode('utf-8')
|
| 141 |
-
# pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
|
| 142 |
-
# f'width="100%" height="1000" type="application/pdf">'
|
| 143 |
-
# return pdf_display
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
def show_pdf(file):
|
| 147 |
-
return file
|
| 148 |
-
|
| 149 |
-
|
| 150 |
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
|
| 151 |
{"left": '$', "right": '$', "display": False}]
|
| 152 |
|
| 153 |
|
| 154 |
-
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
|
| 155 |
-
|
| 156 |
def init_model():
|
| 157 |
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
|
| 158 |
try:
|
|
@@ -205,7 +179,6 @@ if __name__ == "__main__":
|
|
| 205 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
| 206 |
with gr.Tab("Markdown text"):
|
| 207 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 208 |
-
# file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
|
| 209 |
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
|
| 210 |
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
| 211 |
|
|
|
|
| 8 |
import re
|
| 9 |
|
| 10 |
# os.system('pip install -U magic-pdf==0.8.1')
|
|
|
|
| 11 |
os.system('pip install git+https://github.com/opendatalab/MinerU.git@dev')
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
os.system('wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py')
|
| 14 |
os.system('python download_models_hf.py')
|
| 15 |
os.system("sed -i 's|cpu|cuda|g' /home/user/magic-pdf.json")
|
|
|
|
| 27 |
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
| 28 |
from magic_pdf.tools.common import do_parse, prepare_env
|
| 29 |
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def read_fn(path):
|
| 32 |
disk_rw = DiskReaderWriter(os.path.dirname(path))
|
|
|
|
| 120 |
# 返回转换后的PDF路径
|
| 121 |
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
|
| 122 |
|
|
|
|
| 123 |
return md_content, txt_content, archive_zip_path, new_pdf_path
|
| 124 |
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
|
| 127 |
{"left": '$', "right": '$', "display": False}]
|
| 128 |
|
| 129 |
|
|
|
|
|
|
|
| 130 |
def init_model():
|
| 131 |
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
|
| 132 |
try:
|
|
|
|
| 179 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
| 180 |
with gr.Tab("Markdown text"):
|
| 181 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
|
|
|
| 182 |
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr], outputs=[md, md_text, output_file, pdf_show])
|
| 183 |
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
| 184 |
|