Vladyslav Nalyvaiko
committed on
Commit
·
ca05b65
1
Parent(s):
11551ca
Fast API update
Browse files
- app.py +10 -58
- mineru_single.py +1 -5
app.py
CHANGED
|
@@ -10,7 +10,7 @@ from typing import List, Optional
|
|
| 10 |
# Alternatively you can do this in a "startup" event handler
|
| 11 |
os.system("python download_models_hf.py")
|
| 12 |
|
| 13 |
-
from
|
| 14 |
# Or if you want single-file approach, from miner_single import to_markdown
|
| 15 |
|
| 16 |
app = FastAPI()
|
|
@@ -22,67 +22,19 @@ os.makedirs(INBOX_DIR, exist_ok=True)
|
|
| 22 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 23 |
|
| 24 |
@app.post("/process")
|
| 25 |
-
async def
|
| 26 |
-
files: List[UploadFile] = File(...),
|
| 27 |
-
background_tasks: BackgroundTasks = None,
|
| 28 |
-
num_workers: int = 2,
|
| 29 |
-
num_gpus: int = 1
|
| 30 |
-
):
|
| 31 |
-
"""
|
| 32 |
-
POST multiple PDFs via multipart/form-data.
|
| 33 |
-
We store them in ./inbox, then process in the background using parallel_processor.
|
| 34 |
-
"""
|
| 35 |
-
pdf_paths = []
|
| 36 |
-
for f in files:
|
| 37 |
-
file_path = os.path.join(INBOX_DIR, f.filename)
|
| 38 |
-
with open(file_path, "wb") as out_file:
|
| 39 |
-
shutil.copyfileobj(f.file, out_file)
|
| 40 |
-
pdf_paths.append(file_path)
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
pdf_paths,
|
| 46 |
-
OUTPUT_DIR,
|
| 47 |
-
num_workers,
|
| 48 |
-
num_gpus
|
| 49 |
-
)
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
"files_received": [f.filename for f in files],
|
| 54 |
-
"workers": num_workers,
|
| 55 |
-
"gpus": num_gpus
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
@app.get("/check_output")
|
| 59 |
-
def check_output(filename: str):
|
| 60 |
-
"""
|
| 61 |
-
Simple endpoint: provide a PDF filename, returns the final .md (if ready).
|
| 62 |
-
We assume the PDF was named e.g. 'paper.pdf'. The final output is `paper.md`
|
| 63 |
-
in `./output/paper/paper_<timestamp>.md`.
|
| 64 |
-
Because of how we rename outputs in `miner_single.py`, you may need to locate them by pattern.
|
| 65 |
-
"""
|
| 66 |
-
name_wo_ext = os.path.splitext(filename)[0]
|
| 67 |
-
# Because we appended timestamp, let's see if we can locate a .md in the folder
|
| 68 |
-
subdir = os.path.join(OUTPUT_DIR, name_wo_ext)
|
| 69 |
-
if not os.path.exists(subdir):
|
| 70 |
-
return {"status": "not_found"}
|
| 71 |
-
|
| 72 |
-
# Try to find a .md in the subdir
|
| 73 |
-
found_md = [f for f in os.listdir(subdir) if f.endswith(".md")]
|
| 74 |
-
if not found_md:
|
| 75 |
-
return {"status": "incomplete"}
|
| 76 |
-
|
| 77 |
-
# If we do find it:
|
| 78 |
-
md_path = os.path.join(subdir, found_md[0])
|
| 79 |
-
with open(md_path, "r", encoding="utf-8") as f:
|
| 80 |
-
content = f.read()
|
| 81 |
|
| 82 |
return {
|
| 83 |
-
"
|
| 84 |
-
"
|
| 85 |
-
"content":
|
| 86 |
}
|
| 87 |
|
| 88 |
# If you want to run locally or for debug:
|
|
|
|
| 10 |
# Alternatively you can do this in a "startup" event handler
|
| 11 |
os.system("python download_models_hf.py")
|
| 12 |
|
| 13 |
+
from mineru_single import to_markdown
|
| 14 |
# Or if you want single-file approach, from miner_single import to_markdown
|
| 15 |
|
| 16 |
app = FastAPI()
|
|
|
|
| 22 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 23 |
|
| 24 |
@app.post("/process")
|
| 25 |
+
async def process_pdf(file: UploadFile = File(...)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
+
file_path = os.path.join(INBOX_DIR, file.filename)
|
| 28 |
+
with open(file_path, "wb") as out_file:
|
| 29 |
+
shutil.copyfileobj(file.file, out_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
# Process the file and wait for completion
|
| 32 |
+
markdown_text = to_markdown(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
return {
|
| 35 |
+
"message": "Processing completed",
|
| 36 |
+
"code": 200,
|
| 37 |
+
"content": markdown_text
|
| 38 |
}
|
| 39 |
|
| 40 |
# If you want to run locally or for debug:
|
mineru_single.py
CHANGED
|
@@ -135,8 +135,4 @@ def to_markdown(
|
|
| 135 |
|
| 136 |
md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
|
| 137 |
|
| 138 |
-
|
| 139 |
-
with open(md_path, "w", encoding="utf-8") as fw:
|
| 140 |
-
fw.write(md_content_with_embeds)
|
| 141 |
-
|
| 142 |
-
return md_path
|
|
|
|
| 135 |
|
| 136 |
md_content_with_embeds = replace_image_with_base64(original_md_content, local_md_dir)
|
| 137 |
|
| 138 |
+
return md_content_with_embeds
|
|
|
|
|
|
|
|
|
|
|
|