Update app.py
Browse files
app.py
CHANGED
|
@@ -1,24 +1,25 @@
|
|
| 1 |
import os
|
| 2 |
import shutil
|
| 3 |
import tempfile
|
| 4 |
-
import
|
| 5 |
-
import numpy as np
|
| 6 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 7 |
|
| 8 |
# --- CPU OPTIMIZATION FLAGS ---
|
|
|
|
| 9 |
os.environ["OMP_NUM_THREADS"] = "2"
|
| 10 |
-
|
|
|
|
| 11 |
|
| 12 |
-
|
|
|
|
| 13 |
|
| 14 |
app = FastAPI(
|
| 15 |
title="Document Ingestion API",
|
| 16 |
-
description="Lightweight PP-
|
| 17 |
)
|
| 18 |
|
| 19 |
-
print("Initializing PP-
|
| 20 |
-
|
| 21 |
-
table_engine = PPStructure(show_log=False, layout=True, recovery=True)
|
| 22 |
print("Pipeline ready!")
|
| 23 |
|
| 24 |
@app.post("/ingest")
|
|
@@ -34,40 +35,34 @@ async def ingest_document(file: UploadFile = File(...)):
|
|
| 34 |
with open(file_path, "wb") as buffer:
|
| 35 |
shutil.copyfileobj(file.file, buffer)
|
| 36 |
|
| 37 |
-
# 2.
|
| 38 |
-
|
| 39 |
-
if img is None:
|
| 40 |
-
raise ValueError("Could not read image file.")
|
| 41 |
-
|
| 42 |
-
# 3. Predict layout and extract
|
| 43 |
-
result = table_engine(img)
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"content": content
|
| 65 |
})
|
| 66 |
-
|
| 67 |
return {
|
| 68 |
"status": "success",
|
| 69 |
"filename": file.filename,
|
| 70 |
-
"data":
|
| 71 |
}
|
| 72 |
|
| 73 |
except Exception as e:
|
|
|
|
| 1 |
import os
|
| 2 |
import shutil
|
| 3 |
import tempfile
|
| 4 |
+
import json
|
|
|
|
| 5 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 6 |
|
| 7 |
# --- CPU OPTIMIZATION FLAGS ---
|
| 8 |
+
# Limit threads for HF Free Tier (2 vCPUs)
|
| 9 |
os.environ["OMP_NUM_THREADS"] = "2"
|
| 10 |
+
# Enable MKL-DNN (oneDNN) CPU kernels; intended to speed up PP-StructureV3 inference on Intel CPUs
|
| 11 |
+
os.environ["FLAGS_use_mkldnn"] = "1"
|
| 12 |
|
| 13 |
+
# IMPORT THE NEW V3 PIPELINE
|
| 14 |
+
from paddleocr import PPStructureV3
|
| 15 |
|
| 16 |
app = FastAPI(
|
| 17 |
title="Document Ingestion API",
|
| 18 |
+
description="Lightweight PP-StructureV3 extraction"
|
| 19 |
)
|
| 20 |
|
| 21 |
+
print("Initializing PP-StructureV3 (MKLDNN Enabled)...")
|
| 22 |
+
pipeline = PPStructureV3()
|
|
|
|
| 23 |
print("Pipeline ready!")
|
| 24 |
|
| 25 |
@app.post("/ingest")
|
|
|
|
| 35 |
with open(file_path, "wb") as buffer:
|
| 36 |
shutil.copyfileobj(file.file, buffer)
|
| 37 |
|
| 38 |
+
# 2. Run the pipeline on the saved file (predict() is given the file path directly, so no cv2 image loading is needed)
|
| 39 |
+
output = pipeline.predict(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
parsed_pages = []
|
| 42 |
+
for page_num, res in enumerate(output):
|
| 43 |
+
# Save each page's Markdown and JSON output into the temp directory (same approach as the VLM pipeline)
|
| 44 |
+
md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
|
| 45 |
+
json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
|
| 46 |
|
| 47 |
+
res.save_to_markdown(save_path=md_path)
|
| 48 |
+
res.save_to_json(save_path=json_path)
|
| 49 |
+
|
| 50 |
+
with open(md_path, "r", encoding="utf-8") as f:
|
| 51 |
+
md_content = f.read()
|
| 52 |
+
|
| 53 |
+
with open(json_path, "r", encoding="utf-8") as f:
|
| 54 |
+
json_content = json.load(f)
|
| 55 |
+
|
| 56 |
+
parsed_pages.append({
|
| 57 |
+
"page": page_num + 1,
|
| 58 |
+
"markdown": md_content,
|
| 59 |
+
"json_data": json_content
|
|
|
|
| 60 |
})
|
| 61 |
+
|
| 62 |
return {
|
| 63 |
"status": "success",
|
| 64 |
"filename": file.filename,
|
| 65 |
+
"data": parsed_pages
|
| 66 |
}
|
| 67 |
|
| 68 |
except Exception as e:
|