Spaces:

chipling
/

paddleocr

Build error

App Files Files Community

chipling committed on Mar 1

Commit

3865d02

verified ·

1 Parent(s): 277f5b9

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -36

app.py CHANGED Viewed

@@ -1,24 +1,25 @@
 import os
 import shutil
 import tempfile
-import cv2
-import numpy as np
 from fastapi import FastAPI, UploadFile, File, HTTPException
 # --- CPU OPTIMIZATION FLAGS ---
 os.environ["OMP_NUM_THREADS"] = "2"
-os.environ["FLAGS_use_mkldnn"] = "1" # Back on! Works perfectly with PP-Structure
-from paddleocr import PPStructure
 app = FastAPI(
     title="Document Ingestion API",
-    description="Lightweight PP-StructureV2 extraction"
 )
-print("Initializing PP-Structure (MKLDNN Enabled)...")
-# recovery=True helps fix layout issues, layout=True enables section detection
-table_engine = PPStructure(show_log=False, layout=True, recovery=True)
 print("Pipeline ready!")
 @app.post("/ingest")
@@ -34,40 +35,34 @@ async def ingest_document(file: UploadFile = File(...)):
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
-            # 2. Read image for PP-Structure (expects cv2 numpy array)
-            img = cv2.imread(file_path)
-            if img is None:
-                raise ValueError("Could not read image file.")
-            # 3. Predict layout and extract
-            result = table_engine(img)
-            # 4. Format the output cleanly
-            structured_data = []
-            for region in result:
-                # region is a dict with keys like: 'type', 'bbox', 'res'
-                block_type = region.get('type') # e.g., 'text', 'title', 'table', 'figure'
-                content = ""
-                if block_type == 'table':
-                    content = region.get('res', {}).get('html', '')
-                elif block_type in ['text', 'title', 'list']:
-                    # Extract joined text lines
-                    lines = region.get('res', [])
-                    content = "\n".join([line.get('text', '') for line in lines if 'text' in line])
-                elif block_type == 'figure':
-                    content = "[Image cropped by layout engine]"
-                structured_data.append({
-                    "type": block_type,
-                    "bounding_box": region.get('bbox'),
-                    "content": content
                 })
             return {
                 "status": "success",
                 "filename": file.filename,
-                "data": structured_data
             }
         except Exception as e:

 import os
 import shutil
 import tempfile
+import json
 from fastapi import FastAPI, UploadFile, File, HTTPException
 # --- CPU OPTIMIZATION FLAGS ---
+# Limit threads for HF Free Tier (2 vCPUs)
 os.environ["OMP_NUM_THREADS"] = "2"
+# MKLDNN works perfectly with PPStructureV3 for a massive Intel speedup
+os.environ["FLAGS_use_mkldnn"] = "1"
+# IMPORT THE NEW V3 PIPELINE
+from paddleocr import PPStructureV3
 app = FastAPI(
     title="Document Ingestion API",
+    description="Lightweight PP-StructureV3 extraction"
 )
+print("Initializing PP-StructureV3 (MKLDNN Enabled)...")
+pipeline = PPStructureV3()
 print("Pipeline ready!")
 @app.post("/ingest")
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
+            # 2. Predict (takes the file path directly, no cv2 needed!)
+            output = pipeline.predict(file_path)
+            parsed_pages = []
+            for page_num, res in enumerate(output):
+                # We can save to the temp directory exactly like the VLM pipeline
+                md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
+                json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
+                res.save_to_markdown(save_path=md_path)
+                res.save_to_json(save_path=json_path)
+                with open(md_path, "r", encoding="utf-8") as f:
+                    md_content = f.read()
+                with open(json_path, "r", encoding="utf-8") as f:
+                    json_content = json.load(f)
+                parsed_pages.append({
+                    "page": page_num + 1,
+                    "markdown": md_content,
+                    "json_data": json_content
                 })
             return {
                 "status": "success",
                 "filename": file.filename,
+                "data": parsed_pages
             }
         except Exception as e: