Spaces:

chipling
/

paddleocr

Build error

App Files Community

chipling committed on Mar 1

Commit

277f5b9

verified ·

1 Parent(s): ea88869

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -65

app.py CHANGED Viewed

@@ -1,50 +1,26 @@
 import os
-# --- CPU OPTIMIZATION FLAGS (Must be set before importing Paddle) ---
-# Hugging Face Free Tier has 2 vCPUs. Limiting threads prevents overhead.
-os.environ["OMP_NUM_THREADS"] = "2"
-# DISABLE MKLDNN: PaddleOCR-VL transformer ops are currently incompatible
-# with the oneDNN instruction converter in Paddle's new executor.
-os.environ["FLAGS_use_mkldnn"] = "0"
 import shutil
-import json
 import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException
-from paddleocr import PaddleOCRVL
-from PIL import Image
 app = FastAPI(
     title="Document Ingestion API",
-    description="Optimized PaddleOCR-VL extraction"
 )
-print("Initializing PaddleOCR-VL Pipeline (MKLDNN Disabled)...")
-pipeline = PaddleOCRVL()
 print("Pipeline ready!")
-def optimize_image_for_vlm(file_path, max_dimension=1200):
-    """
-    Downscales large images to reduce the number of visual tokens the VLM
-    has to process. This creates a massive speedup on CPU.
-    """
-    try:
-        # Only attempt to resize if it's a standard image (ignore PDFs)
-        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
-            with Image.open(file_path) as img:
-                # Calculate the scaling factor if the image is too large
-                if max(img.size) > max_dimension:
-                    ratio = max_dimension / max(img.size)
-                    new_size = (int(img.width * ratio), int(img.height * ratio))
-                    # Resize and overwrite the temporary file
-                    img = img.resize(new_size, Image.Resampling.LANCZOS)
-                    img.save(file_path)
-                    print(f"Image downscaled to {new_size} for faster CPU inference.")
-    except Exception as e:
-        print(f"Skipping image optimization: {e}")
 @app.post("/ingest")
 async def ingest_document(file: UploadFile = File(...)):
     if not file.filename:
@@ -58,41 +34,41 @@ async def ingest_document(file: UploadFile = File(...)):
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
-            # 2. Optimize image (massive CPU speedup)
-            optimize_image_for_vlm(file_path)
-            # 3. Predict
-            output = pipeline.predict(file_path)
-            parsed_pages = []
-            for page_num, res in enumerate(output):
-                md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
-                json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
-                res.save_to_markdown(save_path=md_path)
-                res.save_to_json(save_path=json_path)
-                with open(md_path, "r", encoding="utf-8") as f:
-                    md_content = f.read()
-                with open(json_path, "r", encoding="utf-8") as f:
-                    json_content = json.load(f)
-                parsed_pages.append({
-                    "page": page_num + 1,
-                    "markdown": md_content,
-                    "json_data": json_content
                 })
             return {
                 "status": "success",
                 "filename": file.filename,
-                "data": parsed_pages
             }
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=str(e))
-@app.get("/")
-def health_check():
-    return {"status": "active", "model": "PaddleOCR-VL (Optimized)"}

 import os
 import shutil
 import tempfile
+import cv2
+import numpy as np
 from fastapi import FastAPI, UploadFile, File, HTTPException
+# --- CPU OPTIMIZATION FLAGS ---
+os.environ["OMP_NUM_THREADS"] = "2"
+os.environ["FLAGS_use_mkldnn"] = "1" # Back on! Works perfectly with PP-Structure
+from paddleocr import PPStructure
 app = FastAPI(
     title="Document Ingestion API",
+    description="Lightweight PP-StructureV2 extraction"
 )
+print("Initializing PP-Structure (MKLDNN Enabled)...")
+# layout=True turns on layout analysis (region detection); recovery=True turns on layout recovery
+table_engine = PPStructure(show_log=False, layout=True, recovery=True)
 print("Pipeline ready!")
 @app.post("/ingest")
 async def ingest_document(file: UploadFile = File(...)):
     if not file.filename:
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
+            # 2. Read image for PP-Structure (expects cv2 numpy array)
+            img = cv2.imread(file_path)
+            if img is None:
+                raise ValueError("Could not read image file.")
+            # 3. Predict layout and extract
+            result = table_engine(img)
+            # 4. Format the output cleanly
+            structured_data = []
+            for region in result:
+                # region is a dict with keys like: 'type', 'bbox', 'res'
+                block_type = region.get('type') # e.g., 'text', 'title', 'table', 'figure'
+                content = ""
+                if block_type == 'table':
+                    content = region.get('res', {}).get('html', '')
+                elif block_type in ['text', 'title', 'list']:
+                    # Extract joined text lines
+                    lines = region.get('res', [])
+                    content = "\n".join([line.get('text', '') for line in lines if 'text' in line])
+                elif block_type == 'figure':
+                    content = "[Image cropped by layout engine]"
+                structured_data.append({
+                    "type": block_type,
+                    "bounding_box": region.get('bbox'),
+                    "content": content
                 })
             return {
                 "status": "success",
                 "filename": file.filename,
+                "data": structured_data
             }
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))