Spaces:

chipling
/

paddleocr

Build error

App Files Files Community

chipling committed on Mar 1

Commit

1bcd2bb

verified ·

1 Parent(s): d03efa1

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -13

app.py CHANGED Viewed

@@ -1,49 +1,75 @@
 import os
 import shutil
 import json
 import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from paddleocr import PaddleOCRVL
-# Initialize FastAPI
 app = FastAPI(
     title="Document Ingestion API",
-    description="PaddleOCR-VL Markdown/JSON extraction"
 )
-# Load the model once globally when the server starts
-print("Initializing PaddleOCR-VL Pipeline...")
 pipeline = PaddleOCRVL()
 print("Pipeline ready!")
 @app.post("/ingest")
 async def ingest_document(file: UploadFile = File(...)):
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
-    # Use a TemporaryDirectory to handle creation and auto-cleanup safely
     with tempfile.TemporaryDirectory() as temp_dir:
         file_path = os.path.join(temp_dir, file.filename)
         try:
-            # Save the uploaded file temporarily
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
-            # Run the VLM prediction
             output = pipeline.predict(file_path)
             parsed_pages = []
             for page_num, res in enumerate(output):
                 md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
                 json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
-                # Save using the model's native methods
                 res.save_to_markdown(save_path=md_path)
                 res.save_to_json(save_path=json_path)
-                # Read the contents back to send in the HTTP response
                 with open(md_path, "r", encoding="utf-8") as f:
                     md_content = f.read()
@@ -59,14 +85,12 @@ async def ingest_document(file: UploadFile = File(...)):
             return {
                 "status": "success",
                 "filename": file.filename,
-                "total_pages": len(parsed_pages),
                 "data": parsed_pages
             }
         except Exception as e:
             raise HTTPException(status_code=500, detail=str(e))
-# Hugging Face Spaces routes health checks to the root
 @app.get("/")
 def health_check():
-    return {"status": "active", "model": "PaddleOCR-VL"}

 import os
+# --- CPU OPTIMIZATION FLAGS (Must be set before importing Paddle) ---
+# Hugging Face Free Tier has 2 vCPUs. Limiting threads prevents overhead.
+os.environ["OMP_NUM_THREADS"] = "2"
+# Enables Intel CPU math acceleration
+os.environ["FLAGS_use_mkldnn"] = "1"
 import shutil
 import json
 import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from paddleocr import PaddleOCRVL
+from PIL import Image
 app = FastAPI(
     title="Document Ingestion API",
+    description="Optimized PaddleOCR-VL extraction"
 )
+print("Initializing PaddleOCR-VL Pipeline (MKLDNN Enabled)...")
 pipeline = PaddleOCRVL()
 print("Pipeline ready!")
+def optimize_image_for_vlm(file_path, max_dimension=1200):
+    """
+    Downscales large images to reduce the number of visual tokens the VLM
+    has to process. This creates a massive speedup on CPU.
+    """
+    try:
+        # Only attempt to resize if it's a standard image (ignore PDFs)
+        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
+            with Image.open(file_path) as img:
+                # Calculate the scaling factor if the image is too large
+                if max(img.size) > max_dimension:
+                    ratio = max_dimension / max(img.size)
+                    new_size = (int(img.width * ratio), int(img.height * ratio))
+                    # Resize and overwrite the temporary file
+                    img = img.resize(new_size, Image.Resampling.LANCZOS)
+                    img.save(file_path)
+                    print(f"Image downscaled to {new_size} for faster CPU inference.")
+    except Exception as e:
+        print(f"Skipping image optimization: {e}")
 @app.post("/ingest")
 async def ingest_document(file: UploadFile = File(...)):
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
     with tempfile.TemporaryDirectory() as temp_dir:
         file_path = os.path.join(temp_dir, file.filename)
         try:
+            # 1. Save file
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
+            # 2. Optimize image (massive CPU speedup)
+            optimize_image_for_vlm(file_path)
+            # 3. Predict
             output = pipeline.predict(file_path)
             parsed_pages = []
             for page_num, res in enumerate(output):
                 md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
                 json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
                 res.save_to_markdown(save_path=md_path)
                 res.save_to_json(save_path=json_path)
                 with open(md_path, "r", encoding="utf-8") as f:
                     md_content = f.read()
             return {
                 "status": "success",
                 "filename": file.filename,
                 "data": parsed_pages
             }
         except Exception as e:
             raise HTTPException(status_code=500, detail=str(e))
 @app.get("/")
 def health_check():
+    return {"status": "active", "model": "PaddleOCR-VL (Optimized)"}