Spaces:

chipling
/

paddleocr

Build error

App Files Files Community

chipling committed on Mar 1

Commit

d03efa1

verified ·

1 Parent(s): e24fdcc

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -43

app.py CHANGED Viewed

@@ -1,11 +1,15 @@
 import os
 import shutil
 import json
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from paddleocr import PaddleOCRVL
 # Initialize FastAPI
-app = FastAPI(title="Document Ingestion API", description="PaddleOCR-VL Markdown/JSON extraction")
 # Load the model once globally when the server starts
 print("Initializing PaddleOCR-VL Pipeline...")
@@ -17,54 +21,50 @@ async def ingest_document(file: UploadFile = File(...)):
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
-    temp_dir = "temp_workspace"
-    os.makedirs(temp_dir, exist_ok=True)
-    file_path = os.path.join(temp_dir, file.filename)
-    try:
-        # Save the uploaded file temporarily
-        with open(file_path, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Run the VLM prediction
-        output = pipeline.predict(file_path)
-        parsed_pages = []
-        for page_num, res in enumerate(output):
-            md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
-            json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
-            # Save using the model's native methods
-            res.save_to_markdown(save_path=md_path)
-            res.save_to_json(save_path=json_path)
-            # Read the contents back to send in the HTTP response
-            with open(md_path, "r", encoding="utf-8") as f:
-                md_content = f.read()
-            with open(json_path, "r", encoding="utf-8") as f:
-                json_content = json.load(f)
-            parsed_pages.append({
-                "page": page_num + 1,
-                "markdown": md_content,
-                "json_data": json_content
-            })
-        return {
-            "status": "success",
-            "filename": file.filename,
-            "total_pages": len(parsed_pages),
-            "data": parsed_pages
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-    finally:
-        # Clean up the temporary files so the server doesn't run out of storage
-        shutil.rmtree(temp_dir, ignore_errors=True)
 # Hugging Face Spaces routes health checks to the root
 @app.get("/")

 import os
 import shutil
 import json
+import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException
 from paddleocr import PaddleOCRVL
 # Initialize FastAPI
+app = FastAPI(
+    title="Document Ingestion API",
+    description="PaddleOCR-VL Markdown/JSON extraction"
+)
 # Load the model once globally when the server starts
 print("Initializing PaddleOCR-VL Pipeline...")
     if not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
+    # Use a TemporaryDirectory to handle creation and auto-cleanup safely
+    with tempfile.TemporaryDirectory() as temp_dir:
+        file_path = os.path.join(temp_dir, file.filename)
+        try:
+            # Save the uploaded file temporarily
+            with open(file_path, "wb") as buffer:
+                shutil.copyfileobj(file.file, buffer)
+            # Run the VLM prediction
+            output = pipeline.predict(file_path)
+            parsed_pages = []
+            for page_num, res in enumerate(output):
+                md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
+                json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
+                # Save using the model's native methods
+                res.save_to_markdown(save_path=md_path)
+                res.save_to_json(save_path=json_path)
+                # Read the contents back to send in the HTTP response
+                with open(md_path, "r", encoding="utf-8") as f:
+                    md_content = f.read()
+                with open(json_path, "r", encoding="utf-8") as f:
+                    json_content = json.load(f)
+                parsed_pages.append({
+                    "page": page_num + 1,
+                    "markdown": md_content,
+                    "json_data": json_content
+                })
+            return {
+                "status": "success",
+                "filename": file.filename,
+                "total_pages": len(parsed_pages),
+                "data": parsed_pages
+            }
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))
 # Hugging Face Spaces routes health checks to the root
 @app.get("/")