Update main.py
Browse files
main.py
CHANGED
|
@@ -6,11 +6,12 @@ from pathlib import Path
|
|
| 6 |
from typing import List, Union, Optional, Dict, Any
|
| 7 |
|
| 8 |
from fastapi import FastAPI, File, UploadFile, HTTPException, status
|
| 9 |
-
from pydantic import BaseModel, Field
|
| 10 |
-
from PIL import Image # For type hinting
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
from
|
|
|
|
| 14 |
MagicPDFProcessor,
|
| 15 |
MDRStructuredBlock,
|
| 16 |
MDRTextBlock,
|
|
@@ -163,7 +164,7 @@ class MDRFigureBlockModel(MDRBasicBlockModel):
|
|
| 163 |
)
|
| 164 |
|
| 165 |
# Union type for the response model
|
| 166 |
-
|
| 167 |
|
| 168 |
# --- FastAPI App ---
|
| 169 |
app = FastAPI(
|
|
@@ -187,7 +188,7 @@ async def health_check():
|
|
| 187 |
return {"status": "ok", "message": "MagicPDFProcessor is running."}
|
| 188 |
|
| 189 |
@app.post("/process-pdf/",
|
| 190 |
-
response_model=List[
|
| 191 |
summary="Process a PDF file",
|
| 192 |
description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
|
| 193 |
async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
|
|
@@ -201,6 +202,7 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
|
|
| 201 |
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
|
| 202 |
|
| 203 |
# Save uploaded file temporarily
|
|
|
|
| 204 |
try:
|
| 205 |
# Create a temporary directory if it doesn't exist
|
| 206 |
temp_dir = Path("./temp_uploads")
|
|
@@ -215,7 +217,7 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
|
|
| 215 |
except Exception as e:
|
| 216 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
|
| 217 |
|
| 218 |
-
extracted_blocks_api: List[
|
| 219 |
start_process_time = time.time()
|
| 220 |
|
| 221 |
try:
|
|
@@ -246,11 +248,12 @@ async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF
|
|
| 246 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
|
| 247 |
finally:
|
| 248 |
# Clean up the temporary file
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
|
|
|
| 254 |
|
| 255 |
return extracted_blocks_api
|
| 256 |
|
|
@@ -261,12 +264,4 @@ async def read_root():
|
|
| 261 |
"message": "Welcome to the MagicDataReadiness PDF Processor API!",
|
| 262 |
"docs_url": "/docs",
|
| 263 |
"health_url": "/health"
|
| 264 |
-
}
|
| 265 |
-
|
| 266 |
-
# --- Run with Uvicorn (for local testing) ---
|
| 267 |
-
# This part is usually not included when deploying with Docker,
|
| 268 |
-
# as Docker CMD handles running uvicorn.
|
| 269 |
-
# if __name__ == "__main__":
|
| 270 |
-
# import uvicorn
|
| 271 |
-
# print("Starting Uvicorn server locally...")
|
| 272 |
-
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 6 |
from typing import List, Union, Optional, Dict, Any
|
| 7 |
|
| 8 |
from fastapi import FastAPI, File, UploadFile, HTTPException, status
|
| 9 |
+
from pydantic import BaseModel, Field # Removed field_validator as it wasn't used
|
| 10 |
+
from PIL import Image # For type hinting
|
| 11 |
|
| 12 |
+
# --- CORRECTED IMPORT ---
|
| 13 |
+
# Import directly from the monolithic script, using its file name as the module name
|
| 14 |
+
from magic_pdf_processor import (
|
| 15 |
MagicPDFProcessor,
|
| 16 |
MDRStructuredBlock,
|
| 17 |
MDRTextBlock,
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
# Union type for the response model
|
| 167 |
+
MDRStructuredBlockModelAPI = Union[MDRTextBlockModel, MDRTableBlockModel, MDRFormulaBlockModel, MDRFigureBlockModel] # Renamed API Union type
|
| 168 |
|
| 169 |
# --- FastAPI App ---
|
| 170 |
app = FastAPI(
|
|
|
|
| 188 |
return {"status": "ok", "message": "MagicPDFProcessor is running."}
|
| 189 |
|
| 190 |
@app.post("/process-pdf/",
|
| 191 |
+
response_model=List[MDRStructuredBlockModelAPI], # Use the Union type
|
| 192 |
summary="Process a PDF file",
|
| 193 |
description="Upload a PDF file to extract structured blocks (text, tables, figures, formulas).")
|
| 194 |
async def process_pdf_endpoint(file: UploadFile = File(..., description="The PDF file to process.")):
|
|
|
|
| 202 |
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid file type. Please upload a PDF.")
|
| 203 |
|
| 204 |
# Save uploaded file temporarily
|
| 205 |
+
temp_pdf_path = "" # Initialize path
|
| 206 |
try:
|
| 207 |
# Create a temporary directory if it doesn't exist
|
| 208 |
temp_dir = Path("./temp_uploads")
|
|
|
|
| 217 |
except Exception as e:
|
| 218 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to save uploaded file: {e}")
|
| 219 |
|
| 220 |
+
extracted_blocks_api: List[MDRStructuredBlockModelAPI] = []
|
| 221 |
start_process_time = time.time()
|
| 222 |
|
| 223 |
try:
|
|
|
|
| 248 |
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"An error occurred during PDF processing: {e}")
|
| 249 |
finally:
|
| 250 |
# Clean up the temporary file
|
| 251 |
+
if temp_pdf_path and os.path.exists(temp_pdf_path):
|
| 252 |
+
try:
|
| 253 |
+
os.remove(temp_pdf_path)
|
| 254 |
+
print(f"Cleaned up temporary file: {temp_pdf_path}")
|
| 255 |
+
except OSError as e:
|
| 256 |
+
print(f"Warning: Could not remove temporary file {temp_pdf_path}: {e}")
|
| 257 |
|
| 258 |
return extracted_blocks_api
|
| 259 |
|
|
|
|
| 264 |
"message": "Welcome to the MagicDataReadiness PDF Processor API!",
|
| 265 |
"docs_url": "/docs",
|
| 266 |
"health_url": "/health"
|
| 267 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|