sortitout / app.py
triflix's picture
Update app.py
6894202 verified
import os
import uuid
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from typing import List
import fitz
# -------------------------------------------------------------------
# FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
# -------------------------------------------------------------------
# Point both cache-location variables at directories under /app and make sure
# those directories exist. This has to run before paddleocr is imported below.
os.environ.update(
    {
        "PADDLE_HOME": "/app/paddle_home",
        "XDG_CACHE_HOME": "/app/xdg_cache",
    }
)
os.makedirs(os.environ["PADDLE_HOME"], exist_ok=True)
os.makedirs(os.environ["XDG_CACHE_HOME"], exist_ok=True)
# now safe to import paddlex/paddleocr
from paddleocr import PaddleOCR
# -------------------------------------------------------------------
# PDF → IMAGE
# -------------------------------------------------------------------
def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
if not os.path.exists(pdf_path):
raise FileNotFoundError(pdf_path)
doc = fitz.open(pdf_path)
page_count = len(doc)
limit = page_count if max_pages is None else min(max_pages, page_count)
output_paths: List[str] = []
out_dir = "/app/pdf_images"
os.makedirs(out_dir, exist_ok=True)
for i in range(limit):
page = doc.load_page(i)
pix = page.get_pixmap(dpi=220)
img_name = f"{uuid.uuid4()}.jpg"
img_path = os.path.join(out_dir, img_name)
pix.save(img_path)
output_paths.append(img_path)
return output_paths
# -------------------------------------------------------------------
# OCR ENGINE
# -------------------------------------------------------------------
# Single module-level OCR engine, built once at import time and reused by
# extract_text() for every request.
ocr_engine = PaddleOCR(
    # Recognition language and model. NOTE(review): "mr" is presumably the
    # Marathi language code, matching the Devanagari recognition model; confirm.
    lang="mr",
    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
    # All three optional pre-processing switches are passed as False.
    use_textline_orientation=False,
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
)
def extract_text(image_path: str):
    """Run the shared OCR engine on one image and flatten its output.

    Args:
        image_path: Path of the image file handed to ``ocr_engine.predict``.

    Returns:
        A list of ``{"text": ..., "confidence": ...}`` dicts, one per
        recognized text entry, in the order the engine reports them. Each
        confidence is converted to a plain ``float``.
    """
    # Every element of the prediction result carries parallel "rec_texts" and
    # "rec_scores" sequences; pair them up and flatten across all elements.
    return [
        {"text": text, "confidence": float(score)}
        for page_result in ocr_engine.predict(input=image_path)
        for text, score in zip(page_result["rec_texts"], page_result["rec_scores"])
    ]
# -------------------------------------------------------------------
# FASTAPI
# -------------------------------------------------------------------
# Directory where incoming uploads are written; created at import time so the
# endpoint can write into it straight away.
UPLOAD_DIR = "/app/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# The ASGI application object the route below is registered on.
app = FastAPI()
def _remove_quietly(paths):
    """Delete temporary files on a best-effort basis."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            # Cleanup must never mask the real outcome of the request; a file
            # that is already gone or cannot be removed is simply skipped.
            pass


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """OCR a batch of uploaded PDFs and images and return the recognized text.

    Args:
        files: Uploaded files, at most 15 per request. Supported types are
            ``.pdf``, ``.jpg``, ``.jpeg`` and ``.png`` (matched
            case-insensitively on the file name).
        max_pages: Page limit forwarded to ``pdf_to_images`` for each PDF.

    Returns:
        A JSON response of the form
        ``{"files": [{"file_id", "filename", "pages": [{"page_index", "results"}]}]}``
        where ``filename`` is the lower-cased upload name and ``file_id`` is
        ``file_<n>`` numbered from 1 in upload order.

    Raises:
        HTTPException: 400 if more than 15 files are sent, or if any file has
            a missing or unsupported file name.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    # Validate every name before doing any work, so a bad upload is rejected
    # without first writing files to disk or running OCR on its neighbours.
    # A missing filename is treated as an empty (hence unsupported) name
    # instead of crashing on ``None.lower()``.
    filenames = []
    for file in files:
        filename = (file.filename or "").lower()
        if not filename.endswith((".pdf", ".jpg", ".jpeg", ".png")):
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        filenames.append(filename)

    structured_output = {"files": []}
    for index, (file, filename) in enumerate(zip(files, filenames), start=1):
        # Safe after validation: the suffix is one of the known extensions.
        ext = filename.rsplit(".", 1)[-1]
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")

        # Everything written for this upload is tracked here and deleted in
        # the ``finally`` below, so neither the upload nor its rendered pages
        # accumulate on disk.
        created_paths = [temp_path]
        try:
            with open(temp_path, "wb") as f:
                f.write(await file.read())

            if filename.endswith(".pdf"):
                page_paths = pdf_to_images(temp_path, max_pages=max_pages)
                created_paths.extend(page_paths)
            else:
                # A plain image is reported as a single page with index 0.
                page_paths = [temp_path]

            # NOTE(review): extract_text is a synchronous call made from an
            # async handler; consider off-loading it to a worker thread if
            # request concurrency matters.
            pages = [
                {"page_index": page_idx, "results": extract_text(page_path)}
                for page_idx, page_path in enumerate(page_paths)
            ]
        finally:
            _remove_quietly(created_paths)

        structured_output["files"].append(
            {
                "file_id": f"file_{index}",
                "filename": filename,
                "pages": pages,
            }
        )
    return JSONResponse(structured_output)