Spaces:

triflix
/

sortitout

Sleeping

File size: 4,098 Bytes

4cfe185
76565ed
c08cd96
a09fca3
c08cd96
 
 
 
 
 
 
 
 
 
 
 
bfb796a
76565ed
c08cd96
 
 
 
 
 
76565ed
 
c08cd96
4cfe185
c08cd96
 
 
 
 
4cfe185
76565ed
 
c08cd96
76565ed
c08cd96
76565ed
c08cd96
4cfe185
c08cd96
4cfe185
76565ed
c08cd96
 
 
 
 
 
 
 
 
 
76565ed
 
c08cd96
 
 
 
 
 
 
 
 
76565ed
a09fca3
c08cd96
 
 
 
 
 
76565ed
 
c08cd96
 
 
 
bfb796a
6894202
76565ed
6894202
c08cd96
 
76565ed
c08cd96
 
76565ed
c08cd96
 
bfb796a
6894202
 
 
 
 
 
 
c08cd96
6894202
c08cd96
 
76565ed
6894202
 
 
 
 
 
 
 
 
 
 
c08cd96
6894202
 
 
 
 
 
76565ed
c08cd96
 
76565ed
6894202

import os
import uuid
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from typing import List
import fitz

# -------------------------------------------------------------------
# POINT PADDLEX / PADDLEOCR CACHE DIRECTORIES AT WRITABLE LOCATIONS
# -------------------------------------------------------------------
# Create both directories, then export the two variables in a single call.
# This runs ahead of the paddleocr import further down the file.
os.makedirs("/app/paddle_home", exist_ok=True)
os.makedirs("/app/xdg_cache", exist_ok=True)
os.environ.update(
    PADDLE_HOME="/app/paddle_home",
    XDG_CACHE_HOME="/app/xdg_cache",
)

# now safe to import paddlex/paddleocr
from paddleocr import PaddleOCR

# -------------------------------------------------------------------
# PDF → IMAGE
# -------------------------------------------------------------------
def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(pdf_path)

    doc = fitz.open(pdf_path)
    page_count = len(doc)

    limit = page_count if max_pages is None else min(max_pages, page_count)
    output_paths: List[str] = []

    out_dir = "/app/pdf_images"
    os.makedirs(out_dir, exist_ok=True)

    for i in range(limit):
        page = doc.load_page(i)
        pix = page.get_pixmap(dpi=220)
        img_name = f"{uuid.uuid4()}.jpg"
        img_path = os.path.join(out_dir, img_name)
        pix.save(img_path)
        output_paths.append(img_path)

    return output_paths


# -------------------------------------------------------------------
# OCR ENGINE
# -------------------------------------------------------------------
# Module-level PaddleOCR instance, constructed once at import time and used
# by extract_text() for every prediction.
ocr_engine = PaddleOCR(
    lang="mr",  # presumably Marathi — TODO confirm against PaddleOCR's language codes
    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
    # All three optional use_* stages are explicitly turned off.
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False
)


def extract_text(image_path: str):
    """Run the shared OCR engine on one image and flatten its output.

    Every entry yielded by ``ocr_engine.predict`` contributes its
    ``rec_texts`` / ``rec_scores`` pairs, in order, as
    ``{"text": ..., "confidence": ...}`` dicts with the score cast to float.
    """
    return [
        {"text": text, "confidence": float(score)}
        for page in ocr_engine.predict(input=image_path)
        for text, score in zip(page["rec_texts"], page["rec_scores"])
    ]


# -------------------------------------------------------------------
# FASTAPI
# -------------------------------------------------------------------
# FastAPI application object; the /ocr route below is registered on it.
app = FastAPI()
# Directory where ocr_endpoint writes each uploaded file before running OCR.
UPLOAD_DIR = "/app/uploads"
# Created at import time so the request handler can write into it directly.
os.makedirs(UPLOAD_DIR, exist_ok=True)


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """Run OCR over a batch of uploaded PDFs and images.

    Args:
        files: Up to 15 uploads. Names must end in ``.pdf``, ``.jpg``,
            ``.jpeg`` or ``.png`` (case-insensitive).
        max_pages: Forwarded to ``pdf_to_images`` to cap how many pages of
            each PDF are processed; ``None`` means no cap.

    Returns:
        A ``JSONResponse`` shaped ``{"files": [...]}`` with one record per
        upload holding ``file_id``, the lower-cased ``filename`` and a
        ``pages`` list of ``{"page_index", "results"}`` entries.

    Raises:
        HTTPException: 400 if more than 15 files are sent, or if any upload
            has a missing or unsupported filename.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    # Validate every upload before touching the disk or running OCR, so a bad
    # file at the end of the batch does not waste work on the earlier ones and
    # only a whitelisted extension is ever used to build an on-disk name.
    filenames = []
    for file in files:
        # The client may omit the filename; treat that as an unsupported type.
        filename = (file.filename or "").lower()
        if not filename.endswith((".pdf", ".jpg", ".jpeg", ".png")):
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        filenames.append(filename)

    structured_output = {"files": []}

    for index, (file, filename) in enumerate(zip(files, filenames), start=1):
        ext = filename.rsplit(".", 1)[-1]
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")

        # Every file written for this upload; all are removed once its OCR
        # results are in memory, whether or not processing succeeded.
        scratch_paths = [temp_path]

        try:
            with open(temp_path, "wb") as f:
                f.write(await file.read())

            if ext == "pdf":
                # One OCR pass per rendered page image.
                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
                scratch_paths.extend(img_paths)
                pages = [
                    {"page_index": page_idx, "results": extract_text(img_path)}
                    for page_idx, img_path in enumerate(img_paths)
                ]
            else:
                # A plain image is treated as a single page.
                pages = [{"page_index": 0, "results": extract_text(temp_path)}]
        finally:
            for path in scratch_paths:
                try:
                    os.remove(path)
                except OSError:
                    # Best-effort cleanup: a file that is already gone (or was
                    # never created) must not mask the real outcome.
                    pass

        structured_output["files"].append({
            "file_id": f"file_{index}",
            "filename": filename,
            "pages": pages
        })

    return JSONResponse(structured_output)