File size: 4,098 Bytes
4cfe185
76565ed
c08cd96
a09fca3
c08cd96
 
 
 
 
 
 
 
 
 
 
 
bfb796a
76565ed
c08cd96
 
 
 
 
 
76565ed
 
c08cd96
4cfe185
c08cd96
 
 
 
 
4cfe185
76565ed
 
c08cd96
76565ed
c08cd96
76565ed
c08cd96
4cfe185
c08cd96
4cfe185
76565ed
c08cd96
 
 
 
 
 
 
 
 
 
76565ed
 
c08cd96
 
 
 
 
 
 
 
 
76565ed
a09fca3
c08cd96
 
 
 
 
 
76565ed
 
c08cd96
 
 
 
bfb796a
6894202
76565ed
6894202
c08cd96
 
76565ed
c08cd96
 
76565ed
c08cd96
 
bfb796a
6894202
 
 
 
 
 
 
c08cd96
6894202
c08cd96
 
76565ed
6894202
 
 
 
 
 
 
 
 
 
 
c08cd96
6894202
 
 
 
 
 
76565ed
c08cd96
 
76565ed
6894202
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import uuid
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from typing import List
import fitz

# -------------------------------------------------------------------
# FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
# -------------------------------------------------------------------
# Point PADDLE_HOME and XDG_CACHE_HOME at directories under /app and create
# those directories up front. This has to run before the paddleocr import
# below, which is why that import is not grouped with the others at the top.
# NOTE(review): presumably PaddleOCR/PaddleX resolve their cache and model
# download paths from these variables — confirm against the library docs.
os.environ["PADDLE_HOME"] = "/app/paddle_home"
os.environ["XDG_CACHE_HOME"] = "/app/xdg_cache"
os.makedirs("/app/paddle_home", exist_ok=True)
os.makedirs("/app/xdg_cache", exist_ok=True)

# Imported only after the environment variables above are in place.
from paddleocr import PaddleOCR

# -------------------------------------------------------------------
# PDF → IMAGE
# -------------------------------------------------------------------
def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(pdf_path)

    doc = fitz.open(pdf_path)
    page_count = len(doc)

    limit = page_count if max_pages is None else min(max_pages, page_count)
    output_paths: List[str] = []

    out_dir = "/app/pdf_images"
    os.makedirs(out_dir, exist_ok=True)

    for i in range(limit):
        page = doc.load_page(i)
        pix = page.get_pixmap(dpi=220)
        img_name = f"{uuid.uuid4()}.jpg"
        img_path = os.path.join(out_dir, img_name)
        pix.save(img_path)
        output_paths.append(img_path)

    return output_paths


# -------------------------------------------------------------------
# OCR ENGINE
# -------------------------------------------------------------------
# A single PaddleOCR instance is built at import time and reused by
# extract_text() for every request.
# NOTE(review): lang="mr" is presumably Marathi, matching the Devanagari
# recognition model named below — confirm against the PaddleOCR model list.
# The three optional stages (document orientation classification, document
# unwarping, text-line orientation) are explicitly switched off.
ocr_engine = PaddleOCR(
    lang="mr",
    text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False
)


def extract_text(image_path: str):
    """Run the module-level OCR engine on one image and flatten its output.

    Args:
        image_path: Path of the image handed to ``ocr_engine.predict``.

    Returns:
        A list of ``{"text": ..., "confidence": ...}`` dicts, one per
        recognised text/score pair, in the order the engine produced them.
        Confidence values are converted to plain ``float``.
    """
    recognised = []
    for page in ocr_engine.predict(input=image_path):
        # Texts and scores arrive as two parallel sequences per result.
        pairs = zip(page["rec_texts"], page["rec_scores"])
        recognised.extend(
            {"text": text, "confidence": float(score)} for text, score in pairs
        )
    return recognised


# -------------------------------------------------------------------
# FASTAPI
# -------------------------------------------------------------------
# Application object that the route decorators below register against.
app = FastAPI()
# Directory where incoming uploads are written before OCR; created at import
# time so request handlers can assume it exists.
UPLOAD_DIR = "/app/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)


# File-name suffixes the /ocr endpoint accepts (compared case-insensitively).
_SUPPORTED_SUFFIXES = (".pdf", ".jpg", ".jpeg", ".png")


def _remove_quietly(paths):
    """Delete temporary files on a best-effort basis, ignoring OS errors."""
    for path in paths:
        try:
            os.remove(path)
        except OSError:
            # Cleanup must never mask the real outcome of the request.
            pass


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """OCR a batch of uploaded PDFs and images and return structured results.

    Args:
        files: Uploaded files; at most 15 per request. Each name must end in
            ``.pdf``, ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive).
        max_pages: Forwarded to ``pdf_to_images`` to cap how many pages of
            each PDF are processed; ``None`` processes all pages.

    Returns:
        A JSON response of the form ``{"files": [...]}``. Each entry holds
        ``file_id`` (``file_1``, ``file_2``, ... in upload order), the
        lower-cased ``filename`` and a ``pages`` list of
        ``{"page_index": ..., "results": ...}`` dicts. Images always produce
        a single page with index 0.

    Raises:
        HTTPException: 400 if more than 15 files are sent or any file has an
            unsupported (or missing) name.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    # Validate the whole batch before writing anything to disk or running
    # OCR, so a bad upload is rejected without wasted work. A missing
    # filename is treated as an empty name instead of raising AttributeError.
    filenames = []
    for file in files:
        filename = (file.filename or "").lower()
        if not filename.endswith(_SUPPORTED_SUFFIXES):
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        filenames.append(filename)

    structured_output = {"files": []}

    for index, (file, filename) in enumerate(zip(files, filenames), start=1):
        # The suffix check above guarantees ext is one of the known types.
        ext = filename.rsplit(".", 1)[-1]
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")
        temp_files = [temp_path]

        try:
            with open(temp_path, "wb") as f:
                f.write(await file.read())

            # NOTE(review): extract_text is a synchronous call inside an async
            # handler, so OCR runs on the event loop thread — consider
            # offloading it if request concurrency matters.
            if filename.endswith(".pdf"):
                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
                temp_files.extend(img_paths)
                pages = [
                    {"page_index": page_idx, "results": extract_text(img_path)}
                    for page_idx, img_path in enumerate(img_paths)
                ]
            else:
                pages = [{"page_index": 0, "results": extract_text(temp_path)}]
        finally:
            # Neither the upload nor the rendered pages are referenced in the
            # response, so remove them instead of letting them accumulate.
            _remove_quietly(temp_files)

        structured_output["files"].append({
            "file_id": f"file_{index}",
            "filename": filename,
            "pages": pages,
        })

    return JSONResponse(structured_output)