File size: 4,098 Bytes
4cfe185 76565ed c08cd96 a09fca3 c08cd96 bfb796a 76565ed c08cd96 76565ed c08cd96 4cfe185 c08cd96 4cfe185 76565ed c08cd96 76565ed c08cd96 76565ed c08cd96 4cfe185 c08cd96 4cfe185 76565ed c08cd96 76565ed c08cd96 76565ed a09fca3 c08cd96 76565ed c08cd96 bfb796a 6894202 76565ed 6894202 c08cd96 76565ed c08cd96 76565ed c08cd96 bfb796a 6894202 c08cd96 6894202 c08cd96 76565ed 6894202 c08cd96 6894202 76565ed c08cd96 76565ed 6894202 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import os
import uuid
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from typing import List
import fitz
# -------------------------------------------------------------------
# FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
# -------------------------------------------------------------------
# Both cache roots live under /app; the environment variables are set and the
# directories created here, ahead of the paddleocr import that follows.
_PADDLE_HOME_DIR = "/app/paddle_home"
_XDG_CACHE_DIR = "/app/xdg_cache"
os.environ.update(PADDLE_HOME=_PADDLE_HOME_DIR, XDG_CACHE_HOME=_XDG_CACHE_DIR)
os.makedirs(_PADDLE_HOME_DIR, exist_ok=True)
os.makedirs(_XDG_CACHE_DIR, exist_ok=True)
# now safe to import paddlex/paddleocr
from paddleocr import PaddleOCR
# -------------------------------------------------------------------
# PDF → IMAGE
# -------------------------------------------------------------------
def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
if not os.path.exists(pdf_path):
raise FileNotFoundError(pdf_path)
doc = fitz.open(pdf_path)
page_count = len(doc)
limit = page_count if max_pages is None else min(max_pages, page_count)
output_paths: List[str] = []
out_dir = "/app/pdf_images"
os.makedirs(out_dir, exist_ok=True)
for i in range(limit):
page = doc.load_page(i)
pix = page.get_pixmap(dpi=220)
img_name = f"{uuid.uuid4()}.jpg"
img_path = os.path.join(out_dir, img_name)
pix.save(img_path)
output_paths.append(img_path)
return output_paths
# -------------------------------------------------------------------
# OCR ENGINE
# -------------------------------------------------------------------
# Keyword arguments for the single, module-level OCR engine instance.
# NOTE(review): "mr" is presumably the Marathi language code, paired with a
# Devanagari recognition model - confirm against the PaddleOCR docs.
_OCR_ENGINE_OPTIONS = {
    "lang": "mr",
    "text_recognition_model_name": "devanagari_PP-OCRv5_mobile_rec",
    # The three optional pre-processing stages below are all switched off.
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}
ocr_engine = PaddleOCR(**_OCR_ENGINE_OPTIONS)
def extract_text(image_path: str):
    """Run the module-level OCR engine on one image and flatten its output.

    Every item yielded by ``ocr_engine.predict`` is read through its
    ``"rec_texts"`` and ``"rec_scores"`` entries, which are paired up
    positionally.

    Returns:
        A list of ``{"text": ..., "confidence": ...}`` dicts, one per
        recognized text entry, with the confidence converted to ``float``.
    """
    recognized = []
    for page_result in ocr_engine.predict(input=image_path):
        pairs = zip(page_result["rec_texts"], page_result["rec_scores"])
        recognized.extend(
            {"text": text, "confidence": float(score)} for text, score in pairs
        )
    return recognized
# -------------------------------------------------------------------
# FASTAPI
# -------------------------------------------------------------------
# Directory that receives uploaded files; created at import time if missing.
UPLOAD_DIR = "/app/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Application object the route decorators below attach to.
app = FastAPI()
_PDF_SUFFIX = ".pdf"
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")


def _remove_quietly(path: str) -> None:
    """Delete a scratch file; cleanup is best-effort and must not mask errors."""
    try:
        os.remove(path)
    except OSError:
        # Already gone or not removable: nothing useful the request can do.
        pass


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """OCR a batch of uploaded PDFs and images.

    Args:
        files: Uploaded files; at most 15 per request. Supported suffixes
            (case-insensitive) are ``.pdf``, ``.jpg``, ``.jpeg`` and ``.png``.
        max_pages: Forwarded to ``pdf_to_images`` as the page limit for each
            PDF; ignored for images.

    Returns:
        A JSON body ``{"files": [...]}`` with one record per upload, in
        request order. Each record holds ``file_id`` (``file_1``, ``file_2``,
        ...), the lower-cased ``filename`` and a ``pages`` list of
        ``{"page_index", "results"}`` entries, where ``results`` is the
        output of ``extract_text``.

    Raises:
        HTTPException: 400 when more than 15 files are sent or when any file
            has an unsupported (or missing) name.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    # Validate every name before touching the disk or the OCR engine, so a bad
    # upload is rejected without wasted work or leftover files. A missing
    # filename is treated as an empty (hence unsupported) name instead of
    # crashing on ``None.lower()``.
    filenames: List[str] = []
    for file in files:
        filename = (file.filename or "").lower()
        if not filename.endswith((_PDF_SUFFIX,) + _IMAGE_SUFFIXES):
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        filenames.append(filename)

    structured_output = {"files": []}
    for index, (file, filename) in enumerate(zip(files, filenames), start=1):
        # After validation the text behind the last dot is one of the known
        # suffixes, so it is safe to use in the on-disk name.
        ext = filename.split(".")[-1]
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")

        file_record = {
            "file_id": f"file_{index}",
            "filename": filename,
            "pages": [],
        }

        # Every file written for this upload; all are removed in ``finally``.
        scratch_paths = [temp_path]
        try:
            with open(temp_path, "wb") as f:
                f.write(await file.read())

            # NOTE(review): pdf_to_images/extract_text are plain synchronous
            # calls inside an async handler; consider offloading them to a
            # worker thread if request concurrency matters.
            if filename.endswith(_PDF_SUFFIX):
                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
                scratch_paths.extend(img_paths)
                for page_idx, img_path in enumerate(img_paths):
                    file_record["pages"].append({
                        "page_index": page_idx,
                        "results": extract_text(img_path),
                    })
            else:
                file_record["pages"].append({
                    "page_index": 0,
                    "results": extract_text(temp_path),
                })
        finally:
            for path in scratch_paths:
                _remove_quietly(path)

        structured_output["files"].append(file_record)

    return JSONResponse(structured_output)
|