|
|
import os |
|
|
import uuid |
|
|
from fastapi import FastAPI, UploadFile, File, HTTPException |
|
|
from fastapi.responses import JSONResponse |
|
|
from typing import List |
|
|
import fitz |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Point Paddle's home directory and the XDG cache at writable locations under
# /app and make sure both directories exist.
# NOTE(review): presumably this must run before the paddleocr import that
# follows so the library picks these locations up -- confirm.
for _env_var, _cache_dir in (
    ("PADDLE_HOME", "/app/paddle_home"),
    ("XDG_CACHE_HOME", "/app/xdg_cache"),
):
    os.environ[_env_var] = _cache_dir
    os.makedirs(_cache_dir, exist_ok=True)
|
|
|
|
|
|
|
|
from paddleocr import PaddleOCR |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]: |
|
|
if not os.path.exists(pdf_path): |
|
|
raise FileNotFoundError(pdf_path) |
|
|
|
|
|
doc = fitz.open(pdf_path) |
|
|
page_count = len(doc) |
|
|
|
|
|
limit = page_count if max_pages is None else min(max_pages, page_count) |
|
|
output_paths: List[str] = [] |
|
|
|
|
|
out_dir = "/app/pdf_images" |
|
|
os.makedirs(out_dir, exist_ok=True) |
|
|
|
|
|
for i in range(limit): |
|
|
page = doc.load_page(i) |
|
|
pix = page.get_pixmap(dpi=220) |
|
|
img_name = f"{uuid.uuid4()}.jpg" |
|
|
img_path = os.path.join(out_dir, img_name) |
|
|
pix.save(img_path) |
|
|
output_paths.append(img_path) |
|
|
|
|
|
return output_paths |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Settings for the shared OCR engine. The document-orientation classifier,
# document unwarping and text-line orientation options are all switched off.
_OCR_ENGINE_OPTIONS = {
    "lang": "mr",
    "text_recognition_model_name": "devanagari_PP-OCRv5_mobile_rec",
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}

# Built once at import time and reused for every OCR call in this module.
ocr_engine = PaddleOCR(**_OCR_ENGINE_OPTIONS)
|
|
|
|
|
|
|
|
def extract_text(image_path: str):
    """Run the shared OCR engine on one image and collect the recognised text.

    Args:
        image_path: Path of the image passed to ``ocr_engine.predict``.

    Returns:
        A list of ``{"text": ..., "confidence": ...}`` dicts, one per
        recognised text entry, in the order the engine reports them. The
        confidence is converted to a plain ``float``.
    """
    recognised = []
    for prediction in ocr_engine.predict(input=image_path):
        line_texts = prediction["rec_texts"]
        line_scores = prediction["rec_scores"]
        # Texts and scores are paired positionally.
        recognised.extend(
            {"text": line_text, "confidence": float(line_score)}
            for line_text, line_score in zip(line_texts, line_scores)
        )
    return recognised
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Directory where uploaded files are stored while they are processed; it is
# created at import time if it does not exist yet.
UPLOAD_DIR = "/app/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# ASGI application object that the route decorators in this module attach to.
app = FastAPI()
|
|
|
|
|
|
|
|
# File-name suffixes the endpoint accepts, matched against the lower-cased name.
_PDF_SUFFIX = ".pdf"
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")

# Uploads are copied to disk in chunks of this size rather than in one read.
_COPY_CHUNK_BYTES = 1024 * 1024


def _supported_suffix(filename: str) -> str | None:
    """Return the accepted suffix that ``filename`` ends with, or ``None``."""
    for suffix in (_PDF_SUFFIX, *_IMAGE_SUFFIXES):
        if filename.endswith(suffix):
            return suffix
    return None


def _remove_quietly(path: str) -> None:
    """Delete ``path`` if possible; cleanup is deliberately best-effort."""
    try:
        os.remove(path)
    except OSError:
        # A file that is already gone or cannot be removed must not mask the
        # real outcome of the request.
        pass


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """Run OCR over up to 15 uploaded PDFs or images and return the results.

    Args:
        files: Uploaded files. Names ending in ``.pdf``, ``.jpg``, ``.jpeg``
            or ``.png`` (case-insensitive) are accepted.
        max_pages: Forwarded to ``pdf_to_images`` to cap how many pages of
            each PDF are processed.

    Returns:
        A JSON response shaped as ``{"files": [{"file_id", "filename",
        "pages": [{"page_index", "results"}]}]}``. ``filename`` is the
        lower-cased upload name and ``file_id`` is ``file_<n>`` counted from 1
        in upload order.

    Raises:
        HTTPException: 400 if more than 15 files are sent or any file has an
            unsupported (or missing) name.
    """
    if len(files) > 15:
        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

    # Validate every upload before touching the disk or running OCR, so a bad
    # file late in the batch does not waste work on the earlier ones and the
    # on-disk name is only ever built from an allow-listed suffix.
    accepted = []
    for file in files:
        filename = (file.filename or "").lower()
        suffix = _supported_suffix(filename)
        if suffix is None:
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        accepted.append((file, filename, suffix))

    structured_output = {"files": []}

    # NOTE(review): pdf_to_images and extract_text are synchronous calls made
    # from this async handler, so the event loop is blocked while they run.
    for index, (file, filename, suffix) in enumerate(accepted, start=1):
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}{suffix}")
        img_paths: List[str] = []

        try:
            with open(temp_path, "wb") as f:
                while chunk := await file.read(_COPY_CHUNK_BYTES):
                    f.write(chunk)

            if suffix == _PDF_SUFFIX:
                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
                pages = [
                    {"page_index": page_idx, "results": extract_text(img_path)}
                    for page_idx, img_path in enumerate(img_paths)
                ]
            else:
                pages = [{"page_index": 0, "results": extract_text(temp_path)}]
        finally:
            # The upload and any rendered page images are only needed while
            # this file is processed; remove them so disk use does not grow
            # with every request.
            for path in (temp_path, *img_paths):
                _remove_quietly(path)

        structured_output["files"].append(
            {
                "file_id": f"file_{index}",
                "filename": filename,
                "pages": pages,
            }
        )

    return JSONResponse(structured_output)
|
|
|