"""Fast parallel text-extraction API.

Endpoints:
  * POST /extract-image -- OCR an uploaded image via Tesseract.
  * POST /extract-pdf   -- extract embedded text from an uploaded PDF.

CPU-bound work (OCR, PDF parsing) runs on a thread pool so the asyncio
event loop stays responsive while requests are processed.
"""

import asyncio
import io
from concurrent.futures import ThreadPoolExecutor

import cv2
import fitz
import numpy as np
import pytesseract
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PIL import Image

app = FastAPI(title="Fast Parallel Text Extract API")

# Shared pool for CPU-heavy OCR / PDF work; 8 workers balances throughput
# against memory (each OCR job can hold a full-resolution image).
executor = ThreadPoolExecutor(max_workers=8)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # allow all origins for testing
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ---------- Utils ----------

def read_image_from_bytes(file_bytes: bytes):
    """Decode raw bytes into a BGR OpenCV image; None if undecodable."""
    arr = np.frombuffer(file_bytes, np.uint8)
    return cv2.imdecode(arr, cv2.IMREAD_COLOR)


def resize_if_large(img, max_dim: int = 2000):
    """Downscale *img* so its longest side is at most *max_dim* pixels.

    Preserves aspect ratio; returns the image unchanged when it is
    already small enough. Smaller inputs make Tesseract noticeably
    faster with little accuracy loss.
    """
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_dim:
        scale = max_dim / longest
        img = cv2.resize(
            img,
            (int(w * scale), int(h * scale)),
            interpolation=cv2.INTER_AREA,
        )
    return img


# ---------- Fast OCR ----------

def fast_ocr(file_bytes: bytes, lang: str = "eng") -> str:
    """Quick OCR pass: resize + Otsu binarization, then Tesseract.

    Returns "" when the bytes cannot be decoded as an image.
    """
    img_bgr = read_image_from_bytes(file_bytes)
    if img_bgr is None:
        return ""
    img_bgr = resize_if_large(img_bgr)

    # Light preprocessing: grayscale + Otsu threshold.
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    pil_img = Image.fromarray(gray)
    config = "--oem 3 --psm 6"  # balanced speed + accuracy
    text = pytesseract.image_to_string(pil_img, config=config, lang=lang)
    return text.strip()


# ---------- Heavy OCR (fallback only) ----------

def heavy_ocr(file_bytes: bytes, lang: str = "eng") -> str:
    """Slower, more robust OCR pass: denoise before thresholding.

    Used only when ``fast_ocr`` yields too little text. No resize here,
    so the full resolution is kept for difficult scans.
    """
    img_bgr = read_image_from_bytes(file_bytes)
    if img_bgr is None:
        return ""

    # Denoise + threshold (slower but more robust).
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray, None, h=10)
    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    pil_img = Image.fromarray(gray)
    config = "--oem 3 --psm 6"
    text = pytesseract.image_to_string(pil_img, config=config, lang=lang)
    return text.strip()


# ---------- Image extraction ----------

def extract_text_from_image_bytes(file_bytes: bytes, lang: str = "eng") -> str:
    """OCR an image, retrying with the heavy pipeline on poor results.

    Fewer than 20 characters from the fast pass is treated as a likely
    failure (faint or noisy scan) and triggers the robust fallback.
    """
    text = fast_ocr(file_bytes, lang)
    if len(text) < 20:
        text = heavy_ocr(file_bytes, lang)
    return text


# ---------- PDF extraction ----------

def extract_text_from_pdf_bytes(file_bytes: bytes) -> str:
    """Extract embedded text from every page of a PDF, joined by newlines.

    A page that fails to parse contributes an empty string rather than
    aborting the whole document. The document handle is closed via the
    context manager (the original leaked it until garbage collection).
    """
    texts = []
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page in doc:
            try:
                texts.append(page.get_text("text"))
            except Exception:
                texts.append("")  # best-effort: skip unreadable page
    return "\n".join(texts)


# ---------- Endpoints ----------

@app.post("/extract-image")
async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
    """
    Extract text from image.
    lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'
    """
    try:
        raw = await file.read()
        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(
            executor, extract_text_from_image_bytes, raw, lang
        )
        return JSONResponse({"text": text})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


@app.post("/extract-pdf")
async def extract_pdf(file: UploadFile = File(...)):
    """
    Extract text from PDF.
    """
    try:
        raw = await file.read()
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(
            executor, extract_text_from_pdf_bytes, raw
        )
        return JSONResponse({"text": text})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)