Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File, Form | |
| from fastapi.responses import JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from PIL import Image | |
| import pytesseract | |
| import fitz | |
| from concurrent.futures import ThreadPoolExecutor | |
| import asyncio | |
| import cv2 | |
| import numpy as np | |
| import io | |
# ASGI application object for the text-extraction service.
app = FastAPI(title="Fast Parallel Text Extract API")

# Shared worker pool; the endpoint coroutines hand blocking OCR / PDF work to
# it through ``loop.run_in_executor`` so the event loop stays responsive.
executor = ThreadPoolExecutor(max_workers=8)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # allow all origins for testing
    # Wildcard origins/methods/headers must not be combined with credentialed
    # requests (see the FastAPI CORS documentation), and nothing in this file
    # relies on cookies or auth headers, so credentials stay disabled.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
| # ---------- Utils ---------- | |
def read_image_from_bytes(file_bytes: bytes):
    """Decode the raw contents of an image file into an OpenCV image.

    Args:
        file_bytes: Encoded image data (the bytes of an uploaded file).

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``, or
        ``None`` when ``file_bytes`` is empty or cannot be decoded.
    """
    # An empty payload can never decode to an image, and OpenCV may raise on
    # an empty buffer rather than return None. Short-circuit so callers get
    # the same "not an image" signal (None) they already check for.
    if not file_bytes:
        return None
    arr = np.frombuffer(file_bytes, np.uint8)
    return cv2.imdecode(arr, cv2.IMREAD_COLOR)
def resize_if_large(img, max_dim=2000):
    """Shrink an image so that its longest side is at most ``max_dim`` pixels.

    Args:
        img: Image array whose first two ``shape`` entries are height and
            width.
        max_dim: Largest allowed size, in pixels, of the longer side.

    Returns:
        ``img`` itself when it already fits, otherwise a resized copy with
        the aspect ratio preserved (``cv2.INTER_AREA`` interpolation).
    """
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest <= max_dim:
        return img

    scale = max_dim / longest
    # int() truncates, so the short side of a very elongated image could
    # become 0, which is not a valid target size for cv2.resize. Keep at
    # least one pixel per dimension.
    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
    return cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)
| # ---------- Fast OCR ---------- | |
def fast_ocr(file_bytes: bytes, lang: str = "eng"):
    """Run the quick OCR pass over an encoded image.

    The image is decoded with ``read_image_from_bytes``, bounded in size by
    ``resize_if_large``, converted to grayscale, binarised with an Otsu
    threshold and then passed to Tesseract.

    Args:
        file_bytes: Raw encoded image bytes.
        lang: Tesseract language code(s), forwarded as ``lang``.

    Returns:
        The recognised text with surrounding whitespace stripped, or ``""``
        when the bytes could not be decoded into an image.
    """
    decoded = read_image_from_bytes(file_bytes)
    if decoded is None:
        return ""

    # Light preprocessing: bounded size -> grayscale -> Otsu binarisation.
    grayscale = cv2.cvtColor(resize_if_large(decoded), cv2.COLOR_BGR2GRAY)
    binarised = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    recognised = pytesseract.image_to_string(
        Image.fromarray(binarised),
        config="--oem 3 --psm 6",  # balanced speed + accuracy
        lang=lang,
    )
    return recognised.strip()
| # ---------- Heavy OCR (fallback only) ---------- | |
def heavy_ocr(file_bytes: bytes, lang: str = "eng"):
    """Run the slower, more robust OCR pass over an encoded image.

    Unlike the fast pass, the image is not resized; it is converted to
    grayscale, denoised with non-local means, binarised with an Otsu
    threshold and then passed to Tesseract.

    Args:
        file_bytes: Raw encoded image bytes.
        lang: Tesseract language code(s), forwarded as ``lang``.

    Returns:
        The recognised text with surrounding whitespace stripped, or ``""``
        when the bytes could not be decoded into an image.
    """
    decoded = read_image_from_bytes(file_bytes)
    if decoded is None:
        return ""

    # Denoise + threshold (slower but more robust).
    grayscale = cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(grayscale, None, h=10)
    binarised = cv2.threshold(
        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    recognised = pytesseract.image_to_string(
        Image.fromarray(binarised),
        config="--oem 3 --psm 6",
        lang=lang,
    )
    return recognised.strip()
| # ---------- Image extraction ---------- | |
def extract_text_from_image_bytes(
    file_bytes: bytes, lang: str = "eng", min_chars: int = 20
):
    """Extract text from an encoded image, falling back to the heavy OCR pass.

    The fast pass runs first. When it yields fewer than ``min_chars``
    characters the heavy pass is tried as well, and the longer of the two
    results is returned.

    Args:
        file_bytes: Raw encoded image bytes.
        lang: Tesseract language code(s) forwarded to both OCR passes.
        min_chars: Minimum length of the fast result that is accepted
            without running the heavy pass.

    Returns:
        The recognised text (possibly empty).
    """
    text = fast_ocr(file_bytes, lang)
    if len(text) >= min_chars:
        return text

    fallback = heavy_ocr(file_bytes, lang)
    # Keep the short fast-pass text when the fallback found even less, so a
    # valid short result (e.g. a one-word sign) is not thrown away. Ties go
    # to the heavy pass.
    return fallback if len(fallback) >= len(text) else text
| # ---------- PDF extraction ---------- | |
def extract_text_from_pdf_bytes(file_bytes: bytes):
    """Extract the text layer of every page of an in-memory PDF.

    Args:
        file_bytes: Raw contents of a PDF file.

    Returns:
        The per-page text joined with newlines. A page whose extraction
        fails contributes an empty string.

    Raises:
        Whatever ``fitz.open`` raises when the bytes cannot be opened as a
        PDF; only per-page extraction errors are suppressed.
    """
    texts = []
    # Open the document in a context manager so it is closed even if
    # iteration fails part-way; previously it was never closed.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page in doc:
            try:
                texts.append(page.get_text("text"))
            except Exception:
                # Best effort: one unreadable page must not discard the
                # text of the remaining pages.
                texts.append("")
    return "\n".join(texts)
| # ---------- Endpoints ---------- | |
async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
    """
    Extract text from image.
    lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'

    Returns a JSON body ``{"text": ...}`` on success, ``{"error": ...}`` with
    status 400 for an empty upload, or ``{"error": ...}`` with status 500 when
    extraction raises.
    """
    # NOTE(review): no route decorator (e.g. @app.post(...)) is visible on
    # this handler in this view - confirm it is registered on `app`.
    try:
        raw = await file.read()
        if not raw:
            # An empty upload is a client error, not a server failure.
            return JSONResponse({"error": "Uploaded file is empty"}, status_code=400)
        # Inside a coroutine the running loop is the right one to use;
        # get_running_loop() is the supported way to obtain it.
        loop = asyncio.get_running_loop()
        # OCR is blocking work, so run it on the shared thread pool.
        text = await loop.run_in_executor(executor, extract_text_from_image_bytes, raw, lang)
        return JSONResponse({"text": text})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
async def extract_pdf(file: UploadFile = File(...)):
    """
    Extract text from PDF.

    Returns a JSON body ``{"text": ...}`` on success, ``{"error": ...}`` with
    status 400 for an empty upload, or ``{"error": ...}`` with status 500 when
    extraction raises.
    """
    # NOTE(review): no route decorator (e.g. @app.post(...)) is visible on
    # this handler in this view - confirm it is registered on `app`.
    try:
        raw = await file.read()
        if not raw:
            # An empty upload is a client error, not a server failure.
            return JSONResponse({"error": "Uploaded file is empty"}, status_code=400)
        # Inside a coroutine the running loop is the right one to use;
        # get_running_loop() is the supported way to obtain it.
        loop = asyncio.get_running_loop()
        # PDF parsing is blocking work, so run it on the shared thread pool.
        text = await loop.run_in_executor(executor, extract_text_from_pdf_bytes, raw)
        return JSONResponse({"text": text})
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)