# Fast Parallel Text Extract API
# FastAPI service exposing OCR (image) and PDF text-extraction endpoints.
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import pytesseract
import fitz
from concurrent.futures import ThreadPoolExecutor
import asyncio
import cv2
import numpy as np
import io
# Application instance and a shared thread pool used to run blocking
# OCR / PDF work off the event loop (8 workers for parallel requests).
app = FastAPI(title="Fast Parallel Text Extract API")
executor = ThreadPoolExecutor(max_workers=8)
# Permissive CORS so any front-end can call the API during testing.
# NOTE(review): tighten allow_origins before production use.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # allow all origins for testing
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# ---------- Utils ----------
def read_image_from_bytes(file_bytes: bytes):
    """Decode raw image bytes into a BGR OpenCV image.

    Returns None when the bytes cannot be decoded as an image
    (cv2.imdecode's failure convention).
    """
    buffer = np.frombuffer(file_bytes, dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)
def resize_if_large(img, max_dim=2000):
    """Downscale *img* so its longest side is at most *max_dim* pixels.

    Images already within the limit are returned unchanged (same object).
    Aspect ratio is preserved; INTER_AREA is used for quality downscaling.
    """
    height, width = img.shape[:2]
    longest = max(height, width)
    if longest <= max_dim:
        return img
    factor = max_dim / longest
    new_size = (int(width * factor), int(height * factor))
    return cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)
# ---------- Fast OCR ----------
def fast_ocr(file_bytes: bytes, lang: str = "eng"):
    """Quick OCR pass: light preprocessing (grayscale + Otsu threshold).

    Returns the stripped recognized text, or "" when the bytes are not
    a decodable image.
    """
    image = read_image_from_bytes(file_bytes)
    if image is None:
        return ""
    image = resize_if_large(image)
    # Grayscale then Otsu binarization — cheap cleanup before OCR.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binarized = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    result = pytesseract.image_to_string(
        Image.fromarray(binarized),
        config="--oem 3 --psm 6",  # balanced speed + accuracy
        lang=lang,
    )
    return result.strip()
# ---------- Heavy OCR (fallback only) ----------
def heavy_ocr(file_bytes: bytes, lang: str = "eng"):
    """Slower, more robust OCR pass used as a fallback.

    Adds non-local-means denoising before Otsu thresholding; otherwise
    mirrors fast_ocr. Returns "" for undecodable image bytes.
    """
    image = read_image_from_bytes(file_bytes)
    if image is None:
        return ""
    # Denoise + threshold (slower but more robust on noisy scans).
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(grayscale, None, h=10)
    _, binarized = cv2.threshold(
        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    result = pytesseract.image_to_string(
        Image.fromarray(binarized),
        config="--oem 3 --psm 6",
        lang=lang,
    )
    return result.strip()
# ---------- Image extraction ----------
def extract_text_from_image_bytes(file_bytes: bytes, lang: str = "eng",
                                  min_chars: int = 20):
    """Extract text from image bytes, trying a fast OCR pass first.

    Falls back to the heavier (denoised) OCR pass when the fast pass
    yields fewer than *min_chars* characters — a heuristic for "the fast
    pass probably failed on this image".

    Args:
        file_bytes: Raw encoded image bytes (any format cv2 can decode).
        lang: Tesseract language code, e.g. 'eng' or 'eng+hin'.
        min_chars: Minimum fast-pass text length to accept without the
            heavy fallback (default 20, the original hard-coded value).

    Returns:
        The extracted text ("" when nothing could be recognized).
    """
    text = fast_ocr(file_bytes, lang)
    if len(text) < min_chars:
        text = heavy_ocr(file_bytes, lang)
    return text
# ---------- PDF extraction ----------
def extract_text_from_pdf_bytes(file_bytes: bytes):
    """Extract the text layer of every page of a PDF given as raw bytes.

    Pages whose extraction raises are contributed as "" so one corrupt
    page does not lose the rest of the document.

    Returns:
        Page texts joined with newlines.
    """
    texts = []
    # Context manager ensures the document is closed even on error
    # (the original leaked the fitz.Document handle).
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page in doc:
            try:
                texts.append(page.get_text("text"))
            except Exception:
                # Best-effort per page: keep alignment with a placeholder.
                texts.append("")
    return "\n".join(texts)
# ---------- Endpoints ----------
@app.post("/extract-image")
async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
"""
Extract text from image.
lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'
"""
try:
raw = await file.read()
loop = asyncio.get_event_loop()
text = await loop.run_in_executor(executor, extract_text_from_image_bytes, raw, lang)
return JSONResponse({"text": text})
except Exception as e:
return JSONResponse({"error": str(e)}, status_code=500)
@app.post("/extract-pdf")
async def extract_pdf(file: UploadFile = File(...)):
"""
Extract text from PDF.
"""
try:
raw = await file.read()
loop = asyncio.get_event_loop()
text = await loop.run_in_executor(executor, extract_text_from_pdf_bytes, raw)
return JSONResponse({"text": text})
except Exception as e:
return JSONResponse({"error": str(e)}, status_code=500)