Spaces:

Rivalcoder
/

Rapid-Extractor

Sleeping

Rapid-Extractor / main.py

Rivalcoder

Add Files

632c507 3 months ago

3.73 kB

	from fastapi import FastAPI, UploadFile, File, Form
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware
	from PIL import Image
	import pytesseract
	import fitz
	from concurrent.futures import ThreadPoolExecutor
	import asyncio
	import cv2
	import numpy as np
	import io

	# Application instance plus the shared worker pool used for blocking work
	# (OCR and PDF parsing) so it does not run on the event loop thread.
	app = FastAPI(title="Fast Parallel Text Extract API")
	executor = ThreadPoolExecutor(max_workers=8)

	# CORS is left wide open (any origin, method and header) for testing.
	# Credentials are disabled: FastAPI's CORS documentation states that
	# wildcard origins/methods/headers must not be combined with
	# allow_credentials=True. No endpoint in this file relies on cookies or
	# auth headers; if credentials are ever needed, list explicit origins.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],  # allow all origins for testing
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)



	# ---------- Utils ----------
	def read_image_from_bytes(file_bytes: bytes):
	    """Decode the raw bytes of an encoded image file into an OpenCV image.

	    Args:
	        file_bytes: Contents of an encoded image file (e.g. PNG or JPEG).

	    Returns:
	        The decoded image as a NumPy array (3-channel, as produced by
	        ``cv2.IMREAD_COLOR``), or ``None`` when the payload is empty or
	        cannot be decoded.
	    """
	    # cv2.imdecode rejects an empty buffer with an error rather than
	    # returning None, so short-circuit here: callers treat None as
	    # "not a readable image" and would otherwise see an exception.
	    if not file_bytes:
	        return None
	    arr = np.frombuffer(file_bytes, np.uint8)
	    return cv2.imdecode(arr, cv2.IMREAD_COLOR)

	def resize_if_large(img, max_dim=2000):
	    """Downscale ``img`` so that its longest side is at most ``max_dim``.

	    The aspect ratio is preserved. Images that already fit are returned
	    unchanged (the very same array object, no copy).

	    Args:
	        img: Image array whose first two dimensions are (height, width).
	        max_dim: Maximum allowed length, in pixels, of the longest side.

	    Returns:
	        The original array when no resizing is needed, otherwise a new,
	        smaller array produced by ``cv2.resize``.
	    """
	    h, w = img.shape[:2]
	    longest = max(h, w)
	    if longest <= max_dim:
	        return img

	    scale = max_dim / longest
	    # cv2.resize expects the target size as (width, height). Clamp each
	    # side to at least 1 px so an extremely thin image cannot round down
	    # to a zero-sized target.
	    new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
	    return cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

	# ---------- Fast OCR ----------
	def fast_ocr(file_bytes: bytes, lang: str = "eng"):
	    """Run the quick OCR pass over an encoded image.

	    The image is decoded, capped in size, converted to grayscale and
	    Otsu-binarized before being handed to Tesseract.

	    Args:
	        file_bytes: Contents of an encoded image file.
	        lang: Tesseract language code passed through to pytesseract.

	    Returns:
	        The recognized text with surrounding whitespace stripped, or an
	        empty string when the bytes cannot be decoded as an image.
	    """
	    decoded = read_image_from_bytes(file_bytes)
	    if decoded is None:
	        return ""

	    # Light preprocessing: shrink oversized input, then grayscale + Otsu.
	    grayscale = cv2.cvtColor(resize_if_large(decoded), cv2.COLOR_BGR2GRAY)
	    binary = cv2.threshold(
	        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
	    )[1]

	    # "--oem 3 --psm 6": balanced speed + accuracy
	    recognized = pytesseract.image_to_string(
	        Image.fromarray(binary),
	        config="--oem 3 --psm 6",
	        lang=lang,
	    )
	    return recognized.strip()

	# ---------- Heavy OCR (fallback only) ----------
	def heavy_ocr(file_bytes: bytes, lang: str = "eng"):
	    """Run the slower, more robust OCR pass over an encoded image.

	    Unlike the fast pass, the image is denoised before thresholding and is
	    not downscaled first.

	    Args:
	        file_bytes: Contents of an encoded image file.
	        lang: Tesseract language code passed through to pytesseract.

	    Returns:
	        The recognized text with surrounding whitespace stripped, or an
	        empty string when the bytes cannot be decoded as an image.
	    """
	    decoded = read_image_from_bytes(file_bytes)
	    if decoded is None:
	        return ""

	    # Denoise + threshold (slower but more robust)
	    grayscale = cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)
	    denoised = cv2.fastNlMeansDenoising(grayscale, None, h=10)
	    binary = cv2.threshold(
	        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
	    )[1]

	    recognized = pytesseract.image_to_string(
	        Image.fromarray(binary),
	        config="--oem 3 --psm 6",
	        lang=lang,
	    )
	    return recognized.strip()

	# ---------- Image extraction ----------
	def extract_text_from_image_bytes(file_bytes: bytes, lang: str = "eng"):
	    """Extract text from an image, escalating to the heavy OCR pass if needed.

	    The fast pass runs first; when it yields fewer than 20 characters the
	    heavy pass is run and its result is returned instead.

	    Args:
	        file_bytes: Contents of an encoded image file.
	        lang: Tesseract language code forwarded to both OCR passes.

	    Returns:
	        The extracted text.
	    """
	    quick_result = fast_ocr(file_bytes, lang)
	    if len(quick_result) >= 20:
	        return quick_result
	    # Too little text recovered: fall back to the slower pipeline.
	    return heavy_ocr(file_bytes, lang)

	# ---------- PDF extraction ----------
	def extract_text_from_pdf_bytes(file_bytes: bytes):
	    """Extract the plain text of every page of an in-memory PDF.

	    Args:
	        file_bytes: Contents of a PDF file.

	    Returns:
	        The text of all pages joined with newlines. A page whose text
	        extraction fails contributes an empty string, so the number of
	        joined segments always equals the page count.

	    Raises:
	        Whatever ``fitz.open`` raises when the bytes cannot be opened as a
	        PDF; only per-page extraction errors are suppressed.
	    """
	    texts = []
	    # Open via the context manager so the document is closed on every
	    # path, including when opening succeeds but iteration later fails.
	    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
	        for page in doc:
	            try:
	                texts.append(page.get_text("text"))
	            except Exception:
	                # Deliberate best effort: one unreadable page should not
	                # discard the text of the remaining pages.
	                texts.append("")
	    return "\n".join(texts)

	# ---------- Endpoints ----------

	@app.post("/extract-image")
	async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
	    """
	    Extract text from image.
	    lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'
	    """
	    # Responds with {"text": ...} on success, or {"error": ...} with HTTP
	    # status 500 when reading the upload or running OCR raises.
	    try:
	        raw = await file.read()
	        # get_running_loop() is the documented way to obtain the loop from
	        # inside a coroutine. OCR is blocking, so it runs on the shared
	        # thread pool instead of the event loop thread.
	        loop = asyncio.get_running_loop()
	        text = await loop.run_in_executor(
	            executor, extract_text_from_image_bytes, raw, lang
	        )
	        return JSONResponse({"text": text})
	    except Exception as e:
	        # Top-level boundary: report the failure as JSON rather than
	        # letting the exception propagate out of the handler.
	        return JSONResponse({"error": str(e)}, status_code=500)

	@app.post("/extract-pdf")
	async def extract_pdf(file: UploadFile = File(...)):
	    """
	    Extract text from PDF.
	    """
	    # Responds with {"text": ...} on success, or {"error": ...} with HTTP
	    # status 500 when reading the upload or parsing the PDF raises.
	    try:
	        raw = await file.read()
	        # get_running_loop() is the documented way to obtain the loop from
	        # inside a coroutine. PDF parsing is blocking, so it runs on the
	        # shared thread pool instead of the event loop thread.
	        loop = asyncio.get_running_loop()
	        text = await loop.run_in_executor(
	            executor, extract_text_from_pdf_bytes, raw
	        )
	        return JSONResponse({"text": text})
	    except Exception as e:
	        # Top-level boundary: report the failure as JSON rather than
	        # letting the exception propagate out of the handler.
	        return JSONResponse({"error": str(e)}, status_code=500)