Spaces:

venkataashok
/

project

Sleeping

Upload 3 files

8f04bd0 verified 15 days ago

1.47 kB

	# models/ocr.py

	import fitz # PyMuPDF
	import easyocr

	# Shared EasyOCR reader, constructed once at import time and reused by
	# every extract_text() call instead of being rebuilt per file.
	# Configured for English ('en') only, with GPU use disabled (gpu=False).
	# NOTE(review): presumably done once because building the reader loads
	# model weights and is slow -- confirm against EasyOCR docs.
	reader = easyocr.Reader(['en'], gpu=False)

	def _extract_pdf_text(file_path):
	    """Return the text of a PDF, using OCR only when it has no text layer."""
	    # Open the document a single time; the context manager guarantees it
	    # is closed even if text extraction or OCR raises.
	    with fitz.open(file_path) as doc:
	        text = "".join(page.get_text() for page in doc)

	        # PDF already has selectable text -> return immediately, no OCR.
	        if text.strip():
	            return text

	        # Scanned PDF -> render each page to PNG bytes and OCR them.
	        ocr_chunks = []
	        for page in doc:
	            png_bytes = page.get_pixmap().tobytes("png")
	            ocr_chunks.extend(reader.readtext(png_bytes, detail=0))

	    return " ".join(ocr_chunks)


	def extract_text(file_path):
	    """
	    Extract text from a PDF, TXT, or image file.

	    The file type is chosen from the (case-insensitive) file extension:

	    * ``.pdf`` -- the embedded text layer is returned when it is
	      non-blank; otherwise every page is rendered and passed through
	      OCR, and the recognised fragments are joined with single spaces.
	    * ``.txt`` -- the file is read as UTF-8 and returned unchanged.
	    * anything else -- the path is handed to the OCR reader as an image
	      and the recognised fragments are joined with single spaces.

	    Args:
	        file_path: Path to the input file; any object whose ``str()`` is
	            the path (e.g. ``str`` or ``pathlib.Path``).

	    Returns:
	        The extracted text as a single string (may be empty).

	    Raises:
	        OSError: If a ``.txt`` file cannot be opened.
	        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
	        Errors raised by PyMuPDF or EasyOCR for unreadable PDFs/images
	        propagate unchanged.
	    """
	    file_path = str(file_path)
	    # Lower-case once so the extension checks are case-insensitive.
	    lowered = file_path.lower()

	    # ------------------------------
	    # PDF Handling
	    # ------------------------------
	    if lowered.endswith(".pdf"):
	        return _extract_pdf_text(file_path)

	    # ------------------------------
	    # TXT Handling
	    # ------------------------------
	    if lowered.endswith(".txt"):
	        with open(file_path, "r", encoding="utf-8") as f:
	            return f.read()

	    # ------------------------------
	    # Image Handling (fallback for every other extension)
	    # ------------------------------
	    result = reader.readtext(file_path, detail=0)
	    return " ".join(result)