Spaces:

point9
/

Invoice_Digitization_Agent

Sleeping

App Files Files Community

Invoice_Digitization_Agent / backend /worker /text_extractor.py

Dipan04

Deploy Invoice Digitization Agent

8a859a8 3 months ago

raw

history blame

3.59 kB

	"""
	Text extraction utilities for PDF and images.
	Supports both digital PDFs and scanned documents (OCR).
	"""

	import pdfplumber
	import fitz # PyMuPDF
	import pytesseract
	from PIL import Image
	from pathlib import Path
	from typing import Dict, Tuple
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	def extract_text_from_pdf(file_path: Path) -> Tuple[str, Dict]:
	    """
	    Extract text from PDF using pdfplumber (for digital PDFs).

	    Pages for which pdfplumber yields no text are skipped. When the whole
	    document yields nothing but whitespace, the file is treated as a
	    possible scan and handed to extract_text_from_pdf_ocr instead.

	    Args:
	        file_path: Path to the PDF file

	    Returns:
	        (raw_text, metadata) where metadata holds "page_count",
	        "extraction_method" and "confidence_score"

	    Raises:
	        Exception: any error raised while reading the PDF is logged and
	            re-raised unchanged; errors from the OCR fallback propagate
	            as raised by extract_text_from_pdf_ocr
	    """
	    # Keep the try body limited to the pdfplumber work so that a failure
	    # in the OCR fallback is not reported a second time as a pdfplumber
	    # failure.
	    try:
	        with pdfplumber.open(str(file_path)) as pdf:
	            page_count = len(pdf.pages)
	            page_texts = [page.extract_text() for page in pdf.pages]
	    except Exception as e:
	        logger.error(f"PDF extraction failed: {e}")
	        raise

	    # Drop pages that produced no text (None or empty string).
	    raw_text = "\n\n".join(text for text in page_texts if text)

	    # If no text extracted, it might be a scanned PDF. The pdfplumber
	    # handle is already closed here, so OCR opens the file on its own.
	    if not raw_text.strip():
	        logger.info("No text found with pdfplumber, trying OCR...")
	        return extract_text_from_pdf_ocr(file_path)

	    metadata = {
	        "page_count": page_count,
	        "extraction_method": "pdfplumber",
	        # Very short output gets a lower score.
	        "confidence_score": 1.0 if len(raw_text) > 50 else 0.5,
	    }

	    return raw_text, metadata


	def extract_text_from_pdf_ocr(file_path: Path) -> Tuple[str, Dict]:
	    """
	    Extract text from scanned PDF using OCR (PyMuPDF + Tesseract).

	    Each page is rendered to an image at 300 DPI and passed to Tesseract;
	    the per-page results are joined with blank lines.

	    Args:
	        file_path: Path to the PDF file

	    Returns:
	        (raw_text, metadata) where metadata holds "page_count",
	        "extraction_method" and a fixed "confidence_score" of 0.7

	    Raises:
	        Exception: any error raised while opening, rendering or running
	            OCR is logged and re-raised unchanged
	    """
	    try:
	        doc = fitz.open(str(file_path))
	        try:
	            page_count = len(doc)
	            text_pages = []

	            for page in doc:
	                # Convert page to image.
	                # NOTE(review): assumes the default pixmap is 3-channel
	                # RGB without alpha, matching the "RGB" mode below.
	                pix = page.get_pixmap(dpi=300)
	                img = Image.frombytes(
	                    "RGB", (pix.width, pix.height), pix.samples
	                )

	                # OCR
	                text_pages.append(pytesseract.image_to_string(img))
	        finally:
	            # Release the document even when rendering or OCR fails.
	            doc.close()

	        raw_text = "\n\n".join(text_pages)

	        metadata = {
	            "page_count": page_count,
	            "extraction_method": "tesseract_ocr",
	            "confidence_score": 0.7,  # OCR typically less confident
	        }

	        return raw_text, metadata

	    except Exception as e:
	        logger.error(f"OCR extraction failed: {e}")
	        raise


	def extract_text_from_image(file_path: Path) -> Tuple[str, Dict]:
	    """
	    Extract text from image using OCR (Tesseract).

	    Args:
	        file_path: Path to the image file

	    Returns:
	        (raw_text, metadata) where metadata holds a "page_count" of 1,
	        "extraction_method" and a fixed "confidence_score" of 0.7

	    Raises:
	        Exception: any error raised while opening the image or running
	            OCR is logged and re-raised unchanged
	    """
	    try:
	        # The context manager closes the underlying file even when OCR
	        # raises, instead of leaving it to garbage collection.
	        with Image.open(str(file_path)) as img:
	            raw_text = pytesseract.image_to_string(img)

	        metadata = {
	            "page_count": 1,
	            "extraction_method": "tesseract_ocr",
	            "confidence_score": 0.7,
	        }

	        return raw_text, metadata

	    except Exception as e:
	        logger.error(f"Image OCR failed: {e}")
	        raise


	def extract_text(file_path: Path, mime_type: str) -> Tuple[str, Dict]:
	    """
	    Dispatch a document to the extractor that matches its MIME type.

	    PDFs go to extract_text_from_pdf; PNG and JPEG images go to
	    extract_text_from_image. The comparison is an exact string match.

	    Args:
	        file_path: Path to document
	        mime_type: MIME type of document

	    Returns:
	        (raw_text, metadata_dict)

	    Raises:
	        ValueError: if mime_type is not one of the supported types
	    """
	    image_mime_types = ("image/png", "image/jpeg", "image/jpg")

	    if mime_type == "application/pdf":
	        return extract_text_from_pdf(file_path)

	    if mime_type in image_mime_types:
	        return extract_text_from_image(file_path)

	    raise ValueError(f"Unsupported file type: {mime_type}")