Spaces:

quantumbit
/

rag-bajaj

Sleeping

Upload 39 files

e8051be verified 6 months ago

1.9 kB

	"""
	Text Extractor Module

	Handles extracting text content from PDF files.
	"""

	import pdfplumber


	class TextExtractor:
	    """Extracts text from PDF files with pdfplumber and sanity-checks the result."""

	    def __init__(self):
	        """Initialize the text extractor (it keeps no state)."""

	    async def extract_text_from_pdf(self, pdf_path: str) -> str:
	        """
	        Extract the text of every page of a PDF into a single string.

	        Each page that yields text is preceded by a marker line of the form
	        ``--- Page N ---`` (N is 1-based). Pages for which ``extract_text()``
	        returns nothing are skipped, marker included.

	        Note: nothing in this coroutine is awaited, so the extraction itself
	        runs synchronously inside it.

	        Args:
	            pdf_path: Path to the PDF file

	        Returns:
	            str: Extracted text content (empty string when no page yields text)

	        Raises:
	            Exception: If text extraction fails. The underlying error is
	                chained and available as ``__cause__``.
	        """
	        print("📖 Extracting text from PDF...")

	        # Collect the fragments and join them once instead of growing a
	        # string with += for every page.
	        parts = []
	        try:
	            with pdfplumber.open(pdf_path) as pdf:
	                for page_num, page in enumerate(pdf.pages, start=1):
	                    text = page.extract_text()
	                    if text:
	                        parts.append(f"\n--- Page {page_num} ---\n")
	                        parts.append(text)

	            full_text = "".join(parts)
	            print(f"✅ Extracted {len(full_text)} characters from PDF")
	            return full_text

	        except Exception as e:
	            # The generic Exception type is kept so existing callers keep
	            # working; "from e" preserves the root cause in the traceback.
	            raise Exception(f"Failed to extract text from PDF: {e}") from e

	    def validate_extracted_text(self, text: str, min_alpha_chars: int = 50) -> bool:
	        """
	        Validate that extracted text is not empty and contains meaningful content.

	        The text is accepted only when it is non-blank and contains strictly
	        more than ``min_alpha_chars`` alphabetic characters (as reported by
	        ``str.isalpha``).

	        Args:
	            text: The extracted text to validate
	            min_alpha_chars: Number of alphabetic characters that must be
	                exceeded for the text to count as valid (default 50)

	        Returns:
	            bool: True if text is valid, False otherwise
	        """
	        if not text or not text.strip():
	            return False

	        # Count alphabetic characters; digits, punctuation and whitespace
	        # do not contribute.
	        alphabetic_chars = sum(1 for char in text if char.isalpha())
	        return alphabetic_chars > min_alpha_chars