Spaces:

uumerrr684
/

chunkify-smart-pdf-chunker

Sleeping

App Files Files Community

chunkify-smart-pdf-chunker / pdf_reader.py

uumerrr684

✅ Fix PDF reader functions and error handling

1b2f162 6 months ago

raw

history blame contribute delete

10.6 kB

	import fitz # PyMuPDF
	import re
	import os
	from typing import Dict, List, Optional


	def extract_text_from_pdf(pdf_path: str) -> str:
	    """
	    Extract clean text from PDF file

	    Every page is read with PyMuPDF and cleaned with clean_extracted_text.
	    Pages that yielded text are joined with a "--- PAGE BREAK ---" marker
	    and the result is passed through post_process_text.

	    Args:
	        pdf_path (str): Path to the PDF file

	    Returns:
	        str: Extracted and cleaned text

	    Raises:
	        RuntimeError: If the path is empty or does not exist, the PDF cannot
	            be opened or processed, or no text could be extracted
	    """
	    if not pdf_path or not os.path.exists(pdf_path):
	        raise RuntimeError("PDF file not found or path is invalid")

	    try:
	        doc = fitz.open(pdf_path)
	    except Exception as e:
	        raise RuntimeError(f"Failed to open PDF: {str(e)}") from e

	    page_separator = "\n\n--- PAGE BREAK ---\n\n"
	    pages = []

	    try:
	        total_pages = doc.page_count
	        print(f"📄 Processing {total_pages} pages...")

	        for page_num in range(total_pages):
	            try:
	                text = doc[page_num].get_text("text")

	                # Only pages that actually contain text contribute output
	                if text.strip():
	                    pages.append(clean_extracted_text(text))

	                print(f"✅ Page {page_num + 1} processed")

	            except Exception as e:
	                # Best effort: a single unreadable page must not abort the
	                # whole document, so report it and move on.
	                print(f"⚠️ Error processing page {page_num + 1}: {e}")

	    except Exception as e:
	        raise RuntimeError(f"Error during text extraction: {str(e)}") from e

	    finally:
	        doc.close()

	    # Joining (instead of appending the separator after each page) guarantees
	    # the marker only ever sits *between* two pages of text, so empty or
	    # failed trailing pages no longer leave a dangling page-break marker.
	    full_text = page_separator.join(pages)

	    if not full_text.strip():
	        raise RuntimeError(
	            "No text found in PDF. The file may contain only images or be corrupted.")

	    return post_process_text(full_text)


	def extract_text_with_metadata(pdf_path: str) -> Dict:
	    """
	    Extract text with additional metadata and document info

	    Args:
	        pdf_path (str): Path to the PDF file

	    Returns:
	        dict: Complete extraction results with the keys
	            'full_text'  - post-processed text of all pages, with a
	                           "--- PAGE BREAK ---" marker between pages
	            'page_texts' - one cleaned string per page ("" for pages that
	                           had no text or could not be read)
	            'page_count' - number of pages in the document
	            'metadata'   - document metadata passed through clean_metadata
	            'file_info'  - dict with 'file_path' and 'file_size' (bytes)

	    Raises:
	        RuntimeError: If the path is empty or does not exist, or the PDF
	            cannot be opened or processed
	    """
	    if not pdf_path or not os.path.exists(pdf_path):
	        raise RuntimeError("PDF file not found or path is invalid")

	    try:
	        doc = fitz.open(pdf_path)
	    except Exception as e:
	        raise RuntimeError(f"Failed to open PDF: {str(e)}") from e

	    page_separator = "\n\n--- PAGE BREAK ---\n\n"
	    page_texts = []

	    try:
	        total_pages = doc.page_count
	        print(f"📄 Processing {total_pages} pages with metadata...")

	        # Extract metadata
	        metadata = doc.metadata

	        # Process each page
	        for page_num in range(total_pages):
	            try:
	                text = doc[page_num].get_text("text")

	                # Keep one entry per page so indexes line up with page numbers
	                if text.strip():
	                    page_texts.append(clean_extracted_text(text))
	                else:
	                    page_texts.append("")

	                print(f"✅ Page {page_num + 1} processed")

	            except Exception as e:
	                # Best effort: record the page as empty and keep going
	                print(f"⚠️ Error processing page {page_num + 1}: {e}")
	                page_texts.append("")

	        # Join only the pages that produced text so the marker never ends up
	        # dangling after the last page with content.
	        full_text = page_separator.join(
	            page_text for page_text in page_texts if page_text)

	        return {
	            'full_text': post_process_text(full_text),
	            'page_texts': page_texts,
	            'page_count': total_pages,
	            'metadata': clean_metadata(metadata),
	            'file_info': {
	                'file_path': pdf_path,
	                'file_size': os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0
	            }
	        }

	    except Exception as e:
	        raise RuntimeError(
	            f"Error during extraction with metadata: {str(e)}") from e

	    finally:
	        doc.close()


	def clean_extracted_text(text: str) -> str:
	    """
	    Clean raw extracted text from PDF artifacts

	    Normalizes line endings, drops form feeds, re-joins words hyphenated
	    across a line break, collapses runs of spaces/tabs, trims spaces around
	    newlines and limits consecutive blank lines to one.

	    Args:
	        text (str): Raw text from PDF

	    Returns:
	        str: Cleaned text ("" for empty or None input)
	    """
	    if not text:
	        return ""

	    try:
	        # Normalize line endings first so every newline-based rule below
	        # also applies to CRLF / CR input
	        text = re.sub(r'\r\n?', '\n', text)

	        # Remove form feed characters
	        text = text.replace('\f', '')

	        # Fix hyphenated words broken across lines. Lookarounds keep the
	        # neighbouring letters out of the match so consecutive breaks are all
	        # repaired; only spaces/tabs are allowed around the newline so a
	        # blank line (paragraph break) is never swallowed.
	        text = re.sub(r'(?<=\w)-[ \t]*\n[ \t]*(?=\w)', '', text)

	        # Normalize whitespace
	        text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces to single
	        text = re.sub(r'\n[ \t]+', '\n', text)  # Spaces after newlines
	        text = re.sub(r'[ \t]+\n', '\n', text)  # Spaces before newlines

	        # Remove excessive blank lines
	        text = re.sub(r'\n{3,}', '\n\n', text)

	        return text.strip()

	    except Exception as e:
	        # Best effort: fall back to a plain strip of whatever we have
	        print(f"Warning: Error cleaning text: {e}")
	        return text.strip() if text else ""


	def post_process_text(text: str) -> str:
	"""
	Final post-processing of extracted text

	Args:
	text (str): Text to post-process

	Returns:
	str: Final processed text
	"""
	if not text:
	return ""

	try:
	# Fix common character encoding issues
	replacements = {
	''': "'", # Smart quotes
	''': "'",
	'"': '"',
	'"': '"',
	'–': '-', # En dash
	'—': '--', # Em dash
	'…': '...', # Ellipsis
	'\u00a0': ' ', # Non-breaking space
	'\u2028': '\n', # Line separator
	'\u2029': '\n\n', # Paragraph separator
	}

	for old_char, new_char in replacements.items():
	text = text.replace(old_char, new_char)

	# Remove isolated single characters (OCR artifacts)
	text = re.sub(r'\n[a-zA-Z]\n', '\n', text)

	# Remove standalone numbers (likely page numbers)
	text = re.sub(r'\n\s\d{1,3}\s\n', '\n', text)

	# Final whitespace cleanup
	text = re.sub(r'\n{3,}', '\n\n', text)

	return text.strip()

	except Exception as e:
	print(f"Warning: Error in post-processing: {e}")
	return text.strip() if text else ""


	def clean_metadata(metadata: dict) -> dict:
	    """
	    Clean and structure PDF metadata

	    Picks the well-known metadata fields, trims their values and returns
	    them under human-readable names. Fields that are missing, not strings,
	    blank, or equal to 'Unknown' are left out.

	    Args:
	        metadata (dict): Raw metadata from PDF

	    Returns:
	        dict: Cleaned metadata ({} when there is nothing usable or the
	            input cannot be read)
	    """
	    if not metadata:
	        return {}

	    # (raw key, display name) pairs, in output order
	    known_fields = (
	        ('title', 'Title'),
	        ('author', 'Author'),
	        ('subject', 'Subject'),
	        ('creator', 'Creator'),
	        ('producer', 'Producer'),
	        ('creationDate', 'Creation Date'),
	        ('modDate', 'Modification Date'),
	    )

	    try:
	        result = {}

	        for raw_key, label in known_fields:
	            raw_value = metadata.get(raw_key, '')

	            # Skip anything that is empty or not text
	            if not (raw_value and isinstance(raw_value, str)):
	                continue

	            trimmed = raw_value.strip()
	            if trimmed and trimmed != 'Unknown':
	                result[label] = trimmed

	        return result

	    except Exception as e:
	        print(f"Warning: Error cleaning metadata: {e}")
	        return {}


	def validate_pdf(pdf_path: str) -> bool:
	    """
	    Validate if the file is a readable PDF

	    The path must exist, end in ".pdf" (case-insensitive), open with
	    PyMuPDF and contain at least one page.

	    Args:
	        pdf_path (str): Path to PDF file

	    Returns:
	        bool: True if valid PDF, False otherwise (never raises)
	    """
	    try:
	        if not pdf_path or not os.path.exists(pdf_path):
	            return False

	        # Check file extension
	        if not pdf_path.lower().endswith('.pdf'):
	            return False

	        # Try to open with PyMuPDF
	        doc = fitz.open(pdf_path)
	        try:
	            # Check if document has pages
	            return doc.page_count > 0
	        finally:
	            # Always release the handle, even if reading the page count fails
	            doc.close()

	    except Exception:
	        # Any failure to open or inspect the file means "not a valid PDF"
	        return False


	def get_pdf_info(pdf_path: str) -> dict:
	    """
	    Get basic information about PDF without extracting text

	    Args:
	        pdf_path (str): Path to PDF file

	    Returns:
	        dict: Basic PDF information with the keys 'page_count',
	            'file_size' (bytes), 'is_encrypted' and 'metadata'; on failure
	            a dict with a single 'error' key describing the problem
	            (this function never raises)
	    """
	    try:
	        if not validate_pdf(pdf_path):
	            return {'error': 'Invalid PDF file'}

	        doc = fitz.open(pdf_path)
	        try:
	            return {
	                'page_count': doc.page_count,
	                'file_size': os.path.getsize(pdf_path),
	                'is_encrypted': doc.needs_pass,
	                'metadata': clean_metadata(doc.metadata)
	            }
	        finally:
	            # Always release the handle, even if one of the lookups fails
	            doc.close()

	    except Exception as e:
	        return {'error': f'Error getting PDF info: {str(e)}'}


	def extract_images_info(pdf_path: str) -> List[dict]:
	    """
	    Extract information about images in the PDF

	    Args:
	        pdf_path (str): Path to PDF file

	    Returns:
	        list: List of image information dictionaries, one per image, with
	            the keys 'page' (1-based page number), 'index' (position of the
	            image on its page), 'width' and 'height'. Empty when the file
	            is not a valid PDF or an error occurs (this function never
	            raises).
	    """
	    try:
	        if not validate_pdf(pdf_path):
	            return []

	        doc = fitz.open(pdf_path)
	        try:
	            images_info = []

	            for page_num in range(doc.page_count):
	                for img_index, img in enumerate(doc[page_num].get_images()):
	                    images_info.append({
	                        'page': page_num + 1,
	                        'index': img_index,
	                        # Entries 2 and 3 of each image tuple are read as
	                        # width and height; guard against shorter tuples
	                        'width': img[2] if len(img) > 2 else None,
	                        'height': img[3] if len(img) > 3 else None,
	                    })

	            return images_info
	        finally:
	            # Always release the handle, even if a page cannot be read
	            doc.close()

	    except Exception as e:
	        print(f"Warning: Error extracting image info: {e}")
	        return []

	# Test functionality


	def test_pdf_reader():
	    """
	    Manual smoke test for the PDF reader.

	    Looks for "sample.pdf" in the working directory; when present, runs
	    validation, basic info lookup and text extraction against it and prints
	    the results. Any failure is reported on stdout rather than raised.
	    """
	    print("=== PDF Reader Test ===")

	    # This would need an actual PDF file to test
	    sample_path = "sample.pdf"  # Replace with actual PDF path

	    try:
	        # Nothing to exercise without a sample document
	        if not os.path.exists(sample_path):
	            print("No test PDF found. Create a 'sample.pdf' to test.")
	            return

	        print(f"Testing with: {sample_path}")

	        # Validation
	        valid = validate_pdf(sample_path)
	        print(f"Valid PDF: {valid}")
	        if not valid:
	            return

	        # Basic info
	        pdf_info = get_pdf_info(sample_path)
	        print(f"Pages: {pdf_info.get('page_count', 'Unknown')}")

	        # Text extraction
	        extracted = extract_text_from_pdf(sample_path)
	        print(f"Extracted {len(extracted)} characters")
	        print(f"First 100 chars: {extracted[:100]}...")

	    except Exception as e:
	        print(f"Test failed: {e}")


	# Run the manual smoke test only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    test_pdf_reader()