"""Utilities for extracting and cleaning text, metadata and image info from PDFs.

All PDF access goes through PyMuPDF (imported as ``fitz``).
"""

import os
import re
from typing import Dict, List, Optional

import fitz  # PyMuPDF

# Marker placed between the text of consecutive pages that yielded text.
_PAGE_BREAK = "\n\n--- PAGE BREAK ---\n\n"

# Typographic characters mapped to plain equivalents. Keys are written as
# escapes so the table survives editors/tools that flatten "smart" punctuation.
_CHAR_REPLACEMENTS = str.maketrans({
    "\u2018": "'",      # left single quotation mark
    "\u2019": "'",      # right single quotation mark
    "\u201c": '"',      # left double quotation mark
    "\u201d": '"',      # right double quotation mark
    "\u2013": "-",      # en dash
    "\u2014": "--",     # em dash
    "\u2026": "...",    # ellipsis
    "\u00a0": " ",      # non-breaking space
    "\u2028": "\n",     # line separator
    "\u2029": "\n\n",   # paragraph separator
})


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract clean text from PDF file

    Pages that contain no text, or that fail to process, are skipped. The
    text of the remaining pages is joined with a ``--- PAGE BREAK ---``
    marker, so the result never ends with a dangling marker.

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        str: Extracted and cleaned text

    Raises:
        RuntimeError: If PDF cannot be opened or processed, or if no page
            yields any text
    """
    if not pdf_path or not os.path.exists(pdf_path):
        raise RuntimeError("PDF file not found or path is invalid")

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        raise RuntimeError(f"Failed to open PDF: {str(e)}") from e

    page_chunks: List[str] = []

    try:
        total_pages = doc.page_count
        print(f"📄 Processing {total_pages} pages...")

        for page_num in range(total_pages):
            # A failure on one page is reported and must not abort the rest.
            try:
                text = doc[page_num].get_text("text")
                if text.strip():
                    page_chunks.append(clean_extracted_text(text))
                print(f"✅ Page {page_num + 1} processed")
            except Exception as e:
                print(f"⚠️ Error processing page {page_num + 1}: {e}")
                continue
    except Exception as e:
        raise RuntimeError(f"Error during text extraction: {str(e)}") from e
    finally:
        doc.close()

    # Joining (instead of appending a separator per page) keeps separators
    # strictly *between* pages even when trailing pages are empty.
    full_text = _PAGE_BREAK.join(page_chunks)

    if not full_text.strip():
        raise RuntimeError(
            "No text found in PDF. "
            "The file may contain only images or be corrupted.")

    return post_process_text(full_text)


def extract_text_with_metadata(pdf_path: str) -> Dict:
    """
    Extract text with additional metadata and document info

    Args:
        pdf_path (str): Path to the PDF file

    Returns:
        dict: Complete extraction results with keys ``full_text``
            (post-processed text of all pages), ``page_texts`` (one entry
            per page; ``""`` for pages without text or that failed),
            ``page_count``, ``metadata`` (see ``clean_metadata``) and
            ``file_info`` (``file_path`` and ``file_size``)

    Raises:
        RuntimeError: If PDF cannot be opened or processed
    """
    if not pdf_path or not os.path.exists(pdf_path):
        raise RuntimeError("PDF file not found or path is invalid")

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        raise RuntimeError(f"Failed to open PDF: {str(e)}") from e

    page_texts: List[str] = []   # one entry per page, index-aligned
    page_chunks: List[str] = []  # only pages that produced text

    try:
        total_pages = doc.page_count
        print(f"📄 Processing {total_pages} pages with metadata...")

        # Extract metadata
        metadata = doc.metadata

        # Process each page
        for page_num in range(total_pages):
            try:
                text = doc[page_num].get_text("text")
                if text.strip():
                    cleaned_text = clean_extracted_text(text)
                    page_texts.append(cleaned_text)
                    page_chunks.append(cleaned_text)
                else:
                    page_texts.append("")
                print(f"✅ Page {page_num + 1} processed")
            except Exception as e:
                print(f"⚠️ Error processing page {page_num + 1}: {e}")
                # Keep page_texts aligned with page numbers.
                page_texts.append("")
                continue

        file_size = os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 0

        return {
            'full_text': post_process_text(_PAGE_BREAK.join(page_chunks)),
            'page_texts': page_texts,
            'page_count': total_pages,
            'metadata': clean_metadata(metadata),
            'file_info': {
                'file_path': pdf_path,
                'file_size': file_size,
            },
        }
    except Exception as e:
        raise RuntimeError(
            f"Error during extraction with metadata: {str(e)}") from e
    finally:
        doc.close()


def clean_extracted_text(text: str) -> str:
    """
    Clean raw extracted text from PDF artifacts

    Removes form feeds, normalizes line endings, re-joins words hyphenated
    across a line break, collapses runs of spaces/tabs, trims blanks around
    newlines and limits consecutive newlines to two.

    Args:
        text (str): Raw text from PDF

    Returns:
        str: Cleaned text
    """
    if not text:
        return ""

    try:
        # Remove form feed characters
        text = text.replace('\f', '')

        # Normalize line endings first, so the newline-anchored rules below
        # also apply to text that uses CRLF or bare CR.
        text = re.sub(r'\r\n?', '\n', text)

        # Fix hyphenated words broken across lines
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

        # Normalize whitespace
        text = re.sub(r'[ \t]+', ' ', text)    # Multiple spaces to single
        text = re.sub(r'\n[ \t]+', '\n', text)  # Spaces after newlines
        text = re.sub(r'[ \t]+\n', '\n', text)  # Spaces before newlines

        # Remove excessive blank lines
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    except Exception as e:
        # Best effort: fall back to the partially cleaned text.
        print(f"Warning: Error cleaning text: {e}")
        return text.strip() if text else ""


def post_process_text(text: str) -> str:
    """
    Final post-processing of extracted text

    Replaces typographic quotes, dashes, ellipses and special space/line
    separator characters with plain equivalents, drops lines consisting of a
    single letter or a 1-3 digit number, and collapses excess blank lines.

    Args:
        text (str): Text to post-process

    Returns:
        str: Final processed text
    """
    if not text:
        return ""

    try:
        # Fix common character encoding issues (single pass over the text)
        text = text.translate(_CHAR_REPLACEMENTS)

        # Remove isolated single characters (OCR artifacts)
        text = re.sub(r'\n[a-zA-Z]\n', '\n', text)

        # Remove standalone numbers (likely page numbers)
        text = re.sub(r'\n\s*\d{1,3}\s*\n', '\n', text)

        # Final whitespace cleanup
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    except Exception as e:
        # Best effort: fall back to whatever was processed so far.
        print(f"Warning: Error in post-processing: {e}")
        return text.strip() if text else ""


def clean_metadata(metadata: dict) -> dict:
    """
    Clean and structure PDF metadata

    Only a fixed set of known fields is kept. Values that are not strings,
    are blank, or equal ``'Unknown'`` are dropped.

    Args:
        metadata (dict): Raw metadata from PDF

    Returns:
        dict: Cleaned metadata keyed by human-readable field name
    """
    if not metadata:
        return {}

    try:
        cleaned = {}

        # Common metadata fields: raw key -> display name
        field_mapping = {
            'title': 'Title',
            'author': 'Author',
            'subject': 'Subject',
            'creator': 'Creator',
            'producer': 'Producer',
            'creationDate': 'Creation Date',
            'modDate': 'Modification Date',
        }

        for key, display_name in field_mapping.items():
            value = metadata.get(key, '')
            if value and isinstance(value, str):
                # Clean the value
                value = value.strip()
                if value and value != 'Unknown':
                    cleaned[display_name] = value

        return cleaned

    except Exception as e:
        print(f"Warning: Error cleaning metadata: {e}")
        return {}


def validate_pdf(pdf_path: str) -> bool:
    """
    Validate if the file is a readable PDF

    The path must exist, end in ``.pdf`` (case-insensitive), open with
    PyMuPDF and contain at least one page.

    Args:
        pdf_path (str): Path to PDF file

    Returns:
        bool: True if valid PDF, False otherwise (including on any error)
    """
    try:
        if not pdf_path or not os.path.exists(pdf_path):
            return False

        # Check file extension
        if not pdf_path.lower().endswith('.pdf'):
            return False

        # Try to open with PyMuPDF
        doc = fitz.open(pdf_path)
        try:
            # Check if document has pages
            return doc.page_count > 0
        finally:
            # Always release the document, even if reading page_count fails.
            doc.close()

    except Exception:
        return False


def get_pdf_info(pdf_path: str) -> dict:
    """
    Get basic information about PDF without extracting text

    Args:
        pdf_path (str): Path to PDF file

    Returns:
        dict: ``page_count``, ``file_size``, ``is_encrypted`` and
            ``metadata`` on success; ``{'error': ...}`` if the file fails
            validation or any step raises
    """
    try:
        if not validate_pdf(pdf_path):
            return {'error': 'Invalid PDF file'}

        doc = fitz.open(pdf_path)
        try:
            return {
                'page_count': doc.page_count,
                'file_size': os.path.getsize(pdf_path),
                'is_encrypted': doc.needs_pass,
                'metadata': clean_metadata(doc.metadata),
            }
        finally:
            doc.close()

    except Exception as e:
        return {'error': f'Error getting PDF info: {str(e)}'}


def extract_images_info(pdf_path: str) -> List[dict]:
    """
    Extract information about images in the PDF

    Args:
        pdf_path (str): Path to PDF file

    Returns:
        list: List of image information dictionaries with ``page``
            (1-based), ``index`` (position within the page's image list),
            ``width`` and ``height``; empty if the file fails validation or
            an error occurs
    """
    try:
        if not validate_pdf(pdf_path):
            return []

        doc = fitz.open(pdf_path)
        try:
            images_info = []
            for page_num in range(doc.page_count):
                image_list = doc[page_num].get_images()
                for img_index, img in enumerate(image_list):
                    # NOTE(review): entries 2 and 3 are taken as width and
                    # height, per PyMuPDF's image tuple layout - confirm
                    # against the installed PyMuPDF version.
                    images_info.append({
                        'page': page_num + 1,
                        'index': img_index,
                        'width': img[2] if len(img) > 2 else None,
                        'height': img[3] if len(img) > 3 else None,
                    })
            return images_info
        finally:
            doc.close()

    except Exception as e:
        print(f"Warning: Error extracting image info: {e}")
        return []


# Test functionality
def test_pdf_reader():
    """Test the PDF reader functionality"""
    print("=== PDF Reader Test ===")

    # This would need an actual PDF file to test
    test_pdf = "sample.pdf"  # Replace with actual PDF path

    try:
        if os.path.exists(test_pdf):
            print(f"Testing with: {test_pdf}")

            # Test validation
            is_valid = validate_pdf(test_pdf)
            print(f"Valid PDF: {is_valid}")

            if is_valid:
                # Test basic info
                info = get_pdf_info(test_pdf)
                print(f"Pages: {info.get('page_count', 'Unknown')}")

                # Test text extraction
                text = extract_text_from_pdf(test_pdf)
                print(f"Extracted {len(text)} characters")
                print(f"First 100 chars: {text[:100]}...")
        else:
            print("No test PDF found. Create a 'sample.pdf' to test.")

    except Exception as e:
        print(f"Test failed: {e}")


if __name__ == "__main__":
    test_pdf_reader()