import os

import fitz

from src.utils import ValidationResult

# Upper bound on accepted file size, in mebibytes (a file of exactly this size passes).
_MAX_SIZE_MB = 50
# Minimum number of non-whitespace-trimmed characters required across all pages.
_MIN_TEXT_CHARS = 200


def validate_pdf(filepath: str) -> ValidationResult:
    """Validate that ``filepath`` points to a usable, text-based PDF.

    Checks are applied in order; the first four abort immediately on failure:

    1. the path exists,
    2. the name ends with ``.pdf`` (case-insensitive),
    3. the file is no larger than 50 MB,
    4. PyMuPDF can open it and it has at least one page.

    The remaining two checks are both evaluated, so a single result may
    carry both errors:

    5. at least one page reports fonts (otherwise the file is treated as a
       scanned PDF and ``is_scanned`` is set),
    6. the extracted text, stripped of surrounding whitespace, is at least
       200 characters long.

    Args:
        filepath: Path of the file to validate.

    Returns:
        A ``ValidationResult``. Early failures are built with only
        ``is_valid`` and ``errors``; the final result also passes
        ``warnings`` (always empty here) and ``is_scanned``.

    Raises:
        Exception: Anything raised by PyMuPDF while reading page fonts or
            text propagates to the caller; the document is still closed.
    """
    errors = []
    warnings = []

    if not os.path.exists(filepath):
        errors.append(f"File not found: {filepath}")
        return ValidationResult(is_valid=False, errors=errors)

    if not filepath.lower().endswith(".pdf"):
        errors.append(f"Not a PDF file: {filepath}")
        return ValidationResult(is_valid=False, errors=errors)

    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    if size_mb > _MAX_SIZE_MB:
        errors.append(f"File too large: {size_mb:.2f}MB (max {_MAX_SIZE_MB}MB)")
        return ValidationResult(is_valid=False, errors=errors)

    # Deliberately broad: any failure to open is reported as a validation
    # error rather than raised.
    try:
        doc = fitz.open(filepath)
    except Exception as e:
        errors.append(f"Cannot open PDF: {str(e)}")
        return ValidationResult(is_valid=False, errors=errors)

    # From here on the document handle is released on every exit path,
    # including the early return below and any exception raised while
    # reading pages.
    try:
        if doc.page_count < 1:
            errors.append("PDF has no pages.")
            return ValidationResult(is_valid=False, errors=errors)

        has_fonts = False
        page_texts = []
        for page in doc:
            if page.get_fonts():
                has_fonts = True
            page_texts.append(page.get_text("text"))
    finally:
        doc.close()

    is_valid = True
    is_scanned = False

    if not has_fonts:
        is_scanned = True
        errors.append("No embedded fonts detected. This appears to be a scanned PDF and OCR is not supported.")
        is_valid = False

    text_length = len("".join(page_texts).strip())
    if text_length < _MIN_TEXT_CHARS:
        errors.append(f"Not enough meaningful text found (only {text_length} chars).")
        is_valid = False

    return ValidationResult(is_valid=is_valid, errors=errors, warnings=warnings, is_scanned=is_scanned)