Spaces:
Running
Running
| import os | |
| import fitz | |
| from src.utils import ValidationResult | |
def validate_pdf(filepath: str) -> ValidationResult:
    """
    Validate that a file is a text-based PDF that can be processed.

    The checks run in order, and the first four stop at the first failure:

    1. The path exists.
    2. The path ends with ``.pdf`` (case-insensitive).
    3. The file is at most 50 MB (a file of exactly 50 MB is accepted).
    4. PyMuPDF (``fitz``) can open it and it has at least one page.
    5. At least one page reports fonts via ``page.get_fonts()``; if none
       does, the file is treated as scanned and rejected.
    6. The text extracted from all pages, stripped of surrounding
       whitespace, is at least 200 characters long.

    Checks 5 and 6 are both evaluated, so a single result may carry both
    error messages.

    Args:
        filepath: Path of the file to validate.

    Returns:
        A ``ValidationResult``. Failures in checks 1-4 return a result built
        with only ``is_valid=False`` and ``errors``. Otherwise the result
        also carries ``warnings`` (currently always empty) and
        ``is_scanned``.

    Raises:
        Any exception raised by PyMuPDF while reading pages propagates to
        the caller; the document is closed first.
    """
    max_size_mb = 50
    min_text_chars = 200

    errors = []
    warnings = []

    # Check if exists
    if not os.path.exists(filepath):
        errors.append(f"File not found: {filepath}")
        return ValidationResult(is_valid=False, errors=errors)

    # Check extension
    if not filepath.lower().endswith(".pdf"):
        errors.append(f"Not a PDF file: {filepath}")
        return ValidationResult(is_valid=False, errors=errors)

    # Check size (strictly greater than the limit is rejected)
    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    if size_mb > max_size_mb:
        errors.append(f"File too large: {size_mb:.2f}MB (max {max_size_mb}MB)")
        return ValidationResult(is_valid=False, errors=errors)

    # Any failure to open is reported as a validation error, not raised,
    # so the broad catch here is deliberate.
    try:
        doc = fitz.open(filepath)
    except Exception as e:
        errors.append(f"Cannot open PDF: {str(e)}")
        return ValidationResult(is_valid=False, errors=errors)

    # All access to the document happens inside this try so that it is
    # closed on every path, including the early return and any exception
    # raised while reading a page.
    try:
        if doc.page_count < 1:
            errors.append("PDF has no pages.")
            return ValidationResult(is_valid=False, errors=errors)

        # Check for embedded fonts and collect text from every page
        has_fonts = False
        page_texts = []
        for page in doc:
            if page.get_fonts():
                has_fonts = True
            page_texts.append(page.get_text("text"))
    finally:
        doc.close()

    is_valid = True
    is_scanned = False

    if not has_fonts:
        is_scanned = True
        errors.append("No embedded fonts detected. This appears to be a scanned PDF and OCR is not supported.")
        is_valid = False

    # Join once instead of concatenating per page.
    text_length = len("".join(page_texts).strip())
    if text_length < min_text_chars:
        errors.append(f"Not enough meaningful text found (only {text_length} chars).")
        is_valid = False

    return ValidationResult(is_valid=is_valid, errors=errors, warnings=warnings, is_scanned=is_scanned)