Spaces:

thundarstrom
/

research-lens

Running

App Files Files Community

research-lens / src /pdf_validator.py

thundarstrom

feat: add core backend pipelines and engine services

e3994d1 9 days ago

raw

history blame contribute delete

2.08 kB

	import os
	import fitz
	from src.utils import ValidationResult

	def validate_pdf(filepath: str) -> ValidationResult:
	    """
	    Validate that a file is a text-based PDF usable for text extraction.

	    Checks are applied in this order:

	    1. The path exists.
	    2. The name ends with ".pdf" (case-insensitive).
	    3. The file is at most 50 MB (only sizes strictly above 50 MB fail).
	    4. PyMuPDF can open the file.
	    5. The document has at least one page.
	    6. At least one page reports fonts via ``page.get_fonts()``; if none
	       does, the file is treated as a scanned PDF.
	    7. The text extracted from all pages, stripped of surrounding
	       whitespace, is at least 200 characters long.

	    Checks 1-5 return immediately with a single error. Checks 6 and 7 are
	    both always evaluated, so the result may carry both errors.

	    Args:
	        filepath: Path of the file to validate.

	    Returns:
	        A ValidationResult. Early failures set only ``is_valid=False`` and
	        ``errors``; once the page content has been inspected the result
	        also carries ``warnings`` (currently always empty) and
	        ``is_scanned``.
	    """
	    errors = []
	    warnings = []

	    # Check if exists
	    if not os.path.exists(filepath):
	        errors.append(f"File not found: {filepath}")
	        return ValidationResult(is_valid=False, errors=errors)

	    # Check extension
	    if not filepath.lower().endswith(".pdf"):
	        errors.append(f"Not a PDF file: {filepath}")
	        return ValidationResult(is_valid=False, errors=errors)

	    # Check size (a file of exactly 50MB is still accepted)
	    size_mb = os.path.getsize(filepath) / (1024 * 1024)
	    if size_mb > 50:
	        errors.append(f"File too large: {size_mb:.2f}MB (max 50MB)")
	        return ValidationResult(is_valid=False, errors=errors)

	    # Any failure to open is reported as a validation error, not raised.
	    try:
	        doc = fitz.open(filepath)
	    except Exception as e:
	        errors.append(f"Cannot open PDF: {e}")
	        return ValidationResult(is_valid=False, errors=errors)

	    # From here on the document handle is live; the finally clause makes
	    # sure it is released on the early return below and also when reading
	    # a page raises.
	    try:
	        if doc.page_count < 1:
	            errors.append("PDF has no pages.")
	            return ValidationResult(is_valid=False, errors=errors)

	        # Check for fonts and collect the text of every page
	        has_fonts = False
	        page_texts = []
	        for page in doc:
	            if page.get_fonts():
	                has_fonts = True
	            page_texts.append(page.get_text("text"))
	    finally:
	        doc.close()

	    # Join once instead of growing a string page by page.
	    text_length = len("".join(page_texts).strip())

	    is_scanned = not has_fonts
	    if is_scanned:
	        errors.append("No embedded fonts detected. This appears to be a scanned PDF and OCR is not supported.")

	    if text_length < 200:
	        errors.append(f"Not enough meaningful text found (only {text_length} chars).")

	    # Only the two content checks above can have added errors at this point.
	    return ValidationResult(is_valid=not errors, errors=errors, warnings=warnings, is_scanned=is_scanned)