Spaces:

nagur-shareef-shaik
/

InsuCompass-API

Sleeping

App Files Files Community

InsuCompass-API / scripts /data_processing /document_loader.py

nagur-shareef-shaik

Add Application Code

cd6f412 8 months ago

raw

history blame contribute delete

2.5 kB

	import logging
	from pathlib import Path
	from bs4 import BeautifulSoup
	from pypdf import PdfReader
	from typing import Optional

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	def load_html_content(file_path: Path) -> Optional[str]:
	    """Load an HTML file and return its text as one whitespace-normalized string.

	    The file is decoded as UTF-8 first. If it is not valid UTF-8, the raw
	    bytes are handed to BeautifulSoup so it can detect the encoding itself,
	    instead of discarding the whole document because of a decode error.

	    ``script``, ``style``, ``nav``, ``footer``, ``header``, ``aside`` and
	    ``form`` elements are removed before the text is extracted.

	    Args:
	        file_path: Path of the HTML file to read.

	    Returns:
	        The extracted text with every run of whitespace collapsed to a single
	        space, or ``None`` if the file cannot be read or parsed, or if no
	        text remains after clean-up.
	    """
	    logger.debug(f"Loading HTML from: {file_path}")
	    try:
	        with open(file_path, 'rb') as f:
	            raw = f.read()

	        try:
	            markup = raw.decode('utf-8')
	        except UnicodeDecodeError:
	            # Not UTF-8: pass the bytes through so BeautifulSoup performs its
	            # own encoding detection rather than failing the whole document.
	            logger.debug(f"{file_path} is not valid UTF-8; letting the parser detect the encoding")
	            markup = raw

	        soup = BeautifulSoup(markup, 'lxml')

	        # Remove script, style, nav, footer, header, and other common clutter
	        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form']):
	            element.decompose()

	        # Get text, then collapse all whitespace (including newlines) to single spaces
	        text = ' '.join(soup.get_text(separator=' ', strip=True).split())

	        if not text:
	            logger.warning(f"No text content could be extracted from {file_path}")
	            return None
	        return text
	    except Exception as e:
	        # Best-effort loader: any read/parse failure is logged and reported as None.
	        logger.error(f"Failed to load or parse HTML file {file_path}: {e}")
	        return None

	def load_pdf_content(file_path: Path) -> Optional[str]:
	    """Load a PDF file and return the text of its pages.

	    Pages are processed in order. A page that yields no text, or only
	    whitespace, is ignored. A page whose text extraction raises is skipped
	    with a warning so that one bad page does not discard the whole document.

	    Args:
	        file_path: Path of the PDF file to read.

	    Returns:
	        The text of every page that produced content, each followed by a
	        blank line (``"\\n\\n"``), or ``None`` if the file does not exist,
	        cannot be opened or parsed, or contains no extractable text.
	    """
	    logger.debug(f"Loading PDF from: {file_path}")
	    if not file_path.exists():
	        logger.error(f"PDF file not found at {file_path}")
	        return None
	    try:
	        reader = PdfReader(file_path)
	        page_chunks = []
	        for page_number, page in enumerate(reader.pages, start=1):
	            try:
	                page_text = page.extract_text()
	            except Exception as e:
	                # Keep going: the remaining pages may still be readable.
	                logger.warning(f"Skipping page {page_number} of PDF {file_path}: text extraction failed: {e}")
	                continue
	            # Whitespace-only output carries no content, so treat it like
	            # an empty page.
	            if page_text and page_text.strip():
	                page_chunks.append(page_text + "\n\n")  # Add space between pages

	        if not page_chunks:
	            logger.warning(f"No text could be extracted from PDF {file_path}")
	            return None
	        # Join once instead of growing a string page by page.
	        return "".join(page_chunks)
	    except Exception as e:
	        # Best-effort loader: any open/parse failure is logged and reported as None.
	        logger.error(f"Failed to load or parse PDF file {file_path}: {e}")
	        return None

	def load_document(file_path_str: str) -> Optional[str]:
	    """Load a document by dispatching on its file extension.

	    The extension is compared case-insensitively. ``.html`` and ``.htm``
	    files are handled by ``load_html_content`` and ``.pdf`` files by
	    ``load_pdf_content``.

	    Args:
	        file_path_str: Path of the document to load.

	    Returns:
	        The text returned by the matching loader, or ``None`` if the path
	        does not exist, is not a regular file, has an unsupported extension,
	        or the loader itself returned ``None``.
	    """
	    file_path = Path(file_path_str)
	    if not file_path.exists():
	        logger.error(f"Document not found at path: {file_path}")
	        return None
	    if not file_path.is_file():
	        # exists() is also true for directories, which no loader can read.
	        logger.error(f"Path is not a regular file: {file_path}")
	        return None

	    extension = file_path.suffix.lower()
	    if extension in ('.html', '.htm'):
	        return load_html_content(file_path)
	    if extension == '.pdf':
	        return load_pdf_content(file_path)

	    logger.warning(f"Unsupported file type '{extension}' for file {file_path}. Skipping.")
	    return None