Spaces:
Sleeping
Sleeping
| import logging | |
| from pathlib import Path | |
| from bs4 import BeautifulSoup | |
| from pypdf import PdfReader | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
def load_html_content(file_path: Path) -> Optional[str]:
    """Load an HTML file and return its visible text as one line.

    The file is decoded as UTF-8 when possible. If its bytes are not valid
    UTF-8, the raw bytes are passed to BeautifulSoup so the parser can
    detect the encoding itself instead of the whole document being dropped.

    Args:
        file_path: Location of the HTML file to read.

    Returns:
        The document text with clutter elements removed and every run of
        whitespace collapsed to a single space, or ``None`` when the file
        cannot be read or parsed, or when no text remains.
    """
    logger.debug(f"Loading HTML from: {file_path}")
    try:
        with open(file_path, 'rb') as f:
            raw = f.read()
        try:
            # UTF-8 is tried first so UTF-8 files are decoded exactly as
            # before.
            markup = raw.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8: give BeautifulSoup the bytes and let it work out
            # the encoding rather than failing the document.
            markup = raw
        soup = BeautifulSoup(markup, 'lxml')
        # Remove script, style, nav, footer, header, and other common clutter
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form']):
            element.decompose()
        # Get text, then collapse all whitespace runs into single spaces
        text = ' '.join(soup.get_text(separator=' ', strip=True).split())
    except Exception as e:
        # Best-effort loader: any read/parse failure is logged and reported
        # to the caller as None instead of propagating.
        logger.error(f"Failed to load or parse HTML file {file_path}: {e}")
        return None
    if not text:
        logger.warning(f"No text content could be extracted from {file_path}")
        return None
    return text
def load_pdf_content(file_path: Path) -> Optional[str]:
    """Load a PDF file and return the text extracted from its pages.

    Args:
        file_path: Location of the PDF file to read.

    Returns:
        The text of every page that produced any, each followed by a blank
        line, or ``None`` when the file does not exist, cannot be read or
        parsed, or yields nothing but whitespace.
    """
    logger.debug(f"Loading PDF from: {file_path}")
    if not file_path.exists():
        logger.error(f"PDF file not found at {file_path}")
        return None
    try:
        reader = PdfReader(file_path)
        page_texts = [page.extract_text() for page in reader.pages]
    except Exception as e:
        # Best-effort loader: any read/parse failure is logged and reported
        # to the caller as None instead of propagating.
        logger.error(f"Failed to load or parse PDF file {file_path}: {e}")
        return None
    # Join once instead of growing a string page by page; pages with no
    # text are skipped and each kept page is followed by a blank line.
    text = "".join(f"{page_text}\n\n" for page_text in page_texts if page_text)
    # strip() so a document whose pages contain only whitespace is reported
    # as empty rather than returned as content.
    if not text.strip():
        logger.warning(f"No text could be extracted from PDF {file_path}")
        return None
    return text
def load_document(file_path_str: str) -> Optional[str]:
    """Load a document by dispatching on its file extension.

    The extension is matched case-insensitively. ``.html`` and ``.htm``
    files go to ``load_html_content``; ``.pdf`` files go to
    ``load_pdf_content``.

    Args:
        file_path_str: Path to the document on disk.

    Returns:
        Whatever the selected loader returns, or ``None`` when the path does
        not exist or the extension is not supported.
    """
    file_path = Path(file_path_str)
    if not file_path.exists():
        logger.error(f"Document not found at path: {file_path}")
        return None

    # Suffix -> loader table; '.htm' is the short form of '.html' and is
    # handled by the same loader.
    loaders = {
        '.html': load_html_content,
        '.htm': load_html_content,
        '.pdf': load_pdf_content,
    }
    extension = file_path.suffix.lower()
    loader = loaders.get(extension)
    if loader is None:
        logger.warning(f"Unsupported file type '{extension}' for file {file_path}. Skipping.")
        return None
    return loader(file_path)