File size: 2,504 Bytes
cd6f412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import logging
from pathlib import Path
from bs4 import BeautifulSoup
from pypdf import PdfReader
from typing import Optional

logger = logging.getLogger(__name__)

def load_html_content(file_path: Path) -> Optional[str]:
    """Load an HTML file and return its text content as one whitespace-normalized string.

    The file is decoded as UTF-8 when its bytes are valid UTF-8. If they are
    not, the raw bytes are passed to BeautifulSoup so that it can detect the
    encoding itself, instead of the whole load failing on a decode error.

    Args:
        file_path: Location of the HTML file to read.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space, or ``None`` if the file cannot be read or parsed, or if no text
        remains once the non-content elements have been removed. Failures are
        logged rather than raised.
    """
    logger.debug(f"Loading HTML from: {file_path}")
    try:
        with open(file_path, 'rb') as f:
            raw = f.read()

        # Prefer a strict UTF-8 decode so files that loaded before produce the
        # same result; only fall back to BeautifulSoup's own encoding
        # detection for files that are not valid UTF-8.
        try:
            markup = raw.decode('utf-8')
        except UnicodeDecodeError:
            markup = raw
        soup = BeautifulSoup(markup, 'lxml')

        # Remove script, style, nav, footer, header, and other common clutter
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'form']):
            element.decompose()

        # Get text, strip whitespace, and collapse everything onto one line
        text = ' '.join(soup.get_text(separator=' ', strip=True).split())

        if not text:
            logger.warning(f"No text content could be extracted from {file_path}")
            return None
        return text
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip the file.
        logger.error(f"Failed to load or parse HTML file {file_path}: {e}")
        return None

def load_pdf_content(file_path: Path) -> Optional[str]:
    """Load a PDF file and return the text extracted from its pages.

    Args:
        file_path: Location of the PDF file to read.

    Returns:
        The text of every page that yielded any, each followed by a blank
        line, or ``None`` if the file does not exist, cannot be read or
        parsed, or yields no text other than whitespace. Failures are logged
        rather than raised.
    """
    logger.debug(f"Loading PDF from: {file_path}")
    if not file_path.exists():
        logger.error(f"PDF file not found at {file_path}")
        return None
    try:
        reader = PdfReader(file_path)
        page_texts = [page.extract_text() for page in reader.pages]
        # Skip pages with no extracted text; follow each remaining page with a
        # blank line so page boundaries stay visible in the combined text.
        text = "".join(f"{page_text}\n\n" for page_text in page_texts if page_text)

        # Whitespace-only output carries no content, so treat it the same as
        # an empty extraction instead of returning it as document text.
        if not text.strip():
            logger.warning(f"No text could be extracted from PDF {file_path}")
            return None
        return text
    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip the file.
        logger.error(f"Failed to load or parse PDF file {file_path}: {e}")
        return None

def load_document(file_path_str: str) -> Optional[str]:
    """
    Load a document by handing it to the loader registered for its
    file extension (matched case-insensitively).

    Returns the loader's result, or None when the path does not exist
    or the extension has no loader.
    """
    file_path = Path(file_path_str)
    if not file_path.exists():
        logger.error(f"Document not found at path: {file_path}")
        return None

    # Extension -> loader lookup table
    loaders = {
        '.html': load_html_content,
        '.pdf': load_pdf_content,
    }
    extension = file_path.suffix.lower()
    loader = loaders.get(extension)
    if loader is None:
        logger.warning(f"Unsupported file type '{extension}' for file {file_path}. Skipping.")
        return None
    return loader(file_path)