File size: 3,423 Bytes
d456104
 
 
 
c237f60
d456104
 
c237f60
 
 
 
 
 
 
d456104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c237f60
d456104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c237f60
 
 
 
d456104
934d814
d456104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3779ff3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
document_loader.py
------------------
Loads and parses documents from a file path or directory.
Supports PDF, TXT, DOCX, HTML, and Markdown (.md).
Returns a list of LangChain Document objects with metadata
(source filename, page number where available).

Bug Fix (Step 1):
- Changed setdefault to direct assignment so source metadata is always
  normalized to just the filename, not the full absolute path.

Enhancement (Step 2):
- Added .md (Markdown) support via TextLoader.
"""

import logging
from pathlib import Path
from typing import List

from langchain.schema import Document
from langchain_community.document_loaders import (
    PyMuPDFLoader,
    TextLoader,
    Docx2txtLoader,
    UnstructuredHTMLLoader,
)

logger = logging.getLogger(__name__)

# Map file extensions → loader class.
# Keys are lower-case and include the leading dot, matching what
# `Path.suffix.lower()` produces in load_document(); an extension with no
# entry here is rejected there as unsupported.
LOADER_MAP = {
    ".pdf":  PyMuPDFLoader,
    ".txt":  TextLoader,
    ".md":   TextLoader,        # Markdown is handled by the same loader as .txt
    ".docx": Docx2txtLoader,
    ".html": UnstructuredHTMLLoader,
    ".htm":  UnstructuredHTMLLoader,
}


def load_document(file_path: str | Path) -> List[Document]:
    """
    Parse one file into a list of LangChain Document objects.

    The loader class is picked from LOADER_MAP using the file's lower-cased
    extension. After loading, the ``source`` metadata entry of every
    returned document is replaced with the bare filename.

    Args:
        file_path: Absolute or relative path to the document.

    Returns:
        The list returned by the loader's ``load()`` call, with
        ``metadata["source"]`` set to the filename on each entry.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If the extension has no entry in LOADER_MAP.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    suffix = path.suffix.lower()
    if suffix not in LOADER_MAP:
        raise ValueError(
            f"Unsupported file type '{suffix}'. "
            f"Supported types: {', '.join(LOADER_MAP)}"
        )
    loader_cls = LOADER_MAP[suffix]

    filename = path.name
    logger.info("Loading %s with %s", filename, loader_cls.__name__)
    pages = loader_cls(str(path)).load()

    # Assign rather than setdefault(): whatever value the loader stored under
    # "source" is overwritten, so the metadata always carries just the
    # filename.
    for page in pages:
        page.metadata["source"] = filename

    logger.info("Loaded %d page(s) from %s", len(pages), filename)
    return pages


def load_documents_from_directory(directory: str | Path) -> List[Document]:
    """
    Recursively load all supported documents from a directory.

    Entries are visited in sorted path order. Anything whose lower-cased
    extension is not a key of LOADER_MAP is ignored, as are sub-directories
    themselves (their contents are still visited). A file that fails to load
    is logged as a warning and skipped, so one bad file does not abort the
    whole run.

    Args:
        directory: Path to the folder containing source documents.

    Returns:
        Flat list of Document objects from all files found.

    Raises:
        NotADirectoryError: If ``directory`` is not an existing directory.
    """
    directory = Path(directory)

    if not directory.is_dir():
        raise NotADirectoryError(f"Not a directory: {directory}")

    all_docs: List[Document] = []

    for file_path in sorted(directory.rglob("*")):
        if file_path.suffix.lower() not in LOADER_MAP:
            continue
        # rglob("*") yields directories too; a folder named e.g. "notes.md"
        # must not be handed to a loader (it would only fail and emit a
        # misleading "Skipping" warning).
        if file_path.is_dir():
            continue

        try:
            docs = load_document(file_path)
        except Exception as exc:
            # Deliberate best-effort: report the failure and carry on with
            # the remaining files.
            logger.warning("Skipping %s — %s", file_path.name, exc)
        else:
            all_docs.extend(docs)

    logger.info(
        "Loaded %d document chunk(s) from directory '%s'",
        len(all_docs),
        directory,
    )
    return all_docs