File size: 4,464 Bytes
c37cfba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# ============================================================
# FILE: src/document_loader.py
# ============================================================
# PURPOSE:
# Load documents from the local knowledge base folder.
#
# SUPPORTED FILE TYPES:
# - .txt
# - .md
# - .csv
# - .pdf
#
# In production, document loading becomes an ingestion pipeline.
# You may need:
# - file validation
# - file size limits
# - malware scanning
# - OCR for scanned PDFs
# - metadata extraction
# - document versioning
# - access control rules
# ============================================================

from dataclasses import dataclass
from pathlib import Path
from typing import List

import pandas as pd

"""
Why dataclass decoration?
- Cleaner syntax for simple data containers.
- Automatic generation of __init__, __repr__, and other methods.
- Ideal for the Document class, which is just a structured way to hold data.
"""

@dataclass
class Document:
    """
    One document after it has been read from disk and converted to text.

    Fields:
    - source: file path relative to the project root, kept so that an
      answer can be attributed back to the file it came from
    - text: the plain text extracted from the file
    - file_type: the file's extension (for example ".txt" or ".pdf")
    - character_count: length of the text, handy for debugging and
      monitoring
    """

    source: str
    text: str
    file_type: str
    character_count: int


def read_text_file(path: Path) -> str:
    """
    Read a plain text file and return its contents as a string.

    Decoding choices:
    - encoding="utf-8-sig" decodes UTF-8 and drops a leading byte-order
      mark (BOM) when one is present. Files without a BOM decode exactly
      as they would with plain "utf-8". With plain "utf-8" the BOM would
      survive as an invisible U+FEFF character at the start of the text.
    - errors="ignore" drops bytes that cannot be decoded instead of
      raising, so a file with unusual encoding characters does not cause
      a full crash.

    Parameters:
    - path: location of the file to read

    Returns:
    The decoded file contents.

    Raises:
    OSError (for example FileNotFoundError) if the file cannot be opened.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")


def read_csv_file(path: Path) -> str:
    """
    Read a CSV file and convert each row into one line of readable text.

    Why convert CSV to text?
    RAG retrieval works on text chunks. A row must become text before
    it can be embedded and retrieved.

    Output format (one line per data row, rows numbered from 1):
    Row 1: <column>: <value> | <column>: <value> | ...

    Parameters:
    - path: location of the CSV file

    Returns:
    All rows joined with newlines. A CSV that has a header but no data
    rows produces an empty string.

    Raises:
    Whatever pandas.read_csv raises for a missing, empty, or malformed
    file.
    """

    df = pd.read_csv(path)
    column_names = list(df.columns)
    lines = []

    # itertuples(index=False, name=None) yields one plain tuple per row,
    # built column by column, so every value keeps its own column's dtype.
    # iterrows() would pack each row into a single Series, which upcasts
    # integers to floats ("1" becomes "1.0") whenever the frame also
    # contains a float column.
    rows = df.itertuples(index=False, name=None)

    for row_number, values in enumerate(rows, start=1):
        row_parts = [
            f"{column_name}: {value}"
            for column_name, value in zip(column_names, values)
        ]
        lines.append(f"Row {row_number}: " + " | ".join(row_parts))

    return "\n".join(lines)


def read_pdf_file(path: Path) -> str:
    """
    Pull the text out of a PDF, one labelled section per page.

    Each page is emitted as a "--- Page N ---" header (pages numbered
    from 1) followed by that page's extracted text. A page for which
    extraction yields nothing contributes an empty body.

    Known limitation:
    pypdf handles text-based PDFs. Scanned, image-only PDFs may yield
    no text at all.

    Options for scanned PDFs in production:
    - Tesseract OCR
    - AWS Textract
    - Azure Document Intelligence
    - Google Document AI

    Raises:
    ImportError if pypdf is not installed.
    """

    # Imported lazily so the module can be used without pypdf as long as
    # no PDF is actually read.
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise ImportError("pypdf is not installed. Run: pip install pypdf") from exc

    pdf = PdfReader(str(path))

    def render(number, page):
        # Fall back to an empty string when extraction returns nothing.
        body = page.extract_text() or ""
        return f"\n--- Page {number} ---\n{body}"

    return "\n".join(
        render(number, page) for number, page in enumerate(pdf.pages, start=1)
    )


def load_single_document(path: Path, project_root: Path) -> Document:
    """
    Read one supported file and wrap the result in a Document.

    All file-type-specific branching lives here: the lower-cased
    extension selects the reader, the extracted text is stripped of
    surrounding whitespace, and the source is recorded relative to
    project_root.

    Raises:
    ValueError if the extension is not one of .txt, .md, .csv, .pdf.
    Path.relative_to also raises ValueError when path is not located
    under project_root.
    """

    suffix = path.suffix.lower()

    # Choose the reader from the normalised extension; anything
    # unrecognised is rejected without reading the file.
    if suffix == ".pdf":
        raw_text = read_pdf_file(path)
    elif suffix == ".csv":
        raw_text = read_csv_file(path)
    elif suffix in (".txt", ".md"):
        raw_text = read_text_file(path)
    else:
        raise ValueError(f"Unsupported file type: {suffix}")

    cleaned = raw_text.strip()
    relative_source = str(path.relative_to(project_root))

    return Document(
        source=relative_source,
        text=cleaned,
        file_type=suffix,
        character_count=len(cleaned),
    )


def load_documents(folder: Path, project_root: Path) -> List[Document]:
    """
    Walk a folder recursively and load every supported document in it.

    Files are visited in sorted path order. Directories and files with
    an unsupported extension are skipped. A file that fails to load is
    reported on stdout and skipped, so one bad file does not stop the
    rest. Documents whose text is empty are left out of the result.

    Returns:
    List[Document]

    AI ENGINEER PRODUCTION TIP:
    Always keep source metadata. Without source metadata, your app
    cannot explain where an answer came from.
    """

    allowed_suffixes = frozenset((".txt", ".md", ".csv", ".pdf"))
    loaded = []

    for candidate in sorted(folder.rglob("*")):
        # Only regular files with a supported extension are considered.
        if not (candidate.is_file() and candidate.suffix.lower() in allowed_suffixes):
            continue

        try:
            doc = load_single_document(path=candidate, project_root=project_root)

            # Nothing to index when no text was extracted.
            if doc.text:
                loaded.append(doc)

        except Exception as failure:
            # Best-effort loading: report the problem and move on.
            print(f"Could not load file: {candidate}")
            print(f"Reason: {failure}")

    return loaded