# ============================================================
# FILE: src/document_loader.py
# ============================================================
# PURPOSE:
# Load documents from the local knowledge base folder.
#
# SUPPORTED FILE TYPES:
# - .txt
# - .md
# - .csv
# - .pdf
#
# In production, document loading becomes an ingestion pipeline.
# You may need:
# - file validation
# - file size limits
# - malware scanning
# - OCR for scanned PDFs
# - metadata extraction
# - document versioning
# - access control rules
# ============================================================
from dataclasses import dataclass
from pathlib import Path
from typing import List
import pandas as pd
"""
Why use the @dataclass decorator?
- Cleaner syntax for simple data containers.
- Automatic generation of __init__, __repr__, and other methods.
- Ideal for the Document class, which is just a structured way to hold data.
"""
@dataclass
class Document:
    """
    One document that has been loaded from disk.

    Attributes:
        source: Relative file path, kept for source attribution.
        text: Plain text extracted from the file.
        file_type: Original file extension.
        character_count: Character count of the text; useful for
            debugging and monitoring.
    """

    source: str
    text: str
    file_type: str
    character_count: int
def read_text_file(path: Path) -> str:
    """
    Read a plain-text file and return its contents as a string.

    The file is decoded as UTF-8. The "utf-8-sig" codec is used so that a
    leading byte-order mark (written by some editors) is dropped instead
    of leaking into the text as an invisible U+FEFF character; files
    without a BOM decode exactly as with plain "utf-8".

    errors="ignore" silently drops bytes that are not valid UTF-8, which
    prevents a full crash if the file contains unusual encoding
    characters.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")
def read_csv_file(path: Path) -> str:
    """
    Turn a CSV file into line-oriented text, one line per data row.

    RAG retrieval operates on text chunks, so each row has to be
    rendered as text before it can be embedded and retrieved.

    Each output line has the form
    "Row <n>: <column>: <value> | <column>: <value> | ...".

    Args:
        path: Location of the CSV file.

    Returns:
        All rendered rows joined with newlines; an empty string when the
        file has no data rows.
    """
    frame = pd.read_csv(path)
    # The row number is the index label plus one, which assumes the
    # integer row index that pd.read_csv produces by default.
    rendered_rows = [
        f"Row {label + 1}: "
        + " | ".join(f"{column}: {cell}" for column, cell in record.items())
        for label, record in frame.iterrows()
    ]
    return "\n".join(rendered_rows)
def read_pdf_file(path: Path) -> str:
    """
    Pull the text out of a PDF, page by page.

    Every page is rendered as a "--- Page <n> ---" marker line followed
    by that page's extracted text; pages are numbered from 1. A page that
    yields no text contributes only its marker.

    Limitation:
        pypdf handles text-based PDFs. Scanned image PDFs may come back
        empty. Production options for scanned PDFs include Tesseract
        OCR, AWS Textract, Azure Document Intelligence and Google
        Document AI.

    Args:
        path: Location of the PDF file.

    Returns:
        The per-page sections joined with newlines.

    Raises:
        ImportError: If pypdf is not installed.
    """
    # Imported lazily so the module can be used without pypdf as long as
    # no PDF is actually read.
    try:
        from pypdf import PdfReader
    except ImportError as error:
        raise ImportError("pypdf is not installed. Run: pip install pypdf") from error

    pdf = PdfReader(str(path))
    # extract_text() may return nothing; fall back to an empty string.
    page_texts = (page.extract_text() or "" for page in pdf.pages)
    return "\n".join(
        f"\n--- Page {page_number} ---\n{page_text}"
        for page_number, page_text in enumerate(page_texts, start=1)
    )
def load_single_document(path: Path, project_root: Path) -> Document:
    """
    Read one supported file and wrap its text in a Document.

    All file-type-specific dispatch lives here: the lower-cased file
    extension selects which reader function is used.

    Args:
        path: File to load.
        project_root: Directory that the stored source path is made
            relative to.

    Returns:
        A Document holding the stripped text, the relative source path,
        the lower-cased extension and the character count of the text.

    Raises:
        ValueError: If the extension is not supported, or (raised by
            Path.relative_to) if path is not located under project_root.
    """
    extension = path.suffix.lower()

    # Map each supported extension to the function that reads it.
    readers = {
        ".txt": read_text_file,
        ".md": read_text_file,
        ".csv": read_csv_file,
        ".pdf": read_pdf_file,
    }
    reader = readers.get(extension)
    if reader is None:
        raise ValueError(f"Unsupported file type: {extension}")

    content = reader(path).strip()
    relative_source = str(path.relative_to(project_root))
    return Document(
        source=relative_source,
        text=content,
        file_type=extension,
        character_count=len(content),
    )
def load_documents(folder: Path, project_root: Path) -> List[Document]:
    """
    Recursively load every supported document found under a folder.

    Files are visited in sorted path order. Anything that is not a
    regular file, or whose extension is not .txt, .md, .csv or .pdf, is
    skipped. A file that fails to load is reported on stdout and skipped
    so that one bad file does not abort the whole run. Documents whose
    text is empty are left out of the result.

    AI ENGINEER PRODUCTION TIP:
        Always keep source metadata. Without it, the app cannot explain
        where an answer came from.

    Args:
        folder: Root folder to scan.
        project_root: Passed through to load_single_document, which makes
            each document's source path relative to it.

    Returns:
        The loaded documents, in sorted path order.
    """
    allowed_suffixes = {".txt", ".md", ".csv", ".pdf"}
    loaded: List[Document] = []

    for path in sorted(folder.rglob("*")):
        if not (path.is_file() and path.suffix.lower() in allowed_suffixes):
            continue

        # Best effort: report the failure and carry on with the next file.
        try:
            candidate = load_single_document(path=path, project_root=project_root)
            if candidate.text:
                loaded.append(candidate)
        except Exception as error:
            print(f"Could not load file: {path}")
            print(f"Reason: {error}")

    return loaded