# ============================================================
# FILE: src/document_loader.py
# ============================================================
# PURPOSE:
# Load documents from the local knowledge base folder.
#
# SUPPORTED FILE TYPES:
# - .txt
# - .md
# - .csv
# - .pdf
#
# In production, document loading becomes an ingestion pipeline.
# You may need:
# - file validation
# - file size limits
# - malware scanning
# - OCR for scanned PDFs
# - metadata extraction
# - document versioning
# - access control rules
# ============================================================

from dataclasses import dataclass
from pathlib import Path
from typing import List

import pandas as pd

# Why the dataclass decorator?
# - Cleaner syntax for simple data containers.
# - Automatic generation of __init__, __repr__, and other methods.
# - Ideal for the Document class, which is just a structured way to
#   hold data.

# Single source of truth for the extensions load_documents will pick up.
# load_single_document dispatches on the same four suffixes.
_SUPPORTED_EXTENSIONS = frozenset({".txt", ".md", ".csv", ".pdf"})


@dataclass
class Document:
    """
    Represents one loaded document.

    source:
        - file path used for source attribution (relative to the
          project root whenever that can be computed)
    text:
        - extracted plain text, stripped of surrounding whitespace
    file_type:
        - original file extension, lower-cased (e.g. ".pdf")
    character_count:
        - len(text); useful for debugging and monitoring
    """

    source: str
    text: str
    file_type: str
    character_count: int


def read_text_file(path: Path) -> str:
    """
    Read a plain text (.txt / .md) file and return its contents.

    The file is decoded as UTF-8. "utf-8-sig" is used instead of plain
    "utf-8" so that a leading byte-order mark (written by some Windows
    editors) is dropped rather than leaking into the text as U+FEFF,
    which str.strip() does not remove.

    errors="ignore" prevents a full crash if the file contains bytes
    that are not valid UTF-8; such bytes are silently skipped.
    """
    return path.read_text(encoding="utf-8-sig", errors="ignore")


def read_csv_file(path: Path) -> str:
    """
    Read a CSV file and convert each data row into one line of text.

    Why convert CSV to text?
    RAG retrieval works on text chunks. A row must become text before
    it can be embedded and retrieved.

    Output format, one line per row (rows are numbered from 1):
        Row 1: <column>: <value> | <column>: <value> | ...

    Cells are read as strings exactly as they appear in the file.
    Letting pandas infer types and then walking the frame with
    iterrows() rewrites values: integers in all-numeric rows come out
    as "1.0", blank cells come out as "nan", and leading zeros
    ("007") are lost. Blank cells are rendered as an empty value.

    Raises:
        Whatever pandas.read_csv raises for an unreadable, empty, or
        malformed file.
    """
    # dtype=str disables numeric inference; keep_default_na=False stops
    # "", "NA", "null", ... from being turned into NaN.
    frame = pd.read_csv(path, dtype=str, keep_default_na=False)
    # Guard for any cell pandas still reports as missing (for example a
    # row with fewer fields than the header).
    frame = frame.fillna("")

    column_names = list(frame.columns)
    lines: List[str] = []
    records = frame.itertuples(index=False, name=None)
    for row_number, record in enumerate(records, start=1):
        cells = " | ".join(
            f"{column_name}: {value}"
            for column_name, value in zip(column_names, record)
        )
        lines.append(f"Row {row_number}: " + cells)
    return "\n".join(lines)


def read_pdf_file(path: Path) -> str:
    """
    Extract text from a PDF file, page by page.

    Each page is emitted under a "--- Page N ---" marker (N starts
    at 1). Pages for which no text can be extracted contribute an empty
    body under their marker.

    Important limitation:
    pypdf works for text-based PDFs. It may not work for scanned image
    PDFs.

    Production options for scanned PDFs:
    - Tesseract OCR
    - AWS Textract
    - Azure Document Intelligence
    - Google Document AI

    Raises:
        ImportError: if pypdf is not installed.
    """
    # Imported lazily so the module can be used for text/CSV files even
    # when pypdf is not available.
    try:
        from pypdf import PdfReader
    except ImportError as error:
        raise ImportError("pypdf is not installed. Run: pip install pypdf") from error

    reader = PdfReader(str(path))
    pages: List[str] = []
    for page_number, page in enumerate(reader.pages, start=1):
        # extract_text() may yield nothing for image-only pages.
        page_text = page.extract_text() or ""
        pages.append(f"\n--- Page {page_number} ---\n{page_text}")
    return "\n".join(pages)


def _source_label(path: Path, project_root: Path) -> str:
    """Return the attribution string for *path*, preferring a root-relative form."""
    try:
        return str(path.relative_to(project_root))
    except ValueError:
        # Typical cause: one path is relative and the other absolute.
        pass
    try:
        return str(path.resolve().relative_to(project_root.resolve()))
    except (ValueError, OSError, RuntimeError):
        # The file really lives outside the project root (or cannot be
        # resolved). Keep the path as given rather than dropping a
        # document that was read successfully.
        return str(path)


def load_single_document(path: Path, project_root: Path) -> Document:
    """
    Load one supported document and return a Document object.

    This function keeps file-type-specific logic in one place.

    Args:
        path: file to load; its suffix (case-insensitive) selects the
            reader.
        project_root: directory that the Document's source is
            expressed relative to, when possible.

    Returns:
        Document with stripped text, the lower-cased extension, and the
        character count of the stripped text.

    Raises:
        ValueError: if the file extension is not supported.
        ImportError: for a .pdf file when pypdf is not installed.
    """
    extension = path.suffix.lower()

    if extension in {".txt", ".md"}:
        text = read_text_file(path)
    elif extension == ".csv":
        text = read_csv_file(path)
    elif extension == ".pdf":
        text = read_pdf_file(path)
    else:
        raise ValueError(f"Unsupported file type: {extension}")

    text = text.strip()

    return Document(
        source=_source_label(path, project_root),
        text=text,
        file_type=extension,
        character_count=len(text),
    )


def load_documents(folder: Path, project_root: Path) -> List[Document]:
    """
    Load all supported documents from a folder (searched recursively).

    Files are visited in sorted path order. Files with unsupported
    extensions are skipped. A file that fails to load is reported on
    stdout and skipped, so one bad file does not abort the whole run.
    Documents whose extracted text is empty are not returned.

    Returns:
        List[Document]

    AI ENGINEER PRODUCTION TIP:
    Always keep source metadata. Without source metadata, your app
    cannot explain where an answer came from.
    """
    documents: List[Document] = []

    for path in sorted(folder.rglob("*")):
        if not path.is_file():
            continue
        if path.suffix.lower() not in _SUPPORTED_EXTENSIONS:
            continue

        # Deliberately broad: this is the best-effort boundary of the
        # loader, and any reader failure should only skip that file.
        try:
            document = load_single_document(path=path, project_root=project_root)
        except Exception as error:
            print(f"Could not load file: {path}")
            print(f"Reason: {error}")
            continue

        if document.text:
            documents.append(document)

    return documents