"""
document_loader.py
------------------
Handles loading and extracting text from different file types.

Supported formats:
  - .txt   (plain text)
  - .pdf   (PDF documents)
  - .csv   (comma-separated values)
  - .docx  (Microsoft Word documents)

Each loader returns a list of LangChain Document objects.
A Document has two fields:
  - page_content : the extracted text
  - metadata     : a dict with extra info like the source file name
"""

import os

from langchain_core.documents import Document


# ── helpers ──────────────────────────────────────────────────────────────────

def _make_doc(text: str, source: str, **extra) -> Document:
    """
    Wrap extracted text in a LangChain Document with source metadata.

    Parameters
    ----------
    text : str
        The extracted text, stored as ``page_content``.
    source : str
        Path of the file the text came from, stored under ``"source"``.
    **extra
        Optional additional metadata entries (e.g. ``page=3``), stored
        after ``"source"`` in the metadata dict.

    Returns
    -------
    Document
        The wrapped text with its metadata.
    """
    return Document(page_content=text, metadata={"source": source, **extra})


# ── per-format loaders ────────────────────────────────────────────────────────

def load_txt(file_path: str) -> list[Document]:
    """
    Load a plain-text file and return it as a single Document.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``), so a
    badly encoded file never raises a decode error.
    """
    # "utf-8-sig" decodes ordinary UTF-8 and additionally strips a leading
    # byte-order mark, so a BOM-prefixed file does not leak "\ufeff" into
    # the extracted text.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()
    return [_make_doc(text, file_path)]


def load_pdf(file_path: str) -> list[Document]:
    """
    Load a PDF file page-by-page.

    Each page becomes its own Document so we can cite the exact page later.
    Pages with no extractable text are skipped; the ``"page"`` metadata is
    the 1-based position of the page in the PDF.

    Requires: pypdf

    Raises
    ------
    ImportError – if pypdf is not installed.
    """
    # Imported lazily so the module still works without pypdf when no PDF
    # is ever loaded.
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise ImportError(
            "pypdf is required for PDF support. Run: pip install pypdf"
        ) from exc

    reader = PdfReader(file_path)
    documents = []
    for page_num, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if text.strip():  # skip blank pages
            documents.append(_make_doc(text, file_path, page=page_num))
    return documents


def load_csv(file_path: str) -> list[Document]:
    """
    Load a CSV file.

    Each row is turned into a readable 'key: value' string and stored as one
    Document so every row is individually searchable. The ``"row"`` metadata
    is the 1-based position of the row among the data rows.

    Requires: pandas

    Raises
    ------
    ImportError – if pandas is not installed.
    """
    # Imported lazily so the module still works without pandas when no CSV
    # is ever loaded.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for CSV support. Run: pip install pandas"
        ) from exc

    df = pd.read_csv(file_path)
    documents = []
    # Count rows by position instead of doing arithmetic on the index label:
    # the label is only a 0-based integer when pandas assigns the default
    # index, and read_csv can pick the first column as the index instead.
    for row_num, (_, row) in enumerate(df.iterrows(), start=1):
        # Build a human-readable string from each row
        row_text = "\n".join(f"{col}: {val}" for col, val in row.items())
        documents.append(_make_doc(row_text, file_path, row=row_num))
    return documents


def load_docx(file_path: str) -> list[Document]:
    """
    Load a Microsoft Word (.docx) file.

    Each non-empty paragraph becomes its own Document. The ``"paragraph"``
    metadata is the 1-based position among all paragraphs, empty ones
    included.

    Requires: python-docx

    Raises
    ------
    ImportError – if python-docx is not installed.
    """
    # Imported lazily, and aliased so it does not shadow LangChain's Document.
    try:
        from docx import Document as WordDocument
    except ImportError as exc:
        raise ImportError(
            "python-docx is required for DOCX support. Run: pip install python-docx"
        ) from exc

    word_doc = WordDocument(file_path)
    documents = []
    for para_num, para in enumerate(word_doc.paragraphs, start=1):
        text = para.text.strip()
        if text:  # skip empty paragraphs
            documents.append(_make_doc(text, file_path, paragraph=para_num))
    return documents


# ── main entry point ──────────────────────────────────────────────────────────

def load_document(file_path: str) -> list[Document]:
    """
    Detect the file extension and call the right loader.

    Parameters
    ----------
    file_path : str
        Full path to the file on disk.

    Returns
    -------
    list[Document]
        A list of LangChain Document objects with extracted text.

    Raises
    ------
    FileNotFoundError – if nothing exists at ``file_path``.
    ValueError        – if the file type is not supported, or the loader
                        produced no documents.
    Exception         – if loading fails for any other reason.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Lower-cased so ".PDF" and ".pdf" select the same loader.
    extension = os.path.splitext(file_path)[1].lower()

    loaders = {
        ".txt": load_txt,
        ".pdf": load_pdf,
        ".csv": load_csv,
        ".docx": load_docx,
    }

    if extension not in loaders:
        raise ValueError(
            f"Unsupported file type: '{extension}'. "
            f"Supported types: {', '.join(loaders)}"
        )

    # Call the appropriate loader
    documents = loaders[extension](file_path)

    if not documents:
        raise ValueError(f"No readable text found in: {file_path}")

    print(f" OK: Loaded {len(documents)} chunk(s) from '{os.path.basename(file_path)}'")
    return documents