Spaces:
Sleeping
Sleeping
| """ | |
| document_loader.py | |
| ------------------ | |
| Handles loading and extracting text from different file types. | |
| Supported formats: | |
| - .txt (plain text) | |
| - .pdf (PDF documents) | |
| - .csv (comma-separated values) | |
| - .docx (Microsoft Word documents) | |
| Each loader returns a list of LangChain Document objects. | |
| A Document has two fields: | |
| - page_content : the extracted text | |
| - metadata : a dict with extra info like the source file name | |
| """ | |
| import os | |
| from langchain_core.documents import Document | |
# ── helpers ──────────────────────────────────────────────────────────────────
def _make_doc(text: str, source: str) -> Document:
    """
    Build a LangChain Document from extracted text.

    The text becomes ``page_content`` and the originating path is recorded
    under the ``"source"`` metadata key.
    """
    source_info = {"source": source}
    return Document(page_content=text, metadata=source_info)
# ── per-format loaders ───────────────────────────────────────────────────────
def load_txt(file_path: str) -> list[Document]:
    """
    Load a plain-text file and return it as a single Document.

    The file is decoded as UTF-8. A leading byte-order mark is dropped and
    undecodable bytes are skipped instead of raising.

    Returns an empty list when the file contains no non-whitespace text, in
    line with the other loaders, which skip blank pages and paragraphs. This
    lets load_document report "No readable text found" for empty .txt files
    instead of passing an empty chunk downstream.
    """
    # "utf-8-sig" reads ordinary UTF-8 as well, but strips a BOM so it does
    # not end up as a stray "\ufeff" at the start of page_content.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()

    if not text.strip():
        return []
    return [_make_doc(text, file_path)]
def load_pdf(file_path: str) -> list[Document]:
    """
    Read a PDF and return one Document per page that contains text.

    Pages are kept separate so a later citation can point at the exact page.
    The ``"page"`` metadata value is 1-based and reflects the page's position
    in the PDF, so skipped blank pages still advance the count.

    Requires: pypdf
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        raise ImportError("pypdf is required for PDF support. Run: pip install pypdf")

    pdf = PdfReader(file_path)

    # Extract each page exactly once; a page with no text layer yields "".
    numbered_text = (
        (number, page.extract_text() or "")
        for number, page in enumerate(pdf.pages, start=1)
    )

    return [
        Document(
            page_content=content,
            metadata={"source": file_path, "page": number},
        )
        for number, content in numbered_text
        if content.strip()  # drop blank pages
    ]
def load_csv(file_path: str) -> list[Document]:
    """
    Load a CSV file, producing one Document per data row.

    Each row is rendered as newline-separated 'column: value' pairs so every
    row is individually searchable. The ``"row"`` metadata value is the
    1-based position of the row among the data rows (the header is not
    counted).

    Requires: pandas
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError("pandas is required for CSV support. Run: pip install pandas")

    df = pd.read_csv(file_path)
    columns = list(df.columns)

    documents = []
    # itertuples keeps each column's own dtype. iterrows builds one Series per
    # row, which coerces mixed int/float columns to float and would render
    # "id: 1" as "id: 1.0".
    rows = df.itertuples(index=False, name=None)
    for row_num, values in enumerate(rows, start=1):
        # Build a human-readable string from each row
        row_text = "\n".join(f"{col}: {val}" for col, val in zip(columns, values))
        documents.append(
            Document(
                page_content=row_text,
                metadata={"source": file_path, "row": row_num},
            )
        )
    return documents
def load_docx(file_path: str) -> list[Document]:
    """
    Read a Microsoft Word (.docx) file, one Document per non-empty paragraph.

    Paragraph text is stripped of surrounding whitespace. The ``"paragraph"``
    metadata value is 1-based and counts every paragraph in the file, so
    skipped empty paragraphs still advance the number.

    Requires: python-docx
    """
    try:
        from docx import Document as WordDocument
    except ImportError:
        raise ImportError(
            "python-docx is required for DOCX support. Run: pip install python-docx"
        )

    paragraphs = WordDocument(file_path).paragraphs

    # Number first, filter second, so positions match the original file.
    numbered_text = (
        (position, paragraph.text.strip())
        for position, paragraph in enumerate(paragraphs, start=1)
    )

    return [
        Document(
            page_content=body,
            metadata={"source": file_path, "paragraph": position},
        )
        for position, body in numbered_text
        if body  # drop empty paragraphs
    ]
# ── main entry point ─────────────────────────────────────────────────────────
def load_document(file_path: str) -> list[Document]:
    """
    Detect the file extension and call the right loader.

    Parameters
    ----------
    file_path : str
        Full path to the file on disk.

    Returns
    -------
    list[Document]
        A list of LangChain Document objects with extracted text.

    Raises
    ------
    FileNotFoundError
        If file_path does not exist or is not a regular file.
    ValueError
        If the file type is not supported, or the loader found no text.
    Exception
        Whatever the selected loader raises if reading or parsing fails.
    """
    # isfile (not exists) so a directory such as "notes.txt/" is rejected
    # here instead of failing later inside a loader.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Extension matching is case-insensitive (".PDF" == ".pdf").
    extension = os.path.splitext(file_path)[1].lower()

    loaders = {
        ".txt": load_txt,
        ".pdf": load_pdf,
        ".csv": load_csv,
        ".docx": load_docx,
    }

    loader = loaders.get(extension)
    if loader is None:
        raise ValueError(
            f"Unsupported file type: '{extension}'. "
            f"Supported types: {', '.join(loaders)}"
        )

    # Call the appropriate loader
    documents = loader(file_path)

    # Every loader returns a list; an empty one means nothing was extracted.
    if not documents:
        raise ValueError(f"No readable text found in: {file_path}")

    print(f" OK: Loaded {len(documents)} chunk(s) from '{os.path.basename(file_path)}'")
    return documents