Ragbase_Studio / src /document_loader.py
Abdul2000's picture
Rename document_loader.py to src/document_loader.py
6558fe5 verified
Raw
History Blame Contribute Delete
5.3 kB
"""
document_loader.py
------------------
Handles loading and extracting text from different file types.
Supported formats:
- .txt (plain text)
- .pdf (PDF documents)
- .csv (comma-separated values)
- .docx (Microsoft Word documents)
Each loader returns a list of LangChain Document objects.
A Document has two fields:
- page_content : the extracted text
- metadata : a dict with extra info like the source file name
"""
import os
from langchain_core.documents import Document
# ── helpers ──────────────────────────────────────────────────────────────────
def _make_doc(text: str, source: str) -> Document:
    """Build a LangChain Document from *text*, tagging it with its *source*.

    Parameters
    ----------
    text : str
        Extracted text; stored as the Document's ``page_content``.
    source : str
        Where the text came from (``load_txt`` passes the file path);
        stored under the ``"source"`` metadata key.

    Returns
    -------
    Document
        A Document whose metadata is exactly ``{"source": source}``.
    """
    metadata = {"source": source}
    return Document(page_content=text, metadata=metadata)
# ── per-format loaders ────────────────────────────────────────────────────────
def load_txt(file_path: str) -> list[Document]:
    """Load a plain-text file and return it as a single Document.

    The file is decoded as UTF-8. A leading byte-order mark (BOM) is
    removed, and bytes that cannot be decoded are dropped instead of
    raising, so partially corrupt files still load.

    Parameters
    ----------
    file_path : str
        Path to the text file on disk.

    Returns
    -------
    list[Document]
        A one-element list holding the whole file, or an empty list when
        the file contains nothing but whitespace. Returning an empty list
        mirrors the PDF and DOCX loaders, which skip blank content, and
        lets ``load_document`` report "No readable text found".
    """
    # "utf-8-sig" decodes exactly like "utf-8" except that it strips a
    # leading BOM, which would otherwise end up as "\ufeff" at the start
    # of the text.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()

    if not text.strip():
        return []
    return [_make_doc(text, file_path)]
def load_pdf(file_path: str) -> list[Document]:
    """
    Extract the text of a PDF, one Document per non-blank page.

    Keeping pages separate means each Document's metadata can carry the
    page it came from, so an exact page can be cited later.

    Requires: pypdf

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    list[Document]
        One Document per page that yielded non-whitespace text. Metadata
        holds ``"source"`` (the file path) and ``"page"`` (1-based).

    Raises
    ------
    ImportError – if pypdf is not installed.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        raise ImportError("pypdf is required for PDF support. Run: pip install pypdf")

    pdf = PdfReader(file_path)
    page_docs = []
    # start=1 so the stored page number is human-friendly (1-based).
    for number, pdf_page in enumerate(pdf.pages, start=1):
        # extract_text() may return nothing; normalise that to "".
        extracted = pdf_page.extract_text() or ""
        if not extracted.strip():
            continue  # blank page: nothing worth indexing
        page_docs.append(
            Document(
                page_content=extracted,
                metadata={"source": file_path, "page": number},
            )
        )
    return page_docs
def load_csv(file_path: str) -> list[Document]:
    """
    Load a CSV file, one Document per data row.

    Each row is turned into a readable block of 'column: value' lines so
    every row is individually searchable.

    Cell values are kept exactly as written in the file: they are read
    as strings, so ``007`` stays ``007`` and ``1`` is not rendered as
    ``1.0``. Empty cells render as an empty value rather than ``nan``.

    Requires: pandas

    Parameters
    ----------
    file_path : str
        Path to the CSV file on disk.

    Returns
    -------
    list[Document]
        One Document per data row. Metadata holds ``"source"`` (the file
        path) and ``"row"`` (1-based position of the row in the data,
        not counting the header line).

    Raises
    ------
    ImportError – if pandas is not installed.
    """
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for CSV support. Run: pip install pandas"
        ) from exc

    # dtype=str disables type inference, and keep_default_na=False stops
    # empty cells or strings such as "NA" from being turned into NaN.
    # Together they preserve the text exactly as written in the file.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    columns = list(df.columns)

    documents = []
    # Plain tuples avoid building a Series per row, and enumerate() gives
    # a positional row number that does not depend on the index labels.
    rows = df.itertuples(index=False, name=None)
    for row_num, values in enumerate(rows, start=1):
        row_text = "\n".join(
            f"{col}: {val}" for col, val in zip(columns, values)
        )
        documents.append(
            Document(
                page_content=row_text,
                metadata={"source": file_path, "row": row_num},
            )
        )
    return documents
def load_docx(file_path: str) -> list[Document]:
    """
    Extract the text of a Microsoft Word (.docx) file, one Document per
    non-empty paragraph.

    Requires: python-docx

    Parameters
    ----------
    file_path : str
        Path to the .docx file on disk.

    Returns
    -------
    list[Document]
        One Document per paragraph that still has text after stripping
        surrounding whitespace. Metadata holds ``"source"`` (the file
        path) and ``"paragraph"``, the 1-based position of the paragraph
        among all paragraphs, empty ones included.

    Raises
    ------
    ImportError – if python-docx is not installed.
    """
    try:
        from docx import Document as WordDocument
    except ImportError:
        raise ImportError(
            "python-docx is required for DOCX support. Run: pip install python-docx"
        )

    paragraphs = WordDocument(file_path).paragraphs
    # Number every paragraph first, so skipped empty ones still advance
    # the count.
    numbered = (
        (position, paragraph.text.strip())
        for position, paragraph in enumerate(paragraphs, start=1)
    )
    return [
        Document(
            page_content=body,
            metadata={"source": file_path, "paragraph": position},
        )
        for position, body in numbered
        if body
    ]
# ── main entry point ──────────────────────────────────────────────────────────
def load_document(file_path: str) -> list[Document]:
    """
    Pick a loader from the file's extension and run it.

    The extension is compared case-insensitively. On success a one-line
    summary is printed to standard output.

    Parameters
    ----------
    file_path : str
        Full path to the file on disk.

    Returns
    -------
    list[Document]
        A non-empty list of LangChain Document objects with extracted text.

    Raises
    ------
    FileNotFoundError – if nothing exists at ``file_path``.
    ValueError        – if the file type is not supported, or the loader
                        produced no Documents.
    Exception         – anything raised by the selected loader propagates
                        unchanged.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Extension -> loader dispatch table.
    dispatch = {
        ".txt": load_txt,
        ".pdf": load_pdf,
        ".csv": load_csv,
        ".docx": load_docx,
    }

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    handler = dispatch.get(suffix)
    if handler is None:
        raise ValueError(
            f"Unsupported file type: '{suffix}'. "
            f"Supported types: {', '.join(dispatch)}"
        )

    docs = handler(file_path)
    if not docs:
        # The loader ran but found nothing to index.
        raise ValueError(f"No readable text found in: {file_path}")

    print(f" OK: Loaded {len(docs)} chunk(s) from '{os.path.basename(file_path)}'")
    return docs