Document-Audit-RAG / rag / loader.py
Mayank Chugh
Refactor embedding function creation and document loading. Update ingest and query routes to remove unnecessary settings parameters, streamline chunking logic, and enhance load_documents function to handle both string and list inputs. Adjust model name in embedder for consistency with OpenAI API.
830947a
raw
history blame contribute delete
921 Bytes
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
def load_documents(paths: str | Path | list[str | Path]) -> list[Document]:
    """Load one or more files into LangChain ``Document`` objects.

    The loader is chosen from the file extension (case-insensitive):
    ``.pdf`` uses ``PyMuPDFLoader``; ``.txt`` and ``.md`` use ``TextLoader``
    with UTF-8 decoding.

    Args:
        paths: A single file path (``str`` or ``pathlib.Path``) or a list of
            such paths.

    Returns:
        The documents produced by every loader, concatenated in input order.
        Each document's metadata has ``"source"`` (the file name) and
        ``"page"`` (``0``) filled in only when those keys are not already
        present; existing values are left untouched.

    Raises:
        ValueError: If a path's extension is not one of ``.pdf``, ``.txt`` or
            ``.md``. The check happens per file as it is reached, so files
            earlier in the list have already been loaded by then.
    """
    # Wrap a lone path so the loop below always sees a sequence. Path must be
    # checked alongside str: a bare Path is not iterable and would otherwise
    # fail with a TypeError in the for-loop.
    normalized_paths = [paths] if isinstance(paths, (str, Path)) else paths

    all_docs: list[Document] = []
    for path_str in normalized_paths:
        path = Path(path_str)
        suffix = path.suffix.lower()

        # Loaders receive the caller's path text unchanged (no Path
        # normalization) so string inputs behave exactly as before.
        if suffix == ".pdf":
            loader = PyMuPDFLoader(str(path_str))
        elif suffix in {".txt", ".md"}:
            loader = TextLoader(str(path_str), encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")

        documents = loader.load()
        for doc in documents:
            # setdefault: only backfill keys the loader did not provide.
            doc.metadata.setdefault("source", path.name)
            doc.metadata.setdefault("page", 0)
        all_docs.extend(documents)

    return all_docs