Mayank Chugh
Refactor embedding function creation and document loading. Update ingest and query routes to remove unnecessary settings parameters, streamline chunking logic, and enhance load_documents function to handle both string and list inputs. Adjust model name in embedder for consistency with OpenAI API.
830947a | from pathlib import Path | |
| from langchain_core.documents import Document | |
| from langchain_community.document_loaders import PyMuPDFLoader, TextLoader | |
def load_documents(paths: str | Path | list[str | Path]) -> list[Document]:
    """Load one or more files into LangChain ``Document`` objects.

    Args:
        paths: A single file path, or a list of file paths. Each path may be
            given as a ``str`` or a ``pathlib.Path``.

    Returns:
        The documents produced by each file's loader, concatenated in the
        order the paths were given. An empty list of paths yields ``[]``.

    Raises:
        ValueError: If a path's extension is not ``.pdf``, ``.txt`` or
            ``.md`` (compared case-insensitively). The error is raised when
            the offending path is reached, so documents already loaded from
            earlier paths are not returned.
    """
    # A lone path must be wrapped before iterating: a str would be walked
    # character by character, and a Path is not iterable at all.
    normalized_paths = [paths] if isinstance(paths, (str, Path)) else paths

    all_docs: list[Document] = []
    for raw_path in normalized_paths:
        path = Path(raw_path)
        suffix = path.suffix.lower()

        if suffix == ".pdf":
            loader = PyMuPDFLoader(str(raw_path))
        elif suffix in {".txt", ".md"}:
            loader = TextLoader(str(raw_path), encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {suffix or 'unknown'}")

        documents = loader.load()
        for doc in documents:
            # setdefault only fills in keys the loader left unset, so any
            # "source"/"page" value supplied by the loader is preserved.
            doc.metadata.setdefault("source", path.name)
            doc.metadata.setdefault("page", 0)
        all_docs.extend(documents)

    return all_docs