Spaces:
Sleeping
Sleeping
| import os | |
| from langchain_community.retrievers import WikipediaRetriever | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from .embedding import get_embeddings | |
| from typing import List | |
# Module-level embedding object, created once when this module is imported and
# passed as the `embedding` argument to Chroma in get_rag_retriever_from_paths.
# NOTE(review): the concrete embedding model is chosen inside get_embeddings()
# (defined in .embedding) — confirm there.
embedder = get_embeddings()
def get_rag_retriever_from_paths(pdf_paths: List[str]):
    """Build a Chroma-backed retriever over the contents of the given PDFs.

    Every path is loaded with ``PyPDFLoader``; the loaded documents are split
    with ``RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=270)``
    and the resulting chunks are indexed into a new Chroma vector store using
    the module-level ``embedder``.

    Args:
        pdf_paths: Paths of the PDF files to load and index.

    Returns:
        The retriever returned by ``as_retriever()`` on the new Chroma store.

    Raises:
        ValueError: If ``pdf_paths`` is empty, or if the PDFs produce no
            chunks to index.
    """
    # Fail early with a clear message instead of letting an empty document
    # list reach the vector store, where the failure is much harder to trace.
    if not pdf_paths:
        raise ValueError("pdf_paths must contain at least one PDF path")

    all_docs = []
    for path in pdf_paths:
        all_docs.extend(PyPDFLoader(path).load())

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=270)
    splits = text_splitter.split_documents(all_docs)

    # The loaded PDFs may yield no chunks at all; there is nothing to index
    # in that case.
    if not splits:
        raise ValueError("no text chunks could be produced from the given PDFs")

    vectorstore = Chroma.from_documents(documents=splits, embedding=embedder)
    return vectorstore.as_retriever()
def get_wiki_retriever():
    """Return a new ``WikipediaRetriever`` configured with ``top_k_results=2``."""
    return WikipediaRetriever(top_k_results=2)