Spaces:
Paused
Paused
| from langchain import FAISS | |
| from langchain.document_loaders import PyPDFium2Loader | |
| from langchain.embeddings import OpenAIEmbeddings | |
| from langchain.text_splitter import CharacterTextSplitter | |
| import pypdfium2 as pdfium | |
| from constants import chunk_size, chunk_overlap, number_snippets_to_retrieve | |
def download_and_index_pdf(urls: list[str]) -> FAISS:
    """
    Download, split, and index a list of PDFs into a FAISS vector store.

    Each PDF is fetched from its URL, split into chunks using the
    chunk_size/chunk_overlap settings from `constants`, tagged with its
    original URL and PDF Title metadata, and embedded with OpenAI embeddings.

    :param urls: URLs of the PDF documents to download and index
    :return: a FAISS index built from all document chunks
    :raises ValueError: if `urls` is empty — FAISS cannot build an index
        from zero documents, so fail early with a clear message
    """

    def __update_metadata(pages, url):
        """
        Rewrite each page's 'source' metadata (a local temp-file path left
        by the loader) to the original URL, and record the PDF's Title
        (falling back to the URL when no Title is present).
        """
        for page in pages:
            # Open the downloaded file to read its Title. Close it
            # explicitly: the original code opened one document per page
            # and never released the handle (file-descriptor leak).
            pdf = pdfium.PdfDocument(page.metadata['source'])
            try:
                title = pdf.get_metadata_dict().get('Title', url)
            finally:
                pdf.close()
            page.metadata['source'] = url
            page.metadata['title'] = title
        return pages

    if not urls:
        raise ValueError("at least one URL is required to build a FAISS index")

    # The splitter is loop-invariant; build it once instead of per URL.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    all_pages = []
    for url in urls:
        loader = PyPDFium2Loader(url)
        pages = loader.load_and_split(splitter)
        all_pages += __update_metadata(pages, url)

    return FAISS.from_documents(all_pages, OpenAIEmbeddings())
def search_faiss_index(faiss_index: FAISS, query: str, top_k: int = number_snippets_to_retrieve) -> list:
    """
    Run a similarity search against *faiss_index* for *query*.

    :param faiss_index: the FAISS vector store to search
    :param query: free-text query to match against the indexed chunks
    :param top_k: number of snippets to return (defaults to
        ``number_snippets_to_retrieve`` from constants)
    :return: the ``top_k`` most similar documents
    """
    return faiss_index.similarity_search(query, k=top_k)