| from pathlib import Path |
| import faiss |
| import pickle |
| from PyPDF2 import PdfReader |
| from tqdm import tqdm |
| import glob |
| import os |
| import re |
|
|
| from langchain.embeddings.openai import OpenAIEmbeddings |
| from langchain.text_splitter import CharacterTextSplitter |
| from langchain.vectorstores import FAISS |
| from langchain.document_loaders import TextLoader |
|
|
|
|
| import dotenv |
|
|
# Load variables from a local .env file into the process environment before
# anything below runs.
# NOTE(review): presumably this is where the OpenAI API key used by
# OpenAIEmbeddings comes from — confirm.
dotenv.load_dotenv()
|
|
def get_all_pdf_filenames(paths, recursive):
    """Collect the PDF files found under one or more directories.

    Args:
        paths: A directory, or an iterable of directories, to search. A single
            ``str`` / ``os.PathLike`` is treated as one directory instead of
            being iterated character by character.
        recursive: When true, sub-directories are searched as well (``**``
            glob pattern); otherwise only the direct children of each
            directory are matched.

    Returns:
        A list of matching file names, grouped in the order the directories
        were given and sorted within each directory.
    """
    # A bare string is itself iterable; without this guard every character
    # would be globbed as if it were a directory.
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    extensions = ["pdf"]
    filenames = []
    for ext_name in extensions:
        pattern = f"**/*.{ext_name}" if recursive else f"*.{ext_name}"
        for path in paths:
            matches = glob.glob(os.path.join(path, pattern), recursive=recursive)
            # glob makes no ordering promise (it depends on the file system),
            # so sort to give repeated runs the same sequence.
            filenames.extend(sorted(matches))
    return filenames
|
|
|
|
| |
| |
| |
| |
|
|
class Ingester:
    """Build a FAISS vector store from the text of PDF files.

    The text of each PDF is split into chunks with a ``CharacterTextSplitter``;
    the chunks are embedded with ``OpenAIEmbeddings`` and each one is stored
    with the path of the PDF it came from under the ``"source"`` metadata key.
    """

    def __init__(
        self,
        separator='\n',
        chunk_overlap=200,
        chunk_size=200,
    ):
        """Configure the text splitter used by ``ingest``.

        Args:
            separator: Forwarded to ``CharacterTextSplitter`` as ``separator``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
        """
        # NOTE(review): with the defaults the overlap is as large as the chunk
        # itself, which looks unintended — confirm before relying on them.
        self.splitter = CharacterTextSplitter(
            chunk_size=chunk_size,
            separator=separator,
            chunk_overlap=chunk_overlap,
        )

    def ingest(self, path):
        """Embed the PDFs found at ``path`` and pickle the store to ``./data/store.pkl``.

        Args:
            path: Either a single PDF file, or a directory that is searched
                recursively for ``*.pdf`` files.

        Raises:
            AssertionError: If no non-empty text chunk could be produced from
                the PDFs found at ``path``.
        """
        # Honour the caller's path instead of a machine-specific hard-coded file.
        if os.path.isfile(path):
            pdf_paths = [path]
        else:
            pdf_paths = get_all_pdf_filenames([path], recursive=True)

        docs = []
        metadatas = []
        for pdf_path in tqdm(pdf_paths):
            reader = PdfReader(pdf_path)
            # `or ''` keeps the join valid if a page yields a falsy result.
            text = '\n'.join((page.extract_text() or '') for page in reader.pages)
            # Drop only the empty chunks rather than discarding the whole
            # document as soon as one chunk is empty.
            chunks = [chunk for chunk in self.splitter.split_text(text) if chunk != ""]
            docs.extend(chunks)
            # One dict per chunk: a repeated list would make every chunk share
            # (and co-mutate) a single metadata object.
            metadatas.extend({"source": pdf_path} for _ in chunks)

        # Raised explicitly (same exception type as the former `assert`) so the
        # check is not stripped when Python runs with -O.
        if not docs:
            raise AssertionError(f"No text chunks could be extracted from {path!r}")

        output_dir = './data'
        store_path = os.path.join(output_dir, 'store.pkl')
        # Create the target directory before embedding so that a missing
        # directory fails (or is fixed) before the embedding work is done.
        os.makedirs(output_dir, exist_ok=True)

        print("Extracting embeddings")
        store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)

        # NOTE(review): whether the FAISS index inside `store` pickles cleanly
        # depends on the installed faiss/langchain versions — confirm.
        with open(store_path, "wb") as f:
            pickle.dump(store, f)

        print(f"Saved store at {store_path}.")
| |
if __name__ == "__main__":
    # Run the ingestion only when this file is executed as a script, so that
    # importing it for Ingester / get_all_pdf_filenames does not parse PDFs,
    # call the embeddings API, or write ./data/store.pkl as a side effect.
    ingester = Ingester(chunk_size=2000)
    ingester.ingest("/mnt/c/users/elio/Downloads/UNHCR Emergency Manual")
|
|