Spaces:
Runtime error
Runtime error
| from langchain_community.vectorstores import FAISS | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain.embeddings import ( | |
| OpenAIEmbeddings, | |
| HuggingFaceBgeEmbeddings, | |
| HuggingFaceEmbeddings, | |
| HuggingFaceInstructEmbeddings, | |
| ) | |
class Ingest:
    """Build and persist FAISS vector stores for an English and a Czech corpus.

    Each ``ingest_*`` method loads every document found under its data
    directory, splits the documents with ``RecursiveCharacterTextSplitter``,
    embeds the resulting chunks and saves the FAISS index to the configured
    store path.
    """

    def __init__(
        self,
        openai_api_key=None,
        chunk: int = 512,
        overlap: int = 256,
        czech_store: str = "stores/czech_512",
        english_store: str = "stores/english_512",
        data_czech: str = "data/czech",
        data_english: str = "data/english",
        english_embedding_model: str = "text-embedding-3-large",
        czech_embedding_model: str = "Seznam/simcse-dist-mpnet-paracrawl-cs-en",
    ):
        """Store the ingestion configuration; no I/O happens here.

        Args:
            openai_api_key: Key forwarded to ``OpenAIEmbeddings`` by
                ``ingest_english``.
            chunk: Passed to the text splitter as ``chunk_size``.
            overlap: Passed to the text splitter as ``chunk_overlap``.
            czech_store: Path the Czech FAISS index is saved to.
            english_store: Path the English FAISS index is saved to.
            data_czech: Directory the Czech documents are loaded from.
            data_english: Directory the English PDFs are loaded from.
            english_embedding_model: Model name given to ``OpenAIEmbeddings``.
            czech_embedding_model: Model name given to
                ``HuggingFaceEmbeddings``.
        """
        self.openai_api_key = openai_api_key
        self.chunk = chunk
        self.overlap = overlap
        self.czech_store = czech_store
        self.english_store = english_store
        self.data_czech = data_czech
        self.data_english = data_english
        self.english_embedding_model = english_embedding_model
        self.czech_embedding_model = czech_embedding_model

    def ingest_english(self) -> None:
        """Embed the English PDFs with OpenAI and save the index.

        Raises:
            ValueError: If no text chunks could be produced from
                ``self.data_english``.
        """
        embedding = OpenAIEmbeddings(
            openai_api_key=self.openai_api_key,
            model=self.english_embedding_model,
        )
        loader = DirectoryLoader(
            self.data_english,
            show_progress=True,
            loader_cls=PyPDFLoader,
        )
        self._build_store(
            loader, embedding, self.english_store, self.data_english
        )
        print("\n English vector Store Created.......\n\n")

    def ingest_czech(self) -> None:
        """Embed the Czech documents with a HuggingFace model and save the index.

        The embedding model is run on CPU with ``normalize_embeddings``
        disabled.

        Raises:
            ValueError: If no text chunks could be produced from
                ``self.data_czech``.
        """
        embedding = HuggingFaceEmbeddings(
            model_name=self.czech_embedding_model,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": False},
        )
        # NOTE(review): unlike ingest_english, no loader_cls is given here, so
        # DirectoryLoader falls back to its default loader. Confirm whether
        # PyPDFLoader was intended for the Czech corpus as well.
        loader = DirectoryLoader(
            self.data_czech,
            show_progress=True,
        )
        self._build_store(loader, embedding, self.czech_store, self.data_czech)
        print("\n Czech vector Store Created.......\n\n")

    def _build_store(self, loader, embedding, store_path, source_dir) -> None:
        """Load, split and embed documents, then save the FAISS index.

        Shared pipeline behind both public ``ingest_*`` methods.

        Args:
            loader: Object whose ``load()`` returns the documents to index.
            embedding: Embedding object handed to ``FAISS.from_documents``.
            store_path: Destination passed to ``save_local``.
            source_dir: Data directory, used only in the error message.

        Raises:
            ValueError: If loading and splitting yields no chunks.
        """
        documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk,
            chunk_overlap=self.overlap,
        )
        texts = splitter.split_documents(documents)
        # Building an index from zero chunks is expected to fail inside FAISS
        # with an unhelpful "list index out of range"; report the real cause.
        if not texts:
            raise ValueError(
                f"No text chunks were produced from {source_dir!r}; "
                "nothing to index."
            )
        vectordb = FAISS.from_documents(
            documents=texts,
            embedding=embedding,
        )
        vectordb.save_local(store_path)