Spaces:
Build error
Build error
| from pathlib import Path | |
| from langchain_community.document_loaders import UnstructuredRTFLoader | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain.storage import LocalFileStore | |
| from langchain.embeddings import CacheBackedEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from helpers.import_envs import openai_api_key, index_file, index_name | |
| import pypandoc | |
def load_rtf_document(file_path):
    """Load an RTF file into LangChain documents without splitting it.

    ``pypandoc.download_pandoc()`` is invoked on every call before the
    loader is created.

    Args:
        file_path: Location of the RTF file passed to
            ``UnstructuredRTFLoader``.

    Returns:
        The result of ``UnstructuredRTFLoader.load()`` for that file.
    """
    # Presumably the RTF loader needs pandoc to be available, so fetch it
    # first -- confirm against the unstructured/pypandoc requirements.
    pypandoc.download_pandoc()
    return UnstructuredRTFLoader(file_path).load()
def load_rtf_document_and_chunk(file_path):
    """Load an RTF file and split it into chunks in one step.

    ``pypandoc.download_pandoc()`` is invoked on every call before the
    loader is created.

    Args:
        file_path: Location of the RTF file passed to
            ``UnstructuredRTFLoader``.

    Returns:
        The result of ``UnstructuredRTFLoader.load_and_split()``; with no
        splitter supplied, LangChain falls back to its default
        RecursiveCharacterTextSplitter.
    """
    pypandoc.download_pandoc()
    rtf_loader = UnstructuredRTFLoader(file_path)
    return rtf_loader.load_and_split()
def embed_chunks(chunked_docs):
    """Embed document chunks, build a FAISS index from them and persist it.

    Embeddings come from OpenAI's ``text-embedding-3-large`` model wrapped
    in a ``CacheBackedEmbeddings`` backed by a ``LocalFileStore`` rooted at
    ``./cache/``. The number of vectors in the new index is printed, and the
    index is saved under ``index_name``.

    Args:
        chunked_docs: Documents passed to ``FAISS.from_documents``.

    Returns:
        The newly built FAISS vector store.
    """
    # Underlying OpenAI embedding model.
    openai_embeddings = OpenAIEmbeddings(
        model="text-embedding-3-large",
        api_key=openai_api_key,
    )

    # Local file store that holds the cached embeddings.
    cache_store = LocalFileStore("./cache/")

    # The cache namespace is the model name.
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        openai_embeddings,
        cache_store,
        namespace=openai_embeddings.model,
    )

    # Build the vector store with Facebook AI Similarity Search (FAISS).
    faiss_store = FAISS.from_documents(
        documents=chunked_docs,
        embedding=cached_embedder,
    )
    print(faiss_store.index.ntotal)

    # Persist the index locally so it can be loaded on later runs.
    faiss_store.save_local(folder_path=index_name)
    return faiss_store
def create_or_load_vectore_store(transcript_file_name):
    """Return the FAISS vector store for a transcript, building it if needed.

    If the file at ``index_file`` exists, the index saved under
    ``index_name`` is loaded and returned. Otherwise the transcript is
    loaded, chunked and embedded via ``embed_chunks``, which also saves the
    new index.

    The transcript is only read when an index has to be built, so a run
    that reuses a saved index does not pay for the pandoc download and RTF
    parsing and does not need the transcript file to be present.

    Args:
        transcript_file_name: Path of the RTF transcript; used only when no
            saved index is found.

    Returns:
        The loaded or newly created FAISS vector store.
    """
    if Path(index_file).exists():
        print("Embeddings already done, use the saved index")
        # The embedding model is only needed here so the loaded store can
        # embed queries; the build path creates its own inside embed_chunks.
        embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=openai_api_key
        )
        # SECURITY NOTE: allow_dangerous_deserialization=True unpickles the
        # saved index. Only load index files this application wrote itself;
        # never point index_name at untrusted data.
        return FAISS.load_local(
            index_name, embedding_model, allow_dangerous_deserialization=True
        )

    # No saved index: do the expensive load/chunk/embed work now.
    chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
    return embed_chunks(chunked_docs=chunked_docs)