from pathlib import Path
from langchain_community.document_loaders import UnstructuredRTFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores import FAISS
from helpers.import_envs import openai_api_key, index_file, index_name
import pypandoc
def load_rtf_document(file_path):
    """Load an RTF file into LangChain Document objects.

    Downloads pandoc first if needed, since UnstructuredRTFLoader
    depends on it for RTF conversion.

    Args:
        file_path: Path to the RTF file to load.

    Returns:
        The Document objects produced by the loader.
    """
    pypandoc.download_pandoc()
    rtf_loader = UnstructuredRTFLoader(file_path)
    return rtf_loader.load()
def load_rtf_document_and_chunk(file_path):
    """Load an RTF file and split it into chunked Documents.

    Downloads pandoc first if needed, then delegates splitting to
    ``load_and_split`` (RecursiveCharacterTextSplitter by default).

    Args:
        file_path: Path to the RTF file to load.

    Returns:
        The chunked Document objects.
    """
    pypandoc.download_pandoc()
    rtf_loader = UnstructuredRTFLoader(file_path)
    chunks = rtf_loader.load_and_split()
    return chunks
def embed_chunks(chunked_docs, cache_dir="./cache/"):
    """Embed document chunks and persist them in a local FAISS index.

    Embeddings are cached on disk via CacheBackedEmbeddings, so
    re-running over the same text does not re-call the OpenAI API.

    Args:
        chunked_docs: LangChain Document chunks to embed.
        cache_dir: Directory backing the embedding byte cache
            (defaults to "./cache/", matching the previous behavior).

    Returns:
        The FAISS vector store built from the chunks; it is also saved
        to disk under ``index_name``.
    """
    # create our embedding model
    embedding_model = OpenAIEmbeddings(
        model="text-embedding-3-large", api_key=openai_api_key
    )
    # local file store backing the cached embeddings
    store = LocalFileStore(cache_dir)
    # namespace by model name so caches for different models don't collide
    embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )
    # Create vector store using Facebook AI Similarity Search (FAISS)
    vector_store = FAISS.from_documents(
        documents=chunked_docs, embedding=embedder
    )
    print(vector_store.index.ntotal)  # sanity check: number of vectors indexed
    # save our vector store locally
    vector_store.save_local(folder_path=index_name)
    return vector_store
def create_or_load_vectore_store(transcript_file_name):
    """Return a FAISS vector store for the transcript, reusing a saved index.

    If the index file at ``index_file`` exists, the saved index is loaded
    from disk and the transcript is NOT re-chunked or re-embedded (the
    original version chunked unconditionally, downloading pandoc and
    re-parsing the RTF even when the result was thrown away). Otherwise
    the transcript is chunked, embedded, and the new index saved.

    Args:
        transcript_file_name: Path to the RTF transcript to index.

    Returns:
        A FAISS vector store ready for similarity search.
    """
    index_file_path = Path(index_file)
    if index_file_path.exists():
        print("Embeddings already done, use the saved index")
        embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=openai_api_key
        )
        # allow_dangerous_deserialization: the FAISS index is pickle-based;
        # acceptable here because we only load files this program wrote.
        vector_store = FAISS.load_local(
            index_name, embedding_model, allow_dangerous_deserialization=True
        )
    else:
        # Only chunk (and download pandoc) when we actually need to embed.
        chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
        vector_store = embed_chunks(chunked_docs=chunked_docs)
    return vector_store