File size: 2,302 Bytes
1c0a23b
 
 
 
 
 
240ad82
1c0a23b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240ad82
 
1c0a23b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from pathlib import Path
from langchain_community.document_loaders import UnstructuredRTFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores import FAISS
from helpers.import_envs import openai_api_key, index_file, index_name
import pypandoc

def load_rtf_document(file_path):
    """Load an RTF file through ``UnstructuredRTFLoader`` without splitting it.

    Args:
        file_path: Path of the RTF file handed to ``UnstructuredRTFLoader``.

    Returns:
        The result of ``UnstructuredRTFLoader(file_path).load()``.
    """
    # Only download pandoc when pypandoc cannot locate an existing install
    # (get_pandoc_version raises OSError in that case), so repeated loads do
    # not trigger a fresh download every time.
    try:
        pypandoc.get_pandoc_version()
    except OSError:
        pypandoc.download_pandoc()

    # Load RTF file using LangChain's UnstructuredRTFLoader
    loader = UnstructuredRTFLoader(file_path)
    return loader.load()


def load_rtf_document_and_chunk(file_path):
    """Load an RTF file and split it into chunks in one step.

    Args:
        file_path: Path of the RTF file handed to ``UnstructuredRTFLoader``.

    Returns:
        The result of ``UnstructuredRTFLoader(file_path).load_and_split()``.
    """
    # Only download pandoc when pypandoc cannot locate an existing install
    # (get_pandoc_version raises OSError in that case), so repeated loads do
    # not trigger a fresh download every time.
    try:
        pypandoc.get_pandoc_version()
    except OSError:
        pypandoc.download_pandoc()

    loader = UnstructuredRTFLoader(file_path)
    # uses RecursiveCharacterTextSplitter by default
    return loader.load_and_split()

def embed_chunks(chunked_docs):
    """Embed document chunks, index them with FAISS, and save the index.

    Args:
        chunked_docs: Documents passed on to ``FAISS.from_documents``.

    Returns:
        The FAISS vector store built from ``chunked_docs``. The same store is
        also saved locally under ``index_name`` before returning.
    """
    # Embedding model used to produce the vectors.
    openai_embeddings = OpenAIEmbeddings(
        api_key=openai_api_key,
        model="text-embedding-3-large",
    )

    # Local file store under ./cache/ for the cached embeddings; the cache
    # namespace is the embedding model's name.
    cache_store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        openai_embeddings,
        cache_store,
        namespace=openai_embeddings.model,
    )

    # Build the vector store with Facebook AI Similarity Search (FAISS).
    faiss_store = FAISS.from_documents(chunked_docs, cached_embedder)
    # Print the index's ntotal counter for the freshly built store.
    print(faiss_store.index.ntotal)

    # Save the vector store locally in the index_name folder.
    faiss_store.save_local(index_name)
    return faiss_store

def create_or_load_vectore_store(transcript_file_name):
    """Return the FAISS vector store, reusing the saved index when it exists.

    Args:
        transcript_file_name: Path of the RTF transcript. It is only read
            when no saved index is found and the embeddings must be built.

    Returns:
        The FAISS vector store, either loaded from ``index_name`` or freshly
        built by ``embed_chunks``.
    """
    if Path(index_file).exists():
        print("Embeddings already done, use the saved index")
        embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=openai_api_key
        )
        # NOTE(review): allow_dangerous_deserialization=True lets load_local
        # unpickle the saved index. Keep this only while the index folder
        # contains files written by this application itself.
        return FAISS.load_local(
            index_name, embedding_model, allow_dangerous_deserialization=True
        )

    # No saved index yet: parse and chunk the transcript only on this path,
    # so a cached index does not pay for (or depend on) re-reading the file.
    chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
    return embed_chunks(chunked_docs=chunked_docs)