Spaces:

svijayanand
/

Podcast_Oracle

Build error

File size: 2,302 Bytes

from pathlib import Path
from langchain_community.document_loaders import UnstructuredRTFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.vectorstores import FAISS
from helpers.import_envs import openai_api_key, index_file, index_name
import pypandoc

def load_rtf_document(file_path):
    """Load an RTF file with LangChain's UnstructuredRTFLoader.

    Args:
        file_path: Path of the RTF file to load.

    Returns:
        Whatever ``UnstructuredRTFLoader(file_path).load()`` returns
        (the loaded document(s), unsplit).
    """
    # Only fetch pandoc when pypandoc cannot already locate a binary.
    # Calling download_pandoc() unconditionally repeats the download/install
    # work on every single document load.
    try:
        pypandoc.get_pandoc_path()
    except OSError:
        pypandoc.download_pandoc()

    loader = UnstructuredRTFLoader(file_path)
    return loader.load()


def load_rtf_document_and_chunk(file_path):
    """Load an RTF file and split it into chunks.

    Args:
        file_path: Path of the RTF file to load.

    Returns:
        Whatever ``UnstructuredRTFLoader(file_path).load_and_split()``
        returns (the document split into chunks).
    """
    # Only fetch pandoc when pypandoc cannot already locate a binary.
    # Calling download_pandoc() unconditionally repeats the download/install
    # work on every single document load.
    try:
        pypandoc.get_pandoc_path()
    except OSError:
        pypandoc.download_pandoc()

    loader = UnstructuredRTFLoader(file_path)
    # With no splitter argument, load_and_split() falls back to
    # RecursiveCharacterTextSplitter.
    return loader.load_and_split()

def embed_chunks(chunked_docs):
    """Embed document chunks, index them with FAISS, and save the index.

    Args:
        chunked_docs: Document chunks to embed and index.

    Returns:
        The FAISS vector store built from ``chunked_docs``. As a side effect
        the store is written to the folder named by ``index_name`` and the
        number of indexed vectors is printed.
    """
    # OpenAI model that produces the raw embeddings.
    openai_embeddings = OpenAIEmbeddings(
        model="text-embedding-3-large", api_key=openai_api_key
    )

    # Wrap the model with a cache backed by files under ./cache/; the model
    # name is used as the cache namespace.
    cache_store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        openai_embeddings, cache_store, namespace=openai_embeddings.model
    )

    # Build the FAISS (Facebook AI Similarity Search) index from the chunks.
    faiss_store = FAISS.from_documents(chunked_docs, cached_embedder)
    print(faiss_store.index.ntotal)

    # Persist the index locally before handing it back.
    faiss_store.save_local(index_name)
    return faiss_store

def create_or_load_vectore_store(transcript_file_name):
    """Return a FAISS vector store, loading a saved index when one exists.

    If the path ``index_file`` exists, the index stored under ``index_name``
    is loaded. Otherwise the transcript is loaded, chunked, embedded, and
    saved via ``embed_chunks``.

    NOTE(review): the existence check does not depend on
    ``transcript_file_name``, so a saved index is reused for any transcript
    passed in. Confirm this is the intended caching behavior.

    Args:
        transcript_file_name: Path of the RTF transcript to index when no
            saved index is present.

    Returns:
        The loaded or newly created FAISS vector store.
    """
    if Path(index_file).exists():
        print("Embeddings already done, use the saved index")
        # The embedding model is only needed here, to embed queries against
        # the loaded index; the build path creates its own in embed_chunks.
        embedding_model = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=openai_api_key
        )
        # SECURITY: allow_dangerous_deserialization=True unpickles the saved
        # docstore. This is only safe because the index is produced locally
        # by embed_chunks; never point index_name at untrusted files.
        return FAISS.load_local(
            index_name, embedding_model, allow_dangerous_deserialization=True
        )

    # No saved index: only now pay the cost of loading and chunking the
    # transcript (previously done up front and discarded on the cached path).
    chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
    return embed_chunks(chunked_docs=chunked_docs)