Spaces:

svijayanand
/

Podcast_Oracle

Build error

Vijayanand Sankarasubramanian

updated UI to get constants

240ad82 almost 2 years ago

2.3 kB

	from pathlib import Path
	from langchain_community.document_loaders import UnstructuredRTFLoader
	from langchain_openai import OpenAIEmbeddings
	from langchain.storage import LocalFileStore
	from langchain.embeddings import CacheBackedEmbeddings
	from langchain_community.vectorstores import FAISS
	from helpers.import_envs import openai_api_key, index_file, index_name
	import pypandoc

	def load_rtf_document(file_path):
	    """Load an RTF file through LangChain's UnstructuredRTFLoader.

	    Args:
	        file_path: Path of the RTF file to load.

	    Returns:
	        The result of ``UnstructuredRTFLoader(file_path).load()``.
	    """
	    # Download pandoc only when pypandoc cannot locate an existing binary;
	    # calling download_pandoc() unconditionally repeats the installer
	    # fetch/unpack on every document load.
	    try:
	        pypandoc.get_pandoc_path()
	    except OSError:
	        pypandoc.download_pandoc()

	    loader = UnstructuredRTFLoader(file_path)
	    return loader.load()


	def load_rtf_document_and_chunk(file_path):
	    """Load an RTF file and split it into chunks.

	    Args:
	        file_path: Path of the RTF file to load.

	    Returns:
	        The result of ``UnstructuredRTFLoader(file_path).load_and_split()``.
	    """
	    # Download pandoc only when pypandoc cannot locate an existing binary;
	    # calling download_pandoc() unconditionally repeats the installer
	    # fetch/unpack on every document load.
	    try:
	        pypandoc.get_pandoc_path()
	    except OSError:
	        pypandoc.download_pandoc()

	    loader = UnstructuredRTFLoader(file_path)
	    # load_and_split() uses RecursiveCharacterTextSplitter by default.
	    return loader.load_and_split()

	def embed_chunks(chunked_docs):
	    """Embed the given chunks, index them with FAISS, and save the index.

	    Args:
	        chunked_docs: Documents passed to ``FAISS.from_documents``.

	    Returns:
	        The FAISS vector store built from ``chunked_docs``; it is also
	        saved locally under ``index_name``.
	    """
	    # Embedding model used for the chunks.
	    openai_embeddings = OpenAIEmbeddings(
	        model="text-embedding-3-large",
	        api_key=openai_api_key,
	    )

	    # Back the model with a local file store for cached embeddings,
	    # namespaced by the model name.
	    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
	        openai_embeddings,
	        LocalFileStore("./cache/"),
	        namespace=openai_embeddings.model,
	    )

	    # Build the vector store with Facebook AI Similarity Search (FAISS).
	    faiss_store = FAISS.from_documents(
	        documents=chunked_docs,
	        embedding=cached_embedder,
	    )
	    print(faiss_store.index.ntotal)

	    # Persist the vector store locally before handing it back.
	    faiss_store.save_local(folder_path=index_name)
	    return faiss_store

	def create_or_load_vectore_store(transcript_file_name):
	    """Return the FAISS vector store, reusing the saved index when present.

	    If ``index_file`` exists, the index saved under ``index_name`` is loaded.
	    Otherwise the transcript is loaded, chunked, embedded, and saved via
	    ``embed_chunks``.

	    Args:
	        transcript_file_name: Path of the RTF transcript; only read when no
	            saved index exists.

	    Returns:
	        The loaded or newly built FAISS vector store.
	    """
	    # Check for the saved index first so the transcript is only loaded and
	    # chunked when the embeddings actually have to be computed.
	    if not Path(index_file).exists():
	        chunked_docs = load_rtf_document_and_chunk(file_path=transcript_file_name)
	        return embed_chunks(chunked_docs=chunked_docs)

	    print("Embeddings already done, use the saved index")
	    embedding_model = OpenAIEmbeddings(
	        model="text-embedding-3-large", api_key=openai_api_key
	    )
	    # NOTE(security): allow_dangerous_deserialization=True enables
	    # pickle-based loading; only load index files this application wrote.
	    return FAISS.load_local(
	        index_name, embedding_model, allow_dangerous_deserialization=True
	    )