# Hugging Face Spaces page header ("Spaces: Sleeping") captured by the
# scraper — not part of the program.
import os

from datasets import load_dataset
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
def fetch_retriever_or_load_local_retriever():
    """Build (or load from disk) a FAISS retriever over the IMDB movies dataset.

    On first run this downloads the dataset to a local CSV, splits it into
    overlapping chunks, embeds the chunks with OpenAI embeddings (cached on
    disk), and persists the FAISS index. Subsequent runs reuse the saved CSV,
    embedding cache, and index, skipping the expensive steps.

    Returns:
        A LangChain retriever backed by the FAISS vector store.
    """
    csv_path = "./imdb.csv"
    if not os.path.exists(csv_path):
        dataset = load_dataset("ShubhamChoksi/IMDB_Movies")
        # Write to csv_path (was a hard-coded "imdb.csv" that only worked
        # because it resolved to the same file) so the paths stay in sync.
        dataset["train"].to_csv(csv_path)

    loader = CSVLoader(file_path=csv_path)
    data = loader.load()

    # Overlap preserves context that would otherwise be cut at chunk edges.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunked_documents = text_splitter.split_documents(data)

    # Cache embeddings on disk, namespaced by model name, so identical text
    # is never re-embedded (and a model change invalidates the cache).
    embedding_model = OpenAIEmbeddings()
    store = LocalFileStore("./cache/")
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        embedding_model, store, namespace=embedding_model.model
    )

    index_path = "local_index"
    if os.path.exists(index_path):
        # NOTE(security): allow_dangerous_deserialization unpickles the saved
        # index — acceptable only because this process wrote the file itself.
        vector_store = FAISS.load_local(
            index_path, cached_embedder, allow_dangerous_deserialization=True
        )
    else:
        vector_store = FAISS.from_documents(chunked_documents, cached_embedder)
        vector_store.save_local(index_path)

    return vector_store.as_retriever()