# Torchie / loader.py — sreedeepEK (commit 655195f, "Update loader.py")
import os
# Must be set BEFORE torch/faiss are imported: tolerates duplicate OpenMP
# runtimes (works around the "libiomp5 already initialized" abort that can
# occur when torch and faiss each bundle their own OpenMP).
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import warnings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
# Deliberate blanket suppression of library warnings for cleaner output.
warnings.filterwarnings('ignore')
# Run embeddings on GPU when one is available, else CPU.
device: str = "cuda" if torch.cuda.is_available() else "cpu"
# Shared sentence-embedding model, used both when building and when loading
# the FAISS index (the same embedder must be used for both).
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device})
def read_docs_from_folder():
    """Load every ``.txt`` file under ``./docs/`` and split it into chunks.

    Returns:
        list: langchain ``Document`` chunks of roughly 500 characters each,
        with a 50-character overlap between consecutive chunks.
    """
    loader = DirectoryLoader(
        "./docs/",
        glob="**/*.txt",
        show_progress=True,
        # Force UTF-8 so non-ASCII files don't fail under the platform's
        # default encoding.
        loader_cls=lambda path: TextLoader(path, encoding='utf-8'),
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    # split_documents accepts the whole list directly — no need to loop over
    # documents one at a time and extend an accumulator.
    chunks = text_splitter.split_documents(docs)
    for i, chunk in enumerate(chunks[:5]):  # debug preview; adjust the slice as needed
        print(f"Length of chunk {i}: {len(chunk.page_content)} characters")
    return chunks
def create_and_save_faiss_index(docs, index_path="faiss_index"):
    """Embed *docs* and persist a FAISS index to *index_path*.

    Args:
        docs: list of langchain ``Document`` chunks
            (e.g. the output of ``read_docs_from_folder``).
        index_path: folder the serialized index is written to.
    """
    # from_documents keeps each chunk's metadata (e.g. its source file path)
    # in the vector store; the previous from_texts(page_content) call
    # silently dropped that metadata.
    vector_store = FAISS.from_documents(docs, embedding_model)
    vector_store.save_local(folder_path=index_path)
    print(f"FAISS index saved at {index_path}")
def load_embeddings(index_path='faiss_index'):
    """Deserialize and return the FAISS vector store saved at *index_path*."""
    # allow_dangerous_deserialization is required because FAISS stores are
    # pickled on disk; only load indexes this application wrote itself.
    store = FAISS.load_local(
        folder_path=index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    print("FAISS index loaded successfully")
    return store
if __name__ == "__main__":
    # Build the chunked corpus, then embed it and persist the index.
    chunks = read_docs_from_folder()
    create_and_save_faiss_index(chunks)