# Torchie / loader.py — sreedeepEK (commit 655195f, "Update loader.py")
import os
# Must be set BEFORE torch/faiss are imported: tolerates duplicate OpenMP
# runtimes (works around the "libiomp5 already initialized" abort that can
# occur when torch and faiss each bundle their own OpenMP).
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import warnings
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
# Deliberate blanket suppression of library warnings for cleaner output.
warnings.filterwarnings('ignore')
# Run embeddings on GPU when one is available, else CPU.
device: str = "cuda" if torch.cuda.is_available() else "cpu"
# Shared sentence-embedding model, used both when building and when loading
# the FAISS index (the same embedder must be used for both).
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": device})
def read_docs_from_folder():
    """Load every ``.txt`` file under ``./docs/`` and split it into chunks.

    Returns:
        list: langchain ``Document`` chunks of roughly 500 characters each,
        with a 50-character overlap between consecutive chunks.
    """
    loader = DirectoryLoader(
        "./docs/",
        glob="**/*.txt",
        show_progress=True,
        # Force UTF-8 so non-ASCII files don't fail under the platform's
        # default encoding.
        loader_cls=lambda path: TextLoader(path, encoding='utf-8'),
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    # split_documents accepts the whole list directly — no need to loop over
    # documents one at a time and extend an accumulator.
    chunks = text_splitter.split_documents(docs)
    for i, chunk in enumerate(chunks[:5]):  # debug preview; adjust the slice as needed
        print(f"Length of chunk {i}: {len(chunk.page_content)} characters")
    return chunks
def create_and_save_faiss_index(docs, index_path="faiss_index"):
    """Embed *docs* and persist a FAISS index to *index_path*.

    Args:
        docs: list of langchain ``Document`` chunks
            (e.g. the output of ``read_docs_from_folder``).
        index_path: folder the serialized index is written to.
    """
    # from_documents keeps each chunk's metadata (e.g. its source file path)
    # in the vector store; the previous from_texts(page_content) call
    # silently dropped that metadata.
    vector_store = FAISS.from_documents(docs, embedding_model)
    vector_store.save_local(folder_path=index_path)
    print(f"FAISS index saved at {index_path}")
def load_embeddings(index_path='faiss_index'):
    """Deserialize and return the FAISS vector store saved at *index_path*."""
    # allow_dangerous_deserialization is required because FAISS stores are
    # pickled on disk; only load indexes this application wrote itself.
    store = FAISS.load_local(
        folder_path=index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    print("FAISS index loaded successfully")
    return store
if __name__ == "__main__":
    # Build the chunked corpus, then embed it and persist the index.
    chunks = read_docs_from_folder()
    create_and_save_faiss_index(chunks)