# NOTE(review): the three lines that were here ("Spaces:" / "Sleeping" / "Sleeping")
# were page chrome from a Hugging Face Spaces scrape, not part of the program.
import os

# Allow duplicate OpenMP runtimes (libiomp) that torch and faiss can both load;
# must be set BEFORE importing torch, or the process may abort at import time.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import warnings

import torch
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

warnings.filterwarnings('ignore')

# Run embeddings on GPU when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Shared sentence-transformer embedding model used by both index build and load.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)
def read_docs_from_folder(folder="./docs/", chunk_size=500, chunk_overlap=50):
    """Load every ``*.txt`` file under *folder* and split into overlapping chunks.

    Args:
        folder: Directory searched recursively for ``*.txt`` files.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        list: LangChain ``Document`` chunks ready for embedding.
    """
    # loader_cls must be a loader CLASS, not a lambda; per-file kwargs such as
    # the encoding go through loader_kwargs instead.
    loader = DirectoryLoader(
        folder,
        glob="**/*.txt",
        show_progress=True,
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"},
    )
    docs = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # split_documents accepts the whole list — no per-document loop needed.
    chunks = splitter.split_documents(docs)

    # Preview the first few chunk sizes as a sanity check.
    for i, chunk in enumerate(chunks[:5]):
        print(f"Length of chunk {i}: {len(chunk.page_content)} characters")
    return chunks
def create_and_save_faiss_index(docs, index_path="faiss_index"):
    """Embed *docs* with the module-level embedding model and persist a FAISS index.

    Args:
        docs: LangChain ``Document`` chunks (e.g. from ``read_docs_from_folder``).
        index_path: Folder where the index files are written.
    """
    # from_documents keeps each chunk's metadata (e.g. source file path) in the
    # docstore; the previous from_texts(page_content) call silently dropped it.
    vector_store = FAISS.from_documents(docs, embedding_model)
    vector_store.save_local(folder_path=index_path)
    print(f"FAISS index saved at {index_path}")
def load_embeddings(index_path='faiss_index'):
    """Load a previously saved FAISS index from *index_path* and return it.

    NOTE(review): allow_dangerous_deserialization=True unpickles the stored
    docstore — only point this at index folders this program created itself.
    """
    vector_store = FAISS.load_local(
        folder_path=index_path,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    print("FAISS index loaded successfully")
    return vector_store
if __name__ == "__main__":
    # Script entry point: read and chunk ./docs/, then build and save the index.
    documents = read_docs_from_folder()
    create_and_save_faiss_index(documents)