"""Build a FAISS vector index from the .txt files in ./data.

Loads every UTF-8 text file in the data folder, splits the documents into
overlapping chunks, embeds them with Google Generative AI embeddings, and
persists the resulting FAISS index to disk.
"""

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
import os
import random  # NOTE(review): no longer used after the shuffle removal below; delete once confirmed unused elsewhere

from llm import api_key as SECRET_KEY

# Folder containing the source text files.
folder_path = "./data"
# Where the finished FAISS index is persisted.
output_path = "faiss_index"


def main() -> None:
    """Load, split, embed, and index all .txt files, then save the index.

    Raises:
        SystemExit: if no .txt files are found under ``folder_path``.
    """
    documents = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            loader = TextLoader(file_path, encoding="utf-8")
            documents.extend(loader.load())

    if not documents:
        # Fail fast with a clear message instead of letting
        # FAISS.from_documents blow up on an empty list.
        raise SystemExit(f"No .txt files found in {folder_path!r}; nothing to index.")

    # NOTE(review): the original called random.shuffle(documents) here.
    # Chunk/document order has no effect on retrieval quality, and the
    # shuffle only made the saved index nondeterministic, so it was removed.

    # Split the documents into overlapping chunks for better vectorization.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,    # characters per chunk
        chunk_overlap=200,  # overlap preserves context across chunk boundaries
    )
    split_docs = text_splitter.split_documents(documents)

    # Embeddings come from Google Generative AI (key imported from the
    # project-local llm module) — not OpenAI, as an earlier comment claimed.
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=SECRET_KEY,
    )

    # Build the FAISS vectorstore and persist it to disk.
    faiss_vectorstore = FAISS.from_documents(split_docs, embeddings)
    faiss_vectorstore.save_local(output_path)
    print(f"FAISS vector database created and saved to: {output_path}")


if __name__ == "__main__":
    main()