Spaces:
Sleeping
Sleeping
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
import os

from llm import api_key as SECRET_KEY

"""Build a FAISS vector index from the .txt files in ./data.

Loads every UTF-8 text file, splits it into overlapping chunks,
embeds the chunks with Google Generative AI embeddings, and persists
the resulting FAISS index to disk at ./faiss_index.
"""

# Path to the folder containing the text files
folder_path = "./data"

# Load all text files from the folder into LangChain Documents.
documents = []
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        loader = TextLoader(file_path, encoding="utf-8")
        documents.extend(loader.load())

# Fail loudly with an actionable message instead of letting
# FAISS.from_documents choke on an empty list later.
if not documents:
    raise FileNotFoundError(
        f"No .txt files found in {folder_path!r}; nothing to index."
    )

# Split the documents into chunks for better vectorization: each chunk is
# embedded independently, and the overlap preserves context across boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # characters per chunk
    chunk_overlap=200,  # characters shared between consecutive chunks
)
split_docs = text_splitter.split_documents(documents)

# Initialize Google Generative AI embeddings (API key imported from llm.py).
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=SECRET_KEY,
)

# Create the FAISS vectorstore from the embedded chunks.
faiss_vectorstore = FAISS.from_documents(split_docs, embeddings)

# Save the FAISS vectorstore to disk.
output_path = "faiss_index"
faiss_vectorstore.save_local(output_path)
print(f"FAISS vector database created and saved to: {output_path}")