File size: 1,484 Bytes
d2224c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
import os
import random
from llm import api_key as SECRET_KEY
# Path to the folder containing the text files to index.
folder_path = "./data"

# Collected langchain Document objects, one or more per source file.
documents = []

# Load every text file from the folder (case-insensitive extension match,
# so .TXT / .Txt files are no longer silently skipped).
for filename in os.listdir(folder_path):
    if filename.lower().endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        loader = TextLoader(file_path, encoding="utf-8")
        documents.extend(loader.load())

# Fail early with a clear message instead of letting FAISS.from_documents
# blow up on an empty list with an obscure internal error.
if not documents:
    raise FileNotFoundError(
        f"No .txt files found in {folder_path!r}; nothing to index."
    )

# Split the documents into overlapping chunks so each embedding covers a
# bounded, retrieval-friendly span of text.
# NOTE: the original shuffled `documents` before splitting; chunk order has
# no effect on FAISS similarity search, so the shuffle was removed to make
# the index build deterministic and reproducible.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # max characters per chunk
    chunk_overlap=200,  # characters shared between consecutive chunks for context
)
split_docs = text_splitter.split_documents(documents)

# Google Generative AI embedding model used to vectorize each chunk.
# SECRET_KEY is imported from the local `llm` module above.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=SECRET_KEY,
)

# Build the FAISS vector store from the chunk embeddings.
faiss_vectorstore = FAISS.from_documents(split_docs, embeddings)

# Persist the index to disk so it can be reloaded with FAISS.load_local().
output_path = "faiss_index"
faiss_vectorstore.save_local(output_path)

print(f"FAISS vector database created and saved to: {output_path}")