# make_vecdb.py: build a FAISS vector index from the German_Political_chatbot text corpus
# Newer LangChain releases (>= 0.1) ship these classes from the community and
# text-splitters packages; swap back to the legacy `langchain.*` paths if the
# repo pins an older version.
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
import os
import random

from llm import api_key as SECRET_KEY  # Google API key defined in the local llm module
# Path to the folder containing the source text files
folder_path = "./data"

# Load every .txt file in the folder into LangChain Document objects
documents = []
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        loader = TextLoader(file_path, encoding="utf-8")
        documents.extend(loader.load())
# Split the documents into overlapping chunks so each embedding covers a
# focused span of text
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # Maximum characters per chunk
    chunk_overlap=200,  # Characters shared between consecutive chunks
)

random.shuffle(documents)  # Randomize document order before chunking
split_docs = text_splitter.split_documents(documents)
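
# A small sanity check, added here as an illustration: report corpus and chunk
# counts before spending embedding-API calls on them.
print(f"Loaded {len(documents)} documents, split into {len(split_docs)} chunks")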
# Initialize the Google Generative AI embedding model; the API key comes from
# llm.py rather than an environment variable
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=SECRET_KEY,
)
# Create the FAISS vectorstore
faiss_vectorstore = FAISS.from_documents(split_docs, embeddings)
# Save the FAISS vectorstore to disk
output_path = "faiss_index"
faiss_vectorstore.save_local(output_path)
print(f"FAISS vector database created and saved to: {output_path}")