File size: 1,484 Bytes
d2224c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
import os
import random
from llm import api_key as SECRET_KEY
# Path to the folder containing the text files to index.
folder_path = "./data"

# Collected langchain Document objects, one or more per source file.
documents = []

# Load every text file from the folder (case-insensitive extension match,
# so .TXT / .Txt files are no longer silently skipped).
for filename in os.listdir(folder_path):
    if filename.lower().endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        loader = TextLoader(file_path, encoding="utf-8")
        documents.extend(loader.load())

# Fail early with a clear message instead of letting FAISS.from_documents
# blow up on an empty list with an obscure internal error.
if not documents:
    raise FileNotFoundError(
        f"No .txt files found in {folder_path!r}; nothing to index."
    )

# Split the documents into overlapping chunks so each embedding covers a
# bounded, retrieval-friendly span of text.
# NOTE: the original shuffled `documents` before splitting; chunk order has
# no effect on FAISS similarity search, so the shuffle was removed to make
# the index build deterministic and reproducible.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # max characters per chunk
    chunk_overlap=200,  # characters shared between consecutive chunks for context
)
split_docs = text_splitter.split_documents(documents)

# Google Generative AI embedding model used to vectorize each chunk.
# SECRET_KEY is imported from the local `llm` module above.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=SECRET_KEY,
)

# Build the FAISS vector store from the chunk embeddings.
faiss_vectorstore = FAISS.from_documents(split_docs, embeddings)

# Persist the index to disk so it can be reloaded with FAISS.load_local().
output_path = "faiss_index"
faiss_vectorstore.save_local(output_path)

print(f"FAISS vector database created and saved to: {output_path}")