# make_vecdb.py: build a FAISS vector index from the German_Political_chatbot text corpus
# Newer LangChain releases (>= 0.1) ship these classes from the community and
# text-splitters packages; swap back to the legacy `langchain.*` paths if the
# repo pins an older version.
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
import os
import random

from llm import api_key as SECRET_KEY  # Google API key defined in the local llm module
# Path to the folder containing the source text files
folder_path = "./data"

# Load every .txt file in the folder into LangChain Document objects
documents = []
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        loader = TextLoader(file_path, encoding="utf-8")
        documents.extend(loader.load())
# Split the documents into overlapping chunks so each embedding covers a
# focused span of text
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # Maximum characters per chunk
    chunk_overlap=200,  # Characters shared between consecutive chunks
)

random.shuffle(documents)  # Randomize document order before chunking
split_docs = text_splitter.split_documents(documents)
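
# A small sanity check, added here as an illustration: report corpus and chunk
# counts before spending embedding-API calls on them.
print(f"Loaded {len(documents)} documents, split into {len(split_docs)} chunks")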
# Initialize the Google Generative AI embedding model; the API key comes from
# llm.py rather than an environment variable
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=SECRET_KEY,
)
# Create the FAISS vectorstore
faiss_vectorstore = FAISS.from_documents(split_docs, embeddings)
# Save the FAISS vectorstore to disk
output_path = "faiss_index"
faiss_vectorstore.save_local(output_path)
print(f"FAISS vector database created and saved to: {output_path}")