Spaces:

BOB0920
/

My_Self_Bot

Running

App Files Files Community

My_Self_Bot / src /ingest.py

BOB0920

Update src/ingest.py

8c79180 verified 6 months ago

Raw

History Blame Contribute Delete

1.92 kB

	import os
	from langchain_community.document_loaders import PyPDFLoader, TextLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings
	from dotenv import load_dotenv

	# Load environment variables (python-dotenv) at import time.
	load_dotenv()

	# Directory scanned for source documents. This is a relative path, so it is
	# resolved against the current working directory, not this script's folder.
	# NOTE(review): DB_FAISS_PATH below is anchored to the script directory while
	# DATA_PATH is not -- confirm the script is always launched from the directory
	# that contains "data".
	DATA_PATH = "data"
	# Absolute path of the directory that contains this script.
	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	# Where the FAISS index is written: <script dir>/vectorstore/db_faiss.
	DB_FAISS_PATH = os.path.join(SCRIPT_DIR, "vectorstore", "db_faiss")

	def create_vector_db():
	    """Build a FAISS vector store from the PDF and text files in ``DATA_PATH``.

	    Every ``.pdf`` and ``.txt`` file located directly inside ``DATA_PATH`` is
	    loaded (the extension is matched case-insensitively and files are visited
	    in sorted name order), split with ``RecursiveCharacterTextSplitter``
	    (``chunk_size=500``, ``chunk_overlap=50``), embedded on the CPU with the
	    ``sentence-transformers/all-MiniLM-L6-v2`` model, and the resulting index
	    is saved to ``DB_FAISS_PATH``.

	    Progress is reported with ``print``. The function returns early, after
	    printing a message, when ``DATA_PATH`` is not a directory or when no
	    supported document could be loaded.

	    Returns:
	        None.
	    """
	    # isdir rather than exists: a regular file named like DATA_PATH would pass
	    # an existence check and then make os.listdir raise NotADirectoryError.
	    if not os.path.isdir(DATA_PATH):
	        print(f"Directory {DATA_PATH} not found.")
	        return

	    # Load documents. Sorting makes the ingestion order (and therefore the
	    # order of chunks in the index) reproducible across runs and platforms.
	    documents = []
	    for filename in sorted(os.listdir(DATA_PATH)):
	        file_path = os.path.join(DATA_PATH, filename)
	        # Compare against a lowercased name so "REPORT.PDF" is not skipped.
	        lowered_name = filename.lower()
	        if lowered_name.endswith(".pdf"):
	            loader = PyPDFLoader(file_path)
	        elif lowered_name.endswith(".txt"):
	            loader = TextLoader(file_path, encoding='utf-8')
	        else:
	            # Unsupported file type: ignore it.
	            continue
	        documents.extend(loader.load())
	        print(f"Loaded {filename}")

	    if not documents:
	        print("No documents found to ingest.")
	        return

	    # Split text
	    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	    texts = text_splitter.split_documents(documents)
	    print(f"Split documents into {len(texts)} chunks.")

	    # Create embeddings (using HuggingFace - FREE!)
	    print("Generating embeddings locally with sentence-transformers...")
	    embeddings = HuggingFaceEmbeddings(
	        model_name='sentence-transformers/all-MiniLM-L6-v2',
	        model_kwargs={'device': 'cpu'},
	    )

	    # Create vector store
	    db = FAISS.from_documents(texts, embeddings)
	    db.save_local(DB_FAISS_PATH)
	    print(f"Vector store saved to {DB_FAISS_PATH}")

	# Script entry point: build the vector store only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    create_vector_db()