# databricks-rag-assistant / prepare_data.py
# Author: felipelemes
# Initial commit: Core RAG project files and setup (commit 3975d30)
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
# --- Configuration ---
# NOTE(review): these imports come from the legacy `langchain.*` namespaces;
# on langchain >= 0.2 they live in `langchain_community.*` — confirm the
# pinned version before upgrading.
PDF_PATH = "data/azure-databricks.pdf"  # Path to PDF file
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # Embedding model name to be used
VECTOR_DB_PATH = "vector_db"  # Folder where the vector database will be saved

# --- 1. Load the PDF ---
# PyPDFLoader yields one Document per page of the PDF.
print(f"Loading PDF from: {PDF_PATH}...")
try:
    loader = PyPDFLoader(PDF_PATH)
    documents = loader.load()
    print(f"PDF loaded successfully! Total of {len(documents)} pages.")
except Exception as e:
    # Broad catch is intentional at this top-level script boundary:
    # any loader failure (missing file, corrupt PDF) is fatal here.
    print(f"Error loading PDF: {e}")
    print("Please ensure the PDF file exists and the path is correct.")
    # Fix: exit with a NON-ZERO status so shell/CI callers can detect the
    # failure (the original bare `exit()` reported success with status 0).
    raise SystemExit(1)

# --- 2. Split the text into chunks ---
print("Splitting text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # Maximum size of each chunk (in characters)
    chunk_overlap=200,     # Overlap between adjacent chunks (to maintain context)
    length_function=len,   # Function to calculate chunk length
)
chunks = text_splitter.split_documents(documents)
print(f"Text split into {len(chunks)} chunks.")

# Guard: FAISS.from_documents raises an opaque error on an empty list
# (e.g. a scanned/image-only PDF with no extractable text).
if not chunks:
    print("No text chunks were produced; nothing to index.")
    raise SystemExit(1)

# --- 3. Create Embeddings and Store in FAISS ---
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("Generating embeddings and creating the FAISS vector database...")
# Embeds every chunk and builds an in-memory FAISS index over the vectors.
vector_db = FAISS.from_documents(chunks, embeddings)

# --- 4. Save the Vector Database ---
# Persists the index to disk so query-time code can load it with
# FAISS.load_local(VECTOR_DB_PATH, ...) without re-embedding the PDF.
print(f"Saving the vector database to: {VECTOR_DB_PATH}...")
vector_db.save_local(VECTOR_DB_PATH)
print("Vector database created and saved successfully!")