import os
import json
import shutil
import pickle
import time
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_community.retrievers import BM25Retriever
from dotenv import load_dotenv

# Load environment variables (e.g. API keys for external services)
load_dotenv()
def clean_directory(directory_path):
    """Clean a directory by removing all files and subdirectories."""
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Wait a moment to ensure the OS releases the directory handles
        time.sleep(1)
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")
def load_preprocessed_chunks():
    """Load the preprocessed chunks from the saved JSON file."""
    print("Loading preprocessed chunks...")

    chunks_file = "all_chunks_95percentile.json"
    if not os.path.exists(chunks_file):
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")

    with open(chunks_file, "r", encoding="utf-8") as f:
        chunks_data = json.load(f)

    # Convert raw dicts back into LangChain Document objects
    documents = [
        Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
        for chunk in chunks_data
    ]
    print(f"Loaded {len(documents)} preprocessed chunks.")
    return documents
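# For reference, all_chunks_95percentile.json is assumed to be a flat JSON
# list mirroring the Document fields used above (illustrative shape only;
# the metadata keys are not confirmed by this script):
#
#   [
#     {"page_content": "First chunk of text...",
#      "metadata": {"source": "kohavi.pdf", "page": 1}},
#     ...
#   ]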
def create_vectorstore_and_retrievers(documents):
    """Create the Qdrant vectorstore and BM25 retriever from the chunked documents."""
    try:
        # Initialize embedding model
        print("Loading embedding model...")
        embedding_model = HuggingFaceEmbeddings(
            model_name="kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )

        # Create an in-memory Qdrant vectorstore
        print("Creating Qdrant vectorstore...")
        qdrant_vectorstore = Qdrant.from_documents(
            documents,
            embedding_model,
            location=":memory:",
            collection_name="kohavi_ab_testing_pdf_collection",
        )

        # Create the BM25 retriever (BM25Retriever tokenizes and indexes internally)
        print("Creating BM25 retriever...")
        texts = [doc.page_content for doc in documents]
        bm25_retriever = BM25Retriever.from_texts(
            texts, metadatas=[doc.metadata for doc in documents]
        )
        bm25_retriever.k = 10  # Return the top 10 results

        print(f"Successfully created vectorstore with {len(documents)} documents")
        print(f"BM25 retriever created with {len(texts)} texts")
        return qdrant_vectorstore, bm25_retriever, embedding_model
    except Exception as e:
        print(f"Error creating vectorstore and retrievers: {e}")
        raise
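# The script's original imports of EnsembleRetriever, ContextualCompressionRetriever,
# and CohereRerank suggest the two retrievers above feed a hybrid pipeline with
# reranking. A minimal sketch under that assumption: the weights, top-k, and
# rerank model name are placeholders, and a COHERE_API_KEY is assumed to be
# provided by the environment loaded via load_dotenv().
def build_hybrid_retriever_sketch(qdrant_vectorstore, bm25_retriever):
    """Hypothetical helper, not called by this script."""
    from langchain.retrievers import EnsembleRetriever
    from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
    from langchain_cohere import CohereRerank

    # Blend lexical (BM25) and dense (Qdrant) retrieval with equal weight
    ensemble = EnsembleRetriever(
        retrievers=[
            bm25_retriever,
            qdrant_vectorstore.as_retriever(search_kwargs={"k": 10}),
        ],
        weights=[0.5, 0.5],
    )
    # Rerank the blended candidates with Cohere before returning them
    reranker = CohereRerank(model="rerank-english-v3.0")
    return ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=ensemble,
    )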
def save_processed_data(qdrant_vectorstore, bm25_retriever, embedding_model, documents):
    """Save all processed data files needed by the app."""
    print("Saving processed data...")

    # Create a clean processed-data directory
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Save document chunks
    print("Saving document chunks...")
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(documents, f)

    # Save BM25 retriever
    print("Saving BM25 retriever...")
    with open(processed_data_dir / "bm25_retriever.pkl", "wb") as f:
        pickle.dump(bm25_retriever, f)

    # Save embedding model info (the app reinitializes the model from this)
    print("Saving embedding model info...")
    embedding_info = {
        "model_name": "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    }
    with open(processed_data_dir / "embedding_info.json", "w") as f:
        json.dump(embedding_info, f)

    # Save the raw text and metadata for Qdrant. The in-memory Qdrant store
    # cannot easily be serialized, so the app re-embeds these at load time.
    print("Saving Qdrant vector data...")
    vectors_data = [
        {"text": doc.page_content, "metadata": doc.metadata} for doc in documents
    ]
    with open(processed_data_dir / "vector_data.json", "w", encoding="utf-8") as f:
        json.dump(vectors_data, f, ensure_ascii=False, indent=2)

    print("All processed data saved successfully!")
def create_processed_data():
    """Create all processed data files needed for the RAG system."""
    # Ensure the processed_data directory exists
    processed_data_dir = Path("AB_AI_RAG_Agent/data/processed_data")
    processed_data_dir.mkdir(parents=True, exist_ok=True)

    # Load the improved chunks produced by the Jupyter notebook
    chunks_source_path = Path("all_chunks_95percentile.json")
    if not chunks_source_path.exists():
        raise FileNotFoundError(f"Source chunks file not found: {chunks_source_path}")

    print("Loading improved chunks from Jupyter notebook...")
    with open(chunks_source_path, "r", encoding="utf-8") as f:
        chunk_data = json.load(f)

    # Convert to Document objects
    documents = [
        Document(page_content=chunk["page_content"], metadata=chunk["metadata"])
        for chunk in chunk_data
    ]
    print(f"Loaded {len(documents)} chunks")

    # Save documents as pickle
    chunks_path = processed_data_dir / "chunks.pkl"
    with open(chunks_path, "wb") as f:
        pickle.dump(documents, f)
    print(f"Saved chunks to {chunks_path}")
    # Create the BM25 retriever (BM25Retriever tokenizes and indexes internally)
    print("Creating BM25 retriever...")
    texts = [doc.page_content for doc in documents]
    bm25_retriever = BM25Retriever.from_texts(
        texts, metadatas=[doc.metadata for doc in documents]
    )
    bm25_retriever.k = 10  # Match the top-k used in create_vectorstore_and_retrievers

    # Save BM25 retriever
    bm25_path = processed_data_dir / "bm25_retriever.pkl"
    with open(bm25_path, "wb") as f:
        pickle.dump(bm25_retriever, f)
    print(f"Saved BM25 retriever to {bm25_path}")
    # Initialize embedding model
    print("Initializing embedding model...")
    model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)

    # Save embedding model info (the app reinitializes the model from this)
    embedding_info = {"model_name": model_name}
    embedding_info_path = processed_data_dir / "embedding_info.json"
    with open(embedding_info_path, "w") as f:
        json.dump(embedding_info, f)
    print(f"Saved embedding info to {embedding_info_path}")
    # Pre-compute embeddings for all documents
    print("Pre-computing embeddings (this may take a while)...")
    embedded_docs = []

    # Process in batches to avoid memory issues
    batch_size = 50
    for i in range(0, len(documents), batch_size):
        batch = documents[i : i + batch_size]
        batch_texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.embed_documents(batch_texts)

        # Store each embedding alongside its text and metadata
        for j, doc in enumerate(batch):
            embedded_docs.append(
                {
                    "id": i + j,
                    "text": doc.page_content,
                    "metadata": doc.metadata,
                    "embedding": embeddings[j],
                }
            )
        print(f"Embedded {min(i + batch_size, len(documents))}/{len(documents)} chunks")

    # Save the embedded docs for fast loading
    embedded_docs_path = processed_data_dir / "embedded_docs.pkl"
    with open(embedded_docs_path, "wb") as f:
        pickle.dump(embedded_docs, f)
    print(f"Saved embedded docs to {embedded_docs_path}")
| print(f"Processing complete! All files saved to {processed_data_dir}") | |
| print(f"Files created:") | |
| print(f" - chunks.pkl ({len(documents)} documents)") | |
| print(f" - bm25_retriever.pkl") | |
| print(f" - embedding_info.json") | |
| print(f" - embedded_docs.pkl ({len(embedded_docs)} embedded documents)") | |
if __name__ == "__main__":
    create_processed_data()