Spaces:

IW2025
/

VarunChatbotSpace

Sleeping

App Files Files Community

VarunChatbotSpace / Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py

IW2025

Upload Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py (#1)

3395f39 verified 6 months ago

raw

history blame contribute delete

8.96 kB

	# Gradio-based web UI for building a ChromaDB vector database from PDF files.
	# - Upload and index PDF documents from the browser.
	# - Create a new collection or add documents to an existing one.
	# - List the collections (and their associated topics) found in the persist directory.
	# - Populate a dropdown dynamically with those collection names.

	import os
	from pathlib import Path
	from re import sub
	from typing import List

	import gradio as gr
	import chromadb
	from llama_index.core import (
	SimpleDirectoryReader,
	VectorStoreIndex,
	StorageContext,
	Document,
	Settings as LlamaSettings
	)
	from llama_index.embeddings.huggingface import HuggingFaceEmbedding
	from llama_index.vector_stores.chroma import ChromaVectorStore

	# Chunking settings
	# Both values are copied onto the global LlamaIndex settings object by
	# initialize_embedding().
	# NOTE(review): units are presumably tokens (LlamaIndex node-parser convention) - confirm.
	EMBED_CHUNK_SIZE = 512  # assigned to LlamaSettings.chunk_size
	EMBED_CHUNK_OVERLAP = 50  # assigned to LlamaSettings.chunk_overlap


	def sanitize_metadata(metadata: dict) -> dict:
	    """Return a copy of ``metadata`` in which every value is a string.

	    ``None`` values become the empty string; every other value is passed
	    through ``str()``. Keys are kept exactly as given.
	    """
	    cleaned = {}
	    for key, value in metadata.items():
	        cleaned[key] = "" if value is None else str(value)
	    return cleaned


	def sanitize_name(value: str) -> str:
	    """Turn free text into a lowercase, underscore-separated ASCII slug.

	    Every run of characters other than ASCII letters, digits and ``_`` is
	    collapsed into a single underscore; leading and trailing underscores
	    are removed and the result is lower-cased.

	    Args:
	        value: Arbitrary text, e.g. a file stem or a topic.

	    Returns:
	        The slug, or an empty string when nothing usable remains.
	    """
	    # (?a) forces ASCII semantics for \W. Without it, non-ASCII letters such
	    # as "é" count as word characters and survive into the collection name,
	    # which ChromaDB's documented naming rules (ASCII only) then reject.
	    return sub(r"(?a)\W+", "_", value).strip("_").lower()


	def load_documents(pdf_path: str, topic: str) -> list:
	    """Read a PDF and return its non-empty parts as tagged ``Document`` objects.

	    Each object returned by ``SimpleDirectoryReader`` is re-wrapped in a new
	    ``Document`` whose metadata is stringified and extended with:

	    * ``topic``  - the ``topic`` argument,
	    * ``source`` - the PDF's file name (without directories),
	    * ``page``   - the page label, when the reader supplied one.

	    Args:
	        pdf_path: Path of the PDF file to read.
	        topic: Topic label stored on every document.

	    Returns:
	        A list of ``Document`` objects; empty when the PDF produced no text.
	    """
	    source_name = Path(pdf_path).name  # same for every page, so compute once
	    raw_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
	    documents = []

	    for i, doc in enumerate(raw_docs):
	        text = doc.text
	        # Whitespace-only pages carry nothing worth embedding either.
	        if not text or not text.strip():
	            print(f"⚠️ Skipping empty doc {i}")
	            continue

	        meta = sanitize_metadata(doc.metadata or {})
	        meta["topic"] = topic
	        meta["source"] = source_name

	        # LlamaIndex's PDF reader is documented to record the page label in
	        # the metadata dict rather than as an attribute of the document, so
	        # look there first. The attribute lookup is kept as a fallback for
	        # readers that expose it directly.
	        page_label = meta.get("page_label") or getattr(doc, "page_label", None)
	        if page_label is not None and page_label != "":
	            meta["page"] = str(page_label)

	        documents.append(Document(text=text, metadata=meta))

	    return documents


	def initialize_embedding() -> HuggingFaceEmbedding:
	    """Build the Hugging Face embedding model and apply the chunking settings.

	    Side effect: writes ``EMBED_CHUNK_SIZE`` and ``EMBED_CHUNK_OVERLAP`` to the
	    global LlamaIndex settings object.

	    Returns:
	        A newly constructed ``HuggingFaceEmbedding`` instance.
	    """
	    print("🔧 Initializing embedding model...")
	    model_id = "sentence-transformers/all-MiniLM-L6-v2"
	    model = HuggingFaceEmbedding(model_name=model_id)

	    # Chunking is configured globally, not per index.
	    LlamaSettings.chunk_size = EMBED_CHUNK_SIZE
	    LlamaSettings.chunk_overlap = EMBED_CHUNK_OVERLAP
	    return model


	def create_vector_index(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
	    """Index a PDF into a collection stored in a brand-new persist directory.

	    Args:
	        pdf_path: Path of the PDF to index.
	        persist_dir: Directory for the Chroma database. It must not exist
	            yet; it is created once the PDF has been parsed successfully.
	        topic: Topic label stored in every document's metadata.
	        collection_name: Name of the collection. When empty, a name is
	            derived from the sanitized PDF stem and topic.

	    Returns:
	        The name of the collection that was written.

	    Raises:
	        FileNotFoundError: ``pdf_path`` does not exist.
	        FileExistsError: ``persist_dir`` already exists.
	        ValueError: the file is not a PDF, ``persist_dir`` is empty, no
	            collection name could be derived, or the PDF yielded no documents.
	    """
	    pdf_file = Path(pdf_path)
	    if not pdf_file.exists():
	        raise FileNotFoundError(f"File not found: {pdf_path}")
	    if pdf_file.suffix.lower() != ".pdf":
	        raise ValueError("Provided file is not a PDF")

	    # Path("") is the current directory, which always exists; without this
	    # check an empty field is reported as "Persist directory already exists: .".
	    if not persist_dir or not str(persist_dir).strip():
	        raise ValueError("Persist directory must be specified")
	    persist_path = Path(persist_dir)
	    if persist_path.exists():
	        raise FileExistsError(f"Persist directory already exists: {persist_path}")

	    if not collection_name:
	        # Join only the non-empty parts so that an empty topic does not leave
	        # a dangling "_" at the end of the name.
	        parts = [sanitize_name(pdf_file.stem), sanitize_name(topic or "")]
	        collection_name = "_".join(part for part in parts if part)
	        if not collection_name:
	            raise ValueError("Could not derive a collection name from the file name and topic")

	    # Parse the PDF and load the model BEFORE touching the file system: if
	    # either step fails, no empty persist directory is left behind to make
	    # every retry fail with FileExistsError.
	    documents = load_documents(pdf_path, topic)
	    if not documents:
	        raise ValueError("No valid documents found in PDF")

	    embed_model = initialize_embedding()

	    persist_path.mkdir(parents=True, exist_ok=True)
	    chroma_client = chromadb.PersistentClient(path=persist_dir)
	    collection = chroma_client.get_or_create_collection(name=collection_name)

	    vector_store = ChromaVectorStore(chroma_collection=collection)
	    storage_context = StorageContext.from_defaults(vector_store=vector_store)

	    # Building the index embeds the documents and writes them to the store.
	    VectorStoreIndex.from_documents(
	        documents,
	        storage_context=storage_context,
	        embed_model=embed_model,
	    )
	    print(f"✅ Created collection: {collection_name}")
	    return collection_name


	def add_files_to_existing_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
	    """Index a PDF into a collection inside an existing persist directory.

	    The collection is obtained with ``get_or_create_collection``, so a name
	    that is not present yet results in a new collection in that directory.

	    Args:
	        pdf_path: Path of the PDF to index.
	        persist_dir: Existing directory holding the Chroma database.
	        topic: Topic label stored in every document's metadata.
	        collection_name: Name of the target collection; must not be empty.

	    Returns:
	        The name of the collection that was written.

	    Raises:
	        FileNotFoundError: ``pdf_path`` or ``persist_dir`` does not exist
	            (or ``persist_dir`` is not a directory).
	        ValueError: the file is not a PDF, ``persist_dir`` or
	            ``collection_name`` is empty, or the PDF yielded no documents.
	    """
	    pdf_file = Path(pdf_path)
	    if not pdf_file.exists():
	        raise FileNotFoundError(f"File not found: {pdf_path}")
	    if pdf_file.suffix.lower() != ".pdf":
	        raise ValueError("Provided file is not a PDF")

	    # Path("") is the current directory, so an empty field would otherwise
	    # pass the existence check below and target the working directory.
	    if not persist_dir or not str(persist_dir).strip():
	        raise ValueError("Persist directory must be specified")
	    persist_path = Path(persist_dir)
	    # is_dir() also rejects a path that exists but is a regular file.
	    if not persist_path.is_dir():
	        raise FileNotFoundError(f"Persist directory not found: {persist_path}")

	    # Fail with a clear message before doing the expensive PDF parsing.
	    if not collection_name or not str(collection_name).strip():
	        raise ValueError("Collection name must be specified when adding to a collection")

	    documents = load_documents(pdf_path, topic)
	    if not documents:
	        raise ValueError("No valid documents found in PDF")

	    embed_model = initialize_embedding()
	    chroma_client = chromadb.PersistentClient(path=persist_dir)
	    collection = chroma_client.get_or_create_collection(name=collection_name)

	    vector_store = ChromaVectorStore(chroma_collection=collection)
	    storage_context = StorageContext.from_defaults(vector_store=vector_store)

	    # Building the index embeds the documents and writes them to the store.
	    VectorStoreIndex.from_documents(
	        documents,
	        storage_context=storage_context,
	        embed_model=embed_model,
	    )
	    print(f"📦 Added to collection: {collection_name}")
	    return collection_name


	def list_collections_and_topics(persist_dir: str) -> List[str]:
	    """List the collections in a Chroma persist directory with their topics.

	    The topic is read from the ``topic`` metadata key of one stored record
	    per collection; ``"Unknown"`` is used when it cannot be determined.

	    Args:
	        persist_dir: Directory holding the Chroma database.

	    Returns:
	        Labels of the form ``"<collection name> (<topic>)"``. An empty list
	        is returned when the directory is missing or Chroma raises.
	    """
	    # An empty string would resolve to the current directory, so treat it as
	    # "no directory given" rather than opening a client there.
	    if not persist_dir or not Path(persist_dir).exists():
	        print(f"⚠️ Persist directory does not exist: {persist_dir}")
	        return []

	    try:
	        chroma_client = chromadb.PersistentClient(path=persist_dir)
	        items = []

	        for entry in chroma_client.list_collections():
	            # Depending on the Chroma release, list_collections() yields
	            # either collection objects or plain collection names.
	            name = entry if isinstance(entry, str) else entry.name
	            topic = "Unknown"
	            try:
	                if isinstance(entry, str):
	                    collection = chroma_client.get_collection(name)
	                else:
	                    collection = entry
	                docs = collection.get(limit=1)
	                metadatas = docs["metadatas"] if docs else None
	                if metadatas:
	                    # A record stored without metadata comes back as None.
	                    metadata = metadatas[0] or {}
	                    topic = metadata.get("topic", "Unknown")
	            except Exception as e:
	                # Best effort: still list the collection, but say why the
	                # topic is missing instead of hiding the failure.
	                print(f"⚠️ Could not read topic for collection '{name}': {e}")
	            items.append(f"{name} ({topic})")
	        return items
	    except Exception as e:
	        print(f"Error fetching collections: {e}")
	        return []


	def run_indexing(pdf_file, topic, mode, collection_name, persist_dir):
	    """Run one indexing request and report the outcome as a status string.

	    Args:
	        pdf_file: The uploaded file: a path string, a path-like object, or an
	            object exposing the path through a ``name`` attribute.
	        topic: Topic label for the documents.
	        mode: ``"create"`` builds a new index; any other value adds to an
	            existing collection.
	        collection_name: Target collection name.
	        persist_dir: Chroma persist directory.

	    Returns:
	        A message starting with "✅" on success or "❌ Error:" on failure.
	        Exceptions are never propagated to the caller.
	    """
	    try:
	        if pdf_file is None:
	            # Without this, str(None) would be looked up as a file called "None".
	            raise ValueError("No PDF file was uploaded")

	        if isinstance(pdf_file, (str, os.PathLike)):
	            file_path = str(pdf_file)
	        else:
	            # File-wrapper objects carry the path in ``.name``; str() on them
	            # would give the object's repr instead of a usable path.
	            file_path = str(getattr(pdf_file, "name", pdf_file))

	        if mode == "create":
	            collection_used = create_vector_index(file_path, persist_dir, topic, collection_name)
	        else:
	            collection_used = add_files_to_existing_collection(file_path, persist_dir, topic, collection_name)

	        return f"✅ Indexed successfully into collection '{collection_used}'"
	    except Exception as e:
	        # UI boundary: turn every failure into a readable status message.
	        return f"❌ Error: {str(e)}"


	def _split_collection_label(label):
	    """Split a ``"<name> (<topic>)"`` dropdown label into ``(name, topic)``.

	    Returns ``("", "")`` when the label does not contain the ``" ("`` separator.
	    """
	    # Split at the FIRST " (" and drop exactly one closing parenthesis, so a
	    # topic that itself contains parentheses (e.g. "f(x)" or "Physics (intro)")
	    # is returned intact.
	    # NOTE(review): this assumes collection names never contain " (".
	    name, separator, rest = label.strip().partition(" (")
	    if not separator:
	        return "", ""
	    topic = rest[:-1] if rest.endswith(")") else rest
	    return name, topic


	def launch_ui():
	    """Build the Gradio interface, wire its event handlers and launch it.

	    The page offers a PDF upload, topic / mode / persist-directory /
	    collection-name inputs, a dropdown of existing collections with a
	    refresh button, and status and debug text boxes.
	    """
	    with gr.Blocks() as demo:
	        gr.Markdown("# 🧠 PDF Vector Indexer (ChromaDB)")
	        gr.Markdown("Upload a PDF, specify a topic, and create or update a vector index with citation-ready metadata.")

	        with gr.Row():
	            pdf_input = gr.File(label="Upload PDF")
	            topic_input = gr.Textbox(label="Topic")
	            mode_input = gr.Radio(choices=["create", "add"], label="Mode", value="create")

	        with gr.Row():
	            persist_dir_input = gr.Textbox(
	                label="Persist Directory",
	                value="",
	                info="Directory where ChromaDB should store vector embeddings. Must exist or be created during indexing."
	            )
	            collection_name_input = gr.Textbox(
	                label="Collection Name",
	                info="Name of the collection to create or add to. Must be specified, e.g. concatenated pdf file name to topic."
	            )

	        collection_dropdown = gr.Dropdown(label="📖 Existing Collections", choices=[], interactive=True)
	        refresh_button = gr.Button("🔄 Refresh Collections")
	        result_output = gr.Textbox(label="Status", lines=2)
	        debug_output = gr.Textbox(label="Debug Log", lines=2, interactive=False)

	        def handle_indexing(pdf_file, topic, mode, name, persist):
	            """Run the indexing job, then reload the collection dropdown."""
	            result = run_indexing(pdf_file, topic, mode, name, persist)
	            updated = list_collections_and_topics(persist)
	            print("🔍 Collections returned:", updated)
	            debug_msg = f"Collections returned: {updated}"
	            return result, gr.update(choices=updated, value=None), debug_msg

	        index_btn = gr.Button("🚀 Run Indexing")
	        index_btn.click(
	            fn=handle_indexing,
	            inputs=[pdf_input, topic_input, mode_input, collection_name_input, persist_dir_input],
	            outputs=[result_output, collection_dropdown, debug_output]
	        )

	        def refresh_dropdown_handler(persist_path):
	            """Reload the dropdown choices from the given persist directory."""
	            choices = list_collections_and_topics(persist_path)
	            print("🔄 Refreshed collections:", choices)
	            return gr.update(choices=choices, value=None)

	        refresh_button.click(
	            fn=refresh_dropdown_handler,
	            inputs=[persist_dir_input],
	            outputs=[collection_dropdown]
	        )

	        def handle_collection_selection(selection):
	            """Copy the selected collection's name and topic into the inputs."""
	            if not selection:
	                return gr.update(value=""), gr.update(value="")
	            name, topic = _split_collection_label(selection)
	            return gr.update(value=name), gr.update(value=topic)

	        collection_dropdown.change(
	            fn=handle_collection_selection,
	            inputs=[collection_dropdown],
	            outputs=[collection_name_input, topic_input]
	        )

	    demo.launch()


	# Script entry point: start the Gradio UI only when this file is run
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	launch_ui()