# Source: VarunChatbotSpace / Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py
# Uploaded by IW2025 ("Upload Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py (#1)", commit 3395f39, verified)
# This is a Gradio based Web UI code to create Vector DB from PDF files.
# Upload and index PDF documents via browser
# Create or add to existing collections
# Display existing collections and their associated topics from the persist_dir
# Populate a dropdown dynamically with those collection names
import os
from pathlib import Path
from re import sub
from typing import List
import gradio as gr
import chromadb
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
StorageContext,
Document,
Settings as LlamaSettings
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
# Chunking settings applied globally via llama_index Settings (see initialize_embedding).
# Size (in tokens) of each node that llama_index splits documents into before embedding.
EMBED_CHUNK_SIZE = 512
# Token overlap between adjacent chunks, preserving context across chunk boundaries.
EMBED_CHUNK_OVERLAP = 50
def sanitize_metadata(metadata: dict) -> dict:
    """Coerce every metadata value to a string; ``None`` becomes ``""``.

    ChromaDB only stores primitive metadata values, so stringifying keeps
    arbitrary document metadata safely storable.
    """
    cleaned: dict = {}
    for key, value in metadata.items():
        cleaned[key] = "" if value is None else str(value)
    return cleaned
def sanitize_name(value: str) -> str:
    """Lowercase *value* and collapse runs of non-word characters into ``_``.

    Leading/trailing underscores are stripped so the result is a clean
    identifier suitable for use as a ChromaDB collection name.
    """
    collapsed = sub(r"[^\w]+", "_", value)
    trimmed = collapsed.strip("_")
    return trimmed.lower()
def load_documents(pdf_path: str, topic: str) -> list:
    """Read a PDF and wrap each non-empty page in a llama_index ``Document``.

    Every Document carries sanitized source metadata plus the user-supplied
    topic, the source file name, and (when the reader exposes it) the page
    label — giving downstream retrieval citation-ready metadata.
    """
    source_name = Path(pdf_path).name
    raw_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    documents = []
    for idx, raw in enumerate(raw_docs):
        # Pages with no extractable text would index as empty vectors; skip them.
        if not raw.text:
            print(f"⚠️ Skipping empty doc {idx}")
            continue
        metadata = sanitize_metadata(raw.metadata or {})
        metadata["topic"] = topic
        metadata["source"] = str(source_name)
        if hasattr(raw, "page_label"):
            metadata["page"] = str(raw.page_label)
        documents.append(Document(text=raw.text, metadata=metadata))
    return documents
def initialize_embedding() -> HuggingFaceEmbedding:
    """Apply the global chunking settings and build the MiniLM embedding model."""
    print("πŸ”§ Initializing embedding model...")
    # Chunking is configured globally on llama_index Settings; the embedding
    # model itself is returned for explicit use by the index builders.
    LlamaSettings.chunk_size = EMBED_CHUNK_SIZE
    LlamaSettings.chunk_overlap = EMBED_CHUNK_OVERLAP
    model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return model
def _validate_pdf(pdf_path: str) -> Path:
    """Return the Path for *pdf_path*, raising if it is missing or not a PDF.

    Raises:
        FileNotFoundError: the file does not exist.
        ValueError: the file exists but does not have a .pdf suffix.
    """
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if pdf_file.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")
    return pdf_file


def _index_into_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str) -> str:
    """Load *pdf_path*, embed it, and write the vectors into *collection_name*.

    Shared core of create_vector_index and add_files_to_existing_collection.

    Raises:
        ValueError: the PDF yielded no non-empty documents.
    """
    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")
    embed_model = initialize_embedding()
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    return collection_name


def create_vector_index(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Create a fresh persist directory and index *pdf_path* into it.

    When *collection_name* is falsy, a default name is derived from the
    sanitized PDF stem and topic.

    Returns the collection name actually used.

    Raises:
        FileNotFoundError: *pdf_path* does not exist.
        ValueError: not a PDF, or the PDF has no usable text.
        FileExistsError: *persist_dir* already exists ("create" mode
            deliberately refuses to reuse an existing store — use
            add_files_to_existing_collection for that).
    """
    pdf_file = _validate_pdf(pdf_path)
    persist_path = Path(persist_dir)
    if persist_path.exists():
        raise FileExistsError(f"Persist directory already exists: {persist_path}")
    # No exist_ok needed: the check above guarantees the directory is new.
    persist_path.mkdir(parents=True)
    if not collection_name:
        topic_safe = sanitize_name(topic)
        pdf_name = sanitize_name(pdf_file.stem)
        collection_name = f"{pdf_name}_{topic_safe}"
    collection_name = _index_into_collection(pdf_path, persist_dir, topic, collection_name)
    print(f"βœ… Created collection: {collection_name}")
    return collection_name


def add_files_to_existing_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Append *pdf_path*'s pages to a collection in an existing persist dir.

    Returns the collection name used.

    Raises:
        FileNotFoundError: *pdf_path* or *persist_dir* does not exist.
        ValueError: not a PDF, or the PDF has no usable text.
    """
    _validate_pdf(pdf_path)
    persist_path = Path(persist_dir)
    if not persist_path.exists():
        raise FileNotFoundError(f"Persist directory not found: {persist_path}")
    collection_name = _index_into_collection(pdf_path, persist_dir, topic, collection_name)
    print(f"πŸ“¦ Added to collection: {collection_name}")
    return collection_name
def list_collections_and_topics(persist_dir: str) -> List[str]:
    """Return ``"name (topic)"`` labels for every collection in *persist_dir*.

    The topic is read from the first stored document's metadata; "Unknown"
    when the collection is empty or unreadable. Returns [] when the persist
    directory is missing or ChromaDB cannot be opened.
    """
    persist_path = Path(persist_dir)
    if not persist_path.exists():
        print(f"⚠️ Persist directory does not exist: {persist_dir}")
        return []
    try:
        chroma_client = chromadb.PersistentClient(path=persist_dir)
        items = []
        # NOTE(review): chromadb < 0.6 returns collection objects here, while
        # >= 0.6 returns bare name strings; the original `col.name` access
        # raised on strings and the outer except silently returned [].
        # Handle both shapes.
        for col in chroma_client.list_collections():
            if isinstance(col, str):
                name = col
                try:
                    col = chroma_client.get_collection(name=name)
                except Exception:
                    # Still surface the collection even if it can't be opened.
                    items.append(f"{name} (Unknown)")
                    continue
            else:
                name = col.name
            topic = "Unknown"
            try:
                docs = col.get(limit=1)
                if docs and docs['metadatas']:
                    metadata = docs['metadatas'][0]
                    topic = metadata.get("topic", "Unknown")
            except Exception:
                # Best-effort topic lookup; an unreadable collection keeps "Unknown".
                pass
            items.append(f"{name} ({topic})")
        return items
    except Exception as e:
        print(f"Error fetching collections: {e}")
        return []
def run_indexing(pdf_file, topic, mode, collection_name, persist_dir):
    """Dispatch to create/add indexing and convert exceptions into UI strings.

    Returns a human-readable status message instead of raising, so the
    Gradio status box always shows something meaningful.
    """
    # Guard against nothing being uploaded; otherwise str(None) becomes the
    # literal path "None" and surfaces as a confusing "File not found: None".
    if pdf_file is None:
        return "❌ Error: No PDF file uploaded"
    try:
        file_path = str(pdf_file)  # pdf_file is already a path-like object
        if mode == "create":
            collection_used = create_vector_index(file_path, persist_dir, topic, collection_name)
        else:
            collection_used = add_files_to_existing_collection(file_path, persist_dir, topic, collection_name)
        return f"βœ… Indexed successfully into collection '{collection_used}'"
    except Exception as e:
        return f"❌ Error: {str(e)}"
def launch_ui():
    """Assemble the Gradio Blocks UI for PDF indexing and start the server.

    Blocks until the server is shut down (demo.launch()).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🧠 PDF Vector Indexer (ChromaDB)")
        gr.Markdown("Upload a PDF, specify a topic, and create or update a vector index with citation-ready metadata.")
        # --- Input widgets: file, topic, and create-vs-add mode ---
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF")
            topic_input = gr.Textbox(label="Topic")
            mode_input = gr.Radio(choices=["create", "add"], label="Mode", value="create")
        with gr.Row():
            persist_dir_input = gr.Textbox(
                label="Persist Directory",
                value="",
                info="Directory where ChromaDB should store vector embeddings. Must exist or be created during indexing."
            )
            collection_name_input = gr.Textbox(
                label="Collection Name",
                info="Name of the collection to create or add to. Must be specified, e.g. concatenated pdf file name to topic."
            )
        # Dropdown shows "name (topic)" labels for collections already on disk.
        collection_dropdown = gr.Dropdown(label="πŸ“– Existing Collections", choices=[], interactive=True)
        refresh_button = gr.Button("πŸ”„ Refresh Collections")
        result_output = gr.Textbox(label="Status", lines=2)
        debug_output = gr.Textbox(label="Debug Log", lines=2, interactive=False)
        def handle_indexing(pdf_file, topic, mode, name, persist):
            # Run the indexing job, then re-scan the persist dir so a newly
            # created collection appears in the dropdown immediately.
            result = run_indexing(pdf_file, topic, mode, name, persist)
            updated = list_collections_and_topics(persist)
            print("πŸ” Collections returned:", updated)
            debug_msg = f"Collections returned: {updated}"
            return result, gr.update(choices=updated, value=None), debug_msg
        index_btn = gr.Button("πŸš€ Run Indexing")
        index_btn.click(
            fn=handle_indexing,
            inputs=[pdf_input, topic_input, mode_input, collection_name_input, persist_dir_input],
            outputs=[result_output, collection_dropdown, debug_output]
        )
        def refresh_dropdown_handler(persist_path):
            # Manual refresh: re-list collections from the given persist dir.
            choices = list_collections_and_topics(persist_path)
            print("πŸ”„ Refreshed collections:", choices)
            return gr.update(choices=choices, value=None)
        refresh_button.click(
            fn=refresh_dropdown_handler,
            inputs=[persist_dir_input],
            outputs=[collection_dropdown]
        )
        def handle_collection_selection(selection):
            # Dropdown entries look like "name (topic)"; split them back into
            # the name and topic textboxes. rsplit on the LAST " (" so names
            # that themselves contain " (" still parse.
            if not selection:
                return gr.update(value=""), gr.update(value="")
            try:
                name, topic = selection.strip().rsplit(" (", 1)
                topic = topic.rstrip(")")
                return gr.update(value=name), gr.update(value=topic)
            except Exception:
                # Malformed selection: clear both fields rather than crash.
                return gr.update(value=""), gr.update(value="")
        collection_dropdown.change(
            fn=handle_collection_selection,
            inputs=[collection_dropdown],
            outputs=[collection_name_input, topic_input]
        )
    demo.launch()
# Launch the Gradio web UI only when executed as a script (not on import).
if __name__ == "__main__":
    launch_ui()