Spaces:

IW2025
/

VarunChatbotSpace

Sleeping

App Files Files Community

IW2025 committed on Jul 27, 2025

Commit

08caccc

verified ·

1 Parent(s): 6b4764f

Upload Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py

Browse files

Files changed (1) hide show

Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py +249 -0

Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py ADDED Viewed

	@@ -0,0 +1,249 @@

+# Gradio-based web UI for building a ChromaDB vector database from PDF files.
+# - Upload and index PDF documents via the browser
+# - Create new collections or add to existing ones
+# - Display existing collections and their associated topics from the persist_dir
+# - Populate a dropdown dynamically with those collection names
+import os
+from pathlib import Path
+from re import sub
+from typing import List
+import gradio as gr
+import chromadb
+from llama_index.core import (
+    SimpleDirectoryReader,
+    VectorStoreIndex,
+    StorageContext,
+    Document,
+    Settings as LlamaSettings
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.vector_stores.chroma import ChromaVectorStore
+# Chunking settings; initialize_embedding() copies them onto LlamaSettings
+EMBED_CHUNK_SIZE = 512  # assigned to LlamaSettings.chunk_size (presumably tokens — confirm)
+EMBED_CHUNK_OVERLAP = 50  # assigned to LlamaSettings.chunk_overlap
+def sanitize_metadata(metadata: dict) -> dict:
+    return {k: str(v) if v is not None else "" for k, v in metadata.items()}
+def sanitize_name(value: str) -> str:
+    return sub(r"[^\w]+", "_", value).strip("_").lower()
+def load_documents(pdf_path: str, topic: str) -> list:
+    pdf_file = Path(pdf_path)
+    raw_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
+    documents = []
+    for i, doc in enumerate(raw_docs):
+        if not doc.text:
+            print(f"⚠️ Skipping empty doc {i}")
+            continue
+        meta = sanitize_metadata(doc.metadata or {})
+        meta["topic"] = topic
+        meta["source"] = str(pdf_file.name)
+        if hasattr(doc, "page_label"):
+            meta["page"] = str(doc.page_label)
+        documents.append(Document(text=doc.text, metadata=meta))
+    return documents
+def initialize_embedding() -> HuggingFaceEmbedding:
+    print("🔧 Initializing embedding model...")
+    embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    LlamaSettings.chunk_size = EMBED_CHUNK_SIZE
+    LlamaSettings.chunk_overlap = EMBED_CHUNK_OVERLAP
+    return embed_model
+def create_vector_index(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
+    pdf_file = Path(pdf_path)
+    if not pdf_file.exists():
+        raise FileNotFoundError(f"File not found: {pdf_path}")
+    if pdf_file.suffix.lower() != ".pdf":
+        raise ValueError("Provided file is not a PDF")
+    persist_path = Path(persist_dir)
+    if persist_path.exists():
+        raise FileExistsError(f"Persist directory already exists: {persist_path}")
+    persist_path.mkdir(parents=True, exist_ok=True)
+    if not collection_name:
+        topic_safe = sanitize_name(topic)
+        pdf_name = sanitize_name(pdf_file.stem)
+        collection_name = f"{pdf_name}_{topic_safe}"
+    documents = load_documents(pdf_path, topic)
+    if not documents:
+        raise ValueError("No valid documents found in PDF")
+    embed_model = initialize_embedding()
+    chroma_client = chromadb.PersistentClient(path=persist_dir)
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+    vector_store = ChromaVectorStore(chroma_collection=collection)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        embed_model=embed_model
+    )
+    print(f"✅ Created collection: {collection_name}")
+    return collection_name
+def add_files_to_existing_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
+    pdf_file = Path(pdf_path)
+    if not pdf_file.exists():
+        raise FileNotFoundError(f"File not found: {pdf_path}")
+    if pdf_file.suffix.lower() != ".pdf":
+        raise ValueError("Provided file is not a PDF")
+    persist_path = Path(persist_dir)
+    if not persist_path.exists():
+        raise FileNotFoundError(f"Persist directory not found: {persist_path}")
+    documents = load_documents(pdf_path, topic)
+    if not documents:
+        raise ValueError("No valid documents found in PDF")
+    embed_model = initialize_embedding()
+    chroma_client = chromadb.PersistentClient(path=persist_dir)
+    collection = chroma_client.get_or_create_collection(name=collection_name)
+    vector_store = ChromaVectorStore(chroma_collection=collection)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        embed_model=embed_model
+    )
+    print(f"📦 Added to collection: {collection_name}")
+    return collection_name
+def list_collections_and_topics(persist_dir: str) -> List[str]:
+    persist_path = Path(persist_dir)
+    if not persist_path.exists():
+        print(f"⚠️ Persist directory does not exist: {persist_dir}")
+        return []
+    try:
+        chroma_client = chromadb.PersistentClient(path=persist_dir)
+        collections = chroma_client.list_collections()
+        items = []
+        for col in collections:
+            name = col.name
+            topic = "Unknown"
+            try:
+                docs = col.get(limit=1)
+                if docs and docs['metadatas']:
+                    metadata = docs['metadatas'][0]
+                    topic = metadata.get("topic", "Unknown")
+            except Exception:
+                pass
+            items.append(f"{name} ({topic})")
+        return items
+    except Exception as e:
+        print(f"Error fetching collections: {e}")
+        return []
+def run_indexing(pdf_file, topic, mode, collection_name, persist_dir):
+    try:
+        file_path = str(pdf_file)  # pdf_file is already a path-like object
+        if mode == "create":
+            collection_used = create_vector_index(file_path, persist_dir, topic, collection_name)
+        else:
+            collection_used = add_files_to_existing_collection(file_path, persist_dir, topic, collection_name)
+        return f"✅ Indexed successfully into collection '{collection_used}'"
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+def launch_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown("# 🧠 PDF Vector Indexer (ChromaDB)")
+        gr.Markdown("Upload a PDF, specify a topic, and create or update a vector index with citation-ready metadata.")
+        with gr.Row():
+            pdf_input = gr.File(label="Upload PDF")
+            topic_input = gr.Textbox(label="Topic")
+            mode_input = gr.Radio(choices=["create", "add"], label="Mode", value="create")
+        with gr.Row():
+            persist_dir_input = gr.Textbox(
+                    label="Persist Directory",
+                    value="",
+                    info="Directory where ChromaDB should store vector embeddings. Must exist or be created during indexing."
+            )
+            collection_name_input = gr.Textbox(
+                label="Collection Name",
+                info="Name of the collection to create or add to. Must be specified, e.g. concatenated pdf file name to topic."
+            )
+        collection_dropdown = gr.Dropdown(label="📖 Existing Collections", choices=[], interactive=True)
+        refresh_button = gr.Button("🔄 Refresh Collections")
+        result_output = gr.Textbox(label="Status", lines=2)
+        debug_output = gr.Textbox(label="Debug Log", lines=2, interactive=False)
+        def handle_indexing(pdf_file, topic, mode, name, persist):
+            result = run_indexing(pdf_file, topic, mode, name, persist)
+            updated = list_collections_and_topics(persist)
+            print("🔍 Collections returned:", updated)
+            debug_msg = f"Collections returned: {updated}"
+            return result, gr.update(choices=updated, value=None), debug_msg
+        index_btn = gr.Button("🚀 Run Indexing")
+        index_btn.click(
+            fn=handle_indexing,
+            inputs=[pdf_input, topic_input, mode_input, collection_name_input, persist_dir_input],
+            outputs=[result_output, collection_dropdown, debug_output]
+        )
+        def refresh_dropdown_handler(persist_path):
+            choices = list_collections_and_topics(persist_path)
+            print("🔄 Refreshed collections:", choices)
+            return gr.update(choices=choices, value=None)
+        refresh_button.click(
+            fn=refresh_dropdown_handler,
+            inputs=[persist_dir_input],
+            outputs=[collection_dropdown]
+        )
+        def handle_collection_selection(selection):
+            if not selection:
+                return gr.update(value=""), gr.update(value="")
+            try:
+                name, topic = selection.strip().rsplit(" (", 1)
+                topic = topic.rstrip(")")
+                return gr.update(value=name), gr.update(value=topic)
+            except Exception:
+                return gr.update(value=""), gr.update(value="")
+        collection_dropdown.change(
+            fn=handle_collection_selection,
+            inputs=[collection_dropdown],
+            outputs=[collection_name_input, topic_input]
+        )
+    demo.launch()
+if __name__ == "__main__":  # start the UI only when executed as a script, not on import
+    launch_ui()