File size: 8,956 Bytes
3395f39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# This is a Gradio based Web UI code to create Vector DB from PDF files.
# Upload and index PDF documents via browser
# Create or add to existing collections
# Display existing collections and their associated topics from the persist_dir
# Populate a dropdown dynamically with those collection names

import os
from pathlib import Path
from re import sub
from typing import List

import gradio as gr
import chromadb
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    Document,
    Settings as LlamaSettings
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# Chunking settings
EMBED_CHUNK_SIZE = 512
EMBED_CHUNK_OVERLAP = 50


def sanitize_metadata(metadata: dict) -> dict:
    """Return a copy of *metadata* with every value stringified.

    ``None`` values become the empty string; everything else goes through
    ``str()`` so downstream stores only ever see string metadata values.
    """
    cleaned = {}
    for key, value in metadata.items():
        cleaned[key] = "" if value is None else str(value)
    return cleaned


def sanitize_name(value: str) -> str:
    """Normalize *value* into a safe identifier.

    Runs of non-word characters collapse to a single underscore, leading
    and trailing underscores are trimmed, and the result is lowercased.
    """
    collapsed = sub(r"[^\w]+", "_", value)
    return collapsed.strip("_").lower()


def load_documents(pdf_path: str, topic: str) -> list:
    """Load a PDF into llama_index ``Document`` objects with citation metadata.

    Each non-empty page/document gets sanitized metadata plus:
      - ``topic``:  the user-supplied topic label
      - ``source``: the PDF file name
      - ``page``:   the page label, when the reader provides one

    Args:
        pdf_path: Path to the PDF file on disk.
        topic: Topic label to attach to every document.

    Returns:
        A list of ``Document`` instances; empty docs are skipped.
    """
    pdf_file = Path(pdf_path)
    raw_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    documents = []

    for i, doc in enumerate(raw_docs):
        if not doc.text:
            print(f"⚠️ Skipping empty doc {i}")
            continue

        meta = sanitize_metadata(doc.metadata or {})
        meta["topic"] = topic
        meta["source"] = str(pdf_file.name)

        # Bug fix: llama_index PDF readers report the page via
        # metadata["page_label"], not as a Document attribute, so the old
        # `hasattr(doc, "page_label")` check never fired. Prefer the
        # metadata value and keep the attribute lookup as a fallback.
        page_label = meta.get("page_label", getattr(doc, "page_label", None))
        if page_label is not None:
            meta["page"] = str(page_label)

        documents.append(Document(text=doc.text, metadata=meta))

    return documents


def initialize_embedding() -> HuggingFaceEmbedding:
    """Create the HuggingFace embedding model and apply global chunk settings.

    Side effect: sets ``LlamaSettings.chunk_size`` / ``chunk_overlap`` so
    subsequent indexing uses the module-level chunking constants.
    """
    print("πŸ”§ Initializing embedding model...")
    LlamaSettings.chunk_size = EMBED_CHUNK_SIZE
    LlamaSettings.chunk_overlap = EMBED_CHUNK_OVERLAP
    return HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")


def create_vector_index(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Index a PDF into a brand-new ChromaDB persist directory.

    Args:
        pdf_path: Path to the PDF to index.
        persist_dir: Directory to create for the Chroma store; must not exist yet.
        topic: Topic label stored in every document's metadata.
        collection_name: Collection to create; when falsy, a name is derived
            from the PDF file name and topic.

    Returns:
        The name of the collection that was created.

    Raises:
        FileNotFoundError: *pdf_path* does not exist.
        ValueError: the file is not a PDF, or it yields no usable documents.
        FileExistsError: *persist_dir* already exists.
    """
    source = Path(pdf_path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if source.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")

    target_dir = Path(persist_dir)
    if target_dir.exists():
        raise FileExistsError(f"Persist directory already exists: {target_dir}")
    target_dir.mkdir(parents=True, exist_ok=True)

    # Default collection name: "<pdf stem>_<topic>", both sanitized.
    if not collection_name:
        collection_name = f"{sanitize_name(source.stem)}_{sanitize_name(topic)}"

    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")

    embed_model = initialize_embedding()
    client = chromadb.PersistentClient(path=persist_dir)
    chroma_collection = client.get_or_create_collection(name=collection_name)
    storage = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=chroma_collection)
    )

    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage,
        embed_model=embed_model,
    )
    print(f"βœ… Created collection: {collection_name}")
    return collection_name


def add_files_to_existing_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Index an additional PDF into an existing ChromaDB persist directory.

    Args:
        pdf_path: Path to the PDF to index.
        persist_dir: Existing directory holding the Chroma store.
        topic: Topic label stored in every document's metadata.
        collection_name: Collection to add to (created if absent).

    Returns:
        The name of the collection that was written to.

    Raises:
        FileNotFoundError: *pdf_path* or *persist_dir* does not exist.
        ValueError: the file is not a PDF, or it yields no usable documents.
    """
    source = Path(pdf_path)
    if not source.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if source.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")

    target_dir = Path(persist_dir)
    if not target_dir.exists():
        raise FileNotFoundError(f"Persist directory not found: {target_dir}")

    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")

    embed_model = initialize_embedding()
    client = chromadb.PersistentClient(path=persist_dir)
    chroma_collection = client.get_or_create_collection(name=collection_name)
    storage = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=chroma_collection)
    )

    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage,
        embed_model=embed_model,
    )
    print(f"πŸ“¦ Added to collection: {collection_name}")
    return collection_name


def list_collections_and_topics(persist_dir: str) -> List[str]:
    """Return "name (topic)" labels for every collection under *persist_dir*.

    The topic is read (best-effort) from the metadata of the first record in
    each collection; unknown or unreadable topics show as "Unknown". Returns
    an empty list when the directory is missing or Chroma cannot be opened.
    """
    if not Path(persist_dir).exists():
        print(f"⚠️ Persist directory does not exist: {persist_dir}")
        return []

    try:
        client = chromadb.PersistentClient(path=persist_dir)
        labels = []

        for collection in client.list_collections():
            topic = "Unknown"
            try:
                sample = collection.get(limit=1)
                if sample and sample['metadatas']:
                    topic = sample['metadatas'][0].get("topic", "Unknown")
            except Exception:
                # Best-effort only — keep the "Unknown" placeholder.
                pass
            labels.append(f"{collection.name} ({topic})")

        return labels
    except Exception as e:
        print(f"Error fetching collections: {e}")
        return []


def run_indexing(pdf_file, topic, mode, collection_name, persist_dir):
    """Dispatch one indexing request and report the outcome as a status string.

    ``mode == "create"`` builds a fresh persist directory; any other mode adds
    to an existing one. All exceptions are converted into an error message so
    the UI never crashes.
    """
    try:
        file_path = str(pdf_file)  # gradio hands us a path-like object

        if mode == "create":
            target = create_vector_index(file_path, persist_dir, topic, collection_name)
        else:
            target = add_files_to_existing_collection(file_path, persist_dir, topic, collection_name)
    except Exception as e:
        return f"❌ Error: {str(e)}"

    return f"βœ… Indexed successfully into collection '{target}'"


def launch_ui():
    """Build and launch the Gradio web UI for indexing PDFs into ChromaDB.

    Layout: upload/topic/mode row, persist-dir/collection-name row, a
    dropdown of existing collections with a refresh button, and status /
    debug text boxes. Blocks until the Gradio server exits.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🧠 PDF Vector Indexer (ChromaDB)")
        gr.Markdown("Upload a PDF, specify a topic, and create or update a vector index with citation-ready metadata.")

        # Row 1: source PDF, its topic label, and create-vs-add mode.
        with gr.Row():
            pdf_input = gr.File(label="Upload PDF")
            topic_input = gr.Textbox(label="Topic")
            mode_input = gr.Radio(choices=["create", "add"], label="Mode", value="create")

        # Row 2: where Chroma persists data and which collection to target.
        with gr.Row():
            persist_dir_input = gr.Textbox(
                    label="Persist Directory", 
                    value="",
                    info="Directory where ChromaDB should store vector embeddings. Must exist or be created during indexing."
            )
            collection_name_input = gr.Textbox(
                label="Collection Name",
                info="Name of the collection to create or add to. Must be specified, e.g. concatenated pdf file name to topic."
            )

        # Dropdown starts empty; it is populated after indexing or on refresh.
        collection_dropdown = gr.Dropdown(label="πŸ“– Existing Collections", choices=[], interactive=True)
        refresh_button = gr.Button("πŸ”„ Refresh Collections")
        result_output = gr.Textbox(label="Status", lines=2)
        debug_output = gr.Textbox(label="Debug Log", lines=2, interactive=False)

        def handle_indexing(pdf_file, topic, mode, name, persist):
            # Run the indexing job, then re-list collections so the dropdown
            # reflects the newly created/updated collection.
            result = run_indexing(pdf_file, topic, mode, name, persist)
            updated = list_collections_and_topics(persist)
            print("πŸ” Collections returned:", updated)
            debug_msg = f"Collections returned: {updated}"
            return result, gr.update(choices=updated, value=None), debug_msg

        index_btn = gr.Button("πŸš€ Run Indexing")
        index_btn.click(
            fn=handle_indexing,
            inputs=[pdf_input, topic_input, mode_input, collection_name_input, persist_dir_input],
            outputs=[result_output, collection_dropdown, debug_output]
        )

        def refresh_dropdown_handler(persist_path):
            # Re-read collections from disk without running any indexing.
            choices = list_collections_and_topics(persist_path)
            print("πŸ”„ Refreshed collections:", choices)
            return gr.update(choices=choices, value=None)

        refresh_button.click(
            fn=refresh_dropdown_handler,
            inputs=[persist_dir_input],
            outputs=[collection_dropdown]
        )

        def handle_collection_selection(selection):
            # Dropdown entries look like "name (topic)" — split them back
            # apart to pre-fill the collection-name and topic inputs.
            if not selection:
                return gr.update(value=""), gr.update(value="")
            try:
                name, topic = selection.strip().rsplit(" (", 1)
                topic = topic.rstrip(")")
                return gr.update(value=name), gr.update(value=topic)
            except Exception:
                # Malformed label: clear both fields rather than crash.
                return gr.update(value=""), gr.update(value="")

        collection_dropdown.change(
            fn=handle_collection_selection,
            inputs=[collection_dropdown],
            outputs=[collection_name_input, topic_input]
        )

    demo.launch()


# Script entry point: start the Gradio web UI.
if __name__ == "__main__":
    launch_ui()