Spaces:
Sleeping
Sleeping
| # Gradio-based web UI for building a vector database from PDF files. | |
| # Upload and index PDF documents via browser | |
| # Create or add to existing collections | |
| # Display existing collections and their associated topics from the persist_dir | |
| # Populate a dropdown dynamically with those collection names | |
| import os | |
| from pathlib import Path | |
| from re import sub | |
| from typing import List | |
| import gradio as gr | |
| import chromadb | |
| from llama_index.core import ( | |
| SimpleDirectoryReader, | |
| VectorStoreIndex, | |
| StorageContext, | |
| Document, | |
| Settings as LlamaSettings | |
| ) | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.vector_stores.chroma import ChromaVectorStore | |
# Chunking settings
# initialize_embedding() copies these onto the global LlamaIndex settings
# object as `chunk_size` and `chunk_overlap` respectively.
EMBED_CHUNK_SIZE = 512
EMBED_CHUNK_OVERLAP = 50
def sanitize_metadata(metadata: dict) -> dict:
    """Return a copy of ``metadata`` in which every value is a string.

    ``None`` values become the empty string; every other value is converted
    with ``str``. Keys are left untouched and the input is not modified.
    """
    cleaned = {}
    for key, value in metadata.items():
        cleaned[key] = "" if value is None else str(value)
    return cleaned
def sanitize_name(value: str) -> str:
    """Reduce ``value`` to a lowercase, underscore-separated token.

    Each run of non-word characters is replaced by a single underscore,
    leading and trailing underscores are removed, and the result is
    lowercased.
    """
    # Collapse every run of non-word characters into one underscore.
    collapsed = sub(r"[^\w]+", "_", value)
    # Drop separators left over at either end.
    trimmed = collapsed.strip("_")
    return trimmed.lower()
def load_documents(pdf_path: str, topic: str) -> list:
    """Read a PDF and return one ``Document`` per non-empty piece of text.

    Each returned document carries the reader's metadata (stringified by
    ``sanitize_metadata``) plus three citation fields:

    * ``topic``  - the ``topic`` argument, verbatim
    * ``source`` - the PDF's file name (no directory part)
    * ``page``   - the page label, when one can be found

    Args:
        pdf_path: Path of the PDF file to read.
        topic: Topic label stored in every document's metadata.

    Returns:
        A list of ``Document`` objects; empty if the reader produced no
        usable text.
    """
    source_name = Path(pdf_path).name
    raw_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

    documents = []
    for i, doc in enumerate(raw_docs):
        # Whitespace-only text (e.g. a blank or image-only page) has nothing
        # to embed, so treat it the same as empty text.
        if not doc.text or not doc.text.strip():
            print(f"β οΈ Skipping empty doc {i}")
            continue

        meta = sanitize_metadata(doc.metadata or {})
        meta["topic"] = topic
        meta["source"] = source_name

        # Prefer an attribute if the document object exposes one, otherwise
        # fall back to the metadata dict.
        # NOTE(review): the LlamaIndex PDF reader is documented to store the
        # label under metadata["page_label"] - confirm for the installed version.
        page_label = getattr(doc, "page_label", None)
        if page_label is None:
            page_label = meta.get("page_label")
        if page_label not in (None, ""):
            meta["page"] = str(page_label)

        documents.append(Document(text=doc.text, metadata=meta))
    return documents
def initialize_embedding() -> HuggingFaceEmbedding:
    """Build the embedding model and apply the module's chunking settings.

    Side effect: sets ``chunk_size`` and ``chunk_overlap`` on the global
    ``LlamaSettings`` object from ``EMBED_CHUNK_SIZE`` and
    ``EMBED_CHUNK_OVERLAP``.

    Returns:
        A ``HuggingFaceEmbedding`` for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    print("π§ Initializing embedding model...")
    model = HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # The chunking parameters live on the shared settings object, not on the
    # model, so they are set here as a global side effect.
    LlamaSettings.chunk_size = EMBED_CHUNK_SIZE
    LlamaSettings.chunk_overlap = EMBED_CHUNK_OVERLAP
    return model
def create_vector_index(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Index one PDF into a brand-new ChromaDB persist directory.

    All validation, PDF parsing and embedding-model setup happen *before*
    the persist directory is created. A failure in any of those steps
    therefore leaves nothing on disk, and the same call can simply be
    retried (previously the empty directory left behind made every retry
    fail with ``FileExistsError``).

    Args:
        pdf_path: Path to the PDF to index.
        persist_dir: Directory for the Chroma store; must not exist yet.
        topic: Topic label stored in each document's metadata.
        collection_name: Target collection. When empty, a name is derived
            from the sanitized PDF stem and topic.

    Returns:
        The name of the collection that was written.

    Raises:
        FileNotFoundError: ``pdf_path`` does not exist.
        ValueError: the file is not a ``.pdf``, no collection name could be
            derived, or the PDF yielded no usable documents.
        FileExistsError: ``persist_dir`` already exists.
    """
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if pdf_file.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")

    persist_path = Path(persist_dir)
    if persist_path.exists():
        raise FileExistsError(f"Persist directory already exists: {persist_path}")

    if not collection_name:
        # Join only the non-empty parts so an empty topic does not produce a
        # name with a dangling "_" separator.
        name_parts = (sanitize_name(pdf_file.stem), sanitize_name(topic))
        collection_name = "_".join(part for part in name_parts if part)
        if not collection_name:
            raise ValueError("Could not derive a collection name from the file name and topic")

    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")

    embed_model = initialize_embedding()

    # Create the directory only once everything that can fail cheaply has
    # succeeded.
    persist_path.mkdir(parents=True, exist_ok=True)
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    print(f"β Created collection: {collection_name}")
    return collection_name
def add_files_to_existing_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Index one PDF into a collection inside an existing persist directory.

    The collection is opened with ``get_or_create_collection``, so a name
    that does not exist yet in ``persist_dir`` is created rather than
    rejected.

    Args:
        pdf_path: Path to the PDF to index.
        persist_dir: Existing Chroma persist directory.
        topic: Topic label stored in each document's metadata.
        collection_name: Collection to write into; must be non-blank.

    Returns:
        ``collection_name``, unchanged.

    Raises:
        FileNotFoundError: ``pdf_path`` or ``persist_dir`` does not exist.
        ValueError: the file is not a ``.pdf``, ``collection_name`` is
            blank, or the PDF yielded no usable documents.
    """
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if pdf_file.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")

    persist_path = Path(persist_dir)
    if not persist_path.exists():
        raise FileNotFoundError(f"Persist directory not found: {persist_path}")

    # Fail fast: without this check a blank name is only rejected by the
    # vector store, after the PDF has been parsed and the model loaded.
    if not collection_name or not collection_name.strip():
        raise ValueError("Collection name is required when adding to an existing collection")

    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")

    embed_model = initialize_embedding()
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    print(f"π¦ Added to collection: {collection_name}")
    return collection_name
def list_collections_and_topics(persist_dir: str) -> List[str]:
    """List the collections under ``persist_dir`` as ``"name (topic)"`` labels.

    The topic is read from the ``topic`` metadata field of one stored item
    of the collection; ``"Unknown"`` is used when the collection is empty,
    has no such field, or cannot be read.

    Args:
        persist_dir: Chroma persist directory to inspect.

    Returns:
        One label per collection. An empty list if the directory does not
        exist or the collections cannot be listed at all (the error is
        printed, not raised).
    """
    persist_path = Path(persist_dir)
    if not persist_path.exists():
        print(f"β οΈ Persist directory does not exist: {persist_dir}")
        return []

    try:
        chroma_client = chromadb.PersistentClient(path=persist_dir)
        items = []
        for entry in chroma_client.list_collections():
            # Depending on the chromadb release, list_collections() yields
            # either collection objects or bare collection names; accept both.
            name = entry if isinstance(entry, str) else entry.name
            topic = "Unknown"
            try:
                col = chroma_client.get_collection(name) if isinstance(entry, str) else entry
                docs = col.get(limit=1)
                if docs and docs["metadatas"]:
                    # An item stored without metadata comes back as None.
                    first_meta = docs["metadatas"][0] or {}
                    topic = first_meta.get("topic", "Unknown")
            except Exception as e:
                # Best effort: still list the collection, but report why its
                # topic could not be read instead of hiding the error.
                print(f"Could not read topic for collection {name}: {e}")
            items.append(f"{name} ({topic})")
        return items
    except Exception as e:
        print(f"Error fetching collections: {e}")
        return []
def run_indexing(pdf_file, topic, mode, collection_name, persist_dir):
    """Run one indexing request from the UI and return a status message.

    Never raises: every failure is turned into an error string so it can be
    shown in the status box.

    Args:
        pdf_file: The uploaded file as delivered by the upload widget: a
            path (``str`` / ``os.PathLike``) or an object whose ``.name``
            is the path. ``None`` when nothing was uploaded.
        topic: Topic label for the indexed documents.
        mode: ``"create"`` builds a new index; any other value adds to an
            existing one.
        collection_name: Target collection name.
        persist_dir: Chroma persist directory.

    Returns:
        A success message naming the collection used, or an error message.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was uploaded")

        if isinstance(pdf_file, (str, os.PathLike)):
            file_path = str(pdf_file)
        else:
            # NOTE(review): some Gradio versions/configurations hand over a
            # temp-file wrapper rather than a path; its .name is the path on
            # disk, whereas str() of the wrapper is just an object repr.
            file_path = str(getattr(pdf_file, "name", pdf_file))

        # A blank directory would resolve to the current working directory
        # and produce a misleading "already exists" error in create mode.
        if not persist_dir or not str(persist_dir).strip():
            raise ValueError("Persist directory must be specified")

        if mode == "create":
            collection_used = create_vector_index(file_path, persist_dir, topic, collection_name)
        else:
            collection_used = add_files_to_existing_collection(file_path, persist_dir, topic, collection_name)
        return f"β Indexed successfully into collection '{collection_used}'"
    except Exception as e:
        return f"β Error: {str(e)}"
def launch_ui():
    """Build the Gradio interface, wire its event handlers, and launch it.

    The UI offers a PDF upload, topic / mode / persist-directory /
    collection-name inputs, a dropdown of existing collections with a
    refresh button, and status / debug text boxes.

    NOTE(review): the source this was recovered from had lost its
    indentation, so which components sit inside each ``gr.Row`` is inferred -
    confirm against the intended layout.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# π§ PDF Vector Indexer (ChromaDB)")
        gr.Markdown("Upload a PDF, specify a topic, and create or update a vector index with citation-ready metadata.")

        with gr.Row():
            pdf_input = gr.File(label="Upload PDF")
            topic_input = gr.Textbox(label="Topic")
            mode_input = gr.Radio(choices=["create", "add"], label="Mode", value="create")

        with gr.Row():
            persist_dir_input = gr.Textbox(
                label="Persist Directory",
                value="",
                info="Directory where ChromaDB should store vector embeddings. Must exist or be created during indexing."
            )
            collection_name_input = gr.Textbox(
                label="Collection Name",
                info="Name of the collection to create or add to. Must be specified, e.g. concatenated pdf file name to topic."
            )

        collection_dropdown = gr.Dropdown(label="π Existing Collections", choices=[], interactive=True)
        refresh_button = gr.Button("π Refresh Collections")
        result_output = gr.Textbox(label="Status", lines=2)
        debug_output = gr.Textbox(label="Debug Log", lines=2, interactive=False)

        def handle_indexing(pdf_file, topic, mode, name, persist):
            """Index the upload, then reload the dropdown from ``persist``."""
            result = run_indexing(pdf_file, topic, mode, name, persist)
            updated = list_collections_and_topics(persist)
            print("π Collections returned:", updated)
            debug_msg = f"Collections returned: {updated}"
            return result, gr.update(choices=updated, value=None), debug_msg

        index_btn = gr.Button("π Run Indexing")
        index_btn.click(
            fn=handle_indexing,
            inputs=[pdf_input, topic_input, mode_input, collection_name_input, persist_dir_input],
            outputs=[result_output, collection_dropdown, debug_output]
        )

        def refresh_dropdown_handler(persist_path):
            """Reload the dropdown choices from ``persist_path``."""
            choices = list_collections_and_topics(persist_path)
            print("π Refreshed collections:", choices)
            return gr.update(choices=choices, value=None)

        refresh_button.click(
            fn=refresh_dropdown_handler,
            inputs=[persist_dir_input],
            outputs=[collection_dropdown]
        )

        def handle_collection_selection(selection):
            """Copy a ``"name (topic)"`` dropdown entry into the two text boxes.

            Both boxes are blanked when nothing is selected or the entry
            does not have the expected shape.
            """
            blank = (gr.update(value=""), gr.update(value=""))
            if not selection or not isinstance(selection, str):
                return blank

            # Split at the FIRST " (" and drop exactly ONE closing paren, so
            # a topic such as "Finance (2023)" survives the round trip.
            # NOTE(review): relies on collection names never containing " (";
            # Chroma's naming rules are understood to forbid spaces - confirm.
            name, separator, topic = selection.strip().partition(" (")
            if not separator:
                return blank
            if topic.endswith(")"):
                topic = topic[:-1]
            return gr.update(value=name), gr.update(value=topic)

        collection_dropdown.change(
            fn=handle_collection_selection,
            inputs=[collection_dropdown],
            outputs=[collection_name_input, topic_input]
        )

    demo.launch()
# Start the web UI only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    launch_ui()