IW2025 committed on
Commit
08caccc
·
verified ·
1 Parent(s): 6b4764f

Upload Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py

Browse files
Create_ChromaDB_w_LlamaIndex_Gradio_WebUI.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Gradio-based web UI for creating a ChromaDB vector database from PDF files.
2
+ # Upload and index PDF documents via browser
3
+ # Create or add to existing collections
4
+ # Display existing collections and their associated topics from the persist_dir
5
+ # Populate a dropdown dynamically with those collection names
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from re import sub
10
+ from typing import List
11
+
12
+ import gradio as gr
13
+ import chromadb
14
+ from llama_index.core import (
15
+ SimpleDirectoryReader,
16
+ VectorStoreIndex,
17
+ StorageContext,
18
+ Document,
19
+ Settings as LlamaSettings
20
+ )
21
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
22
+ from llama_index.vector_stores.chroma import ChromaVectorStore
23
+
24
# Chunking settings. initialize_embedding() copies these onto the global
# LlamaIndex settings object (LlamaSettings.chunk_size / chunk_overlap).
# NOTE(review): units are presumably tokens, per LlamaIndex convention - confirm.
EMBED_CHUNK_SIZE = 512
EMBED_CHUNK_OVERLAP = 50
27
+
28
+
29
def sanitize_metadata(metadata: dict) -> dict:
    """Return a copy of ``metadata`` whose values are all strings.

    ``None`` values are replaced by the empty string; every other value is
    passed through ``str()``. Keys and their order are left untouched.
    """
    cleaned = {}
    for key, value in metadata.items():
        cleaned[key] = "" if value is None else str(value)
    return cleaned
31
+
32
+
33
def sanitize_name(value: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated identifier.

    Each run of non-word characters is collapsed into a single underscore,
    leading/trailing underscores are removed, and the result is lowercased.
    """
    collapsed = sub(r"[^\w]+", "_", value)
    trimmed = collapsed.strip("_")
    return trimmed.lower()
35
+
36
+
37
def load_documents(pdf_path: str, topic: str) -> list:
    """Read a PDF and return LlamaIndex documents tagged with citation metadata.

    Every non-empty document produced by ``SimpleDirectoryReader`` is rebuilt
    as a fresh ``Document`` whose metadata values are all strings and which
    additionally carries:

    * ``topic``  - the caller-supplied topic
    * ``source`` - the PDF's file name (without directories)
    * ``page``   - the page label, when one is available

    Args:
        pdf_path: Path of the PDF file to read.
        topic: Topic label stored on every returned document.

    Returns:
        A list of ``Document`` objects; empty if the reader produced no text.
    """
    source_name = Path(pdf_path).name
    raw_docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    documents = []

    for i, doc in enumerate(raw_docs):
        if not doc.text:
            print(f"⚠️ Skipping empty doc {i}")
            continue

        raw_meta = doc.metadata or {}
        meta = sanitize_metadata(raw_meta)
        meta["topic"] = topic
        meta["source"] = source_name

        # The page label is normally exposed through the reader's metadata
        # ("page_label") rather than as an attribute of the document, so an
        # attribute-only check never populated "page". Prefer the attribute if
        # a reader provides one, otherwise fall back to the metadata entry.
        page_label = getattr(doc, "page_label", None)
        if page_label is None:
            page_label = raw_meta.get("page_label")
        if page_label is not None:
            meta["page"] = str(page_label)

        documents.append(Document(text=doc.text, metadata=meta))

    return documents
56
+
57
+
58
def initialize_embedding() -> HuggingFaceEmbedding:
    """Create the sentence-transformers embedding model used for indexing.

    As a side effect, the module-level chunking constants are written to the
    global LlamaIndex settings object before the model is returned.
    """
    print("πŸ”§ Initializing embedding model...")
    model = HuggingFaceEmbedding(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Global chunking configuration picked up when documents are split.
    LlamaSettings.chunk_size = EMBED_CHUNK_SIZE
    LlamaSettings.chunk_overlap = EMBED_CHUNK_OVERLAP

    return model
64
+
65
+
66
def create_vector_index(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Index a PDF into a collection inside a brand-new Chroma persist directory.

    Args:
        pdf_path: Path of the PDF to index.
        persist_dir: Directory for the Chroma database; it must not exist yet.
        topic: Topic label stored in every chunk's metadata.
        collection_name: Target collection. When empty, a name is derived as
            ``<sanitized pdf stem>_<sanitized topic>``.

    Returns:
        The name of the collection that was written.

    Raises:
        FileNotFoundError: ``pdf_path`` does not exist.
        ValueError: the file is not a ``.pdf`` or yields no usable documents.
        FileExistsError: ``persist_dir`` already exists.
    """
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if pdf_file.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")

    persist_path = Path(persist_dir)
    if persist_path.exists():
        raise FileExistsError(f"Persist directory already exists: {persist_path}")

    if not collection_name:
        topic_safe = sanitize_name(topic)
        pdf_name = sanitize_name(pdf_file.stem)
        collection_name = f"{pdf_name}_{topic_safe}"

    # Parse the PDF and load the embedding model BEFORE touching the
    # filesystem: if either step fails, no empty persist directory is left
    # behind to make the next "create" attempt fail with FileExistsError.
    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")

    embed_model = initialize_embedding()

    persist_path.mkdir(parents=True, exist_ok=True)
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Building the index embeds the documents and writes them to the collection.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    print(f"✅ Created collection: {collection_name}")
    return collection_name
102
+
103
+
104
def add_files_to_existing_collection(pdf_path: str, persist_dir: str, topic: str, collection_name: str):
    """Index a PDF into a collection of an existing Chroma persist directory.

    The collection is fetched with ``get_or_create_collection``, so a name that
    is not present yet results in a new collection inside ``persist_dir``.

    Args:
        pdf_path: Path of the PDF to index.
        persist_dir: Existing directory holding the Chroma database.
        topic: Topic label stored in every chunk's metadata.
        collection_name: Name of the collection to write to (required).

    Returns:
        The name of the collection that was written.

    Raises:
        FileNotFoundError: ``pdf_path`` or ``persist_dir`` does not exist.
        ValueError: the file is not a ``.pdf``, ``collection_name`` is blank,
            or the PDF yields no usable documents.
    """
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if pdf_file.suffix.lower() != ".pdf":
        raise ValueError("Provided file is not a PDF")

    persist_path = Path(persist_dir)
    if not persist_path.exists():
        raise FileNotFoundError(f"Persist directory not found: {persist_path}")

    # Fail fast with a clear message instead of parsing the PDF and loading
    # the embedding model only to have the vector store reject a blank name.
    if not collection_name or not collection_name.strip():
        raise ValueError("Collection name must be specified when adding to a collection")

    documents = load_documents(pdf_path, topic)
    if not documents:
        raise ValueError("No valid documents found in PDF")

    embed_model = initialize_embedding()
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Building the index embeds the documents and writes them to the collection.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
    print(f"📦 Added to collection: {collection_name}")
    return collection_name
133
+
134
+
135
def list_collections_and_topics(persist_dir: str) -> List[str]:
    """List the collections in a Chroma persist directory with their topics.

    The topic is read from the ``topic`` metadata key of the first record in
    each collection; ``"Unknown"`` is used when it cannot be determined.

    Args:
        persist_dir: Directory holding the Chroma database.

    Returns:
        Labels of the form ``"<collection name> (<topic>)"``. An empty list is
        returned when the directory is blank, missing, or cannot be opened.
    """
    # A blank path would resolve to the current working directory and make
    # the client open (and possibly initialise) a database there.
    if not persist_dir or not str(persist_dir).strip():
        print("⚠️ Persist directory not specified")
        return []

    persist_path = Path(persist_dir)
    if not persist_path.exists():
        print(f"⚠️ Persist directory does not exist: {persist_dir}")
        return []

    try:
        chroma_client = chromadb.PersistentClient(path=persist_dir)
        collections = chroma_client.list_collections()
    except Exception as e:
        print(f"Error fetching collections: {e}")
        return []

    items = []
    for entry in collections:
        # Depending on the ChromaDB release, list_collections() yields either
        # Collection objects or plain collection names; support both.
        by_name = isinstance(entry, str)
        name = entry if by_name else entry.name
        topic = "Unknown"
        try:
            col = chroma_client.get_collection(name) if by_name else entry
            docs = col.get(limit=1)
            metadatas = docs["metadatas"] if docs else None
            # An empty collection or a record without metadata keeps "Unknown".
            if metadatas and metadatas[0]:
                topic = metadatas[0].get("topic", "Unknown")
        except Exception as e:
            # Best effort: still list the collection, but say why the topic
            # could not be read instead of hiding the error.
            print(f"⚠️ Could not read topic for collection '{name}': {e}")
        items.append(f"{name} ({topic})")

    return items
161
+
162
+
163
def run_indexing(pdf_file, topic, mode, collection_name, persist_dir):
    """Run one indexing request from the UI and report the outcome as text.

    Args:
        pdf_file: Uploaded file as delivered by the file component: a path
            string / path-like object, or a file wrapper exposing the path
            through ``.name``. ``None`` when nothing was uploaded.
        topic: Topic label for the indexed chunks.
        mode: ``"create"`` builds a new persist directory; any other value
            adds to an existing one.
        collection_name: Target collection name.
        persist_dir: Directory of the Chroma database.

    Returns:
        A human-readable status line. Errors are never raised to the caller;
        they are reported in the returned string.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was uploaded")
        if not persist_dir or not str(persist_dir).strip():
            raise ValueError("Persist directory must be specified")

        if isinstance(pdf_file, (str, os.PathLike)):
            file_path = str(os.fspath(pdf_file))
        else:
            # File wrapper objects carry the on-disk path in ``.name``;
            # str() on them would give a repr, not a path.
            file_path = str(getattr(pdf_file, "name", None) or pdf_file)

        if mode == "create":
            collection_used = create_vector_index(file_path, persist_dir, topic, collection_name)
        else:
            collection_used = add_files_to_existing_collection(file_path, persist_dir, topic, collection_name)

        return f"✅ Indexed successfully into collection '{collection_used}'"
    except Exception as e:
        # UI boundary: turn any failure into a status message.
        return f"❌ Error: {str(e)}"
175
+
176
+
177
def launch_ui():
    """Build the Gradio interface and start the web server.

    Layout: a row with the PDF upload, topic and mode inputs; a row with the
    persist directory and collection name; then the dropdown of existing
    collections, a refresh button, status/debug boxes and the run button.

    Wiring:
      * "Run Indexing" calls ``run_indexing`` and refreshes the dropdown.
      * "Refresh Collections" reloads the dropdown from the persist directory.
      * Picking a dropdown entry copies its collection name and topic into
        the corresponding text boxes.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🧠 PDF Vector Indexer (ChromaDB)")
        gr.Markdown("Upload a PDF, specify a topic, and create or update a vector index with citation-ready metadata.")

        with gr.Row():
            pdf_input = gr.File(label="Upload PDF")
            topic_input = gr.Textbox(label="Topic")
            mode_input = gr.Radio(choices=["create", "add"], label="Mode", value="create")

        with gr.Row():
            persist_dir_input = gr.Textbox(
                label="Persist Directory",
                value="",
                info="Directory where ChromaDB should store vector embeddings. Must exist or be created during indexing."
            )
            collection_name_input = gr.Textbox(
                label="Collection Name",
                info="Name of the collection to create or add to. Must be specified, e.g. concatenated pdf file name to topic."
            )

        collection_dropdown = gr.Dropdown(label="📖 Existing Collections", choices=[], interactive=True)
        refresh_button = gr.Button("🔄 Refresh Collections")
        result_output = gr.Textbox(label="Status", lines=2)
        debug_output = gr.Textbox(label="Debug Log", lines=2, interactive=False)

        def handle_indexing(pdf_file, topic, mode, name, persist):
            """Index the upload, then reload the collection list for the dropdown."""
            result = run_indexing(pdf_file, topic, mode, name, persist)
            updated = list_collections_and_topics(persist)
            print("🔍 Collections returned:", updated)
            debug_msg = f"Collections returned: {updated}"
            return result, gr.update(choices=updated, value=None), debug_msg

        index_btn = gr.Button("🚀 Run Indexing")
        index_btn.click(
            fn=handle_indexing,
            inputs=[pdf_input, topic_input, mode_input, collection_name_input, persist_dir_input],
            outputs=[result_output, collection_dropdown, debug_output]
        )

        def refresh_dropdown_handler(persist_path):
            """Reload the dropdown choices from the given persist directory."""
            choices = list_collections_and_topics(persist_path)
            print("🔄 Refreshed collections:", choices)
            return gr.update(choices=choices, value=None)

        refresh_button.click(
            fn=refresh_dropdown_handler,
            inputs=[persist_dir_input],
            outputs=[collection_dropdown]
        )

        def handle_collection_selection(selection):
            """Split a ``"<name> (<topic>)"`` label back into name and topic."""
            blank = gr.update(value=""), gr.update(value="")
            if not selection:
                return blank
            # Split on the FIRST " (" and drop only the final ")" so topics
            # that themselves contain parentheses survive the round trip.
            # This relies on collection names never containing " (", which
            # Chroma's naming rules (no spaces) are expected to guarantee.
            name, sep, topic = selection.strip().partition(" (")
            if not sep or not topic.endswith(")"):
                return blank
            return gr.update(value=name), gr.update(value=topic[:-1])

        collection_dropdown.change(
            fn=handle_collection_selection,
            inputs=[collection_dropdown],
            outputs=[collection_name_input, topic_input]
        )

    demo.launch()
245
+
246
+
247
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    launch_ui()
249
+