Hebaelsayed committed on
Commit
6d08aa2
·
verified ·
1 Parent(s): c89d7a4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +589 -465
src/streamlit_app.py CHANGED
@@ -1,553 +1,677 @@
1
  import streamlit as st
2
  import os
3
- from anthropic import Anthropic
4
- from qdrant_client import QdrantClient
5
- from qdrant_client.models import Distance, VectorParams, PointStruct
6
- from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # ============================================================================
9
  # CONFIGURATION
10
  # ============================================================================
11
 
12
  st.set_page_config(
13
- page_title="Math AI - Phase 2: Database",
14
  page_icon="πŸ—„οΈ",
15
  layout="wide"
16
  )
17
 
18
- COLLECTION_NAME = "math_knowledge_base"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # ============================================================================
21
- # CACHED FUNCTIONS - LOAD ONCE
22
  # ============================================================================
23
 
24
- @st.cache_resource(show_spinner="πŸ”Œ Connecting to Qdrant...")
25
- def get_qdrant_client():
26
- """Cache Qdrant client - only connects once"""
27
- qdrant_url = os.getenv("QDRANT_URL")
28
- qdrant_api_key = os.getenv("QDRANT_API_KEY")
29
-
30
- if not qdrant_url or not qdrant_api_key:
31
- return None
32
-
33
- return QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
34
 
35
- @st.cache_resource(show_spinner="πŸ€– Loading embedding model (first time: 30-60s)...")
36
- def get_embedding_model():
37
- """Cache embedding model - only loads once"""
38
- try:
39
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
40
- return model
41
- except Exception as e:
42
- st.error(f"Failed to load model: {e}")
43
- return None
44
 
45
- def get_vector_count_reliable(client, collection_name):
46
- """Get vector count with multiple fallback methods"""
 
 
47
  try:
48
- # Method 1: Try scroll to count
49
- scroll_result = client.scroll(
50
- collection_name=collection_name,
51
- limit=1,
52
- with_payload=False,
53
- with_vectors=False
54
  )
55
 
56
- # If scroll returns None, collection might be empty
57
- if scroll_result is None or scroll_result[0] is None:
58
- return 0
59
 
60
- # Method 2: Try collection info
61
- try:
62
- info = client.get_collection(collection_name)
63
-
64
- # Try different attribute names
65
- if hasattr(info, 'points_count') and info.points_count is not None:
66
- return info.points_count
67
- elif hasattr(info, 'vectors_count') and info.vectors_count is not None:
68
- return info.vectors_count
69
- except:
70
- pass
71
 
72
- # Method 3: Count by scrolling through all
73
- try:
74
- count = 0
75
- offset = None
76
- while True:
77
- result = client.scroll(
78
  collection_name=collection_name,
79
- limit=100,
80
- offset=offset,
81
  with_payload=False,
82
  with_vectors=False
83
  )
84
 
85
- if result is None or result[0] is None or len(result[0]) == 0:
86
- break
 
 
 
87
 
88
- count += len(result[0])
89
- offset = result[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- if offset is None:
92
- break
93
-
94
- return count
95
- except:
96
- return 0
97
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  except Exception as e:
99
- st.warning(f"Could not get vector count: {e}")
100
- return 0
101
-
102
- def check_collection_exists(client, collection_name):
103
- """Check if collection exists"""
104
- try:
105
- collections = client.get_collections().collections
106
- return any(c.name == collection_name for c in collections)
107
- except:
108
- return False
109
 
110
  # ============================================================================
111
- # INITIALIZE SESSION STATE
112
  # ============================================================================
113
 
114
- if 'db_created' not in st.session_state:
115
- st.session_state.db_created = False
116
-
117
- if 'embedder_ready' not in st.session_state:
118
- st.session_state.embedder_ready = False
119
-
120
- if 'show_step' not in st.session_state:
121
- st.session_state.show_step = 'all'
122
 
123
- # ============================================================================
124
- # MAIN APP
125
- # ============================================================================
126
-
127
- st.title("πŸ—„οΈ Phase 2: Vector Database Setup")
128
-
129
- # Quick Navigation
130
- with st.sidebar:
131
- st.header("⚑ Quick Navigation")
132
- st.caption("Jump to specific steps (saves time!)")
133
-
134
- if st.button("πŸ“‹ Show All Steps", use_container_width=True):
135
- st.session_state.show_step = 'all'
136
 
137
- if st.button("πŸš€ Skip to Upload (Step 5)", use_container_width=True):
138
- st.session_state.show_step = 'upload'
139
 
140
- if st.button("πŸ” Skip to Search (Step 6)", use_container_width=True):
141
- st.session_state.show_step = 'search'
 
142
 
143
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
- # Status indicators
146
- st.subheader("πŸ“Š System Status")
147
- client = get_qdrant_client()
148
- embedder = get_embedding_model()
149
 
150
- if client and check_collection_exists(client, COLLECTION_NAME):
151
- st.success("βœ… Database Ready")
152
- st.session_state.db_created = True
153
- else:
154
- st.warning("⚠️ Database Not Ready")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- if embedder:
157
- st.success("βœ… Model Loaded")
158
- st.session_state.embedder_ready = True
159
- else:
160
- st.warning("⚠️ Model Not Loaded")
161
 
162
- # Vector count
163
- if client and st.session_state.db_created:
164
- count = get_vector_count_reliable(client, COLLECTION_NAME)
165
- st.metric("Vectors in DB", f"{count:,}")
166
-
167
- # Get cached resources
168
- client = get_qdrant_client()
169
- embedder = get_embedding_model()
170
-
171
- # ============================================================================
172
- # CONDITIONAL DISPLAY BASED ON show_step
173
- # ============================================================================
174
-
175
- show_all = st.session_state.show_step == 'all'
176
- show_upload = st.session_state.show_step in ['all', 'upload']
177
- show_search = st.session_state.show_step in ['all', 'search']
178
-
179
- # ============================================================================
180
- # STEP 1-2: Quick Status (Always Show)
181
- # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- if show_all:
184
- st.header("Step 1-2: System Check")
185
-
186
- col1, col2, col3 = st.columns(3)
187
-
188
- with col1:
189
- st.metric("Claude API", "βœ…" if os.getenv("ANTHROPIC_API_KEY") else "❌")
190
 
191
- with col2:
192
- st.metric("Qdrant", "βœ… Connected" if client else "❌")
 
193
 
194
- with col3:
195
- st.metric("Embedder", "βœ… Cached" if embedder else "❌")
196
-
197
- if not client:
198
- st.error("⚠️ Qdrant connection failed. Check secrets!")
199
- st.stop()
200
-
201
- st.markdown("---")
202
-
203
- # ============================================================================
204
- # STEP 3: Collection Management
205
- # ============================================================================
206
-
207
- if show_all:
208
- st.header("πŸ—οΈ Step 3: Database Collection")
209
-
210
- if st.session_state.db_created:
211
- st.success(f"βœ… Collection '{COLLECTION_NAME}' is ready!")
212
 
213
  col1, col2 = st.columns(2)
 
214
  with col1:
215
- if st.button("πŸ”„ Recreate Collection"):
216
- try:
217
- client.delete_collection(COLLECTION_NAME)
218
- st.session_state.db_created = False
219
- st.rerun()
220
- except Exception as e:
221
- st.error(f"Error: {e}")
 
 
 
 
 
 
222
 
223
  with col2:
224
- if st.button("ℹ️ Collection Info"):
225
- count = get_vector_count_reliable(client, COLLECTION_NAME)
226
- st.json({
227
- "name": COLLECTION_NAME,
228
- "vectors": count,
229
- "status": "Ready"
230
- })
 
 
 
 
 
 
 
 
 
231
 
232
- else:
233
- if st.button("πŸ—οΈ CREATE COLLECTION", type="primary"):
234
- try:
235
- client.create_collection(
236
- collection_name=COLLECTION_NAME,
237
- vectors_config=VectorParams(size=384, distance=Distance.COSINE)
238
- )
239
- st.success(f"πŸŽ‰ Created: {COLLECTION_NAME}")
240
- st.session_state.db_created = True
241
- st.rerun()
242
- except Exception as e:
243
- st.error(f"❌ Failed: {str(e)}")
244
 
245
  st.markdown("---")
246
-
247
- # ============================================================================
248
- # STEP 4: Embedding Model (Already Loaded via Cache)
249
- # ============================================================================
250
-
251
- if show_all:
252
- st.header("πŸ€– Step 4: Embedding Model")
253
 
254
- if embedder:
255
- st.success("βœ… Model loaded and cached!")
256
- st.session_state.embedder_ready = True
257
- else:
258
- st.warning("⚠️ Model loading failed. Try refreshing page.")
259
 
260
- st.markdown("---")
 
 
261
 
262
  # ============================================================================
263
- # STEP 5A: Upload Custom Text
264
  # ============================================================================
265
 
266
- if show_upload:
267
- st.header("πŸ“ Step 5A: Upload Custom Math Notes")
268
 
269
- if not st.session_state.db_created or not st.session_state.embedder_ready:
270
- st.error("⚠️ Complete Steps 3 & 4 first (or check sidebar status)")
271
- else:
272
- with st.expander("✍️ Paste text to upload", expanded=True):
273
-
274
- custom_text = st.text_area(
275
- "Math notes:",
276
- value="""Linear Equations: ax + b = 0, solution is x = -b/a
277
-
278
- Quadratic Equations: axΒ² + bx + c = 0
279
- Solution: x = (-b ± √(b²-4ac)) / 2a
280
- Example: xΒ² + 5x - 4 = 0
281
- x = (-5 ± √(25+16)) / 2
282
- x = (-5 ± √41) / 2
283
-
284
- Pythagorean Theorem: aΒ² + bΒ² = cΒ²
285
- For right triangles with sides a, b and hypotenuse c
286
-
287
- Derivatives:
288
- d/dx(xⁿ) = nxⁿ⁻¹
289
- d/dx(sin x) = cos x
290
- d/dx(cos x) = -sin x
291
- d/dx(eΛ£) = eΛ£""",
292
- height=200
293
- )
294
 
295
- source_name = st.text_input("Source name:", value="math_notes.txt")
296
 
297
- if st.button("πŸš€ UPLOAD TEXT", type="primary"):
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
- if not custom_text.strip():
300
- st.error("Please enter text!")
301
- else:
302
- try:
303
- progress = st.progress(0)
304
- status = st.empty()
305
-
306
- # Chunk
307
- status.text("πŸ“„ Chunking text...")
308
- progress.progress(0.2)
309
-
310
- words = custom_text.split()
311
- chunks = []
312
- chunk_size = 50
313
-
314
- for i in range(0, len(words), 40):
315
- chunk = ' '.join(words[i:i + chunk_size])
316
- if chunk.strip():
317
- chunks.append(chunk)
318
-
319
- st.write(f"βœ… Created {len(chunks)} chunks")
320
-
321
- # Embed
322
- status.text("πŸ”’ Generating embeddings...")
323
- progress.progress(0.5)
324
-
325
- embeddings = embedder.encode(chunks, show_progress_bar=False)
326
- st.write(f"βœ… Generated {len(embeddings)} embeddings")
327
-
328
- # Upload
329
- status.text("☁️ Uploading to Qdrant...")
330
- progress.progress(0.8)
331
-
332
- points = []
333
- for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
334
- points.append(PointStruct(
335
- id=abs(hash(f"{source_name}_{idx}_{custom_text[:50]}")) % (2**63),
336
- vector=embedding.tolist(),
337
- payload={
338
- "content": chunk,
339
- "source_name": source_name,
340
- "source_type": "custom_notes",
341
- "chunk_index": idx
342
- }
343
- ))
344
-
345
- client.upsert(collection_name=COLLECTION_NAME, points=points)
346
-
347
- progress.progress(1.0)
348
- status.empty()
349
-
350
- st.success(f"πŸŽ‰ Uploaded {len(points)} vectors!")
351
-
352
- # Get count
353
- count = get_vector_count_reliable(client, COLLECTION_NAME)
354
- st.info(f"πŸ“Š **Total vectors in database: {count:,}**")
355
-
356
- except Exception as e:
357
- st.error(f"❌ Failed: {str(e)}")
358
- st.exception(e)
359
-
360
- st.markdown("---")
361
-
362
- # ============================================================================
363
- # STEP 5B: Load Public Datasets
364
- # ============================================================================
365
-
366
- if show_upload:
367
- st.header("πŸ“š Step 5B: Load Public Datasets")
368
-
369
- if not st.session_state.db_created or not st.session_state.embedder_ready:
370
- st.error("⚠️ Complete Steps 3 & 4 first")
371
- else:
372
- with st.expander("πŸ“Š Load from Hugging Face", expanded=False):
373
-
374
- dataset_choice = st.selectbox(
375
- "Dataset:",
376
- [
377
- "GSM8K - Grade School Math",
378
- "MATH - Competition Math",
379
- "RACE - Reading Comprehension"
380
  ]
381
- )
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
- sample_size = st.slider("Items to load:", 10, 500, 50)
 
 
384
 
385
- if st.button("πŸ“₯ LOAD DATASET", type="primary"):
 
 
 
 
 
 
 
 
386
 
387
- try:
388
- from datasets import load_dataset
389
-
390
- progress = st.progress(0)
391
- status = st.empty()
392
-
393
- # Load dataset
394
- status.text(f"πŸ“₯ Downloading {dataset_choice.split('-')[0].strip()}...")
395
- progress.progress(0.1)
396
-
397
- if "GSM8K" in dataset_choice:
398
- dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
399
- dataset_name = "GSM8K"
400
-
401
- texts = []
402
- for i in range(min(sample_size, len(dataset))):
403
- item = dataset[i]
404
- text = f"Problem: {item['question']}\n\nSolution: {item['answer']}"
405
- texts.append(text)
406
-
407
- elif "MATH" in dataset_choice:
408
- dataset = load_dataset("hendrycks/competition_math", split="train", trust_remote_code=True)
409
- dataset_name = "MATH"
410
-
411
- texts = []
412
- for i in range(min(sample_size, len(dataset))):
413
- item = dataset[i]
414
- text = f"Problem ({item['type']}): {item['problem']}\n\nSolution: {item['solution']}"
415
- texts.append(text)
416
-
417
- else:
418
- dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
419
- dataset_name = "RACE"
420
-
421
- texts = []
422
- for i in range(min(sample_size, len(dataset))):
423
- item = dataset[i]
424
- text = f"Article: {item['article'][:500]}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}"
425
- texts.append(text)
426
-
427
- st.write(f"βœ… Loaded {len(texts)} items")
428
- progress.progress(0.3)
429
-
430
- # Embed
431
- status.text("πŸ”’ Generating embeddings...")
432
- embeddings = []
433
-
434
- for idx, text in enumerate(texts):
435
- embedding = embedder.encode(text)
436
- embeddings.append(embedding)
437
-
438
- if idx % 10 == 0:
439
- progress.progress(0.3 + (0.5 * idx / len(texts)))
440
- status.text(f"πŸ”’ Embedding {idx+1}/{len(texts)}")
441
-
442
- st.write(f"βœ… Generated {len(embeddings)} embeddings")
443
- progress.progress(0.8)
444
-
445
- # Upload
446
- status.text("☁️ Uploading to Qdrant...")
447
-
448
- points = []
449
- for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
450
- content = text[:2000] if len(text) > 2000 else text
451
-
452
- points.append(PointStruct(
453
- id=abs(hash(f"{dataset_name}_{idx}")) % (2**63),
454
- vector=embedding.tolist(),
455
- payload={
456
- "content": content,
457
- "source_name": dataset_name,
458
- "source_type": "public_dataset",
459
- "dataset": dataset_name,
460
- "index": idx
461
- }
462
- ))
463
-
464
- client.upsert(collection_name=COLLECTION_NAME, points=points)
465
-
466
- progress.progress(1.0)
467
- status.empty()
468
-
469
- st.success(f"πŸŽ‰ Uploaded {len(points)} vectors from {dataset_name}!")
470
-
471
- # Get count
472
- count = get_vector_count_reliable(client, COLLECTION_NAME)
473
- st.info(f"πŸ“Š **Total vectors in database: {count:,}**")
474
-
475
- except ImportError:
476
- st.error("❌ Add 'datasets' to requirements.txt")
477
- except Exception as e:
478
- st.error(f"❌ Failed: {str(e)}")
479
- st.exception(e)
480
 
481
- st.markdown("---")
 
482
 
483
  # ============================================================================
484
- # STEP 6: Search
485
  # ============================================================================
486
 
487
- if show_search:
488
- st.header("πŸ” Step 6: Test Search")
489
 
490
- if not st.session_state.db_created or not st.session_state.embedder_ready:
491
- st.error("⚠️ Database and embedder must be ready")
492
- else:
493
- search_query = st.text_input(
494
- "Question:",
495
- placeholder="Solve xΒ² + 5x - 4 = 0"
 
496
  )
497
 
498
- col1, col2 = st.columns([3, 1])
499
- with col1:
500
- top_k = st.slider("Results:", 1, 10, 5)
501
 
502
- with col2:
503
- st.metric("DB Vectors", get_vector_count_reliable(client, COLLECTION_NAME))
 
504
 
505
- if st.button("πŸ” SEARCH", type="primary") and search_query:
 
506
 
507
- try:
508
- with st.spinner("Searching..."):
509
-
510
- query_embedding = embedder.encode(search_query)
 
 
 
 
511
 
512
- results = client.search(
513
- collection_name=COLLECTION_NAME,
514
- query_vector=query_embedding.tolist(),
515
- limit=top_k
 
 
516
  )
517
 
518
- if results:
519
- st.success(f"βœ… Found {len(results)} results!")
520
-
521
- for i, result in enumerate(results, 1):
522
- similarity_pct = result.score * 100
523
-
524
- # Color code by relevance
525
- if similarity_pct > 50:
526
- color = "🟒"
527
- elif similarity_pct > 30:
528
- color = "🟑"
529
- else:
530
- color = "πŸ”΄"
531
-
532
- with st.expander(f"{color} Result {i} - {similarity_pct:.1f}% match", expanded=(i<=2)):
533
- st.info(result.payload['content'])
534
-
535
- col1, col2, col3 = st.columns(3)
536
- with col1:
537
- st.caption(f"**Source:** {result.payload['source_name']}")
538
- with col2:
539
- st.caption(f"**Type:** {result.payload['source_type']}")
540
- with col3:
541
- st.caption(f"**Score:** {result.score:.4f}")
542
- else:
543
- st.warning("No results found!")
544
-
545
- except Exception as e:
546
- st.error(f"❌ Search failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
  # ============================================================================
549
  # FOOTER
550
  # ============================================================================
551
 
552
  st.markdown("---")
553
- st.success("πŸŽ‰ Phase 2 Complete! Ready for Phase 3: PDF Upload + Full RAG with Claude")
 
1
  import streamlit as st
2
  import os
3
+ import sys
4
+
5
+ # ============================================================================
6
+ # LAZY IMPORTS - Only import when needed!
7
+ # ============================================================================
8
+
9
+ @st.cache_resource
10
+ def lazy_import_qdrant():
11
+ """Import Qdrant only when needed"""
12
+ from qdrant_client import QdrantClient
13
+ from qdrant_client.models import Distance, VectorParams, PointStruct
14
+ return QdrantClient, Distance, VectorParams, PointStruct
15
+
16
+ @st.cache_resource
17
+ def lazy_import_embedder():
18
+ """Import sentence transformers only when needed"""
19
+ from sentence_transformers import SentenceTransformer
20
+ return SentenceTransformer
21
+
22
+ @st.cache_resource
23
+ def lazy_import_datasets():
24
+ """Import datasets only when needed"""
25
+ from datasets import load_dataset
26
+ return load_dataset
27
 
28
  # ============================================================================
29
  # CONFIGURATION
30
  # ============================================================================
31
 
32
  st.set_page_config(
33
+ page_title="Math AI - Database Dashboard",
34
  page_icon="πŸ—„οΈ",
35
  layout="wide"
36
  )
37
 
38
+ # ============================================================================
39
+ # DATABASE CONFIGURATION SETTINGS
40
+ # ============================================================================
41
+
42
+ if 'db_config' not in st.session_state:
43
+ st.session_state.db_config = {
44
+ 'collection_name': 'math_knowledge_base',
45
+ 'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
46
+ 'embedding_dimensions': 384,
47
+ 'chunk_size': 500,
48
+ 'chunk_overlap': 50,
49
+ 'similarity_metric': 'COSINE',
50
+ 'max_chunk_tokens': 8192,
51
+ 'tokenizer': 'whitespace'
52
+ }
53
+
54
+ if 'db_created' not in st.session_state:
55
+ st.session_state.db_created = False
56
+
57
+ if 'embedder_loaded' not in st.session_state:
58
+ st.session_state.embedder_loaded = False
59
 
60
  # ============================================================================
61
+ # HEADER
62
  # ============================================================================
63
 
64
+ st.title("πŸ—„οΈ Vector Database Configuration & Analytics")
65
+ st.markdown("**Complete database setup with full visibility and control**")
 
 
 
 
 
 
 
 
66
 
67
+ # ============================================================================
68
+ # SIDEBAR: QUICK STATS
69
+ # ============================================================================
 
 
 
 
 
 
70
 
71
+ with st.sidebar:
72
+ st.header("πŸ“Š Database Stats")
73
+
74
+ # Try to connect and get stats
75
  try:
76
+ QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
77
+ client = QdrantClient(
78
+ url=os.getenv("QDRANT_URL"),
79
+ api_key=os.getenv("QDRANT_API_KEY")
 
 
80
  )
81
 
82
+ collection_name = st.session_state.db_config['collection_name']
 
 
83
 
84
+ # Check if collection exists
85
+ collections = client.get_collections().collections
86
+ exists = any(c.name == collection_name for c in collections)
 
 
 
 
 
 
 
 
87
 
88
+ if exists:
89
+ st.success("βœ… Database Online")
90
+
91
+ # Get vector count
92
+ try:
93
+ scroll_result = client.scroll(
94
  collection_name=collection_name,
95
+ limit=1,
 
96
  with_payload=False,
97
  with_vectors=False
98
  )
99
 
100
+ # Try multiple ways to get count
101
+ count = 0
102
+ offset = None
103
+ max_iterations = 1000
104
+ iteration = 0
105
 
106
+ while iteration < max_iterations:
107
+ result = client.scroll(
108
+ collection_name=collection_name,
109
+ limit=100,
110
+ offset=offset,
111
+ with_payload=False,
112
+ with_vectors=False
113
+ )
114
+
115
+ if result is None or result[0] is None or len(result[0]) == 0:
116
+ break
117
+
118
+ count += len(result[0])
119
+ offset = result[1]
120
+ iteration += 1
121
+
122
+ if offset is None:
123
+ break
124
 
125
+ st.metric("Total Vectors", f"{count:,}")
126
+
127
+ # Calculate approximate storage size
128
+ vector_dim = st.session_state.db_config['embedding_dimensions']
129
+ bytes_per_float = 4
130
+ metadata_overhead = 100 # bytes per vector for metadata
131
+
132
+ vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
133
+ metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
134
+ total_size_mb = vector_size_mb + metadata_size_mb
135
+
136
+ st.metric("Storage Used", f"{total_size_mb:.2f} MB")
137
+ st.caption(f"Vectors: {vector_size_mb:.2f} MB")
138
+ st.caption(f"Metadata: {metadata_size_mb:.2f} MB")
139
+
140
+ # Calculate storage capacity
141
+ free_tier_gb = 1.0
142
+ used_gb = total_size_mb / 1024
143
+ remaining_gb = free_tier_gb - used_gb
144
+ usage_pct = (used_gb / free_tier_gb) * 100
145
+
146
+ st.metric("Free Tier Usage", f"{usage_pct:.1f}%")
147
+ st.progress(min(usage_pct / 100, 1.0))
148
+ st.caption(f"Remaining: {remaining_gb:.3f} GB")
149
+
150
+ except Exception as e:
151
+ st.error(f"Stats error: {e}")
152
+ else:
153
+ st.warning("⚠️ Database Not Created")
154
+
155
  except Exception as e:
156
+ st.error("❌ Connection Failed")
157
+ st.caption(str(e)[:50])
 
 
 
 
 
 
 
 
158
 
159
  # ============================================================================
160
+ # TAB 1: DATABASE CONFIGURATION
161
  # ============================================================================
162
 
163
+ tab1, tab2, tab3, tab4 = st.tabs([
164
+ "βš™οΈ Configuration",
165
+ "πŸ“Š Analytics",
166
+ "πŸ”§ Management",
167
+ "πŸ“š Data Upload"
168
+ ])
 
 
169
 
170
+ with tab1:
171
+ st.header("βš™οΈ Database Configuration")
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ st.info("**Configure your vector database parameters before creation**")
 
174
 
175
+ # ========================================================================
176
+ # SECTION 1: COLLECTION SETTINGS
177
+ # ========================================================================
178
 
179
+ with st.expander("πŸ—„οΈ Collection Settings", expanded=True):
180
+
181
+ col1, col2 = st.columns(2)
182
+
183
+ with col1:
184
+ collection_name = st.text_input(
185
+ "Collection Name",
186
+ value=st.session_state.db_config['collection_name'],
187
+ help="Name of your vector database collection"
188
+ )
189
+ st.session_state.db_config['collection_name'] = collection_name
190
+
191
+ with col2:
192
+ similarity_options = {
193
+ 'COSINE': 'Cosine Similarity (Best for text, -1 to 1)',
194
+ 'EUCLIDEAN': 'Euclidean Distance (L2 norm)',
195
+ 'DOT': 'Dot Product (Fast, unnormalized)'
196
+ }
197
+
198
+ similarity_metric = st.selectbox(
199
+ "Similarity Metric",
200
+ options=list(similarity_options.keys()),
201
+ index=0,
202
+ help="How to measure similarity between vectors",
203
+ format_func=lambda x: similarity_options[x]
204
+ )
205
+ st.session_state.db_config['similarity_metric'] = similarity_metric
206
+
207
+ # Explanation
208
+ st.caption("""
209
+ **Cosine Similarity**: Measures angle between vectors (best for text)
210
+ **Euclidean**: Measures distance in space (sensitive to magnitude)
211
+ **Dot Product**: Fast but requires normalized vectors
212
+ """)
213
 
214
+ # ========================================================================
215
+ # SECTION 2: EMBEDDING MODEL
216
+ # ========================================================================
 
217
 
218
+ with st.expander("πŸ€– Embedding Model Configuration", expanded=True):
219
+
220
+ embedding_models = {
221
+ 'sentence-transformers/all-MiniLM-L6-v2': {
222
+ 'name': 'all-MiniLM-L6-v2 (Recommended)',
223
+ 'dimensions': 384,
224
+ 'size': '90 MB',
225
+ 'speed': 'Fast',
226
+ 'quality': 'Good',
227
+ 'description': 'Best balance of speed and quality for math content'
228
+ },
229
+ 'sentence-transformers/all-mpnet-base-v2': {
230
+ 'name': 'all-mpnet-base-v2 (High Quality)',
231
+ 'dimensions': 768,
232
+ 'size': '420 MB',
233
+ 'speed': 'Medium',
234
+ 'quality': 'Excellent',
235
+ 'description': 'Higher quality embeddings, slower inference'
236
+ },
237
+ 'sentence-transformers/all-MiniLM-L12-v2': {
238
+ 'name': 'all-MiniLM-L12-v2 (Balanced)',
239
+ 'dimensions': 384,
240
+ 'size': '120 MB',
241
+ 'speed': 'Medium',
242
+ 'quality': 'Very Good',
243
+ 'description': 'Larger MiniLM, better quality than L6'
244
+ }
245
+ }
246
+
247
+ selected_model = st.selectbox(
248
+ "Select Embedding Model",
249
+ options=list(embedding_models.keys()),
250
+ format_func=lambda x: embedding_models[x]['name']
251
+ )
252
+
253
+ st.session_state.db_config['embedding_model'] = selected_model
254
+ st.session_state.db_config['embedding_dimensions'] = embedding_models[selected_model]['dimensions']
255
+
256
+ # Model details
257
+ model_info = embedding_models[selected_model]
258
+
259
+ col1, col2, col3, col4 = st.columns(4)
260
+
261
+ with col1:
262
+ st.metric("Dimensions", model_info['dimensions'])
263
+ with col2:
264
+ st.metric("Model Size", model_info['size'])
265
+ with col3:
266
+ st.metric("Speed", model_info['speed'])
267
+ with col4:
268
+ st.metric("Quality", model_info['quality'])
269
+
270
+ st.info(f"**Why this model?** {model_info['description']}")
271
 
272
+ # ========================================================================
273
+ # SECTION 3: CHUNKING STRATEGY
274
+ # ========================================================================
 
 
275
 
276
+ with st.expander("βœ‚οΈ Chunking Strategy", expanded=True):
277
+
278
+ st.markdown("**How to split documents into processable chunks**")
279
+
280
+ col1, col2 = st.columns(2)
281
+
282
+ with col1:
283
+ chunk_size = st.slider(
284
+ "Chunk Size (tokens)",
285
+ min_value=100,
286
+ max_value=2000,
287
+ value=st.session_state.db_config['chunk_size'],
288
+ step=50,
289
+ help="Number of tokens per chunk"
290
+ )
291
+ st.session_state.db_config['chunk_size'] = chunk_size
292
+
293
+ st.caption(f"""
294
+ **Small (100-300)**: Better precision, more chunks
295
+ **Medium (400-600)**: Balanced βœ…
296
+ **Large (800-2000)**: More context, fewer chunks
297
+ """)
298
+
299
+ with col2:
300
+ chunk_overlap = st.slider(
301
+ "Chunk Overlap (tokens)",
302
+ min_value=0,
303
+ max_value=min(500, chunk_size // 2),
304
+ value=st.session_state.db_config['chunk_overlap'],
305
+ step=10,
306
+ help="Overlap between consecutive chunks"
307
+ )
308
+ st.session_state.db_config['chunk_overlap'] = chunk_overlap
309
+
310
+ overlap_pct = (chunk_overlap / chunk_size) * 100 if chunk_size > 0 else 0
311
+ st.metric("Overlap %", f"{overlap_pct:.1f}%")
312
+
313
+ st.caption(f"""
314
+ **No Overlap (0%)**: Distinct chunks, might lose context
315
+ **Small (5-10%)**: Minimal redundancy βœ…
316
+ **Large (20-30%)**: More context preservation
317
+ """)
318
+
319
+ # Visualization
320
+ st.markdown("**Chunking Visualization:**")
321
+
322
+ sample_text = "The Pythagorean theorem states that aΒ² + bΒ² = cΒ² for right triangles."
323
+ words = sample_text.split()
324
+
325
+ if len(words) >= 5:
326
+ chunk1 = ' '.join(words[:5])
327
+ chunk2 = ' '.join(words[3:8]) if len(words) >= 8 else ' '.join(words[3:])
328
+
329
+ st.code(f"""
330
+ Chunk 1: "{chunk1}..."
331
+ {'↓' * (chunk_overlap // 10 if chunk_overlap > 0 else 0)}
332
+ Chunk 2: "...{chunk2}..."
333
 
334
+ Overlap: {chunk_overlap} tokens ({overlap_pct:.0f}%)
335
+ """)
 
 
 
 
 
336
 
337
+ # ========================================================================
338
+ # SECTION 4: TOKENIZATION & PARSING
339
+ # ========================================================================
340
 
341
+ with st.expander("πŸ”€ Tokenization & Parsing", expanded=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  col1, col2 = st.columns(2)
344
+
345
  with col1:
346
+ tokenizer_options = {
347
+ 'whitespace': 'Whitespace (Simple, fast)',
348
+ 'nltk': 'NLTK (Sentence-aware)',
349
+ 'tiktoken': 'TikToken (GPT-style, accurate)'
350
+ }
351
+
352
+ tokenizer = st.selectbox(
353
+ "Tokenizer",
354
+ options=list(tokenizer_options.keys()),
355
+ format_func=lambda x: tokenizer_options[x],
356
+ help="How to split text into tokens"
357
+ )
358
+ st.session_state.db_config['tokenizer'] = tokenizer
359
 
360
  with col2:
361
+ max_chunk_tokens = st.number_input(
362
+ "Max Tokens per Chunk",
363
+ min_value=512,
364
+ max_value=32000,
365
+ value=st.session_state.db_config['max_chunk_tokens'],
366
+ step=512,
367
+ help="Maximum tokens before forcing a split"
368
+ )
369
+ st.session_state.db_config['max_chunk_tokens'] = max_chunk_tokens
370
+
371
+ st.info("""
372
+ **Tokenization** converts text into tokens (words/subwords)
373
+ - **Whitespace**: Simple split by spaces (fastest)
374
+ - **NLTK**: Respects sentence boundaries (better)
375
+ - **TikToken**: Matches GPT tokenization (most accurate)
376
+ """)
377
 
378
+ # ========================================================================
379
+ # SAVE CONFIGURATION
380
+ # ========================================================================
 
 
 
 
 
 
 
 
 
381
 
382
  st.markdown("---")
 
 
 
 
 
 
 
383
 
384
+ if st.button("πŸ’Ύ Save Configuration", type="primary"):
385
+ st.success("βœ… Configuration saved!")
386
+ st.json(st.session_state.db_config)
 
 
387
 
388
+ # Show current config
389
+ with st.expander("πŸ“‹ View Current Configuration"):
390
+ st.json(st.session_state.db_config)
391
 
392
# ============================================================================
# TAB 2: ANALYTICS & VISUALIZATION
# ============================================================================

with tab2:
    st.header("πŸ“Š Database Analytics")

    try:
        # Lazy import keeps app startup fast; only this tab pays the cost.
        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
        client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )

        collection_name = st.session_state.db_config['collection_name']

        # Analytics only make sense once the collection has been created.
        collections = client.get_collections().collections
        exists = any(c.name == collection_name for c in collections)

        if not exists:
            st.warning(f"⚠️ Collection '{collection_name}' doesn't exist yet. Create it in the Management tab.")
        else:
            st.success(f"βœ… Analyzing collection: {collection_name}")

            # ================================================================
            # STORAGE ANALYTICS
            # ================================================================

            st.subheader("πŸ’Ύ Storage Analytics")

            # FIX: the original paged through every point with scroll() in
            # batches of 100 (up to 1000 round trips, silently capped at
            # 100,000 points) just to count them. Qdrant has an exact
            # server-side count API; keep the scroll loop only as a fallback.
            try:
                count = client.count(collection_name=collection_name, exact=True).count
            except Exception:
                count = 0
                offset = None
                for _ in range(1000):
                    points, offset = client.scroll(
                        collection_name=collection_name,
                        limit=100,
                        offset=offset,
                        with_payload=False,
                        with_vectors=False
                    )
                    if not points:
                        break
                    count += len(points)
                    if offset is None:
                        break

            col1, col2, col3, col4 = st.columns(4)

            # Rough size model: 4-byte float32 per dimension plus a flat
            # per-point metadata allowance. These are estimates, not the
            # server's actual disk usage.
            vector_dim = st.session_state.db_config['embedding_dimensions']
            bytes_per_float = 4
            metadata_overhead = 100

            vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
            metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
            total_size_mb = vector_size_mb + metadata_size_mb

            with col1:
                st.metric("Total Vectors", f"{count:,}")

            with col2:
                st.metric("Vector Data", f"{vector_size_mb:.2f} MB")

            with col3:
                st.metric("Metadata", f"{metadata_size_mb:.2f} MB")

            with col4:
                st.metric("Total Size", f"{total_size_mb:.2f} MB")

            st.markdown("**Storage Breakdown:**")

            storage_data = {
                "Component": ["Vector Embeddings", "Metadata", "Index Overhead (est.)"],
                "Size (MB)": [vector_size_mb, metadata_size_mb, total_size_mb * 0.1],
                "Percentage": [
                    (vector_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
                    (metadata_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
                    10.0
                ]
            }

            st.dataframe(storage_data, use_container_width=True)

            # Free-tier usage gauge (Qdrant Cloud free tier ~= 1 GB).
            st.markdown("**Free Tier Capacity:**")

            free_tier_gb = 1.0
            used_gb = total_size_mb / 1024
            remaining_gb = free_tier_gb - used_gb
            usage_pct = (used_gb / free_tier_gb) * 100

            col1, col2 = st.columns([2, 1])

            with col1:
                st.progress(min(usage_pct / 100, 1.0))
                st.caption(f"Used: {used_gb:.3f} GB / {free_tier_gb} GB ({usage_pct:.1f}%)")

            with col2:
                st.metric("Remaining", f"{remaining_gb:.3f} GB")

            st.markdown("**Capacity Estimates:**")

            if count > 0:
                avg_vector_size = total_size_mb / count
                max_vectors_1gb = int((1024 / avg_vector_size) * 0.9)  # 90% of theoretical max

                st.info(f"""
                **With current data:**
                - Average size per vector: {avg_vector_size:.3f} MB
                - Estimated max vectors (1GB): ~{max_vectors_1gb:,}
                - Current capacity used: {(count / max_vectors_1gb * 100):.1f}%
                """)

            # ================================================================
            # DATA SOURCE ANALYTICS
            # ================================================================

            st.subheader("πŸ“š Data Source Breakdown")

            # FIX: guard count > 0 — the original called scroll(limit=0) on an
            # empty collection. Also compute percentages over the SAMPLE size:
            # the original divided a <=1000-point sample by the TOTAL count,
            # so percentages did not sum to 100% on larger collections.
            if count > 0:
                sample_result = client.scroll(
                    collection_name=collection_name,
                    limit=min(count, 1000),
                    with_payload=True,
                    with_vectors=False
                )

                if sample_result and sample_result[0]:
                    sample_points = sample_result[0]
                    sample_size = len(sample_points)

                    source_counts = {}
                    for point in sample_points:
                        source = point.payload.get('source_name', 'Unknown')
                        source_counts[source] = source_counts.get(source, 0) + 1

                    source_data = {
                        "Source": list(source_counts.keys()),
                        "Vectors": list(source_counts.values()),
                        "Percentage": [
                            f"{(v / sample_size * 100):.1f}%" for v in source_counts.values()
                        ]
                    }

                    st.dataframe(source_data, use_container_width=True)

            # ================================================================
            # CONFIGURATION SUMMARY
            # ================================================================

            st.subheader("βš™οΈ Active Configuration")

            config_display = {
                "Parameter": [
                    "Embedding Model",
                    "Vector Dimensions",
                    "Similarity Metric",
                    "Chunk Size",
                    "Chunk Overlap",
                    "Overlap Percentage"
                ],
                "Value": [
                    st.session_state.db_config['embedding_model'].split('/')[-1],
                    st.session_state.db_config['embedding_dimensions'],
                    st.session_state.db_config['similarity_metric'],
                    f"{st.session_state.db_config['chunk_size']} tokens",
                    f"{st.session_state.db_config['chunk_overlap']} tokens",
                    f"{(st.session_state.db_config['chunk_overlap'] / st.session_state.db_config['chunk_size'] * 100):.1f}%"
                ]
            }

            st.dataframe(config_display, use_container_width=True)

    except Exception as e:
        st.error(f"❌ Error connecting to database: {str(e)}")
 
576
# ============================================================================
# TAB 3: MANAGEMENT
# ============================================================================

with tab3:
    st.header("πŸ”§ Database Management")

    st.warning("⚠️ Management operations affect your database. Use carefully!")

    try:
        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
        client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )

        collection_name = st.session_state.db_config['collection_name']

        # Offer create vs delete/info depending on whether the collection exists.
        collections = client.get_collections().collections
        exists = any(c.name == collection_name for c in collections)

        if not exists:
            st.info(f"Collection '{collection_name}' doesn't exist")

            if st.button("πŸ—οΈ CREATE COLLECTION", type="primary"):
                try:
                    # Map the config's string metric to the qdrant Distance enum.
                    distance_map = {
                        'COSINE': Distance.COSINE,
                        'EUCLIDEAN': Distance.EUCLID,
                        'DOT': Distance.DOT
                    }

                    client.create_collection(
                        collection_name=collection_name,
                        vectors_config=VectorParams(
                            size=st.session_state.db_config['embedding_dimensions'],
                            distance=distance_map[st.session_state.db_config['similarity_metric']]
                        )
                    )

                    st.success(f"βœ… Created collection: {collection_name}")
                    st.balloons()
                    st.session_state.db_created = True
                    st.rerun()

                except Exception as e:
                    st.error(f"❌ Creation failed: {str(e)}")

        else:
            st.success(f"βœ… Collection exists: {collection_name}")

            col1, col2 = st.columns(2)

            with col1:
                # FIX: the original rendered the confirmation checkbox INSIDE
                # the button's if-block. A button press triggers a rerun on
                # which the freshly-rendered checkbox is unchecked, so the
                # delete branch could never execute. Render the checkbox
                # first and gate the button on it instead.
                confirm_delete = st.checkbox("⚠️ Confirm deletion")
                if st.button("πŸ—‘οΈ Delete Collection", type="secondary",
                             disabled=not confirm_delete):
                    try:
                        client.delete_collection(collection_name)
                        st.success("βœ… Collection deleted")
                        st.session_state.db_created = False
                        st.rerun()
                    except Exception as e:
                        st.error(f"Error: {e}")

            with col2:
                if st.button("ℹ️ Collection Info"):
                    try:
                        info = client.get_collection(collection_name)
                        # FIX: the fetched info was previously discarded and a
                        # hard-coded {"status": "active"} shown instead; surface
                        # the live status and point count from the server.
                        st.json({
                            "name": collection_name,
                            "status": str(getattr(info, "status", "unknown")),
                            "points_count": getattr(info, "points_count", None)
                        })
                    except Exception as e:
                        st.error(f"Error: {e}")

    except Exception as e:
        st.error(f"❌ Connection failed: {str(e)}")
+
656
# ============================================================================
# TAB 4: DATA UPLOAD (Quick Access)
# ============================================================================

with tab4:
    st.header("πŸ“š Quick Data Upload")
    st.info("For full upload features, use the main upload interface")

    st.markdown("[Go to Full Upload Interface β†’](#)")

    # Minimal paste-in path; the real ingestion pipeline lives elsewhere.
    with st.expander("Quick Text Upload"):
        pasted = st.text_area("Paste text:", height=150)
        # Button is rendered unconditionally; the upload hint only fires
        # when it is clicked AND some text was pasted.
        if st.button("Upload") and pasted:
            st.info("Use the main interface for full upload functionality")

# ============================================================================
# FOOTER
# ============================================================================

st.markdown("---")
st.caption("πŸ’‘ Tip: Save your configuration before creating the collection!")