Hebaelsayed committed on
Commit
3f8fbec
·
verified · 1 Parent(s): 6d08aa2
1 Parent(s): 6d08aa2

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +529 -591
src/streamlit_app.py CHANGED
@@ -1,677 +1,615 @@
1
  import streamlit as st
2
  import os
3
- import sys
4
-
5
- # ============================================================================
6
- # LAZY IMPORTS - Only import when needed!
7
- # ============================================================================
8
-
9
- @st.cache_resource
10
- def lazy_import_qdrant():
11
- """Import Qdrant only when needed"""
12
- from qdrant_client import QdrantClient
13
- from qdrant_client.models import Distance, VectorParams, PointStruct
14
- return QdrantClient, Distance, VectorParams, PointStruct
15
-
16
- @st.cache_resource
17
- def lazy_import_embedder():
18
- """Import sentence transformers only when needed"""
19
- from sentence_transformers import SentenceTransformer
20
- return SentenceTransformer
21
-
22
- @st.cache_resource
23
- def lazy_import_datasets():
24
- """Import datasets only when needed"""
25
- from datasets import load_dataset
26
- return load_dataset
27
 
28
  # ============================================================================
29
  # CONFIGURATION
30
  # ============================================================================
31
 
32
  st.set_page_config(
33
- page_title="Math AI - Database Dashboard",
34
  page_icon="πŸ—„οΈ",
35
  layout="wide"
36
  )
37
 
 
 
38
  # ============================================================================
39
- # DATABASE CONFIGURATION SETTINGS
40
  # ============================================================================
41
 
42
- if 'db_config' not in st.session_state:
43
- st.session_state.db_config = {
44
- 'collection_name': 'math_knowledge_base',
45
- 'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
46
- 'embedding_dimensions': 384,
47
- 'chunk_size': 500,
48
- 'chunk_overlap': 50,
49
- 'similarity_metric': 'COSINE',
50
- 'max_chunk_tokens': 8192,
51
- 'tokenizer': 'whitespace'
52
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  if 'db_created' not in st.session_state:
55
  st.session_state.db_created = False
56
 
57
- if 'embedder_loaded' not in st.session_state:
58
- st.session_state.embedder_loaded = False
 
 
 
59
 
60
  # ============================================================================
61
- # HEADER
62
  # ============================================================================
63
 
64
- st.title("πŸ—„οΈ Vector Database Configuration & Analytics")
65
- st.markdown("**Complete database setup with full visibility and control**")
 
 
 
66
 
67
  # ============================================================================
68
- # SIDEBAR: QUICK STATS
69
  # ============================================================================
70
 
71
  with st.sidebar:
72
- st.header("πŸ“Š Database Stats")
73
 
74
- # Try to connect and get stats
75
- try:
76
- QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
77
- client = QdrantClient(
78
- url=os.getenv("QDRANT_URL"),
79
- api_key=os.getenv("QDRANT_API_KEY")
80
- )
81
-
82
- collection_name = st.session_state.db_config['collection_name']
83
-
84
- # Check if collection exists
85
- collections = client.get_collections().collections
86
- exists = any(c.name == collection_name for c in collections)
87
-
88
- if exists:
89
- st.success("βœ… Database Online")
90
-
91
- # Get vector count
92
- try:
93
- scroll_result = client.scroll(
94
- collection_name=collection_name,
95
- limit=1,
96
- with_payload=False,
97
- with_vectors=False
98
- )
99
-
100
- # Try multiple ways to get count
101
- count = 0
102
- offset = None
103
- max_iterations = 1000
104
- iteration = 0
105
-
106
- while iteration < max_iterations:
107
- result = client.scroll(
108
- collection_name=collection_name,
109
- limit=100,
110
- offset=offset,
111
- with_payload=False,
112
- with_vectors=False
113
- )
114
-
115
- if result is None or result[0] is None or len(result[0]) == 0:
116
- break
117
-
118
- count += len(result[0])
119
- offset = result[1]
120
- iteration += 1
121
-
122
- if offset is None:
123
- break
124
-
125
- st.metric("Total Vectors", f"{count:,}")
126
-
127
- # Calculate approximate storage size
128
- vector_dim = st.session_state.db_config['embedding_dimensions']
129
- bytes_per_float = 4
130
- metadata_overhead = 100 # bytes per vector for metadata
131
-
132
- vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
133
- metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
134
- total_size_mb = vector_size_mb + metadata_size_mb
135
-
136
- st.metric("Storage Used", f"{total_size_mb:.2f} MB")
137
- st.caption(f"Vectors: {vector_size_mb:.2f} MB")
138
- st.caption(f"Metadata: {metadata_size_mb:.2f} MB")
139
-
140
- # Calculate storage capacity
141
- free_tier_gb = 1.0
142
- used_gb = total_size_mb / 1024
143
- remaining_gb = free_tier_gb - used_gb
144
- usage_pct = (used_gb / free_tier_gb) * 100
145
-
146
- st.metric("Free Tier Usage", f"{usage_pct:.1f}%")
147
- st.progress(min(usage_pct / 100, 1.0))
148
- st.caption(f"Remaining: {remaining_gb:.3f} GB")
149
-
150
- except Exception as e:
151
- st.error(f"Stats error: {e}")
152
- else:
153
- st.warning("⚠️ Database Not Created")
154
 
155
- except Exception as e:
156
- st.error("❌ Connection Failed")
157
- st.caption(str(e)[:50])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  # ============================================================================
160
- # TAB 1: DATABASE CONFIGURATION
161
  # ============================================================================
162
 
163
- tab1, tab2, tab3, tab4 = st.tabs([
164
- "βš™οΈ Configuration",
165
- "πŸ“Š Analytics",
166
- "πŸ”§ Management",
167
- "πŸ“š Data Upload"
168
- ])
169
 
170
- with tab1:
171
- st.header("βš™οΈ Database Configuration")
172
-
173
- st.info("**Configure your vector database parameters before creation**")
 
 
174
 
175
- # ========================================================================
176
- # SECTION 1: COLLECTION SETTINGS
177
- # ========================================================================
178
 
179
- with st.expander("πŸ—„οΈ Collection Settings", expanded=True):
180
-
181
- col1, col2 = st.columns(2)
182
-
183
- with col1:
184
- collection_name = st.text_input(
185
- "Collection Name",
186
- value=st.session_state.db_config['collection_name'],
187
- help="Name of your vector database collection"
188
- )
189
- st.session_state.db_config['collection_name'] = collection_name
190
-
191
- with col2:
192
- similarity_options = {
193
- 'COSINE': 'Cosine Similarity (Best for text, -1 to 1)',
194
- 'EUCLIDEAN': 'Euclidean Distance (L2 norm)',
195
- 'DOT': 'Dot Product (Fast, unnormalized)'
196
- }
197
-
198
- similarity_metric = st.selectbox(
199
- "Similarity Metric",
200
- options=list(similarity_options.keys()),
201
- index=0,
202
- help="How to measure similarity between vectors",
203
- format_func=lambda x: similarity_options[x]
204
- )
205
- st.session_state.db_config['similarity_metric'] = similarity_metric
206
-
207
- # Explanation
208
- st.caption("""
209
- **Cosine Similarity**: Measures angle between vectors (best for text)
210
- **Euclidean**: Measures distance in space (sensitive to magnitude)
211
- **Dot Product**: Fast but requires normalized vectors
212
- """)
213
 
214
- # ========================================================================
215
- # SECTION 2: EMBEDDING MODEL
216
- # ========================================================================
217
 
218
- with st.expander("πŸ€– Embedding Model Configuration", expanded=True):
219
-
220
- embedding_models = {
221
- 'sentence-transformers/all-MiniLM-L6-v2': {
222
- 'name': 'all-MiniLM-L6-v2 (Recommended)',
223
- 'dimensions': 384,
224
- 'size': '90 MB',
225
- 'speed': 'Fast',
226
- 'quality': 'Good',
227
- 'description': 'Best balance of speed and quality for math content'
228
- },
229
- 'sentence-transformers/all-mpnet-base-v2': {
230
- 'name': 'all-mpnet-base-v2 (High Quality)',
231
- 'dimensions': 768,
232
- 'size': '420 MB',
233
- 'speed': 'Medium',
234
- 'quality': 'Excellent',
235
- 'description': 'Higher quality embeddings, slower inference'
236
- },
237
- 'sentence-transformers/all-MiniLM-L12-v2': {
238
- 'name': 'all-MiniLM-L12-v2 (Balanced)',
239
- 'dimensions': 384,
240
- 'size': '120 MB',
241
- 'speed': 'Medium',
242
- 'quality': 'Very Good',
243
- 'description': 'Larger MiniLM, better quality than L6'
244
- }
245
- }
246
-
247
- selected_model = st.selectbox(
248
- "Select Embedding Model",
249
- options=list(embedding_models.keys()),
250
- format_func=lambda x: embedding_models[x]['name']
251
- )
252
-
253
- st.session_state.db_config['embedding_model'] = selected_model
254
- st.session_state.db_config['embedding_dimensions'] = embedding_models[selected_model]['dimensions']
255
-
256
- # Model details
257
- model_info = embedding_models[selected_model]
258
-
259
- col1, col2, col3, col4 = st.columns(4)
260
-
261
- with col1:
262
- st.metric("Dimensions", model_info['dimensions'])
263
- with col2:
264
- st.metric("Model Size", model_info['size'])
265
- with col3:
266
- st.metric("Speed", model_info['speed'])
267
- with col4:
268
- st.metric("Quality", model_info['quality'])
269
-
270
- st.info(f"**Why this model?** {model_info['description']}")
271
 
272
- # ========================================================================
273
- # SECTION 3: CHUNKING STRATEGY
274
- # ========================================================================
275
 
276
- with st.expander("βœ‚οΈ Chunking Strategy", expanded=True):
277
-
278
- st.markdown("**How to split documents into processable chunks**")
279
-
280
- col1, col2 = st.columns(2)
281
-
282
- with col1:
283
- chunk_size = st.slider(
284
- "Chunk Size (tokens)",
285
- min_value=100,
286
- max_value=2000,
287
- value=st.session_state.db_config['chunk_size'],
288
- step=50,
289
- help="Number of tokens per chunk"
290
- )
291
- st.session_state.db_config['chunk_size'] = chunk_size
292
-
293
- st.caption(f"""
294
- **Small (100-300)**: Better precision, more chunks
295
- **Medium (400-600)**: Balanced βœ…
296
- **Large (800-2000)**: More context, fewer chunks
297
- """)
298
-
299
- with col2:
300
- chunk_overlap = st.slider(
301
- "Chunk Overlap (tokens)",
302
- min_value=0,
303
- max_value=min(500, chunk_size // 2),
304
- value=st.session_state.db_config['chunk_overlap'],
305
- step=10,
306
- help="Overlap between consecutive chunks"
307
- )
308
- st.session_state.db_config['chunk_overlap'] = chunk_overlap
309
-
310
- overlap_pct = (chunk_overlap / chunk_size) * 100 if chunk_size > 0 else 0
311
- st.metric("Overlap %", f"{overlap_pct:.1f}%")
312
-
313
- st.caption(f"""
314
- **No Overlap (0%)**: Distinct chunks, might lose context
315
- **Small (5-10%)**: Minimal redundancy βœ…
316
- **Large (20-30%)**: More context preservation
317
- """)
318
-
319
- # Visualization
320
- st.markdown("**Chunking Visualization:**")
321
-
322
- sample_text = "The Pythagorean theorem states that a² + b² = c² for right triangles."
323
- words = sample_text.split()
324
-
325
- if len(words) >= 5:
326
- chunk1 = ' '.join(words[:5])
327
- chunk2 = ' '.join(words[3:8]) if len(words) >= 8 else ' '.join(words[3:])
328
-
329
- st.code(f"""
330
- Chunk 1: "{chunk1}..."
331
- {'↓' * (chunk_overlap // 10 if chunk_overlap > 0 else 0)}
332
- Chunk 2: "...{chunk2}..."
333
 
334
- Overlap: {chunk_overlap} tokens ({overlap_pct:.0f}%)
335
- """)
336
-
337
- # ========================================================================
338
- # SECTION 4: TOKENIZATION & PARSING
339
- # ========================================================================
340
 
341
- with st.expander("πŸ”€ Tokenization & Parsing", expanded=False):
 
342
 
343
  col1, col2 = st.columns(2)
344
-
345
  with col1:
346
- tokenizer_options = {
347
- 'whitespace': 'Whitespace (Simple, fast)',
348
- 'nltk': 'NLTK (Sentence-aware)',
349
- 'tiktoken': 'TikToken (GPT-style, accurate)'
350
- }
351
-
352
- tokenizer = st.selectbox(
353
- "Tokenizer",
354
- options=list(tokenizer_options.keys()),
355
- format_func=lambda x: tokenizer_options[x],
356
- help="How to split text into tokens"
357
- )
358
- st.session_state.db_config['tokenizer'] = tokenizer
359
 
360
  with col2:
361
- max_chunk_tokens = st.number_input(
362
- "Max Tokens per Chunk",
363
- min_value=512,
364
- max_value=32000,
365
- value=st.session_state.db_config['max_chunk_tokens'],
366
- step=512,
367
- help="Maximum tokens before forcing a split"
368
- )
369
- st.session_state.db_config['max_chunk_tokens'] = max_chunk_tokens
370
-
371
- st.info("""
372
- **Tokenization** converts text into tokens (words/subwords)
373
- - **Whitespace**: Simple split by spaces (fastest)
374
- - **NLTK**: Respects sentence boundaries (better)
375
- - **TikToken**: Matches GPT tokenization (most accurate)
376
- """)
377
 
378
- # ========================================================================
379
- # SAVE CONFIGURATION
380
- # ========================================================================
 
 
 
 
 
 
 
 
 
381
 
382
  st.markdown("---")
 
 
 
 
 
 
 
383
 
384
- if st.button("πŸ’Ύ Save Configuration", type="primary"):
385
- st.success("βœ… Configuration saved!")
386
- st.json(st.session_state.db_config)
 
 
387
 
388
- # Show current config
389
- with st.expander("πŸ“‹ View Current Configuration"):
390
- st.json(st.session_state.db_config)
391
 
392
  # ============================================================================
393
- # TAB 2: ANALYTICS & VISUALIZATION
394
  # ============================================================================
395
 
396
- with tab2:
397
- st.header("πŸ“Š Database Analytics")
398
 
399
- try:
400
- QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
401
- client = QdrantClient(
402
- url=os.getenv("QDRANT_URL"),
403
- api_key=os.getenv("QDRANT_API_KEY")
404
- )
405
-
406
- collection_name = st.session_state.db_config['collection_name']
407
-
408
- # Check if collection exists
409
- collections = client.get_collections().collections
410
- exists = any(c.name == collection_name for c in collections)
411
-
412
- if not exists:
413
- st.warning(f"⚠️ Collection '{collection_name}' doesn't exist yet. Create it in the Management tab.")
414
- else:
415
- st.success(f"βœ… Analyzing collection: {collection_name}")
416
-
417
- # ================================================================
418
- # STORAGE ANALYTICS
419
- # ================================================================
420
-
421
- st.subheader("πŸ’Ύ Storage Analytics")
422
-
423
- # Get vector count
424
- count = 0
425
- offset = None
426
- max_iter = 1000
427
-
428
- for _ in range(max_iter):
429
- result = client.scroll(
430
- collection_name=collection_name,
431
- limit=100,
432
- offset=offset,
433
- with_payload=False,
434
- with_vectors=False
435
- )
436
-
437
- if result is None or result[0] is None or len(result[0]) == 0:
438
- break
439
-
440
- count += len(result[0])
441
- offset = result[1]
442
-
443
- if offset is None:
444
- break
445
-
446
- col1, col2, col3, col4 = st.columns(4)
447
-
448
- vector_dim = st.session_state.db_config['embedding_dimensions']
449
- bytes_per_float = 4
450
- metadata_overhead = 100
451
-
452
- vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
453
- metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
454
- total_size_mb = vector_size_mb + metadata_size_mb
455
-
456
- with col1:
457
- st.metric("Total Vectors", f"{count:,}")
458
-
459
- with col2:
460
- st.metric("Vector Data", f"{vector_size_mb:.2f} MB")
461
-
462
- with col3:
463
- st.metric("Metadata", f"{metadata_size_mb:.2f} MB")
464
-
465
- with col4:
466
- st.metric("Total Size", f"{total_size_mb:.2f} MB")
467
-
468
- # Storage breakdown
469
- st.markdown("**Storage Breakdown:**")
470
-
471
- storage_data = {
472
- "Component": ["Vector Embeddings", "Metadata", "Index Overhead (est.)"],
473
- "Size (MB)": [vector_size_mb, metadata_size_mb, total_size_mb * 0.1],
474
- "Percentage": [
475
- (vector_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
476
- (metadata_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
477
- 10.0
478
- ]
479
- }
480
-
481
- st.dataframe(storage_data, use_container_width=True)
482
-
483
- # Free tier usage
484
- st.markdown("**Free Tier Capacity:**")
485
-
486
- free_tier_gb = 1.0
487
- used_gb = total_size_mb / 1024
488
- remaining_gb = free_tier_gb - used_gb
489
- usage_pct = (used_gb / free_tier_gb) * 100
490
-
491
- col1, col2 = st.columns([2, 1])
492
-
493
- with col1:
494
- st.progress(min(usage_pct / 100, 1.0))
495
- st.caption(f"Used: {used_gb:.3f} GB / {free_tier_gb} GB ({usage_pct:.1f}%)")
496
-
497
- with col2:
498
- st.metric("Remaining", f"{remaining_gb:.3f} GB")
499
-
500
- # Capacity estimates
501
- st.markdown("**Capacity Estimates:**")
502
-
503
- if count > 0:
504
- avg_vector_size = total_size_mb / count
505
- max_vectors_1gb = int((1024 / avg_vector_size) * 0.9) # 90% of theoretical max
506
-
507
- st.info(f"""
508
- **With current data:**
509
- - Average size per vector: {avg_vector_size:.3f} MB
510
- - Estimated max vectors (1GB): ~{max_vectors_1gb:,}
511
- - Current capacity used: {(count / max_vectors_1gb * 100):.1f}%
512
- """)
513
-
514
- # ================================================================
515
- # DATA SOURCE ANALYTICS
516
- # ================================================================
517
-
518
- st.subheader("πŸ“š Data Source Breakdown")
519
-
520
- # Sample vectors to analyze sources
521
- sample_result = client.scroll(
522
- collection_name=collection_name,
523
- limit=min(count, 1000),
524
- with_payload=True,
525
- with_vectors=False
526
  )
527
 
528
- if sample_result and sample_result[0]:
529
- source_counts = {}
530
-
531
- for point in sample_result[0]:
532
- source = point.payload.get('source_name', 'Unknown')
533
- source_counts[source] = source_counts.get(source, 0) + 1
534
-
535
- # Display as table
536
- source_data = {
537
- "Source": list(source_counts.keys()),
538
- "Vectors": list(source_counts.values()),
539
- "Percentage": [
540
- f"{(v/count*100):.1f}%" for v in source_counts.values()
541
- ]
542
- }
543
-
544
- st.dataframe(source_data, use_container_width=True)
545
-
546
- # ================================================================
547
- # CONFIGURATION SUMMARY
548
- # ================================================================
549
 
550
- st.subheader("βš™οΈ Active Configuration")
551
-
552
- config_display = {
553
- "Parameter": [
554
- "Embedding Model",
555
- "Vector Dimensions",
556
- "Similarity Metric",
557
- "Chunk Size",
558
- "Chunk Overlap",
559
- "Overlap Percentage"
560
- ],
561
- "Value": [
562
- st.session_state.db_config['embedding_model'].split('/')[-1],
563
- st.session_state.db_config['embedding_dimensions'],
564
- st.session_state.db_config['similarity_metric'],
565
- f"{st.session_state.db_config['chunk_size']} tokens",
566
- f"{st.session_state.db_config['chunk_overlap']} tokens",
567
- f"{(st.session_state.db_config['chunk_overlap'] / st.session_state.db_config['chunk_size'] * 100):.1f}%"
568
- ]
569
- }
570
-
571
- st.dataframe(config_display, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
 
573
- except Exception as e:
574
- st.error(f"❌ Error connecting to database: {str(e)}")
575
 
576
  # ============================================================================
577
- # TAB 3: MANAGEMENT
578
  # ============================================================================
579
 
580
- with tab3:
581
- st.header("πŸ”§ Database Management")
582
-
583
- st.warning("⚠️ Management operations affect your database. Use carefully!")
584
 
585
- try:
586
- QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
587
- client = QdrantClient(
588
- url=os.getenv("QDRANT_URL"),
589
- api_key=os.getenv("QDRANT_API_KEY")
590
- )
591
-
592
- collection_name = st.session_state.db_config['collection_name']
593
-
594
- # Check if exists
595
- collections = client.get_collections().collections
596
- exists = any(c.name == collection_name for c in collections)
597
-
598
- if not exists:
599
- st.info(f"Collection '{collection_name}' doesn't exist")
 
 
600
 
601
- if st.button("πŸ—οΈ CREATE COLLECTION", type="primary"):
 
602
  try:
603
- # Map string to Distance enum
604
- distance_map = {
605
- 'COSINE': Distance.COSINE,
606
- 'EUCLIDEAN': Distance.EUCLID,
607
- 'DOT': Distance.DOT
608
- }
609
 
610
- client.create_collection(
611
- collection_name=collection_name,
612
- vectors_config=VectorParams(
613
- size=st.session_state.db_config['embedding_dimensions'],
614
- distance=distance_map[st.session_state.db_config['similarity_metric']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  )
616
- )
 
 
 
 
 
 
617
 
618
- st.success(f"βœ… Created collection: {collection_name}")
619
- st.balloons()
620
- st.session_state.db_created = True
621
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
622
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
623
  except Exception as e:
624
- st.error(f"❌ Creation failed: {str(e)}")
625
-
626
- else:
627
- st.success(f"βœ… Collection exists: {collection_name}")
628
-
629
- col1, col2 = st.columns(2)
630
-
631
- with col1:
632
- if st.button("πŸ—‘οΈ Delete Collection", type="secondary"):
633
- if st.checkbox("⚠️ Confirm deletion"):
634
- try:
635
- client.delete_collection(collection_name)
636
- st.success("βœ… Collection deleted")
637
- st.session_state.db_created = False
638
- st.rerun()
639
- except Exception as e:
640
- st.error(f"Error: {e}")
641
-
642
- with col2:
643
- if st.button("ℹ️ Collection Info"):
644
- try:
645
- info = client.get_collection(collection_name)
646
- st.json({
647
- "name": collection_name,
648
- "status": "active"
649
- })
650
- except Exception as e:
651
- st.error(f"Error: {e}")
652
 
653
- except Exception as e:
654
- st.error(f"❌ Connection failed: {str(e)}")
655
 
656
  # ============================================================================
657
- # TAB 4: DATA UPLOAD (Quick Access)
658
  # ============================================================================
659
 
660
- with tab4:
661
- st.header("πŸ“š Quick Data Upload")
662
- st.info("For full upload features, use the main upload interface")
663
 
664
- st.markdown("[Go to Full Upload Interface β†’](#)")
665
-
666
- # Simple text upload
667
- with st.expander("Quick Text Upload"):
668
- text = st.text_area("Paste text:", height=150)
669
- if st.button("Upload") and text:
670
- st.info("Use the main interface for full upload functionality")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
671
 
672
  # ============================================================================
673
  # FOOTER
674
  # ============================================================================
675
 
676
  st.markdown("---")
677
- st.caption("πŸ’‘ Tip: Save your configuration before creating the collection!")
 
1
  import streamlit as st
2
  import os
3
+ import time
4
+ from qdrant_client import QdrantClient
5
+ from qdrant_client.models import Distance, VectorParams, PointStruct
6
+ from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # ============================================================================
9
  # CONFIGURATION
10
  # ============================================================================
11
 
12
  st.set_page_config(
13
+ page_title="Math AI - Phase 2: Database",
14
  page_icon="πŸ—„οΈ",
15
  layout="wide"
16
  )
17
 
18
+ COLLECTION_NAME = "math_knowledge_base"
19
+
20
  # ============================================================================
21
+ # CACHED FUNCTIONS
22
  # ============================================================================
23
 
24
+ @st.cache_resource(show_spinner="πŸ”Œ Connecting to Qdrant...")
25
+ def get_qdrant_client():
26
+ """Cache Qdrant client"""
27
+ qdrant_url = os.getenv("QDRANT_URL")
28
+ qdrant_api_key = os.getenv("QDRANT_API_KEY")
29
+
30
+ if not qdrant_url or not qdrant_api_key:
31
+ return None
32
+
33
+ return QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
34
+
35
+ @st.cache_resource(show_spinner="πŸ€– Loading embedding model (30-60s first time)...")
36
+ def get_embedding_model():
37
+ """Cache embedding model"""
38
+ try:
39
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
40
+ return model
41
+ except Exception as e:
42
+ st.error(f"Failed to load model: {e}")
43
+ return None
44
+
45
+ def get_vector_count_reliable(client, collection_name):
46
+ """Get vector count with fallbacks"""
47
+ try:
48
+ count = 0
49
+ offset = None
50
+ max_iterations = 1000
51
+
52
+ for _ in range(max_iterations):
53
+ result = client.scroll(
54
+ collection_name=collection_name,
55
+ limit=100,
56
+ offset=offset,
57
+ with_payload=False,
58
+ with_vectors=False
59
+ )
60
+
61
+ if result is None or result[0] is None or len(result[0]) == 0:
62
+ break
63
+
64
+ count += len(result[0])
65
+ offset = result[1]
66
+
67
+ if offset is None:
68
+ break
69
+
70
+ return count
71
+ except:
72
+ return 0
73
+
74
+ def check_collection_exists(client, collection_name):
75
+ """Check if collection exists"""
76
+ try:
77
+ collections = client.get_collections().collections
78
+ return any(c.name == collection_name for c in collections)
79
+ except:
80
+ return False
81
+
82
+ # ============================================================================
83
+ # SESSION STATE
84
+ # ============================================================================
85
 
86
  if 'db_created' not in st.session_state:
87
  st.session_state.db_created = False
88
 
89
+ if 'embedder_ready' not in st.session_state:
90
+ st.session_state.embedder_ready = False
91
+
92
+ if 'show_step' not in st.session_state:
93
+ st.session_state.show_step = 'all'
94
 
95
  # ============================================================================
96
+ # MAIN APP
97
  # ============================================================================
98
 
99
+ st.title("πŸ—„οΈ Phase 2: Vector Database Setup")
100
+
101
+ # Get cached resources
102
+ client = get_qdrant_client()
103
+ embedder = get_embedding_model()
104
 
105
  # ============================================================================
106
+ # SIDEBAR
107
  # ============================================================================
108
 
109
  with st.sidebar:
110
+ st.header("⚑ Quick Navigation")
111
 
112
+ if st.button("πŸ“‹ Show All Steps", use_container_width=True):
113
+ st.session_state.show_step = 'all'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
+ if st.button("πŸš€ Skip to Upload", use_container_width=True):
116
+ st.session_state.show_step = 'upload'
117
+
118
+ if st.button("πŸ” Skip to Search", use_container_width=True):
119
+ st.session_state.show_step = 'search'
120
+
121
+ st.markdown("---")
122
+ st.subheader("πŸ“Š System Status")
123
+
124
+ if client and check_collection_exists(client, COLLECTION_NAME):
125
+ st.success("βœ… Database Ready")
126
+ st.session_state.db_created = True
127
+ else:
128
+ st.warning("⚠️ Database Not Ready")
129
+
130
+ if embedder:
131
+ st.success("βœ… Model Loaded")
132
+ st.session_state.embedder_ready = True
133
+ else:
134
+ st.warning("⚠️ Model Not Loaded")
135
+
136
+ if client and st.session_state.db_created:
137
+ count = get_vector_count_reliable(client, COLLECTION_NAME)
138
+ st.metric("Vectors in DB", f"{count:,}")
139
 
140
  # ============================================================================
141
+ # CONDITIONAL DISPLAY
142
  # ============================================================================
143
 
144
+ show_all = st.session_state.show_step == 'all'
145
+ show_upload = st.session_state.show_step in ['all', 'upload']
146
+ show_search = st.session_state.show_step in ['all', 'search']
 
 
 
147
 
148
+ # ============================================================================
149
+ # STEP 1-2: Quick Status
150
+ # ============================================================================
151
+
152
+ if show_all:
153
+ st.header("Step 1-2: System Check")
154
 
155
+ col1, col2, col3 = st.columns(3)
 
 
156
 
157
+ with col1:
158
+ st.metric("Claude API", "βœ…" if os.getenv("ANTHROPIC_API_KEY") else "❌")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
+ with col2:
161
+ st.metric("Qdrant", "βœ… Connected" if client else "❌")
 
162
 
163
+ with col3:
164
+ st.metric("Embedder", "βœ… Cached" if embedder else "❌")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ if not client:
167
+ st.error("⚠️ Check Qdrant secrets!")
168
+ st.stop()
169
 
170
+ st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
+ # ============================================================================
173
+ # STEP 3: Collection Management
174
+ # ============================================================================
175
+
176
+ if show_all:
177
+ st.header("πŸ—οΈ Step 3: Database Collection")
178
 
179
+ if st.session_state.db_created:
180
+ st.success(f"βœ… Collection '{COLLECTION_NAME}' ready!")
181
 
182
  col1, col2 = st.columns(2)
 
183
  with col1:
184
+ if st.button("πŸ”„ Recreate Collection"):
185
+ try:
186
+ client.delete_collection(COLLECTION_NAME)
187
+ st.session_state.db_created = False
188
+ st.rerun()
189
+ except Exception as e:
190
+ st.error(f"Error: {e}")
 
 
 
 
 
 
191
 
192
  with col2:
193
+ if st.button("ℹ️ Collection Info"):
194
+ count = get_vector_count_reliable(client, COLLECTION_NAME)
195
+ st.json({"name": COLLECTION_NAME, "vectors": count, "status": "Ready"})
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
+ else:
198
+ if st.button("πŸ—οΈ CREATE COLLECTION", type="primary"):
199
+ try:
200
+ client.create_collection(
201
+ collection_name=COLLECTION_NAME,
202
+ vectors_config=VectorParams(size=384, distance=Distance.COSINE)
203
+ )
204
+ st.success(f"πŸŽ‰ Created: {COLLECTION_NAME}")
205
+ st.session_state.db_created = True
206
+ st.rerun()
207
+ except Exception as e:
208
+ st.error(f"❌ Failed: {str(e)}")
209
 
210
  st.markdown("---")
211
+
212
+ # ============================================================================
213
+ # STEP 4: Embedding Model
214
+ # ============================================================================
215
+
216
+ if show_all:
217
+ st.header("πŸ€– Step 4: Embedding Model")
218
 
219
+ if embedder:
220
+ st.success("βœ… Model loaded and cached!")
221
+ st.session_state.embedder_ready = True
222
+ else:
223
+ st.warning("⚠️ Model loading failed. Refresh page.")
224
 
225
+ st.markdown("---")
 
 
226
 
227
# ============================================================================
# STEP 5A: Upload Custom Text
# ============================================================================

import hashlib  # stdlib; used for deterministic, re-upload-safe point IDs

# Chunking parameters: CHUNK_WORDS-word windows advanced CHUNK_STEP words at a
# time, i.e. a 10-word overlap so sentences spanning a boundary are not lost.
CHUNK_WORDS = 50
CHUNK_STEP = 40


def _stable_point_id(key: str) -> int:
    """Return a deterministic Qdrant point ID for *key*.

    Built-in ``hash()`` is salted per process (PYTHONHASHSEED), so the same
    chunk would get a different ID on every rerun and re-uploading the same
    notes would duplicate vectors. A SHA-1 digest is stable across runs,
    which makes ``upsert`` idempotent. The 15-hex-digit slice (60 bits)
    stays within Qdrant's unsigned 64-bit integer ID range.
    """
    return int(hashlib.sha1(key.encode("utf-8")).hexdigest()[:15], 16)


if show_upload:
    st.header("πŸ“ Step 5A: Upload Custom Math Notes")

    # Uploading requires both the collection (Step 3) and the embedder (Step 4).
    if not st.session_state.db_created or not st.session_state.embedder_ready:
        st.error("⚠️ Complete Steps 3 & 4 first")
    else:
        with st.expander("✍️ Paste text", expanded=True):

            custom_text = st.text_area(
                "Math notes:",
                value="""Linear Equations: ax + b = 0, solution is x = -b/a

Quadratic Equations: axΒ² + bx + c = 0
Solution: x = (-b ± √(b²-4ac)) / 2a
Example: xΒ² + 5x - 4 = 0

Pythagorean Theorem: aΒ² + bΒ² = cΒ²

Derivatives:
d/dx(xⁿ) = nxⁿ⁻¹
d/dx(sin x) = cos x
d/dx(eΛ£) = eΛ£""",
                height=200
            )

            source_name = st.text_input("Source name:", value="math_notes.txt")

            if st.button("πŸš€ UPLOAD TEXT", type="primary"):

                if not custom_text.strip():
                    st.error("Please enter text!")
                else:
                    try:
                        progress = st.progress(0)
                        status = st.empty()

                        # --- 1. Split the text into overlapping word chunks ---
                        status.text("πŸ“„ Chunking text...")
                        progress.progress(0.2)

                        words = custom_text.split()
                        chunks = []
                        for i in range(0, len(words), CHUNK_STEP):
                            chunk = ' '.join(words[i:i + CHUNK_WORDS])
                            if chunk.strip():
                                chunks.append(chunk)

                        st.write(f"βœ… Created {len(chunks)} chunks")

                        # --- 2. Embed every chunk in one batched call ---
                        status.text("πŸ”’ Generating embeddings...")
                        progress.progress(0.5)

                        embeddings = embedder.encode(chunks, show_progress_bar=False)
                        st.write(f"βœ… Generated {len(embeddings)} embeddings")

                        # --- 3. Upsert into Qdrant with deterministic IDs ---
                        status.text("☁️ Uploading...")
                        progress.progress(0.8)

                        points = []
                        for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                            points.append(PointStruct(
                                # Deterministic ID: re-uploading the same notes
                                # overwrites the existing vectors instead of
                                # duplicating them (see _stable_point_id).
                                id=_stable_point_id(f"{source_name}_{idx}_{custom_text[:50]}"),
                                vector=embedding.tolist(),
                                payload={
                                    "content": chunk,
                                    "source_name": source_name,
                                    "source_type": "custom_notes",
                                    "chunk_index": idx
                                }
                            ))

                        client.upsert(collection_name=COLLECTION_NAME, points=points)

                        progress.progress(1.0)
                        status.empty()

                        st.success(f"πŸŽ‰ Uploaded {len(points)} vectors!")

                        count = get_vector_count_reliable(client, COLLECTION_NAME)
                        st.info(f"πŸ“Š **Total vectors: {count:,}**")

                    except Exception as e:
                        st.error(f"❌ Failed: {str(e)}")
                        st.exception(e)

    st.markdown("---")
 
318
 
319
# ============================================================================
# STEP 5B: Load Public Datasets (FIXED WITH ALL OPTIONS)
# ============================================================================

import hashlib  # stdlib; deterministic point IDs so reloading doesn't duplicate

if show_upload:
    st.header("πŸ“š Step 5B: Load Public Datasets")

    # Loading requires both the collection (Step 3) and the embedder (Step 4).
    if not st.session_state.db_created or not st.session_state.embedder_ready:
        st.error("⚠️ Complete Steps 3 & 4 first")
    else:
        with st.expander("πŸ“Š Load from Hugging Face", expanded=False):

            dataset_choice = st.selectbox(
                "Dataset:",
                [
                    "GSM8K - Grade School Math (8.5K problems)",
                    "MATH - Competition Math (12.5K problems) ✨ FIXED",
                    "DeepMind Math - School-level (2M+ examples)",
                    "CAMEL-AI Math - GPT-4 Generated (50K problems)",
                    "RACE - Reading Comprehension (28K passages)"
                ]
            )

            sample_size = st.slider("Items to load:", 10, 500, 50)

            if st.button("πŸ“₯ LOAD DATASET", type="primary"):

                try:
                    # Optional dependency; the ImportError handler below tells
                    # the operator to add it to requirements.txt.
                    from datasets import load_dataset

                    progress = st.progress(0)
                    status = st.empty()

                    # Each branch fills `texts` with "problem + solution"
                    # strings; the common pipeline below embeds and uploads.
                    texts = []

                    # ------------------------------------------------------------
                    # GSM8K
                    # ------------------------------------------------------------
                    if "GSM8K" in dataset_choice:
                        status.text("πŸ“₯ Downloading GSM8K...")
                        progress.progress(0.1)

                        dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
                        dataset_name = "GSM8K"

                        for i in range(min(sample_size, len(dataset))):
                            item = dataset[i]
                            texts.append(f"Problem: {item['question']}\n\nSolution: {item['answer']}")

                    # ------------------------------------------------------------
                    # MATH (several mirrors; try each in turn)
                    # ------------------------------------------------------------
                    elif "MATH" in dataset_choice and "Competition" in dataset_choice:
                        status.text("πŸ“₯ Downloading MATH...")
                        progress.progress(0.1)

                        dataset = None
                        dataset_name = "MATH"

                        for repo, label in [
                            ("lighteval/MATH", "βœ… Using lighteval/MATH"),
                            ("DigitalLearningGmbH/MATH-lighteval", "βœ… Using DigitalLearningGmbH/MATH"),
                            ("EleutherAI/hendrycks_math", "βœ… Using EleutherAI/hendrycks_math"),
                        ]:
                            try:
                                dataset = load_dataset(repo, split="train", trust_remote_code=True)
                                st.success(label)
                                break
                            except Exception:
                                # Mirror unavailable; fall through to the next one.
                                continue

                        if dataset is None:
                            # NOTE: st.stop() here would be swallowed by the broad
                            # `except Exception` below (StopException subclasses
                            # Exception), so we report and leave `texts` empty
                            # to skip the upload pipeline instead.
                            st.error("❌ All MATH sources failed. Try GSM8K or DeepMind instead.")
                        else:
                            for i in range(min(sample_size, len(dataset))):
                                item = dataset[i]
                                # Mirrors use slightly different column names.
                                problem = item.get('problem', item.get('question', ''))
                                solution = item.get('solution', item.get('answer', ''))
                                problem_type = item.get('type', item.get('level', 'general'))
                                texts.append(f"Problem ({problem_type}): {problem}\n\nSolution: {solution}")

                    # ------------------------------------------------------------
                    # DeepMind Math
                    # ------------------------------------------------------------
                    elif "DeepMind" in dataset_choice:
                        status.text("πŸ“₯ Downloading DeepMind Math...")
                        progress.progress(0.1)

                        # Use the arithmetic-multiplication module as a
                        # representative slice of the full 2M+ examples.
                        dataset = load_dataset(
                            "deepmind/math_dataset",
                            "arithmetic__mul",
                            split="train",
                            trust_remote_code=True
                        )
                        dataset_name = "DeepMind-Math"

                        for i in range(min(sample_size, len(dataset))):
                            item = dataset[i]
                            texts.append(f"Question: {item['question']}\n\nAnswer: {item['answer']}")

                    # ------------------------------------------------------------
                    # CAMEL-AI Math
                    # ------------------------------------------------------------
                    elif "CAMEL" in dataset_choice:
                        status.text("πŸ“₯ Downloading CAMEL-AI...")
                        progress.progress(0.1)

                        dataset = load_dataset(
                            "camel-ai/math",
                            split="train",
                            trust_remote_code=True
                        )
                        dataset_name = "CAMEL-Math"

                        for i in range(min(sample_size, len(dataset))):
                            item = dataset[i]
                            texts.append(f"Problem: {item['message']}")

                    # ------------------------------------------------------------
                    # RACE (default)
                    # ------------------------------------------------------------
                    else:
                        status.text("πŸ“₯ Downloading RACE...")
                        progress.progress(0.1)

                        dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
                        dataset_name = "RACE"

                        for i in range(min(sample_size, len(dataset))):
                            item = dataset[i]
                            texts.append(
                                f"Article: {item['article'][:500]}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}"
                            )

                    # ------------------------------------------------------------
                    # COMMON PROCESSING: embed + upload
                    # ------------------------------------------------------------
                    if texts:
                        st.write(f"βœ… Loaded {len(texts)} items from {dataset_name}")
                        progress.progress(0.3)

                        status.text("πŸ”’ Generating embeddings...")
                        # One batched call instead of one model invocation per
                        # item (matches Step 5A and is significantly faster).
                        embeddings = embedder.encode(texts, show_progress_bar=False)

                        st.write(f"βœ… Generated {len(embeddings)} embeddings")
                        progress.progress(0.8)

                        status.text("☁️ Uploading...")

                        points = []
                        for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
                            # Cap payload size; very long problems are truncated.
                            content = text[:2000] if len(text) > 2000 else text

                            # Deterministic ID: built-in hash() is salted per
                            # process, and a time.time() salt would make every
                            # reload duplicate vectors. A content-derived SHA-1
                            # keeps re-loads idempotent; 15 hex digits (60 bits)
                            # fit Qdrant's unsigned 64-bit ID range.
                            key = f"{dataset_name}_{idx}_{content[:80]}"
                            point_id = int(hashlib.sha1(key.encode("utf-8")).hexdigest()[:15], 16)

                            points.append(PointStruct(
                                id=point_id,
                                vector=embedding.tolist(),
                                payload={
                                    "content": content,
                                    "source_name": dataset_name,
                                    "source_type": "public_dataset",
                                    "dataset": dataset_name,
                                    "index": idx
                                }
                            ))

                        client.upsert(collection_name=COLLECTION_NAME, points=points)

                        progress.progress(1.0)
                        status.empty()

                        st.success(f"πŸŽ‰ Uploaded {len(points)} vectors from {dataset_name}!")

                        count = get_vector_count_reliable(client, COLLECTION_NAME)
                        st.info(f"πŸ“Š **Total vectors: {count:,}**")
                    else:
                        # Nothing loaded (e.g. all MATH mirrors failed): just
                        # clear the transient status widget.
                        status.empty()

                except ImportError:
                    st.error("❌ Add 'datasets' to requirements.txt")
                except Exception as e:
                    st.error(f"❌ Failed: {str(e)}")
                    st.exception(e)

    st.markdown("---")
 
545
 
546
# ============================================================================
# STEP 6: Search
# ============================================================================

if show_search:
    st.header("πŸ” Step 6: Test Search")

    # Search only makes sense once the collection and embedder are ready.
    if not st.session_state.db_created or not st.session_state.embedder_ready:
        st.error("⚠️ Database and embedder must be ready")
    else:
        search_query = st.text_input(
            "Question:",
            placeholder="Solve xΒ² + 5x - 4 = 0"
        )

        col1, col2 = st.columns([3, 1])
        with col1:
            top_k = st.slider("Results:", 1, 10, 5)
        with col2:
            st.metric("DB Vectors", get_vector_count_reliable(client, COLLECTION_NAME))

        if st.button("πŸ” SEARCH", type="primary") and search_query:
            try:
                with st.spinner("Searching..."):

                    # Embed the question and run a vector similarity search.
                    query_vector = embedder.encode(search_query)

                    hits = client.search(
                        collection_name=COLLECTION_NAME,
                        query_vector=query_vector.tolist(),
                        limit=top_k
                    )

                    if not hits:
                        st.warning("No results found!")
                    else:
                        st.success(f"βœ… Found {len(hits)} results!")

                        for rank, hit in enumerate(hits, 1):
                            similarity_pct = hit.score * 100

                            # Traffic-light badge for match quality.
                            badge = "🟒" if similarity_pct > 50 else ("🟑" if similarity_pct > 30 else "πŸ”΄")

                            # Auto-expand only the two best matches.
                            with st.expander(f"{badge} Result {rank} - {similarity_pct:.1f}% match", expanded=(rank <= 2)):
                                st.info(hit.payload['content'])

                                meta_source, meta_type, meta_score = st.columns(3)
                                with meta_source:
                                    st.caption(f"**Source:** {hit.payload['source_name']}")
                                with meta_type:
                                    st.caption(f"**Type:** {hit.payload['source_type']}")
                                with meta_score:
                                    st.caption(f"**Score:** {hit.score:.4f}")

            except Exception as e:
                st.error(f"❌ Search failed: {str(e)}")
609
 
610
# ============================================================================
# FOOTER
# ============================================================================

st.markdown("---")

# Closing banner for Phase 2 of the dashboard.
st.success("πŸŽ‰ Phase 2 Complete! Ready for Phase 3: PDF Upload + Full RAG with Claude")