Spaces:

Hebaelsayed
/

math-ai-system

Running

App Files Files Community

Hebaelsayed committed on Jan 4

Commit

3f8fbec

verified ·

1 Parent(s): 6d08aa2

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +529 -591

src/streamlit_app.py CHANGED Viewed

@@ -1,677 +1,615 @@
 import streamlit as st
 import os
-import sys
-# ============================================================================
-# LAZY IMPORTS - Only import when needed!
-# ============================================================================
-@st.cache_resource
-def lazy_import_qdrant():
-    """Import Qdrant only when needed"""
-    from qdrant_client import QdrantClient
-    from qdrant_client.models import Distance, VectorParams, PointStruct
-    return QdrantClient, Distance, VectorParams, PointStruct
-@st.cache_resource
-def lazy_import_embedder():
-    """Import sentence transformers only when needed"""
-    from sentence_transformers import SentenceTransformer
-    return SentenceTransformer
-@st.cache_resource
-def lazy_import_datasets():
-    """Import datasets only when needed"""
-    from datasets import load_dataset
-    return load_dataset
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
 st.set_page_config(
-    page_title="Math AI - Database Dashboard",
     page_icon="🗄️",
     layout="wide"
 )
 # ============================================================================
-# DATABASE CONFIGURATION SETTINGS
 # ============================================================================
-if 'db_config' not in st.session_state:
-    st.session_state.db_config = {
-        'collection_name': 'math_knowledge_base',
-        'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
-        'embedding_dimensions': 384,
-        'chunk_size': 500,
-        'chunk_overlap': 50,
-        'similarity_metric': 'COSINE',
-        'max_chunk_tokens': 8192,
-        'tokenizer': 'whitespace'
-    }
 if 'db_created' not in st.session_state:
     st.session_state.db_created = False
-if 'embedder_loaded' not in st.session_state:
-    st.session_state.embedder_loaded = False
 # ============================================================================
-# HEADER
 # ============================================================================
-st.title("🗄️ Vector Database Configuration & Analytics")
-st.markdown("**Complete database setup with full visibility and control**")
 # ============================================================================
-# SIDEBAR: QUICK STATS
 # ============================================================================
 with st.sidebar:
-    st.header("📊 Database Stats")
-    # Try to connect and get stats
-    try:
-        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
-        client = QdrantClient(
-            url=os.getenv("QDRANT_URL"),
-            api_key=os.getenv("QDRANT_API_KEY")
-        )
-        collection_name = st.session_state.db_config['collection_name']
-        # Check if collection exists
-        collections = client.get_collections().collections
-        exists = any(c.name == collection_name for c in collections)
-        if exists:
-            st.success("✅ Database Online")
-            # Get vector count
-            try:
-                scroll_result = client.scroll(
-                    collection_name=collection_name,
-                    limit=1,
-                    with_payload=False,
-                    with_vectors=False
-                )
-                # Try multiple ways to get count
-                count = 0
-                offset = None
-                max_iterations = 1000
-                iteration = 0
-                while iteration < max_iterations:
-                    result = client.scroll(
-                        collection_name=collection_name,
-                        limit=100,
-                        offset=offset,
-                        with_payload=False,
-                        with_vectors=False
-                    )
-                    if result is None or result[0] is None or len(result[0]) == 0:
-                        break
-                    count += len(result[0])
-                    offset = result[1]
-                    iteration += 1
-                    if offset is None:
-                        break
-                st.metric("Total Vectors", f"{count:,}")
-                # Calculate approximate storage size
-                vector_dim = st.session_state.db_config['embedding_dimensions']
-                bytes_per_float = 4
-                metadata_overhead = 100  # bytes per vector for metadata
-                vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
-                metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
-                total_size_mb = vector_size_mb + metadata_size_mb
-                st.metric("Storage Used", f"{total_size_mb:.2f} MB")
-                st.caption(f"Vectors: {vector_size_mb:.2f} MB")
-                st.caption(f"Metadata: {metadata_size_mb:.2f} MB")
-                # Calculate storage capacity
-                free_tier_gb = 1.0
-                used_gb = total_size_mb / 1024
-                remaining_gb = free_tier_gb - used_gb
-                usage_pct = (used_gb / free_tier_gb) * 100
-                st.metric("Free Tier Usage", f"{usage_pct:.1f}%")
-                st.progress(min(usage_pct / 100, 1.0))
-                st.caption(f"Remaining: {remaining_gb:.3f} GB")
-            except Exception as e:
-                st.error(f"Stats error: {e}")
-        else:
-            st.warning("⚠️ Database Not Created")
-    except Exception as e:
-        st.error("❌ Connection Failed")
-        st.caption(str(e)[:50])
 # ============================================================================
-# TAB 1: DATABASE CONFIGURATION
 # ============================================================================
-tab1, tab2, tab3, tab4 = st.tabs([
-    "⚙️ Configuration",
-    "📊 Analytics",
-    "🔧 Management",
-    "📚 Data Upload"
-])
-with tab1:
-    st.header("⚙️ Database Configuration")
-    st.info("**Configure your vector database parameters before creation**")
-    # ========================================================================
-    # SECTION 1: COLLECTION SETTINGS
-    # ========================================================================
-    with st.expander("🗄️ Collection Settings", expanded=True):
-        col1, col2 = st.columns(2)
-        with col1:
-            collection_name = st.text_input(
-                "Collection Name",
-                value=st.session_state.db_config['collection_name'],
-                help="Name of your vector database collection"
-            )
-            st.session_state.db_config['collection_name'] = collection_name
-        with col2:
-            similarity_options = {
-                'COSINE': 'Cosine Similarity (Best for text, -1 to 1)',
-                'EUCLIDEAN': 'Euclidean Distance (L2 norm)',
-                'DOT': 'Dot Product (Fast, unnormalized)'
-            }
-            similarity_metric = st.selectbox(
-                "Similarity Metric",
-                options=list(similarity_options.keys()),
-                index=0,
-                help="How to measure similarity between vectors",
-                format_func=lambda x: similarity_options[x]
-            )
-            st.session_state.db_config['similarity_metric'] = similarity_metric
-            # Explanation
-            st.caption("""
-            **Cosine Similarity**: Measures angle between vectors (best for text)
-            **Euclidean**: Measures distance in space (sensitive to magnitude)
-            **Dot Product**: Fast but requires normalized vectors
-            """)
-    # ========================================================================
-    # SECTION 2: EMBEDDING MODEL
-    # ========================================================================
-    with st.expander("🤖 Embedding Model Configuration", expanded=True):
-        embedding_models = {
-            'sentence-transformers/all-MiniLM-L6-v2': {
-                'name': 'all-MiniLM-L6-v2 (Recommended)',
-                'dimensions': 384,
-                'size': '90 MB',
-                'speed': 'Fast',
-                'quality': 'Good',
-                'description': 'Best balance of speed and quality for math content'
-            },
-            'sentence-transformers/all-mpnet-base-v2': {
-                'name': 'all-mpnet-base-v2 (High Quality)',
-                'dimensions': 768,
-                'size': '420 MB',
-                'speed': 'Medium',
-                'quality': 'Excellent',
-                'description': 'Higher quality embeddings, slower inference'
-            },
-            'sentence-transformers/all-MiniLM-L12-v2': {
-                'name': 'all-MiniLM-L12-v2 (Balanced)',
-                'dimensions': 384,
-                'size': '120 MB',
-                'speed': 'Medium',
-                'quality': 'Very Good',
-                'description': 'Larger MiniLM, better quality than L6'
-            }
-        }
-        selected_model = st.selectbox(
-            "Select Embedding Model",
-            options=list(embedding_models.keys()),
-            format_func=lambda x: embedding_models[x]['name']
-        )
-        st.session_state.db_config['embedding_model'] = selected_model
-        st.session_state.db_config['embedding_dimensions'] = embedding_models[selected_model]['dimensions']
-        # Model details
-        model_info = embedding_models[selected_model]
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            st.metric("Dimensions", model_info['dimensions'])
-        with col2:
-            st.metric("Model Size", model_info['size'])
-        with col3:
-            st.metric("Speed", model_info['speed'])
-        with col4:
-            st.metric("Quality", model_info['quality'])
-        st.info(f"**Why this model?** {model_info['description']}")
-    # ========================================================================
-    # SECTION 3: CHUNKING STRATEGY
-    # ========================================================================
-    with st.expander("✂️ Chunking Strategy", expanded=True):
-        st.markdown("**How to split documents into processable chunks**")
-        col1, col2 = st.columns(2)
-        with col1:
-            chunk_size = st.slider(
-                "Chunk Size (tokens)",
-                min_value=100,
-                max_value=2000,
-                value=st.session_state.db_config['chunk_size'],
-                step=50,
-                help="Number of tokens per chunk"
-            )
-            st.session_state.db_config['chunk_size'] = chunk_size
-            st.caption(f"""
-            **Small (100-300)**: Better precision, more chunks
-            **Medium (400-600)**: Balanced ✅
-            **Large (800-2000)**: More context, fewer chunks
-            """)
-        with col2:
-            chunk_overlap = st.slider(
-                "Chunk Overlap (tokens)",
-                min_value=0,
-                max_value=min(500, chunk_size // 2),
-                value=st.session_state.db_config['chunk_overlap'],
-                step=10,
-                help="Overlap between consecutive chunks"
-            )
-            st.session_state.db_config['chunk_overlap'] = chunk_overlap
-            overlap_pct = (chunk_overlap / chunk_size) * 100 if chunk_size > 0 else 0
-            st.metric("Overlap %", f"{overlap_pct:.1f}%")
-            st.caption(f"""
-            **No Overlap (0%)**: Distinct chunks, might lose context
-            **Small (5-10%)**: Minimal redundancy ✅
-            **Large (20-30%)**: More context preservation
-            """)
-        # Visualization
-        st.markdown("**Chunking Visualization:**")
-        sample_text = "The Pythagorean theorem states that a² + b² = c² for right triangles."
-        words = sample_text.split()
-        if len(words) >= 5:
-            chunk1 = ' '.join(words[:5])
-            chunk2 = ' '.join(words[3:8]) if len(words) >= 8 else ' '.join(words[3:])
-            st.code(f"""
-Chunk 1: "{chunk1}..."
-         {'↓' * (chunk_overlap // 10 if chunk_overlap > 0 else 0)}
-Chunk 2: "...{chunk2}..."
-Overlap: {chunk_overlap} tokens ({overlap_pct:.0f}%)
-            """)
-    # ========================================================================
-    # SECTION 4: TOKENIZATION & PARSING
-    # ========================================================================
-    with st.expander("🔤 Tokenization & Parsing", expanded=False):
         col1, col2 = st.columns(2)
         with col1:
-            tokenizer_options = {
-                'whitespace': 'Whitespace (Simple, fast)',
-                'nltk': 'NLTK (Sentence-aware)',
-                'tiktoken': 'TikToken (GPT-style, accurate)'
-            }
-            tokenizer = st.selectbox(
-                "Tokenizer",
-                options=list(tokenizer_options.keys()),
-                format_func=lambda x: tokenizer_options[x],
-                help="How to split text into tokens"
-            )
-            st.session_state.db_config['tokenizer'] = tokenizer
         with col2:
-            max_chunk_tokens = st.number_input(
-                "Max Tokens per Chunk",
-                min_value=512,
-                max_value=32000,
-                value=st.session_state.db_config['max_chunk_tokens'],
-                step=512,
-                help="Maximum tokens before forcing a split"
-            )
-            st.session_state.db_config['max_chunk_tokens'] = max_chunk_tokens
-        st.info("""
-        **Tokenization** converts text into tokens (words/subwords)
-        - **Whitespace**: Simple split by spaces (fastest)
-        - **NLTK**: Respects sentence boundaries (better)
-        - **TikToken**: Matches GPT tokenization (most accurate)
-        """)
-    # ========================================================================
-    # SAVE CONFIGURATION
-    # ========================================================================
     st.markdown("---")
-    if st.button("💾 Save Configuration", type="primary"):
-        st.success("✅ Configuration saved!")
-        st.json(st.session_state.db_config)
-    # Show current config
-    with st.expander("📋 View Current Configuration"):
-        st.json(st.session_state.db_config)
 # ============================================================================
-# TAB 2: ANALYTICS & VISUALIZATION
 # ============================================================================
-with tab2:
-    st.header("📊 Database Analytics")
-    try:
-        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
-        client = QdrantClient(
-            url=os.getenv("QDRANT_URL"),
-            api_key=os.getenv("QDRANT_API_KEY")
-        )
-        collection_name = st.session_state.db_config['collection_name']
-        # Check if collection exists
-        collections = client.get_collections().collections
-        exists = any(c.name == collection_name for c in collections)
-        if not exists:
-            st.warning(f"⚠️ Collection '{collection_name}' doesn't exist yet. Create it in the Management tab.")
-        else:
-            st.success(f"✅ Analyzing collection: {collection_name}")
-            # ================================================================
-            # STORAGE ANALYTICS
-            # ================================================================
-            st.subheader("💾 Storage Analytics")
-            # Get vector count
-            count = 0
-            offset = None
-            max_iter = 1000
-            for _ in range(max_iter):
-                result = client.scroll(
-                    collection_name=collection_name,
-                    limit=100,
-                    offset=offset,
-                    with_payload=False,
-                    with_vectors=False
-                )
-                if result is None or result[0] is None or len(result[0]) == 0:
-                    break
-                count += len(result[0])
-                offset = result[1]
-                if offset is None:
-                    break
-            col1, col2, col3, col4 = st.columns(4)
-            vector_dim = st.session_state.db_config['embedding_dimensions']
-            bytes_per_float = 4
-            metadata_overhead = 100
-            vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
-            metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
-            total_size_mb = vector_size_mb + metadata_size_mb
-            with col1:
-                st.metric("Total Vectors", f"{count:,}")
-            with col2:
-                st.metric("Vector Data", f"{vector_size_mb:.2f} MB")
-            with col3:
-                st.metric("Metadata", f"{metadata_size_mb:.2f} MB")
-            with col4:
-                st.metric("Total Size", f"{total_size_mb:.2f} MB")
-            # Storage breakdown
-            st.markdown("**Storage Breakdown:**")
-            storage_data = {
-                "Component": ["Vector Embeddings", "Metadata", "Index Overhead (est.)"],
-                "Size (MB)": [vector_size_mb, metadata_size_mb, total_size_mb * 0.1],
-                "Percentage": [
-                    (vector_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
-                    (metadata_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
-                    10.0
-                ]
-            }
-            st.dataframe(storage_data, use_container_width=True)
-            # Free tier usage
-            st.markdown("**Free Tier Capacity:**")
-            free_tier_gb = 1.0
-            used_gb = total_size_mb / 1024
-            remaining_gb = free_tier_gb - used_gb
-            usage_pct = (used_gb / free_tier_gb) * 100
-            col1, col2 = st.columns([2, 1])
-            with col1:
-                st.progress(min(usage_pct / 100, 1.0))
-                st.caption(f"Used: {used_gb:.3f} GB / {free_tier_gb} GB ({usage_pct:.1f}%)")
-            with col2:
-                st.metric("Remaining", f"{remaining_gb:.3f} GB")
-            # Capacity estimates
-            st.markdown("**Capacity Estimates:**")
-            if count > 0:
-                avg_vector_size = total_size_mb / count
-                max_vectors_1gb = int((1024 / avg_vector_size) * 0.9)  # 90% of theoretical max
-                st.info(f"""
-                **With current data:**
-                - Average size per vector: {avg_vector_size:.3f} MB
-                - Estimated max vectors (1GB): ~{max_vectors_1gb:,}
-                - Current capacity used: {(count / max_vectors_1gb * 100):.1f}%
-                """)
-            # ================================================================
-            # DATA SOURCE ANALYTICS
-            # ================================================================
-            st.subheader("📚 Data Source Breakdown")
-            # Sample vectors to analyze sources
-            sample_result = client.scroll(
-                collection_name=collection_name,
-                limit=min(count, 1000),
-                with_payload=True,
-                with_vectors=False
             )
-            if sample_result and sample_result[0]:
-                source_counts = {}
-                for point in sample_result[0]:
-                    source = point.payload.get('source_name', 'Unknown')
-                    source_counts[source] = source_counts.get(source, 0) + 1
-                # Display as table
-                source_data = {
-                    "Source": list(source_counts.keys()),
-                    "Vectors": list(source_counts.values()),
-                    "Percentage": [
-                        f"{(v/count*100):.1f}%" for v in source_counts.values()
-                    ]
-                }
-                st.dataframe(source_data, use_container_width=True)
-            # ================================================================
-            # CONFIGURATION SUMMARY
-            # ================================================================
-            st.subheader("⚙️ Active Configuration")
-            config_display = {
-                "Parameter": [
-                    "Embedding Model",
-                    "Vector Dimensions",
-                    "Similarity Metric",
-                    "Chunk Size",
-                    "Chunk Overlap",
-                    "Overlap Percentage"
-                ],
-                "Value": [
-                    st.session_state.db_config['embedding_model'].split('/')[-1],
-                    st.session_state.db_config['embedding_dimensions'],
-                    st.session_state.db_config['similarity_metric'],
-                    f"{st.session_state.db_config['chunk_size']} tokens",
-                    f"{st.session_state.db_config['chunk_overlap']} tokens",
-                    f"{(st.session_state.db_config['chunk_overlap'] / st.session_state.db_config['chunk_size'] * 100):.1f}%"
-                ]
-            }
-            st.dataframe(config_display, use_container_width=True)
-    except Exception as e:
-        st.error(f"❌ Error connecting to database: {str(e)}")
 # ============================================================================
-# TAB 3: MANAGEMENT
 # ============================================================================
-with tab3:
-    st.header("🔧 Database Management")
-    st.warning("⚠️ Management operations affect your database. Use carefully!")
-    try:
-        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
-        client = QdrantClient(
-            url=os.getenv("QDRANT_URL"),
-            api_key=os.getenv("QDRANT_API_KEY")
-        )
-        collection_name = st.session_state.db_config['collection_name']
-        # Check if exists
-        collections = client.get_collections().collections
-        exists = any(c.name == collection_name for c in collections)
-        if not exists:
-            st.info(f"Collection '{collection_name}' doesn't exist")
-            if st.button("🏗️ CREATE COLLECTION", type="primary"):
                 try:
-                    # Map string to Distance enum
-                    distance_map = {
-                        'COSINE': Distance.COSINE,
-                        'EUCLIDEAN': Distance.EUCLID,
-                        'DOT': Distance.DOT
-                    }
-                    client.create_collection(
-                        collection_name=collection_name,
-                        vectors_config=VectorParams(
-                            size=st.session_state.db_config['embedding_dimensions'],
-                            distance=distance_map[st.session_state.db_config['similarity_metric']]
                         )
-                    )
-                    st.success(f"✅ Created collection: {collection_name}")
-                    st.balloons()
-                    st.session_state.db_created = True
-                    st.rerun()
                 except Exception as e:
-                    st.error(f"❌ Creation failed: {str(e)}")
-        else:
-            st.success(f"✅ Collection exists: {collection_name}")
-            col1, col2 = st.columns(2)
-            with col1:
-                if st.button("🗑️ Delete Collection", type="secondary"):
-                    if st.checkbox("⚠️ Confirm deletion"):
-                        try:
-                            client.delete_collection(collection_name)
-                            st.success("✅ Collection deleted")
-                            st.session_state.db_created = False
-                            st.rerun()
-                        except Exception as e:
-                            st.error(f"Error: {e}")
-            with col2:
-                if st.button("ℹ️ Collection Info"):
-                    try:
-                        info = client.get_collection(collection_name)
-                        st.json({
-                            "name": collection_name,
-                            "status": "active"
-                        })
-                    except Exception as e:
-                        st.error(f"Error: {e}")
-    except Exception as e:
-        st.error(f"❌ Connection failed: {str(e)}")
 # ============================================================================
-# TAB 4: DATA UPLOAD (Quick Access)
 # ============================================================================
-with tab4:
-    st.header("📚 Quick Data Upload")
-    st.info("For full upload features, use the main upload interface")
-    st.markdown("[Go to Full Upload Interface →](#)")
-    # Simple text upload
-    with st.expander("Quick Text Upload"):
-        text = st.text_area("Paste text:", height=150)
-        if st.button("Upload") and text:
-            st.info("Use the main interface for full upload functionality")
 # ============================================================================
 # FOOTER
 # ============================================================================
 st.markdown("---")
-st.caption("💡 Tip: Save your configuration before creating the collection!")

 import streamlit as st
 import os
+import time
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+from sentence_transformers import SentenceTransformer
 # ============================================================================
 # CONFIGURATION
 # ============================================================================
 st.set_page_config(
+    page_title="Math AI - Phase 2: Database",
     page_icon="🗄️",
     layout="wide"
 )
+COLLECTION_NAME = "math_knowledge_base"
 # ============================================================================
+# CACHED FUNCTIONS
 # ============================================================================
+@st.cache_resource(show_spinner="🔌 Connecting to Qdrant...")
+def get_qdrant_client():
+    """Build the Qdrant client from environment settings.
+
+    Reads ``QDRANT_URL`` and ``QDRANT_API_KEY`` from the environment and
+    returns a ``QdrantClient`` when both are set and non-empty, otherwise
+    ``None``. The returned value is cached by ``st.cache_resource``.
+    """
+    endpoint = os.environ.get("QDRANT_URL")
+    secret = os.environ.get("QDRANT_API_KEY")
+    if endpoint and secret:
+        return QdrantClient(url=endpoint, api_key=secret)
+    return None
+@st.cache_resource(show_spinner="🤖 Loading embedding model (30-60s first time)...")
+def _load_embedding_model():
+    """Load the all-MiniLM-L6-v2 sentence-transformers model.
+
+    Deliberately lets exceptions propagate: a call that raises stores
+    nothing in the cache, so the next rerun attempts the load again.
+    """
+    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+def get_embedding_model():
+    """Return the cached embedding model, or ``None`` if it cannot be loaded.
+
+    On failure an error box is shown and ``None`` is returned. The failure
+    itself is not cached, so refreshing the page retries the load instead
+    of replaying a stored ``None``.
+    """
+    try:
+        return _load_embedding_model()
+    except Exception as e:
+        st.error(f"Failed to load model: {e}")
+        return None
+def get_vector_count_reliable(client, collection_name):
+    """Return the number of points stored in ``collection_name``.
+
+    Asks the server for an exact count first. If that call is missing or
+    fails, falls back to paging through the collection with ``scroll``
+    (100 points per page, at most 1000 pages). Returns 0 when counting
+    fails entirely, so callers can always format the result as an int.
+    """
+    try:
+        return client.count(collection_name=collection_name, exact=True).count
+    except Exception:
+        # Count call unavailable or failed; use the scroll fallback below.
+        pass
+    try:
+        total = 0
+        offset = None
+        max_iterations = 1000  # safety cap: the fallback sees at most 100,000 points
+        for _ in range(max_iterations):
+            result = client.scroll(
+                collection_name=collection_name,
+                limit=100,
+                offset=offset,
+                with_payload=False,
+                with_vectors=False
+            )
+            # An empty or missing page means there is nothing left to count.
+            if not result or not result[0]:
+                break
+            total += len(result[0])
+            offset = result[1]
+            # A None offset marks the last page.
+            if offset is None:
+                break
+        return total
+    except Exception:
+        # Best effort: the UI shows 0 rather than crashing on a backend error.
+        return 0
+def check_collection_exists(client, collection_name):
+    """Return True if the client lists a collection named ``collection_name``.
+
+    Any error while listing collections is treated as "not found" and
+    yields False, so callers can use the result directly in a condition.
+    """
+    try:
+        collections = client.get_collections().collections
+        return any(c.name == collection_name for c in collections)
+    except Exception:
+        # Narrower than a bare except: KeyboardInterrupt/SystemExit still propagate.
+        return False
+# ============================================================================
+# SESSION STATE
+# ============================================================================
 if 'db_created' not in st.session_state:
     st.session_state.db_created = False
+# Seed the remaining per-session flags only when they are absent, so values
+# set during earlier reruns are kept.
+for _flag, _initial in (('embedder_ready', False), ('show_step', 'all')):
+    if _flag not in st.session_state:
+        st.session_state[_flag] = _initial
 # ============================================================================
+# MAIN APP
 # ============================================================================
+st.title("🗄️ Phase 2: Vector Database Setup")
+# Fetch the shared resources for this rerun. Either value may be None:
+# get_qdrant_client() returns None when QDRANT_URL / QDRANT_API_KEY are
+# unset, and get_embedding_model() returns None when loading fails.
+client = get_qdrant_client()
+embedder = get_embedding_model()
 # ============================================================================
+# SIDEBAR
 # ============================================================================
 with st.sidebar:
+    st.header("⚡ Quick Navigation")
+    # Each button selects which sections of the page are rendered below.
+    if st.button("📋 Show All Steps", use_container_width=True):
+        st.session_state.show_step = 'all'
+    if st.button("🚀 Skip to Upload", use_container_width=True):
+        st.session_state.show_step = 'upload'
+    if st.button("🔍 Skip to Search", use_container_width=True):
+        st.session_state.show_step = 'search'
+    st.markdown("---")
+    st.subheader("📊 System Status")
+    # Refresh both flags on every rerun instead of only ever setting them to
+    # True; otherwise a collection that was dropped or became unreachable is
+    # still treated as ready by the sections further down the page.
+    st.session_state.db_created = bool(
+        client and check_collection_exists(client, COLLECTION_NAME)
+    )
+    if st.session_state.db_created:
+        st.success("✅ Database Ready")
+    else:
+        st.warning("⚠️ Database Not Ready")
+    st.session_state.embedder_ready = bool(embedder)
+    if st.session_state.embedder_ready:
+        st.success("✅ Model Loaded")
+    else:
+        st.warning("⚠️ Model Not Loaded")
+    # db_created can only be True here when a client exists.
+    if st.session_state.db_created:
+        count = get_vector_count_reliable(client, COLLECTION_NAME)
+        st.metric("Vectors in DB", f"{count:,}")
 # ============================================================================
+# CONDITIONAL DISPLAY
 # ============================================================================
+# 'all' turns every flag on; 'upload' or 'search' turns on only its own flag.
+show_all = st.session_state.show_step == 'all'
+show_upload = show_all or st.session_state.show_step == 'upload'
+show_search = show_all or st.session_state.show_step == 'search'
+# ============================================================================
+# STEP 1-2: Quick Status
+# ============================================================================
+if show_all:
+    st.header("Step 1-2: System Check")
+    # One status tile per dependency: API key present, client built, model loaded.
+    col1, col2, col3 = st.columns(3)
+    col1.metric("Claude API", "✅" if os.getenv("ANTHROPIC_API_KEY") else "❌")
+    col2.metric("Qdrant", "✅ Connected" if client else "❌")
+    col3.metric("Embedder", "✅ Cached" if embedder else "❌")
+    # Without a Qdrant client nothing below can work, so halt this run here.
+    if not client:
+        st.error("⚠️ Check Qdrant secrets!")
+        st.stop()
+    st.markdown("---")
+# ============================================================================
+# STEP 3: Collection Management
+# ============================================================================
+if show_all:
+    st.header("🏗️ Step 3: Database Collection")
+    if st.session_state.db_created:
+        st.success(f"✅ Collection '{COLLECTION_NAME}' ready!")
         col1, col2 = st.columns(2)
         with col1:
+            if st.button("🔄 Recreate Collection"):
+                try:
+                    # Drop the collection and build it again empty, so the
+                    # button does what its label says in a single click.
+                    client.delete_collection(COLLECTION_NAME)
+                    # Mark it missing first: if the create below fails, the
+                    # flag still reflects the real state.
+                    st.session_state.db_created = False
+                    client.create_collection(
+                        collection_name=COLLECTION_NAME,
+                        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+                    )
+                    st.session_state.db_created = True
+                    st.rerun()
+                except Exception as e:
+                    st.error(f"Error: {e}")
         with col2:
+            if st.button("ℹ️ Collection Info"):
+                count = get_vector_count_reliable(client, COLLECTION_NAME)
+                st.json({"name": COLLECTION_NAME, "vectors": count, "status": "Ready"})
+    else:
+        if st.button("🏗️ CREATE COLLECTION", type="primary"):
+            try:
+                # size=384 is assumed to match the embedding width of the
+                # all-MiniLM-L6-v2 model loaded by get_embedding_model().
+                client.create_collection(
+                    collection_name=COLLECTION_NAME,
+                    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
+                )
+                st.success(f"🎉 Created: {COLLECTION_NAME}")
+                st.session_state.db_created = True
+                st.rerun()
+            except Exception as e:
+                st.error(f"❌ Failed: {str(e)}")
     st.markdown("---")
+# ============================================================================
+# STEP 4: Embedding Model
+# ============================================================================
+if show_all:
+    # Embedding model status: a truthy embedder marks the session as ready
+    # for the upload and search steps.
+    st.header("🤖 Step 4: Embedding Model")
+    if not embedder:
+        st.warning("⚠️ Model loading failed. Refresh page.")
+    else:
+        st.success("✅ Model loaded and cached!")
+        st.session_state.embedder_ready = True
+    st.markdown("---")
 # ============================================================================
+# STEP 5A: Upload Custom Text
 # ============================================================================
+if show_upload:
+    # Step 5A: split pasted notes into overlapping word windows, embed each
+    # window, and upsert the resulting vectors into the Qdrant collection.
+    st.header("📝 Step 5A: Upload Custom Math Notes")
+    if not st.session_state.db_created or not st.session_state.embedder_ready:
+        st.error("⚠️ Complete Steps 3 & 4 first")
+    else:
+        with st.expander("✍️ Paste text", expanded=True):
+            custom_text = st.text_area(
+                "Math notes:",
+                value="""Linear Equations: ax + b = 0, solution is x = -b/a
+Quadratic Equations: ax² + bx + c = 0
+Solution: x = (-b ± √(b²-4ac)) / 2a
+Example: x² + 5x - 4 = 0
+Pythagorean Theorem: a² + b² = c²
+Derivatives:
+d/dx(xⁿ) = nxⁿ⁻¹
+d/dx(sin x) = cos x
+d/dx(eˣ) = eˣ""",
+                height=200
             )
+            source_name = st.text_input("Source name:", value="math_notes.txt")
+            if st.button("🚀 UPLOAD TEXT", type="primary"):
+                if not custom_text.strip():
+                    st.error("Please enter text!")
+                else:
+                    try:
+                        # Local import: only this handler needs uuid.
+                        import uuid
+                        progress = st.progress(0)
+                        status = st.empty()
+                        status.text("📄 Chunking text...")
+                        progress.progress(0.2)
+                        words = custom_text.split()
+                        chunk_size = 50  # words per chunk
+                        chunk_stride = 40  # distance between chunk starts (10-word overlap)
+                        chunks = []
+                        for start in range(0, len(words), chunk_stride):
+                            chunks.append(' '.join(words[start:start + chunk_size]))
+                            # This window already reached the last word; another
+                            # window would only repeat the tail of this one.
+                            if start + chunk_size >= len(words):
+                                break
+                        st.write(f"✅ Created {len(chunks)} chunks")
+                        status.text("🔢 Generating embeddings...")
+                        progress.progress(0.5)
+                        embeddings = embedder.encode(chunks, show_progress_bar=False)
+                        st.write(f"✅ Generated {len(embeddings)} embeddings")
+                        status.text("☁️ Uploading...")
+                        progress.progress(0.8)
+                        points = []
+                        for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
+                            # uuid5 is deterministic across processes, unlike the
+                            # builtin hash() of a str (salted per interpreter run),
+                            # so re-uploading the same notes overwrites the same
+                            # point ids instead of adding duplicates.
+                            point_key = f"{source_name}_{idx}_{custom_text[:50]}"
+                            points.append(PointStruct(
+                                id=str(uuid.uuid5(uuid.NAMESPACE_URL, point_key)),
+                                vector=embedding.tolist(),
+                                payload={
+                                    "content": chunk,
+                                    "source_name": source_name,
+                                    "source_type": "custom_notes",
+                                    "chunk_index": idx
+                                }
+                            ))
+                        client.upsert(collection_name=COLLECTION_NAME, points=points)
+                        progress.progress(1.0)
+                        status.empty()
+                        st.success(f"🎉 Uploaded {len(points)} vectors!")
+                        count = get_vector_count_reliable(client, COLLECTION_NAME)
+                        st.info(f"📊 **Total vectors: {count:,}**")
+                    except Exception as e:
+                        # UI boundary: surface the message plus the traceback.
+                        st.error(f"❌ Failed: {str(e)}")
+                        st.exception(e)
+    st.markdown("---")
 # ============================================================================
+# STEP 5B: Load Public Datasets (FIXED WITH ALL OPTIONS)
 # ============================================================================
+if show_upload:
+    # Step 5B: pull a sample from a public Hugging Face dataset, embed each
+    # item, and upsert the resulting vectors into the Qdrant collection.
+    st.header("📚 Step 5B: Load Public Datasets")
+    if not st.session_state.db_created or not st.session_state.embedder_ready:
+        st.error("⚠️ Complete Steps 3 & 4 first")
+    else:
+        with st.expander("📊 Load from Hugging Face", expanded=False):
+            dataset_choice = st.selectbox(
+                "Dataset:",
+                [
+                    "GSM8K - Grade School Math (8.5K problems)",
+                    "MATH - Competition Math (12.5K problems) ✨ FIXED",
+                    "DeepMind Math - School-level (2M+ examples)",
+                    "CAMEL-AI Math - GPT-4 Generated (50K problems)",
+                    "RACE - Reading Comprehension (28K passages)"
+                ]
+            )
+            sample_size = st.slider("Items to load:", 10, 500, 50)
+            if st.button("📥 LOAD DATASET", type="primary"):
                 try:
+                    # Local imports: only this handler needs them, and a missing
+                    # `datasets` package is reported by the ImportError branch.
+                    import uuid
+                    from datasets import load_dataset
+                    progress = st.progress(0)
+                    status = st.empty()
+                    # NOTE(review): every load below passes trust_remote_code=True,
+                    # which presumably allows the dataset repo's own loading script
+                    # to run — confirm each source is trusted.
+                    # ============================================================
+                    # GSM8K
+                    # ============================================================
+                    if "GSM8K" in dataset_choice:
+                        status.text("📥 Downloading GSM8K...")
+                        progress.progress(0.1)
+                        dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
+                        dataset_name = "GSM8K"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            texts.append(f"Problem: {item['question']}\n\nSolution: {item['answer']}")
+                    # ============================================================
+                    # MATH
+                    # ============================================================
+                    elif "MATH" in dataset_choice and "Competition" in dataset_choice:
+                        status.text("📥 Downloading MATH...")
+                        progress.progress(0.1)
+                        dataset_name = "MATH"
+                        # Mirrors are tried in order: (repo id, label shown on success).
+                        math_sources = (
+                            ("lighteval/MATH", "lighteval/MATH"),
+                            ("DigitalLearningGmbH/MATH-lighteval", "DigitalLearningGmbH/MATH"),
+                            ("EleutherAI/hendrycks_math", "EleutherAI/hendrycks_math"),
+                        )
+                        dataset = None
+                        load_failures = []
+                        for repo_id, source_label in math_sources:
+                            try:
+                                dataset = load_dataset(
+                                    repo_id,
+                                    split="train",
+                                    trust_remote_code=True
+                                )
+                            except Exception as source_error:
+                                # Remember why this mirror failed instead of
+                                # silently swallowing the error.
+                                load_failures.append(f"{repo_id}: {source_error}")
+                                continue
+                            st.success(f"✅ Using {source_label}")
+                            break
+                        if dataset is None:
+                            st.error("❌ All MATH sources failed. Try GSM8K or DeepMind instead.")
+                            for failure in load_failures:
+                                st.caption(failure)
+                            st.stop()
+                        # Mirrors use different column names, so fall back across them.
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            problem = item.get('problem', item.get('question', ''))
+                            solution = item.get('solution', item.get('answer', ''))
+                            problem_type = item.get('type', item.get('level', 'general'))
+                            texts.append(f"Problem ({problem_type}): {problem}\n\nSolution: {solution}")
+                    # ============================================================
+                    # DeepMind Math
+                    # ============================================================
+                    elif "DeepMind" in dataset_choice:
+                        status.text("📥 Downloading DeepMind Math...")
+                        progress.progress(0.1)
+                        # Only the multiplication module is loaded.
+                        dataset = load_dataset(
+                            "deepmind/math_dataset",
+                            "arithmetic__mul",
+                            split="train",
+                            trust_remote_code=True
                         )
+                        dataset_name = "DeepMind-Math"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            texts.append(f"Question: {item['question']}\n\nAnswer: {item['answer']}")
+                    # ============================================================
+                    # CAMEL-AI Math
+                    # ============================================================
+                    elif "CAMEL" in dataset_choice:
+                        status.text("📥 Downloading CAMEL-AI...")
+                        progress.progress(0.1)
+                        dataset = load_dataset(
+                            "camel-ai/math",
+                            split="train",
+                            trust_remote_code=True
+                        )
+                        dataset_name = "CAMEL-Math"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            # NOTE(review): the dataset card appears to expose
+                            # message_1 (problem) / message_2 (solution) rather than
+                            # 'message' — verify; the old key is kept as a fallback.
+                            problem = item.get('message_1', item.get('message', ''))
+                            solution = item.get('message_2', '')
+                            text = f"Problem: {problem}"
+                            if solution:
+                                text += f"\n\nSolution: {solution}"
+                            texts.append(text)
+                    # ============================================================
+                    # RACE
+                    # ============================================================
+                    else:
+                        status.text("📥 Downloading RACE...")
+                        progress.progress(0.1)
+                        dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
+                        dataset_name = "RACE"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            # Articles are cut to 500 characters.
+                            texts.append(f"Article: {item['article'][:500]}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}")
+                    # ============================================================
+                    # COMMON PROCESSING
+                    # ============================================================
+                    st.write(f"✅ Loaded {len(texts)} items from {dataset_name}")
+                    progress.progress(0.3)
+                    status.text("🔢 Generating embeddings...")
+                    # Encode in batches (as Step 5A does) rather than one text per
+                    # call; progress advances from 0.3 to 0.8 across the batches.
+                    embeddings = []
+                    batch_size = 32
+                    for start in range(0, len(texts), batch_size):
+                        status.text(f"🔢 Embedding {start+1}/{len(texts)}")
+                        batch = texts[start:start + batch_size]
+                        embeddings.extend(embedder.encode(batch, show_progress_bar=False))
+                        done = min(start + batch_size, len(texts))
+                        progress.progress(0.3 + (0.5 * done / len(texts)))
+                    st.write(f"✅ Generated {len(embeddings)} embeddings")
+                    progress.progress(0.8)
+                    status.text("☁️ Uploading...")
+                    points = []
+                    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
+                        points.append(PointStruct(
+                            # Fresh random id per point: every load adds new points,
+                            # with no dependency on the `time` module or on the
+                            # per-process salted builtin hash().
+                            id=str(uuid.uuid4()),
+                            vector=embedding.tolist(),
+                            payload={
+                                # Stored content is capped at 2000 characters.
+                                "content": text[:2000],
+                                "source_name": dataset_name,
+                                "source_type": "public_dataset",
+                                "dataset": dataset_name,
+                                "index": idx
+                            }
+                        ))
+                    client.upsert(collection_name=COLLECTION_NAME, points=points)
+                    progress.progress(1.0)
+                    status.empty()
+                    st.success(f"🎉 Uploaded {len(points)} vectors from {dataset_name}!")
+                    count = get_vector_count_reliable(client, COLLECTION_NAME)
+                    st.info(f"📊 **Total vectors: {count:,}**")
+                except ImportError:
+                    st.error("❌ Add 'datasets' to requirements.txt")
                 except Exception as e:
+                    # UI boundary: surface the message plus the traceback.
+                    st.error(f"❌ Failed: {str(e)}")
+                    st.exception(e)
+    st.markdown("---")
 # ============================================================================
+# STEP 6: Search
 # ============================================================================
+if show_search:
+    # Step 6: embed the typed question and list the nearest stored chunks,
+    # each with a colour badge derived from its similarity score.
+    st.header("🔍 Step 6: Test Search")
+    if not st.session_state.db_created or not st.session_state.embedder_ready:
+        st.error("⚠️ Database and embedder must be ready")
+    else:
+        search_query = st.text_input(
+            "Question:",
+            placeholder="Solve x² + 5x - 4 = 0"
+        )
+        col1, col2 = st.columns([3, 1])
+        with col1:
+            top_k = st.slider("Results:", 1, 10, 5)
+        with col2:
+            st.metric("DB Vectors", get_vector_count_reliable(client, COLLECTION_NAME))
+        if st.button("🔍 SEARCH", type="primary"):
+            if not search_query.strip():
+                # Give feedback instead of silently ignoring the click.
+                st.warning("Please enter a question!")
+            else:
+                try:
+                    with st.spinner("Searching..."):
+                        query_embedding = embedder.encode(search_query)
+                        # NOTE(review): newer qdrant-client releases may prefer
+                        # `query_points` over `search` — confirm against the
+                        # installed version.
+                        results = client.search(
+                            collection_name=COLLECTION_NAME,
+                            query_vector=query_embedding.tolist(),
+                            limit=top_k
+                        )
+                        if results:
+                            st.success(f"✅ Found {len(results)} results!")
+                            for i, result in enumerate(results, 1):
+                                similarity_pct = result.score * 100
+                                if similarity_pct > 50:
+                                    color = "🟢"
+                                elif similarity_pct > 30:
+                                    color = "🟡"
+                                else:
+                                    color = "🔴"
+                                # Points written by other tools may lack these
+                                # payload keys; fall back instead of aborting the
+                                # whole result list on a KeyError.
+                                payload = result.payload or {}
+                                with st.expander(f"{color} Result {i} - {similarity_pct:.1f}% match", expanded=(i<=2)):
+                                    st.info(payload.get('content', ''))
+                                    # Distinct names so the outer layout columns
+                                    # are not shadowed.
+                                    source_col, type_col, score_col = st.columns(3)
+                                    with source_col:
+                                        st.caption(f"**Source:** {payload.get('source_name', 'unknown')}")
+                                    with type_col:
+                                        st.caption(f"**Type:** {payload.get('source_type', 'unknown')}")
+                                    with score_col:
+                                        st.caption(f"**Score:** {result.score:.4f}")
+                        else:
+                            st.warning("No results found!")
+                except Exception as e:
+                    st.error(f"❌ Search failed: {str(e)}")
 # ============================================================================
 # FOOTER
 # ============================================================================
 st.markdown("---")
+# Closing banner: unconditional, so it shows on every run that reaches the end of the script.
+st.success("🎉 Phase 2 Complete! Ready for Phase 3: PDF Upload + Full RAG with Claude")