Spaces:

Hebaelsayed
/

math-ai-system

Sleeping

App Files Files Community

Hebaelsayed committed on Jan 4

Commit

989d169

verified ·

1 Parent(s): f75aa89

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +418 -364

src/streamlit_app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from qdrant_client.models import Distance, VectorParams, PointStruct
 from sentence_transformers import SentenceTransformer
 # ============================================================================
-# PHASE 2: DATABASE + TWO UPLOAD METHODS
 # ============================================================================
 st.set_page_config(
@@ -15,14 +15,76 @@ st.set_page_config(
     layout="wide"
 )
-st.title("🗄️ Phase 2: Vector Database Setup")
-st.markdown("**Database creation + Upload for custom notes AND public datasets**")
-# Initialize session state
 if 'db_created' not in st.session_state:
     st.session_state.db_created = False
-if 'embedder_loaded' not in st.session_state:
-    st.session_state.embedder_loaded = False
 # ============================================================================
 # STEP 1: API Keys Check
@@ -37,370 +99,357 @@ qdrant_api_key = os.getenv("QDRANT_API_KEY")
 col1, col2, col3 = st.columns(3)
 with col1:
-    if anthropic_key:
-        st.success("✅ Claude API")
-    else:
-        st.error("❌ Claude API")
 with col2:
     if qdrant_url:
-        st.success(f"✅ Qdrant URL")
         st.caption(qdrant_url[:30] + "...")
-    else:
-        st.error("❌ Qdrant URL")
 with col3:
-    if qdrant_api_key:
-        st.success("✅ Qdrant API Key")
-    else:
-        st.error("❌ Qdrant API Key")
 if not all([anthropic_key, qdrant_url, qdrant_api_key]):
-    st.warning("⚠️ Missing secrets! Add them in Settings → Repository Secrets")
     st.stop()
-st.markdown("---")
-# ============================================================================
-# STEP 2: Connect to Qdrant
-# ============================================================================
-st.header("Step 2: Connect to Qdrant Database")
-col1, col2 = st.columns([2, 1])
-with col1:
-    st.info("**Platform:** Qdrant Cloud (https://cloud.qdrant.io)")
-    st.caption("This tests connection to your cloud database cluster")
-with col2:
-    if st.button("🔌 Test Connection"):
-        try:
-            with st.spinner("Connecting..."):
-                client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-                collections = client.get_collections()
-                st.success("✅ Connected!")
-                st.metric("Collections", len(collections.collections))
-                st.session_state.qdrant_client = client
-        except Exception as e:
-            st.error(f"❌ Failed: {str(e)}")
 st.markdown("---")
 # ============================================================================
-# STEP 3: Create Collection
 # ============================================================================
-st.header("🏗️ Step 3: Create Vector Database Collection")
-st.info("""
-**🖥️ Where this happens:**
-- You click button HERE in your HF Space app
-- App creates collection in Qdrant Cloud
-- You can verify in Qdrant dashboard
-**What gets created:**
-- Collection name: `math_knowledge_base`
-- Vector dimensions: 384 (matches embedding model)
-- Distance metric: COSINE similarity
-""")
-collection_name = st.text_input(
-    "Collection Name:",
-    value="math_knowledge_base",
-    help="This is your database name"
-)
-col1, col2 = st.columns([3, 1])
-with col1:
     if st.button("🏗️ CREATE DATABASE COLLECTION", type="primary"):
         try:
-            client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-            collections = client.get_collections().collections
-            exists = any(c.name == collection_name for c in collections)
-            if exists:
-                st.warning(f"Collection '{collection_name}' already exists!")
-                if st.button("✅ Use Existing"):
-                    st.session_state.db_created = True
-            else:
-                with st.spinner("Creating..."):
-                    client.create_collection(
-                        collection_name=collection_name,
-                        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
                     )
-                    st.success(f"🎉 Created: **{collection_name}**")
-                    st.balloons()
-                    st.session_state.db_created = True
         except Exception as e:
             st.error(f"❌ Failed: {str(e)}")
-with col2:
-    st.markdown("**Verify in:**")
-    st.link_button("Open Qdrant", "https://cloud.qdrant.io", use_container_width=True)
 st.markdown("---")
 # ============================================================================
-# STEP 4: Load Embedding Model
 # ============================================================================
 st.header("🤖 Step 4: Load Embedding Model")
-st.info("""
-**🖥️ Where this happens:**
-- Downloads from Hugging Face Model Hub
-- Loads into YOUR HF Space's memory
-- Takes 30-60 seconds first time
-**Model:** `sentence-transformers/all-MiniLM-L6-v2`
-- Size: ~90MB
-- Output: 384 dimensions
-- Purpose: Convert text → vectors
-""")
-if st.button("📥 LOAD EMBEDDING MODEL", type="primary"):
-    try:
-        with st.spinner("⏳ Loading... (30-60 sec first time)"):
-            model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-            st.session_state.embedder = model
-            st.session_state.embedder_loaded = True
-            st.success("✅ Model loaded!")
-            # Test
-            test_text = "Pythagorean theorem: a² + b² = c²"
-            test_embedding = model.encode(test_text)
-            st.write(f"**Test embedding shape:** {test_embedding.shape}")
-            st.caption(f"First 5 values: {test_embedding[:5]}")
-    except Exception as e:
-        st.error(f"❌ Failed: {str(e)}")
 st.markdown("---")
 # ============================================================================
-# STEP 5A: Upload Custom Notes (Manual Text)
 # ============================================================================
-st.header("📝 Step 5A: Upload Custom Math Notes (Text)")
-st.success("**For:** Your handwritten notes (converted to text) or typed notes")
-st.info("""
-**🖥️ Where this happens:**
-1. You paste text HERE in HF Space app
-2. App chunks it into pieces
-3. App converts to vectors (using model from Step 4)
-4. App uploads to Qdrant Cloud database
-""")
-with st.expander("📄 Paste your custom math notes here", expanded=True):
-    custom_text = st.text_area(
-        "Your math content:",
-        value="""Pythagorean Theorem:
-For right triangle: a² + b² = c²
-Example: a=3, b=4 → c=5
-Quadratic Formula:
-ax² + bx + c = 0
-x = (-b ± √(b²-4ac))/2a
 Derivatives:
 d/dx(xⁿ) = nxⁿ⁻¹
-d/dx(sin x) = cos x
-d/dx(eˣ) = eˣ""",
-        height=200,
-        key="custom_notes"
-    )
-    source_name = st.text_input("Note name:", value="my_notes.txt")
-    if st.button("🚀 UPLOAD CUSTOM NOTES", type="primary"):
-        if not st.session_state.get('embedder_loaded'):
-            st.error("⚠️ Load embedding model first (Step 4)")
-            st.stop()
-        if not st.session_state.get('db_created'):
-            st.error("⚠️ Create collection first (Step 3)")
-            st.stop()
-        try:
-            client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-            embedder = st.session_state.embedder
-            with st.spinner("Processing..."):
-                # Chunk
-                words = custom_text.split()
-                chunk_size = 50
-                chunks = []
-                for i in range(0, len(words), chunk_size-10):
-                    chunk = ' '.join(words[i:i + chunk_size])
-                    if chunk:
-                        chunks.append(chunk)
-                st.write(f"✅ Created {len(chunks)} chunks")
-                # Embed
-                embeddings = embedder.encode(chunks, show_progress_bar=False)
-                st.write(f"✅ Generated {len(embeddings)} embeddings")
-                # Upload
-                points = []
-                for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-                    points.append(PointStruct(
-                        id=abs(hash(f"{source_name}_{idx}")) % (2**63),
-                        vector=embedding.tolist(),
-                        payload={
-                            "content": chunk,
-                            "source_name": source_name,
-                            "source_type": "custom_notes",
-                            "chunk_index": idx
-                        }
-                    ))
-                client.upsert(collection_name=collection_name, points=points)
-                st.success(f"🎉 Uploaded {len(points)} vectors!")
-                # Show total
-                info = client.get_collection(collection_name)
-                st.info(f"📊 Total vectors in database: {info.vectors_count:,}")
-        except Exception as e:
-            st.error(f"❌ Failed: {str(e)}")
 st.markdown("---")
 # ============================================================================
-# STEP 5B: Load Public Datasets (MATH, RACE, GSM8K)
 # ============================================================================
 st.header("📚 Step 5B: Load Public Datasets")
-st.success("**For:** MATH, RACE, GSM8K datasets from Hugging Face")
-st.info("""
-**🖥️ Where this happens:**
-1. Select dataset HERE in HF Space app
-2. App downloads from Hugging Face Datasets
-3. App processes problems/solutions
-4. App uploads to Qdrant Cloud database
-**Note:** These datasets are large! Start with small samples.
-""")
-with st.expander("📊 Load public math datasets", expanded=False):
-    dataset_choice = st.selectbox(
-        "Choose dataset:",
-        ["GSM8K (8.5K problems)", "MATH (12.5K problems)", "RACE (28K questions)"]
-    )
-    sample_size = st.slider("Number of problems to load:", 10, 500, 50)
-    st.warning(f"⚠️ Loading {sample_size} problems. Larger numbers take longer!")
-    if st.button("📥 LOAD PUBLIC DATASET", key="load_dataset"):
-        if not st.session_state.get('embedder_loaded'):
-            st.error("⚠️ Load embedding model first (Step 4)")
-            st.stop()
-        if not st.session_state.get('db_created'):
-            st.error("⚠️ Create collection first (Step 3)")
-            st.stop()
-        try:
-            from datasets import load_dataset
-            client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-            embedder = st.session_state.embedder
-            with st.spinner(f"Loading {dataset_choice}..."):
-                # Load appropriate dataset
-                if "GSM8K" in dataset_choice:
-                    dataset = load_dataset("gsm8k", "main", split="train")
-                    dataset_name = "GSM8K"
-                    # Format data
-                    texts = []
-                    for i in range(min(sample_size, len(dataset))):
-                        item = dataset[i]
-                        text = f"Problem: {item['question']}\nSolution: {item['answer']}"
-                        texts.append(text)
-                elif "MATH" in dataset_choice:
-                    dataset = load_dataset("hendrycks/competition_math", split="train")
-                    dataset_name = "MATH"
-                    texts = []
-                    for i in range(min(sample_size, len(dataset))):
-                        item = dataset[i]
-                        text = f"Problem ({item['type']}): {item['problem']}\nSolution: {item['solution']}"
-                        texts.append(text)
-                else:  # RACE
-                    dataset = load_dataset("race", "all", split="train")
-                    dataset_name = "RACE"
-                    texts = []
-                    for i in range(min(sample_size, len(dataset))):
-                        item = dataset[i]
-                        text = f"Article: {item['article']}\nQuestion: {item['question']}\nAnswer: {item['answer']}"
-                        texts.append(text)
-                st.write(f"✅ Loaded {len(texts)} items from {dataset_name}")
-                # Generate embeddings
-                progress_bar = st.progress(0)
-                embeddings = []
-                for idx, text in enumerate(texts):
-                    embedding = embedder.encode(text)
-                    embeddings.append(embedding)
-                    progress_bar.progress((idx + 1) / len(texts))
-                st.write(f"✅ Generated {len(embeddings)} embeddings")
-                # Upload to Qdrant
-                points = []
-                for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
-                    points.append(PointStruct(
-                        id=abs(hash(f"{dataset_name}_{idx}")) % (2**63),
-                        vector=embedding.tolist(),
-                        payload={
-                            "content": text[:1000],  # Truncate if too long
-                            "source_name": dataset_name,
-                            "source_type": "public_dataset",
-                            "dataset": dataset_name,
-                            "index": idx
-                        }
-                    ))
-                client.upsert(collection_name=collection_name, points=points)
-                st.success(f"🎉 Uploaded {len(points)} vectors from {dataset_name}!")
-                # Show total
-                info = client.get_collection(collection_name)
-                st.info(f"📊 Total vectors in database: {info.vectors_count:,}")
-        except Exception as e:
-            st.error(f"❌ Failed: {str(e)}")
-            st.exception(e)
 st.markdown("---")
@@ -410,64 +459,55 @@ st.markdown("---")
 st.header("🔍 Step 6: Test Search")
-st.info("""
-**🖥️ Search happens:**
-1. You enter question HERE
-2. App converts to vector
-3. App searches Qdrant Cloud
-4. Returns most similar chunks
-""")
-search_query = st.text_input(
-    "Ask a question:",
-    placeholder="What is the Pythagorean theorem?"
-)
-top_k = st.slider("Results:", 1, 10, 3)
-if st.button("🔍 SEARCH", type="primary") and search_query:
-    if not st.session_state.get('embedder_loaded'):
-        st.error("⚠️ Load embedding model first")
-        st.stop()
-    try:
-        client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-        embedder = st.session_state.embedder
-        with st.spinner("Searching..."):
-            query_embedding = embedder.encode(search_query)
-            results = client.search(
-                collection_name=collection_name,
-                query_vector=query_embedding.tolist(),
-                limit=top_k
-            )
-            if results:
-                st.success(f"✅ Found {len(results)} results!")
-                for i, result in enumerate(results, 1):
-                    similarity_pct = result.score * 100
-                    with st.expander(f"Result {i} - {similarity_pct:.1f}% match", expanded=(i==1)):
-                        st.info(result.payload['content'])
-                        col1, col2 = st.columns(2)
-                        with col1:
-                            st.caption(f"Source: {result.payload['source_name']}")
-                        with col2:
-                            st.caption(f"Type: {result.payload['source_type']}")
-            else:
-                st.warning("No results found")
-    except Exception as e:
-        st.error(f"❌ Search failed: {str(e)}")
 st.markdown("---")
 # ============================================================================
-# Progress Dashboard
 # ============================================================================
 st.header("✅ Progress Dashboard")
@@ -475,21 +515,35 @@ st.header("✅ Progress Dashboard")
 col1, col2, col3 = st.columns(3)
 with col1:
-    st.metric("Database", "✅" if st.session_state.get('db_created') else "❌")
 with col2:
-    st.metric("Embedder", "✅" if st.session_state.get('embedder_loaded') else "❌")
 with col3:
-    try:
-        if st.session_state.get('db_created'):
-            client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-            info = client.get_collection(collection_name)
-            st.metric("Vectors", f"{info.vectors_count:,}")
-        else:
-            st.metric("Vectors", "N/A")
-    except:
-        st.metric("Vectors", "?")
-if st.session_state.get('db_created') and st.session_state.get('embedder_loaded'):
-    st.success("🎉 Phase 2 Complete! Ready for Phase 3: PDF Upload + Full RAG")

 from sentence_transformers import SentenceTransformer
 # ============================================================================
+# CONFIGURATION - RUNS ONCE
 # ============================================================================
 st.set_page_config(
     layout="wide"
 )
+# Collection name - centralized
+COLLECTION_NAME = "math_knowledge_base"
+# ============================================================================
+# CACHED FUNCTIONS - LOAD ONCE, REUSE FOREVER
+# ============================================================================
+@st.cache_resource
+def get_qdrant_client():
+    """Build the Qdrant client from environment configuration.
+
+    Reads QDRANT_URL and QDRANT_API_KEY from the environment. The function is
+    wrapped in st.cache_resource, so its result is cached by Streamlit.
+
+    Returns:
+        A QdrantClient, or None when either variable is missing or empty.
+    """
+    url, api_key = os.getenv("QDRANT_URL"), os.getenv("QDRANT_API_KEY")
+    if url and api_key:
+        return QdrantClient(url=url, api_key=api_key)
+    return None
+@st.cache_resource
+def get_embedding_model():
+    """Load the sentence-transformers/all-MiniLM-L6-v2 embedding model.
+
+    The function is wrapped in st.cache_resource, so its result is cached by
+    Streamlit.
+
+    Returns:
+        The SentenceTransformer instance, or None if loading raised; in that
+        case the error is also reported in the UI via st.error.
+    """
+    try:
+        return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+    except Exception as load_error:
+        st.error(f"Failed to load model: {load_error}")
+    return None
+@st.cache_data(ttl=10)  # Cache for 10 seconds
+def get_vector_count(_client, collection_name):
+    """Return how many vectors/points a collection currently holds.
+
+    Args:
+        _client: Qdrant client used for the lookup. The leading underscore
+            keeps st.cache_data from trying to hash it.
+        collection_name: Name of the collection to inspect.
+
+    Returns:
+        ``vectors_count`` when it carries a value, otherwise ``points_count``,
+        otherwise 0. 0 is also returned when the lookup itself fails.
+    """
+    try:
+        info = _client.get_collection(collection_name)
+    except Exception:
+        # Best-effort UI metric: a failed lookup is displayed as 0.
+        return 0
+    # Handle both old and new Qdrant API versions: the attribute may be
+    # missing or present-but-None, so take the first counter with a value
+    # instead of relying on hasattr(), which would hand back None.
+    for attr in ("vectors_count", "points_count"):
+        count = getattr(info, attr, None)
+        if count is not None:
+            return count
+    return 0
+def check_collection_exists(client, collection_name):
+    """Report whether a collection with the given name exists.
+
+    Args:
+        client: Object exposing ``get_collections()``, whose result has a
+            ``collections`` iterable of items with a ``name`` attribute.
+        collection_name: Name to look for.
+
+    Returns:
+        True if a collection with that exact name is listed; False if it is
+        not listed or if listing the collections raised.
+    """
+    try:
+        collections = client.get_collections().collections
+        return any(c.name == collection_name for c in collections)
+    except Exception:
+        # Deliberately best-effort, but no longer a bare ``except:`` so that
+        # KeyboardInterrupt / SystemExit are not swallowed.
+        return False
+# ============================================================================
+# INITIALIZE SESSION STATE
+# ============================================================================
 if 'db_created' not in st.session_state:
     st.session_state.db_created = False
+if 'embedder_ready' not in st.session_state:
+    st.session_state.embedder_ready = False
+if 'manual_db_check' not in st.session_state:
+    st.session_state.manual_db_check = False
+# ============================================================================
+# MAIN APP
+# ============================================================================
+st.title("🗄️ Phase 2: Vector Database Setup")
+st.markdown("**Optimized: Components load once and stay cached!**")
 # ============================================================================
 # STEP 1: API Keys Check
 col1, col2, col3 = st.columns(3)
 with col1:
+    st.metric("Claude API", "✅" if anthropic_key else "❌")
 with col2:
+    st.metric("Qdrant URL", "✅" if qdrant_url else "❌")
     if qdrant_url:
         st.caption(qdrant_url[:30] + "...")
 with col3:
+    st.metric("Qdrant Key", "✅" if qdrant_api_key else "❌")
 if not all([anthropic_key, qdrant_url, qdrant_api_key]):
+    st.error("⚠️ Missing secrets! Add in Settings → Repository Secrets")
     st.stop()
+st.success("✅ All API keys configured!")
+# Get cached client
+client = get_qdrant_client()
+if not client:
+    st.error("Failed to create Qdrant client")
+    st.stop()
 st.markdown("---")
 # ============================================================================
+# STEP 2: Auto-check Connection
 # ============================================================================
+st.header("Step 2: Qdrant Connection Status")
+try:
+    collections = client.get_collections()
+    st.success(f"✅ Connected to Qdrant! Found {len(collections.collections)} collections")
+    # Auto-check if our collection exists
+    if check_collection_exists(client, COLLECTION_NAME):
+        st.info(f"📊 Collection '{COLLECTION_NAME}' exists!")
+        st.session_state.db_created = True
+        st.session_state.manual_db_check = True
+except Exception as e:
+    st.error(f"❌ Connection failed: {str(e)}")
+    st.stop()
+st.markdown("---")
+# ============================================================================
+# STEP 3: Create Collection (FIXED)
+# ============================================================================
+st.header("🏗️ Step 3: Create Database Collection")
+# Show current status
+if st.session_state.db_created:
+    st.success(f"✅ Collection '{COLLECTION_NAME}' is ready to use!")
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("🔄 Recreate Collection (Delete & Rebuild)"):
+            try:
+                client.delete_collection(COLLECTION_NAME)
+                st.session_state.db_created = False
+                st.session_state.manual_db_check = False
+                st.rerun()
+            except Exception as e:
+                st.error(f"Delete failed: {e}")
+    with col2:
+        if st.button("ℹ️ Show Collection Info"):
+            try:
+                info = client.get_collection(COLLECTION_NAME)
+                st.json({
+                    "name": COLLECTION_NAME,
+                    "vectors": get_vector_count(client, COLLECTION_NAME),
+                    "status": "Ready"
+                })
+            except Exception as e:
+                st.error(f"Error: {e}")
+else:
+    # Collection doesn't exist - show create button
+    st.info(f"Collection '{COLLECTION_NAME}' does not exist yet.")
     if st.button("🏗️ CREATE DATABASE COLLECTION", type="primary"):
         try:
+            with st.spinner("Creating collection..."):
+                client.create_collection(
+                    collection_name=COLLECTION_NAME,
+                    vectors_config=VectorParams(
+                        size=384,
+                        distance=Distance.COSINE
                     )
+                )
+                st.success(f"🎉 Created collection: {COLLECTION_NAME}")
+                st.balloons()
+                # Update state
+                st.session_state.db_created = True
+                st.session_state.manual_db_check = True
+                # Force reload
+                st.rerun()
         except Exception as e:
             st.error(f"❌ Failed: {str(e)}")
 st.markdown("---")
 # ============================================================================
+# STEP 4: Load Embedding Model (CACHED - LOADS ONCE)
 # ============================================================================
 st.header("🤖 Step 4: Load Embedding Model")
+# Try to get cached model
+embedder = get_embedding_model()
+if embedder is not None:
+    st.success("✅ Embedding model loaded and cached!")
+    st.session_state.embedder_ready = True
+    # Show test
+    with st.expander("🧪 Model Test"):
+        test_text = "Pythagorean theorem: a² + b² = c²"
+        test_embedding = embedder.encode(test_text)
+        st.write(f"**Shape:** {test_embedding.shape}")
+        st.write(f"**Sample values:** {test_embedding[:5]}")
+else:
+    st.warning("⚠️ Model not loaded yet")
+    if st.button("📥 LOAD EMBEDDING MODEL", type="primary"):
+        st.info("Loading model... (30-60 seconds first time)")
+        with st.spinner("Loading..."):
+            # Clear cache and reload
+            get_embedding_model.clear()
+            embedder = get_embedding_model()
+            if embedder:
+                st.success("✅ Model loaded!")
+                st.session_state.embedder_ready = True
+                st.rerun()
 st.markdown("---")
 # ============================================================================
+# STEP 5A: Upload Custom Text (FIXED)
 # ============================================================================
+st.header("📝 Step 5A: Upload Custom Math Notes")
+# Check prerequisites: uploading needs the collection (Step 3) and the
+# embedding model (Step 4).
+if not st.session_state.db_created:
+    st.warning("⚠️ Please create collection first (Step 3)")
+elif not st.session_state.embedder_ready:
+    st.warning("⚠️ Please load embedding model first (Step 4)")
+else:
+    with st.expander("✍️ Upload text", expanded=True):
+        custom_text = st.text_area(
+            "Paste your math notes:",
+            value="""Pythagorean Theorem: a² + b² = c²
+Example: If a=3, b=4, then c=5
+Quadratic Formula: x = (-b ± √(b²-4ac))/2a
+For ax² + bx + c = 0
 Derivatives:
 d/dx(xⁿ) = nxⁿ⁻¹
+d/dx(sin x) = cos x""",
+            height=150
+        )
+        source_name = st.text_input("Note name:", value="my_math_notes.txt")
+        if st.button("🚀 UPLOAD TEXT", type="primary", key="upload_text"):
+            if not custom_text.strip():
+                st.error("Please enter some text!")
+            else:
+                try:
+                    # stdlib; imported locally so this section is self-contained
+                    import uuid
+                    with st.spinner("Processing..."):
+                        # Chunk text: 50-word windows, each starting 40 words
+                        # after the previous one (10-word overlap).
+                        words = custom_text.split()
+                        chunk_size = 50
+                        overlap = 10
+                        # str.split() never yields empty tokens, so every
+                        # window starting inside `words` is non-empty.
+                        chunks = [
+                            ' '.join(words[i:i + chunk_size])
+                            for i in range(0, len(words), chunk_size - overlap)
+                        ]
+                        st.write(f"📄 Created {len(chunks)} chunks")
+                        # Generate embeddings
+                        embeddings = embedder.encode(chunks, show_progress_bar=False)
+                        st.write(f"🔢 Generated {len(embeddings)} embeddings")
+                        # Upload to Qdrant
+                        points = []
+                        for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
+                            # Built-in hash() of a str is salted per process, so
+                            # the old ids changed on every restart and
+                            # re-uploads duplicated points. uuid5 is a pure
+                            # function of the key, so the same note/chunk
+                            # always maps to the same point id.
+                            point_key = f"{source_name}_{idx}_{custom_text[:20]}"
+                            points.append(PointStruct(
+                                id=str(uuid.uuid5(uuid.NAMESPACE_URL, point_key)),
+                                vector=embedding.tolist(),
+                                payload={
+                                    "content": chunk,
+                                    "source_name": source_name,
+                                    "source_type": "custom_notes",
+                                    "chunk_index": idx
+                                }
+                            ))
+                        client.upsert(
+                            collection_name=COLLECTION_NAME,
+                            points=points
+                        )
+                        st.success(f"🎉 Uploaded {len(points)} vectors!")
+                        # Clear the cached count BEFORE reading it; otherwise
+                        # the pre-upload total could still be served from cache.
+                        get_vector_count.clear()
+                        total = get_vector_count(client, COLLECTION_NAME)
+                        st.info(f"📊 Total vectors in database: {total}")
+                except Exception as e:
+                    st.error(f"❌ Upload failed: {str(e)}")
+                    st.exception(e)
 st.markdown("---")
 # ============================================================================
+# STEP 5B: Load Public Datasets (FIXED)
 # ============================================================================
 st.header("📚 Step 5B: Load Public Datasets")
+# Loading a dataset needs the collection (Step 3) and the embedding model
+# (Step 4).
+if not st.session_state.db_created:
+    st.warning("⚠️ Please create collection first (Step 3)")
+elif not st.session_state.embedder_ready:
+    st.warning("⚠️ Please load embedding model first (Step 4)")
+else:
+    with st.expander("📊 Load datasets from Hugging Face", expanded=False):
+        dataset_choice = st.selectbox(
+            "Choose dataset:",
+            [
+                "GSM8K - Grade School Math (8.5K problems)",
+                "MATH - Competition Math (12.5K problems)",
+                "RACE - Reading Comprehension (28K passages)"
+            ]
+        )
+        sample_size = st.slider("Number of items to load:", 10, 500, 50)
+        st.warning(f"⚠️ Loading {sample_size} items. First time takes longer!")
+        if st.button("📥 LOAD DATASET", type="primary", key="load_dataset"):
+            try:
+                # stdlib; imported locally so this section is self-contained
+                import uuid
+                from datasets import load_dataset
+                # NOTE(review): trust_remote_code=True lets a dataset repo run
+                # its own loading script on this machine — confirm it is still
+                # required for these three datasets before keeping it.
+                with st.spinner(f"Loading {dataset_choice.split('-')[0].strip()}..."):
+                    # Determine dataset and format the first `sample_size` rows
+                    if "GSM8K" in dataset_choice:
+                        dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
+                        dataset_name = "GSM8K"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            text = f"Problem: {item['question']}\n\nSolution: {item['answer']}"
+                            texts.append(text)
+                    elif "MATH" in dataset_choice:
+                        dataset = load_dataset("hendrycks/competition_math", split="train", trust_remote_code=True)
+                        dataset_name = "MATH"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            text = f"Problem ({item['type']}): {item['problem']}\n\nSolution: {item['solution']}"
+                            texts.append(text)
+                    else:  # RACE
+                        dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
+                        dataset_name = "RACE"
+                        texts = []
+                        for i in range(min(sample_size, len(dataset))):
+                            item = dataset[i]
+                            text = f"Article: {item['article']}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}"
+                            texts.append(text)
+                    st.write(f"✅ Loaded {len(texts)} items from {dataset_name}")
+                    # Generate embeddings one item at a time so the progress
+                    # bar can advance per item.
+                    progress_bar = st.progress(0)
+                    status_text = st.empty()
+                    embeddings = []
+                    for idx, text in enumerate(texts):
+                        embedding = embedder.encode(text)
+                        embeddings.append(embedding)
+                        progress_bar.progress((idx + 1) / len(texts))
+                        status_text.text(f"Embedding {idx + 1}/{len(texts)}")
+                    status_text.empty()
+                    st.write(f"✅ Generated {len(embeddings)} embeddings")
+                    # Upload to Qdrant
+                    points = []
+                    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
+                        # Built-in hash() of a str is salted per process, so the
+                        # old ids changed on every restart and reloading a
+                        # dataset duplicated its points. uuid5 is a pure
+                        # function of the key, so row N of a dataset always
+                        # maps to the same point id.
+                        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{dataset_name}_{idx}"))
+                        points.append(PointStruct(
+                            id=point_id,
+                            vector=embedding.tolist(),
+                            payload={
+                                # Truncate long texts (slicing is a no-op on
+                                # short ones)
+                                "content": text[:2000],
+                                "source_name": dataset_name,
+                                "source_type": "public_dataset",
+                                "dataset": dataset_name,
+                                "index": idx
+                            }
+                        ))
+                    client.upsert(
+                        collection_name=COLLECTION_NAME,
+                        points=points
+                    )
+                    st.success(f"🎉 Uploaded {len(points)} vectors from {dataset_name}!")
+                    # Clear the cached count first so the fresh total is shown
+                    get_vector_count.clear()
+                    total = get_vector_count(client, COLLECTION_NAME)
+                    st.info(f"📊 Total vectors in database: {total}")
+            except ImportError:
+                st.error("❌ 'datasets' library not installed. Add 'datasets' to requirements.txt")
+            except Exception as e:
+                st.error(f"❌ Failed: {str(e)}")
+                st.exception(e)
 st.markdown("---")
 st.header("🔍 Step 6: Test Search")
+if not st.session_state.db_created or not st.session_state.embedder_ready:
+    st.warning("⚠️ Complete Steps 3 & 4 first")
+else:
+    search_query = st.text_input(
+        "Ask a question:",
+        placeholder="What is the Pythagorean theorem?"
+    )
+    top_k = st.slider("Number of results:", 1, 10, 3)
+    if st.button("🔍 SEARCH", type="primary") and search_query:
+        try:
+            with st.spinner("Searching..."):
+                # Generate query embedding
+                query_embedding = embedder.encode(search_query)
+                # Search Qdrant
+                results = client.search(
+                    collection_name=COLLECTION_NAME,
+                    query_vector=query_embedding.tolist(),
+                    limit=top_k
+                )
+                if results:
+                    st.success(f"✅ Found {len(results)} results!")
+                    for i, result in enumerate(results, 1):
+                        similarity_pct = result.score * 100
+                        with st.expander(f"📄 Result {i} - {similarity_pct:.1f}% match", expanded=(i==1)):
+                            st.info(result.payload['content'])
+                            col1, col2 = st.columns(2)
+                            with col1:
+                                st.caption(f"**Source:** {result.payload['source_name']}")
+                            with col2:
+                                st.caption(f"**Type:** {result.payload['source_type']}")
+                else:
+                    st.warning("No results found. Upload more data!")
+        except Exception as e:
+            st.error(f"❌ Search failed: {str(e)}")
 st.markdown("---")
 # ============================================================================
+# PROGRESS DASHBOARD (FIXED)
 # ============================================================================
 st.header("✅ Progress Dashboard")
 col1, col2, col3 = st.columns(3)
 with col1:
+    st.metric("Database", "✅ Ready" if st.session_state.db_created else "❌ Not Created")
 with col2:
+    st.metric("Embedder", "✅ Ready" if st.session_state.embedder_ready else "❌ Not Loaded")
 with col3:
+    if st.session_state.db_created:
+        vector_count = get_vector_count(client, COLLECTION_NAME)
+        st.metric("Vectors", f"{vector_count:,}" if vector_count else "0")
+    else:
+        st.metric("Vectors", "N/A")
+# Success message
+if st.session_state.db_created and st.session_state.embedder_ready:
+    st.success("🎉 Phase 2 Complete! Ready for Phase 3: PDF Upload + Full RAG")
+# Debug panel
+with st.expander("🔧 Debug Info"):
+    st.json({
+        "db_created": st.session_state.db_created,
+        "embedder_ready": st.session_state.embedder_ready,
+        "collection_name": COLLECTION_NAME,
+        "cached_client": client is not None,
+        "cached_embedder": embedder is not None
+    })
+    if st.button("🔄 Clear All Caches & Restart"):
+        get_qdrant_client.clear()
+        get_embedding_model.clear()
+        get_vector_count.clear()
+        st.session_state.clear()
+        st.rerun()