Spaces:

Hebaelsayed
/

math-ai-system

Building

App Files Files Community

Hebaelsayed committed 19 days ago

Commit

3ebdd9a

verified ·

1 Parent(s): f41b68e

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +504 -560

src/streamlit_app.py CHANGED Viewed

@@ -1,681 +1,625 @@
 import streamlit as st
 import os
 import time
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
 from sentence_transformers import SentenceTransformer
-import PyPDF2
-import io
 # ============================================================================
-# CONFIGURATION
 # ============================================================================
 st.set_page_config(
-    page_title="Math AI - Phase 2.5: Database + PDF",
-    page_icon="🗄️",
-    layout="wide"
 )
 COLLECTION_NAME = "math_knowledge_base"
 # ============================================================================
-# CACHED FUNCTIONS
 # ============================================================================
-@st.cache_resource(show_spinner="🔌 Connecting to Qdrant...")
-def get_qdrant_client():
-    qdrant_url = os.getenv("QDRANT_URL")
-    qdrant_api_key = os.getenv("QDRANT_API_KEY")
-    if not qdrant_url or not qdrant_api_key:
-        return None
-    return QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
-@st.cache_resource(show_spinner="🤖 Loading embedding model (30-60s first time)...")
-def get_embedding_model():
     try:
-        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-        return model
     except Exception as e:
-        st.error(f"Failed to load model: {e}")
         return None
-def get_vector_count_reliable(client, collection_name):
     try:
         count = 0
         offset = None
-        max_iterations = 1000
-        for _ in range(max_iterations):
-            result = client.scroll(
-                collection_name=collection_name,
                 limit=100,
                 offset=offset,
                 with_payload=False,
                 with_vectors=False
             )
-            if result is None or result[0] is None or len(result[0]) == 0:
                 break
             count += len(result[0])
             offset = result[1]
             if offset is None:
                 break
         return count
     except:
         return 0
-def check_collection_exists(client, collection_name):
-    try:
-        collections = client.get_collections().collections
-        return any(c.name == collection_name for c in collections)
-    except:
-        return False
-def extract_text_from_pdf(pdf_file):
-    """Extract text from PDF file"""
-    try:
-        pdf_reader = PyPDF2.PdfReader(pdf_file)
-        text = ""
-        for page_num, page in enumerate(pdf_reader.pages):
-            page_text = page.extract_text()
-            text += f"\n\n--- Page {page_num + 1} ---\n\n{page_text}"
-        return text
-    except Exception as e:
-        st.error(f"PDF extraction error: {str(e)}")
-        return None
 # ============================================================================
-# SESSION STATE
 # ============================================================================
-if 'db_created' not in st.session_state:
-    st.session_state.db_created = False
-if 'embedder_ready' not in st.session_state:
-    st.session_state.embedder_ready = False
-if 'show_step' not in st.session_state:
-    st.session_state.show_step = 'all'
 # ============================================================================
-# MAIN APP
 # ============================================================================
-st.title("🗄️ Phase 2.5: Database Setup + PDF Upload")
-client = get_qdrant_client()
-embedder = get_embedding_model()
-# ============================================================================
-# SIDEBAR
-# ============================================================================
-with st.sidebar:
-    st.header("⚡ Quick Navigation")
-    if st.button("📋 Show All Steps", use_container_width=True):
-        st.session_state.show_step = 'all'
-    if st.button("🚀 Skip to Upload", use_container_width=True):
-        st.session_state.show_step = 'upload'
-    if st.button("🔍 Skip to Search", use_container_width=True):
-        st.session_state.show_step = 'search'
-    st.markdown("---")
-    st.subheader("📊 System Status")
-    if client and check_collection_exists(client, COLLECTION_NAME):
-        st.success("✅ Database Ready")
-        st.session_state.db_created = True
-    else:
-        st.warning("⚠️ Database Not Ready")
-    if embedder:
-        st.success("✅ Model Loaded")
-        st.session_state.embedder_ready = True
-    else:
-        st.warning("⚠️ Model Not Loaded")
-    if client and st.session_state.db_created:
-        count = get_vector_count_reliable(client, COLLECTION_NAME)
-        st.metric("Vectors in DB", f"{count:,}")
-show_all = st.session_state.show_step == 'all'
-show_upload = st.session_state.show_step in ['all', 'upload']
-show_search = st.session_state.show_step in ['all', 'search']
 # ============================================================================
-# STEP 1-2: Quick Status
 # ============================================================================
-if show_all:
-    st.header("Step 1-2: System Check")
-    col1, col2, col3 = st.columns(3)
-    with col1:
-        st.metric("Claude API", "✅" if os.getenv("ANTHROPIC_API_KEY") else "❌")
-    with col2:
-        st.metric("Qdrant", "✅ Connected" if client else "❌")
-    with col3:
-        st.metric("Embedder", "✅ Cached" if embedder else "❌")
-    if not client:
-        st.error("⚠️ Check Qdrant secrets!")
-        st.stop()
-    st.markdown("---")
-# ============================================================================
-# STEP 3: Collection Management
-# ============================================================================
-if show_all:
-    st.header("🏗️ Step 3: Database Collection")
-    if st.session_state.db_created:
-        st.success(f"✅ Collection '{COLLECTION_NAME}' ready!")
         col1, col2 = st.columns(2)
         with col1:
-            if st.button("🔄 Recreate Collection"):
-                try:
-                    client.delete_collection(COLLECTION_NAME)
-                    st.session_state.db_created = False
-                    st.rerun()
-                except Exception as e:
-                    st.error(f"Error: {e}")
         with col2:
-            if st.button("ℹ️ Collection Info"):
-                count = get_vector_count_reliable(client, COLLECTION_NAME)
-                st.json({"name": COLLECTION_NAME, "vectors": count, "status": "Ready"})
-    else:
-        if st.button("🏗️ CREATE COLLECTION", type="primary"):
             try:
-                client.create_collection(
                     collection_name=COLLECTION_NAME,
-                    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
                 )
-                st.success(f"🎉 Created: {COLLECTION_NAME}")
-                st.session_state.db_created = True
-                st.rerun()
             except Exception as e:
-                st.error(f"❌ Failed: {str(e)}")
-    st.markdown("---")
-# ============================================================================
-# STEP 4: Embedding Model
-# ============================================================================
-if show_all:
-    st.header("🤖 Step 4: Embedding Model")
-    if embedder:
-        st.success("✅ Model loaded and cached!")
-        st.session_state.embedder_ready = True
-    else:
-        st.warning("⚠️ Model loading failed. Refresh page.")
-    st.markdown("---")
-# ============================================================================
-# STEP 5A: Upload Custom Text
-# ============================================================================
-if show_upload:
-    st.header("📝 Step 5A: Upload Custom Notes")
-    if not st.session_state.db_created or not st.session_state.embedder_ready:
-        st.error("⚠️ Complete Steps 3 & 4 first")
-    else:
-        # Choose upload method
-        upload_method = st.radio(
-            "Upload method:",
-            ["📝 Paste Text", "📄 Upload PDF File"],
-            horizontal=True
-        )
-        if upload_method == "📝 Paste Text":
-            with st.expander("✍️ Paste text", expanded=True):
-                custom_text = st.text_area(
-                    "Math notes:",
-                    value="""Linear Equations: ax + b = 0, solution is x = -b/a
-Quadratic Equations: ax² + bx + c = 0
-Solution: x = (-b ± √(b²-4ac)) / 2a
-Pythagorean Theorem: a² + b² = c²
-Derivatives:
-d/dx(xⁿ) = nxⁿ⁻¹
-d/dx(sin x) = cos x""",
-                    height=200
-                )
-                source_name = st.text_input("Source name:", value="math_notes.txt")
-                if st.button("🚀 UPLOAD TEXT", type="primary"):
-                    if not custom_text.strip():
-                        st.error("Please enter text!")
-                    else:
-                        try:
-                            progress = st.progress(0)
-                            status = st.empty()
-                            status.text("📄 Chunking text...")
-                            progress.progress(0.2)
-                            words = custom_text.split()
-                            chunks = []
-                            chunk_size = 50
-                            for i in range(0, len(words), 40):
-                                chunk = ' '.join(words[i:i + chunk_size])
-                                if chunk.strip():
-                                    chunks.append(chunk)
-                            st.write(f"✅ Created {len(chunks)} chunks")
-                            status.text("🔢 Generating embeddings...")
-                            progress.progress(0.5)
-                            embeddings = embedder.encode(chunks, show_progress_bar=False)
-                            st.write(f"✅ Generated {len(embeddings)} embeddings")
-                            status.text("☁️ Uploading...")
-                            progress.progress(0.8)
-                            points = []
-                            for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-                                points.append(PointStruct(
-                                    id=abs(hash(f"{source_name}_{idx}_{custom_text[:50]}_{time.time()}")) % (2**63),
-                                    vector=embedding.tolist(),
-                                    payload={
-                                        "content": chunk,
-                                        "source_name": source_name,
-                                        "source_type": "custom_notes",
-                                        "chunk_index": idx
-                                    }
-                                ))
-                            client.upsert(collection_name=COLLECTION_NAME, points=points)
-                            progress.progress(1.0)
-                            status.empty()
-                            st.success(f"🎉 Uploaded {len(points)} vectors!")
-                            count = get_vector_count_reliable(client, COLLECTION_NAME)
-                            st.info(f"📊 **Total vectors: {count:,}**")
-                        except Exception as e:
-                            st.error(f"❌ Failed: {str(e)}")
-                            st.exception(e)
-        else:  # PDF Upload
-            with st.expander("📄 Upload PDF", expanded=True):
-                st.info("🎉 **NEW!** Upload your math PDFs directly")
-                uploaded_file = st.file_uploader(
-                    "Choose PDF file:",
-                    type=['pdf'],
-                    help="Upload a PDF with math content"
-                )
-                if uploaded_file:
-                    st.write(f"📄 File: {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")
-                    source_name = st.text_input(
-                        "Source name:",
-                        value=uploaded_file.name.replace('.pdf', '')
                     )
-                    if st.button("🚀 UPLOAD PDF", type="primary"):
-                        try:
-                            progress = st.progress(0)
-                            status = st.empty()
-                            # Extract text
-                            status.text("📖 Extracting text from PDF...")
-                            progress.progress(0.1)
-                            extracted_text = extract_text_from_pdf(uploaded_file)
-                            if not extracted_text:
-                                st.error("❌ Failed to extract text from PDF")
-                                st.stop()
-                            st.write(f"✅ Extracted {len(extracted_text)} characters")
-                            # Show preview
-                            with st.expander("👁️ Preview extracted text"):
-                                st.text(extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text)
-                            # Chunk
-                            status.text("📄 Chunking text...")
-                            progress.progress(0.3)
-                            words = extracted_text.split()
-                            chunks = []
-                            chunk_size = 100  # Larger chunks for PDFs
-                            overlap = 20
-                            for i in range(0, len(words), chunk_size - overlap):
-                                chunk = ' '.join(words[i:i + chunk_size])
-                                if chunk.strip():
-                                    chunks.append(chunk)
-                            st.write(f"✅ Created {len(chunks)} chunks")
-                            # Embed
-                            status.text("🔢 Generating embeddings...")
-                            progress.progress(0.5)
-                            embeddings = []
-                            for idx, chunk in enumerate(chunks):
-                                embedding = embedder.encode(chunk)
-                                embeddings.append(embedding)
-                                if idx % 20 == 0:
-                                    progress.progress(0.5 + (0.3 * idx / len(chunks)))
-                            st.write(f"✅ Generated {len(embeddings)} embeddings")
-                            # Upload
-                            status.text("☁️ Uploading to database...")
-                            progress.progress(0.9)
-                            points = []
-                            for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-                                points.append(PointStruct(
-                                    id=abs(hash(f"pdf_{source_name}_{idx}_{time.time()}")) % (2**63),
-                                    vector=embedding.tolist(),
-                                    payload={
-                                        "content": chunk,
-                                        "source_name": source_name,
-                                        "source_type": "pdf_upload",
-                                        "chunk_index": idx,
-                                        "file_name": uploaded_file.name
-                                    }
-                                ))
-                            client.upsert(collection_name=COLLECTION_NAME, points=points)
-                            progress.progress(1.0)
-                            status.empty()
-                            st.success(f"🎉 Uploaded {len(points)} vectors from PDF!")
-                            st.balloons()
-                            count = get_vector_count_reliable(client, COLLECTION_NAME)
-                            st.info(f"📊 **Total vectors: {count:,}**")
-                        except Exception as e:
-                            st.error(f"❌ Upload failed: {str(e)}")
-                            st.exception(e)
-    st.markdown("---")
 # ============================================================================
-# STEP 5B: Load Public Datasets (FIXED - No DeepMind)
 # ============================================================================
-if show_upload:
-    st.header("📚 Step 5B: Load Public Datasets")
-    if not st.session_state.db_created or not st.session_state.embedder_ready:
-        st.error("⚠️ Complete Steps 3 & 4 first")
-    else:
-        with st.expander("📊 Load from Hugging Face", expanded=False):
-            dataset_choice = st.selectbox(
-                "Dataset:",
-                [
-                    "GSM8K - Grade School Math (8.5K problems)",
-                    "MATH - Competition Math (12.5K problems) ✨",
-                    "MathQA - Math Word Problems (37K problems) 🆕",
-                    "CAMEL-AI Math - GPT-4 Generated (50K problems)",
-                    "RACE - Reading Comprehension (28K passages)"
-                ]
-            )
-            # INCREASED LIMIT FROM 500 TO 2000!
-            sample_size = st.slider("Items to load:", 10, 2000, 50)
-            st.warning(f"⚠️ Loading {sample_size} items. Large numbers take 5-15 minutes!")
-            if st.button("📥 LOAD DATASET", type="primary"):
-                try:
-                    from datasets import load_dataset
-                    progress = st.progress(0)
-                    status = st.empty()
-                    # GSM8K
-                    if "GSM8K" in dataset_choice:
-                        status.text("📥 Downloading GSM8K...")
-                        progress.progress(0.1)
-                        dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
-                        dataset_name = "GSM8K"
-                        texts = []
-                        for i in range(min(sample_size, len(dataset))):
-                            item = dataset[i]
-                            text = f"Problem: {item['question']}\n\nSolution: {item['answer']}"
-                            texts.append(text)
-                    # MATH
-                    elif "MATH" in dataset_choice and "Competition" in dataset_choice:
-                        status.text("📥 Downloading MATH...")
-                        progress.progress(0.1)
-                        dataset = None
-                        dataset_name = "MATH"
-                        # Try multiple sources
-                        for source in ["lighteval/MATH", "DigitalLearningGmbH/MATH-lighteval", "EleutherAI/hendrycks_math"]:
-                            try:
-                                dataset = load_dataset(source, split="train", trust_remote_code=True)
-                                st.success(f"✅ Using {source}")
-                                break
-                            except:
-                                continue
-                        if dataset is None:
-                            st.error("❌ All MATH sources failed")
-                            st.stop()
-                        texts = []
-                        for i in range(min(sample_size, len(dataset))):
-                            item = dataset[i]
-                            problem = item.get('problem', item.get('question', ''))
-                            solution = item.get('solution', item.get('answer', ''))
-                            problem_type = item.get('type', item.get('level', 'general'))
-                            text = f"Problem ({problem_type}): {problem}\n\nSolution: {solution}"
-                            texts.append(text)
-                    # MathQA (REPLACES DEEPMIND)
-                    elif "MathQA" in dataset_choice:
-                        status.text("📥 Downloading MathQA...")
-                        progress.progress(0.1)
-                        st.info("🆕 MathQA: 37K math word problems with detailed solutions")
-                        dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
-                        dataset_name = "MathQA"
-                        texts = []
-                        for i in range(min(sample_size, len(dataset))):
-                            item = dataset[i]
-                            text = f"Problem: {item['Problem']}\n\nRationale: {item['Rationale']}\n\nAnswer: {item['correct']}"
-                            texts.append(text)
-                    # CAMEL-AI
-                    elif "CAMEL" in dataset_choice:
-                        status.text("📥 Downloading CAMEL-AI...")
-                        progress.progress(0.1)
-                        dataset = load_dataset("camel-ai/math", split="train", trust_remote_code=True)
-                        dataset_name = "CAMEL-Math"
-                        texts = []
-                        for i in range(min(sample_size, len(dataset))):
-                            item = dataset[i]
-                            text = f"Problem: {item['message']}"
-                            texts.append(text)
-                    # RACE
-                    else:
-                        status.text("📥 Downloading RACE...")
-                        progress.progress(0.1)
-                        dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
-                        dataset_name = "RACE"
-                        texts = []
-                        for i in range(min(sample_size, len(dataset))):
-                            item = dataset[i]
-                            text = f"Article: {item['article'][:500]}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}"
-                            texts.append(text)
-                    # Common processing
-                    st.write(f"✅ Loaded {len(texts)} items from {dataset_name}")
-                    progress.progress(0.3)
-                    status.text("🔢 Generating embeddings...")
-                    embeddings = []
-                    for idx, text in enumerate(texts):
-                        embedding = embedder.encode(text)
-                        embeddings.append(embedding)
-                        if idx % 50 == 0:
-                            progress.progress(0.3 + (0.5 * idx / len(texts)))
-                            status.text(f"🔢 Embedding {idx+1}/{len(texts)}")
-                    st.write(f"✅ Generated {len(embeddings)} embeddings")
-                    progress.progress(0.8)
-                    status.text("☁️ Uploading...")
                     points = []
-                    for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
-                        content = text[:2000] if len(text) > 2000 else text
                         points.append(PointStruct(
-                            id=abs(hash(f"{dataset_name}_{idx}_{time.time()}")) % (2**63),
-                            vector=embedding.tolist(),
                             payload={
-                                "content": content,
-                                "source_name": dataset_name,
                                 "source_type": "public_dataset",
-                                "dataset": dataset_name,
-                                "index": idx
                             }
                         ))
-                    client.upsert(collection_name=COLLECTION_NAME, points=points)
-                    progress.progress(1.0)
-                    status.empty()
-                    st.success(f"🎉 Uploaded {len(points)} vectors from {dataset_name}!")
-                    count = get_vector_count_reliable(client, COLLECTION_NAME)
-                    st.info(f"📊 **Total vectors: {count:,}**")
-                except ImportError:
-                    st.error("❌ Add 'datasets' to requirements.txt")
-                except Exception as e:
-                    st.error(f"❌ Failed: {str(e)}")
-                    st.exception(e)
-    st.markdown("---")
 # ============================================================================
-# STEP 6: Search
 # ============================================================================
-if show_search:
-    st.header("🔍 Step 6: Test Search")
-    if not st.session_state.db_created or not st.session_state.embedder_ready:
-        st.error("⚠️ Database and embedder must be ready")
-    else:
-        search_query = st.text_input(
-            "Question:",
-            placeholder="Solve x² + 5x - 4 = 0"
-        )
-        col1, col2 = st.columns([3, 1])
-        with col1:
-            top_k = st.slider("Results:", 1, 10, 5)
-        with col2:
-            st.metric("DB Vectors", get_vector_count_reliable(client, COLLECTION_NAME))
-        if st.button("🔍 SEARCH", type="primary") and search_query:
-            try:
-                with st.spinner("Searching..."):
-                    query_embedding = embedder.encode(search_query)
-                    results = client.search(
-                        collection_name=COLLECTION_NAME,
-                        query_vector=query_embedding.tolist(),
-                        limit=top_k
-                    )
-                    if results:
-                        st.success(f"✅ Found {len(results)} results!")
-                        for i, result in enumerate(results, 1):
-                            similarity_pct = result.score * 100
-                            if similarity_pct > 50:
-                                color = "🟢"
-                            elif similarity_pct > 30:
-                                color = "🟡"
-                            else:
-                                color = "🔴"
-                            with st.expander(f"{color} Result {i} - {similarity_pct:.1f}% match", expanded=(i<=2)):
-                                st.info(result.payload['content'])
-                                col1, col2, col3 = st.columns(3)
-                                with col1:
-                                    st.caption(f"**Source:** {result.payload['source_name']}")
-                                with col2:
-                                    st.caption(f"**Type:** {result.payload['source_type']}")
-                                with col3:
-                                    st.caption(f"**Score:** {result.score:.4f}")
-                    else:
-                        st.warning("No results found!")
-            except Exception as e:
-                st.error(f"❌ Search failed: {str(e)}")
-st.markdown("---")
-st.success("🎉 Phase 2.5 Complete! You now have: Text, PDF upload, and 4 working datasets!")

 import streamlit as st
 import os
 import time
+import base64
+from io import BytesIO
+from PIL import Image
+import PyPDF2
+from anthropic import Anthropic
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
 from sentence_transformers import SentenceTransformer
 # ============================================================================
+# COMPLETE MATH AI SYSTEM - ALL-IN-ONE HUGGING FACE SPACE
 # ============================================================================
 st.set_page_config(
+    page_title="Math AI System",
+    page_icon="🎓",
+    layout="wide",
+    initial_sidebar_state="expanded"
 )
 COLLECTION_NAME = "math_knowledge_base"
 # ============================================================================
+# CACHED RESOURCES
 # ============================================================================
+@st.cache_resource
+def get_clients():
+    """Initialize all clients - cached"""
+    qdrant = QdrantClient(
+        url=os.getenv("QDRANT_URL"),
+        api_key=os.getenv("QDRANT_API_KEY")
+    )
+    claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+    return qdrant, claude, embedder
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+def extract_text_from_pdf(pdf_file):
+    """Extract text from typed PDF"""
     try:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page_num, page in enumerate(pdf_reader.pages):
+            text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
+        return text
     except Exception as e:
         return None
+def chunk_text(text, chunk_size=150, overlap=30):
+    """Split text into chunks"""
+    words = text.split()
+    chunks = []
+    for i in range(0, len(words), chunk_size - overlap):
+        chunk = ' '.join(words[i:i + chunk_size])
+        if chunk.strip():
+            chunks.append(chunk)
+    return chunks
+def get_vector_count(qdrant):
+    """Get total vectors in database"""
     try:
         count = 0
         offset = None
+        for _ in range(1000):
+            result = qdrant.scroll(
+                collection_name=COLLECTION_NAME,
                 limit=100,
                 offset=offset,
                 with_payload=False,
                 with_vectors=False
             )
+            if not result or not result[0]:
                 break
             count += len(result[0])
             offset = result[1]
             if offset is None:
                 break
         return count
     except:
         return 0
 # ============================================================================
+# MAIN APP
 # ============================================================================
+# Initialize clients
+try:
+    qdrant, claude, embedder = get_clients()
+    st.sidebar.success("✅ System Ready")
+except Exception as e:
+    st.error(f"❌ Initialization failed: {e}")
+    st.info("Add QDRANT_URL, QDRANT_API_KEY, and ANTHROPIC_API_KEY in Settings → Secrets")
+    st.stop()
 # ============================================================================
+# SIDEBAR: MODE SELECTION
 # ============================================================================
+st.sidebar.title("🎓 Math AI System")
+mode = st.sidebar.radio(
+    "Select Mode:",
+    ["🔍 Search & Solve", "🏗️ Setup Database", "🧪 Testing Dashboard"],
+    index=0
+)
+st.sidebar.markdown("---")
+# Show database stats
+try:
+    vector_count = get_vector_count(qdrant)
+    st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
+    storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
+    st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
+except:
+    st.sidebar.warning("Database not accessible")
 # ============================================================================
+# MODE 1: SEARCH & SOLVE (Main Interface)
 # ============================================================================
+if mode == "🔍 Search & Solve":
+    st.title("🔍 Math Problem Solver")
+    st.markdown("*Search your knowledge base and get detailed solutions*")
+    # ========================================================================
+    # INPUT: Problem Statement
+    # ========================================================================
+    st.header("📝 Input Problem")
+    input_method = st.radio(
+        "How to input:",
+        ["✍️ Type Question", "📄 Upload Exam PDF"],
+        horizontal=True
+    )
+    problem = None
+    if input_method == "✍️ Type Question":
+        problem = st.text_area(
+            "Enter math problem:",
+            placeholder="Example: Find the gradient of the loss function L(w) = (1/2)||Xw - y||²",
+            height=150
+        )
+    else:
+        uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
+        if uploaded_exam:
+            exam_text = extract_text_from_pdf(uploaded_exam)
+            if exam_text:
+                st.text_area("Extracted text:", exam_text[:1000], height=200)
+                problem = st.text_input("Extract specific question or use full text")
+    # ========================================================================
+    # SETTINGS
+    # ========================================================================
+    # Optional tuning controls, grouped inside an expander.
+    with st.expander("⚙️ Advanced Settings"):
+        source_options = ["Books", "Exams", "Handwritten Solutions", "Public Datasets"]
+        detail_options = ["Concise", "Standard", "Detailed", "Very Detailed"]
+        left_col, right_col = st.columns(2)
+        # Left column: which source categories the user wants searched.
+        with left_col:
+            search_filter = st.multiselect(
+                "Search in:",
+                source_options,
+                default=source_options[:3],
+            )
+        # Right column: how many hits to retrieve and how verbose the answer is.
+        with right_col:
+            top_k = st.slider("Retrieve top:", min_value=3, max_value=20, value=5)
+            detail_level = st.select_slider(
+                "Detail level:",
+                options=detail_options,
+                value=detail_options[2],
+            )
+    # ========================================================================
+    # SOLVE BUTTON
+    # ========================================================================
+    # Executes only on the rerun in which the button was clicked and a
+    # non-empty problem statement is available.
+    if st.button("🚀 SOLVE PROBLEM", type="primary") and problem:
+        # Imported here because only this branch needs the filter models.
+        from qdrant_client.models import FieldCondition, Filter, MatchAny
+        with st.spinner("🔍 Searching knowledge base..."):
+            # Generate query embedding
+            query_embedding = embedder.encode(problem)
+            # Map the "Search in:" labels to `source_type` payload values.
+            # NOTE(review): "answer_handwritten" is not written by the upload
+            # code visible here; presumably the external OCR pipeline sets it
+            # - confirm.
+            source_type_by_label = {
+                "Books": "book",
+                "Exams": "exam",
+                "Handwritten Solutions": "answer_handwritten",
+                "Public Datasets": "public_dataset",
+            }
+            filter_types = [
+                source_type
+                for label, source_type in source_type_by_label.items()
+                if label in search_filter
+            ]
+            # Fix: the selection used to be collected and then ignored, so the
+            # "Search in:" control had no effect. It is now sent to Qdrant as a
+            # payload filter. An empty selection means "no filter" (search all).
+            source_filter = None
+            if filter_types:
+                source_filter = Filter(
+                    must=[
+                        FieldCondition(
+                            key="source_type",
+                            match=MatchAny(any=filter_types),
+                        )
+                    ]
+                )
+            # Search Qdrant
             try:
+                results = qdrant.search(
                     collection_name=COLLECTION_NAME,
+                    query_vector=query_embedding.tolist(),
+                    query_filter=source_filter,
+                    limit=top_k
                 )
             except Exception as e:
+                st.error(f"Search failed: {e}")
+                results = []
+        if not results:
+            st.warning("No relevant context found. Try loading more data in Setup mode.")
+        else:
+            st.success(f"✅ Found {len(results)} relevant references!")
+            # Show retrieved context
+            with st.expander("📚 Retrieved References"):
+                for i, result in enumerate(results, 1):
+                    # Tolerate points stored without a payload or `content`.
+                    payload = result.payload or {}
+                    similarity = result.score * 100
+                    st.markdown(f"**Reference {i}** ({similarity:.1f}% relevant)")
+                    st.info(payload.get('content', '')[:300] + "...")
+                    st.caption(f"Source: {payload.get('source_name', 'Unknown')}")
+                    st.markdown("---")
+            # Generate solution with Claude
+            with st.spinner("🤖 Claude is generating solution..."):
+                # Prepare context: one labelled section per retrieved reference.
+                context = "\n\n".join(
+                    f"[Reference {i} from {(r.payload or {}).get('source_name', 'Unknown')}]:\n"
+                    f"{(r.payload or {}).get('content', '')}"
+                    for i, r in enumerate(results, 1)
+                )
+                # Determine detail level
+                detail_instructions = {
+                    "Concise": "Provide a brief solution focusing on key steps.",
+                    "Standard": "Provide a clear solution with main steps explained.",
+                    "Detailed": "Provide a comprehensive solution with detailed explanations.",
+                    "Very Detailed": "Provide an exhaustive solution with all intermediate steps, intuitions, and alternative approaches."
+                }
+                # Create prompt
+                prompt = f"""You are an expert mathematics tutor specializing in machine learning mathematics.
+PROBLEM TO SOLVE:
+{problem}
+REFERENCE MATERIALS (from student's books, exams, and notes):
+{context}
+TASK:
+Solve this problem providing a complete, educational solution.
+{detail_instructions[detail_level]}
+FORMAT YOUR RESPONSE EXACTLY LIKE THIS:
+## SOLUTION
+[Provide step-by-step solution here with clear mathematical notation]
+## REASONING & APPROACH
+[Explain WHY you chose this approach, what concepts are involved, and how the references helped]
+## REFERENCES USED
+[List which references you used and HOW each contributed to the solution. Be specific - mention what information came from which source]
+## VERIFICATION
+[If applicable, verify the solution or discuss how to check if it's correct]
+IMPORTANT:
+- Use proper mathematical notation (LaTeX if needed: ∫, ∑, ∂, etc.)
+- Reference the student's materials when explaining concepts
+- Make it educational - help them understand, not just get an answer"""
+                # Model id is named once so the request and usage report agree.
+                model_name = "claude-sonnet-4-20250514"
+                # Per-token USD rates used for the cost estimate below.
+                # NOTE(review): hard-coded; confirm against current pricing.
+                input_rate = 0.000003
+                output_rate = 0.000015
+                try:
+                    message = claude.messages.create(
+                        model=model_name,
+                        max_tokens=4000,
+                        messages=[{"role": "user", "content": prompt}]
+                    )
+                    solution = message.content[0].text
+                    # Display solution
+                    st.markdown("---")
+                    st.markdown(solution)
+                    # Download option (timestamp keeps file names distinct)
+                    st.download_button(
+                        "📥 Download Solution",
+                        solution,
+                        file_name=f"solution_{int(time.time())}.md",
+                        mime="text/markdown"
                     )
+                    # API usage
+                    with st.expander("📊 API Usage"):
+                        estimated_cost = (
+                            message.usage.input_tokens * input_rate
+                            + message.usage.output_tokens * output_rate
+                        )
+                        st.json({
+                            "model": model_name,
+                            "input_tokens": message.usage.input_tokens,
+                            "output_tokens": message.usage.output_tokens,
+                            "cost_estimate": f"${estimated_cost:.4f}"
+                        })
+                except Exception as e:
+                    # Any API or response-shape failure is shown to the user.
+                    st.error(f"Claude error: {e}")
 # ============================================================================
+# MODE 2: SETUP DATABASE (One-Time Processing)
 # ============================================================================
+elif mode == "🏗️ Setup Database":
+    st.title("🏗️ Database Setup")
+    st.markdown("*Process and upload your documents (run once)*")
+    # Up-front notice about where document processing is expected to happen.
+    limitation_notice = """
+    ⚠️ **IMPORTANT LIMITATION**:
+    Hugging Face Spaces cannot directly access Google Drive files.
+    **Recommended Solution:**
+    1. Use **Google Colab** for one-time processing (cloud, free)
+    2. Use **this HF Space** for daily searching/solving
+    **Alternative (Manual)**:
+    - Download PDFs from Google Drive
+    - Upload them here one by one
+    """
+    st.warning(limitation_notice)
+    # ========================================================================
+    # CREATE COLLECTION
+    # ========================================================================
+    st.header("Step 1: Create Database Collection")
+    try:
+        # Names of every collection currently known to the Qdrant instance.
+        known_names = {c.name for c in qdrant.get_collections().collections}
+        if COLLECTION_NAME in known_names:
+            st.success(f"✅ Collection '{COLLECTION_NAME}' exists")
+        elif st.button("🏗️ Create Collection"):
+            # 384-dimensional vectors compared by cosine distance.
+            vector_settings = VectorParams(size=384, distance=Distance.COSINE)
+            qdrant.create_collection(
+                collection_name=COLLECTION_NAME,
+                vectors_config=vector_settings,
+            )
+            st.success("✅ Created!")
+            # Rerun so the page reflects the newly created collection.
+            st.rerun()
+    except Exception as e:
+        st.error(f"Error: {e}")
+    st.markdown("---")
+    # ========================================================================
+    # UPLOAD OPTIONS
+    # ========================================================================
+    st.header("Step 2: Upload Documents")
+    tab1, tab2, tab3 = st.tabs(["📚 Upload PDFs", "📊 Load Public Datasets", "🖊️ Process Handwritten (Colab)"])
+    # Tab 1: extract -> chunk -> embed -> upsert, one uploaded PDF at a time.
+    with tab1:
+        st.info("Upload your books and typed exams here")
+        uploaded_files = st.file_uploader(
+            "Choose PDF files:",
+            type=['pdf'],
+            accept_multiple_files=True
+        )
+        doc_type = st.selectbox("Document type:", ["Book", "Exam", "Other"])
+        if uploaded_files and st.button("Process & Upload PDFs"):
+            for uploaded_file in uploaded_files:
+                # Each file reports progress in its own expander; a failure in
+                # one file does not stop the remaining files.
+                with st.expander(f"Processing {uploaded_file.name}"):
+                    try:
+                        # Extract
+                        text = extract_text_from_pdf(uploaded_file)
+                        if not text:
+                            st.error("Failed to extract text")
+                            continue
+                        st.write(f"✅ Extracted {len(text):,} chars")
+                        # Chunk
+                        chunks = chunk_text(text)
+                        # Fix: with no chunks there is nothing to embed, and an
+                        # empty batch would otherwise be sent to Qdrant.
+                        if not chunks:
+                            st.warning("No text chunks produced; nothing to upload")
+                            continue
+                        st.write(f"✅ Created {len(chunks)} chunks")
+                        # Embed
+                        embeddings = embedder.encode(chunks, show_progress_bar=False)
+                        # Upload. Point ids mix file name, chunk index and the
+                        # current time, reduced to a non-negative 63-bit int.
+                        # NOTE(review): ids are therefore not reproducible, so
+                        # re-uploading the same file adds new points instead
+                        # of overwriting the old ones.
+                        points = [
+                            PointStruct(
+                                id=abs(hash(f"{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
+                                vector=emb.tolist(),
+                                payload={
+                                    "content": chunk,
+                                    "source_name": uploaded_file.name,
+                                    "source_type": doc_type.lower(),
+                                    "chunk_index": i
+                                }
+                            )
+                            for i, (chunk, emb) in enumerate(zip(chunks, embeddings))
+                        ]
+                        qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+                        st.success(f"✅ Uploaded {len(points)} vectors!")
+                    except Exception as e:
+                        st.error(f"Error: {e}")
+    # Tab 2: pull a sample from a public dataset, embed it and upsert it.
+    with tab2:
+        st.info("Load pre-built math datasets")
+        dataset_choice = st.selectbox(
+            "Choose dataset:",
+            options=["GSM8K", "MATH", "MathQA"],
+        )
+        sample_size = st.slider("Number of samples:", min_value=10, max_value=1000, value=100)
+        if st.button("Load Dataset"):
+            try:
+                from datasets import load_dataset
+                # One recipe per dataset: positional load_dataset arguments
+                # plus a formatter that turns a row into the text to embed.
+                recipes = {
+                    "GSM8K": (
+                        ("openai/gsm8k", "main"),
+                        lambda row: f"Problem: {row['question']}\n\nSolution: {row['answer']}",
+                    ),
+                    "MATH": (
+                        ("lighteval/MATH",),
+                        lambda row: f"Problem: {row.get('problem', '')}\n\nSolution: {row.get('solution', '')}",
+                    ),
+                }
+                # Any other choice (i.e. MathQA) uses this recipe.
+                fallback_recipe = (
+                    ("allenai/math_qa",),
+                    lambda row: f"Problem: {row['Problem']}\n\nAnswer: {row['correct']}",
+                )
+                with st.spinner(f"Loading {dataset_choice}..."):
+                    load_args, render_row = recipes.get(dataset_choice, fallback_recipe)
+                    # NOTE(review): trust_remote_code=True allows the dataset
+                    # repository to run its own loading code; keep this only
+                    # for sources you trust.
+                    dataset = load_dataset(*load_args, split="train", trust_remote_code=True)
+                    # Never request more rows than the split contains.
+                    row_count = min(sample_size, len(dataset))
+                    texts = [render_row(dataset[i]) for i in range(row_count)]
+                    st.write(f"✅ Loaded {len(texts)} problems")
+                    # Embed & upload
+                    embeddings = embedder.encode(texts, show_progress_bar=True)
+                    points = [
+                        PointStruct(
+                            id=abs(hash(f"{dataset_choice}_{i}_{time.time()}")) % (2**63),
+                            vector=emb.tolist(),
+                            payload={
+                                # Stored text is capped at 2000 characters.
+                                "content": text[:2000],
+                                "source_name": dataset_choice,
+                                "source_type": "public_dataset",
+                                "index": i
+                            }
+                        )
+                        for i, (text, emb) in enumerate(zip(texts, embeddings))
+                    ]
+                    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+                    st.success(f"✅ Uploaded {len(points)} vectors!")
+                    st.balloons()
+            except Exception as e:
+                st.error(f"Error: {e}")
+    with tab3:  # Static guidance only: renders text and a link, no upload or processing code runs in this tab
+        st.warning("**Handwritten OCR requires Google Colab** (HF Spaces limitation)")
+        st.markdown("""
+        ### Why Colab for Handwritten Notes?
+        1. **File Access**: Need direct Google Drive access
+        2. **Processing Power**: OCR is compute-intensive
+        3. **Image Processing**: Requires additional libraries
+        ### Steps:
+        1. **Click button below** to open ready-to-use Colab notebook
+        2. **Run the notebook** (processes handwritten PDFs with AI OCR)
+        3. **Vectors auto-upload** to your Qdrant database
+        4. **Come back here** to search!
+        The notebook handles:
+        - ✅ Google Drive connection
+        - ✅ Italian cursive handwriting OCR (Claude Vision)
+        - ✅ Context from books/exams
+        - ✅ Direct upload to Qdrant
+        """)
+        colab_code_url = "https://colab.research.google.com/drive/your-notebook-id"  # NOTE(review): "your-notebook-id" looks like a placeholder - replace with the real notebook URL
+        st.link_button(
+            "📓 Open Google Colab Notebook",
+            colab_code_url,
+            use_container_width=True
+        )
+        st.info("""
+        **What the Colab notebook will do:**
+        - Connect to your Google Drive (one click)
+        - Read PDFs from Math_AI_Documents/answers/
+        - Use Claude Vision to OCR handwritten Italian cursive
+        - Upload directly to this same Qdrant database
+        - Takes ~30-60 minutes, costs ~$0.60
+        """)
 # ============================================================================
+# MODE 3: TESTING DASHBOARD
 # ============================================================================
+elif mode == "🧪 Testing Dashboard":
+    st.title("🧪 Testing Dashboard")
+    st.markdown("*Evaluate system performance*")
+    tab1, tab2, tab3 = st.tabs(["📊 Database Stats", "🎯 Accuracy Tests", "📈 Performance"])
+    # Tab 1: summary of what is stored, derived from one scroll page.
+    with tab1:
+        st.header("Database Statistics")
+        try:
+            # Get sample: scroll returns the page of points plus the offset of
+            # the next page (None when there are no further points).
+            sample_points, next_offset = qdrant.scroll(
+                collection_name=COLLECTION_NAME,
+                limit=1000,
+                with_payload=True,
+                with_vectors=False
+            )
+            if not sample_points:
+                # Fix: an empty collection used to render a blank tab.
+                st.info("No vectors found. Load data in Setup mode first.")
+            else:
+                # Count by type
+                types = {}
+                sources = set()
+                for point in sample_points:
+                    # Tolerate points that were stored without a payload.
+                    payload = point.payload or {}
+                    src_type = payload.get('source_type', 'unknown')
+                    types[src_type] = types.get(src_type, 0) + 1
+                    sources.add(payload.get('source_name', 'Unknown'))
+                # Display
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.metric("Total Vectors", get_vector_count(qdrant))
+                with col2:
+                    st.metric("Unique Sources", len(sources))
+                with col3:
+                    st.metric("Document Types", len(types))
+                if next_offset is not None:
+                    # Fix: make it explicit that only the first page was
+                    # inspected, so the source/type figures are partial.
+                    st.caption("Sources and type breakdown are based on the first 1,000 vectors only.")
+                # Breakdown
+                st.subheader("Breakdown by Type")
+                # Hoisted: the denominator is the same for every bar.
+                sampled_total = sum(types.values())
+                for type_name, count in sorted(types.items()):
+                    st.progress(count / sampled_total, text=f"{type_name}: {count}")
+                # Sources (at most 20 listed)
+                st.subheader("Sources")
+                for src in sorted(sources)[:20]:
+                    st.caption(f"• {src}")
+        except Exception as e:
+            st.error(f"Error: {e}")
+    # Tab 2: ad-hoc similarity search to eyeball retrieval quality.
+    with tab2:
+        st.header("Test Search Accuracy")
+        test_query = st.text_input("Test query:", placeholder="gradient descent")
+        if st.button("Run Test Search") and test_query:
+            query_emb = embedder.encode(test_query)
+            # Fix: a failed search used to surface as an unhandled exception;
+            # report it the same way the Search & Solve mode does.
+            try:
+                results = qdrant.search(
+                    collection_name=COLLECTION_NAME,
+                    query_vector=query_emb.tolist(),
+                    limit=5
+                )
+            except Exception as e:
+                st.error(f"Search failed: {e}")
+                results = []
+            st.write(f"**Found {len(results)} results:**")
+            for i, r in enumerate(results, 1):
+                # Tolerate points stored without a payload or `content`.
+                payload = r.payload or {}
+                similarity = r.score * 100
+                # Colour bands: above 70 excellent, above 50 good, else fair.
+                if similarity > 70:
+                    quality = "🟢 Excellent"
+                elif similarity > 50:
+                    quality = "🟡 Good"
+                else:
+                    quality = "🔴 Fair"
+                st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
+                st.text(payload.get('content', '')[:200] + "...")
+                st.caption(f"Source: {payload.get('source_name')}")
+                st.markdown("---")
+    # Tab 3: placeholder, no metrics are computed yet.
+    with tab3:
+        st.header("Performance Metrics")
+        st.info("Coming soon: Response time, token usage, cost tracking")
+# ============================================================================
+# FOOTER
+# ============================================================================
+# Sidebar footer: a divider followed by version and attribution captions.
+st.sidebar.markdown("---")
+for footer_line in ("🎓 Math AI System v1.0", "Powered by Claude + Qdrant"):
+    st.sidebar.caption(footer_line)