Update src/streamlit_app.py

src/streamlit_app.py · CHANGED (+534 −404)
@@ -2,10 +2,11 @@ import streamlit as st
 import os
 import time
 import base64
 from io import BytesIO
 from PIL import Image
 import PyPDF2
-from pdf2image import …
 from anthropic import Anthropic
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
@@ -13,53 +14,103 @@ from sentence_transformers import SentenceTransformer
 from huggingface_hub import hf_hub_download, list_repo_files
 
 # ============================================================================
-# MATH AI SYSTEM - …
 # ============================================================================
 
 st.set_page_config(
-    page_title="Math AI System",
     page_icon="📐",
     layout="wide"
 )
 
 COLLECTION_NAME = "math_knowledge_base"
 
-# …
-…
 
 # ============================================================================
 # CACHED RESOURCES
 # ============================================================================
 
 @st.cache_resource
-def …
-    """Initialize …
-    …
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY")
     )
-…
-…
-…
 
 # ============================================================================
-# …
 # ============================================================================
 
 def list_dataset_files(folder_path):
-    """List …
     try:
-        # Get HF token from environment
         hf_token = os.getenv("HF_TOKEN")
-
-        # List all files in the dataset
         all_files = list_repo_files(
             repo_id=DATASET_REPO,
             repo_type="dataset",
             token=hf_token
         )
 
-        # Filter for PDFs in specific folder
         pdf_files = [
             f for f in all_files
             if f.startswith(folder_path) and f.endswith('.pdf')
@@ -71,12 +122,11 @@ def list_dataset_files(folder_path):
         st.error(f"Error listing files: {e}")
         return []
 
-def download_file_from_dataset(file_path):
-    """Download …
     try:
         hf_token = os.getenv("HF_TOKEN")
 
-        # Download file
         local_path = hf_hub_download(
             repo_id=DATASET_REPO,
             filename=file_path,
@@ -87,43 +137,39 @@ def download_file_from_dataset(file_path):
         return local_path
 
     except Exception as e:
-        st.error(f"…
         return None
 
-# ============================================================================
-# PROCESSING FUNCTIONS
-# ============================================================================
-
 def extract_text_from_pdf(pdf_path):
-    """Extract text from PDF …
     try:
         with open(pdf_path, 'rb') as file:
-            …
             text = ""
-            for page_num, page in enumerate(…
                 text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
             return text
     except Exception as e:
-        st.error(f"…
         return None
 
 def pdf_to_images(pdf_path):
-    """Convert PDF to images"""
     try:
-        from pdf2image import convert_from_path
         images = convert_from_path(pdf_path, dpi=200)
         return images
     except Exception as e:
-        st.error(f"…
         return []
 
 def resize_image(image, max_size=(2048, 2048)):
-    """Resize for Claude"""
     image.thumbnail(max_size, Image.Resampling.LANCZOS)
     return image
 
 def image_to_base64(image):
-    """Convert to base64"""
     buffered = BytesIO()
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
@@ -134,7 +180,7 @@ def ocr_with_claude(claude_client, image, context=""):
     resized = resize_image(image.copy())
     img_b64 = image_to_base64(resized)
 
-    prompt = f"""Transcribe …
 
 STYLE: Italian cursive (connected letters)
 LANGUAGE: English
@@ -145,9 +191,9 @@ INSTRUCTIONS:
 1. Transcribe in English
 2. Use proper math notation: ∫, ∂, ∑, √, etc.
 3. Maintain structure
-4. Mark unclear …
 
-OUTPUT: Just the transcription."""
 
     try:
         message = claude_client.messages.create(
@@ -167,10 +213,11 @@ OUTPUT: Just the transcription."""
         return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
 
     except Exception as e:
         return None, 0
 
 def chunk_text(text, chunk_size=150, overlap=30):
-    """Split into chunks"""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
@@ -180,7 +227,7 @@ def chunk_text(text, chunk_size=150, overlap=30):
     return chunks
 
 def get_vector_count(qdrant):
-    """Get total vectors"""
     try:
         count = 0
         offset = None
@@ -203,15 +250,16 @@ def get_vector_count(qdrant):
     return 0
 
 # ============================================================================
-# INITIALIZE …
 # ============================================================================
 
 try:
-    qdrant …
     st.sidebar.success("✅ System Ready")
 except Exception as e:
-    st.error(f"❌ …
-    st.info("Add these …
     st.stop()
 
 # ============================================================================
@@ -219,437 +267,519 @@ except Exception as e:
 # ============================================================================
 
 st.sidebar.title("📐 Math AI System")
-
-mode = st.sidebar.radio(
-    "Mode:",
-    ["🔍 Search & Solve", "🗂️ Process Dataset Files", "📊 Stats"],
-    index=0
-)
-
-st.sidebar.markdown("---")
 
 try:
     vector_count = get_vector_count(qdrant)
-    st.sidebar.metric("Vectors", f"{vector_count:,}")
 except:
-    …
 
 # ============================================================================
-# …
 # ============================================================================
 
-…
 
-st.title("…
 
-…
 
-        results = qdrant.search(
-            collection_name=COLLECTION_NAME,
-            query_vector=query_emb.tolist(),
-            limit=top_k
-        )
-    except:
-        results = []
-
-    if not results:
-        st.warning("No context found. Process your files in 'Process Dataset Files' mode.")
-    else:
-        st.success(f"Found {len(results)} references!")
-
-        with st.expander("References"):
-            for i, r in enumerate(results, 1):
-                st.markdown(f"**{i}.** {r.payload['content'][:200]}...")
-                st.caption(f"Source: {r.payload.get('source_name')}")
 
-…
-        ## REFERENCES USED
-        [Which sources helped]"""
-
-        try:
-            message = claude.messages.create(
-                model="claude-sonnet-4-20250514",
-                max_tokens=4000,
-                messages=[{"role": "user", "content": prompt}]
-            )
-
-            st.markdown("---")
-            st.markdown(message.content[0].text)
-
-…
     )
-
-    st.…
-…
-
-elif mode == "🗂️ Process Dataset Files":
-
-…
-    st.…
-    **Dataset:** `{DATASET_REPO}`
-…
-    """)
-
-    if not os.getenv("HF_TOKEN"):
-        st.error("⚠️ Missing HF_TOKEN! Add it in Settings → Repository Secrets")
-        st.info("""
-        **How to get your HF Token:**
-        1. Go to: https://huggingface.co/settings/tokens
-        2. Click "New token"
-        3. Name: "math-ai-access"
-        4. Type: Read
-        5. Copy the token
-        6. Add as HF_TOKEN in Space Settings → Secrets
-        """)
-        st.stop()
-
-…
-    if …
-…
-    st.…
 
     st.markdown("---")
 
-    # …
-    st.header("Step …
-
-…
 
     # ========================================================================
-    # …
     # ========================================================================
 
-    with …
-        st.subheader("Process Books (Typed PDFs)")
 
-…
 
-        for …
-…
 
-        for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
-            points.append(PointStruct(
-                id=abs(hash(f"{book_file}_{i}_{time.time()}")) % (2**63),
-                vector=emb.tolist(),
-                payload={
-                    "content": chunk,
-                    "source_name": book_file.split('/')[-1],
-                    "source_type": "book",
-                    "chunk_index": i
-                }
-            ))
 
-…
 
     # ========================================================================
-    # …
     # ========================================================================
 
-    with …
-        st.subheader("Process Exams (Typed PDFs)")
 
-        exam_files = list_dataset_files("exams/")
-
-        if exam_files:
-            st.write(f"Found {len(exam_files)} exams:")
-            for f in exam_files:
-                st.text(f"• {f}")
-
-            st.session_state.exam_files = exam_files
-        else:
-            st.warning("No exams found")
 
-…
 
         points = []
-        for i, (…
             points.append(PointStruct(
-                id=abs(hash(f"{…
                 vector=emb.tolist(),
                 payload={
-                    "content": …
-                    "source_name": …
-                    "source_type": "…
-                    "…
                 }
             ))
 
         qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
         st.success(f"✅ Uploaded {len(points)} vectors!")
-
-        except Exception as e:
-            st.error(f"Error: {e}")
-
-    # ========================================================================
-    # HANDWRITTEN ANSWERS (AI OCR)
-    # ========================================================================
-
-    with tab3:
-        st.subheader("Process Handwritten Answers (AI OCR)")
-
-        st.warning("⚠️ This uses Claude Vision - costs ~$0.05-0.10 per PDF page")
-
-        if st.button("🗂️ List Answer Files"):
-            answer_files = list_dataset_files("answers/")
-
-            if answer_files:
-                st.write(f"Found {len(answer_files)} answer files:")
-                for f in answer_files:
-                    st.text(f"• {f}")
-
-…
-                st.warning("No answers found")
-
-        if 'answer_files' in st.session_state:
-
-            # Get context from books if available
-            context_books = ""
-            try:
-                book_samples = qdrant.scroll(
-                    collection_name=COLLECTION_NAME,
-                    limit=5,
-                    with_payload=True,
-                    with_vectors=False,
-                    scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
-                )
-
-                if book_samples and book_samples[0]:
-                    context_books = "\n".join([p.payload['content'] for p in book_samples[0]])
-                    st.info("✅ Using book context for better OCR")
-            except:
-                st.caption("No books processed yet - OCR will work but may be less accurate")
-
-            if st.button("🤖 PROCESS WITH AI OCR", type="primary"):
-
-                total_tokens = 0
-
-                for answer_file in st.session_state.answer_files:
-
-                    with st.expander(f"Processing {answer_file}"):
-
-                        try:
-                            # Download
-                            local_path = download_file_from_dataset(answer_file)
-
-                            # Convert to images
-                            st.write("🖼️ Converting to images...")
-                            images = pdf_to_images(local_path)
-
-                            if not images:
-                                continue
-
-                            st.write(f"✅ {len(images)} pages")
-
-                            # OCR each page
-                            transcribed_pages = []
-                            page_tokens = 0
-
-                            for page_num, image in enumerate(images, 1):
-                                st.write(f"🤖 OCR Page {page_num}/{len(images)}...")
-
-                                transcription, tokens = ocr_with_claude(
-                                    claude,
-                                    image,
-                                    context=context_books
-                                )
-
-                                if transcription:
-                                    transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
-                                    page_tokens += tokens
-
-                            if not transcribed_pages:
-                                st.error("OCR failed")
-                                continue
-
-                            full_text = "\n\n".join(transcribed_pages)
-                            st.success(f"✅ Transcribed {len(full_text):,} chars")
-                            st.info(f"Tokens: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
-                            total_tokens += page_tokens
-
-                            # Chunk
-                            chunks = chunk_text(full_text)
-                            embeddings = embedder.encode(chunks, show_progress_bar=False)
-
-                            # Upload
-                            points = []
-                            for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
-                                points.append(PointStruct(
-                                    id=abs(hash(f"{answer_file}_{i}_{time.time()}")) % (2**63),
-                                    vector=emb.tolist(),
-                                    payload={
-                                        "content": chunk,
-                                        "source_name": answer_file.split('/')[-1],
-                                        "source_type": "answer_handwritten",
-                                        "chunk_index": i,
-                                        "ocr_tokens": page_tokens
-                                    }
-                                ))
-
-                            qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
-                            st.success(f"✅ Uploaded {len(points)} vectors!")
-
-                        except Exception as e:
-                            st.error(f"Error: {e}")
-
-                st.success(f"Total tokens: {total_tokens:,} | Cost: ${total_tokens * 0.000003:.2f}")
 
 # ============================================================================
-# …
 # ============================================================================
 
-…
 
-st.title("…
 
-…
     )
 
-…
-            types[src_type] = types.get(src_type, 0) + 1
-            sources.add(point.payload.get('source_name', 'Unknown'))
-
-    col1, col2 = st.columns(2)
-
-    with col1:
-        st.metric("Total Vectors", get_vector_count(qdrant))
 
-…
 
-    st.…
-…
 
-    st.…
-…
-    except Exception as e:
-        st.error(f"Error: {e}")
-
-st.sidebar.caption("📐 Math AI v1.0")
 
 import os
 import time
 import base64
+import hashlib
 from io import BytesIO
 from PIL import Image
 import PyPDF2
+from pdf2image import convert_from_path
 from anthropic import Anthropic
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
 
 from huggingface_hub import hf_hub_download, list_repo_files
 
 # ============================================================================
+# PRODUCTION MATH AI SYSTEM - SMART PROCESSING
 # ============================================================================
 
 st.set_page_config(
+    page_title="Math AI System - Production",
     page_icon="📐",
     layout="wide"
 )
 
 COLLECTION_NAME = "math_knowledge_base"
+DATASET_REPO = "yourusername/math-ai-documents"  # ← CHANGE THIS!
 
+# ============================================================================
+# AVAILABLE EMBEDDING MODELS
+# ============================================================================
+
+EMBEDDING_MODELS = {
+    "MiniLM-L6 (Fast, 384D)": {
+        "name": "sentence-transformers/all-MiniLM-L6-v2",
+        "dimensions": 384,
+        "speed": "Fast",
+        "quality": "Good"
+    },
+    "MiniLM-L12 (Balanced, 384D)": {
+        "name": "sentence-transformers/all-MiniLM-L12-v2",
+        "dimensions": 384,
+        "speed": "Medium",
+        "quality": "Better"
+    },
+    "MPNet (Best Quality, 768D)": {
+        "name": "sentence-transformers/all-mpnet-base-v2",
+        "dimensions": 768,
+        "speed": "Slower",
+        "quality": "Excellent"
+    }
+}
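
Worth noting alongside this table: Qdrant fixes the vector size when the collection is created, so switching to a model with a different dimension silently breaks upserts and searches. A minimal guard sketch (the helper name is ours, not part of this commit; it assumes the collection uses a single unnamed vector config):

```python
# Hypothetical guard: compare the chosen model's dimension against the
# live collection config before embedding anything.
def assert_dimensions_match(qdrant, model_key):
    expected = EMBEDDING_MODELS[model_key]["dimensions"]
    info = qdrant.get_collection(COLLECTION_NAME)
    actual = info.config.params.vectors.size  # single unnamed vector config
    if actual != expected:
        raise ValueError(
            f"Collection stores {actual}-D vectors but '{model_key}' emits {expected}-D"
        )
```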
 
 # ============================================================================
 # CACHED RESOURCES
 # ============================================================================
 
 @st.cache_resource
+def get_qdrant_client():
+    """Initialize Qdrant client"""
+    return QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY")
     )
+
+@st.cache_resource
+def get_claude_client():
+    """Initialize Claude client"""
+    return Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+@st.cache_resource
+def get_embedding_model(model_name):
+    """Load embedding model (cached per model)"""
+    return SentenceTransformer(model_name)
 
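
`@st.cache_resource` memoizes per argument value, so `get_embedding_model` keeps one loaded `SentenceTransformer` per model name across reruns; switching models in the UI loads the new one once while the old stays warm. A quick illustration of the assumed cache behavior (not code from this commit):

```python
m1 = get_embedding_model("sentence-transformers/all-MiniLM-L6-v2")
m2 = get_embedding_model("sentence-transformers/all-MiniLM-L6-v2")
assert m1 is m2  # same shared instance on every rerun of the script

m3 = get_embedding_model("sentence-transformers/all-mpnet-base-v2")  # separate cache entry
```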
 # ============================================================================
+# HELPER FUNCTIONS
 # ============================================================================
 
+def get_file_hash(file_path):
+    """Generate unique hash for file to track if already processed"""
+    return hashlib.md5(file_path.encode()).hexdigest()
+
+def check_if_processed(qdrant, file_name):
+    """Check if file already processed in Qdrant"""
+    try:
+        results = qdrant.scroll(
+            collection_name=COLLECTION_NAME,
+            scroll_filter={
+                "must": [
+                    {"key": "source_name", "match": {"value": file_name}}
+                ]
+            },
+            limit=1,
+            with_payload=True,
+            with_vectors=False
+        )
+
+        return len(results[0]) > 0 if results and results[0] else False
+
+    except:
+        return False
+
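
The raw-dict `scroll_filter` above works, but `qdrant_client` also ships typed filter models that catch key typos at construction time. An equivalent sketch using those models (same semantics, our variant rather than the committed code):

```python
from qdrant_client.models import Filter, FieldCondition, MatchValue

def check_if_processed_typed(qdrant, file_name):
    # scroll() returns (records, next_page_offset); one hit is enough.
    points, _next_offset = qdrant.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=Filter(
            must=[FieldCondition(key="source_name", match=MatchValue(value=file_name))]
        ),
        limit=1,
        with_payload=False,  # an existence check doesn't need payloads
        with_vectors=False,
    )
    return bool(points)
```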
 def list_dataset_files(folder_path):
+    """List PDF files in HF Dataset folder"""
     try:
         hf_token = os.getenv("HF_TOKEN")
         all_files = list_repo_files(
             repo_id=DATASET_REPO,
             repo_type="dataset",
             token=hf_token
         )
 
         pdf_files = [
             f for f in all_files
             if f.startswith(folder_path) and f.endswith('.pdf')
 …
         st.error(f"Error listing files: {e}")
         return []
 
+def download_from_dataset(file_path):
+    """Download file from HF Dataset"""
     try:
         hf_token = os.getenv("HF_TOKEN")
 
         local_path = hf_hub_download(
             repo_id=DATASET_REPO,
             filename=file_path,
 …
         return local_path
 
     except Exception as e:
+        st.error(f"Download error: {e}")
         return None
 
 def extract_text_from_pdf(pdf_path):
+    """Extract text from typed PDF"""
     try:
         with open(pdf_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
             text = ""
+            for page_num, page in enumerate(reader.pages):
                 text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
             return text
     except Exception as e:
+        st.error(f"Text extraction error: {e}")
         return None
 
 def pdf_to_images(pdf_path):
+    """Convert PDF to images for OCR"""
     try:
         images = convert_from_path(pdf_path, dpi=200)
         return images
     except Exception as e:
+        st.error(f"PDF to image error: {e}")
+        st.info("💡 This requires poppler-utils. Add 'poppler-utils' to packages.txt file in your Space")
         return []
 
 def resize_image(image, max_size=(2048, 2048)):
+    """Resize image for Claude Vision"""
     image.thumbnail(max_size, Image.Resampling.LANCZOS)
     return image
 
 def image_to_base64(image):
+    """Convert PIL Image to base64"""
     buffered = BytesIO()
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
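
The `st.info` hint above refers to the apt-dependency file of HF Spaces. For a Streamlit Space, a `packages.txt` at the repo root with a single line is enough for `pdf2image` to find poppler:

```
poppler-utils
```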
 …
     resized = resize_image(image.copy())
     img_b64 = image_to_base64(resized)
 
+    prompt = f"""Transcribe handwritten math solution.
 
 STYLE: Italian cursive (connected letters)
 LANGUAGE: English
 …
 1. Transcribe in English
 2. Use proper math notation: ∫, ∂, ∑, √, etc.
 3. Maintain structure
+4. Mark unclear: [unclear: guess]
 
+OUTPUT: Transcription only."""
 
     try:
         message = claude_client.messages.create(
 …
         return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
 
     except Exception as e:
+        st.error(f"OCR error: {e}")
         return None, 0
 
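
The diff collapses the body of `messages.create` (new lines 199-212). A sketch of what that call plausibly looks like, using Anthropic's documented base64 image block; the model name and `max_tokens` are copied from the removed solve code, so treat them as assumptions here:

```python
message = claude_client.messages.create(
    model="claude-sonnet-4-20250514",   # taken from the old version's solve call
    max_tokens=4000,                    # assumption
    messages=[{
        "role": "user",
        "content": [
            # Image first, then the transcription instructions.
            {"type": "image",
             "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
            {"type": "text", "text": prompt},
        ],
    }],
)
```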
 def chunk_text(text, chunk_size=150, overlap=30):
+    """Split text into chunks"""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
 …
     return chunks
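
The loop body is hidden in the diff (new lines 224-226). A completion consistent with the signature: each chunk takes `chunk_size` words and the window advances by `chunk_size - overlap`, so with the defaults (150/30) a 600-word text produces chunks starting at words 0, 120, 240, 360 and 480. Our reconstruction, not the committed code:

```python
def chunk_text(text, chunk_size=150, overlap=30):
    """Split text into overlapping word-window chunks."""
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        if chunk:  # skip the empty tail window
            chunks.append(chunk)
    return chunks
```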
 
 def get_vector_count(qdrant):
+    """Get total vectors in database"""
     try:
         count = 0
         offset = None
 …
     return 0
 
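
`get_vector_count` pages through the collection with `scroll` (body collapsed in the diff). Qdrant can answer this server-side; a hedged alternative:

```python
def get_vector_count_fast(qdrant):
    """Server-side exact count instead of paging through every point."""
    try:
        return qdrant.count(collection_name=COLLECTION_NAME, exact=True).count
    except Exception:
        return 0
```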
 # ============================================================================
+# INITIALIZE CLIENTS
 # ============================================================================
 
 try:
+    qdrant = get_qdrant_client()
+    claude = get_claude_client()
     st.sidebar.success("✅ System Ready")
 except Exception as e:
+    st.error(f"❌ Initialization failed: {e}")
+    st.info("Add these secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
     st.stop()
 
 # ============================================================================
 …
 # ============================================================================
 
 st.sidebar.title("📐 Math AI System")
+st.sidebar.caption("Production Version")
 
 try:
     vector_count = get_vector_count(qdrant)
+    st.sidebar.metric("Total Vectors", f"{vector_count:,}")
+
+    storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
+    st.sidebar.metric("Storage", f"{storage_mb:.1f} MB")
 except:
+    st.sidebar.warning("Database unavailable")
+
+st.sidebar.markdown("---")
 
 # ============================================================================
+# MAIN TABS (Reordered as requested)
 # ============================================================================
 
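
One caveat in the sidebar metric above: `384` is hardcoded, so the storage figure understates usage once the 768-D MPNet model is active. A sketch that reads the dimension from the live collection instead (our tweak, assuming a single unnamed vector config):

```python
dims = qdrant.get_collection(COLLECTION_NAME).config.params.vectors.size
storage_mb = (vector_count * dims * 4) / (1024 * 1024)  # float32 = 4 bytes per component
st.sidebar.metric("Storage", f"{storage_mb:.1f} MB")
```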
+tab1, tab2, tab3 = st.tabs([
+    "📁 Dataset Manager",
+    "🔍 Search & Solve",
+    "📊 Statistics"
+])
+
+# ============================================================================
+# TAB 1: DATASET MANAGER (Primary Interface)
+# ============================================================================
+
+with tab1:
 
+    st.title("📁 Dataset Manager")
+    st.markdown("*Manage all your data sources in one place*")
 
+    # Check HF Token
+    if not os.getenv("HF_TOKEN"):
+        st.error("⚠️ Missing HF_TOKEN in secrets!")
+        st.info("Add it in Settings → Repository Secrets")
+        st.stop()
 
+    # Collection setup
+    st.header("🗄️ Step 1: Database Setup")
 
+    col1, col2 = st.columns([2, 1])
+
+    with col1:
+        try:
+            collections = qdrant.get_collections().collections
+            exists = any(c.name == COLLECTION_NAME for c in collections)
 
+            if exists:
+                st.success(f"✅ Collection '{COLLECTION_NAME}' exists")
+            else:
+                st.warning(f"Collection '{COLLECTION_NAME}' doesn't exist")
 
+                # Show embedding model choice for initial creation
+                st.subheader("Choose Embedding Model")
 
+                for model_name, specs in EMBEDDING_MODELS.items():
+                    with st.expander(f"{model_name} - {specs['quality']} quality, {specs['speed']} speed"):
+                        st.write(f"**Dimensions:** {specs['dimensions']}")
+                        st.write(f"**Model:** `{specs['name']}`")
+
+                selected_model_key = st.selectbox(
+                    "Select embedding model:",
+                    list(EMBEDDING_MODELS.keys())
+                )
+
+                if st.button("🗄️ Create Collection", type="primary"):
+                    dimensions = EMBEDDING_MODELS[selected_model_key]["dimensions"]
 
+                    qdrant.create_collection(
+                        collection_name=COLLECTION_NAME,
+                        vectors_config=VectorParams(
+                            size=dimensions,
+                            distance=Distance.COSINE
+                        )
                     )
+
+                    st.success(f"✅ Created with {dimensions}D vectors!")
+                    st.session_state.embedding_model = EMBEDDING_MODELS[selected_model_key]["name"]
+                    st.rerun()
+
+        except Exception as e:
+            st.error(f"Error: {e}")
 
+    with col2:
+        st.info(f"""
+        **Dataset:**
+        `{DATASET_REPO}`
+
+        **Collection:**
+        `{COLLECTION_NAME}`
+        """)
 
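
Recent `qdrant_client` versions expose a direct existence check that would shorten the `get_collections()` scan above; a sketch (verify your client version supports it before relying on it):

```python
# Requires a reasonably recent qdrant-client; otherwise fall back to
# scanning get_collections() as the committed code does.
if not qdrant.collection_exists(COLLECTION_NAME):
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )
```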
+    st.markdown("---")
 
+    # Processing options
+    st.header("⚙️ Step 2: Processing Configuration")
 
+    col1, col2, col3 = st.columns(3)
 
+    with col1:
+        st.subheader("Chunking Strategy")
+        chunk_size = st.slider("Chunk size (words):", 50, 500, 150)
+        chunk_overlap = st.slider("Overlap (words):", 0, 100, 30)
+        st.caption(f"Overlap: {(chunk_overlap/chunk_size*100):.0f}%")
 
+    with col2:
+        st.subheader("Embedding Model")
+        # Get current model from collection or use default
+        current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
 
+        # Find which key this model belongs to
+        current_model_key = "MiniLM-L6 (Fast, 384D)"
+        for key, specs in EMBEDDING_MODELS.items():
+            if specs["name"] == current_model:
+                current_model_key = key
+                break
+
+        st.info(f"**Active:** {current_model_key}")
+        st.caption(f"Model: `{current_model}`")
+
+    with col3:
+        st.subheader("OCR Settings")
+        use_context_for_ocr = st.checkbox("Use book context", value=True, help="Better accuracy, higher cost")
+        st.caption("Context helps Claude understand symbols")
 
     st.markdown("---")
 
+    # Data sources
+    st.header("📚 Step 3: Data Sources")
 
+    source_tabs = st.tabs([
+        "📁 Your Dataset Files",
+        "🌐 Public Datasets (GSM8K, MATH, etc.)"
+    ])
 
     # ========================================================================
+    # SOURCE 1: HF Dataset Files
     # ========================================================================
 
+    with source_tabs[0]:
 
+        st.subheader("Files from Your HF Dataset")
+
+        folder_type = st.radio(
+            "Select folder:",
+            ["📚 Books (Typed PDFs)", "📝 Exams (Typed PDFs)", "🖊️ Answers (Handwritten - needs OCR)"],
+            horizontal=True
+        )
+
+        # Determine folder path
+        if "Books" in folder_type:
+            folder_path = "books/"
+            doc_type = "book"
+        elif "Exams" in folder_type:
+            folder_path = "exams/"
+            doc_type = "exam"
+        else:
+            folder_path = "answers/"
+            doc_type = "answer_handwritten"
+
+        # List files
+        if st.button(f"🔍 Scan {folder_path} folder"):
+            with st.spinner("Scanning dataset..."):
+                files = list_dataset_files(folder_path)
 
+                if files:
+                    # Check processing status for each file
+                    file_status = []
+                    for file in files:
+                        file_name = file.split('/')[-1]
+                        is_processed = check_if_processed(qdrant, file_name)
+                        file_status.append({
+                            "file": file,
+                            "name": file_name,
+                            "processed": is_processed
+                        })
+
+                    st.session_state.current_files = file_status
+                    st.session_state.current_folder = folder_path
+                    st.session_state.current_doc_type = doc_type
+                else:
+                    st.warning(f"No files found in {folder_path}")
 
+        # Display files with status
+        if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
+
+            st.write(f"**Found {len(st.session_state.current_files)} files:**")
+
+            # Summary
+            processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
+            pending_count = len(st.session_state.current_files) - processed_count
+
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Total", len(st.session_state.current_files))
+            with col2:
+                st.metric("✅ Processed", processed_count)
+            with col3:
+                st.metric("⏳ Pending", pending_count)
+
+            # File list with checkboxes
+            st.subheader("Select files to process:")
+
+            selected_files = []
 
+            for file_info in st.session_state.current_files:
+                col1, col2 = st.columns([3, 1])
+
+                with col1:
+                    # Only allow selection if not processed
+                    if file_info['processed']:
+                        st.checkbox(
+                            f"✅ {file_info['name']} (Already processed)",
+                            value=False,
+                            disabled=True,
+                            key=f"file_{file_info['name']}"
+                        )
+                    else:
+                        if st.checkbox(
+                            f"⏳ {file_info['name']}",
+                            value=True,  # Auto-select pending files
+                            key=f"file_{file_info['name']}"
+                        ):
+                            selected_files.append(file_info)
+
+                with col2:
+                    if file_info['processed']:
+                        st.caption("Skip")
+                    else:
+                        st.caption("Ready")
+
|
| 502 |
+
# Process button
|
| 503 |
+
if selected_files:
|
| 504 |
+
|
| 505 |
+
st.markdown("---")
|
| 506 |
+
st.write(f"**Ready to process {len(selected_files)} file(s)**")
|
| 507 |
|
| 508 |
+
# Show cost estimate for OCR
|
| 509 |
+
if doc_type == "answer_handwritten":
|
| 510 |
+
est_pages = len(selected_files) * 5 # Assume 5 pages per PDF
|
| 511 |
+
est_cost = est_pages * 0.08
|
| 512 |
+
st.warning(f"β οΈ OCR Cost Estimate: ~${est_cost:.2f} ({est_pages} pages Γ ~$0.08/page)")
|
| 513 |
+
|
| 514 |
+
if st.button(f"π PROCESS SELECTED FILES", type="primary"):
|
| 515 |
|
| 516 |
+
# Load embedding model
|
| 517 |
+
embedder = get_embedding_model(current_model)
|
| 518 |
+
|
| 519 |
+
# Get context if needed
|
| 520 |
+
context_books = ""
|
| 521 |
+
if doc_type == "answer_handwritten" and use_context_for_ocr:
|
| 522 |
+
try:
|
| 523 |
+
book_samples = qdrant.scroll(
|
| 524 |
+
collection_name=COLLECTION_NAME,
|
| 525 |
+
limit=10,
|
| 526 |
+
with_payload=True,
|
| 527 |
+
with_vectors=False,
|
| 528 |
+
scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
|
| 529 |
+
)
|
| 530 |
+
|
| 531 |
+
if book_samples and book_samples[0]:
|
| 532 |
+
context_books = "\n".join([p.payload['content'] for p in book_samples[0][:5]])
|
| 533 |
+
st.info("β
Using book context for OCR")
|
| 534 |
+
except:
|
| 535 |
+
st.caption("No books in database - OCR will work but may be less accurate")
|
| 536 |
+
|
| 537 |
+
# Process each selected file
|
| 538 |
+
total_tokens = 0
|
| 539 |
+
total_vectors = 0
|
| 540 |
+
|
| 541 |
+
for file_info in selected_files:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
 
+                        with st.expander(f"Processing {file_info['name']}", expanded=True):
+
+                            try:
+                                # Download
+                                st.write("📥 Downloading...")
+                                local_path = download_from_dataset(file_info['file'])
+
+                                if not local_path:
+                                    st.error("Download failed")
+                                    continue
+
+                                # Extract or OCR
+                                if doc_type == "answer_handwritten":
+                                    # OCR path
+                                    st.write("🖼️ Converting to images...")
+                                    images = pdf_to_images(local_path)
+
+                                    if not images:
+                                        st.error("Conversion failed - poppler-utils not installed?")
+                                        continue
+
+                                    st.write(f"✅ {len(images)} pages")
+
+                                    # OCR each page
+                                    transcribed_pages = []
+                                    page_tokens = 0
+
+                                    for page_num, image in enumerate(images, 1):
+                                        st.write(f"🤖 OCR page {page_num}/{len(images)}...")
+
+                                        transcription, tokens = ocr_with_claude(
+                                            claude,
+                                            image,
+                                            context=context_books
+                                        )
+
+                                        if transcription:
+                                            transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
+                                            page_tokens += tokens
+
+                                    if not transcribed_pages:
+                                        st.error("OCR failed")
+                                        continue
+
+                                    text = "\n\n".join(transcribed_pages)
+                                    total_tokens += page_tokens
+
+                                    st.success(f"✅ Transcribed {len(text):,} chars (${page_tokens * 0.000003:.3f})")
+
+                                else:
+                                    # Text extraction
+                                    st.write("📄 Extracting text...")
+                                    text = extract_text_from_pdf(local_path)
+
+                                    if not text:
+                                        st.error("Text extraction failed")
+                                        continue
+
+                                    st.write(f"✅ {len(text):,} chars")
+
+                                # Chunk
+                                chunks = chunk_text(text, chunk_size, chunk_overlap)
+                                st.write(f"✂️ {len(chunks)} chunks")
+
+                                # Embed
+                                st.write("🔢 Embedding...")
+                                embeddings = embedder.encode(chunks, show_progress_bar=False)
+
+                                # Upload
+                                points = []
+                                for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+                                    points.append(PointStruct(
+                                        id=abs(hash(f"{file_info['file']}_{i}_{time.time()}")) % (2**63),
+                                        vector=emb.tolist(),
+                                        payload={
+                                            "content": chunk,
+                                            "source_name": file_info['name'],
+                                            "source_type": doc_type,
+                                            "chunk_index": i,
+                                            "embedding_model": current_model
+                                        }
+                                    ))
+
+                                qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+                                total_vectors += len(points)
+
+                                st.success(f"✅ Uploaded {len(points)} vectors!")
+
+                            except Exception as e:
+                                st.error(f"Error: {e}")
+
+                    # Summary
+                    st.balloons()
+                    st.success(f"""
+                    🎉 Processing Complete!
+
+                    - Files processed: {len(selected_files)}
+                    - Vectors added: {total_vectors:,}
+                    - OCR tokens used: {total_tokens:,}
+                    - OCR cost: ${total_tokens * 0.000003:.2f}
+                    """)
 
+                    # Clear selection
+                    st.session_state.pop('current_files', None)
+                    st.rerun()
 
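
A note on the point IDs built above: `hash()` on strings is salted per Python process (`PYTHONHASHSEED`), and the `time.time()` component makes every run unique, so reprocessing a file inserts duplicates rather than overwriting. If deduplication is wanted, a deterministic ID lets `upsert` replace prior chunks; a sketch using UUIDv5 (our suggestion, not the committed behavior):

```python
import uuid

def point_id_for(source_path: str, chunk_index: int) -> str:
    # The same (file, chunk) pair always maps to the same UUID, so
    # re-running a file overwrites its old vectors instead of piling
    # up duplicates. Qdrant accepts UUID strings as point IDs.
    return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source_path}#{chunk_index}"))
```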
     # ========================================================================
+    # SOURCE 2: Public Datasets
     # ========================================================================
 
+    with source_tabs[1]:
 
+        st.subheader("Public Math Datasets")
 
+        dataset_choice = st.selectbox(
+            "Select dataset:",
+            [
+                "GSM8K - Grade School Math (8.5K problems)",
+                "MATH - Competition Math (12.5K problems)",
+                "MathQA - Math Word Problems (37K problems)"
+            ]
+        )
+
+        sample_size = st.slider("Number of samples:", 10, 2000, 100)
+
+        # Check if already loaded
+        dataset_name = dataset_choice.split(" - ")[0]
+        already_loaded = check_if_processed(qdrant, dataset_name)
+
+        if already_loaded:
+            st.success(f"✅ {dataset_name} already loaded!")
+            st.info("Vectors from this dataset are already in your database.")
+        else:
+            if st.button(f"📥 Load {dataset_name}", type="primary"):
 
+                try:
+                    from datasets import load_dataset
 
+                    embedder = get_embedding_model(current_model)
+
+                    with st.spinner(f"Loading {dataset_name}..."):
+
+                        if "GSM8K" in dataset_choice:
+                            dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
+                            texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
+                                     for i in range(min(sample_size, len(dataset)))]
 
+                        elif "MATH" in dataset_choice:
+                            dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
+                            texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
+                                     for i in range(min(sample_size, len(dataset)))]
 
+                        else:  # MathQA
+                            dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
+                            texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
+                                     for i in range(min(sample_size, len(dataset)))]
 
+                    st.write(f"✅ Loaded {len(texts)} problems")
+
+                    # Embed
+                    st.write("🔢 Embedding...")
+                    embeddings = embedder.encode(texts, show_progress_bar=True)
 
+                    # Upload
                     points = []
+                    for i, (text, emb) in enumerate(zip(texts, embeddings)):
                         points.append(PointStruct(
+                            id=abs(hash(f"{dataset_name}_{i}_{time.time()}")) % (2**63),
                             vector=emb.tolist(),
                             payload={
+                                "content": text[:2000],
+                                "source_name": dataset_name,
+                                "source_type": "public_dataset",
+                                "index": i,
+                                "embedding_model": current_model
                             }
                         ))
 
                     qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
                     st.success(f"✅ Uploaded {len(points)} vectors!")
+                    st.balloons()
 
+                except Exception as e:
+                    st.error(f"Error: {e}")
 
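
With `sample_size` allowed up to 2000, the single `upsert` above ships every point in one request. If payloads are large this can run into request-size limits; batching is a cheap safeguard (sketch; the batch size is an arbitrary choice, not from the commit):

```python
BATCH = 256  # arbitrary; tune to payload size

for start in range(0, len(points), BATCH):
    qdrant.upsert(
        collection_name=COLLECTION_NAME,
        points=points[start:start + BATCH],
    )
```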
 # ============================================================================
+# TAB 2: SEARCH & SOLVE
 # ============================================================================
 
+with tab2:
 
+    st.title("🔍 Search & Solve")
 
+    problem = st.text_area(
+        "Enter math problem:",
+        placeholder="Find the gradient of the loss function L(w) = (1/2)||Xw - y||²",
+        height=150
+    )
+
+    col1, col2 = st.columns(2)
+
+    with col1:
+        top_k = st.slider("Retrieve top:", 3, 20, 5)
+
+    with col2:
+        detail = st.select_slider(
+            "Detail level:",
+            ["Concise", "Standard", "Detailed", "Exhaustive"],
+            value="Detailed"
+        )
+
+    if st.button("🚀 SOLVE", type="primary") and problem:
 
+        # Get embedding model
+        current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
+        embedder = get_embedding_model(current_model)
+
+        with st.spinner("Searching..."):
+            query_emb = embedder.encode(problem)
 
+            try:
+                results = qdrant.search(
+                    collection_name=COLLECTION_NAME,
+                    query_vector=query_emb.tolist(),
+                    limit=top_k
+                )
+            except:
+                results = []
+
+        if not results:
+            st.warning("No results. Load data in Dataset Manager.")
+        else:
+            st.success(f"Found {len(results)} references!")
 
+            with st.expander("📚 References"):
+                for i, r in enumerate(results, 1):
+                    st.markdown(f"**{i}.** ({r.score*100:.0f}% match)")
+                    st.text(r.payload['content'][:200] + "...")
+                    st.caption(f"Source: {r.payload.get('source_name')}")
 
+            with st.spinner("Generating solution..."):
+
+                context = "\n\n".join([r…
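
The captured diff ends mid-line here. Based on the removed version's remnants (its `## REFERENCES USED` prompt section, `claude-sonnet-4-20250514`, `max_tokens=4000`), the solve step plausibly continues along these lines; everything below is reconstruction, not the committed code:

```python
# Hedged reconstruction of the truncated tail: join the retrieved chunks,
# build the solve prompt, and render Claude's answer. The prompt wording
# is an assumption; model and max_tokens are copied from the old version.
context = "\n\n".join([r.payload['content'] for r in results])

prompt = f"""Solve this problem step by step ({detail} detail).

PROBLEM:
{problem}

REFERENCE MATERIAL:
{context}

## REFERENCES USED
[Which sources helped]"""

message = claude.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=4000,
    messages=[{"role": "user", "content": prompt}],
)

st.markdown("---")
st.markdown(message.content[0].text)
```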