Hebaelsayed committed on
Commit
bd94ae2
Β·
verified Β·
1 Parent(s): 6470c63

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +317 -196
src/streamlit_app.py CHANGED
@@ -5,13 +5,14 @@ import base64
5
  from io import BytesIO
6
  from PIL import Image
7
  import PyPDF2
 
8
  from anthropic import Anthropic
9
  from qdrant_client import QdrantClient
10
  from qdrant_client.models import Distance, VectorParams, PointStruct
11
  from sentence_transformers import SentenceTransformer
12
 
13
  # ============================================================================
14
- # COMPLETE MATH AI SYSTEM - ALL-IN-ONE HUGGING FACE SPACE
15
  # ============================================================================
16
 
17
  st.set_page_config(
@@ -56,6 +57,95 @@ def extract_text_from_pdf(pdf_file):
56
  except Exception as e:
57
  return None
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def chunk_text(text, chunk_size=150, overlap=30):
60
  """Split text into chunks"""
61
  words = text.split()
@@ -90,10 +180,9 @@ def get_vector_count(qdrant):
90
  return 0
91
 
92
  # ============================================================================
93
- # MAIN APP
94
  # ============================================================================
95
 
96
- # Initialize clients
97
  try:
98
  qdrant, claude, embedder = get_clients()
99
  st.sidebar.success("βœ… System Ready")
@@ -103,7 +192,7 @@ except Exception as e:
103
  st.stop()
104
 
105
  # ============================================================================
106
- # SIDEBAR: MODE SELECTION
107
  # ============================================================================
108
 
109
  st.sidebar.title("πŸŽ“ Math AI System")
@@ -116,18 +205,17 @@ mode = st.sidebar.radio(
116
 
117
  st.sidebar.markdown("---")
118
 
119
- # Show database stats
120
  try:
121
  vector_count = get_vector_count(qdrant)
122
  st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
123
-
124
  storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
125
  st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
126
  except:
127
  st.sidebar.warning("Database not accessible")
128
 
129
  # ============================================================================
130
- # MODE 1: SEARCH & SOLVE (Main Interface)
131
  # ============================================================================
132
 
133
  if mode == "πŸ” Search & Solve":
@@ -135,10 +223,7 @@ if mode == "πŸ” Search & Solve":
135
  st.title("πŸ” Math Problem Solver")
136
  st.markdown("*Search your knowledge base and get detailed solutions*")
137
 
138
- # ========================================================================
139
- # INPUT: Problem Statement
140
- # ========================================================================
141
-
142
  st.header("πŸ“ Input Problem")
143
 
144
  input_method = st.radio(
@@ -152,22 +237,18 @@ if mode == "πŸ” Search & Solve":
152
  if input_method == "✍️ Type Question":
153
  problem = st.text_area(
154
  "Enter math problem:",
155
- placeholder="Example: Find the gradient of the loss function L(w) = (1/2)||Xw - y||Β²",
156
  height=150
157
  )
158
-
159
  else:
160
  uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
161
  if uploaded_exam:
162
  exam_text = extract_text_from_pdf(uploaded_exam)
163
  if exam_text:
164
- st.text_area("Extracted text:", exam_text[:1000], height=200)
165
- problem = st.text_input("Extract specific question or use full text")
166
-
167
- # ========================================================================
168
- # SETTINGS
169
- # ========================================================================
170
 
 
171
  with st.expander("βš™οΈ Advanced Settings"):
172
  col1, col2 = st.columns(2)
173
 
@@ -186,29 +267,13 @@ if mode == "πŸ” Search & Solve":
186
  value="Detailed"
187
  )
188
 
189
- # ========================================================================
190
- # SOLVE BUTTON
191
- # ========================================================================
192
-
193
  if st.button("πŸš€ SOLVE PROBLEM", type="primary") and problem:
194
 
195
- with st.spinner("πŸ” Searching knowledge base..."):
196
 
197
- # Generate query embedding
198
  query_embedding = embedder.encode(problem)
199
 
200
- # Create filter
201
- filter_types = []
202
- if "Books" in search_filter:
203
- filter_types.append("book")
204
- if "Exams" in search_filter:
205
- filter_types.append("exam")
206
- if "Handwritten Solutions" in search_filter:
207
- filter_types.append("answer_handwritten")
208
- if "Public Datasets" in search_filter:
209
- filter_types.append("public_dataset")
210
-
211
- # Search Qdrant
212
  try:
213
  results = qdrant.search(
214
  collection_name=COLLECTION_NAME,
@@ -220,12 +285,11 @@ if mode == "πŸ” Search & Solve":
220
  results = []
221
 
222
  if not results:
223
- st.warning("No relevant context found. Try loading more data in Setup mode.")
224
-
225
  else:
226
- st.success(f"βœ… Found {len(results)} relevant references!")
227
 
228
- # Show retrieved context
229
  with st.expander("πŸ“š Retrieved References"):
230
  for i, result in enumerate(results, 1):
231
  similarity = result.score * 100
@@ -234,59 +298,48 @@ if mode == "πŸ” Search & Solve":
234
  st.caption(f"Source: {result.payload.get('source_name', 'Unknown')}")
235
  st.markdown("---")
236
 
237
- # Generate solution with Claude
238
- with st.spinner("πŸ€– Claude is generating solution..."):
239
 
240
- # Prepare context
241
  context = "\n\n".join([
242
- f"[Reference {i+1} from {r.payload.get('source_name', 'Unknown')}]:\n{r.payload['content']}"
243
  for i, r in enumerate(results)
244
  ])
245
 
246
- # Determine detail level
247
  detail_instructions = {
248
- "Concise": "Provide a brief solution focusing on key steps.",
249
- "Standard": "Provide a clear solution with main steps explained.",
250
- "Detailed": "Provide a comprehensive solution with detailed explanations.",
251
- "Very Detailed": "Provide an exhaustive solution with all intermediate steps, intuitions, and alternative approaches."
252
  }
253
 
254
- # Create prompt
255
- prompt = f"""You are an expert mathematics tutor specializing in machine learning mathematics.
256
 
257
- PROBLEM TO SOLVE:
258
  {problem}
259
 
260
- REFERENCE MATERIALS (from student's books, exams, and notes):
261
  {context}
262
 
263
- TASK:
264
- Solve this problem providing a complete, educational solution.
265
 
266
  {detail_instructions[detail_level]}
267
 
268
- FORMAT YOUR RESPONSE EXACTLY LIKE THIS:
269
 
270
  ## SOLUTION
271
-
272
- [Provide step-by-step solution here with clear mathematical notation]
273
 
274
  ## REASONING & APPROACH
275
-
276
- [Explain WHY you chose this approach, what concepts are involved, and how the references helped]
277
 
278
  ## REFERENCES USED
279
-
280
- [List which references you used and HOW each contributed to the solution. Be specific - mention what information came from which source]
281
 
282
  ## VERIFICATION
 
283
 
284
- [If applicable, verify the solution or discuss how to check if it's correct]
285
-
286
- IMPORTANT:
287
- - Use proper mathematical notation (LaTeX if needed: ∫, βˆ‘, βˆ‚, etc.)
288
- - Reference the student's materials when explaining concepts
289
- - Make it educational - help them understand, not just get an answer"""
290
 
291
  try:
292
  message = claude.messages.create(
@@ -297,11 +350,9 @@ IMPORTANT:
297
 
298
  solution = message.content[0].text
299
 
300
- # Display solution
301
  st.markdown("---")
302
  st.markdown(solution)
303
 
304
- # Download option
305
  st.download_button(
306
  "πŸ“₯ Download Solution",
307
  solution,
@@ -309,46 +360,27 @@ IMPORTANT:
309
  mime="text/markdown"
310
  )
311
 
312
- # API usage
313
  with st.expander("πŸ“Š API Usage"):
314
  st.json({
315
- "model": "claude-sonnet-4-20250514",
316
  "input_tokens": message.usage.input_tokens,
317
  "output_tokens": message.usage.output_tokens,
318
- "cost_estimate": f"${(message.usage.input_tokens * 0.000003 + message.usage.output_tokens * 0.000015):.4f}"
319
  })
320
 
321
  except Exception as e:
322
- st.error(f"Claude error: {e}")
323
 
324
  # ============================================================================
325
- # MODE 2: SETUP DATABASE (One-Time Processing)
326
  # ============================================================================
327
 
328
  elif mode == "πŸ—οΈ Setup Database":
329
 
330
  st.title("πŸ—οΈ Database Setup")
331
- st.markdown("*Process and upload your documents (run once)*")
332
 
333
- st.warning("""
334
- ⚠️ **IMPORTANT LIMITATION**:
335
-
336
- Hugging Face Spaces cannot directly access Google Drive files.
337
-
338
- **Recommended Solution:**
339
- 1. Use **Google Colab** for one-time processing (cloud, free)
340
- 2. Use **this HF Space** for daily searching/solving
341
-
342
- **Alternative (Manual)**:
343
- - Download PDFs from Google Drive
344
- - Upload them here one by one
345
- """)
346
-
347
- # ========================================================================
348
- # CREATE COLLECTION
349
- # ========================================================================
350
-
351
- st.header("Step 1: Create Database Collection")
352
 
353
  try:
354
  collections = qdrant.get_collections().collections
@@ -369,26 +401,32 @@ elif mode == "πŸ—οΈ Setup Database":
369
 
370
  st.markdown("---")
371
 
372
- # ========================================================================
373
- # UPLOAD OPTIONS
374
- # ========================================================================
375
-
376
  st.header("Step 2: Upload Documents")
377
 
378
- tab1, tab2, tab3 = st.tabs(["πŸ“š Upload PDFs", "πŸ“Š Load Public Datasets", "πŸ–ŠοΈ Process Handwritten (Colab)"])
 
 
 
 
 
 
 
 
379
 
380
  with tab1:
381
- st.info("Upload your books and typed exams here")
382
 
383
  uploaded_files = st.file_uploader(
384
  "Choose PDF files:",
385
  type=['pdf'],
386
- accept_multiple_files=True
 
387
  )
388
 
389
- doc_type = st.selectbox("Document type:", ["Book", "Exam", "Other"])
390
 
391
- if uploaded_files and st.button("Process & Upload PDFs"):
392
 
393
  for uploaded_file in uploaded_files:
394
  with st.expander(f"Processing {uploaded_file.name}"):
@@ -397,7 +435,7 @@ elif mode == "πŸ—οΈ Setup Database":
397
  # Extract
398
  text = extract_text_from_pdf(uploaded_file)
399
  if not text:
400
- st.error("Failed to extract text")
401
  continue
402
 
403
  st.write(f"βœ… Extracted {len(text):,} chars")
@@ -407,7 +445,8 @@ elif mode == "πŸ—οΈ Setup Database":
407
  st.write(f"βœ… Created {len(chunks)} chunks")
408
 
409
  # Embed
410
- embeddings = embedder.encode(chunks, show_progress_bar=False)
 
411
 
412
  # Upload
413
  points = []
@@ -418,7 +457,7 @@ elif mode == "πŸ—οΈ Setup Database":
418
  payload={
419
  "content": chunk,
420
  "source_name": uploaded_file.name,
421
- "source_type": doc_type.lower(),
422
  "chunk_index": i
423
  }
424
  ))
@@ -429,50 +468,200 @@ elif mode == "πŸ—οΈ Setup Database":
429
  except Exception as e:
430
  st.error(f"Error: {e}")
431
 
 
 
 
 
432
  with tab2:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  st.info("Load pre-built math datasets")
434
 
435
  dataset_choice = st.selectbox(
436
  "Choose dataset:",
437
- ["GSM8K", "MATH", "MathQA"]
 
 
438
  )
439
 
440
- sample_size = st.slider("Number of samples:", 10, 1000, 100)
441
 
442
- if st.button("Load Dataset"):
443
  try:
444
  from datasets import load_dataset
445
 
446
- with st.spinner(f"Loading {dataset_choice}..."):
447
 
448
- if dataset_choice == "GSM8K":
449
  dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
450
  texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
451
  for i in range(min(sample_size, len(dataset)))]
 
452
 
453
- elif dataset_choice == "MATH":
454
  dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
455
  texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
456
  for i in range(min(sample_size, len(dataset)))]
 
457
 
458
- else: # MathQA
459
  dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
460
  texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
461
  for i in range(min(sample_size, len(dataset)))]
 
462
 
463
  st.write(f"βœ… Loaded {len(texts)} problems")
464
 
465
- # Embed & upload
466
  embeddings = embedder.encode(texts, show_progress_bar=True)
467
 
 
468
  points = []
469
  for i, (text, emb) in enumerate(zip(texts, embeddings)):
470
  points.append(PointStruct(
471
- id=abs(hash(f"{dataset_choice}_{i}_{time.time()}")) % (2**63),
472
  vector=emb.tolist(),
473
  payload={
474
  "content": text[:2000],
475
- "source_name": dataset_choice,
476
  "source_type": "public_dataset",
477
  "index": i
478
  }
@@ -484,64 +673,21 @@ elif mode == "πŸ—οΈ Setup Database":
484
 
485
  except Exception as e:
486
  st.error(f"Error: {e}")
487
-
488
- with tab3:
489
- st.warning("**Handwritten OCR requires Google Colab** (HF Spaces limitation)")
490
-
491
- st.markdown("""
492
- ### Why Colab for Handwritten Notes?
493
-
494
- 1. **File Access**: Need direct Google Drive access
495
- 2. **Processing Power**: OCR is compute-intensive
496
- 3. **Image Processing**: Requires additional libraries
497
-
498
- ### Steps:
499
-
500
- 1. **Click button below** to open ready-to-use Colab notebook
501
- 2. **Run the notebook** (processes handwritten PDFs with AI OCR)
502
- 3. **Vectors auto-upload** to your Qdrant database
503
- 4. **Come back here** to search!
504
-
505
- The notebook handles:
506
- - βœ… Google Drive connection
507
- - βœ… Italian cursive handwriting OCR (Claude Vision)
508
- - βœ… Context from books/exams
509
- - βœ… Direct upload to Qdrant
510
- """)
511
-
512
- colab_code_url = "https://colab.research.google.com/drive/your-notebook-id"
513
-
514
- st.link_button(
515
- "πŸ““ Open Google Colab Notebook",
516
- colab_code_url,
517
- use_container_width=True
518
- )
519
-
520
- st.info("""
521
- **What the Colab notebook will do:**
522
- - Connect to your Google Drive (one click)
523
- - Read PDFs from Math_AI_Documents/answers/
524
- - Use Claude Vision to OCR handwritten Italian cursive
525
- - Upload directly to this same Qdrant database
526
- - Takes ~30-60 minutes, costs ~$0.60
527
- """)
528
 
529
  # ============================================================================
530
- # MODE 3: TESTING DASHBOARD
531
  # ============================================================================
532
 
533
  elif mode == "πŸ§ͺ Testing Dashboard":
534
 
535
  st.title("πŸ§ͺ Testing Dashboard")
536
- st.markdown("*Evaluate system performance*")
537
 
538
- tab1, tab2, tab3 = st.tabs(["πŸ“Š Database Stats", "🎯 Accuracy Tests", "πŸ“ˆ Performance"])
539
 
540
  with tab1:
541
  st.header("Database Statistics")
542
 
543
  try:
544
- # Get sample
545
  sample = qdrant.scroll(
546
  collection_name=COLLECTION_NAME,
547
  limit=1000,
@@ -550,7 +696,6 @@ elif mode == "πŸ§ͺ Testing Dashboard":
550
  )
551
 
552
  if sample and sample[0]:
553
- # Count by type
554
  types = {}
555
  sources = set()
556
 
@@ -559,67 +704,43 @@ elif mode == "πŸ§ͺ Testing Dashboard":
559
  types[src_type] = types.get(src_type, 0) + 1
560
  sources.add(point.payload.get('source_name', 'Unknown'))
561
 
562
- # Display
563
  col1, col2, col3 = st.columns(3)
564
 
565
  with col1:
566
  st.metric("Total Vectors", get_vector_count(qdrant))
567
 
568
  with col2:
569
- st.metric("Unique Sources", len(sources))
570
 
571
  with col3:
572
- st.metric("Document Types", len(types))
573
 
574
- # Breakdown
575
- st.subheader("Breakdown by Type")
576
  for doc_type, count in sorted(types.items()):
577
  st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
578
-
579
- # Sources
580
- st.subheader("Sources")
581
- for src in sorted(sources)[:20]:
582
- st.caption(f"β€’ {src}")
583
 
584
  except Exception as e:
585
  st.error(f"Error: {e}")
586
 
587
  with tab2:
588
- st.header("Test Search Accuracy")
589
 
590
- test_query = st.text_input("Test query:", placeholder="gradient descent")
591
 
592
- if st.button("Run Test Search") and test_query:
593
-
594
  query_emb = embedder.encode(test_query)
595
-
596
  results = qdrant.search(
597
  collection_name=COLLECTION_NAME,
598
  query_vector=query_emb.tolist(),
599
  limit=5
600
  )
601
 
602
- st.write(f"**Found {len(results)} results:**")
603
-
604
  for i, r in enumerate(results, 1):
605
  similarity = r.score * 100
606
-
607
- quality = "🟒 Excellent" if similarity > 70 else "🟑 Good" if similarity > 50 else "πŸ”΄ Fair"
608
-
609
  st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
610
  st.text(r.payload['content'][:200] + "...")
611
- st.caption(f"Source: {r.payload.get('source_name')}")
612
  st.markdown("---")
613
-
614
- with tab3:
615
- st.header("Performance Metrics")
616
-
617
- st.info("Coming soon: Response time, token usage, cost tracking")
618
-
619
- # ============================================================================
620
- # FOOTER
621
- # ============================================================================
622
 
623
  st.sidebar.markdown("---")
624
- st.sidebar.caption("πŸŽ“ Math AI System v1.0")
625
- st.sidebar.caption("Powered by Claude + Qdrant")
 
5
  from io import BytesIO
6
  from PIL import Image
7
  import PyPDF2
8
+ from pdf2image import convert_from_bytes
9
  from anthropic import Anthropic
10
  from qdrant_client import QdrantClient
11
  from qdrant_client.models import Distance, VectorParams, PointStruct
12
  from sentence_transformers import SentenceTransformer
13
 
14
  # ============================================================================
15
+ # COMPLETE MATH AI SYSTEM - 100% HUGGING FACE
16
  # ============================================================================
17
 
18
  st.set_page_config(
 
57
  except Exception as e:
58
  return None
59
 
60
def pdf_to_images(pdf_bytes):
    """Render every page of a PDF (file-like object) to a PIL image for OCR.

    Returns a list of PIL images (200 dpi), or an empty list on failure
    after surfacing the error in the Streamlit UI.
    """
    try:
        raw = pdf_bytes.read()
        return convert_from_bytes(raw, dpi=200)
    except Exception as e:
        # Best-effort: report in the UI and let the caller treat [] as "no pages".
        st.error(f"PDF to image conversion error: {e}")
        return []
68
+
69
def resize_image(image, max_size=(2048, 2048)):
    """Downscale *image* in place to fit within max_size (Claude Vision limit).

    `thumbnail` preserves aspect ratio and never upscales; the (mutated)
    image is returned for call-chaining convenience.
    """
    resample = Image.Resampling.LANCZOS
    image.thumbnail(max_size, resample)
    return image
73
+
74
def image_to_base64(image):
    """Serialize a PIL image to a base64-encoded PNG string."""
    with BytesIO() as buf:
        image.save(buf, format="PNG")
        payload = buf.getvalue()
    return base64.b64encode(payload).decode()
79
+
80
def ocr_with_claude(claude_client, image, context_books="", context_exam=""):
    """
    AI-powered OCR for handwritten Italian cursive math notes.

    NOTE: "Italian cursive" is the HANDWRITING STYLE (connected letters);
    the LANGUAGE of the notes is ENGLISH.

    Parameters:
        claude_client: an Anthropic client exposing `messages.create`.
        image: a PIL image of one handwritten page (copied before resizing,
            so the caller's image is not mutated).
        context_books: optional textbook excerpts (truncated to 2000 chars)
            that help the model disambiguate symbols.
        context_exam: optional exam question text (truncated to 1000 chars)
            giving the model context on what is being solved.

    Returns:
        (transcription, tokens) on success, where tokens is the sum of
        input and output tokens; (None, 0) on failure (error shown in UI).
    """

    # Work on a copy so the caller's image is untouched; Claude Vision
    # rejects very large images, hence the resize first.
    resized = resize_image(image.copy())
    img_b64 = image_to_base64(resized)

    prompt = f"""You are an expert in transcribing handwritten mathematical solutions.

IMPORTANT: This is written in ITALIAN CURSIVE style (connected, flowing letters), but the LANGUAGE IS ENGLISH.

CONTEXT FROM TEXTBOOKS (helps understand symbols):
{context_books[:2000] if context_books else "No context available"}

EXAM QUESTION (helps understand what's being solved):
{context_exam[:1000] if context_exam else "No exam question available"}

TASK: Transcribe this handwritten math solution into clean, readable text.

INSTRUCTIONS:
1. Language is ENGLISH (just cursive style is Italian)
2. Convert math notation properly:
- Use standard symbols: ∫, ∑, √, ∂, lim, etc.
- Use LaTeX for complex formulas
- Preserve Greek letters: α, β, γ, π, etc.
3. Maintain structure (paragraphs, steps)
4. If unclear, mark as [unclear: best guess]
5. Describe diagrams as [DIAGRAM: description]

OUTPUT: Just the transcribed text, no preamble."""

    try:
        # Multimodal request: the image block must precede the text block
        # per the Anthropic Messages API vision format.
        message = claude_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )

        transcription = message.content[0].text
        # Total token usage, used by callers for cost estimates.
        tokens = message.usage.input_tokens + message.usage.output_tokens

        return transcription, tokens

    except Exception as e:
        # Report in the UI; the (None, 0) sentinel lets callers skip the page.
        st.error(f"OCR error: {e}")
        return None, 0
148
+
149
  def chunk_text(text, chunk_size=150, overlap=30):
150
  """Split text into chunks"""
151
  words = text.split()
 
180
  return 0
181
 
182
  # ============================================================================
183
+ # INITIALIZE
184
  # ============================================================================
185
 
 
186
  try:
187
  qdrant, claude, embedder = get_clients()
188
  st.sidebar.success("βœ… System Ready")
 
192
  st.stop()
193
 
194
  # ============================================================================
195
+ # SIDEBAR
196
  # ============================================================================
197
 
198
  st.sidebar.title("πŸŽ“ Math AI System")
 
205
 
206
  st.sidebar.markdown("---")
207
 
208
+ # Database stats
209
  try:
210
  vector_count = get_vector_count(qdrant)
211
  st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
 
212
  storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
213
  st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
214
  except:
215
  st.sidebar.warning("Database not accessible")
216
 
217
  # ============================================================================
218
+ # MODE 1: SEARCH & SOLVE
219
  # ============================================================================
220
 
221
  if mode == "πŸ” Search & Solve":
 
223
  st.title("πŸ” Math Problem Solver")
224
  st.markdown("*Search your knowledge base and get detailed solutions*")
225
 
226
+ # Input
 
 
 
227
  st.header("πŸ“ Input Problem")
228
 
229
  input_method = st.radio(
 
237
  if input_method == "✍️ Type Question":
238
  problem = st.text_area(
239
  "Enter math problem:",
240
+ placeholder="Example: Find the gradient of L(w) = (1/2)||Xw - y||Β²",
241
  height=150
242
  )
 
243
  else:
244
  uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
245
  if uploaded_exam:
246
  exam_text = extract_text_from_pdf(uploaded_exam)
247
  if exam_text:
248
+ st.text_area("Extracted:", exam_text[:1000], height=200)
249
+ problem = st.text_input("Specific question or use full text")
 
 
 
 
250
 
251
+ # Settings
252
  with st.expander("βš™οΈ Advanced Settings"):
253
  col1, col2 = st.columns(2)
254
 
 
267
  value="Detailed"
268
  )
269
 
270
+ # Solve
 
 
 
271
  if st.button("πŸš€ SOLVE PROBLEM", type="primary") and problem:
272
 
273
+ with st.spinner("πŸ” Searching..."):
274
 
 
275
  query_embedding = embedder.encode(problem)
276
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  try:
278
  results = qdrant.search(
279
  collection_name=COLLECTION_NAME,
 
285
  results = []
286
 
287
  if not results:
288
+ st.warning("No relevant context found. Load data in Setup mode.")
 
289
  else:
290
+ st.success(f"βœ… Found {len(results)} references!")
291
 
292
+ # Show context
293
  with st.expander("πŸ“š Retrieved References"):
294
  for i, result in enumerate(results, 1):
295
  similarity = result.score * 100
 
298
  st.caption(f"Source: {result.payload.get('source_name', 'Unknown')}")
299
  st.markdown("---")
300
 
301
+ # Generate solution
302
+ with st.spinner("πŸ€– Generating solution..."):
303
 
 
304
  context = "\n\n".join([
305
+ f"[Reference {i+1} from {r.payload.get('source_name')}]:\n{r.payload['content']}"
306
  for i, r in enumerate(results)
307
  ])
308
 
 
309
  detail_instructions = {
310
+ "Concise": "Brief solution, key steps only.",
311
+ "Standard": "Clear solution with main steps.",
312
+ "Detailed": "Comprehensive solution with detailed explanations.",
313
+ "Very Detailed": "Exhaustive solution with all steps and intuitions."
314
  }
315
 
316
+ prompt = f"""You are an expert mathematics tutor for machine learning.
 
317
 
318
+ PROBLEM:
319
  {problem}
320
 
321
+ REFERENCES (from student's materials):
322
  {context}
323
 
324
+ TASK: Solve providing a complete educational solution.
 
325
 
326
  {detail_instructions[detail_level]}
327
 
328
+ FORMAT:
329
 
330
  ## SOLUTION
331
+ [Step-by-step solution with clear notation]
 
332
 
333
  ## REASONING & APPROACH
334
+ [WHY this approach, what concepts, how references helped]
 
335
 
336
  ## REFERENCES USED
337
+ [Which references used and HOW each contributed]
 
338
 
339
  ## VERIFICATION
340
+ [How to verify the solution]
341
 
342
+ Use proper notation (LaTeX if needed). Reference the materials when explaining."""
 
 
 
 
 
343
 
344
  try:
345
  message = claude.messages.create(
 
350
 
351
  solution = message.content[0].text
352
 
 
353
  st.markdown("---")
354
  st.markdown(solution)
355
 
 
356
  st.download_button(
357
  "πŸ“₯ Download Solution",
358
  solution,
 
360
  mime="text/markdown"
361
  )
362
 
 
363
  with st.expander("πŸ“Š API Usage"):
364
  st.json({
 
365
  "input_tokens": message.usage.input_tokens,
366
  "output_tokens": message.usage.output_tokens,
367
+ "cost": f"${(message.usage.input_tokens * 0.000003 + message.usage.output_tokens * 0.000015):.4f}"
368
  })
369
 
370
  except Exception as e:
371
+ st.error(f"Error: {e}")
372
 
373
  # ============================================================================
374
+ # MODE 2: SETUP DATABASE
375
  # ============================================================================
376
 
377
  elif mode == "πŸ—οΈ Setup Database":
378
 
379
  st.title("πŸ—οΈ Database Setup")
380
+ st.markdown("*Upload and process your documents*")
381
 
382
+ # Create collection
383
+ st.header("Step 1: Create Collection")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  try:
386
  collections = qdrant.get_collections().collections
 
401
 
402
  st.markdown("---")
403
 
404
+ # Upload documents
 
 
 
405
  st.header("Step 2: Upload Documents")
406
 
407
+ tab1, tab2, tab3 = st.tabs([
408
+ "πŸ“š Books & Exams (Typed PDFs)",
409
+ "πŸ–ŠοΈ Handwritten Solutions (OCR)",
410
+ "πŸ“Š Public Datasets"
411
+ ])
412
+
413
+ # ========================================================================
414
+ # TAB 1: Typed PDFs
415
+ # ========================================================================
416
 
417
  with tab1:
418
+ st.info("βœ… Upload your typed PDFs (books, exams) here")
419
 
420
  uploaded_files = st.file_uploader(
421
  "Choose PDF files:",
422
  type=['pdf'],
423
+ accept_multiple_files=True,
424
+ key="typed_pdfs"
425
  )
426
 
427
+ doc_type = st.selectbox("Document type:", ["book", "exam", "reference"])
428
 
429
+ if uploaded_files and st.button("πŸ“€ Process & Upload", key="upload_typed"):
430
 
431
  for uploaded_file in uploaded_files:
432
  with st.expander(f"Processing {uploaded_file.name}"):
 
435
  # Extract
436
  text = extract_text_from_pdf(uploaded_file)
437
  if not text:
438
+ st.error("Text extraction failed")
439
  continue
440
 
441
  st.write(f"βœ… Extracted {len(text):,} chars")
 
445
  st.write(f"βœ… Created {len(chunks)} chunks")
446
 
447
  # Embed
448
+ with st.spinner("Embedding..."):
449
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
450
 
451
  # Upload
452
  points = []
 
457
  payload={
458
  "content": chunk,
459
  "source_name": uploaded_file.name,
460
+ "source_type": doc_type,
461
  "chunk_index": i
462
  }
463
  ))
 
468
  except Exception as e:
469
  st.error(f"Error: {e}")
470
 
471
+ # ========================================================================
472
+ # TAB 2: Handwritten OCR (100% IN HF SPACES!)
473
+ # ========================================================================
474
+
475
  with tab2:
476
+ st.success("βœ… AI-POWERED OCR - Process handwritten notes RIGHT HERE!")
477
+
478
+ st.markdown("""
479
+ ### How it works:
480
+ 1. Upload handwritten solution PDFs (from your Google Drive)
481
+ 2. AI OCR processes each page with Claude Vision
482
+ 3. Uses your books/exams as context for better accuracy
483
+ 4. Uploads transcribed text to database
484
+
485
+ **Cost:** ~$0.05-0.10 per handwritten PDF page
486
+ """)
487
+
488
+ # Upload handwritten PDFs
489
+ handwritten_files = st.file_uploader(
490
+ "Upload handwritten solution PDFs:",
491
+ type=['pdf'],
492
+ accept_multiple_files=True,
493
+ key="handwritten_pdfs",
494
+ help="Your answer PDFs from Google Drive/Math_AI_Documents/answers/"
495
+ )
496
+
497
+ # Optional: Context from books
498
+ context_books = ""
499
+ use_context = st.checkbox("Use book context for better OCR accuracy", value=True)
500
+
501
+ if use_context:
502
+ # Get some book context from database
503
+ try:
504
+ book_samples = qdrant.scroll(
505
+ collection_name=COLLECTION_NAME,
506
+ limit=10,
507
+ with_payload=True,
508
+ with_vectors=False,
509
+ scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
510
+ )
511
+
512
+ if book_samples and book_samples[0]:
513
+ context_books = "\n".join([p.payload['content'] for p in book_samples[0][:5]])
514
+ st.caption(f"βœ… Using {len(book_samples[0])} book excerpts as context")
515
+ except:
516
+ st.caption("⚠️ No books in database yet. OCR will work but may be less accurate.")
517
+
518
+ if handwritten_files and st.button("πŸ€– PROCESS WITH AI OCR", type="primary"):
519
+
520
+ total_tokens = 0
521
+
522
+ for uploaded_file in handwritten_files:
523
+ st.markdown(f"### Processing: {uploaded_file.name}")
524
+
525
+ try:
526
+ # Convert PDF to images
527
+ with st.spinner("Converting PDF to images..."):
528
+ # Read bytes
529
+ pdf_bytes = BytesIO(uploaded_file.read())
530
+ images = pdf_to_images(pdf_bytes)
531
+
532
+ if not images:
533
+ st.error("PDF conversion failed")
534
+ continue
535
+
536
+ st.write(f"βœ… Converted to {len(images)} pages")
537
+
538
+ # OCR each page
539
+ transcribed_pages = []
540
+ page_tokens = 0
541
+
542
+ for page_num, image in enumerate(images, 1):
543
+ with st.spinner(f"OCR Page {page_num}/{len(images)}..."):
544
+
545
+ transcription, tokens = ocr_with_claude(
546
+ claude,
547
+ image,
548
+ context_books=context_books,
549
+ context_exam=""
550
+ )
551
+
552
+ if transcription:
553
+ transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
554
+ page_tokens += tokens
555
+ st.write(f" βœ… Page {page_num} ({tokens:,} tokens)")
556
+ else:
557
+ st.write(f" ❌ Page {page_num} failed")
558
+
559
+ if not transcribed_pages:
560
+ st.error("No pages transcribed successfully")
561
+ continue
562
+
563
+ # Combine all pages
564
+ full_text = "\n\n".join(transcribed_pages)
565
+ st.success(f"βœ… Transcribed {len(full_text):,} characters")
566
+ st.info(f"πŸ“Š Tokens used: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
567
+ total_tokens += page_tokens
568
+
569
+ # Show preview
570
+ with st.expander("πŸ‘οΈ Preview transcription"):
571
+ st.text(full_text[:500] + "...")
572
+
573
+ # Chunk
574
+ chunks = chunk_text(full_text)
575
+ st.write(f"βœ… Created {len(chunks)} chunks")
576
+
577
+ # Embed
578
+ with st.spinner("Embedding..."):
579
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
580
+
581
+ # Upload
582
+ points = []
583
+ for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
584
+ points.append(PointStruct(
585
+ id=abs(hash(f"handwritten_{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
586
+ vector=emb.tolist(),
587
+ payload={
588
+ "content": chunk,
589
+ "source_name": uploaded_file.name,
590
+ "source_type": "answer_handwritten",
591
+ "chunk_index": i,
592
+ "handwriting_style": "italian_cursive",
593
+ "language": "english",
594
+ "ocr_method": "claude_vision",
595
+ "tokens_used": page_tokens
596
+ }
597
+ ))
598
+
599
+ qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
600
+ st.success(f"πŸŽ‰ Uploaded {len(points)} vectors from handwritten notes!")
601
+ st.balloons()
602
+
603
+ except Exception as e:
604
+ st.error(f"Error: {e}")
605
+ st.exception(e)
606
+
607
+ st.markdown("---")
608
+ st.success(f"βœ… Total tokens used: {total_tokens:,}")
609
+ st.info(f"πŸ’° Estimated total cost: ${total_tokens * 0.000003:.2f}")
610
+
611
# ========================================================================
# TAB 3: Public Datasets
# ========================================================================

with tab3:
    st.info("Load pre-built math datasets")

    dataset_choice = st.selectbox(
        "Choose dataset:",
        ["GSM8K - Grade School Math",
         "MATH - Competition Math",
         "MathQA - Word Problems"]
    )

    sample_size = st.slider("Samples:", 10, 1000, 100)

    if st.button("πŸ“₯ Load Dataset"):
        try:
            # Imported lazily: `datasets` is heavy and only needed here.
            from datasets import load_dataset

            with st.spinner("Loading..."):
                # Each branch normalizes its dataset into plain
                # "Problem/Solution" strings so the embed/upsert path
                # below is dataset-agnostic.
                if "GSM8K" in dataset_choice:
                    dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
                    texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
                             for i in range(min(sample_size, len(dataset)))]
                    name = "GSM8K"

                elif "MATH" in dataset_choice:
                    dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
                    texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
                             for i in range(min(sample_size, len(dataset)))]
                    name = "MATH"

                else:
                    dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
                    texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
                             for i in range(min(sample_size, len(dataset)))]
                    name = "MathQA"

                st.write(f"βœ… Loaded {len(texts)} problems")

                # Embed all problems in one batch.
                embeddings = embedder.encode(texts, show_progress_bar=True)

                # Build Qdrant points; content is capped at 2000 chars to
                # keep payloads bounded.
                points = []
                for i, (text, emb) in enumerate(zip(texts, embeddings)):
                    points.append(PointStruct(
                        id=abs(hash(f"{name}_{i}_{time.time()}")) % (2**63),
                        vector=emb.tolist(),
                        payload={
                            "content": text[:2000],
                            "source_name": name,
                            "source_type": "public_dataset",
                            "index": i
                        }
                    ))

                # Reconstructed upsert (mirrors the handwritten-upload path):
                # without it the points list was built but never persisted.
                qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
                st.success(f"πŸŽ‰ Uploaded {len(points)} vectors!")
                st.balloons()

        except Exception as e:
            st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
 
677
  # ============================================================================
678
+ # MODE 3: TESTING
679
  # ============================================================================
680
 
681
  elif mode == "πŸ§ͺ Testing Dashboard":
682
 
683
  st.title("πŸ§ͺ Testing Dashboard")
 
684
 
685
+ tab1, tab2 = st.tabs(["πŸ“Š Stats", "🎯 Accuracy"])
686
 
687
  with tab1:
688
  st.header("Database Statistics")
689
 
690
  try:
 
691
  sample = qdrant.scroll(
692
  collection_name=COLLECTION_NAME,
693
  limit=1000,
 
696
  )
697
 
698
  if sample and sample[0]:
 
699
  types = {}
700
  sources = set()
701
 
 
704
  types[src_type] = types.get(src_type, 0) + 1
705
  sources.add(point.payload.get('source_name', 'Unknown'))
706
 
 
707
  col1, col2, col3 = st.columns(3)
708
 
709
  with col1:
710
  st.metric("Total Vectors", get_vector_count(qdrant))
711
 
712
  with col2:
713
+ st.metric("Sources", len(sources))
714
 
715
  with col3:
716
+ st.metric("Types", len(types))
717
 
718
+ st.subheader("By Type")
 
719
  for doc_type, count in sorted(types.items()):
720
  st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
 
 
 
 
 
721
 
722
  except Exception as e:
723
  st.error(f"Error: {e}")
724
 
725
  with tab2:
726
+ st.header("Test Accuracy")
727
 
728
+ test_query = st.text_input("Test query:")
729
 
730
+ if st.button("Test") and test_query:
 
731
  query_emb = embedder.encode(test_query)
 
732
  results = qdrant.search(
733
  collection_name=COLLECTION_NAME,
734
  query_vector=query_emb.tolist(),
735
  limit=5
736
  )
737
 
 
 
738
  for i, r in enumerate(results, 1):
739
  similarity = r.score * 100
740
+ quality = "🟒" if similarity > 70 else "🟑" if similarity > 50 else "πŸ”΄"
 
 
741
  st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
742
  st.text(r.payload['content'][:200] + "...")
 
743
  st.markdown("---")
 
 
 
 
 
 
 
 
 
744
 
745
  st.sidebar.markdown("---")
746
+ st.sidebar.caption("πŸŽ“ Math AI v1.0")