Spaces:

Hebaelsayed
/

math-ai-system

Running

App Files Files Community

Hebaelsayed committed 20 days ago

Commit

d9f0bf7

verified ·

1 Parent(s): 06c1259

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +367 -458

src/streamlit_app.py CHANGED Viewed

@@ -10,107 +10,144 @@ from anthropic import Anthropic
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
 from sentence_transformers import SentenceTransformer
 # ============================================================================
-# COMPLETE MATH AI SYSTEM - 100% HUGGING FACE
 # ============================================================================
 st.set_page_config(
     page_title="Math AI System",
     page_icon="🎓",
-    layout="wide",
-    initial_sidebar_state="expanded"
 )
 COLLECTION_NAME = "math_knowledge_base"
 # ============================================================================
 # CACHED RESOURCES
 # ============================================================================
 @st.cache_resource
 def get_clients():
-    """Initialize all clients - cached"""
     qdrant = QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY")
     )
     claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
     embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
     return qdrant, claude, embedder
 # ============================================================================
-# HELPER FUNCTIONS
 # ============================================================================
-def extract_text_from_pdf(pdf_file):
-    """Extract text from typed PDF"""
     try:
-        pdf_reader = PyPDF2.PdfReader(pdf_file)
-        text = ""
-        for page_num, page in enumerate(pdf_reader.pages):
-            text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
-        return text
     except Exception as e:
         return None
-def pdf_to_images(pdf_bytes):
-    """Convert PDF pages to images for OCR"""
     try:
-        images = convert_from_bytes(pdf_bytes.read(), dpi=200)
         return images
     except Exception as e:
-        st.error(f"PDF to image conversion error: {e}")
         return []
 def resize_image(image, max_size=(2048, 2048)):
-    """Resize image for Claude Vision"""
     image.thumbnail(max_size, Image.Resampling.LANCZOS)
     return image
 def image_to_base64(image):
-    """Convert PIL Image to base64"""
     buffered = BytesIO()
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
-def ocr_with_claude(claude_client, image, context_books="", context_exam=""):
-    """
-    AI-powered OCR for handwritten Italian cursive math notes
-    NOTE: Italian cursive is the HANDWRITING STYLE (connected letters)
-          Language is ENGLISH
-    """
     resized = resize_image(image.copy())
     img_b64 = image_to_base64(resized)
-    prompt = f"""You are an expert in transcribing handwritten mathematical solutions.
-IMPORTANT: This is written in ITALIAN CURSIVE style (connected, flowing letters), but the LANGUAGE IS ENGLISH.
-CONTEXT FROM TEXTBOOKS (helps understand symbols):
-{context_books[:2000] if context_books else "No context available"}
-EXAM QUESTION (helps understand what's being solved):
-{context_exam[:1000] if context_exam else "No exam question available"}
-TASK: Transcribe this handwritten math solution into clean, readable text.
 INSTRUCTIONS:
-1. Language is ENGLISH (just cursive style is Italian)
-2. Convert math notation properly:
-   - Use standard symbols: ∫, ∑, √, ∂, lim, etc.
-   - Use LaTeX for complex formulas
-   - Preserve Greek letters: α, β, γ, π, etc.
-3. Maintain structure (paragraphs, steps)
-4. If unclear, mark as [unclear: best guess]
-5. Describe diagrams as [DIAGRAM: description]
-OUTPUT: Just the transcribed text, no preamble."""
     try:
         message = claude_client.messages.create(
@@ -120,34 +157,20 @@ OUTPUT: Just the transcribed text, no preamble."""
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "image",
-                            "source": {
-                                "type": "base64",
-                                "media_type": "image/png",
-                                "data": img_b64
-                            }
-                        },
-                        {
-                            "type": "text",
-                            "text": prompt
-                        }
                     ]
                 }
             ]
         )
-        transcription = message.content[0].text
-        tokens = message.usage.input_tokens + message.usage.output_tokens
-        return transcription, tokens
     except Exception as e:
-        st.error(f"OCR error: {e}")
         return None, 0
 def chunk_text(text, chunk_size=150, overlap=30):
-    """Split text into chunks"""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
@@ -157,7 +180,7 @@ def chunk_text(text, chunk_size=150, overlap=30):
     return chunks
 def get_vector_count(qdrant):
-    """Get total vectors in database"""
     try:
         count = 0
         offset = None
@@ -187,8 +210,8 @@ try:
     qdrant, claude, embedder = get_clients()
     st.sidebar.success("✅ System Ready")
 except Exception as e:
-    st.error(f"❌ Initialization failed: {e}")
-    st.info("Add QDRANT_URL, QDRANT_API_KEY, and ANTHROPIC_API_KEY in Settings → Secrets")
     st.stop()
 # ============================================================================
@@ -198,21 +221,18 @@ except Exception as e:
 st.sidebar.title("🎓 Math AI System")
 mode = st.sidebar.radio(
-    "Select Mode:",
-    ["🔍 Search & Solve", "🏗️ Setup Database", "🧪 Testing Dashboard"],
     index=0
 )
 st.sidebar.markdown("---")
-# Database stats
 try:
     vector_count = get_vector_count(qdrant)
-    st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
-    storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
-    st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
 except:
-    st.sidebar.warning("Database not accessible")
 # ============================================================================
 # MODE 1: SEARCH & SOLVE
@@ -221,125 +241,58 @@ except:
 if mode == "🔍 Search & Solve":
     st.title("🔍 Math Problem Solver")
-    st.markdown("*Search your knowledge base and get detailed solutions*")
-    # Input
-    st.header("📝 Input Problem")
-    input_method = st.radio(
-        "How to input:",
-        ["✍️ Type Question", "📄 Upload Exam PDF"],
-        horizontal=True
     )
-    problem = None
-    if input_method == "✍️ Type Question":
-        problem = st.text_area(
-            "Enter math problem:",
-            placeholder="Example: Find the gradient of L(w) = (1/2)||Xw - y||²",
-            height=150
-        )
-    else:
-        uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
-        if uploaded_exam:
-            exam_text = extract_text_from_pdf(uploaded_exam)
-            if exam_text:
-                st.text_area("Extracted:", exam_text[:1000], height=200)
-                problem = st.text_input("Specific question or use full text")
-    # Settings
-    with st.expander("⚙️ Advanced Settings"):
-        col1, col2 = st.columns(2)
-        with col1:
-            search_filter = st.multiselect(
-                "Search in:",
-                ["Books", "Exams", "Handwritten Solutions", "Public Datasets"],
-                default=["Books", "Exams", "Handwritten Solutions"]
-            )
-        with col2:
-            top_k = st.slider("Retrieve top:", 3, 20, 5)
-            detail_level = st.select_slider(
-                "Detail level:",
-                ["Concise", "Standard", "Detailed", "Very Detailed"],
-                value="Detailed"
-            )
-    # Solve
-    if st.button("🚀 SOLVE PROBLEM", type="primary") and problem:
-        with st.spinner("🔍 Searching..."):
-            query_embedding = embedder.encode(problem)
             try:
                 results = qdrant.search(
                     collection_name=COLLECTION_NAME,
-                    query_vector=query_embedding.tolist(),
                     limit=top_k
                 )
-            except Exception as e:
-                st.error(f"Search failed: {e}")
                 results = []
         if not results:
-            st.warning("No relevant context found. Load data in Setup mode.")
         else:
-            st.success(f"✅ Found {len(results)} references!")
-            # Show context
-            with st.expander("📚 Retrieved References"):
-                for i, result in enumerate(results, 1):
-                    similarity = result.score * 100
-                    st.markdown(f"**Reference {i}** ({similarity:.1f}% relevant)")
-                    st.info(result.payload['content'][:300] + "...")
-                    st.caption(f"Source: {result.payload.get('source_name', 'Unknown')}")
-                    st.markdown("---")
-            # Generate solution
-            with st.spinner("🤖 Generating solution..."):
-                context = "\n\n".join([
-                    f"[Reference {i+1} from {r.payload.get('source_name')}]:\n{r.payload['content']}"
-                    for i, r in enumerate(results)
-                ])
-                detail_instructions = {
-                    "Concise": "Brief solution, key steps only.",
-                    "Standard": "Clear solution with main steps.",
-                    "Detailed": "Comprehensive solution with detailed explanations.",
-                    "Very Detailed": "Exhaustive solution with all steps and intuitions."
-                }
-                prompt = f"""You are an expert mathematics tutor for machine learning.
-PROBLEM:
-{problem}
-REFERENCES (from student's materials):
-{context}
-TASK: Solve providing a complete educational solution.
-{detail_instructions[detail_level]}
 FORMAT:
 ## SOLUTION
-[Step-by-step solution with clear notation]
-## REASONING & APPROACH
-[WHY this approach, what concepts, how references helped]
 ## REFERENCES USED
-[Which references used and HOW each contributed]
-## VERIFICATION
-[How to verify the solution]
-Use proper notation (LaTeX if needed). Reference the materials when explaining."""
                 try:
                     message = claude.messages.create(
@@ -348,116 +301,133 @@ Use proper notation (LaTeX if needed). Reference the materials when explaining."
                         messages=[{"role": "user", "content": prompt}]
                     )
-                    solution = message.content[0].text
                     st.markdown("---")
-                    st.markdown(solution)
                     st.download_button(
-                        "📥 Download Solution",
-                        solution,
-                        file_name=f"solution_{int(time.time())}.md",
-                        mime="text/markdown"
                     )
-                    with st.expander("📊 API Usage"):
-                        st.json({
-                            "input_tokens": message.usage.input_tokens,
-                            "output_tokens": message.usage.output_tokens,
-                            "cost": f"${(message.usage.input_tokens * 0.000003 + message.usage.output_tokens * 0.000015):.4f}"
-                        })
                 except Exception as e:
                     st.error(f"Error: {e}")
 # ============================================================================
-# MODE 2: SETUP DATABASE
 # ============================================================================
-elif mode == "🏗️ Setup Database":
-    st.title("🏗️ Database Setup")
-    st.markdown("*Upload and process your documents*")
-    # Create collection
-    st.header("Step 1: Create Collection")
     try:
         collections = qdrant.get_collections().collections
         exists = any(c.name == COLLECTION_NAME for c in collections)
         if exists:
-            st.success(f"✅ Collection '{COLLECTION_NAME}' exists")
         else:
-            if st.button("🏗️ Create Collection"):
                 qdrant.create_collection(
                     collection_name=COLLECTION_NAME,
                     vectors_config=VectorParams(size=384, distance=Distance.COSINE)
                 )
-                st.success("✅ Created!")
                 st.rerun()
     except Exception as e:
         st.error(f"Error: {e}")
     st.markdown("---")
-    # Upload documents
-    st.header("Step 2: Upload Documents")
-    tab1, tab2, tab3 = st.tabs([
-        "📚 Books & Exams (Typed PDFs)",
-        "🖊️ Handwritten Solutions (OCR)",
-        "📊 Public Datasets"
-    ])
     # ========================================================================
-    # TAB 1: Typed PDFs
     # ========================================================================
     with tab1:
-        st.info("✅ Upload your typed PDFs (books, exams) here")
-        uploaded_files = st.file_uploader(
-            "Choose PDF files:",
-            type=['pdf'],
-            accept_multiple_files=True,
-            key="typed_pdfs"
-        )
-        doc_type = st.selectbox("Document type:", ["book", "exam", "reference"])
-        if uploaded_files and st.button("📤 Process & Upload", key="upload_typed"):
-            for uploaded_file in uploaded_files:
-                with st.expander(f"Processing {uploaded_file.name}"):
                     try:
                         # Extract
-                        text = extract_text_from_pdf(uploaded_file)
                         if not text:
-                            st.error("Text extraction failed")
                             continue
-                        st.write(f"✅ Extracted {len(text):,} chars")
                         # Chunk
                         chunks = chunk_text(text)
-                        st.write(f"✅ Created {len(chunks)} chunks")
                         # Embed
-                        with st.spinner("Embedding..."):
-                            embeddings = embedder.encode(chunks, show_progress_bar=False)
                         # Upload
                         points = []
                         for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
                             points.append(PointStruct(
-                                id=abs(hash(f"{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
                                 vector=emb.tolist(),
                                 payload={
                                     "content": chunk,
-                                    "source_name": uploaded_file.name,
-                                    "source_type": doc_type,
                                     "chunk_index": i
                                 }
                             ))
@@ -469,278 +439,217 @@ elif mode == "🏗️ Setup Database":
                         st.error(f"Error: {e}")
     # ========================================================================
-    # TAB 2: Handwritten OCR (100% IN HF SPACES!)
     # ========================================================================
     with tab2:
-        st.success("✅ AI-POWERED OCR - Process handwritten notes RIGHT HERE!")
-        st.markdown("""
-        ### How it works:
-        1. Upload handwritten solution PDFs (from your Google Drive)
-        2. AI OCR processes each page with Claude Vision
-        3. Uses your books/exams as context for better accuracy
-        4. Uploads transcribed text to database
-        **Cost:** ~$0.05-0.10 per handwritten PDF page
-        """)
-        # Upload handwritten PDFs
-        handwritten_files = st.file_uploader(
-            "Upload handwritten solution PDFs:",
-            type=['pdf'],
-            accept_multiple_files=True,
-            key="handwritten_pdfs",
-            help="Your answer PDFs from Google Drive/Math_AI_Documents/answers/"
-        )
-        # Optional: Context from books
-        context_books = ""
-        use_context = st.checkbox("Use book context for better OCR accuracy", value=True)
-        if use_context:
-            # Get some book context from database
             try:
                 book_samples = qdrant.scroll(
                     collection_name=COLLECTION_NAME,
-                    limit=10,
                     with_payload=True,
                     with_vectors=False,
                     scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
                 )
                 if book_samples and book_samples[0]:
-                    context_books = "\n".join([p.payload['content'] for p in book_samples[0][:5]])
-                    st.caption(f"✅ Using {len(book_samples[0])} book excerpts as context")
             except:
-                st.caption("⚠️ No books in database yet. OCR will work but may be less accurate.")
-        if handwritten_files and st.button("🤖 PROCESS WITH AI OCR", type="primary"):
-            total_tokens = 0
-            for uploaded_file in handwritten_files:
-                st.markdown(f"### Processing: {uploaded_file.name}")
-                try:
-                    # Convert PDF to images
-                    with st.spinner("Converting PDF to images..."):
-                        # Read bytes
-                        pdf_bytes = BytesIO(uploaded_file.read())
-                        images = pdf_to_images(pdf_bytes)
-                    if not images:
-                        st.error("PDF conversion failed")
-                        continue
-                    st.write(f"✅ Converted to {len(images)} pages")
-                    # OCR each page
-                    transcribed_pages = []
-                    page_tokens = 0
-                    for page_num, image in enumerate(images, 1):
-                        with st.spinner(f"OCR Page {page_num}/{len(images)}..."):
-                            transcription, tokens = ocr_with_claude(
-                                claude,
-                                image,
-                                context_books=context_books,
-                                context_exam=""
-                            )
-                            if transcription:
-                                transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
-                                page_tokens += tokens
-                                st.write(f"   ✅ Page {page_num} ({tokens:,} tokens)")
-                            else:
-                                st.write(f"   ❌ Page {page_num} failed")
-                    if not transcribed_pages:
-                        st.error("No pages transcribed successfully")
-                        continue
-                    # Combine all pages
-                    full_text = "\n\n".join(transcribed_pages)
-                    st.success(f"✅ Transcribed {len(full_text):,} characters")
-                    st.info(f"📊 Tokens used: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
-                    total_tokens += page_tokens
-                    # Show preview
-                    with st.expander("👁️ Preview transcription"):
-                        st.text(full_text[:500] + "...")
-                    # Chunk
-                    chunks = chunk_text(full_text)
-                    st.write(f"✅ Created {len(chunks)} chunks")
-                    # Embed
-                    with st.spinner("Embedding..."):
-                        embeddings = embedder.encode(chunks, show_progress_bar=False)
-                    # Upload
-                    points = []
-                    for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
-                        points.append(PointStruct(
-                            id=abs(hash(f"handwritten_{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
-                            vector=emb.tolist(),
-                            payload={
-                                "content": chunk,
-                                "source_name": uploaded_file.name,
-                                "source_type": "answer_handwritten",
-                                "chunk_index": i,
-                                "handwriting_style": "italian_cursive",
-                                "language": "english",
-                                "ocr_method": "claude_vision",
-                                "tokens_used": page_tokens
-                            }
-                        ))
-                    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
-                    st.success(f"🎉 Uploaded {len(points)} vectors from handwritten notes!")
-                    st.balloons()
-                except Exception as e:
-                    st.error(f"Error: {e}")
-                    st.exception(e)
-            st.markdown("---")
-            st.success(f"✅ Total tokens used: {total_tokens:,}")
-            st.info(f"💰 Estimated total cost: ${total_tokens * 0.000003:.2f}")
-    # ========================================================================
-    # TAB 3: Public Datasets
-    # ========================================================================
-    with tab3:
-        st.info("Load pre-built math datasets")
-        dataset_choice = st.selectbox(
-            "Choose dataset:",
-            ["GSM8K - Grade School Math",
-             "MATH - Competition Math",
-             "MathQA - Word Problems"]
-        )
-        sample_size = st.slider("Samples:", 10, 1000, 100)
-        if st.button("📥 Load Dataset"):
-            try:
-                from datasets import load_dataset
-                with st.spinner(f"Loading..."):
-                    if "GSM8K" in dataset_choice:
-                        dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
-                        texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
-                                for i in range(min(sample_size, len(dataset)))]
-                        name = "GSM8K"
-                    elif "MATH" in dataset_choice:
-                        dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
-                        texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
-                                for i in range(min(sample_size, len(dataset)))]
-                        name = "MATH"
-                    else:
-                        dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
-                        texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
-                                for i in range(min(sample_size, len(dataset)))]
-                        name = "MathQA"
-                    st.write(f"✅ Loaded {len(texts)} problems")
-                    # Embed
-                    embeddings = embedder.encode(texts, show_progress_bar=True)
-                    # Upload
-                    points = []
-                    for i, (text, emb) in enumerate(zip(texts, embeddings)):
-                        points.append(PointStruct(
-                            id=abs(hash(f"{name}_{i}_{time.time()}")) % (2**63),
-                            vector=emb.tolist(),
-                            payload={
-                                "content": text[:2000],
-                                "source_name": name,
-                                "source_type": "public_dataset",
-                                "index": i
-                            }
-                        ))
-                    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
-                    st.success(f"✅ Uploaded {len(points)} vectors!")
-                    st.balloons()
-            except Exception as e:
-                st.error(f"Error: {e}")
 # ============================================================================
-# MODE 3: TESTING
 # ============================================================================
-elif mode == "🧪 Testing Dashboard":
-    st.title("🧪 Testing Dashboard")
-    tab1, tab2 = st.tabs(["📊 Stats", "🎯 Accuracy"])
-    with tab1:
-        st.header("Database Statistics")
-        try:
-            sample = qdrant.scroll(
-                collection_name=COLLECTION_NAME,
-                limit=1000,
-                with_payload=True,
-                with_vectors=False
-            )
-            if sample and sample[0]:
-                types = {}
-                sources = set()
-                for point in sample[0]:
-                    src_type = point.payload.get('source_type', 'unknown')
-                    types[src_type] = types.get(src_type, 0) + 1
-                    sources.add(point.payload.get('source_name', 'Unknown'))
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    st.metric("Total Vectors", get_vector_count(qdrant))
-                with col2:
-                    st.metric("Sources", len(sources))
-                with col3:
-                    st.metric("Types", len(types))
-                st.subheader("By Type")
-                for doc_type, count in sorted(types.items()):
-                    st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
-        except Exception as e:
-            st.error(f"Error: {e}")
-    with tab2:
-        st.header("Test Accuracy")
-        test_query = st.text_input("Test query:")
-        if st.button("Test") and test_query:
-            query_emb = embedder.encode(test_query)
-            results = qdrant.search(
-                collection_name=COLLECTION_NAME,
-                query_vector=query_emb.tolist(),
-                limit=5
-            )
-            for i, r in enumerate(results, 1):
-                similarity = r.score * 100
-                quality = "🟢" if similarity > 70 else "🟡" if similarity > 50 else "🔴"
-                st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
-                st.text(r.payload['content'][:200] + "...")
-                st.markdown("---")
-st.sidebar.markdown("---")
 st.sidebar.caption("🎓 Math AI v1.0")

 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams, PointStruct
 from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_download, list_repo_files
 # ============================================================================
+# MATH AI SYSTEM - READS FROM HF DATASET (PERMANENT STORAGE!)
 # ============================================================================
 st.set_page_config(
     page_title="Math AI System",
     page_icon="🎓",
+    layout="wide"
 )
 COLLECTION_NAME = "math_knowledge_base"
+# Hugging Face dataset repo that holds the source PDFs. Set the DATASET_REPO
+# environment variable to point at your own repo; the placeholder below is
+# only used when the variable is not set.
+DATASET_REPO = os.getenv("DATASET_REPO", "YOUR_USERNAME/math-ai-documents")
 # ============================================================================
 # CACHED RESOURCES
 # ============================================================================
 @st.cache_resource
 def get_clients():
+    """Build the Qdrant client, the Anthropic client and the text embedder.
+
+    Connection settings are read from the QDRANT_URL, QDRANT_API_KEY and
+    ANTHROPIC_API_KEY environment variables. The function is decorated with
+    st.cache_resource.
+
+    Returns:
+        A (qdrant, claude, embedder) tuple.
+    """
+    qdrant_url = os.getenv("QDRANT_URL")
+    qdrant_key = os.getenv("QDRANT_API_KEY")
+    anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+    return (
+        QdrantClient(url=qdrant_url, api_key=qdrant_key),
+        Anthropic(api_key=anthropic_key),
+        SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
+    )
 # ============================================================================
+# DATASET OPERATIONS (Reads from HF Dataset)
 # ============================================================================
+def list_dataset_files(folder_path):
+    """List the PDF files stored under one folder of the HF dataset repo.
+
+    Args:
+        folder_path: Folder inside DATASET_REPO, with or without leading or
+            trailing slashes. An empty value lists the PDFs of the whole repo.
+
+    Returns:
+        Repo-relative paths of the matching PDF files. When anything fails
+        the error is shown with st.error and an empty list is returned.
+    """
     try:
+        repo_files = list_repo_files(
+            repo_id=DATASET_REPO,
+            repo_type="dataset",
+            token=os.getenv("HF_TOKEN"),
+        )
+        # Match on a folder boundary so "books" does not also pick up
+        # "books_old/..."; compare the extension case-insensitively so a
+        # file named "scan.PDF" is not skipped.
+        folder = (folder_path or "").strip("/")
+        prefix = f"{folder}/" if folder else ""
+        return [
+            name for name in repo_files
+            if name.startswith(prefix) and name.lower().endswith(".pdf")
+        ]
     except Exception as e:
+        st.error(f"Error listing files: {e}")
+        return []
+def download_file_from_dataset(file_path):
+    """Fetch one file of the HF dataset repo and return its local path.
+
+    Args:
+        file_path: Repo-relative name of the file inside DATASET_REPO.
+
+    Returns:
+        The value returned by hf_hub_download, or None when the download
+        raises (the error is shown with st.error).
+    """
+    try:
+        request = {
+            "repo_id": DATASET_REPO,
+            "filename": file_path,
+            "repo_type": "dataset",
+            "token": os.getenv("HF_TOKEN"),
+        }
+        return hf_hub_download(**request)
+    except Exception as e:
+        st.error(f"Error downloading {file_path}: {e}")
         return None
+# ============================================================================
+# PROCESSING FUNCTIONS
+# ============================================================================
+def extract_text_from_pdf(pdf_path):
+    """Return the text of every page of the PDF at `pdf_path`.
+
+    Each page's text is preceded by a "=== Page N ===" marker, numbered
+    from 1.
+
+    Returns:
+        The concatenated text, or None when opening or parsing the file
+        raises (the error is shown with st.error).
+    """
     try:
+        with open(pdf_path, 'rb') as handle:
+            reader = PyPDF2.PdfReader(handle)
+            # Build the per-page pieces first and join once at the end.
+            sections = [
+                f"\n\n=== Page {page_no} ===\n\n{page.extract_text()}"
+                for page_no, page in enumerate(reader.pages, start=1)
+            ]
+            return "".join(sections)
+    except Exception as e:
+        st.error(f"PDF extraction error: {e}")
+        return None
+def pdf_to_images(pdf_path):
+    """Render the PDF at `pdf_path` to images at 200 dpi.
+
+    Returns:
+        Whatever pdf2image.convert_from_path yields, or an empty list when
+        the import or the conversion raises (the error is shown with
+        st.error).
+    """
+    rendered = []
     try:
+        # Imported lazily, so a missing pdf2image is reported like any other
+        # conversion failure.
+        from pdf2image import convert_from_path
+        rendered = convert_from_path(pdf_path, dpi=200)
     except Exception as e:
+        st.error(f"Conversion error: {e}")
+    return rendered
 def resize_image(image, max_size=(2048, 2048)):
+    """Shrink `image` to fit inside `max_size` and return the same object.
+
+    The resize is done by image.thumbnail with the LANCZOS filter, so the
+    passed image itself is modified; pass a copy to keep the original.
+    """
+    resample_filter = Image.Resampling.LANCZOS
+    image.thumbnail(max_size, resample_filter)
     return image
 def image_to_base64(image):
+    """Encode `image` as PNG and return the bytes as a base64 string."""
+    png_buffer = BytesIO()
+    image.save(png_buffer, format="PNG")
+    encoded = base64.b64encode(png_buffer.getvalue())
+    return encoded.decode()
+def ocr_with_claude(claude_client, image, context=""):
+    """AI OCR with Claude Vision"""
     resized = resize_image(image.copy())
     img_b64 = image_to_base64(resized)
+    prompt = f"""Transcribe this handwritten math solution.
+STYLE: Italian cursive (connected letters)
+LANGUAGE: English
+CONTEXT: {context[:2000] if context else ""}
 INSTRUCTIONS:
+1. Transcribe in English
+2. Use proper math notation: ∫, ∑, √, ∂, etc.
+3. Maintain structure
+4. Mark unclear parts: [unclear: guess]
+OUTPUT: Just the transcription."""
     try:
         message = claude_client.messages.create(
                 {
                     "role": "user",
                     "content": [
+                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
+                        {"type": "text", "text": prompt}
                     ]
                 }
             ]
         )
+        return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
     except Exception as e:
         return None, 0
 def chunk_text(text, chunk_size=150, overlap=30):
+    """Split into chunks"""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
     return chunks
 def get_vector_count(qdrant):
+    """Get total vectors"""
     try:
         count = 0
         offset = None
     qdrant, claude, embedder = get_clients()
     st.sidebar.success("✅ System Ready")
 except Exception as e:
+    st.error(f"❌ Init failed: {e}")
+    st.info("Add these in Settings → Secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
     st.stop()
 # ============================================================================
 st.sidebar.title("🎓 Math AI System")
 mode = st.sidebar.radio(
+    "Mode:",
+    ["🔍 Search & Solve", "🏗️ Process Dataset Files", "📊 Stats"],
     index=0
 )
 st.sidebar.markdown("---")
 try:
     vector_count = get_vector_count(qdrant)
+    st.sidebar.metric("Vectors", f"{vector_count:,}")
 except:
+    pass
 # ============================================================================
 # MODE 1: SEARCH & SOLVE
 if mode == "🔍 Search & Solve":
     st.title("🔍 Math Problem Solver")
+    problem = st.text_area(
+        "Enter problem:",
+        placeholder="Find the gradient of L(w) = (1/2)||Xw - y||²",
+        height=150
     )
+    top_k = st.slider("Retrieve:", 3, 20, 5)
+    if st.button("🚀 SOLVE", type="primary") and problem:
+        with st.spinner("Searching..."):
+            query_emb = embedder.encode(problem)
             try:
                 results = qdrant.search(
                     collection_name=COLLECTION_NAME,
+                    query_vector=query_emb.tolist(),
                     limit=top_k
                 )
+            except:
                 results = []
         # An empty list means either no hits or a failed search (the search
         # handler resets results to []).
         if not results:
+            st.warning("No context found. Process your files in 'Process Dataset Files' mode.")
         else:
+            st.success(f"Found {len(results)} references!")
             # Show a 200-character preview of every hit plus its source name.
             # NOTE(review): payload['content'] is accessed outside any try block;
             # a point stored without a 'content' key would raise KeyError here.
+            with st.expander("References"):
+                for i, r in enumerate(results, 1):
+                    st.markdown(f"**{i}.** {r.payload['content'][:200]}...")
+                    st.caption(f"Source: {r.payload.get('source_name')}")
+            with st.spinner("Generating solution..."):
                 # The full text of every retrieved chunk goes into the prompt,
                 # separated by blank lines.
+                context = "\n\n".join([r.payload['content'] for r in results])
+                prompt = f"""Solve this problem using the references.
+PROBLEM: {problem}
+REFERENCES: {context}
 FORMAT:
 ## SOLUTION
+[Step-by-step]
+## REASONING
+[Why this approach]
 ## REFERENCES USED
+[Which sources helped]"""
                 try:
                     message = claude.messages.create(
                         messages=[{"role": "user", "content": prompt}]
                     )
                     st.markdown("---")
                     # The same markdown text is rendered and offered as solution.md.
+                    st.markdown(message.content[0].text)
                     st.download_button(
+                        "📥 Download",
+                        message.content[0].text,
+                        file_name=f"solution.md"
                     )
                 except Exception as e:
                     st.error(f"Error: {e}")
 # ============================================================================
+# MODE 2: PROCESS DATASET FILES
 # ============================================================================
+elif mode == "🏗️ Process Dataset Files":
+    st.title("🏗️ Process Files from HF Dataset")
     # DATASET_REPO is referenced here but defined outside this section.
+    st.info(f"""
+    **Dataset:** `{DATASET_REPO}`
+    Files are stored permanently in your HF Dataset.
+    Process them once, search forever!
+    """)
+    # Check if HF token exists
+    if not os.getenv("HF_TOKEN"):
+        st.error("⚠️ Missing HF_TOKEN! Add it in Settings → Repository Secrets")
+        st.info("""
+        **How to get your HF Token:**
+        1. Go to: https://huggingface.co/settings/tokens
+        2. Click "New token"
+        3. Name: "math-ai-access"
+        4. Type: Read
+        5. Copy the token
+        6. Add as HF_TOKEN in Space Settings → Secrets
+        """)
         # Halt this script run: nothing below renders without a token.
+        st.stop()
+    # Create collection if needed
+    st.header("Step 1: Setup Collection")
     try:
         collections = qdrant.get_collections().collections
         exists = any(c.name == COLLECTION_NAME for c in collections)
         if exists:
+            st.success(f"✅ Collection exists")
         else:
             # The collection is only created on an explicit button click.
+            if st.button("Create Collection"):
                 # size=384 presumably matches the output dimension of the
                 # embedding model in use - confirm if the model changes.
                 qdrant.create_collection(
                     collection_name=COLLECTION_NAME,
                     vectors_config=VectorParams(size=384, distance=Distance.COSINE)
                 )
                 # NOTE(review): the rerun follows immediately, so this message
                 # is likely replaced by the "Collection exists" branch at once.
+                st.success("Created!")
                 st.rerun()
     except Exception as e:
         st.error(f"Error: {e}")
     st.markdown("---")
+    # Process files
+    st.header("Step 2: Process Files")
     # One tab per source type; each tab lists dataset files, then processes them.
+    tab1, tab2, tab3 = st.tabs(["📚 Books", "📝 Exams", "🖊️ Handwritten Answers"])
     # ========================================================================
+    # BOOKS
     # ========================================================================
     with tab1:
+        st.subheader("Process Books (Typed PDFs)")
         # Listing and processing are separate buttons; the listing is stored in
         # session_state so it is still available when the second button is clicked.
+        if st.button("📚 List Books in Dataset"):
+            book_files = list_dataset_files("books/")
+            if book_files:
+                st.write(f"Found {len(book_files)} books:")
+                for f in book_files:
+                    st.text(f"• {f}")
+                st.session_state.book_files = book_files
+            else:
+                st.warning("No books found in dataset/books/ folder")
+        if 'book_files' in st.session_state and st.button("🚀 Process All Books"):
             # Each file gets its own expander and its own try block, so a
             # failure in one file is handled per file.
+            for book_file in st.session_state.book_files:
+                with st.expander(f"Processing {book_file}"):
                     try:
+                        # Download
+                        st.write("📥 Downloading...")
+                        local_path = download_file_from_dataset(book_file)
                         # A falsy download result skips this file without a message.
+                        if not local_path:
+                            continue
                         # Extract
+                        st.write("📖 Extracting text...")
+                        text = extract_text_from_pdf(local_path)
                         if not text:
                             continue
+                        st.write(f"✅ {len(text):,} chars")
                         # Chunk
                         chunks = chunk_text(text)
+                        st.write(f"✂️ {len(chunks)} chunks")
                         # Embed
+                        embeddings = embedder.encode(chunks, show_progress_bar=False)
                         # Upload
                         # NOTE(review): the point id mixes time.time() into a
                         # str hash, so it differs on every run; processing the
                         # same book twice would add new points rather than
                         # overwrite the earlier ones.
                         points = []
                         for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
                             points.append(PointStruct(
+                                id=abs(hash(f"{book_file}_{i}_{time.time()}")) % (2**63),
                                 vector=emb.tolist(),
                                 payload={
                                     "content": chunk,
+                                    "source_name": book_file.split('/')[-1],
+                                    "source_type": "book",
                                     "chunk_index": i
                                 }
                             ))
                         st.error(f"Error: {e}")
     # ========================================================================
+    # EXAMS
     # ========================================================================
     with tab2:
+        st.subheader("Process Exams (Typed PDFs)")
+        if st.button("📝 List Exams in Dataset"):
+            exam_files = list_dataset_files("exams/")
+            if exam_files:
+                st.write(f"Found {len(exam_files)} exams:")
+                for f in exam_files:
+                    st.text(f"• {f}")
+                st.session_state.exam_files = exam_files
+            else:
+                st.warning("No exams found")
+        if 'exam_files' in st.session_state and st.button("🚀 Process All Exams"):
+            for exam_file in st.session_state.exam_files:
+                with st.expander(f"Processing {exam_file}"):
+                    try:
+                        local_path = download_file_from_dataset(exam_file)
+                        text = extract_text_from_pdf(local_path)
+                        if not text:
+                            continue
+                        st.write(f"✅ {len(text):,} chars")
+                        chunks = chunk_text(text)
+                        embeddings = embedder.encode(chunks, show_progress_bar=False)
+                        points = []
+                        for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+                            points.append(PointStruct(
+                                id=abs(hash(f"{exam_file}_{i}_{time.time()}")) % (2**63),
+                                vector=emb.tolist(),
+                                payload={
+                                    "content": chunk,
+                                    "source_name": exam_file.split('/')[-1],
+                                    "source_type": "exam",
+                                    "chunk_index": i
+                                }
+                            ))
+                        qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+                        st.success(f"✅ Uploaded {len(points)} vectors!")
+                    except Exception as e:
+                        st.error(f"Error: {e}")
+    # ========================================================================
+    # HANDWRITTEN ANSWERS (AI OCR)
+    # ========================================================================
+    with tab3:
+        st.subheader("Process Handwritten Answers (AI OCR)")
+        st.warning("⚠️ This uses Claude Vision - costs ~$0.05-0.10 per PDF page")
+        if st.button("🖊️ List Answer Files"):
+            answer_files = list_dataset_files("answers/")
+            if answer_files:
+                st.write(f"Found {len(answer_files)} answer files:")
+                for f in answer_files:
+                    st.text(f"• {f}")
+                st.session_state.answer_files = answer_files
+            else:
+                st.warning("No answers found")
+        if 'answer_files' in st.session_state:
+            # Get context from books if available
+            context_books = ""
             try:
                 book_samples = qdrant.scroll(
                     collection_name=COLLECTION_NAME,
+                    limit=5,
                     with_payload=True,
                     with_vectors=False,
                     scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
                 )
                 if book_samples and book_samples[0]:
+                    context_books = "\n".join([p.payload['content'] for p in book_samples[0]])
+                    st.info("✅ Using book context for better OCR")
             except:
+                st.caption("No books processed yet - OCR will work but may be less accurate")
+            if st.button("🤖 PROCESS WITH AI OCR", type="primary"):
+                total_tokens = 0
+                for answer_file in st.session_state.answer_files:
+                    with st.expander(f"Processing {answer_file}"):
+                        try:
+                            # Download
+                            local_path = download_file_from_dataset(answer_file)
+                            # Convert to images
+                            st.write("🖼️ Converting to images...")
+                            images = pdf_to_images(local_path)
+                            if not images:
+                                continue
+                            st.write(f"✅ {len(images)} pages")
+                            # OCR each page
+                            transcribed_pages = []
+                            page_tokens = 0
+                            for page_num, image in enumerate(images, 1):
+                                st.write(f"🤖 OCR Page {page_num}/{len(images)}...")
+                                transcription, tokens = ocr_with_claude(
+                                    claude,
+                                    image,
+                                    context=context_books
+                                )
+                                if transcription:
+                                    transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
+                                    page_tokens += tokens
+                            if not transcribed_pages:
+                                st.error("OCR failed")
+                                continue
+                            full_text = "\n\n".join(transcribed_pages)
+                            st.success(f"✅ Transcribed {len(full_text):,} chars")
+                            st.info(f"Tokens: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
+                            total_tokens += page_tokens
+                            # Chunk
+                            chunks = chunk_text(full_text)
+                            embeddings = embedder.encode(chunks, show_progress_bar=False)
+                            # Upload
+                            points = []
+                            for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
+                                points.append(PointStruct(
+                                    id=abs(hash(f"{answer_file}_{i}_{time.time()}")) % (2**63),
+                                    vector=emb.tolist(),
+                                    payload={
+                                        "content": chunk,
+                                        "source_name": answer_file.split('/')[-1],
+                                        "source_type": "answer_handwritten",
+                                        "chunk_index": i,
+                                        "ocr_tokens": page_tokens
+                                    }
+                                ))
+                            qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+                            st.success(f"✅ Uploaded {len(points)} vectors!")
+                        except Exception as e:
+                            st.error(f"Error: {e}")
+                st.success(f"Total tokens: {total_tokens:,} | Cost: ${total_tokens * 0.000003:.2f}")
 # ============================================================================
+# MODE 3: STATS
 # ============================================================================
+elif mode == "📊 Stats":
+    st.title("📊 Database Statistics")
+    try:
+        sample = qdrant.scroll(
+            collection_name=COLLECTION_NAME,
+            limit=1000,
+            with_payload=True,
+            with_vectors=False
+        )
+        if sample and sample[0]:
+            types = {}
+            sources = set()
+            for point in sample[0]:
+                src_type = point.payload.get('source_type', 'unknown')
+                types[src_type] = types.get(src_type, 0) + 1
+                sources.add(point.payload.get('source_name', 'Unknown'))
+            col1, col2 = st.columns(2)
+            with col1:
+                st.metric("Total Vectors", get_vector_count(qdrant))
+            with col2:
+                st.metric("Unique Sources", len(sources))
+            st.subheader("By Type")
+            for doc_type, count in sorted(types.items()):
+                st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
+            st.subheader("Sources")
+            for src in sorted(sources):
+                st.caption(f"• {src}")
+    except Exception as e:
+        st.error(f"Error: {e}")
 st.sidebar.caption("🎓 Math AI v1.0")