Spaces:

Hebaelsayed
/

math-ai-system

Sleeping

App Files Files Community

Hebaelsayed committed on Jan 31

Commit

3331648

verified ·

1 Parent(s): 6e1b1a8

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +147 -370

src/streamlit_app.py CHANGED Viewed

@@ -14,20 +14,20 @@ from sentence_transformers import SentenceTransformer
 from huggingface_hub import hf_hub_download, list_repo_files
 # ============================================================================
-# PRODUCTION MATH AI SYSTEM - SMART PROCESSING
 # ============================================================================
 st.set_page_config(
-    page_title="Math AI System - Production",
     page_icon="🎓",
     layout="wide"
 )
 COLLECTION_NAME = "math_knowledge_base"
-DATASET_REPO = "Hebaelsayed/math-ai-documents"
 # ============================================================================
-# AVAILABLE EMBEDDING MODELS
 # ============================================================================
 EMBEDDING_MODELS = {
@@ -43,7 +43,7 @@ EMBEDDING_MODELS = {
         "speed": "Medium",
         "quality": "Better"
     },
-    "MPNet (Best Quality, 768D)": {
         "name": "sentence-transformers/all-mpnet-base-v2",
         "dimensions": 768,
         "speed": "Slower",
@@ -57,7 +57,6 @@ EMBEDDING_MODELS = {
 @st.cache_resource
 def get_qdrant_client():
-    """Initialize Qdrant client"""
     return QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY")
@@ -65,44 +64,34 @@ def get_qdrant_client():
 @st.cache_resource
 def get_claude_client():
-    """Initialize Claude client"""
     return Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 @st.cache_resource
 def get_embedding_model(model_name):
-    """Load embedding model (cached per model)"""
     return SentenceTransformer(model_name)
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
-def get_file_hash(file_path):
-    """Generate unique hash for file to track if already processed"""
-    return hashlib.md5(file_path.encode()).hexdigest()
 def check_if_processed(qdrant, file_name):
-    """Check if file already processed in Qdrant"""
     try:
         results = qdrant.scroll(
             collection_name=COLLECTION_NAME,
             scroll_filter={
-                "must": [
-                    {"key": "source_name", "match": {"value": file_name}}
-                ]
             },
             limit=1,
             with_payload=True,
             with_vectors=False
         )
         return len(results[0]) > 0 if results and results[0] else False
     except:
         return False
 def list_dataset_files(folder_path):
-    """List PDF files in HF Dataset folder"""
     try:
         hf_token = os.getenv("HF_TOKEN")
         all_files = list_repo_files(
@@ -110,38 +99,27 @@ def list_dataset_files(folder_path):
             repo_type="dataset",
             token=hf_token
         )
-        pdf_files = [
-            f for f in all_files
-            if f.startswith(folder_path) and f.endswith('.pdf')
-        ]
-        return pdf_files
     except Exception as e:
-        st.error(f"Error listing files: {e}")
         return []
 def download_from_dataset(file_path):
-    """Download file from HF Dataset"""
     try:
         hf_token = os.getenv("HF_TOKEN")
-        local_path = hf_hub_download(
             repo_id=DATASET_REPO,
             filename=file_path,
             repo_type="dataset",
             token=hf_token
         )
-        return local_path
     except Exception as e:
         st.error(f"Download error: {e}")
         return None
 def extract_text_from_pdf(pdf_path):
-    """Extract text from typed PDF"""
     try:
         with open(pdf_path, 'rb') as file:
             reader = PyPDF2.PdfReader(file)
@@ -150,74 +128,60 @@ def extract_text_from_pdf(pdf_path):
                 text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
             return text
     except Exception as e:
-        st.error(f"Text extraction error: {e}")
         return None
 def pdf_to_images(pdf_path):
-    """Convert PDF to images for OCR"""
     try:
         images = convert_from_path(pdf_path, dpi=200)
         return images
     except Exception as e:
-        st.error(f"PDF to image error: {e}")
-        st.info("💡 This requires poppler-utils. Add 'poppler-utils' to packages.txt file in your Space")
         return []
 def resize_image(image, max_size=(2048, 2048)):
-    """Resize image for Claude Vision"""
     image.thumbnail(max_size, Image.Resampling.LANCZOS)
     return image
 def image_to_base64(image):
-    """Convert PIL Image to base64"""
     buffered = BytesIO()
     image.save(buffered, format="PNG")
     return base64.b64encode(buffered.getvalue()).decode()
 def ocr_with_claude(claude_client, image, context=""):
-    """AI OCR with Claude Vision"""
     resized = resize_image(image.copy())
     img_b64 = image_to_base64(resized)
-    prompt = f"""Transcribe handwritten math solution.
-STYLE: Italian cursive (connected letters)
 LANGUAGE: English
 CONTEXT: {context[:2000] if context else ""}
-INSTRUCTIONS:
-1. Transcribe in English
-2. Use proper math notation: ∫, ∑, √, ∂, etc.
-3. Maintain structure
-4. Mark unclear: [unclear: guess]
 OUTPUT: Transcription only."""
     try:
         message = claude_client.messages.create(
             model="claude-sonnet-4-20250514",
             max_tokens=4000,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
-                        {"type": "text", "text": prompt}
-                    ]
-                }
-            ]
         )
         return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
     except Exception as e:
         st.error(f"OCR error: {e}")
         return None, 0
 def chunk_text(text, chunk_size=150, overlap=30):
-    """Split text into chunks"""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
@@ -227,7 +191,7 @@ def chunk_text(text, chunk_size=150, overlap=30):
     return chunks
 def get_vector_count(qdrant):
-    """Get total vectors in database"""
     try:
         count = 0
         offset = None
@@ -250,7 +214,7 @@ def get_vector_count(qdrant):
         return 0
 # ============================================================================
-# INITIALIZE CLIENTS
 # ============================================================================
 try:
@@ -258,357 +222,221 @@ try:
     claude = get_claude_client()
     st.sidebar.success("✅ System Ready")
 except Exception as e:
-    st.error(f"❌ Initialization failed: {e}")
-    st.info("Add these secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
     st.stop()
 # ============================================================================
 # SIDEBAR
 # ============================================================================
-st.sidebar.title("🎓 Math AI System")
-st.sidebar.caption("Production Version")
 try:
     vector_count = get_vector_count(qdrant)
-    st.sidebar.metric("Total Vectors", f"{vector_count:,}")
-    storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
-    st.sidebar.metric("Storage", f"{storage_mb:.1f} MB")
 except:
-    st.sidebar.warning("Database unavailable")
 st.sidebar.markdown("---")
 # ============================================================================
-# MAIN TABS (Reordered as requested)
 # ============================================================================
-tab1, tab2, tab3 = st.tabs([
-    "📊 Dataset Manager",
-    "🔍 Search & Solve",
-    "📈 Statistics"
-])
 # ============================================================================
-# TAB 1: DATASET MANAGER (Primary Interface)
 # ============================================================================
 with tab1:
     st.title("📊 Dataset Manager")
-    st.markdown("*Manage all your data sources in one place*")
-    # Check HF Token
     if not os.getenv("HF_TOKEN"):
-        st.error("⚠️ Missing HF_TOKEN in secrets!")
-        st.info("Add it in Settings → Repository Secrets")
         st.stop()
     # Collection setup
-    st.header("🏗️ Step 1: Database Setup")
-    col1, col2 = st.columns([2, 1])
-    with col1:
-        try:
-            collections = qdrant.get_collections().collections
-            exists = any(c.name == COLLECTION_NAME for c in collections)
-            if exists:
-                st.success(f"✅ Collection '{COLLECTION_NAME}' exists")
-            else:
-                st.warning(f"Collection '{COLLECTION_NAME}' doesn't exist")
-                # Show embedding model choice for initial creation
-                st.subheader("Choose Embedding Model")
-                for model_name, specs in EMBEDDING_MODELS.items():
-                    with st.expander(f"{model_name} - {specs['quality']} quality, {specs['speed']} speed"):
-                        st.write(f"**Dimensions:** {specs['dimensions']}")
-                        st.write(f"**Model:** `{specs['name']}`")
-                selected_model_key = st.selectbox(
-                    "Select embedding model:",
-                    list(EMBEDDING_MODELS.keys())
                 )
-                if st.button("🏗️ Create Collection", type="primary"):
-                    dimensions = EMBEDDING_MODELS[selected_model_key]["dimensions"]
-                    qdrant.create_collection(
-                        collection_name=COLLECTION_NAME,
-                        vectors_config=VectorParams(
-                            size=dimensions,
-                            distance=Distance.COSINE
-                        )
-                    )
-                    st.success(f"✅ Created with {dimensions}D vectors!")
-                    st.session_state.embedding_model = EMBEDDING_MODELS[selected_model_key]["name"]
-                    st.rerun()
-        except Exception as e:
-            st.error(f"Error: {e}")
-    with col2:
-        st.info(f"""
-        **Dataset:**
-        `{DATASET_REPO}`
-        **Collection:**
-        `{COLLECTION_NAME}`
-        """)
     st.markdown("---")
-    # Processing options
-    st.header("⚙️ Step 2: Processing Configuration")
-    col1, col2, col3 = st.columns(3)
     with col1:
-        st.subheader("Chunking Strategy")
-        chunk_size = st.slider("Chunk size (words):", 50, 500, 150)
-        chunk_overlap = st.slider("Overlap (words):", 0, 100, 30)
-        st.caption(f"Overlap: {(chunk_overlap/chunk_size*100):.0f}%")
     with col2:
-        st.subheader("Embedding Model")
-        # Get current model from collection or use default
         current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
-        # Find which key this model belongs to
-        current_model_key = "MiniLM-L6 (Fast, 384D)"
-        for key, specs in EMBEDDING_MODELS.items():
-            if specs["name"] == current_model:
-                current_model_key = key
-                break
-        st.info(f"**Active:** {current_model_key}")
-        st.caption(f"Model: `{current_model}`")
-    with col3:
-        st.subheader("OCR Settings")
-        use_context_for_ocr = st.checkbox("Use book context", value=True, help="Better accuracy, higher cost")
-        st.caption("Context helps Claude understand symbols")
     st.markdown("---")
     # Data sources
-    st.header("📁 Step 3: Data Sources")
-    source_tabs = st.tabs([
-        "📂 Your Dataset Files",
-        "🌐 Public Datasets (GSM8K, MATH, etc.)"
-    ])
-    # ========================================================================
-    # SOURCE 1: HF Dataset Files
-    # ========================================================================
     with source_tabs[0]:
-        st.subheader("Files from Your HF Dataset")
         folder_type = st.radio(
-            "Select folder:",
-            ["📚 Books (Typed PDFs)", "📝 Exams (Typed PDFs)", "🖊️ Answers (Handwritten - needs OCR)"],
             horizontal=True
         )
-        # Determine folder path
         if "Books" in folder_type:
-            folder_path = "books/"
-            doc_type = "book"
         elif "Exams" in folder_type:
-            folder_path = "exams/"
-            doc_type = "exam"
         else:
-            folder_path = "answers/"
-            doc_type = "answer_handwritten"
-        # List files
-        if st.button(f"🔍 Scan {folder_path} folder"):
-            with st.spinner("Scanning dataset..."):
                 files = list_dataset_files(folder_path)
                 if files:
-                    # Check processing status for each file
                     file_status = []
                     for file in files:
-                        file_name = file.split('/')[-1]
-                        is_processed = check_if_processed(qdrant, file_name)
-                        file_status.append({
-                            "file": file,
-                            "name": file_name,
-                            "processed": is_processed
-                        })
                     st.session_state.current_files = file_status
                     st.session_state.current_folder = folder_path
                     st.session_state.current_doc_type = doc_type
                 else:
-                    st.warning(f"No files found in {folder_path}")
-        # Display files with status
         if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
-            st.write(f"**Found {len(st.session_state.current_files)} files:**")
-            # Summary
             processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
             pending_count = len(st.session_state.current_files) - processed_count
             col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Total", len(st.session_state.current_files))
-            with col2:
-                st.metric("✅ Processed", processed_count)
-            with col3:
-                st.metric("⏳ Pending", pending_count)
-            # File list with checkboxes
-            st.subheader("Select files to process:")
             selected_files = []
             for file_info in st.session_state.current_files:
-                col1, col2 = st.columns([3, 1])
-                with col1:
-                    # Only allow selection if not processed
-                    if file_info['processed']:
-                        st.checkbox(
-                            f"✅ {file_info['name']} (Already processed)",
-                            value=False,
-                            disabled=True,
-                            key=f"file_{file_info['name']}"
-                        )
-                    else:
-                        if st.checkbox(
-                            f"⏳ {file_info['name']}",
-                            value=True,  # Auto-select pending files
-                            key=f"file_{file_info['name']}"
-                        ):
-                            selected_files.append(file_info)
-                with col2:
-                    if file_info['processed']:
-                        st.caption("Skip")
-                    else:
-                        st.caption("Ready")
-            # Process button
             if selected_files:
                 st.markdown("---")
-                st.write(f"**Ready to process {len(selected_files)} file(s)**")
-                # Show cost estimate for OCR
                 if doc_type == "answer_handwritten":
-                    est_pages = len(selected_files) * 5  # Assume 5 pages per PDF
-                    est_cost = est_pages * 0.08
-                    st.warning(f"⚠️ OCR Cost Estimate: ~${est_cost:.2f} ({est_pages} pages × ~$0.08/page)")
-                if st.button(f"🚀 PROCESS SELECTED FILES", type="primary"):
-                    # Load embedding model
                     embedder = get_embedding_model(current_model)
-                    # Get context if needed
                     context_books = ""
-                    if doc_type == "answer_handwritten" and use_context_for_ocr:
                         try:
-                            book_samples = qdrant.scroll(
                                 collection_name=COLLECTION_NAME,
                                 limit=10,
                                 with_payload=True,
                                 with_vectors=False,
                                 scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
                             )
-                            if book_samples and book_samples[0]:
-                                context_books = "\n".join([p.payload['content'] for p in book_samples[0][:5]])
-                                st.info("✅ Using book context for OCR")
                         except:
-                            st.caption("No books in database - OCR will work but may be less accurate")
-                    # Process each selected file
                     total_tokens = 0
                     total_vectors = 0
                     for file_info in selected_files:
                         with st.expander(f"Processing {file_info['name']}", expanded=True):
                             try:
-                                # Download
                                 st.write("📥 Downloading...")
                                 local_path = download_from_dataset(file_info['file'])
                                 if not local_path:
-                                    st.error("Download failed")
                                     continue
-                                # Extract or OCR
                                 if doc_type == "answer_handwritten":
-                                    # OCR path
-                                    st.write("🖼️ Converting to images...")
                                     images = pdf_to_images(local_path)
                                     if not images:
-                                        st.error("Conversion failed - poppler-utils not installed?")
                                         continue
                                     st.write(f"✅ {len(images)} pages")
-                                    # OCR each page
-                                    transcribed_pages = []
-                                    page_tokens = 0
-                                    for page_num, image in enumerate(images, 1):
-                                        st.write(f"🤖 OCR page {page_num}/{len(images)}...")
-                                        transcription, tokens = ocr_with_claude(
-                                            claude,
-                                            image,
-                                            context=context_books
-                                        )
-                                        if transcription:
-                                            transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
-                                            page_tokens += tokens
-                                    if not transcribed_pages:
                                         st.error("OCR failed")
                                         continue
-                                    text = "\n\n".join(transcribed_pages)
-                                    total_tokens += page_tokens
-                                    st.success(f"✅ Transcribed {len(text):,} chars (${page_tokens * 0.000003:.3f})")
                                 else:
-                                    # Text extraction
-                                    st.write("📖 Extracting text...")
                                     text = extract_text_from_pdf(local_path)
                                     if not text:
-                                        st.error("Text extraction failed")
                                         continue
                                     st.write(f"✅ {len(text):,} chars")
-                                # Chunk
                                 chunks = chunk_text(text, chunk_size, chunk_overlap)
                                 st.write(f"✂️ {len(chunks)} chunks")
-                                # Embed
                                 st.write("🔢 Embedding...")
                                 embeddings = embedder.encode(chunks, show_progress_bar=False)
-                                # Upload
                                 points = []
                                 for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
                                     points.append(PointStruct(
@@ -618,92 +446,60 @@ with tab1:
                                             "content": chunk,
                                             "source_name": file_info['name'],
                                             "source_type": doc_type,
-                                            "chunk_index": i,
-                                            "embedding_model": current_model
                                         }
                                     ))
                                 qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
                                 total_vectors += len(points)
-                                st.success(f"✅ Uploaded {len(points)} vectors!")
                             except Exception as e:
                                 st.error(f"Error: {e}")
-                    # Summary
                     st.balloons()
-                    st.success(f"""
-                    🎉 Processing Complete!
-                    - Files processed: {len(selected_files)}
-                    - Vectors added: {total_vectors:,}
-                    - OCR tokens used: {total_tokens:,}
-                    - OCR cost: ${total_tokens * 0.000003:.2f}
-                    """)
-                    # Clear selection
                     st.session_state.pop('current_files', None)
                     st.rerun()
-    # ========================================================================
-    # SOURCE 2: Public Datasets
-    # ========================================================================
     with source_tabs[1]:
-        st.subheader("Public Math Datasets")
         dataset_choice = st.selectbox(
-            "Select dataset:",
-            [
-                "GSM8K - Grade School Math (8.5K problems)",
-                "MATH - Competition Math (12.5K problems)",
-                "MathQA - Math Word Problems (37K problems)"
-            ]
         )
-        sample_size = st.slider("Number of samples:", 10, 2000, 100)
-        # Check if already loaded
         dataset_name = dataset_choice.split(" - ")[0]
         already_loaded = check_if_processed(qdrant, dataset_name)
         if already_loaded:
-            st.success(f"✅ {dataset_name} already loaded!")
-            st.info("Vectors from this dataset are already in your database.")
         else:
-            if st.button(f"📥 Load {dataset_name}", type="primary"):
                 try:
                     from datasets import load_dataset
                     embedder = get_embedding_model(current_model)
-                    with st.spinner(f"Loading {dataset_name}..."):
                         if "GSM8K" in dataset_choice:
                             dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
                             texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
                                     for i in range(min(sample_size, len(dataset)))]
                         elif "MATH" in dataset_choice:
                             dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
                             texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
                                     for i in range(min(sample_size, len(dataset)))]
-                        else:  # MathQA
                             dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
                             texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
                                     for i in range(min(sample_size, len(dataset)))]
-                        st.write(f"✅ Loaded {len(texts)} problems")
-                        # Embed
-                        st.write("🔢 Embedding...")
                         embeddings = embedder.encode(texts, show_progress_bar=True)
-                        # Upload
                         points = []
                         for i, (text, emb) in enumerate(zip(texts, embeddings)):
                             points.append(PointStruct(
@@ -713,13 +509,12 @@ with tab1:
                                     "content": text[:2000],
                                     "source_name": dataset_name,
                                     "source_type": "public_dataset",
-                                    "index": i,
-                                    "embedding_model": current_model
                                 }
                             ))
                         qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
-                        st.success(f"✅ Uploaded {len(points)} vectors!")
                         st.balloons()
                 except Exception as e:
@@ -730,30 +525,20 @@ with tab1:
 # ============================================================================
 with tab2:
     st.title("🔍 Search & Solve")
     problem = st.text_area(
-        "Enter math problem:",
-        placeholder="Find the gradient of the loss function L(w) = (1/2)||Xw - y||²",
         height=150
     )
     col1, col2 = st.columns(2)
-    with col1:
-        top_k = st.slider("Retrieve top:", 3, 20, 5)
-    with col2:
-        detail = st.select_slider(
-            "Detail level:",
-            ["Concise", "Standard", "Detailed", "Exhaustive"],
-            value="Detailed"
-        )
     if st.button("🚀 SOLVE", type="primary") and problem:
-        # Get embedding model
         current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
         embedder = get_embedding_model(current_model)
@@ -764,7 +549,7 @@ with tab2:
                 results = qdrant.search(
                     collection_name=COLLECTION_NAME,
                     query_vector=query_emb.tolist(),
-                    limit=top_k
                 )
             except:
                 results = []
@@ -776,31 +561,31 @@ with tab2:
             with st.expander("📚 References"):
                 for i, r in enumerate(results, 1):
-                    st.markdown(f"**{i}.** ({r.score*100:.0f}% match)")
                     st.text(r.payload['content'][:200] + "...")
                     st.caption(f"Source: {r.payload.get('source_name')}")
-            with st.spinner("Generating solution..."):
                 context = "\n\n".join([r.payload['content'] for r in results])
-                prompt = f"""Solve this problem using references.
 PROBLEM: {problem}
 REFERENCES: {context}
-DETAIL: {detail}
 FORMAT:
 ## SOLUTION
 [Steps]
 ## REASONING
-[Why this approach]
 ## REFERENCES
-[Which sources helped]"""
                 try:
                     message = claude.messages.create(
@@ -813,7 +598,7 @@ FORMAT:
                     st.markdown(message.content[0].text)
                     st.download_button(
-                        "📥 Download Solution",
                         message.content[0].text,
                         file_name=f"solution_{int(time.time())}.md"
                     )
@@ -826,8 +611,7 @@ FORMAT:
 # ============================================================================
 with tab3:
-    st.title("📈 Statistics & Analytics")
     try:
         sample = qdrant.scroll(
@@ -847,20 +631,14 @@ with tab3:
                 sources.add(point.payload.get('source_name', 'Unknown'))
             col1, col2, col3 = st.columns(3)
-            with col1:
-                st.metric("Total Vectors", get_vector_count(qdrant))
-            with col2:
-                st.metric("Unique Sources", len(sources))
-            with col3:
-                st.metric("Document Types", len(types))
-            st.subheader("Distribution by Type")
             for doc_type, count in sorted(types.items()):
                 pct = count / sum(types.values()) * 100
-                st.progress(count / sum(types.values()), text=f"{doc_type}: {count} ({pct:.1f}%)")
             st.subheader("All Sources")
             for src in sorted(sources):
@@ -869,5 +647,4 @@ with tab3:
     except Exception as e:
         st.error(f"Error: {e}")
-st.sidebar.markdown("---")
-st.sidebar.caption("v2.0 - Production")

 from huggingface_hub import hf_hub_download, list_repo_files
 # ============================================================================
+# PRODUCTION MATH AI SYSTEM
 # ============================================================================
 # Browser-tab title, icon and wide layout for the whole app.
 st.set_page_config(
     page_title="Math AI System",
     page_icon="🎓",
     layout="wide"
 )
 # Qdrant collection that every tab reads from and writes to.
 COLLECTION_NAME = "math_knowledge_base"
 # Hugging Face dataset repo holding the PDFs. The default is a placeholder:
 # set the DATASET_REPO environment variable (like the other secrets read via
 # os.getenv in this file) or edit the fallback below to point at a real repo.
 DATASET_REPO = os.getenv("DATASET_REPO", "yourusername/math-ai-documents")
 # ============================================================================
+# EMBEDDING MODELS
 # ============================================================================
 EMBEDDING_MODELS = {
         "speed": "Medium",
         "quality": "Better"
     },
+    "MPNet (Best, 768D)": {
         "name": "sentence-transformers/all-mpnet-base-v2",
         "dimensions": 768,
         "speed": "Slower",
 @st.cache_resource
 def get_qdrant_client():
     return QdrantClient(
         url=os.getenv("QDRANT_URL"),
         api_key=os.getenv("QDRANT_API_KEY")
 @st.cache_resource
 def get_claude_client():
     """Create the Anthropic client from the ANTHROPIC_API_KEY environment variable.

     Wrapped in st.cache_resource, so the returned client object is cached by
     Streamlit rather than rebuilt on every call.
     """
     anthropic_key = os.getenv("ANTHROPIC_API_KEY")
     return Anthropic(api_key=anthropic_key)
 @st.cache_resource
 def get_embedding_model(model_name):
     """Instantiate the SentenceTransformer named by ``model_name``.

     Wrapped in st.cache_resource, so Streamlit caches the loaded model per
     distinct ``model_name`` argument.
     """
     loaded_model = SentenceTransformer(model_name)
     return loaded_model
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
 def check_if_processed(qdrant, file_name):
     """Report whether the collection already holds a point for ``file_name``.

     Asks Qdrant for at most one point in COLLECTION_NAME whose ``source_name``
     payload field equals ``file_name``.

     Args:
         qdrant: Client object exposing a ``scroll`` method.
         file_name: Value compared against the ``source_name`` payload field.

     Returns:
         True when the scroll result contains at least one point; False when
         it contains none or when the lookup raises (best-effort check).
     """
     source_filter = {
         "must": [{"key": "source_name", "match": {"value": file_name}}]
     }
     try:
         results = qdrant.scroll(
             collection_name=COLLECTION_NAME,
             scroll_filter=source_filter,
             limit=1,
             with_payload=True,
             with_vectors=False
         )
         # The first element of the scroll result is the batch of matching
         # points; any non-empty batch means the file is already stored.
         return bool(results and results[0])
     except Exception:
         # Best-effort: any ordinary failure is reported as "not processed".
         # Catching Exception (not a bare ``except:``) lets KeyboardInterrupt
         # and SystemExit propagate instead of being silently swallowed.
         return False
 def list_dataset_files(folder_path):
+    """List PDFs in HF Dataset folder"""
     try:
         hf_token = os.getenv("HF_TOKEN")
         all_files = list_repo_files(
             repo_type="dataset",
             token=hf_token
         )
+        return [f for f in all_files if f.startswith(folder_path) and f.endswith('.pdf')]
     except Exception as e:
+        st.error(f"Error listing: {e}")
         return []
 def download_from_dataset(file_path):
     """Fetch ``file_path`` from the DATASET_REPO dataset on the Hugging Face Hub.

     The HF_TOKEN environment variable is passed as the access token.

     Returns:
         Whatever hf_hub_download returns for the file, or None after a
         Streamlit error message has been shown because the download raised.
     """
     try:
         local_copy = hf_hub_download(
             repo_id=DATASET_REPO,
             filename=file_path,
             repo_type="dataset",
             token=os.getenv("HF_TOKEN"),
         )
     except Exception as err:
         st.error(f"Download error: {err}")
         return None
     return local_copy
 def extract_text_from_pdf(pdf_path):
+    """Extract text from PDF"""
     try:
         with open(pdf_path, 'rb') as file:
             reader = PyPDF2.PdfReader(file)
                 text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
             return text
     except Exception as e:
+        st.error(f"Extraction error: {e}")
         return None
 def pdf_to_images(pdf_path):
     """Rasterise the PDF at ``pdf_path`` with convert_from_path at 200 DPI.

     Returns:
         The result of convert_from_path, or an empty list after showing a
         Streamlit error plus a hint about the poppler-utils package when the
         conversion raises.
     """
     try:
         return convert_from_path(pdf_path, dpi=200)
     except Exception as err:
         st.error(f"Conversion error: {err}")
         st.info("💡 Add 'poppler-utils' to packages.txt")
     return []
 def resize_image(image, max_size=(2048, 2048)):
     """Shrink ``image`` with ``thumbnail`` (LANCZOS filter) to fit ``max_size``.

     ``thumbnail`` is applied to the object that was passed in and that same
     object is returned, so callers who need the original must pass a copy.
     """
     lanczos = Image.Resampling.LANCZOS
     image.thumbnail(max_size, lanczos)
     return image
 def image_to_base64(image):
     """Serialise ``image`` as PNG in memory and return it as a base64 string."""
     with BytesIO() as png_buffer:
         image.save(png_buffer, format="PNG")
         png_bytes = png_buffer.getvalue()
     encoded = base64.b64encode(png_bytes)
     return encoded.decode()
 def ocr_with_claude(claude_client, image, context=""):
     """Transcribe one page image of handwritten math via the Claude messages API.

     A copy of ``image`` is downscaled with resize_image, encoded as base64 PNG
     with image_to_base64 and sent together with a text prompt. At most the
     first 2000 characters of ``context`` are embedded in that prompt.

     Returns:
         (text, tokens): the text of the first content block of the reply and
         the sum of input and output token usage, or (None, 0) after showing a
         Streamlit error when the request or reply handling raises.
     """
     encoded_png = image_to_base64(resize_image(image.copy()))
     context_excerpt = context[:2000] if context else ""
     prompt = f"""Transcribe handwritten math.
 STYLE: Italian cursive
 LANGUAGE: English
 CONTEXT: {context_excerpt}
 OUTPUT: Transcription only."""
     image_part = {
         "type": "image",
         "source": {"type": "base64", "media_type": "image/png", "data": encoded_png},
     }
     text_part = {"type": "text", "text": prompt}
     try:
         reply = claude_client.messages.create(
             model="claude-sonnet-4-20250514",
             max_tokens=4000,
             messages=[{"role": "user", "content": [image_part, text_part]}],
         )
         transcription = reply.content[0].text
         tokens_used = reply.usage.input_tokens + reply.usage.output_tokens
         return transcription, tokens_used
     except Exception as e:
         st.error(f"OCR error: {e}")
         return None, 0
 def chunk_text(text, chunk_size=150, overlap=30):
+    """Split into chunks"""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
     return chunks
 def get_vector_count(qdrant):
+    """Get total vectors"""
     try:
         count = 0
         offset = None
         return 0
 # ============================================================================
+# INITIALIZE
 # ============================================================================
 try:
     claude = get_claude_client()
     st.sidebar.success("✅ System Ready")
 except Exception as e:
+    st.error(f"❌ Init failed: {e}")
+    st.info("Add secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
     st.stop()
 # ============================================================================
 # SIDEBAR
 # ============================================================================
+st.sidebar.title("🎓 Math AI")
+st.sidebar.caption("Production v2.0")
 # Sidebar summary: vector count plus a rough storage estimate.
 try:
     vector_count = get_vector_count(qdrant)
     st.sidebar.metric("Vectors", f"{vector_count:,}")
     # Estimate = vectors x 384 components x 4 bytes, shown in MiB.
     # NOTE(review): 384 matches only the 384-D models in EMBEDDING_MODELS;
     # a 768-D collection would be under-reported by half - confirm intent.
     st.sidebar.metric("Storage", f"{(vector_count * 384 * 4) / (1024 * 1024):.1f} MB")
 except Exception:
     # Catch ordinary failures only; a bare ``except:`` would also swallow
     # KeyboardInterrupt/SystemExit and other BaseException-based signals.
     st.sidebar.warning("DB unavailable")
 st.sidebar.markdown("---")
 # ============================================================================
+# TABS
 # ============================================================================
+tab1, tab2, tab3 = st.tabs(["📊 Dataset Manager", "🔍 Search & Solve", "📈 Statistics"])
 # ============================================================================
+# TAB 1: DATASET MANAGER
 # ============================================================================
 with tab1:
     st.title("📊 Dataset Manager")
     if not os.getenv("HF_TOKEN"):
+        st.error("⚠️ Add HF_TOKEN in Settings → Secrets")
         st.stop()
     # Collection setup
+    st.header("🏗️ Database Setup")
+    try:
+        collections = qdrant.get_collections().collections
+        exists = any(c.name == COLLECTION_NAME for c in collections)
+        if exists:
+            st.success(f"✅ Collection '{COLLECTION_NAME}' ready")
+        else:
+            st.warning("Collection doesn't exist")
+            selected_model = st.selectbox("Embedding model:", list(EMBEDDING_MODELS.keys()))
+            if st.button("🏗️ Create Collection"):
+                dimensions = EMBEDDING_MODELS[selected_model]["dimensions"]
+                qdrant.create_collection(
+                    collection_name=COLLECTION_NAME,
+                    vectors_config=VectorParams(size=dimensions, distance=Distance.COSINE)
                 )
+                st.success("Created!")
+                st.session_state.embedding_model = EMBEDDING_MODELS[selected_model]["name"]
+                st.rerun()
+    except Exception as e:
+        st.error(f"Error: {e}")
     st.markdown("---")
+    # Processing config
+    st.header("⚙️ Configuration")
+    col1, col2 = st.columns(2)
     with col1:
+        chunk_size = st.slider("Chunk size:", 50, 500, 150)
+        chunk_overlap = st.slider("Overlap:", 0, 100, 30)
     with col2:
         current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
+        st.info(f"**Active Model:**\n{current_model}")
+        use_context = st.checkbox("Use context for OCR", value=True)
     st.markdown("---")
     # Data sources
+    st.header("📁 Data Sources")
+    source_tabs = st.tabs(["📂 Your Files", "🌐 Public Datasets"])
     with source_tabs[0]:
         folder_type = st.radio(
+            "Folder:",
+            ["📚 Books", "📝 Exams", "🖊️ Answers (OCR)"],
             horizontal=True
         )
         if "Books" in folder_type:
+            folder_path, doc_type = "books/", "book"
         elif "Exams" in folder_type:
+            folder_path, doc_type = "exams/", "exam"
         else:
+            folder_path, doc_type = "answers/", "answer_handwritten"
+        if st.button(f"🔍 Scan {folder_path}"):
+            with st.spinner("Scanning..."):
                 files = list_dataset_files(folder_path)
                 if files:
                     file_status = []
                     for file in files:
+                        name = file.split('/')[-1]
+                        processed = check_if_processed(qdrant, name)
+                        file_status.append({"file": file, "name": name, "processed": processed})
                     st.session_state.current_files = file_status
                     st.session_state.current_folder = folder_path
                     st.session_state.current_doc_type = doc_type
                 else:
+                    st.warning("No files found")
         if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
             processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
             pending_count = len(st.session_state.current_files) - processed_count
             col1, col2, col3 = st.columns(3)
+            col1.metric("Total", len(st.session_state.current_files))
+            col2.metric("✅ Done", processed_count)
+            col3.metric("⏳ Pending", pending_count)
+            st.subheader("Select files:")
             selected_files = []
             for file_info in st.session_state.current_files:
+                if file_info['processed']:
+                    st.checkbox(f"✅ {file_info['name']}", value=False, disabled=True, key=f"f_{file_info['name']}")
+                else:
+                    if st.checkbox(f"⏳ {file_info['name']}", value=True, key=f"f_{file_info['name']}"):
+                        selected_files.append(file_info)
             if selected_files:
                 st.markdown("---")
                 if doc_type == "answer_handwritten":
+                    est_cost = len(selected_files) * 5 * 0.08
+                    st.warning(f"⚠️ OCR Cost: ~${est_cost:.2f}")
+                if st.button("🚀 PROCESS SELECTED", type="primary"):
                     embedder = get_embedding_model(current_model)
                     context_books = ""
+                    if doc_type == "answer_handwritten" and use_context:
                         try:
+                            samples = qdrant.scroll(
                                 collection_name=COLLECTION_NAME,
                                 limit=10,
                                 with_payload=True,
                                 with_vectors=False,
                                 scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
                             )
+                            if samples and samples[0]:
+                                context_books = "\n".join([p.payload['content'] for p in samples[0][:5]])
                         except:
+                            pass
                     total_tokens = 0
                     total_vectors = 0
                     for file_info in selected_files:
                         with st.expander(f"Processing {file_info['name']}", expanded=True):
                             try:
                                 st.write("📥 Downloading...")
                                 local_path = download_from_dataset(file_info['file'])
                                 if not local_path:
                                     continue
                                 if doc_type == "answer_handwritten":
+                                    st.write("🖼️ Converting...")
                                     images = pdf_to_images(local_path)
                                     if not images:
                                         continue
                                     st.write(f"✅ {len(images)} pages")
+                                    transcribed = []
+                                    tokens = 0
+                                    for i, img in enumerate(images, 1):
+                                        st.write(f"🤖 OCR {i}/{len(images)}...")
+                                        trans, tok = ocr_with_claude(claude, img, context_books)
+                                        if trans:
+                                            transcribed.append(f"\n=== Page {i} ===\n\n{trans}")
+                                            tokens += tok
+                                    if not transcribed:
                                         st.error("OCR failed")
                                         continue
+                                    text = "\n\n".join(transcribed)
+                                    total_tokens += tokens
+                                    st.success(f"✅ {len(text):,} chars (${tokens * 0.000003:.3f})")
                                 else:
+                                    st.write("📖 Extracting...")
                                     text = extract_text_from_pdf(local_path)
                                     if not text:
                                         continue
                                     st.write(f"✅ {len(text):,} chars")
                                 chunks = chunk_text(text, chunk_size, chunk_overlap)
                                 st.write(f"✂️ {len(chunks)} chunks")
                                 st.write("🔢 Embedding...")
                                 embeddings = embedder.encode(chunks, show_progress_bar=False)
                                 points = []
                                 for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
                                     points.append(PointStruct(
                                             "content": chunk,
                                             "source_name": file_info['name'],
                                             "source_type": doc_type,
+                                            "chunk_index": i
                                         }
                                     ))
                                 qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
                                 total_vectors += len(points)
+                                st.success(f"✅ {len(points)} vectors!")
                             except Exception as e:
                                 st.error(f"Error: {e}")
                     st.balloons()
+                    st.success(f"Done! {total_vectors:,} vectors | ${total_tokens * 0.000003:.2f}")
                     st.session_state.pop('current_files', None)
                     st.rerun()
     with source_tabs[1]:
         dataset_choice = st.selectbox(
+            "Dataset:",
+            ["GSM8K - Grade School Math", "MATH - Competition Math", "MathQA - Word Problems"]
         )
+        sample_size = st.slider("Samples:", 10, 2000, 100)
         dataset_name = dataset_choice.split(" - ")[0]
         already_loaded = check_if_processed(qdrant, dataset_name)
         if already_loaded:
+            st.success(f"✅ {dataset_name} loaded!")
         else:
+            if st.button(f"📥 Load {dataset_name}"):
                 try:
                     from datasets import load_dataset
                     embedder = get_embedding_model(current_model)
+                    with st.spinner("Loading..."):
                         if "GSM8K" in dataset_choice:
                             dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
                             texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
                                     for i in range(min(sample_size, len(dataset)))]
                         elif "MATH" in dataset_choice:
                             dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
                             texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
                                     for i in range(min(sample_size, len(dataset)))]
+                        else:
                             dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
                             texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
                                     for i in range(min(sample_size, len(dataset)))]
+                        st.write(f"✅ {len(texts)} problems")
                         embeddings = embedder.encode(texts, show_progress_bar=True)
                         points = []
                         for i, (text, emb) in enumerate(zip(texts, embeddings)):
                             points.append(PointStruct(
                                     "content": text[:2000],
                                     "source_name": dataset_name,
                                     "source_type": "public_dataset",
+                                    "index": i
                                 }
                             ))
                         qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
+                        st.success(f"✅ {len(points)} vectors!")
                         st.balloons()
                 except Exception as e:
 # ============================================================================
 with tab2:
     st.title("🔍 Search & Solve")
     problem = st.text_area(
+        "Problem:",
+        placeholder="Find gradient of L(w) = (1/2)||Xw - y||²",
         height=150
     )
     col1, col2 = st.columns(2)
+    col1.slider("Retrieve:", 3, 20, 5, key="top_k")
+    col2.select_slider("Detail:", ["Concise", "Standard", "Detailed", "Exhaustive"], value="Detailed", key="detail")
     if st.button("🚀 SOLVE", type="primary") and problem:
         current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
         embedder = get_embedding_model(current_model)
                 results = qdrant.search(
                     collection_name=COLLECTION_NAME,
                     query_vector=query_emb.tolist(),
+                    limit=st.session_state.top_k
                 )
             except:
                 results = []
             with st.expander("📚 References"):
                 for i, r in enumerate(results, 1):
+                    st.markdown(f"**{i}.** ({r.score*100:.0f}%)")
                     st.text(r.payload['content'][:200] + "...")
                     st.caption(f"Source: {r.payload.get('source_name')}")
+            with st.spinner("Generating..."):
                 context = "\n\n".join([r.payload['content'] for r in results])
+                prompt = f"""Solve using references.
 PROBLEM: {problem}
 REFERENCES: {context}
+DETAIL: {st.session_state.detail}
 FORMAT:
 ## SOLUTION
 [Steps]
 ## REASONING
+[Why]
 ## REFERENCES
+[Sources]"""
                 try:
                     message = claude.messages.create(
                     st.markdown(message.content[0].text)
                     st.download_button(
+                        "📥 Download",
                         message.content[0].text,
                         file_name=f"solution_{int(time.time())}.md"
                     )
 # ============================================================================
 with tab3:
+    st.title("📈 Statistics")
     try:
         sample = qdrant.scroll(
                 sources.add(point.payload.get('source_name', 'Unknown'))
             col1, col2, col3 = st.columns(3)
+            col1.metric("Vectors", get_vector_count(qdrant))
+            col2.metric("Sources", len(sources))
+            col3.metric("Types", len(types))
+            st.subheader("Distribution")
             for doc_type, count in sorted(types.items()):
                 pct = count / sum(types.values()) * 100
+                st.progress(count / sum(types.values()), text=f"{doc_type}: {count} ({pct:.0f}%)")
             st.subheader("All Sources")
             for src in sorted(sources):
     except Exception as e:
         st.error(f"Error: {e}")
+st.sidebar.caption("v2.0")