Hebaelsayed commited on
Commit
3fbb8fa
·
verified ·
1 Parent(s): 0872929

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +377 -96
src/streamlit_app.py CHANGED
@@ -90,6 +90,49 @@ def check_if_processed(qdrant, file_name):
90
  except:
91
  return False
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def list_dataset_files(folder_path):
94
  """List PDFs in HF Dataset folder"""
95
  try:
@@ -213,6 +256,17 @@ def get_vector_count(qdrant):
213
  except:
214
  return 0
215
 
 
 
 
 
 
 
 
 
 
 
 
216
  # ============================================================================
217
  # INITIALIZE
218
  # ============================================================================
@@ -235,8 +289,20 @@ st.sidebar.caption("Production v2.0")
235
 
236
  try:
237
  vector_count = get_vector_count(qdrant)
238
- st.sidebar.metric("Vectors", f"{vector_count:,}")
239
- st.sidebar.metric("Storage", f"{(vector_count * 384 * 4) / (1024 * 1024):.1f} MB")
 
 
 
 
 
 
 
 
 
 
 
 
240
  except:
241
  st.sidebar.warning("DB unavailable")
242
 
@@ -287,19 +353,58 @@ with tab1:
287
 
288
  st.markdown("---")
289
 
290
- # Processing config
291
  st.header("βš™οΈ Configuration")
292
 
293
- col1, col2 = st.columns(2)
294
 
295
- with col1:
296
- chunk_size = st.slider("Chunk size:", 50, 500, 150)
297
- chunk_overlap = st.slider("Overlap:", 0, 100, 30)
 
 
 
 
 
298
 
299
- with col2:
300
- current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
301
- st.info(f"**Active Model:**\n{current_model}")
302
- use_context = st.checkbox("Use context for OCR", value=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  st.markdown("---")
305
 
@@ -310,9 +415,10 @@ with tab1:
310
 
311
  with source_tabs[0]:
312
  folder_type = st.radio(
313
- "Folder:",
314
  ["πŸ“š Books", "πŸ“ Exams", "πŸ–ŠοΈ Answers (OCR)"],
315
- horizontal=True
 
316
  )
317
 
318
  if "Books" in folder_type:
@@ -322,8 +428,8 @@ with tab1:
322
  else:
323
  folder_path, doc_type = "answers/", "answer_handwritten"
324
 
325
- if st.button(f"πŸ” Scan {folder_path}"):
326
- with st.spinner("Scanning..."):
327
  files = list_dataset_files(folder_path)
328
 
329
  if files:
@@ -331,44 +437,122 @@ with tab1:
331
  for file in files:
332
  name = file.split('/')[-1]
333
  processed = check_if_processed(qdrant, name)
334
- file_status.append({"file": file, "name": name, "processed": processed})
 
 
 
 
 
 
335
 
336
  st.session_state.current_files = file_status
337
  st.session_state.current_folder = folder_path
338
  st.session_state.current_doc_type = doc_type
 
339
  else:
340
- st.warning("No files found")
341
 
 
342
  if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
343
 
344
  processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
345
  pending_count = len(st.session_state.current_files) - processed_count
 
346
 
347
- col1, col2, col3 = st.columns(3)
348
- col1.metric("Total", len(st.session_state.current_files))
349
- col2.metric("βœ… Done", processed_count)
350
- col3.metric("⏳ Pending", pending_count)
 
 
351
 
352
- st.subheader("Select files:")
 
353
 
 
354
  selected_files = []
355
  for file_info in st.session_state.current_files:
356
- if file_info['processed']:
357
- st.checkbox(f"βœ… {file_info['name']}", value=False, disabled=True, key=f"f_{file_info['name']}")
358
- else:
359
- if st.checkbox(f"⏳ {file_info['name']}", value=True, key=f"f_{file_info['name']}"):
360
- selected_files.append(file_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
 
362
  if selected_files:
363
  st.markdown("---")
 
364
 
365
- if doc_type == "answer_handwritten":
366
- est_cost = len(selected_files) * 5 * 0.08
367
- st.warning(f"⚠️ OCR Cost: ~${est_cost:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
- if st.button("πŸš€ PROCESS SELECTED", type="primary"):
 
370
 
371
- embedder = get_embedding_model(current_model)
 
372
 
373
  context_books = ""
374
  if doc_type == "answer_handwritten" and use_context:
@@ -387,56 +571,73 @@ with tab1:
387
 
388
  total_tokens = 0
389
  total_vectors = 0
 
390
 
391
- for file_info in selected_files:
392
- with st.expander(f"Processing {file_info['name']}", expanded=True):
 
 
 
 
 
 
 
 
 
393
  try:
394
  st.write("πŸ“₯ Downloading...")
395
  local_path = download_from_dataset(file_info['file'])
396
 
397
  if not local_path:
 
398
  continue
399
 
 
 
400
  if doc_type == "answer_handwritten":
401
- st.write("πŸ–ΌοΈ Converting...")
402
  images = pdf_to_images(local_path)
403
 
404
  if not images:
 
405
  continue
406
 
407
- st.write(f"βœ… {len(images)} pages")
408
 
409
  transcribed = []
410
  tokens = 0
411
 
412
  for i, img in enumerate(images, 1):
413
- st.write(f"πŸ€– OCR {i}/{len(images)}...")
414
  trans, tok = ocr_with_claude(claude, img, context_books)
415
  if trans:
416
  transcribed.append(f"\n=== Page {i} ===\n\n{trans}")
417
  tokens += tok
418
 
419
  if not transcribed:
420
- st.error("OCR failed")
421
  continue
422
 
423
  text = "\n\n".join(transcribed)
424
  total_tokens += tokens
425
- st.success(f"βœ… {len(text):,} chars (${tokens * 0.000003:.3f})")
426
 
427
  else:
428
- st.write("πŸ“– Extracting...")
429
  text = extract_text_from_pdf(local_path)
430
  if not text:
 
431
  continue
432
- st.write(f"βœ… {len(text):,} chars")
433
 
 
434
  chunks = chunk_text(text, chunk_size, chunk_overlap)
435
- st.write(f"βœ‚οΈ {len(chunks)} chunks")
436
 
437
- st.write("πŸ”’ Embedding...")
438
  embeddings = embedder.encode(chunks, show_progress_bar=False)
439
 
 
440
  points = []
441
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
442
  points.append(PointStruct(
@@ -452,37 +653,92 @@ with tab1:
452
 
453
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
454
  total_vectors += len(points)
455
- st.success(f"βœ… {len(points)} vectors!")
 
 
 
 
 
 
 
 
 
 
456
 
457
  except Exception as e:
458
- st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
459
 
460
  st.balloons()
461
- st.success(f"Done! {total_vectors:,} vectors | ${total_tokens * 0.000003:.2f}")
462
- st.session_state.pop('current_files', None)
463
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
  with source_tabs[1]:
466
  dataset_choice = st.selectbox(
467
- "Dataset:",
468
- ["GSM8K - Grade School Math", "MATH - Competition Math", "MathQA - Word Problems"]
 
469
  )
470
 
471
- sample_size = st.slider("Samples:", 10, 2000, 100)
472
 
473
  dataset_name = dataset_choice.split(" - ")[0]
474
  already_loaded = check_if_processed(qdrant, dataset_name)
475
 
476
  if already_loaded:
477
- st.success(f"βœ… {dataset_name} loaded!")
 
478
  else:
479
- if st.button(f"πŸ“₯ Load {dataset_name}"):
 
 
480
  try:
481
  from datasets import load_dataset
482
 
483
- embedder = get_embedding_model(current_model)
 
484
 
485
- with st.spinner("Loading..."):
486
  if "GSM8K" in dataset_choice:
487
  dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
488
  texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
@@ -496,10 +752,12 @@ with tab1:
496
  texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
497
  for i in range(min(sample_size, len(dataset)))]
498
 
499
- st.write(f"βœ… {len(texts)} problems")
500
 
 
501
  embeddings = embedder.encode(texts, show_progress_bar=True)
502
 
 
503
  points = []
504
  for i, (text, emb) in enumerate(zip(texts, embeddings)):
505
  points.append(PointStruct(
@@ -514,11 +772,11 @@ with tab1:
514
  ))
515
 
516
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
517
- st.success(f"βœ… {len(points)} vectors!")
518
  st.balloons()
519
 
520
  except Exception as e:
521
- st.error(f"Error: {e}")
522
 
523
  # ============================================================================
524
  # TAB 2: SEARCH & SOLVE
@@ -528,21 +786,22 @@ with tab2:
528
  st.title("πŸ” Search & Solve")
529
 
530
  problem = st.text_area(
531
- "Problem:",
532
  placeholder="Find gradient of L(w) = (1/2)||Xw - y||Β²",
533
- height=150
 
534
  )
535
 
536
  col1, col2 = st.columns(2)
537
- col1.slider("Retrieve:", 3, 20, 5, key="top_k")
538
- col2.select_slider("Detail:", ["Concise", "Standard", "Detailed", "Exhaustive"], value="Detailed", key="detail")
539
 
540
- if st.button("πŸš€ SOLVE", type="primary") and problem:
541
 
542
- current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
543
- embedder = get_embedding_model(current_model)
544
 
545
- with st.spinner("Searching..."):
546
  query_emb = embedder.encode(problem)
547
 
548
  try:
@@ -555,37 +814,41 @@ with tab2:
555
  results = []
556
 
557
  if not results:
558
- st.warning("No results. Load data in Dataset Manager.")
559
  else:
560
- st.success(f"Found {len(results)} references!")
561
 
562
- with st.expander("πŸ“š References"):
563
  for i, r in enumerate(results, 1):
564
- st.markdown(f"**{i}.** ({r.score*100:.0f}%)")
565
- st.text(r.payload['content'][:200] + "...")
566
- st.caption(f"Source: {r.payload.get('source_name')}")
 
567
 
568
- with st.spinner("Generating..."):
569
 
570
  context = "\n\n".join([r.payload['content'] for r in results])
571
 
572
- prompt = f"""Solve using references.
573
 
574
- PROBLEM: {problem}
 
575
 
576
- REFERENCES: {context}
 
577
 
578
- DETAIL: {st.session_state.detail}
 
 
579
 
580
- FORMAT:
581
  ## SOLUTION
582
- [Steps]
583
 
584
  ## REASONING
585
- [Why]
586
 
587
  ## REFERENCES
588
- [Sources]"""
589
 
590
  try:
591
  message = claude.messages.create(
@@ -595,25 +858,28 @@ FORMAT:
595
  )
596
 
597
  st.markdown("---")
 
598
  st.markdown(message.content[0].text)
599
 
600
  st.download_button(
601
- "πŸ“₯ Download",
602
  message.content[0].text,
603
- file_name=f"solution_{int(time.time())}.md"
 
604
  )
605
 
606
  except Exception as e:
607
- st.error(f"Error: {e}")
608
 
609
  # ============================================================================
610
  # TAB 3: STATISTICS
611
  # ============================================================================
612
 
613
  with tab3:
614
- st.title("πŸ“ˆ Statistics")
615
 
616
  try:
 
617
  sample = qdrant.scroll(
618
  collection_name=COLLECTION_NAME,
619
  limit=1000,
@@ -624,27 +890,42 @@ with tab3:
624
  if sample and sample[0]:
625
  types = {}
626
  sources = set()
 
627
 
628
  for point in sample[0]:
629
  src_type = point.payload.get('source_type', 'unknown')
 
 
630
  types[src_type] = types.get(src_type, 0) + 1
631
- sources.add(point.payload.get('source_name', 'Unknown'))
 
632
 
 
 
633
  col1, col2, col3 = st.columns(3)
634
- col1.metric("Vectors", get_vector_count(qdrant))
635
- col2.metric("Sources", len(sources))
636
- col3.metric("Types", len(types))
637
 
638
- st.subheader("Distribution")
639
- for doc_type, count in sorted(types.items()):
 
 
 
640
  pct = count / sum(types.values()) * 100
641
- st.progress(count / sum(types.values()), text=f"{doc_type}: {count} ({pct:.0f}%)")
 
 
642
 
643
- st.subheader("All Sources")
 
644
  for src in sorted(sources):
645
- st.caption(f"β€’ {src}")
 
 
 
646
 
647
  except Exception as e:
648
- st.error(f"Error: {e}")
649
 
650
- st.sidebar.caption("v2.0")
 
90
  except:
91
  return False
92
 
93
def get_file_vector_count(qdrant, file_name):
    """Return the number of vectors stored for a specific source file.

    Pages through the collection with ``scroll`` filtered on the
    ``source_name`` payload field, summing batch sizes until the
    pagination cursor is exhausted (or a safety cap is hit).

    Args:
        qdrant: Qdrant client instance.
        file_name: Value of the ``source_name`` payload field to match.

    Returns:
        int: Number of matching vectors; 0 on any error.
    """
    try:
        count = 0
        offset = None
        # Safety cap: at limit=100 per page this covers up to 10,000
        # vectors per file and guarantees termination on a bad cursor.
        for _ in range(100):
            results = qdrant.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter={
                    "must": [{"key": "source_name", "match": {"value": file_name}}]
                },
                limit=100,
                offset=offset,
                with_payload=False,
                with_vectors=False
            )
            if not results or not results[0]:
                break
            count += len(results[0])
            # results[1] is the next-page cursor; None means last page.
            offset = results[1]
            if offset is None:
                break
        return count
    except Exception:
        # Best-effort helper used only for UI metrics: was a bare
        # `except:` which also swallowed SystemExit/KeyboardInterrupt.
        return 0
118
+
119
def estimate_chunks(pdf_path, chunk_size, overlap):
    """Estimate the number of chunks a PDF will produce.

    Counts whitespace-separated words across all pages and models
    chunking as a sliding window of ``chunk_size`` words advancing by
    ``chunk_size - overlap`` words per step.

    Args:
        pdf_path: Path to the PDF file on disk.
        chunk_size: Chunk size in words.
        overlap: Overlap between consecutive chunks, in words.

    Returns:
        tuple[int, int]: ``(estimated_chunks, total_words)``;
        ``(0, 0)`` on any error.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_words = 0
            for page in reader.pages:
                # extract_text() can return None for image-only pages;
                # treat that as an empty page instead of crashing.
                text = page.extract_text() or ""
                total_words += len(text.split())

        # Step between chunk starts; guard against a non-positive step
        # (overlap >= chunk_size) which previously raised
        # ZeroDivisionError and was silently eaten by a bare except.
        effective_chunk_size = max(1, chunk_size - overlap)
        estimated_chunks = max(1, (total_words - chunk_size) // effective_chunk_size + 1)
        return estimated_chunks, total_words
    except Exception:
        # Estimation is advisory UI info only; fail soft.
        return 0, 0
135
+
136
  def list_dataset_files(folder_path):
137
  """List PDFs in HF Dataset folder"""
138
  try:
 
256
  except:
257
  return 0
258
 
259
+ # ============================================================================
260
+ # INITIALIZE SESSION STATE
261
+ # ============================================================================
262
+
263
+ if 'processing_complete' not in st.session_state:
264
+ st.session_state.processing_complete = False
265
+ if 'last_processed_files' not in st.session_state:
266
+ st.session_state.last_processed_files = []
267
+ if 'processing_stats' not in st.session_state:
268
+ st.session_state.processing_stats = {}
269
+
270
  # ============================================================================
271
  # INITIALIZE
272
  # ============================================================================
 
289
 
290
  try:
291
  vector_count = get_vector_count(qdrant)
292
+ st.sidebar.metric("πŸ“Š Total Vectors", f"{vector_count:,}")
293
+
294
+ # Get current embedding model
295
+ current_model_key = None
296
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
297
+ for key, value in EMBEDDING_MODELS.items():
298
+ if value["name"] == current_model_name:
299
+ current_model_key = key
300
+ break
301
+
302
+ if current_model_key:
303
+ dimensions = EMBEDDING_MODELS[current_model_key]["dimensions"]
304
+ storage_mb = (vector_count * dimensions * 4) / (1024 * 1024)
305
+ st.sidebar.metric("πŸ’Ύ Storage", f"{storage_mb:.1f} MB")
306
  except:
307
  st.sidebar.warning("DB unavailable")
308
 
 
353
 
354
  st.markdown("---")
355
 
356
+ # Processing configuration - ALWAYS VISIBLE
357
  st.header("βš™οΈ Configuration")
358
 
359
+ config_col1, config_col2 = st.columns(2)
360
 
361
+ with config_col1:
362
+ st.subheader("Chunking Settings")
363
+ chunk_size = st.slider("Chunk size (words):", 50, 500, 150, key="chunk_size_slider")
364
+ chunk_overlap = st.slider("Overlap (words):", 0, 100, 30, key="chunk_overlap_slider")
365
+
366
+ # Show effective chunk size
367
+ effective_size = chunk_size - chunk_overlap
368
+ st.caption(f"πŸ“ Effective chunk: {effective_size} words")
369
 
370
+ with config_col2:
371
+ st.subheader("Embedding Model")
372
+
373
+ # Get current model
374
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
375
+ current_model_key = None
376
+ for key, value in EMBEDDING_MODELS.items():
377
+ if value["name"] == current_model_name:
378
+ current_model_key = key
379
+ break
380
+
381
+ if not current_model_key:
382
+ current_model_key = "MiniLM-L6 (Fast, 384D)"
383
+
384
+ selected_embedding = st.selectbox(
385
+ "Select model:",
386
+ list(EMBEDDING_MODELS.keys()),
387
+ index=list(EMBEDDING_MODELS.keys()).index(current_model_key),
388
+ key="embedding_selector"
389
+ )
390
+
391
+ # Display model info
392
+ model_info = EMBEDDING_MODELS[selected_embedding]
393
+ st.info(f"""
394
+ **Active Model:** {selected_embedding}
395
+ - **Dimensions:** {model_info['dimensions']}D
396
+ - **Speed:** {model_info['speed']}
397
+ - **Quality:** {model_info['quality']}
398
+ """)
399
+
400
+ # Update session state
401
+ if st.session_state.embedding_model != model_info['name']:
402
+ if st.button("πŸ”„ Apply Model Change"):
403
+ st.session_state.embedding_model = model_info['name']
404
+ st.success("Model updated! New uploads will use this model.")
405
+ st.rerun()
406
+
407
+ use_context = st.checkbox("Use context for OCR", value=True, key="use_context_checkbox")
408
 
409
  st.markdown("---")
410
 
 
415
 
416
  with source_tabs[0]:
417
  folder_type = st.radio(
418
+ "Select folder type:",
419
  ["πŸ“š Books", "πŸ“ Exams", "πŸ–ŠοΈ Answers (OCR)"],
420
+ horizontal=True,
421
+ key="folder_type_radio"
422
  )
423
 
424
  if "Books" in folder_type:
 
428
  else:
429
  folder_path, doc_type = "answers/", "answer_handwritten"
430
 
431
+ if st.button(f"πŸ” Scan {folder_path}", key="scan_button"):
432
+ with st.spinner("Scanning HuggingFace dataset..."):
433
  files = list_dataset_files(folder_path)
434
 
435
  if files:
 
437
  for file in files:
438
  name = file.split('/')[-1]
439
  processed = check_if_processed(qdrant, name)
440
+ vector_count_file = get_file_vector_count(qdrant, name) if processed else 0
441
+ file_status.append({
442
+ "file": file,
443
+ "name": name,
444
+ "processed": processed,
445
+ "vectors": vector_count_file
446
+ })
447
 
448
  st.session_state.current_files = file_status
449
  st.session_state.current_folder = folder_path
450
  st.session_state.current_doc_type = doc_type
451
+ st.session_state.processing_complete = False
452
  else:
453
+ st.warning(f"No PDF files found in {folder_path}")
454
 
455
+ # Display files if scanned
456
  if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
457
 
458
  processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
459
  pending_count = len(st.session_state.current_files) - processed_count
460
+ total_vectors = sum(f['vectors'] for f in st.session_state.current_files)
461
 
462
+ # Summary metrics
463
+ metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
464
+ metric_col1.metric("πŸ“ Total Files", len(st.session_state.current_files))
465
+ metric_col2.metric("βœ… Processed", processed_count)
466
+ metric_col3.metric("⏳ Pending", pending_count)
467
+ metric_col4.metric("πŸ”’ Vectors", f"{total_vectors:,}")
468
 
469
+ st.markdown("---")
470
+ st.subheader("File Status & Selection")
471
 
472
+ # File selection with status
473
  selected_files = []
474
  for file_info in st.session_state.current_files:
475
+ col1, col2, col3 = st.columns([3, 1, 1])
476
+
477
+ with col1:
478
+ if file_info['processed']:
479
+ checkbox_label = f"βœ… {file_info['name']}"
480
+ is_selected = st.checkbox(
481
+ checkbox_label,
482
+ value=False,
483
+ disabled=True,
484
+ key=f"file_{file_info['name']}"
485
+ )
486
+ else:
487
+ checkbox_label = f"⏳ {file_info['name']}"
488
+ is_selected = st.checkbox(
489
+ checkbox_label,
490
+ value=True,
491
+ key=f"file_{file_info['name']}"
492
+ )
493
+ if is_selected:
494
+ selected_files.append(file_info)
495
+
496
+ with col2:
497
+ if file_info['processed']:
498
+ st.caption(f"πŸ”’ {file_info['vectors']} vectors")
499
+ else:
500
+ st.caption("Not uploaded")
501
+
502
+ with col3:
503
+ if file_info['processed']:
504
+ status_color = "🟒"
505
+ else:
506
+ status_color = "πŸ”΄"
507
+ st.caption(status_color)
508
 
509
+ # Sizing estimation for selected files
510
  if selected_files:
511
  st.markdown("---")
512
+ st.subheader("πŸ“Š Processing Preview")
513
 
514
+ # Download one file to estimate
515
+ sample_file = selected_files[0]
516
+ with st.spinner("Calculating estimates..."):
517
+ local_path = download_from_dataset(sample_file['file'])
518
+ if local_path:
519
+ est_chunks, est_words = estimate_chunks(local_path, chunk_size, chunk_overlap)
520
+
521
+ # Calculate totals
522
+ total_est_chunks = est_chunks * len(selected_files)
523
+ total_est_words = est_words * len(selected_files)
524
+
525
+ # Get embedding dimensions
526
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
527
+ dimensions = 384 # default
528
+ for key, value in EMBEDDING_MODELS.items():
529
+ if value["name"] == current_model_name:
530
+ dimensions = value["dimensions"]
531
+ break
532
+
533
+ est_storage_mb = (total_est_chunks * dimensions * 4) / (1024 * 1024)
534
+
535
+ # Display estimates
536
+ est_col1, est_col2, est_col3, est_col4 = st.columns(4)
537
+ est_col1.metric("πŸ“„ Files", len(selected_files))
538
+ est_col2.metric("πŸ“ Est. Words", f"{total_est_words:,}")
539
+ est_col3.metric("βœ‚οΈ Est. Chunks", f"{total_est_chunks:,}")
540
+ est_col4.metric("πŸ’Ύ Est. Storage", f"{est_storage_mb:.2f} MB")
541
+
542
+ # OCR cost estimation
543
+ if doc_type == "answer_handwritten":
544
+ # Estimate ~5 pages per exam, $0.08 per page
545
+ est_pages = len(selected_files) * 5
546
+ est_cost = est_pages * 0.08
547
+ st.warning(f"⚠️ **OCR Processing Cost Estimate:** ~${est_cost:.2f} ({est_pages} pages Γ— $0.08/page)")
548
+
549
+ st.markdown("---")
550
 
551
+ # Process button
552
+ if st.button("πŸš€ PROCESS SELECTED FILES", type="primary", key="process_button"):
553
 
554
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
555
+ embedder = get_embedding_model(current_model_name)
556
 
557
  context_books = ""
558
  if doc_type == "answer_handwritten" and use_context:
 
571
 
572
  total_tokens = 0
573
  total_vectors = 0
574
+ processing_stats = {}
575
 
576
+ # Create progress tracking
577
+ progress_bar = st.progress(0)
578
+ status_text = st.empty()
579
+
580
+ for idx, file_info in enumerate(selected_files):
581
+ # Update progress
582
+ progress = (idx) / len(selected_files)
583
+ progress_bar.progress(progress)
584
+ status_text.text(f"Processing {idx + 1}/{len(selected_files)}: {file_info['name']}")
585
+
586
+ with st.expander(f"πŸ“„ {file_info['name']}", expanded=True):
587
  try:
588
  st.write("πŸ“₯ Downloading...")
589
  local_path = download_from_dataset(file_info['file'])
590
 
591
  if not local_path:
592
+ st.error("❌ Download failed")
593
  continue
594
 
595
+ file_start_time = time.time()
596
+
597
  if doc_type == "answer_handwritten":
598
+ st.write("πŸ–ΌοΈ Converting to images...")
599
  images = pdf_to_images(local_path)
600
 
601
  if not images:
602
+ st.error("❌ Conversion failed")
603
  continue
604
 
605
+ st.write(f"βœ… Converted {len(images)} pages")
606
 
607
  transcribed = []
608
  tokens = 0
609
 
610
  for i, img in enumerate(images, 1):
611
+ st.write(f"πŸ€– OCR page {i}/{len(images)}...")
612
  trans, tok = ocr_with_claude(claude, img, context_books)
613
  if trans:
614
  transcribed.append(f"\n=== Page {i} ===\n\n{trans}")
615
  tokens += tok
616
 
617
  if not transcribed:
618
+ st.error("❌ OCR failed")
619
  continue
620
 
621
  text = "\n\n".join(transcribed)
622
  total_tokens += tokens
623
+ st.success(f"βœ… Transcribed {len(text):,} characters (Cost: ${tokens * 0.000003:.3f})")
624
 
625
  else:
626
+ st.write("πŸ“– Extracting text...")
627
  text = extract_text_from_pdf(local_path)
628
  if not text:
629
+ st.error("❌ Extraction failed")
630
  continue
631
+ st.write(f"βœ… Extracted {len(text):,} characters")
632
 
633
+ st.write("βœ‚οΈ Chunking text...")
634
  chunks = chunk_text(text, chunk_size, chunk_overlap)
635
+ st.write(f"βœ… Created {len(chunks)} chunks")
636
 
637
+ st.write("πŸ”’ Generating embeddings...")
638
  embeddings = embedder.encode(chunks, show_progress_bar=False)
639
 
640
+ st.write("πŸ’Ύ Uploading to vector database...")
641
  points = []
642
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
643
  points.append(PointStruct(
 
653
 
654
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
655
  total_vectors += len(points)
656
+
657
+ file_time = time.time() - file_start_time
658
+ st.success(f"βœ… Uploaded {len(points)} vectors in {file_time:.1f}s!")
659
+
660
+ # Store stats
661
+ processing_stats[file_info['name']] = {
662
+ 'vectors': len(points),
663
+ 'chunks': len(chunks),
664
+ 'time': file_time,
665
+ 'tokens': tokens if doc_type == "answer_handwritten" else 0
666
+ }
667
 
668
  except Exception as e:
669
+ st.error(f"❌ Error: {e}")
670
+
671
+ # Complete progress
672
+ progress_bar.progress(1.0)
673
+ status_text.text(f"βœ… Completed! Processed {len(selected_files)} files")
674
+
675
+ # Store results in session state
676
+ st.session_state.processing_complete = True
677
+ st.session_state.last_processed_files = selected_files
678
+ st.session_state.processing_stats = processing_stats
679
 
680
  st.balloons()
681
+
682
+ # Final summary (persistent)
683
+ st.markdown("---")
684
+ st.success(f"πŸŽ‰ **Processing Complete!**")
685
+
686
+ summary_col1, summary_col2, summary_col3, summary_col4 = st.columns(4)
687
+ summary_col1.metric("πŸ“ Files", len(selected_files))
688
+ summary_col2.metric("πŸ”’ Vectors", f"{total_vectors:,}")
689
+ if total_tokens > 0:
690
+ summary_col3.metric("πŸ’° Cost", f"${total_tokens * 0.000003:.2f}")
691
+ summary_col4.metric("βœ… Status", "Success")
692
+
693
+ # Show persistent results if processing was completed
694
+ elif st.session_state.processing_complete and st.session_state.processing_stats:
695
+ st.markdown("---")
696
+ st.info("ℹ️ Last processing session completed. Results shown below.")
697
+
698
+ st.subheader("πŸ“Š Processing Results")
699
+
700
+ total_vectors = sum(stat['vectors'] for stat in st.session_state.processing_stats.values())
701
+ total_tokens = sum(stat['tokens'] for stat in st.session_state.processing_stats.values())
702
+
703
+ result_col1, result_col2, result_col3, result_col4 = st.columns(4)
704
+ result_col1.metric("πŸ“ Files", len(st.session_state.processing_stats))
705
+ result_col2.metric("πŸ”’ Vectors", f"{total_vectors:,}")
706
+ if total_tokens > 0:
707
+ result_col3.metric("πŸ’° Cost", f"${total_tokens * 0.000003:.2f}")
708
+ result_col4.metric("βœ… Status", "Complete")
709
+
710
+ # Detailed breakdown
711
+ with st.expander("πŸ“‹ Detailed Breakdown"):
712
+ for filename, stats in st.session_state.processing_stats.items():
713
+ st.markdown(f"**{filename}**")
714
+ st.caption(f"Vectors: {stats['vectors']:,} | Chunks: {stats['chunks']} | Time: {stats['time']:.1f}s")
715
 
716
  with source_tabs[1]:
717
  dataset_choice = st.selectbox(
718
+ "Select public dataset:",
719
+ ["GSM8K - Grade School Math", "MATH - Competition Math", "MathQA - Word Problems"],
720
+ key="dataset_selector"
721
  )
722
 
723
+ sample_size = st.slider("Number of samples:", 10, 2000, 100, key="sample_size_slider")
724
 
725
  dataset_name = dataset_choice.split(" - ")[0]
726
  already_loaded = check_if_processed(qdrant, dataset_name)
727
 
728
  if already_loaded:
729
+ vectors_count = get_file_vector_count(qdrant, dataset_name)
730
+ st.success(f"βœ… **{dataset_name}** already loaded with {vectors_count:,} vectors!")
731
  else:
732
+ st.info(f"πŸ“₯ {dataset_name} not yet loaded")
733
+
734
+ if st.button(f"πŸ“₯ Load {dataset_name}", type="primary", key="load_dataset_button"):
735
  try:
736
  from datasets import load_dataset
737
 
738
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
739
+ embedder = get_embedding_model(current_model_name)
740
 
741
+ with st.spinner(f"Loading {dataset_name}..."):
742
  if "GSM8K" in dataset_choice:
743
  dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
744
  texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
 
752
  texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
753
  for i in range(min(sample_size, len(dataset)))]
754
 
755
+ st.write(f"βœ… Loaded {len(texts)} problems")
756
 
757
+ st.write("πŸ”’ Generating embeddings...")
758
  embeddings = embedder.encode(texts, show_progress_bar=True)
759
 
760
+ st.write("πŸ’Ύ Uploading to vector database...")
761
  points = []
762
  for i, (text, emb) in enumerate(zip(texts, embeddings)):
763
  points.append(PointStruct(
 
772
  ))
773
 
774
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
775
+ st.success(f"βœ… Uploaded {len(points)} vectors!")
776
  st.balloons()
777
 
778
  except Exception as e:
779
+ st.error(f"❌ Error: {e}")
780
 
781
  # ============================================================================
782
  # TAB 2: SEARCH & SOLVE
 
786
  st.title("πŸ” Search & Solve")
787
 
788
  problem = st.text_area(
789
+ "Enter your math problem:",
790
  placeholder="Find gradient of L(w) = (1/2)||Xw - y||Β²",
791
+ height=150,
792
+ key="problem_input"
793
  )
794
 
795
  col1, col2 = st.columns(2)
796
+ col1.slider("Retrieve top K:", 3, 20, 5, key="top_k")
797
+ col2.select_slider("Detail level:", ["Concise", "Standard", "Detailed", "Exhaustive"], value="Detailed", key="detail")
798
 
799
+ if st.button("πŸš€ SOLVE", type="primary", key="solve_button") and problem:
800
 
801
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
802
+ embedder = get_embedding_model(current_model_name)
803
 
804
+ with st.spinner("Searching knowledge base..."):
805
  query_emb = embedder.encode(problem)
806
 
807
  try:
 
814
  results = []
815
 
816
  if not results:
817
+ st.warning("⚠️ No results found. Please load data in Dataset Manager first.")
818
  else:
819
+ st.success(f"βœ… Found {len(results)} relevant references!")
820
 
821
+ with st.expander("πŸ“š Retrieved References", expanded=False):
822
  for i, r in enumerate(results, 1):
823
+ st.markdown(f"**Reference {i}** (Relevance: {r.score*100:.1f}%)")
824
+ st.text(r.payload['content'][:300] + "...")
825
+ st.caption(f"πŸ“ Source: {r.payload.get('source_name')} | Type: {r.payload.get('source_type')}")
826
+ st.markdown("---")
827
 
828
+ with st.spinner("Generating solution with Claude..."):
829
 
830
  context = "\n\n".join([r.payload['content'] for r in results])
831
 
832
+ prompt = f"""Solve the following math problem using the provided references.
833
 
834
+ PROBLEM:
835
+ {problem}
836
 
837
+ REFERENCES:
838
+ {context}
839
 
840
+ DETAIL LEVEL: {st.session_state.detail}
841
+
842
+ Please provide your response in the following format:
843
 
 
844
  ## SOLUTION
845
+ [Step-by-step solution]
846
 
847
  ## REASONING
848
+ [Explain why you solved it this way]
849
 
850
  ## REFERENCES
851
+ [Cite which sources you used]"""
852
 
853
  try:
854
  message = claude.messages.create(
 
858
  )
859
 
860
  st.markdown("---")
861
+ st.markdown("## πŸ“ Solution")
862
  st.markdown(message.content[0].text)
863
 
864
  st.download_button(
865
+ "πŸ“₯ Download Solution",
866
  message.content[0].text,
867
+ file_name=f"solution_{int(time.time())}.md",
868
+ mime="text/markdown"
869
  )
870
 
871
  except Exception as e:
872
+ st.error(f"❌ Error generating solution: {e}")
873
 
874
  # ============================================================================
875
  # TAB 3: STATISTICS
876
  # ============================================================================
877
 
878
  with tab3:
879
+ st.title("πŸ“ˆ Database Statistics")
880
 
881
  try:
882
+ # Get sample of all data
883
  sample = qdrant.scroll(
884
  collection_name=COLLECTION_NAME,
885
  limit=1000,
 
890
  if sample and sample[0]:
891
  types = {}
892
  sources = set()
893
+ source_vectors = {}
894
 
895
  for point in sample[0]:
896
  src_type = point.payload.get('source_type', 'unknown')
897
+ src_name = point.payload.get('source_name', 'Unknown')
898
+
899
  types[src_type] = types.get(src_type, 0) + 1
900
+ sources.add(src_name)
901
+ source_vectors[src_name] = source_vectors.get(src_name, 0) + 1
902
 
903
+ # Overall metrics
904
+ total_vectors = get_vector_count(qdrant)
905
  col1, col2, col3 = st.columns(3)
906
+ col1.metric("πŸ“Š Total Vectors", f"{total_vectors:,}")
907
+ col2.metric("πŸ“ Unique Sources", len(sources))
908
+ col3.metric("πŸ“‚ Document Types", len(types))
909
 
910
+ st.markdown("---")
911
+
912
+ # Distribution by type
913
+ st.subheader("πŸ“Š Distribution by Document Type")
914
+ for doc_type, count in sorted(types.items(), key=lambda x: x[1], reverse=True):
915
  pct = count / sum(types.values()) * 100
916
+ st.progress(count / sum(types.values()), text=f"{doc_type}: {count:,} vectors ({pct:.1f}%)")
917
+
918
+ st.markdown("---")
919
 
920
+ # All sources
921
+ st.subheader("πŸ“š All Data Sources")
922
  for src in sorted(sources):
923
+ vector_count = source_vectors.get(src, 0)
924
+ st.caption(f"β€’ **{src}** - {vector_count:,} vectors")
925
+ else:
926
+ st.info("πŸ“­ No data in database yet. Upload some files in the Dataset Manager!")
927
 
928
  except Exception as e:
929
+ st.error(f"❌ Error loading statistics: {e}")
930
 
931
+ st.sidebar.caption("Powered by Claude AI")