Hebaelsayed committed on
Commit
6d08aa2
·
verified ·
1 Parent(s): c89d7a4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +589 -465
src/streamlit_app.py CHANGED
@@ -1,553 +1,677 @@
1
  import streamlit as st
2
  import os
3
- from anthropic import Anthropic
4
- from qdrant_client import QdrantClient
5
- from qdrant_client.models import Distance, VectorParams, PointStruct
6
- from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # ============================================================================
9
  # CONFIGURATION
10
  # ============================================================================
11
 
12
  st.set_page_config(
13
- page_title="Math AI - Phase 2: Database",
14
  page_icon="πŸ—„οΈ",
15
  layout="wide"
16
  )
17
 
18
- COLLECTION_NAME = "math_knowledge_base"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # ============================================================================
21
- # CACHED FUNCTIONS - LOAD ONCE
22
  # ============================================================================
23
 
24
- @st.cache_resource(show_spinner="πŸ”Œ Connecting to Qdrant...")
25
- def get_qdrant_client():
26
- """Cache Qdrant client - only connects once"""
27
- qdrant_url = os.getenv("QDRANT_URL")
28
- qdrant_api_key = os.getenv("QDRANT_API_KEY")
29
-
30
- if not qdrant_url or not qdrant_api_key:
31
- return None
32
-
33
- return QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
34
 
35
- @st.cache_resource(show_spinner="πŸ€– Loading embedding model (first time: 30-60s)...")
36
- def get_embedding_model():
37
- """Cache embedding model - only loads once"""
38
- try:
39
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
40
- return model
41
- except Exception as e:
42
- st.error(f"Failed to load model: {e}")
43
- return None
44
 
45
- def get_vector_count_reliable(client, collection_name):
46
- """Get vector count with multiple fallback methods"""
 
 
47
  try:
48
- # Method 1: Try scroll to count
49
- scroll_result = client.scroll(
50
- collection_name=collection_name,
51
- limit=1,
52
- with_payload=False,
53
- with_vectors=False
54
  )
55
 
56
- # If scroll returns None, collection might be empty
57
- if scroll_result is None or scroll_result[0] is None:
58
- return 0
59
 
60
- # Method 2: Try collection info
61
- try:
62
- info = client.get_collection(collection_name)
63
-
64
- # Try different attribute names
65
- if hasattr(info, 'points_count') and info.points_count is not None:
66
- return info.points_count
67
- elif hasattr(info, 'vectors_count') and info.vectors_count is not None:
68
- return info.vectors_count
69
- except:
70
- pass
71
 
72
- # Method 3: Count by scrolling through all
73
- try:
74
- count = 0
75
- offset = None
76
- while True:
77
- result = client.scroll(
78
  collection_name=collection_name,
79
- limit=100,
80
- offset=offset,
81
  with_payload=False,
82
  with_vectors=False
83
  )
84
 
85
- if result is None or result[0] is None or len(result[0]) == 0:
86
- break
 
 
 
87
 
88
- count += len(result[0])
89
- offset = result[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
- if offset is None:
92
- break
93
-
94
- return count
95
- except:
96
- return 0
97
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  except Exception as e:
99
- st.warning(f"Could not get vector count: {e}")
100
- return 0
101
-
102
- def check_collection_exists(client, collection_name):
103
- """Check if collection exists"""
104
- try:
105
- collections = client.get_collections().collections
106
- return any(c.name == collection_name for c in collections)
107
- except:
108
- return False
109
 
110
  # ============================================================================
111
- # INITIALIZE SESSION STATE
112
  # ============================================================================
113
 
114
- if 'db_created' not in st.session_state:
115
- st.session_state.db_created = False
116
-
117
- if 'embedder_ready' not in st.session_state:
118
- st.session_state.embedder_ready = False
119
-
120
- if 'show_step' not in st.session_state:
121
- st.session_state.show_step = 'all'
122
 
123
- # ============================================================================
124
- # MAIN APP
125
- # ============================================================================
126
-
127
- st.title("πŸ—„οΈ Phase 2: Vector Database Setup")
128
-
129
- # Quick Navigation
130
- with st.sidebar:
131
- st.header("⚑ Quick Navigation")
132
- st.caption("Jump to specific steps (saves time!)")
133
-
134
- if st.button("πŸ“‹ Show All Steps", use_container_width=True):
135
- st.session_state.show_step = 'all'
136
 
137
- if st.button("πŸš€ Skip to Upload (Step 5)", use_container_width=True):
138
- st.session_state.show_step = 'upload'
139
 
140
- if st.button("πŸ” Skip to Search (Step 6)", use_container_width=True):
141
- st.session_state.show_step = 'search'
 
142
 
143
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
- # Status indicators
146
- st.subheader("πŸ“Š System Status")
147
- client = get_qdrant_client()
148
- embedder = get_embedding_model()
149
 
150
- if client and check_collection_exists(client, COLLECTION_NAME):
151
- st.success("βœ… Database Ready")
152
- st.session_state.db_created = True
153
- else:
154
- st.warning("⚠️ Database Not Ready")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- if embedder:
157
- st.success("βœ… Model Loaded")
158
- st.session_state.embedder_ready = True
159
- else:
160
- st.warning("⚠️ Model Not Loaded")
161
 
162
- # Vector count
163
- if client and st.session_state.db_created:
164
- count = get_vector_count_reliable(client, COLLECTION_NAME)
165
- st.metric("Vectors in DB", f"{count:,}")
166
-
167
- # Get cached resources
168
- client = get_qdrant_client()
169
- embedder = get_embedding_model()
170
-
171
- # ============================================================================
172
- # CONDITIONAL DISPLAY BASED ON show_step
173
- # ============================================================================
174
-
175
- show_all = st.session_state.show_step == 'all'
176
- show_upload = st.session_state.show_step in ['all', 'upload']
177
- show_search = st.session_state.show_step in ['all', 'search']
178
-
179
- # ============================================================================
180
- # STEP 1-2: Quick Status (Always Show)
181
- # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- if show_all:
184
- st.header("Step 1-2: System Check")
185
-
186
- col1, col2, col3 = st.columns(3)
187
-
188
- with col1:
189
- st.metric("Claude API", "βœ…" if os.getenv("ANTHROPIC_API_KEY") else "❌")
190
 
191
- with col2:
192
- st.metric("Qdrant", "βœ… Connected" if client else "❌")
 
193
 
194
- with col3:
195
- st.metric("Embedder", "βœ… Cached" if embedder else "❌")
196
-
197
- if not client:
198
- st.error("⚠️ Qdrant connection failed. Check secrets!")
199
- st.stop()
200
-
201
- st.markdown("---")
202
-
203
- # ============================================================================
204
- # STEP 3: Collection Management
205
- # ============================================================================
206
-
207
- if show_all:
208
- st.header("πŸ—οΈ Step 3: Database Collection")
209
-
210
- if st.session_state.db_created:
211
- st.success(f"βœ… Collection '{COLLECTION_NAME}' is ready!")
212
 
213
  col1, col2 = st.columns(2)
 
214
  with col1:
215
- if st.button("πŸ”„ Recreate Collection"):
216
- try:
217
- client.delete_collection(COLLECTION_NAME)
218
- st.session_state.db_created = False
219
- st.rerun()
220
- except Exception as e:
221
- st.error(f"Error: {e}")
 
 
 
 
 
 
222
 
223
  with col2:
224
- if st.button("ℹ️ Collection Info"):
225
- count = get_vector_count_reliable(client, COLLECTION_NAME)
226
- st.json({
227
- "name": COLLECTION_NAME,
228
- "vectors": count,
229
- "status": "Ready"
230
- })
 
 
 
 
 
 
 
 
 
231
 
232
- else:
233
- if st.button("πŸ—οΈ CREATE COLLECTION", type="primary"):
234
- try:
235
- client.create_collection(
236
- collection_name=COLLECTION_NAME,
237
- vectors_config=VectorParams(size=384, distance=Distance.COSINE)
238
- )
239
- st.success(f"πŸŽ‰ Created: {COLLECTION_NAME}")
240
- st.session_state.db_created = True
241
- st.rerun()
242
- except Exception as e:
243
- st.error(f"❌ Failed: {str(e)}")
244
 
245
  st.markdown("---")
246
-
247
- # ============================================================================
248
- # STEP 4: Embedding Model (Already Loaded via Cache)
249
- # ============================================================================
250
-
251
- if show_all:
252
- st.header("πŸ€– Step 4: Embedding Model")
253
 
254
- if embedder:
255
- st.success("βœ… Model loaded and cached!")
256
- st.session_state.embedder_ready = True
257
- else:
258
- st.warning("⚠️ Model loading failed. Try refreshing page.")
259
 
260
- st.markdown("---")
 
 
261
 
262
  # ============================================================================
263
- # STEP 5A: Upload Custom Text
264
  # ============================================================================
265
 
266
- if show_upload:
267
- st.header("πŸ“ Step 5A: Upload Custom Math Notes")
268
 
269
- if not st.session_state.db_created or not st.session_state.embedder_ready:
270
- st.error("⚠️ Complete Steps 3 & 4 first (or check sidebar status)")
271
- else:
272
- with st.expander("✍️ Paste text to upload", expanded=True):
273
-
274
- custom_text = st.text_area(
275
- "Math notes:",
276
- value="""Linear Equations: ax + b = 0, solution is x = -b/a
277
-
278
- Quadratic Equations: axΒ² + bx + c = 0
279
- Solution: x = (-b ± √(b²-4ac)) / 2a
280
- Example: xΒ² + 5x - 4 = 0
281
- x = (-5 ± √(25+16)) / 2
282
- x = (-5 ± √41) / 2
283
-
284
- Pythagorean Theorem: aΒ² + bΒ² = cΒ²
285
- For right triangles with sides a, b and hypotenuse c
286
-
287
- Derivatives:
288
- d/dx(xⁿ) = nxⁿ⁻¹
289
- d/dx(sin x) = cos x
290
- d/dx(cos x) = -sin x
291
- d/dx(eΛ£) = eΛ£""",
292
- height=200
293
- )
294
 
295
- source_name = st.text_input("Source name:", value="math_notes.txt")
296
 
297
- if st.button("πŸš€ UPLOAD TEXT", type="primary"):
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
- if not custom_text.strip():
300
- st.error("Please enter text!")
301
- else:
302
- try:
303
- progress = st.progress(0)
304
- status = st.empty()
305
-
306
- # Chunk
307
- status.text("πŸ“„ Chunking text...")
308
- progress.progress(0.2)
309
-
310
- words = custom_text.split()
311
- chunks = []
312
- chunk_size = 50
313
-
314
- for i in range(0, len(words), 40):
315
- chunk = ' '.join(words[i:i + chunk_size])
316
- if chunk.strip():
317
- chunks.append(chunk)
318
-
319
- st.write(f"βœ… Created {len(chunks)} chunks")
320
-
321
- # Embed
322
- status.text("πŸ”’ Generating embeddings...")
323
- progress.progress(0.5)
324
-
325
- embeddings = embedder.encode(chunks, show_progress_bar=False)
326
- st.write(f"βœ… Generated {len(embeddings)} embeddings")
327
-
328
- # Upload
329
- status.text("☁️ Uploading to Qdrant...")
330
- progress.progress(0.8)
331
-
332
- points = []
333
- for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
334
- points.append(PointStruct(
335
- id=abs(hash(f"{source_name}_{idx}_{custom_text[:50]}")) % (2**63),
336
- vector=embedding.tolist(),
337
- payload={
338
- "content": chunk,
339
- "source_name": source_name,
340
- "source_type": "custom_notes",
341
- "chunk_index": idx
342
- }
343
- ))
344
-
345
- client.upsert(collection_name=COLLECTION_NAME, points=points)
346
-
347
- progress.progress(1.0)
348
- status.empty()
349
-
350
- st.success(f"πŸŽ‰ Uploaded {len(points)} vectors!")
351
-
352
- # Get count
353
- count = get_vector_count_reliable(client, COLLECTION_NAME)
354
- st.info(f"πŸ“Š **Total vectors in database: {count:,}**")
355
-
356
- except Exception as e:
357
- st.error(f"❌ Failed: {str(e)}")
358
- st.exception(e)
359
-
360
- st.markdown("---")
361
-
362
- # ============================================================================
363
- # STEP 5B: Load Public Datasets
364
- # ============================================================================
365
-
366
- if show_upload:
367
- st.header("πŸ“š Step 5B: Load Public Datasets")
368
-
369
- if not st.session_state.db_created or not st.session_state.embedder_ready:
370
- st.error("⚠️ Complete Steps 3 & 4 first")
371
- else:
372
- with st.expander("πŸ“Š Load from Hugging Face", expanded=False):
373
-
374
- dataset_choice = st.selectbox(
375
- "Dataset:",
376
- [
377
- "GSM8K - Grade School Math",
378
- "MATH - Competition Math",
379
- "RACE - Reading Comprehension"
380
  ]
381
- )
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
- sample_size = st.slider("Items to load:", 10, 500, 50)
 
 
384
 
385
- if st.button("πŸ“₯ LOAD DATASET", type="primary"):
 
 
 
 
 
 
 
 
386
 
387
- try:
388
- from datasets import load_dataset
389
-
390
- progress = st.progress(0)
391
- status = st.empty()
392
-
393
- # Load dataset
394
- status.text(f"πŸ“₯ Downloading {dataset_choice.split('-')[0].strip()}...")
395
- progress.progress(0.1)
396
-
397
- if "GSM8K" in dataset_choice:
398
- dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
399
- dataset_name = "GSM8K"
400
-
401
- texts = []
402
- for i in range(min(sample_size, len(dataset))):
403
- item = dataset[i]
404
- text = f"Problem: {item['question']}\n\nSolution: {item['answer']}"
405
- texts.append(text)
406
-
407
- elif "MATH" in dataset_choice:
408
- dataset = load_dataset("hendrycks/competition_math", split="train", trust_remote_code=True)
409
- dataset_name = "MATH"
410
-
411
- texts = []
412
- for i in range(min(sample_size, len(dataset))):
413
- item = dataset[i]
414
- text = f"Problem ({item['type']}): {item['problem']}\n\nSolution: {item['solution']}"
415
- texts.append(text)
416
-
417
- else:
418
- dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
419
- dataset_name = "RACE"
420
-
421
- texts = []
422
- for i in range(min(sample_size, len(dataset))):
423
- item = dataset[i]
424
- text = f"Article: {item['article'][:500]}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}"
425
- texts.append(text)
426
-
427
- st.write(f"βœ… Loaded {len(texts)} items")
428
- progress.progress(0.3)
429
-
430
- # Embed
431
- status.text("πŸ”’ Generating embeddings...")
432
- embeddings = []
433
-
434
- for idx, text in enumerate(texts):
435
- embedding = embedder.encode(text)
436
- embeddings.append(embedding)
437
-
438
- if idx % 10 == 0:
439
- progress.progress(0.3 + (0.5 * idx / len(texts)))
440
- status.text(f"πŸ”’ Embedding {idx+1}/{len(texts)}")
441
-
442
- st.write(f"βœ… Generated {len(embeddings)} embeddings")
443
- progress.progress(0.8)
444
-
445
- # Upload
446
- status.text("☁️ Uploading to Qdrant...")
447
-
448
- points = []
449
- for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
450
- content = text[:2000] if len(text) > 2000 else text
451
-
452
- points.append(PointStruct(
453
- id=abs(hash(f"{dataset_name}_{idx}")) % (2**63),
454
- vector=embedding.tolist(),
455
- payload={
456
- "content": content,
457
- "source_name": dataset_name,
458
- "source_type": "public_dataset",
459
- "dataset": dataset_name,
460
- "index": idx
461
- }
462
- ))
463
-
464
- client.upsert(collection_name=COLLECTION_NAME, points=points)
465
-
466
- progress.progress(1.0)
467
- status.empty()
468
-
469
- st.success(f"πŸŽ‰ Uploaded {len(points)} vectors from {dataset_name}!")
470
-
471
- # Get count
472
- count = get_vector_count_reliable(client, COLLECTION_NAME)
473
- st.info(f"πŸ“Š **Total vectors in database: {count:,}**")
474
-
475
- except ImportError:
476
- st.error("❌ Add 'datasets' to requirements.txt")
477
- except Exception as e:
478
- st.error(f"❌ Failed: {str(e)}")
479
- st.exception(e)
480
 
481
- st.markdown("---")
 
482
 
483
  # ============================================================================
484
- # STEP 6: Search
485
  # ============================================================================
486
 
487
- if show_search:
488
- st.header("πŸ” Step 6: Test Search")
489
 
490
- if not st.session_state.db_created or not st.session_state.embedder_ready:
491
- st.error("⚠️ Database and embedder must be ready")
492
- else:
493
- search_query = st.text_input(
494
- "Question:",
495
- placeholder="Solve xΒ² + 5x - 4 = 0"
 
496
  )
497
 
498
- col1, col2 = st.columns([3, 1])
499
- with col1:
500
- top_k = st.slider("Results:", 1, 10, 5)
501
 
502
- with col2:
503
- st.metric("DB Vectors", get_vector_count_reliable(client, COLLECTION_NAME))
 
504
 
505
- if st.button("πŸ” SEARCH", type="primary") and search_query:
 
506
 
507
- try:
508
- with st.spinner("Searching..."):
509
-
510
- query_embedding = embedder.encode(search_query)
 
 
 
 
511
 
512
- results = client.search(
513
- collection_name=COLLECTION_NAME,
514
- query_vector=query_embedding.tolist(),
515
- limit=top_k
 
 
516
  )
517
 
518
- if results:
519
- st.success(f"βœ… Found {len(results)} results!")
520
-
521
- for i, result in enumerate(results, 1):
522
- similarity_pct = result.score * 100
523
-
524
- # Color code by relevance
525
- if similarity_pct > 50:
526
- color = "🟒"
527
- elif similarity_pct > 30:
528
- color = "🟑"
529
- else:
530
- color = "πŸ”΄"
531
-
532
- with st.expander(f"{color} Result {i} - {similarity_pct:.1f}% match", expanded=(i<=2)):
533
- st.info(result.payload['content'])
534
-
535
- col1, col2, col3 = st.columns(3)
536
- with col1:
537
- st.caption(f"**Source:** {result.payload['source_name']}")
538
- with col2:
539
- st.caption(f"**Type:** {result.payload['source_type']}")
540
- with col3:
541
- st.caption(f"**Score:** {result.score:.4f}")
542
- else:
543
- st.warning("No results found!")
544
-
545
- except Exception as e:
546
- st.error(f"❌ Search failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
  # ============================================================================
549
  # FOOTER
550
  # ============================================================================
551
 
552
  st.markdown("---")
553
- st.success("πŸŽ‰ Phase 2 Complete! Ready for Phase 3: PDF Upload + Full RAG with Claude")
 
1
  import streamlit as st
2
  import os
3
+ import sys
4
+
5
+ # ============================================================================
6
+ # LAZY IMPORTS - Only import when needed!
7
+ # ============================================================================
8
+
9
+ @st.cache_resource
10
+ def lazy_import_qdrant():
11
+ """Import Qdrant only when needed"""
12
+ from qdrant_client import QdrantClient
13
+ from qdrant_client.models import Distance, VectorParams, PointStruct
14
+ return QdrantClient, Distance, VectorParams, PointStruct
15
+
16
+ @st.cache_resource
17
+ def lazy_import_embedder():
18
+ """Import sentence transformers only when needed"""
19
+ from sentence_transformers import SentenceTransformer
20
+ return SentenceTransformer
21
+
22
+ @st.cache_resource
23
+ def lazy_import_datasets():
24
+ """Import datasets only when needed"""
25
+ from datasets import load_dataset
26
+ return load_dataset
27
 
28
  # ============================================================================
29
  # CONFIGURATION
30
  # ============================================================================
31
 
32
  st.set_page_config(
33
+ page_title="Math AI - Database Dashboard",
34
  page_icon="πŸ—„οΈ",
35
  layout="wide"
36
  )
37
 
38
+ # ============================================================================
39
+ # DATABASE CONFIGURATION SETTINGS
40
+ # ============================================================================
41
+
42
+ if 'db_config' not in st.session_state:
43
+ st.session_state.db_config = {
44
+ 'collection_name': 'math_knowledge_base',
45
+ 'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
46
+ 'embedding_dimensions': 384,
47
+ 'chunk_size': 500,
48
+ 'chunk_overlap': 50,
49
+ 'similarity_metric': 'COSINE',
50
+ 'max_chunk_tokens': 8192,
51
+ 'tokenizer': 'whitespace'
52
+ }
53
+
54
+ if 'db_created' not in st.session_state:
55
+ st.session_state.db_created = False
56
+
57
+ if 'embedder_loaded' not in st.session_state:
58
+ st.session_state.embedder_loaded = False
59
 
60
  # ============================================================================
61
+ # HEADER
62
  # ============================================================================
63
 
64
+ st.title("πŸ—„οΈ Vector Database Configuration & Analytics")
65
+ st.markdown("**Complete database setup with full visibility and control**")
 
 
 
 
 
 
 
 
66
 
67
+ # ============================================================================
68
+ # SIDEBAR: QUICK STATS
69
+ # ============================================================================
 
 
 
 
 
 
70
 
71
+ with st.sidebar:
72
+ st.header("πŸ“Š Database Stats")
73
+
74
+ # Try to connect and get stats
75
  try:
76
+ QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
77
+ client = QdrantClient(
78
+ url=os.getenv("QDRANT_URL"),
79
+ api_key=os.getenv("QDRANT_API_KEY")
 
 
80
  )
81
 
82
+ collection_name = st.session_state.db_config['collection_name']
 
 
83
 
84
+ # Check if collection exists
85
+ collections = client.get_collections().collections
86
+ exists = any(c.name == collection_name for c in collections)
 
 
 
 
 
 
 
 
87
 
88
+ if exists:
89
+ st.success("βœ… Database Online")
90
+
91
+ # Get vector count
92
+ try:
93
+ scroll_result = client.scroll(
94
  collection_name=collection_name,
95
+ limit=1,
 
96
  with_payload=False,
97
  with_vectors=False
98
  )
99
 
100
+ # Try multiple ways to get count
101
+ count = 0
102
+ offset = None
103
+ max_iterations = 1000
104
+ iteration = 0
105
 
106
+ while iteration < max_iterations:
107
+ result = client.scroll(
108
+ collection_name=collection_name,
109
+ limit=100,
110
+ offset=offset,
111
+ with_payload=False,
112
+ with_vectors=False
113
+ )
114
+
115
+ if result is None or result[0] is None or len(result[0]) == 0:
116
+ break
117
+
118
+ count += len(result[0])
119
+ offset = result[1]
120
+ iteration += 1
121
+
122
+ if offset is None:
123
+ break
124
 
125
+ st.metric("Total Vectors", f"{count:,}")
126
+
127
+ # Calculate approximate storage size
128
+ vector_dim = st.session_state.db_config['embedding_dimensions']
129
+ bytes_per_float = 4
130
+ metadata_overhead = 100 # bytes per vector for metadata
131
+
132
+ vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
133
+ metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
134
+ total_size_mb = vector_size_mb + metadata_size_mb
135
+
136
+ st.metric("Storage Used", f"{total_size_mb:.2f} MB")
137
+ st.caption(f"Vectors: {vector_size_mb:.2f} MB")
138
+ st.caption(f"Metadata: {metadata_size_mb:.2f} MB")
139
+
140
+ # Calculate storage capacity
141
+ free_tier_gb = 1.0
142
+ used_gb = total_size_mb / 1024
143
+ remaining_gb = free_tier_gb - used_gb
144
+ usage_pct = (used_gb / free_tier_gb) * 100
145
+
146
+ st.metric("Free Tier Usage", f"{usage_pct:.1f}%")
147
+ st.progress(min(usage_pct / 100, 1.0))
148
+ st.caption(f"Remaining: {remaining_gb:.3f} GB")
149
+
150
+ except Exception as e:
151
+ st.error(f"Stats error: {e}")
152
+ else:
153
+ st.warning("⚠️ Database Not Created")
154
+
155
  except Exception as e:
156
+ st.error("❌ Connection Failed")
157
+ st.caption(str(e)[:50])
 
 
 
 
 
 
 
 
158
 
159
  # ============================================================================
160
+ # TAB 1: DATABASE CONFIGURATION
161
  # ============================================================================
162
 
163
+ tab1, tab2, tab3, tab4 = st.tabs([
164
+ "βš™οΈ Configuration",
165
+ "πŸ“Š Analytics",
166
+ "πŸ”§ Management",
167
+ "πŸ“š Data Upload"
168
+ ])
 
 
169
 
170
+ with tab1:
171
+ st.header("βš™οΈ Database Configuration")
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ st.info("**Configure your vector database parameters before creation**")
 
174
 
175
+ # ========================================================================
176
+ # SECTION 1: COLLECTION SETTINGS
177
+ # ========================================================================
178
 
179
+ with st.expander("πŸ—„οΈ Collection Settings", expanded=True):
180
+
181
+ col1, col2 = st.columns(2)
182
+
183
+ with col1:
184
+ collection_name = st.text_input(
185
+ "Collection Name",
186
+ value=st.session_state.db_config['collection_name'],
187
+ help="Name of your vector database collection"
188
+ )
189
+ st.session_state.db_config['collection_name'] = collection_name
190
+
191
+ with col2:
192
+ similarity_options = {
193
+ 'COSINE': 'Cosine Similarity (Best for text, -1 to 1)',
194
+ 'EUCLIDEAN': 'Euclidean Distance (L2 norm)',
195
+ 'DOT': 'Dot Product (Fast, unnormalized)'
196
+ }
197
+
198
+ similarity_metric = st.selectbox(
199
+ "Similarity Metric",
200
+ options=list(similarity_options.keys()),
201
+ index=0,
202
+ help="How to measure similarity between vectors",
203
+ format_func=lambda x: similarity_options[x]
204
+ )
205
+ st.session_state.db_config['similarity_metric'] = similarity_metric
206
+
207
+ # Explanation
208
+ st.caption("""
209
+ **Cosine Similarity**: Measures angle between vectors (best for text)
210
+ **Euclidean**: Measures distance in space (sensitive to magnitude)
211
+ **Dot Product**: Fast but requires normalized vectors
212
+ """)
213
 
214
+ # ========================================================================
215
+ # SECTION 2: EMBEDDING MODEL
216
+ # ========================================================================
 
217
 
218
+ with st.expander("πŸ€– Embedding Model Configuration", expanded=True):
219
+
220
+ embedding_models = {
221
+ 'sentence-transformers/all-MiniLM-L6-v2': {
222
+ 'name': 'all-MiniLM-L6-v2 (Recommended)',
223
+ 'dimensions': 384,
224
+ 'size': '90 MB',
225
+ 'speed': 'Fast',
226
+ 'quality': 'Good',
227
+ 'description': 'Best balance of speed and quality for math content'
228
+ },
229
+ 'sentence-transformers/all-mpnet-base-v2': {
230
+ 'name': 'all-mpnet-base-v2 (High Quality)',
231
+ 'dimensions': 768,
232
+ 'size': '420 MB',
233
+ 'speed': 'Medium',
234
+ 'quality': 'Excellent',
235
+ 'description': 'Higher quality embeddings, slower inference'
236
+ },
237
+ 'sentence-transformers/all-MiniLM-L12-v2': {
238
+ 'name': 'all-MiniLM-L12-v2 (Balanced)',
239
+ 'dimensions': 384,
240
+ 'size': '120 MB',
241
+ 'speed': 'Medium',
242
+ 'quality': 'Very Good',
243
+ 'description': 'Larger MiniLM, better quality than L6'
244
+ }
245
+ }
246
+
247
+ selected_model = st.selectbox(
248
+ "Select Embedding Model",
249
+ options=list(embedding_models.keys()),
250
+ format_func=lambda x: embedding_models[x]['name']
251
+ )
252
+
253
+ st.session_state.db_config['embedding_model'] = selected_model
254
+ st.session_state.db_config['embedding_dimensions'] = embedding_models[selected_model]['dimensions']
255
+
256
+ # Model details
257
+ model_info = embedding_models[selected_model]
258
+
259
+ col1, col2, col3, col4 = st.columns(4)
260
+
261
+ with col1:
262
+ st.metric("Dimensions", model_info['dimensions'])
263
+ with col2:
264
+ st.metric("Model Size", model_info['size'])
265
+ with col3:
266
+ st.metric("Speed", model_info['speed'])
267
+ with col4:
268
+ st.metric("Quality", model_info['quality'])
269
+
270
+ st.info(f"**Why this model?** {model_info['description']}")
271
 
272
+ # ========================================================================
273
+ # SECTION 3: CHUNKING STRATEGY
274
+ # ========================================================================
 
 
275
 
276
+ with st.expander("βœ‚οΈ Chunking Strategy", expanded=True):
277
+
278
+ st.markdown("**How to split documents into processable chunks**")
279
+
280
+ col1, col2 = st.columns(2)
281
+
282
+ with col1:
283
+ chunk_size = st.slider(
284
+ "Chunk Size (tokens)",
285
+ min_value=100,
286
+ max_value=2000,
287
+ value=st.session_state.db_config['chunk_size'],
288
+ step=50,
289
+ help="Number of tokens per chunk"
290
+ )
291
+ st.session_state.db_config['chunk_size'] = chunk_size
292
+
293
+ st.caption(f"""
294
+ **Small (100-300)**: Better precision, more chunks
295
+ **Medium (400-600)**: Balanced βœ…
296
+ **Large (800-2000)**: More context, fewer chunks
297
+ """)
298
+
299
+ with col2:
300
+ chunk_overlap = st.slider(
301
+ "Chunk Overlap (tokens)",
302
+ min_value=0,
303
+ max_value=min(500, chunk_size // 2),
304
+ value=st.session_state.db_config['chunk_overlap'],
305
+ step=10,
306
+ help="Overlap between consecutive chunks"
307
+ )
308
+ st.session_state.db_config['chunk_overlap'] = chunk_overlap
309
+
310
+ overlap_pct = (chunk_overlap / chunk_size) * 100 if chunk_size > 0 else 0
311
+ st.metric("Overlap %", f"{overlap_pct:.1f}%")
312
+
313
+ st.caption(f"""
314
+ **No Overlap (0%)**: Distinct chunks, might lose context
315
+ **Small (5-10%)**: Minimal redundancy βœ…
316
+ **Large (20-30%)**: More context preservation
317
+ """)
318
+
319
+ # Visualization
320
+ st.markdown("**Chunking Visualization:**")
321
+
322
+ sample_text = "The Pythagorean theorem states that aΒ² + bΒ² = cΒ² for right triangles."
323
+ words = sample_text.split()
324
+
325
+ if len(words) >= 5:
326
+ chunk1 = ' '.join(words[:5])
327
+ chunk2 = ' '.join(words[3:8]) if len(words) >= 8 else ' '.join(words[3:])
328
+
329
+ st.code(f"""
330
+ Chunk 1: "{chunk1}..."
331
+ {'↓' * (chunk_overlap // 10 if chunk_overlap > 0 else 0)}
332
+ Chunk 2: "...{chunk2}..."
333
 
334
+ Overlap: {chunk_overlap} tokens ({overlap_pct:.0f}%)
335
+ """)
 
 
 
 
 
336
 
337
+ # ========================================================================
338
+ # SECTION 4: TOKENIZATION & PARSING
339
+ # ========================================================================
340
 
341
+ with st.expander("πŸ”€ Tokenization & Parsing", expanded=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  col1, col2 = st.columns(2)
344
+
345
  with col1:
346
+ tokenizer_options = {
347
+ 'whitespace': 'Whitespace (Simple, fast)',
348
+ 'nltk': 'NLTK (Sentence-aware)',
349
+ 'tiktoken': 'TikToken (GPT-style, accurate)'
350
+ }
351
+
352
+ tokenizer = st.selectbox(
353
+ "Tokenizer",
354
+ options=list(tokenizer_options.keys()),
355
+ format_func=lambda x: tokenizer_options[x],
356
+ help="How to split text into tokens"
357
+ )
358
+ st.session_state.db_config['tokenizer'] = tokenizer
359
 
360
  with col2:
361
+ max_chunk_tokens = st.number_input(
362
+ "Max Tokens per Chunk",
363
+ min_value=512,
364
+ max_value=32000,
365
+ value=st.session_state.db_config['max_chunk_tokens'],
366
+ step=512,
367
+ help="Maximum tokens before forcing a split"
368
+ )
369
+ st.session_state.db_config['max_chunk_tokens'] = max_chunk_tokens
370
+
371
+ st.info("""
372
+ **Tokenization** converts text into tokens (words/subwords)
373
+ - **Whitespace**: Simple split by spaces (fastest)
374
+ - **NLTK**: Respects sentence boundaries (better)
375
+ - **TikToken**: Matches GPT tokenization (most accurate)
376
+ """)
377
 
378
+ # ========================================================================
379
+ # SAVE CONFIGURATION
380
+ # ========================================================================
 
 
 
 
 
 
 
 
 
381
 
382
  st.markdown("---")
 
 
 
 
 
 
 
383
 
384
+ if st.button("πŸ’Ύ Save Configuration", type="primary"):
385
+ st.success("βœ… Configuration saved!")
386
+ st.json(st.session_state.db_config)
 
 
387
 
388
+ # Show current config
389
+ with st.expander("πŸ“‹ View Current Configuration"):
390
+ st.json(st.session_state.db_config)
391
 
392
# ============================================================================
# TAB 2: ANALYTICS & VISUALIZATION
# ============================================================================

with tab2:
    st.header("πŸ“Š Database Analytics")

    try:
        # Lazy import keeps app startup fast; only this tab pays the cost.
        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
        client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )

        collection_name = st.session_state.db_config['collection_name']

        # Analytics only make sense once the collection has been created.
        collections = client.get_collections().collections
        exists = any(c.name == collection_name for c in collections)

        if not exists:
            st.warning(f"⚠️ Collection '{collection_name}' doesn't exist yet. Create it in the Management tab.")
        else:
            st.success(f"βœ… Analyzing collection: {collection_name}")

            # ================================================================
            # STORAGE ANALYTICS
            # ================================================================

            st.subheader("πŸ’Ύ Storage Analytics")

            # FIX: the original paged through every point with scroll() in
            # batches of 100 (up to 1000 round trips, silently capped at
            # 100,000 points) just to count them. Qdrant has an exact
            # server-side count API; keep the scroll loop only as a fallback.
            try:
                count = client.count(collection_name=collection_name, exact=True).count
            except Exception:
                count = 0
                offset = None
                for _ in range(1000):
                    points, offset = client.scroll(
                        collection_name=collection_name,
                        limit=100,
                        offset=offset,
                        with_payload=False,
                        with_vectors=False
                    )
                    if not points:
                        break
                    count += len(points)
                    if offset is None:
                        break

            col1, col2, col3, col4 = st.columns(4)

            # Rough size model: 4-byte float32 per dimension plus a flat
            # per-point metadata allowance. These are estimates, not the
            # server's actual disk usage.
            vector_dim = st.session_state.db_config['embedding_dimensions']
            bytes_per_float = 4
            metadata_overhead = 100

            vector_size_mb = (count * vector_dim * bytes_per_float) / (1024 * 1024)
            metadata_size_mb = (count * metadata_overhead) / (1024 * 1024)
            total_size_mb = vector_size_mb + metadata_size_mb

            with col1:
                st.metric("Total Vectors", f"{count:,}")

            with col2:
                st.metric("Vector Data", f"{vector_size_mb:.2f} MB")

            with col3:
                st.metric("Metadata", f"{metadata_size_mb:.2f} MB")

            with col4:
                st.metric("Total Size", f"{total_size_mb:.2f} MB")

            st.markdown("**Storage Breakdown:**")

            storage_data = {
                "Component": ["Vector Embeddings", "Metadata", "Index Overhead (est.)"],
                "Size (MB)": [vector_size_mb, metadata_size_mb, total_size_mb * 0.1],
                "Percentage": [
                    (vector_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
                    (metadata_size_mb / total_size_mb * 100) if total_size_mb > 0 else 0,
                    10.0
                ]
            }

            st.dataframe(storage_data, use_container_width=True)

            # Free-tier usage gauge (Qdrant Cloud free tier ~= 1 GB).
            st.markdown("**Free Tier Capacity:**")

            free_tier_gb = 1.0
            used_gb = total_size_mb / 1024
            remaining_gb = free_tier_gb - used_gb
            usage_pct = (used_gb / free_tier_gb) * 100

            col1, col2 = st.columns([2, 1])

            with col1:
                st.progress(min(usage_pct / 100, 1.0))
                st.caption(f"Used: {used_gb:.3f} GB / {free_tier_gb} GB ({usage_pct:.1f}%)")

            with col2:
                st.metric("Remaining", f"{remaining_gb:.3f} GB")

            st.markdown("**Capacity Estimates:**")

            if count > 0:
                avg_vector_size = total_size_mb / count
                max_vectors_1gb = int((1024 / avg_vector_size) * 0.9)  # 90% of theoretical max

                st.info(f"""
                **With current data:**
                - Average size per vector: {avg_vector_size:.3f} MB
                - Estimated max vectors (1GB): ~{max_vectors_1gb:,}
                - Current capacity used: {(count / max_vectors_1gb * 100):.1f}%
                """)

            # ================================================================
            # DATA SOURCE ANALYTICS
            # ================================================================

            st.subheader("πŸ“š Data Source Breakdown")

            # FIX: guard count > 0 — the original called scroll(limit=0) on an
            # empty collection. Also compute percentages over the SAMPLE size:
            # the original divided a <=1000-point sample by the TOTAL count,
            # so percentages did not sum to 100% on larger collections.
            if count > 0:
                sample_result = client.scroll(
                    collection_name=collection_name,
                    limit=min(count, 1000),
                    with_payload=True,
                    with_vectors=False
                )

                if sample_result and sample_result[0]:
                    sample_points = sample_result[0]
                    sample_size = len(sample_points)

                    source_counts = {}
                    for point in sample_points:
                        source = point.payload.get('source_name', 'Unknown')
                        source_counts[source] = source_counts.get(source, 0) + 1

                    source_data = {
                        "Source": list(source_counts.keys()),
                        "Vectors": list(source_counts.values()),
                        "Percentage": [
                            f"{(v / sample_size * 100):.1f}%" for v in source_counts.values()
                        ]
                    }

                    st.dataframe(source_data, use_container_width=True)

            # ================================================================
            # CONFIGURATION SUMMARY
            # ================================================================

            st.subheader("βš™οΈ Active Configuration")

            config_display = {
                "Parameter": [
                    "Embedding Model",
                    "Vector Dimensions",
                    "Similarity Metric",
                    "Chunk Size",
                    "Chunk Overlap",
                    "Overlap Percentage"
                ],
                "Value": [
                    st.session_state.db_config['embedding_model'].split('/')[-1],
                    st.session_state.db_config['embedding_dimensions'],
                    st.session_state.db_config['similarity_metric'],
                    f"{st.session_state.db_config['chunk_size']} tokens",
                    f"{st.session_state.db_config['chunk_overlap']} tokens",
                    f"{(st.session_state.db_config['chunk_overlap'] / st.session_state.db_config['chunk_size'] * 100):.1f}%"
                ]
            }

            st.dataframe(config_display, use_container_width=True)

    except Exception as e:
        st.error(f"❌ Error connecting to database: {str(e)}")
 
576
# ============================================================================
# TAB 3: MANAGEMENT
# ============================================================================

with tab3:
    st.header("πŸ”§ Database Management")

    st.warning("⚠️ Management operations affect your database. Use carefully!")

    try:
        QdrantClient, Distance, VectorParams, PointStruct = lazy_import_qdrant()
        client = QdrantClient(
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY")
        )

        collection_name = st.session_state.db_config['collection_name']

        # Offer create vs delete/info depending on whether the collection exists.
        collections = client.get_collections().collections
        exists = any(c.name == collection_name for c in collections)

        if not exists:
            st.info(f"Collection '{collection_name}' doesn't exist")

            if st.button("πŸ—οΈ CREATE COLLECTION", type="primary"):
                try:
                    # Map the config's string metric to the qdrant Distance enum.
                    distance_map = {
                        'COSINE': Distance.COSINE,
                        'EUCLIDEAN': Distance.EUCLID,
                        'DOT': Distance.DOT
                    }

                    client.create_collection(
                        collection_name=collection_name,
                        vectors_config=VectorParams(
                            size=st.session_state.db_config['embedding_dimensions'],
                            distance=distance_map[st.session_state.db_config['similarity_metric']]
                        )
                    )

                    st.success(f"βœ… Created collection: {collection_name}")
                    st.balloons()
                    st.session_state.db_created = True
                    st.rerun()

                except Exception as e:
                    st.error(f"❌ Creation failed: {str(e)}")

        else:
            st.success(f"βœ… Collection exists: {collection_name}")

            col1, col2 = st.columns(2)

            with col1:
                # FIX: the original rendered the confirmation checkbox INSIDE
                # the button's if-block. A button press triggers a rerun on
                # which the freshly-rendered checkbox is unchecked, so the
                # delete branch could never execute. Render the checkbox
                # first and gate the button on it instead.
                confirm_delete = st.checkbox("⚠️ Confirm deletion")
                if st.button("πŸ—‘οΈ Delete Collection", type="secondary",
                             disabled=not confirm_delete):
                    try:
                        client.delete_collection(collection_name)
                        st.success("βœ… Collection deleted")
                        st.session_state.db_created = False
                        st.rerun()
                    except Exception as e:
                        st.error(f"Error: {e}")

            with col2:
                if st.button("ℹ️ Collection Info"):
                    try:
                        info = client.get_collection(collection_name)
                        # FIX: the fetched info was previously discarded and a
                        # hard-coded {"status": "active"} shown instead; surface
                        # the live status and point count from the server.
                        st.json({
                            "name": collection_name,
                            "status": str(getattr(info, "status", "unknown")),
                            "points_count": getattr(info, "points_count", None)
                        })
                    except Exception as e:
                        st.error(f"Error: {e}")

    except Exception as e:
        st.error(f"❌ Connection failed: {str(e)}")
+
656
# ============================================================================
# TAB 4: DATA UPLOAD (Quick Access)
# ============================================================================

with tab4:
    st.header("πŸ“š Quick Data Upload")
    st.info("For full upload features, use the main upload interface")

    st.markdown("[Go to Full Upload Interface β†’](#)")

    # Minimal paste-in path; the real ingestion pipeline lives elsewhere.
    with st.expander("Quick Text Upload"):
        pasted = st.text_area("Paste text:", height=150)
        # Button is rendered unconditionally; the upload hint only fires
        # when it is clicked AND some text was pasted.
        if st.button("Upload") and pasted:
            st.info("Use the main interface for full upload functionality")

# ============================================================================
# FOOTER
# ============================================================================

st.markdown("---")
st.caption("πŸ’‘ Tip: Save your configuration before creating the collection!")