Developer committed on
Commit
f380f3e
Β·
1 Parent(s): 81f53e2

Simple Hello World test app to debug HF Spaces

Browse files
Files changed (3) hide show
  1. Dockerfile +5 -18
  2. streamlit_app.py +45 -1485
  3. streamlit_app_backup.py +1502 -0
Dockerfile CHANGED
@@ -5,9 +5,7 @@ WORKDIR /app
5
 
6
  # Install system dependencies
7
  RUN apt-get update && apt-get install -y \
8
- build-essential \
9
  curl \
10
- git \
11
  && rm -rf /var/lib/apt/lists/*
12
 
13
  # Create a non-root user for HuggingFace Spaces
@@ -19,25 +17,15 @@ ENV HOME=/home/user \
19
  # Set working directory for user
20
  WORKDIR $HOME/app
21
 
22
- # Install torch CPU first (smaller download ~200MB vs 2GB)
23
- RUN pip install --no-cache-dir --upgrade pip && \
24
- pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
25
- echo "Torch installed successfully"
26
-
27
  # Copy requirements and install Python dependencies
28
  COPY --chown=user requirements.txt .
29
- RUN pip install --no-cache-dir -r requirements.txt && \
30
- echo "All requirements installed successfully"
31
-
32
- # Pre-download the lightweight embedding model to avoid timeout at startup
33
- RUN python -c "from sentence_transformers import SentenceTransformer; print('Downloading model...'); SentenceTransformer('all-MiniLM-L6-v2'); print('Model downloaded!')"
34
 
35
  # Copy application files
36
  COPY --chown=user . .
37
 
38
- # Create directories for data (use /tmp for ephemeral storage on Spaces)
39
- RUN mkdir -p /tmp/chroma_db /tmp/data_cache
40
-
41
  # Set environment variables
42
  ENV PYTHONUNBUFFERED=1
43
  ENV SPACE_ID=1
@@ -45,6 +33,5 @@ ENV SPACE_ID=1
45
  # Expose port for Streamlit (HuggingFace Spaces uses 7860)
46
  EXPOSE 7860
47
 
48
- # No healthcheck - let HF Spaces handle it
49
- # Run Streamlit with logging
50
- CMD ["streamlit", "run", "streamlit_app.py", "--server.port=7860", "--server.address=0.0.0.0", "--server.enableCORS=false", "--server.enableXsrfProtection=false", "--logger.level=info"]
 
5
 
6
  # Install system dependencies
7
  RUN apt-get update && apt-get install -y \
 
8
  curl \
 
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
  # Create a non-root user for HuggingFace Spaces
 
17
  # Set working directory for user
18
  WORKDIR $HOME/app
19
 
 
 
 
 
 
20
  # Copy requirements and install Python dependencies
21
  COPY --chown=user requirements.txt .
22
+ RUN pip install --no-cache-dir --upgrade pip && \
23
+ pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
24
+ pip install --no-cache-dir -r requirements.txt
 
 
25
 
26
  # Copy application files
27
  COPY --chown=user . .
28
 
 
 
 
29
  # Set environment variables
30
  ENV PYTHONUNBUFFERED=1
31
  ENV SPACE_ID=1
 
33
  # Expose port for Streamlit (HuggingFace Spaces uses 7860)
34
  EXPOSE 7860
35
 
36
+ # Run Streamlit
37
+ CMD ["streamlit", "run", "streamlit_app.py", "--server.port=7860", "--server.address=0.0.0.0"]
 
streamlit_app.py CHANGED
@@ -1,1502 +1,62 @@
1
- """Streamlit chat interface for RAG application."""
2
  import streamlit as st
3
- import sys
4
- import os
5
- from datetime import datetime
6
- import json
7
- import pandas as pd
8
- from typing import Optional
9
- import warnings
10
-
11
- # Suppress warnings
12
- warnings.filterwarnings('ignore')
13
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
14
-
15
- # Add parent directory to path
16
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
17
-
18
- # Check if running on HuggingFace Spaces
19
- IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None
20
-
21
- from config import settings
22
- from dataset_loader import RAGBenchLoader
23
- from vector_store import ChromaDBManager, create_vector_store
24
- try:
25
- from vector_store import QdrantManager, QDRANT_AVAILABLE
26
- except ImportError:
27
- QDRANT_AVAILABLE = False
28
- from llm_client import GroqLLMClient, OllamaLLMClient, RAGPipeline, create_llm_client
29
- from trace_evaluator import TRACEEvaluator
30
- from embedding_models import EmbeddingFactory
31
- from chunking_strategies import ChunkingFactory
32
 
33
-
34
- # Page configuration
35
  st.set_page_config(
36
- page_title="RAG Capstone Project",
37
  page_icon="πŸ€–",
38
  layout="wide"
39
  )
40
 
41
- # Initialize session state
42
- if "chat_history" not in st.session_state:
43
- st.session_state.chat_history = []
44
-
45
- if "rag_pipeline" not in st.session_state:
46
- st.session_state.rag_pipeline = None
47
-
48
- if "vector_store" not in st.session_state:
49
- st.session_state.vector_store = None
50
-
51
- if "collection_loaded" not in st.session_state:
52
- st.session_state.collection_loaded = False
53
-
54
- if "evaluation_results" not in st.session_state:
55
- st.session_state.evaluation_results = None
56
-
57
- if "dataset_size" not in st.session_state:
58
- st.session_state.dataset_size = 10000
59
-
60
- if "current_dataset" not in st.session_state:
61
- st.session_state.current_dataset = None
62
-
63
- if "current_llm" not in st.session_state:
64
- st.session_state.current_llm = settings.llm_models[1]
65
-
66
- if "selected_collection" not in st.session_state:
67
- st.session_state.selected_collection = None
68
-
69
- if "available_collections" not in st.session_state:
70
- st.session_state.available_collections = []
71
-
72
- if "dataset_name" not in st.session_state:
73
- st.session_state.dataset_name = None
74
-
75
- if "collection_name" not in st.session_state:
76
- st.session_state.collection_name = None
77
-
78
- if "embedding_model" not in st.session_state:
79
- st.session_state.embedding_model = None
80
-
81
- if "groq_api_key" not in st.session_state:
82
- st.session_state.groq_api_key = ""
83
-
84
- if "llm_provider" not in st.session_state:
85
- st.session_state.llm_provider = settings.llm_provider
86
-
87
- if "ollama_model" not in st.session_state:
88
- st.session_state.ollama_model = settings.ollama_model
89
 
90
- if "vector_store_provider" not in st.session_state:
91
- st.session_state.vector_store_provider = settings.vector_store_provider
92
 
93
- if "qdrant_url" not in st.session_state:
94
- st.session_state.qdrant_url = settings.qdrant_url
95
-
96
- if "qdrant_api_key" not in st.session_state:
97
- st.session_state.qdrant_api_key = settings.qdrant_api_key
98
-
99
-
100
- def get_available_collections(provider: str = None):
101
- """Get list of available collections from vector store."""
102
- provider = provider or st.session_state.get("vector_store_provider", "chroma")
103
-
104
- try:
105
- if provider == "qdrant" and QDRANT_AVAILABLE:
106
- qdrant_url = st.session_state.get("qdrant_url") or settings.qdrant_url
107
- qdrant_api_key = st.session_state.get("qdrant_api_key") or settings.qdrant_api_key
108
- if qdrant_url and qdrant_api_key:
109
- vector_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
110
- collections = vector_store.list_collections()
111
- return collections
112
- return []
113
- else:
114
- vector_store = ChromaDBManager(settings.chroma_persist_directory)
115
- collections = vector_store.list_collections()
116
- return collections
117
- except Exception as e:
118
- print(f"Error getting collections: {e}")
119
- return []
120
-
121
-
122
- def main():
123
- """Main Streamlit application."""
124
- st.title("πŸ€– RAG Capstone Project")
125
- st.markdown("### Retrieval-Augmented Generation with TRACE Evaluation")
126
-
127
- # Show HuggingFace Spaces notice
128
- if IS_HUGGINGFACE_SPACE:
129
- st.info("πŸ€— Running on Hugging Face Spaces - Using Groq API (cloud-based LLM)")
130
-
131
- # Get available collections at startup
132
- available_collections = get_available_collections()
133
- st.session_state.available_collections = available_collections
134
-
135
- # Sidebar for configuration
136
- with st.sidebar:
137
- st.header("Configuration")
138
-
139
- # LLM Provider Selection - Disable Ollama on HuggingFace Spaces
140
- st.subheader("πŸ”Œ LLM Provider")
141
-
142
- if IS_HUGGINGFACE_SPACE:
143
- # Force Groq on HuggingFace Spaces (Ollama not available)
144
- st.caption("☁️ **Groq API** (Ollama unavailable on Spaces)")
145
- llm_provider = "groq"
146
- st.session_state.llm_provider = "groq"
147
- else:
148
- llm_provider = st.radio(
149
- "Choose LLM Provider:",
150
- options=["groq", "ollama"],
151
- index=0 if st.session_state.llm_provider == "groq" else 1,
152
- format_func=lambda x: "☁️ Groq API (Cloud)" if x == "groq" else "πŸ–₯️ Ollama (Local)",
153
- help="Groq: Cloud API with rate limits. Ollama: Local unlimited inference.",
154
- key="llm_provider_radio"
155
- )
156
- st.session_state.llm_provider = llm_provider
157
-
158
- # Provider-specific settings
159
- if llm_provider == "groq":
160
- st.caption("⚠️ Free tier: 30 requests/min")
161
-
162
- # On HuggingFace Spaces, check for API key in secrets first
163
- default_api_key = os.environ.get("GROQ_API_KEY", "") or settings.groq_api_key or ""
164
-
165
- # API Key input
166
- groq_api_key = st.text_input(
167
- "Groq API Key",
168
- type="password",
169
- value=default_api_key,
170
- help="Enter your Groq API key (or set GROQ_API_KEY in Spaces secrets)"
171
- )
172
-
173
- if IS_HUGGINGFACE_SPACE and not groq_api_key:
174
- st.warning("πŸ’‘ Tip: Add GROQ_API_KEY to your Space secrets for persistence")
175
- else:
176
- # Ollama settings (only available locally)
177
- st.caption("βœ… No rate limits - unlimited usage!")
178
- ollama_host = st.text_input(
179
- "Ollama Host",
180
- value=settings.ollama_host,
181
- help="Ollama server URL (default: http://localhost:11434)"
182
- )
183
-
184
- ollama_model = st.selectbox(
185
- "Select Ollama Model:",
186
- options=settings.ollama_models,
187
- index=settings.ollama_models.index(st.session_state.ollama_model) if st.session_state.ollama_model in settings.ollama_models else 0,
188
- key="ollama_model_selector"
189
- )
190
- st.session_state.ollama_model = ollama_model
191
-
192
- # Connection check button
193
- if st.button("πŸ” Check Ollama Connection"):
194
- try:
195
- import requests
196
- response = requests.get(f"{ollama_host}/api/tags", timeout=5)
197
- if response.status_code == 200:
198
- models = response.json().get("models", [])
199
- model_names = [m["name"] for m in models]
200
- st.success(f"βœ… Connected! Available models: {', '.join(model_names)}")
201
- else:
202
- st.error(f"❌ Connection failed: {response.status_code}")
203
- except Exception as e:
204
- st.error(f"❌ Cannot connect to Ollama: {e}")
205
- st.info("Make sure Ollama is running: `ollama serve`")
206
-
207
- groq_api_key = "" # Not needed for Ollama
208
-
209
- st.divider()
210
-
211
- # Vector Store Provider Selection
212
- st.subheader("πŸ’Ύ Vector Store")
213
-
214
- if IS_HUGGINGFACE_SPACE:
215
- st.caption("☁️ Use **Qdrant Cloud** for persistent storage")
216
- vector_store_options = ["qdrant", "chroma"]
217
- default_idx = 0
218
- else:
219
- vector_store_options = ["chroma", "qdrant"]
220
- default_idx = 0
221
-
222
- vector_store_provider = st.radio(
223
- "Choose Vector Store:",
224
- options=vector_store_options,
225
- index=default_idx,
226
- format_func=lambda x: "☁️ Qdrant Cloud (Persistent)" if x == "qdrant" else "πŸ’Ύ ChromaDB (Local)",
227
- help="Qdrant: Cloud storage (persistent). ChromaDB: Local storage (ephemeral on Spaces).",
228
- key="vector_store_radio"
229
- )
230
- st.session_state.vector_store_provider = vector_store_provider
231
-
232
- # Qdrant settings
233
- if vector_store_provider == "qdrant":
234
- default_qdrant_url = os.environ.get("QDRANT_URL", "") or settings.qdrant_url
235
- default_qdrant_key = os.environ.get("QDRANT_API_KEY", "") or settings.qdrant_api_key
236
-
237
- qdrant_url = st.text_input(
238
- "Qdrant URL",
239
- value=default_qdrant_url,
240
- placeholder="https://xxx-xxx.aws.cloud.qdrant.io:6333",
241
- help="Your Qdrant Cloud cluster URL"
242
- )
243
- qdrant_api_key = st.text_input(
244
- "Qdrant API Key",
245
- type="password",
246
- value=default_qdrant_key,
247
- help="Your Qdrant API key"
248
- )
249
- st.session_state.qdrant_url = qdrant_url
250
- st.session_state.qdrant_api_key = qdrant_api_key
251
-
252
- if not qdrant_url or not qdrant_api_key:
253
- st.warning("⚠️ Get free Qdrant Cloud at: https://cloud.qdrant.io")
254
-
255
- # Test Qdrant connection
256
- if st.button("πŸ” Test Qdrant Connection"):
257
- if qdrant_url and qdrant_api_key:
258
- try:
259
- test_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
260
- collections = test_store.list_collections()
261
- st.success(f"βœ… Connected! Found {len(collections)} collection(s)")
262
- except Exception as e:
263
- st.error(f"❌ Connection failed: {e}")
264
- else:
265
- st.error("Please enter Qdrant URL and API Key")
266
-
267
- st.divider()
268
-
269
- # Get available collections based on provider
270
- available_collections = get_available_collections(vector_store_provider)
271
- st.session_state.available_collections = available_collections
272
-
273
- # Option 1: Use existing collection
274
- if available_collections:
275
- st.subheader("πŸ“š Existing Collections")
276
- st.write(f"Found {len(available_collections)} collection(s)")
277
-
278
- selected_collection = st.selectbox(
279
- "Or select existing collection:",
280
- available_collections,
281
- key="collection_selector"
282
- )
283
-
284
- if st.button("πŸ“– Load Existing Collection", type="secondary"):
285
- # Validate based on provider
286
- if llm_provider == "groq" and not groq_api_key:
287
- st.error("Please enter your Groq API key")
288
- elif vector_store_provider == "qdrant" and (not st.session_state.get("qdrant_url") or not st.session_state.get("qdrant_api_key")):
289
- st.error("Please enter Qdrant URL and API Key")
290
- else:
291
- load_existing_collection(
292
- groq_api_key,
293
- selected_collection,
294
- llm_provider,
295
- ollama_host if llm_provider == "ollama" else None,
296
- vector_store_provider
297
- )
298
-
299
- st.divider()
300
-
301
- # Option 2: Create new collection
302
- st.subheader("πŸ†• Create New Collection")
303
-
304
- # Dataset selection
305
- st.subheader("1. Dataset Selection")
306
- dataset_name = st.selectbox(
307
- "Choose Dataset",
308
- settings.ragbench_datasets,
309
- index=0
310
- )
311
-
312
- # Get dataset size dynamically
313
- if st.button("πŸ” Check Dataset Size", key="check_size"):
314
- with st.spinner("Checking dataset size..."):
315
- try:
316
- from datasets import load_dataset
317
-
318
- # Load dataset with download_mode to avoid cache issues
319
- st.info(f"Fetching dataset info for '{dataset_name}'...")
320
- ds = load_dataset(
321
- "rungalileo/ragbench",
322
- dataset_name,
323
- split="train",
324
- trust_remote_code=True,
325
- download_mode="force_redownload" # Force fresh download to avoid cache corruption
326
- )
327
- dataset_size = len(ds)
328
-
329
- st.session_state.dataset_size = dataset_size
330
- st.session_state.current_dataset = dataset_name
331
- st.success(f"βœ… Dataset '{dataset_name}' has {dataset_size:,} samples available")
332
- except Exception as e:
333
- st.error(f"❌ Error: {str(e)}")
334
- st.exception(e)
335
- st.warning(f"Could not determine dataset size. Using default of 10,000.")
336
- st.session_state.dataset_size = 10000
337
- st.session_state.current_dataset = dataset_name
338
-
339
- # Use stored dataset size or default
340
- max_samples_available = st.session_state.get('dataset_size', 10000)
341
-
342
- st.caption(f"Max available samples: {max_samples_available:,}")
343
-
344
- num_samples = st.slider(
345
- "Number of samples",
346
- min_value=10,
347
- max_value=max_samples_available,
348
- value=min(100, max_samples_available),
349
- step=50 if max_samples_available > 1000 else 10,
350
- help="Adjust slider to select number of samples"
351
- )
352
-
353
- load_all_samples = st.checkbox(
354
- "Load all available samples",
355
- value=False,
356
- help="Override slider and load entire dataset"
357
- )
358
-
359
- st.divider()
360
-
361
- # Chunking strategy
362
- st.subheader("2. Chunking Strategy")
363
- chunking_strategy = st.selectbox(
364
- "Choose Chunking Strategy",
365
- settings.chunking_strategies,
366
- index=0
367
- )
368
-
369
- chunk_size = st.slider(
370
- "Chunk Size",
371
- min_value=256,
372
- max_value=1024,
373
- value=512,
374
- step=128
375
- )
376
-
377
- overlap = st.slider(
378
- "Overlap",
379
- min_value=0,
380
- max_value=200,
381
- value=50,
382
- step=10
383
- )
384
-
385
- st.divider()
386
-
387
- # Embedding model
388
- st.subheader("3. Embedding Model")
389
- embedding_model = st.selectbox(
390
- "Choose Embedding Model",
391
- settings.embedding_models,
392
- index=0
393
- )
394
-
395
- st.divider()
396
-
397
- # LLM model selection for new collection
398
- st.subheader("4. LLM Model")
399
- if llm_provider == "groq":
400
- llm_model = st.selectbox(
401
- "Choose Groq LLM",
402
- settings.llm_models,
403
- index=1
404
- )
405
- else:
406
- llm_model = st.selectbox(
407
- "Choose Ollama Model",
408
- settings.ollama_models,
409
- index=settings.ollama_models.index(st.session_state.ollama_model) if st.session_state.ollama_model in settings.ollama_models else 0,
410
- key="llm_model_ollama"
411
- )
412
-
413
- st.divider()
414
-
415
- # Load data button
416
- if st.button("πŸš€ Load Data & Create Collection", type="primary"):
417
- # Validate based on provider
418
- if llm_provider == "groq" and not groq_api_key:
419
- st.error("Please enter your Groq API key")
420
- elif vector_store_provider == "qdrant" and (not st.session_state.get("qdrant_url") or not st.session_state.get("qdrant_api_key")):
421
- st.error("Please enter Qdrant URL and API Key")
422
- else:
423
- # Use None for num_samples if loading all data
424
- samples_to_load = None if load_all_samples else num_samples
425
- load_and_create_collection(
426
- groq_api_key,
427
- dataset_name,
428
- samples_to_load,
429
- chunking_strategy,
430
- chunk_size,
431
- overlap,
432
- embedding_model,
433
- llm_model,
434
- llm_provider,
435
- ollama_host if llm_provider == "ollama" else None,
436
- vector_store_provider
437
- )
438
-
439
- # Main content area
440
- if not st.session_state.collection_loaded:
441
- st.info("πŸ‘ˆ Please configure and load a dataset from the sidebar to begin")
442
-
443
- # Show instructions
444
- with st.expander("πŸ“– How to Use", expanded=True):
445
- st.markdown("""
446
- 1. **Enter your Groq API Key** in the sidebar
447
- 2. **Select a dataset** from RAG Bench
448
- 3. **Choose a chunking strategy** (dense, sparse, hybrid, re-ranking)
449
- 4. **Select an embedding model** for document vectorization
450
- 5. **Choose an LLM model** for response generation
451
- 6. **Click "Load Data & Create Collection"** to initialize
452
- 7. **Start chatting** in the chat interface
453
- 8. **View retrieved documents** and evaluation metrics
454
- 9. **Run TRACE evaluation** on test data
455
- """)
456
-
457
- # Show available options
458
- col1, col2 = st.columns(2)
459
-
460
- with col1:
461
- st.subheader("πŸ“Š Available Datasets")
462
- for ds in settings.ragbench_datasets:
463
- st.markdown(f"- {ds}")
464
-
465
- with col2:
466
- st.subheader("πŸ€– Available Models")
467
- st.markdown("**Embedding Models:**")
468
- for em in settings.embedding_models:
469
- st.markdown(f"- {em}")
470
-
471
- st.markdown("**LLM Models:**")
472
- for lm in settings.llm_models:
473
- st.markdown(f"- {lm}")
474
-
475
- else:
476
- # Create tabs for different functionalities
477
- tab1, tab2, tab3 = st.tabs(["πŸ’¬ Chat", "πŸ“Š Evaluation", "πŸ“œ History"])
478
-
479
- with tab1:
480
- chat_interface()
481
-
482
- with tab2:
483
- evaluation_interface()
484
-
485
- with tab3:
486
- history_interface()
487
-
488
-
489
- def load_existing_collection(api_key: str, collection_name: str, llm_provider: str = "groq", ollama_host: str = None, vector_store_provider: str = "chroma"):
490
- """Load an existing collection from vector store."""
491
- with st.spinner(f"Loading collection '{collection_name}'..."):
492
- try:
493
- # Initialize vector store based on provider
494
- if vector_store_provider == "qdrant":
495
- qdrant_url = st.session_state.get("qdrant_url") or settings.qdrant_url
496
- qdrant_api_key = st.session_state.get("qdrant_api_key") or settings.qdrant_api_key
497
- vector_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
498
- else:
499
- vector_store = ChromaDBManager(settings.chroma_persist_directory)
500
-
501
- vector_store.get_collection(collection_name)
502
-
503
- # Extract dataset name from collection name (format: dataset_name_strategy_model)
504
- # Try to find which dataset this collection is based on
505
- dataset_name = None
506
- for ds in settings.ragbench_datasets:
507
- if collection_name.startswith(ds.replace("-", "_")):
508
- dataset_name = ds
509
- break
510
-
511
- if not dataset_name:
512
- dataset_name = collection_name.split("_")[0] # Fallback: use first part
513
-
514
- # Prompt for LLM selection based on provider
515
- if llm_provider == "groq":
516
- st.session_state.current_llm = st.selectbox(
517
- "Select Groq LLM for this collection:",
518
- settings.llm_models,
519
- key=f"llm_selector_{collection_name}"
520
- )
521
- else:
522
- st.session_state.current_llm = st.selectbox(
523
- "Select Ollama Model for this collection:",
524
- settings.ollama_models,
525
- key=f"ollama_selector_{collection_name}"
526
- )
527
-
528
- # Initialize LLM client based on provider
529
- st.info(f"Initializing LLM client ({llm_provider})...")
530
- llm_client = create_llm_client(
531
- provider=llm_provider,
532
- api_key=api_key,
533
- api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
534
- model_name=st.session_state.current_llm,
535
- ollama_host=ollama_host or settings.ollama_host,
536
- max_rpm=settings.groq_rpm_limit,
537
- rate_limit_delay=settings.rate_limit_delay,
538
- max_retries=settings.max_retries,
539
- retry_delay=settings.retry_delay
540
- )
541
-
542
- # Create RAG pipeline with correct parameter names
543
- st.info("Creating RAG pipeline...")
544
- rag_pipeline = RAGPipeline(
545
- llm_client=llm_client,
546
- vector_store_manager=vector_store
547
- )
548
-
549
- # Store in session state
550
- st.session_state.vector_store = vector_store
551
- st.session_state.rag_pipeline = rag_pipeline
552
- st.session_state.collection_loaded = True
553
- st.session_state.current_collection = collection_name
554
- st.session_state.selected_collection = collection_name
555
- st.session_state.groq_api_key = api_key
556
- st.session_state.dataset_name = dataset_name
557
- st.session_state.collection_name = collection_name
558
- st.session_state.llm_provider = llm_provider
559
-
560
- # Display system prompt and model info
561
- provider_icon = "☁️" if llm_provider == "groq" else "πŸ–₯️"
562
- st.success(f"βœ… Collection '{collection_name}' loaded successfully! {provider_icon} Using {llm_provider.upper()}")
563
-
564
- with st.expander("πŸ€– Model & System Prompt Information", expanded=False):
565
- col1, col2 = st.columns(2)
566
- with col1:
567
- st.write(f"**Provider:** {provider_icon} {llm_provider.upper()}")
568
- st.write(f"**Model:** {st.session_state.current_llm}")
569
- st.write(f"**Collection:** {collection_name}")
570
- st.write(f"**Dataset:** {dataset_name}")
571
- with col2:
572
- st.write(f"**Temperature:** 0.0")
573
- st.write(f"**Max Tokens:** 2048")
574
- if llm_provider == "groq":
575
- st.write(f"**Rate Limit:** {settings.groq_rpm_limit} RPM")
576
- else:
577
- st.write(f"**Rate Limit:** βœ… Unlimited (Local)")
578
-
579
- st.markdown("#### System Prompt")
580
- st.info("""
581
- You are a Fact-Checking and Citation Specialist. Your task is to perform a rigorous audit of a response against provided documents to determine its accuracy, relevance, and level of support.
582
-
583
- **Task:**
584
- 1. Analyze the provided documents and identify information relevant to the user's question
585
- 2. Evaluate the response sentence-by-sentence
586
- 3. Verify each response sentence maps to supporting document sentences
587
- 4. Identify which document sentences were actually used in the response
588
- """)
589
-
590
- st.rerun()
591
-
592
- except Exception as e:
593
- st.error(f"Error loading collection: {str(e)}")
594
- st.exception(e)
595
-
596
-
597
- def load_and_create_collection(
598
- api_key: str,
599
- dataset_name: str,
600
- num_samples: Optional[int],
601
- chunking_strategy: str,
602
- chunk_size: int,
603
- overlap: int,
604
- embedding_model: str,
605
- llm_model: str,
606
- llm_provider: str = "groq",
607
- ollama_host: str = None,
608
- vector_store_provider: str = "chroma"
609
- ):
610
- """Load dataset and create vector collection."""
611
- with st.spinner("Loading dataset and creating collection..."):
612
- try:
613
- # Initialize dataset loader
614
- loader = RAGBenchLoader()
615
-
616
- # Load dataset
617
- if num_samples is None:
618
- st.info(f"Loading {dataset_name} dataset (all available samples)...")
619
- else:
620
- st.info(f"Loading {dataset_name} dataset ({num_samples} samples)...")
621
- dataset = loader.load_dataset(dataset_name, split="train", max_samples=num_samples)
622
-
623
- if not dataset:
624
- st.error("Failed to load dataset")
625
- return
626
-
627
- # Initialize vector store based on provider
628
- st.info(f"Initializing vector store ({vector_store_provider})...")
629
- if vector_store_provider == "qdrant":
630
- qdrant_url = st.session_state.get("qdrant_url") or settings.qdrant_url
631
- qdrant_api_key = st.session_state.get("qdrant_api_key") or settings.qdrant_api_key
632
- vector_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
633
- else:
634
- vector_store = ChromaDBManager(settings.chroma_persist_directory)
635
-
636
- # Create collection name
637
- collection_name = f"{dataset_name}_{chunking_strategy}_{embedding_model.split('/')[-1]}"
638
- collection_name = collection_name.replace("-", "_").replace(".", "_")
639
-
640
- # Delete existing collection with same name (if exists)
641
- existing_collections = vector_store.list_collections()
642
- if collection_name in existing_collections:
643
- st.warning(f"Collection '{collection_name}' already exists. Deleting and recreating...")
644
- vector_store.delete_collection(collection_name)
645
- st.info("Old collection deleted. Creating new one...")
646
-
647
- # Load data into collection
648
- st.info(f"Creating collection with {chunking_strategy} chunking...")
649
- vector_store.load_dataset_into_collection(
650
- collection_name=collection_name,
651
- embedding_model_name=embedding_model,
652
- chunking_strategy=chunking_strategy,
653
- dataset_data=dataset,
654
- chunk_size=chunk_size,
655
- overlap=overlap
656
- )
657
-
658
- # Initialize LLM client based on provider
659
- st.info(f"Initializing LLM client ({llm_provider})...")
660
- llm_client = create_llm_client(
661
- provider=llm_provider,
662
- api_key=api_key,
663
- api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
664
- model_name=llm_model,
665
- ollama_host=ollama_host or settings.ollama_host,
666
- max_rpm=settings.groq_rpm_limit,
667
- rate_limit_delay=settings.rate_limit_delay,
668
- max_retries=settings.max_retries,
669
- retry_delay=settings.retry_delay
670
- )
671
-
672
- # Create RAG pipeline with correct parameter names
673
- rag_pipeline = RAGPipeline(
674
- llm_client=llm_client,
675
- vector_store_manager=vector_store
676
- )
677
-
678
- # Store in session state
679
- st.session_state.vector_store = vector_store
680
- st.session_state.rag_pipeline = rag_pipeline
681
- st.session_state.collection_loaded = True
682
- st.session_state.current_collection = collection_name
683
- st.session_state.dataset_name = dataset_name
684
- st.session_state.dataset = dataset
685
- st.session_state.collection_name = collection_name
686
- st.session_state.embedding_model = embedding_model
687
- st.session_state.groq_api_key = api_key
688
- st.session_state.llm_provider = llm_provider
689
- st.session_state.vector_store_provider = vector_store_provider
690
-
691
- provider_icon = "☁️" if llm_provider == "groq" else "πŸ–₯️"
692
- vs_icon = "☁️" if vector_store_provider == "qdrant" else "πŸ’Ύ"
693
- st.success(f"βœ… Collection '{collection_name}' created successfully! {provider_icon} Using {llm_provider.upper()}")
694
- st.rerun()
695
-
696
- except Exception as e:
697
- st.error(f"Error: {str(e)}")
698
-
699
-
700
- def chat_interface():
701
- """Chat interface tab."""
702
- st.subheader("πŸ’¬ Chat Interface")
703
-
704
- # Check if collection is loaded
705
- if not st.session_state.collection_loaded:
706
- st.warning("⚠️ No data loaded. Please use the configuration panel to load a dataset and create a collection.")
707
- st.info("""
708
- Steps:
709
- 1. Select a dataset from the dropdown
710
- 2. Click "Load Data & Create Collection" button
711
- 3. Wait for the collection to be created
712
- 4. Then you can start chatting
713
- """)
714
- return
715
-
716
- # Display collection info and LLM selector
717
- col1, col2, col3 = st.columns([2, 2, 1])
718
- with col1:
719
- provider_icon = "☁️" if st.session_state.get("llm_provider", "groq") == "groq" else "πŸ–₯️"
720
- st.info(f"πŸ“š Collection: {st.session_state.current_collection} | {provider_icon} {st.session_state.get('llm_provider', 'groq').upper()}")
721
-
722
- with col2:
723
- # LLM selector for chat - based on provider
724
- current_provider = st.session_state.get("llm_provider", "groq")
725
- if current_provider == "groq":
726
- model_options = settings.llm_models
727
- try:
728
- current_index = settings.llm_models.index(st.session_state.current_llm)
729
- except ValueError:
730
- current_index = 0
731
- else:
732
- model_options = settings.ollama_models
733
- try:
734
- current_index = settings.ollama_models.index(st.session_state.current_llm)
735
- except ValueError:
736
- current_index = 0
737
-
738
- selected_llm = st.selectbox(
739
- f"Select {'Groq' if current_provider == 'groq' else 'Ollama'} Model for chat:",
740
- model_options,
741
- index=current_index,
742
- key="chat_llm_selector"
743
- )
744
-
745
- if selected_llm != st.session_state.current_llm:
746
- st.session_state.current_llm = selected_llm
747
- # Recreate LLM client with new model
748
- llm_client = create_llm_client(
749
- provider=current_provider,
750
- api_key=st.session_state.groq_api_key if "groq_api_key" in st.session_state else "",
751
- api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
752
- model_name=selected_llm,
753
- ollama_host=settings.ollama_host,
754
- max_rpm=settings.groq_rpm_limit,
755
- rate_limit_delay=settings.rate_limit_delay
756
- )
757
- st.session_state.rag_pipeline.llm = llm_client
758
-
759
- with col3:
760
- if st.button("πŸ—‘οΈ Clear History"):
761
- st.session_state.chat_history = []
762
- st.session_state.rag_pipeline.clear_history()
763
- st.rerun()
764
-
765
- # Show system prompt info in expandable section
766
- with st.expander("πŸ€– System Prompt & Model Info", expanded=False):
767
- current_provider = st.session_state.get("llm_provider", "groq")
768
- col1, col2 = st.columns(2)
769
- with col1:
770
- provider_icon = "☁️" if current_provider == "groq" else "πŸ–₯️"
771
- st.write(f"**Provider:** {provider_icon} {current_provider.upper()}")
772
- st.write(f"**LLM Model:** {st.session_state.current_llm}")
773
- st.write(f"**Temperature:** 0.0")
774
- st.write(f"**Max Tokens:** 2048")
775
- with col2:
776
- st.write(f"**Collection:** {st.session_state.current_collection}")
777
- st.write(f"**Dataset:** {st.session_state.get('dataset_name', 'N/A')}")
778
- if current_provider == "groq":
779
- st.write(f"**Rate Limit:** {settings.groq_rpm_limit} RPM")
780
- else:
781
- st.write(f"**Rate Limit:** βœ… Unlimited (Local)")
782
-
783
- st.markdown("#### System Prompt Being Used")
784
- system_prompt = """You are a Fact-Checking and Citation Specialist. Your task is to perform a rigorous audit of a response against provided documents to determine its accuracy, relevance, and level of support.
785
-
786
- **TASK OVERVIEW**
787
- 1. **Analyze Documents**: Review the provided documents and identify information relevant to the user's question.
788
- 2. **Evaluate Response**: Review the provided answer sentence-by-sentence.
789
- 3. **Verify Support**: Map each answer sentence to specific supporting sentences in the documents.
790
- 4. **Identify Utilization**: Determine which document sentences were actually used (directly or implicitly) to form the answer."""
791
- st.info(system_prompt)
792
-
793
- # Chat container
794
- chat_container = st.container()
795
-
796
- # Display chat history
797
- with chat_container:
798
- for chat_idx, entry in enumerate(st.session_state.chat_history):
799
- # User message
800
- with st.chat_message("user"):
801
- st.write(entry["query"])
802
-
803
- # Assistant message
804
- with st.chat_message("assistant"):
805
- st.write(entry["response"])
806
-
807
- # Show retrieved documents in expander
808
- with st.expander("πŸ“„ Retrieved Documents"):
809
- for doc_idx, doc in enumerate(entry["retrieved_documents"]):
810
- st.markdown(f"**Document {doc_idx+1}** (Distance: {doc.get('distance', 'N/A'):.4f})")
811
- st.text_area(
812
- f"doc_{chat_idx}_{doc_idx}",
813
- value=doc["document"],
814
- height=100,
815
- key=f"doc_area_{chat_idx}_{doc_idx}",
816
- label_visibility="collapsed"
817
- )
818
- if doc.get("metadata"):
819
- st.caption(f"Metadata: {doc['metadata']}")
820
-
821
- # Chat input
822
- query = st.chat_input("Ask a question...")
823
-
824
- if query:
825
- # Check if collection exists
826
- if not st.session_state.rag_pipeline or not st.session_state.rag_pipeline.vector_store.current_collection:
827
- st.error("❌ No data loaded. Please load a dataset first using the configuration panel.")
828
- st.stop()
829
-
830
- # Add user message
831
- with chat_container:
832
- with st.chat_message("user"):
833
- st.write(query)
834
-
835
- # Generate response
836
- with st.spinner("Generating response..."):
837
- try:
838
- result = st.session_state.rag_pipeline.query(query)
839
- except Exception as e:
840
- st.error(f"❌ Error querying: {str(e)}")
841
- st.info("Please load a dataset and create a collection first.")
842
- st.stop()
843
-
844
- # Add assistant message
845
- with chat_container:
846
- with st.chat_message("assistant"):
847
- st.write(result["response"])
848
-
849
- # Show retrieved documents
850
- with st.expander("πŸ“„ Retrieved Documents"):
851
- for doc_idx, doc in enumerate(result["retrieved_documents"]):
852
- st.markdown(f"**Document {doc_idx+1}** (Distance: {doc.get('distance', 'N/A'):.4f})")
853
- st.text_area(
854
- f"doc_current_{doc_idx}",
855
- value=doc["document"],
856
- height=100,
857
- key=f"doc_current_area_{doc_idx}",
858
- label_visibility="collapsed"
859
- )
860
- if doc.get("metadata"):
861
- st.caption(f"Metadata: {doc['metadata']}")
862
-
863
- # Store in history
864
- st.session_state.chat_history.append(result)
865
- st.rerun()
866
 
 
 
 
 
 
 
867
 
868
- def evaluation_interface():
869
- """Evaluation interface tab."""
870
- st.subheader("πŸ“Š RAG Evaluation")
871
-
872
- # Check if collection is loaded
873
- if not st.session_state.collection_loaded:
874
- st.warning("⚠️ No data loaded. Please load a collection first.")
875
- return
876
-
877
- # Evaluation method selector
878
- eval_col1, eval_col2 = st.columns([2, 1])
879
- with eval_col1:
880
- evaluation_method = st.radio(
881
- "Evaluation Method:",
882
- options=["TRACE (Heuristic)", "GPT Labeling (LLM-based)", "Hybrid (Both)"],
883
- horizontal=True,
884
- help="TRACE is fast (no LLM). GPT Labeling is accurate but slower (requires LLM calls)."
885
- )
886
-
887
- # Map UI labels to method IDs
888
- method_map = {
889
- "TRACE (Heuristic)": "trace",
890
- "GPT Labeling (LLM-based)": "gpt_labeling",
891
- "Hybrid (Both)": "hybrid"
892
- }
893
- selected_method = method_map[evaluation_method]
894
-
895
- # LLM selector for evaluation
896
- current_provider = st.session_state.get("llm_provider", "groq")
897
- col1, col2 = st.columns([3, 1])
898
- with col1:
899
- # Show provider-specific models
900
- if current_provider == "groq":
901
- model_options = settings.llm_models
902
- try:
903
- current_index = settings.llm_models.index(st.session_state.current_llm)
904
- except ValueError:
905
- current_index = 0
906
- else:
907
- model_options = settings.ollama_models
908
- try:
909
- current_index = settings.ollama_models.index(st.session_state.current_llm)
910
- except ValueError:
911
- current_index = 0
912
-
913
- selected_llm = st.selectbox(
914
- f"Select {'Groq' if current_provider == 'groq' else 'Ollama'} Model for evaluation:",
915
- model_options,
916
- index=current_index,
917
- key="eval_llm_selector"
918
- )
919
-
920
- # Show provider info
921
- provider_icon = "☁️" if current_provider == "groq" else "πŸ–₯️"
922
- if current_provider == "ollama":
923
- st.caption(f"{provider_icon} Using local Ollama - **No rate limits!** Fast evaluation possible.")
924
- else:
925
- st.caption(f"{provider_icon} Using Groq API - Rate limited to {settings.groq_rpm_limit} RPM")
926
-
927
- # Show method description
928
- method_descriptions = {
929
- "trace": """
930
- **TRACE Heuristic Method** (Fast, Rule-Based)
931
- - Utilization: How well the system uses retrieved documents
932
- - Relevance: Relevance of retrieved documents to the query
933
- - Adherence: How well the response adheres to the retrieved context
934
- - Completeness: How complete the response is in answering the query
935
- - ⚑ Speed: ~100ms per evaluation
936
- - πŸ’° Cost: Free (no API calls)
937
- """,
938
- "gpt_labeling": """
939
- **GPT Labeling Method** (Accurate, LLM-based)
940
- - Uses sentence-level LLM analysis (from RAGBench paper)
941
- - Context Relevance: Fraction of context relevant to query
942
- - Context Utilization: Fraction of relevant context used
943
- - Completeness: Fraction of relevant info covered
944
- - Adherence: Response supported by context (no hallucinations)
945
- - ⏱️ Speed: ~2-5 seconds per evaluation
946
- - πŸ’° Cost: ~$0.002-0.01 per evaluation
947
- """,
948
- "hybrid": """
949
- **Hybrid Method** (Comprehensive)
950
- - Runs both TRACE and GPT Labeling methods
951
- - Provides both fast and accurate evaluation metrics
952
- - Best for detailed analysis
953
- - ⏱️ Speed: ~3-6 seconds per evaluation
954
- - πŸ’° Cost: Same as GPT Labeling
955
- """
956
- }
957
-
958
- st.markdown(method_descriptions[selected_method])
959
-
960
- # Get maximum test samples available for current dataset
961
- try:
962
- loader = RAGBenchLoader()
963
- max_test_samples = loader.get_test_data_size(st.session_state.dataset_name)
964
- st.caption(f"πŸ“Š Available test samples: {max_test_samples:,}")
965
- except Exception as e:
966
- max_test_samples = 100
967
- st.caption(f"Available test samples: ~{max_test_samples} (estimated)")
968
-
969
- # Ensure min and max are reasonable
970
- max_test_samples = max(5, min(max_test_samples, 500)) # Cap at 500 for performance
971
-
972
- num_test_samples = st.slider(
973
- "Number of test samples",
974
- min_value=5,
975
- max_value=max_test_samples,
976
- value=min(10, max_test_samples),
977
- step=5
978
- )
979
-
980
- # Show warning for GPT labeling (API cost) - only for Groq
981
- if selected_method in ["gpt_labeling", "hybrid"]:
982
- current_provider = st.session_state.get("llm_provider", "groq")
983
- if current_provider == "groq":
984
- st.warning(f"⚠️ **{evaluation_method}** requires LLM API calls. This will incur costs and be slower due to rate limiting ({settings.groq_rpm_limit} RPM).")
985
- else:
986
- st.info(f"ℹ️ **{evaluation_method}** using local Ollama - **No rate limits!** Evaluation will be much faster.")
987
-
988
- if st.button("πŸ”¬ Run Evaluation", type="primary"):
989
- # Use selected LLM for evaluation
990
- run_evaluation(num_test_samples, selected_llm, selected_method)
991
-
992
- # Display results
993
- if st.session_state.evaluation_results:
994
- results = st.session_state.evaluation_results
995
-
996
- st.success("βœ… Evaluation Complete!")
997
- st.divider()
998
- st.markdown("## πŸ“Š Evaluation Metrics")
999
-
1000
- # Display aggregate scores - handle both TRACE and GPT Labeling metric names
1001
- st.markdown("### Main Metrics")
1002
- col1, col2, col3, col4, col5 = st.columns(5)
1003
-
1004
- # Determine which metrics are available
1005
- utilization = results.get('utilization') or results.get('context_utilization', 0)
1006
- relevance = results.get('relevance') or results.get('context_relevance', 0)
1007
- adherence = results.get('adherence', 0)
1008
- completeness = results.get('completeness', 0)
1009
- average = results.get('average', 0)
1010
-
1011
- with col1:
1012
- st.metric("πŸ“Š Utilization", f"{utilization:.3f}")
1013
- with col2:
1014
- st.metric("🎯 Relevance", f"{relevance:.3f}")
1015
- with col3:
1016
- st.metric("βœ… Adherence", f"{adherence:.3f}")
1017
- with col4:
1018
- st.metric("πŸ“ Completeness", f"{completeness:.3f}")
1019
- with col5:
1020
- st.metric("⭐ Average", f"{average:.3f}")
1021
-
1022
- # Detailed results summary - handle both metric types
1023
- if "individual_scores" in results:
1024
- with st.expander("πŸ“‹ Summary Metrics by Query"):
1025
- df = pd.DataFrame(results["individual_scores"])
1026
- st.dataframe(df, use_container_width=True)
1027
-
1028
- # Detailed per-query results
1029
- if "detailed_results" in results and results["detailed_results"]:
1030
- with st.expander("πŸ” Detailed Per-Query Analysis"):
1031
- for query_result in results.get("detailed_results", []):
1032
- with st.expander(f"Query {query_result['query_id']}: {query_result['question'][:60]}..."):
1033
- st.markdown("### Question")
1034
- st.write(query_result['question'])
1035
-
1036
- st.markdown("### LLM Response")
1037
- st.write(query_result.get('llm_response', 'N/A'))
1038
-
1039
- st.markdown("### Retrieved Documents")
1040
- for doc_idx, doc in enumerate(query_result.get('retrieved_documents', []), 1):
1041
- with st.expander(f"πŸ“„ Document {doc_idx}"):
1042
- st.write(doc)
1043
-
1044
- if query_result.get('ground_truth'):
1045
- st.markdown("### Ground Truth")
1046
- st.write(query_result['ground_truth'])
1047
-
1048
- # Display metrics with correct labels based on method
1049
- metrics = query_result.get('metrics', {})
1050
- if metrics:
1051
- st.markdown("### Evaluation Metrics")
1052
- col1, col2, col3, col4, col5 = st.columns(5)
1053
-
1054
- # Get metric values (handle both TRACE and GPT names)
1055
- util_val = metrics.get('utilization') or metrics.get('context_utilization', 0)
1056
- rel_val = metrics.get('relevance') or metrics.get('context_relevance', 0)
1057
- adh_val = metrics.get('adherence', 0)
1058
- comp_val = metrics.get('completeness', 0)
1059
- avg_val = metrics.get('average', 0)
1060
-
1061
- with col1:
1062
- st.metric("Util", f"{util_val:.3f}")
1063
- with col2:
1064
- st.metric("Rel", f"{rel_val:.3f}")
1065
- with col3:
1066
- st.metric("Adh", f"{adh_val:.3f}")
1067
- with col4:
1068
- st.metric("Comp", f"{comp_val:.3f}")
1069
- with col5:
1070
- st.metric("Avg", f"{avg_val:.3f}")
1071
-
1072
- # For GPT Labeling and Hybrid methods, show additional metrics
1073
- method = results.get("method", "")
1074
- if "gpt_labeling" in method or "hybrid" in method:
1075
- # Show RMSE aggregation metrics (consistency across evaluations)
1076
- if "rmse_metrics" in results:
1077
- st.markdown("### πŸ“Š RMSE Aggregation (Metric Consistency)")
1078
- rmse_data = results.get("rmse_metrics", {})
1079
-
1080
- rmse_cols = st.columns(4)
1081
- with rmse_cols[0]:
1082
- rel_mean = rmse_data.get("context_relevance", {}).get("mean", 0)
1083
- rel_std = rmse_data.get("context_relevance", {}).get("std_dev", 0)
1084
- st.metric("Relevance", f"{rel_mean:.3f} Β±{rel_std:.3f}", help="Mean and Std Dev")
1085
- with rmse_cols[1]:
1086
- util_mean = rmse_data.get("context_utilization", {}).get("mean", 0)
1087
- util_std = rmse_data.get("context_utilization", {}).get("std_dev", 0)
1088
- st.metric("Utilization", f"{util_mean:.3f} Β±{util_std:.3f}", help="Mean and Std Dev")
1089
- with rmse_cols[2]:
1090
- comp_mean = rmse_data.get("completeness", {}).get("mean", 0)
1091
- comp_std = rmse_data.get("completeness", {}).get("std_dev", 0)
1092
- st.metric("Completeness", f"{comp_mean:.3f} Β±{comp_std:.3f}", help="Mean and Std Dev")
1093
- with rmse_cols[3]:
1094
- adh_mean = rmse_data.get("adherence", {}).get("mean", 0)
1095
- adh_std = rmse_data.get("adherence", {}).get("std_dev", 0)
1096
- st.metric("Adherence", f"{adh_mean:.3f} Β±{adh_std:.3f}", help="Mean and Std Dev")
1097
-
1098
- # Show detailed RMSE statistics in expander
1099
- with st.expander("See detailed RMSE aggregation statistics"):
1100
- for metric_name, metric_data in rmse_data.items():
1101
- st.write(f"**{metric_name}**")
1102
- col1, col2, col3, col4 = st.columns(4)
1103
- with col1:
1104
- st.write(f"Mean: {metric_data.get('mean', 0):.4f}")
1105
- with col2:
1106
- st.write(f"Std Dev: {metric_data.get('std_dev', 0):.4f}")
1107
- with col3:
1108
- st.write(f"Min: {metric_data.get('min', 0):.4f}")
1109
- with col4:
1110
- st.write(f"Max: {metric_data.get('max', 0):.4f}")
1111
-
1112
- # Show per-metric statistics if available
1113
- if "per_metric_statistics" in results:
1114
- st.markdown("### πŸ“ˆ Per-Metric Statistics (Distribution)")
1115
- stats_data = results.get("per_metric_statistics", {})
1116
-
1117
- stats_cols = st.columns(4)
1118
- with stats_cols[0]:
1119
- rel_stats = stats_data.get("context_relevance", {})
1120
- st.metric("Relevance Mean", f"{rel_stats.get('mean', 0):.3f}", help=f"Median: {rel_stats.get('median', 0):.3f}")
1121
- with stats_cols[1]:
1122
- util_stats = stats_data.get("context_utilization", {})
1123
- st.metric("Utilization Mean", f"{util_stats.get('mean', 0):.3f}", help=f"Median: {util_stats.get('median', 0):.3f}")
1124
- with stats_cols[2]:
1125
- comp_stats = stats_data.get("completeness", {})
1126
- st.metric("Completeness Mean", f"{comp_stats.get('mean', 0):.3f}", help=f"Median: {comp_stats.get('median', 0):.3f}")
1127
- with stats_cols[3]:
1128
- adh_stats = stats_data.get("adherence", {})
1129
- st.metric("Adherence Mean", f"{adh_stats.get('mean', 0):.3f}", help=f"Median: {adh_stats.get('median', 0):.3f}")
1130
-
1131
- # Show detailed statistics
1132
- with st.expander("See detailed per-metric statistics"):
1133
- for metric_name, metric_stats in stats_data.items():
1134
- st.write(f"**{metric_name}**")
1135
- col1, col2 = st.columns(2)
1136
- with col1:
1137
- st.write(f"""
1138
- - Mean: {metric_stats.get('mean', 0):.4f}
1139
- - Median: {metric_stats.get('median', 0):.4f}
1140
- - Std Dev: {metric_stats.get('std_dev', 0):.4f}
1141
- - Min: {metric_stats.get('min', 0):.4f}
1142
- - Max: {metric_stats.get('max', 0):.4f}
1143
- """)
1144
- with col2:
1145
- st.write(f"""
1146
- - 25th percentile: {metric_stats.get('percentile_25', 0):.4f}
1147
- - 75th percentile: {metric_stats.get('percentile_75', 0):.4f}
1148
- - Perfect (>=0.95): {metric_stats.get('perfect_count', 0)}
1149
- - Poor (<0.3): {metric_stats.get('poor_count', 0)}
1150
- - Samples: {metric_stats.get('sample_count', 0)}
1151
- """)
1152
-
1153
- # Show RMSE vs RAGBench Ground Truth (per RAGBench paper requirement)
1154
- if "rmse_vs_ground_truth" in results:
1155
- st.markdown("### πŸ“‰ RMSE vs RAGBench Ground Truth")
1156
- st.info("Compares predicted TRACE scores against original RAGBench dataset scores")
1157
- rmse_gt = results.get("rmse_vs_ground_truth", {})
1158
- per_metric_rmse = rmse_gt.get("per_metric_rmse", {})
1159
-
1160
- if per_metric_rmse:
1161
- rmse_gt_cols = st.columns(5)
1162
- with rmse_gt_cols[0]:
1163
- st.metric("Relevance RMSE", f"{per_metric_rmse.get('context_relevance', 0):.4f}",
1164
- delta=None, help="Lower is better (0 = perfect match)")
1165
- with rmse_gt_cols[1]:
1166
- st.metric("Utilization RMSE", f"{per_metric_rmse.get('context_utilization', 0):.4f}")
1167
- with rmse_gt_cols[2]:
1168
- st.metric("Completeness RMSE", f"{per_metric_rmse.get('completeness', 0):.4f}")
1169
- with rmse_gt_cols[3]:
1170
- st.metric("Adherence RMSE", f"{per_metric_rmse.get('adherence', 0):.4f}")
1171
- with rmse_gt_cols[4]:
1172
- agg_rmse = rmse_gt.get("aggregated_rmse", 0)
1173
- consistency = rmse_gt.get("consistency_score", 0)
1174
- st.metric("Aggregated RMSE", f"{agg_rmse:.4f}",
1175
- delta=f"Consistency: {consistency:.2%}", delta_color="normal")
1176
-
1177
- # Show AUCROC vs RAGBench Ground Truth (per RAGBench paper requirement)
1178
- if "aucroc_vs_ground_truth" in results:
1179
- st.markdown("### πŸ“Š AUC-ROC vs RAGBench Ground Truth")
1180
- st.info("Area Under ROC Curve comparing predicted vs ground truth binary classifications")
1181
- auc_gt = results.get("aucroc_vs_ground_truth", {})
1182
-
1183
- if auc_gt:
1184
- auc_cols = st.columns(5)
1185
- with auc_cols[0]:
1186
- st.metric("Relevance AUC", f"{auc_gt.get('context_relevance', 0):.4f}",
1187
- help="Higher is better (1.0 = perfect classification)")
1188
- with auc_cols[1]:
1189
- st.metric("Utilization AUC", f"{auc_gt.get('context_utilization', 0):.4f}")
1190
- with auc_cols[2]:
1191
- st.metric("Completeness AUC", f"{auc_gt.get('completeness', 0):.4f}")
1192
- with auc_cols[3]:
1193
- st.metric("Adherence AUC", f"{auc_gt.get('adherence', 0):.4f}")
1194
- with auc_cols[4]:
1195
- avg_auc = auc_gt.get("average", 0)
1196
- st.metric("Average AUC", f"{avg_auc:.4f}")
1197
-
1198
- # Download results
1199
- st.divider()
1200
- st.markdown("## πŸ’Ύ Download Results")
1201
-
1202
- # Create a comprehensive download with all details
1203
- download_data = {
1204
- "evaluation_metadata": {
1205
- "timestamp": datetime.now().isoformat(),
1206
- "dataset": st.session_state.dataset_name,
1207
- "method": results.get("evaluation_config", {}).get("evaluation_method", "gpt_labeling_prompts"),
1208
- "total_samples": results.get("num_samples", 0),
1209
- "embedding_model": st.session_state.embedding_model,
1210
- },
1211
- "aggregate_metrics": {
1212
- "context_relevance": results.get("context_relevance") or results.get("relevance", 0),
1213
- "context_utilization": results.get("context_utilization") or results.get("utilization", 0),
1214
- "completeness": results.get("completeness", 0),
1215
- "adherence": results.get("adherence", 0),
1216
- "average": results.get("average", 0),
1217
- },
1218
- "rmse_metrics": results.get("rmse_metrics", {}),
1219
- "per_metric_statistics": results.get("per_metric_statistics", {}),
1220
- "rmse_vs_ground_truth": results.get("rmse_vs_ground_truth", {}),
1221
- "aucroc_vs_ground_truth": results.get("aucroc_vs_ground_truth", {}),
1222
- "detailed_results": results.get("detailed_results", [])
1223
- }
1224
-
1225
- results_json = json.dumps(download_data, indent=2, default=str)
1226
-
1227
- col1, col2 = st.columns(2)
1228
- with col1:
1229
- st.download_button(
1230
- label="πŸ“₯ Download Complete Results (JSON)",
1231
- data=results_json,
1232
- file_name=f"evaluation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1233
- mime="application/json",
1234
- help="Download all evaluation results including metrics and per-query details"
1235
- )
1236
- with col2:
1237
- st.download_button(
1238
- label="πŸ“‹ Download Metrics Only (JSON)",
1239
- data=json.dumps(download_data["aggregate_metrics"], indent=2),
1240
- file_name=f"evaluation_metrics_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1241
- mime="application/json",
1242
- help="Download only the aggregate metrics"
1243
- )
1244
 
 
 
 
 
 
1245
 
1246
- def run_evaluation(num_samples: int, selected_llm: str = None, method: str = "trace"):
1247
- """Run evaluation using selected method (TRACE, GPT Labeling, or Hybrid).
1248
-
1249
- Args:
1250
- num_samples: Number of test samples to evaluate
1251
- selected_llm: LLM model to use for evaluation
1252
- method: Evaluation method ("trace", "gpt_labeling", or "hybrid")
1253
- """
1254
- with st.spinner(f"Running evaluation on {num_samples} samples..."):
1255
- try:
1256
- # Create logs container
1257
- logs_container = st.container()
1258
- logs_list = []
1259
-
1260
- # Display logs header once outside function
1261
- logs_placeholder = st.empty()
1262
-
1263
- def add_log(message: str):
1264
- """Add log message and update display."""
1265
- logs_list.append(message)
1266
- with logs_placeholder.container():
1267
- st.markdown("### πŸ“‹ Evaluation Logs:")
1268
- for log_msg in logs_list:
1269
- st.caption(log_msg)
1270
-
1271
- # Log evaluation start
1272
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
1273
- add_log(f"⏱️ Evaluation started at {timestamp}")
1274
- add_log(f"πŸ“Š Dataset: {st.session_state.dataset_name}")
1275
- add_log(f"πŸ“ˆ Total samples: {num_samples}")
1276
- add_log(f"πŸ€– LLM Model: {selected_llm if selected_llm else st.session_state.current_llm}")
1277
- add_log(f"πŸ”— Vector Store: {st.session_state.collection_name}")
1278
- add_log(f"🧠 Embedding Model: {st.session_state.embedding_model}")
1279
-
1280
- # Map method names
1281
- method_names = {
1282
- "trace": "TRACE (Heuristic)",
1283
- "gpt_labeling": "GPT Labeling (LLM-based)",
1284
- "hybrid": "Hybrid (Both)"
1285
- }
1286
- add_log(f"πŸ”¬ Evaluation Method: {method_names.get(method, method)}")
1287
-
1288
- # Use selected LLM if provided - create with appropriate provider
1289
- eval_llm_client = None
1290
- original_llm = None
1291
- current_provider = st.session_state.get("llm_provider", "groq")
1292
-
1293
- if selected_llm and selected_llm != st.session_state.current_llm:
1294
- add_log(f"πŸ”„ Switching LLM to {selected_llm} ({current_provider.upper()})...")
1295
- groq_api_key = st.session_state.groq_api_key if "groq_api_key" in st.session_state else ""
1296
- eval_llm_client = create_llm_client(
1297
- provider=current_provider,
1298
- api_key=groq_api_key,
1299
- api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
1300
- model_name=selected_llm,
1301
- ollama_host=settings.ollama_host,
1302
- max_rpm=settings.groq_rpm_limit,
1303
- rate_limit_delay=settings.rate_limit_delay,
1304
- max_retries=settings.max_retries,
1305
- retry_delay=settings.retry_delay
1306
- )
1307
- # Temporarily replace LLM client
1308
- original_llm = st.session_state.rag_pipeline.llm
1309
- st.session_state.rag_pipeline.llm = eval_llm_client
1310
- else:
1311
- eval_llm_client = st.session_state.rag_pipeline.llm
1312
-
1313
- # Log provider info
1314
- provider_icon = "☁️" if current_provider == "groq" else "πŸ–₯️"
1315
- add_log(f"{provider_icon} LLM Provider: {current_provider.upper()}")
1316
-
1317
- # Get test data
1318
- add_log("πŸ“₯ Loading test data...")
1319
- loader = RAGBenchLoader()
1320
- test_data = loader.get_test_data(
1321
- st.session_state.dataset_name,
1322
- num_samples
1323
- )
1324
- add_log(f"βœ… Loaded {len(test_data)} test samples")
1325
-
1326
- # Prepare test cases
1327
- test_cases = []
1328
-
1329
- progress_bar = st.progress(0)
1330
- status_text = st.empty()
1331
-
1332
- add_log("πŸ” Processing samples...")
1333
- for i, sample in enumerate(test_data):
1334
- status_text.text(f"Processing sample {i+1}/{num_samples}")
1335
-
1336
- # Query the RAG system
1337
- result = st.session_state.rag_pipeline.query(
1338
- sample["question"],
1339
- n_results=5
1340
- )
1341
-
1342
- # Prepare test case
1343
- test_cases.append({
1344
- "query": sample["question"],
1345
- "response": result["response"],
1346
- "retrieved_documents": [doc["document"] for doc in result["retrieved_documents"]],
1347
- "ground_truth": sample.get("answer", "")
1348
- })
1349
-
1350
- # Update progress
1351
- progress_bar.progress((i + 1) / num_samples)
1352
-
1353
- # Log every 10 samples
1354
- if (i + 1) % 10 == 0 or (i + 1) == num_samples:
1355
- add_log(f" βœ“ Processed {i + 1}/{num_samples} samples")
1356
-
1357
- status_text.text(f"Running {method_names.get(method, method)} evaluation...")
1358
- add_log(f"πŸ“Š Running evaluation using {method_names.get(method, method)}...")
1359
-
1360
- # Extract chunking and embedding metadata from session state
1361
- # (These were stored when the collection was loaded/created)
1362
- chunking_strategy = st.session_state.vector_store.chunking_strategy if st.session_state.vector_store else None
1363
- embedding_model = st.session_state.embedding_model
1364
- chunk_size = st.session_state.vector_store.chunk_size if st.session_state.vector_store else None
1365
- chunk_overlap = st.session_state.vector_store.chunk_overlap if st.session_state.vector_store else None
1366
-
1367
- # Log retrieval configuration
1368
- add_log(f"πŸ”§ Retrieval Configuration:")
1369
- add_log(f" β€’ Chunking Strategy: {chunking_strategy or 'Unknown'}")
1370
- add_log(f" β€’ Chunk Size: {chunk_size or 'Unknown'}")
1371
- add_log(f" β€’ Chunk Overlap: {chunk_overlap or 'Unknown'}")
1372
- add_log(f" β€’ Embedding Model: {embedding_model or 'Unknown'}")
1373
-
1374
- # Import unified pipeline
1375
- try:
1376
- from evaluation_pipeline import UnifiedEvaluationPipeline
1377
-
1378
- # Run evaluation with metadata using unified pipeline
1379
- pipeline = UnifiedEvaluationPipeline(
1380
- llm_client=eval_llm_client,
1381
- chunking_strategy=chunking_strategy,
1382
- embedding_model=embedding_model,
1383
- chunk_size=chunk_size,
1384
- chunk_overlap=chunk_overlap
1385
- )
1386
-
1387
- # Run evaluation with selected method
1388
- results = pipeline.evaluate_batch(test_cases, method=method)
1389
-
1390
- except ImportError:
1391
- # Fallback to TRACE only if evaluation_pipeline module not available
1392
- add_log("⚠️ evaluation_pipeline module not found, falling back to TRACE...")
1393
-
1394
- # Run evaluation with metadata using TRACE
1395
- evaluator = TRACEEvaluator(
1396
- chunking_strategy=chunking_strategy,
1397
- embedding_model=embedding_model,
1398
- chunk_size=chunk_size,
1399
- chunk_overlap=chunk_overlap
1400
- )
1401
- results = evaluator.evaluate_batch(test_cases)
1402
-
1403
- st.session_state.evaluation_results = results
1404
-
1405
- # Log evaluation results summary
1406
- add_log("βœ… Evaluation completed successfully!")
1407
-
1408
- # Display appropriate metrics based on method
1409
- if method == "trace":
1410
- add_log(f" β€’ Utilization: {results.get('utilization', 0):.2%}")
1411
- add_log(f" β€’ Relevance: {results.get('relevance', 0):.2%}")
1412
- add_log(f" β€’ Adherence: {results.get('adherence', 0):.2%}")
1413
- add_log(f" β€’ Completeness: {results.get('completeness', 0):.2%}")
1414
- add_log(f" β€’ Average: {results.get('average', 0):.2%}")
1415
- elif method == "gpt_labeling":
1416
- if "context_relevance" in results:
1417
- add_log(f" β€’ Context Relevance: {results.get('context_relevance', 0):.2%}")
1418
- add_log(f" β€’ Context Utilization: {results.get('context_utilization', 0):.2%}")
1419
- add_log(f" β€’ Completeness: {results.get('completeness', 0):.2%}")
1420
- add_log(f" β€’ Adherence: {results.get('adherence', 0):.2%}")
1421
- add_log(f" β€’ Average: {results.get('average', 0):.2%}")
1422
- # NEW: Display RMSE and AUCROC metrics if available
1423
- if "rmse_metrics" in results:
1424
- add_log(f"πŸ“ˆ RMSE Metrics (vs ground truth):")
1425
- rmse_metrics = results.get("rmse_metrics", {})
1426
- add_log(f" β€’ Context Relevance RMSE: {rmse_metrics.get('relevance', 0):.4f}")
1427
- add_log(f" β€’ Context Utilization RMSE: {rmse_metrics.get('utilization', 0):.4f}")
1428
- add_log(f" β€’ Completeness RMSE: {rmse_metrics.get('completeness', 0):.4f}")
1429
- add_log(f" β€’ Adherence RMSE: {rmse_metrics.get('adherence', 0):.4f}")
1430
- add_log(f" β€’ Average RMSE: {rmse_metrics.get('average', 0):.4f}")
1431
- if "auc_metrics" in results:
1432
- add_log(f"πŸ“Š AUCROC Metrics (binary classification):")
1433
- auc_metrics = results.get("auc_metrics", {})
1434
- add_log(f" β€’ Context Relevance AUCROC: {auc_metrics.get('relevance', 0):.4f}")
1435
- add_log(f" β€’ Context Utilization AUCROC: {auc_metrics.get('utilization', 0):.4f}")
1436
- add_log(f" β€’ Completeness AUCROC: {auc_metrics.get('completeness', 0):.4f}")
1437
- add_log(f" β€’ Adherence AUCROC: {auc_metrics.get('adherence', 0):.4f}")
1438
- add_log(f" β€’ Average AUCROC: {auc_metrics.get('average', 0):.4f}")
1439
- elif method == "hybrid":
1440
- add_log(" πŸ“Š TRACE Metrics:")
1441
- trace_res = results.get("trace_results", {})
1442
- add_log(f" β€’ Utilization: {trace_res.get('utilization', 0):.2%}")
1443
- add_log(f" β€’ Relevance: {trace_res.get('relevance', 0):.2%}")
1444
- add_log(f" β€’ Adherence: {trace_res.get('adherence', 0):.2%}")
1445
- add_log(f" β€’ Completeness: {trace_res.get('completeness', 0):.2%}")
1446
- add_log(" 🧠 GPT Labeling Metrics:")
1447
- gpt_res = results.get("gpt_results", {})
1448
- add_log(f" β€’ Context Relevance: {gpt_res.get('context_relevance', 0):.2%}")
1449
- add_log(f" β€’ Context Utilization: {gpt_res.get('context_utilization', 0):.2%}")
1450
- add_log(f" β€’ Completeness: {gpt_res.get('completeness', 0):.2%}")
1451
- add_log(f" β€’ Adherence: {gpt_res.get('adherence', 0):.2%}")
1452
-
1453
- # Restore original LLM if it was switched
1454
- if selected_llm and selected_llm != st.session_state.current_llm and original_llm:
1455
- st.session_state.rag_pipeline.llm = original_llm
1456
- add_log(f"πŸ”„ Restored original LLM")
1457
-
1458
- add_log(f"⏱️ Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1459
-
1460
- except Exception as e:
1461
- st.error(f"Error during evaluation: {str(e)}")
1462
- add_log(f"❌ Error: {str(e)}")
1463
 
 
 
 
 
 
1464
 
1465
- def history_interface():
1466
- """History interface tab."""
1467
- st.subheader("πŸ“œ Chat History")
1468
-
1469
- if not st.session_state.chat_history:
1470
- st.info("No chat history yet. Start a conversation in the Chat tab!")
1471
- return
1472
-
1473
- # Export history
1474
- col1, col2 = st.columns([3, 1])
1475
- with col2:
1476
- history_json = json.dumps(st.session_state.chat_history, indent=2)
1477
- st.download_button(
1478
- label="πŸ’Ύ Export History",
1479
- data=history_json,
1480
- file_name=f"chat_history_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1481
- mime="application/json"
1482
- )
1483
-
1484
- # Display history
1485
- for i, entry in enumerate(st.session_state.chat_history):
1486
- with st.expander(f"πŸ’¬ Conversation {i+1}: {entry['query'][:50]}..."):
1487
- st.markdown(f"**Query:** {entry['query']}")
1488
- st.markdown(f"**Response:** {entry['response']}")
1489
- st.markdown(f"**Timestamp:** {entry.get('timestamp', 'N/A')}")
1490
-
1491
- st.markdown("**Retrieved Documents:**")
1492
- for j, doc in enumerate(entry["retrieved_documents"]):
1493
- st.text_area(
1494
- f"Document {j+1}",
1495
- value=doc["document"],
1496
- height=100,
1497
- key=f"history_doc_{i}_{j}"
1498
- )
1499
 
 
 
 
 
 
1500
 
1501
- if __name__ == "__main__":
1502
- main()
 
1
"""Simple Hello World app to debug a HuggingFace Space deployment.

Renders a minimal page first (so a blank Space is immediately
distinguishable from an import crash), then probes each heavy
dependency one at a time and reports success/failure in the UI.
"""
import importlib
import sys

import streamlit as st

st.set_page_config(
    page_title="RAG Capstone - Test",
    page_icon="πŸ€–",
    layout="wide"
)

st.title("πŸ€– Hello World!")
st.write("If you can see this, the HuggingFace Space is working!")

st.success("βœ… Streamlit is running successfully on port 7860")

st.info(f"Python version: {sys.version}")


def _check_import(module_name: str, show_version: bool = False) -> None:
    """Try to import *module_name* and report the outcome in the UI.

    Args:
        module_name: Importable top-level module name (e.g. ``"pandas"``).
        show_version: If True, append the module's ``__version__`` to the
            success message (used for torch).
    """
    try:
        module = importlib.import_module(module_name)
    except Exception as e:  # broad on purpose: any failure must be visible
        st.error(f"❌ {module_name}: {e}")
    else:
        if show_version:
            st.success(f"βœ… {module_name} imported (version: {module.__version__})")
        else:
            st.success(f"βœ… {module_name} imported")


# Test basic imports β€” one probe per heavy dependency, in the same order
# as the original debug script. (Probing the top-level package is enough
# to detect a broken install; attribute resolution is left to the real app.)
_check_import("pandas")
_check_import("numpy")
_check_import("groq")
_check_import("torch", show_version=True)
_check_import("sentence_transformers")
_check_import("chromadb")
_check_import("qdrant_client")

st.markdown("---")
st.write("All basic imports completed!")
streamlit_app_backup.py ADDED
@@ -0,0 +1,1502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit chat interface for RAG application."""
2
+ import streamlit as st
3
+ import sys
4
+ import os
5
+ from datetime import datetime
6
+ import json
7
+ import pandas as pd
8
+ from typing import Optional
9
+ import warnings
10
+
11
# Suppress warnings so the Streamlit log stays readable on Spaces.
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # silence TensorFlow C++ log spam

# Add parent directory to path so sibling project modules resolve
# when this file is run directly as a script.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Check if running on HuggingFace Spaces (Spaces sets SPACE_ID; the
# Dockerfile also sets it explicitly).
IS_HUGGINGFACE_SPACE = os.environ.get("SPACE_ID") is not None

from config import settings
from dataset_loader import RAGBenchLoader
from vector_store import ChromaDBManager, create_vector_store
try:
    from vector_store import QdrantManager, QDRANT_AVAILABLE
except ImportError:
    # Qdrant support is optional; fall back to ChromaDB-only mode.
    QDRANT_AVAILABLE = False
from llm_client import GroqLLMClient, OllamaLLMClient, RAGPipeline, create_llm_client
from trace_evaluator import TRACEEvaluator
from embedding_models import EmbeddingFactory
from chunking_strategies import ChunkingFactory


# Page configuration (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="RAG Capstone Project",
    page_icon="πŸ€–",
    layout="wide"
)
40
+
41
# Initialize session state: every key the app reads is seeded exactly
# once per browser session. Insertion order matches the original
# one-guard-per-key version.
_SESSION_DEFAULTS = {
    "chat_history": [],
    "rag_pipeline": None,
    "vector_store": None,
    "collection_loaded": False,
    "evaluation_results": None,
    "dataset_size": 10000,
    "current_dataset": None,
    "current_llm": settings.llm_models[1],
    "selected_collection": None,
    "available_collections": [],
    "dataset_name": None,
    "collection_name": None,
    "embedding_model": None,
    "groq_api_key": "",
    "llm_provider": settings.llm_provider,
    "ollama_model": settings.ollama_model,
    "vector_store_provider": settings.vector_store_provider,
    "qdrant_url": settings.qdrant_url,
    "qdrant_api_key": settings.qdrant_api_key,
}

for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
98
+
99
+
100
def get_available_collections(provider: str = None):
    """Return the collection names known to the active vector store.

    When *provider* is not given, falls back to the provider stored in
    session state (default ``"chroma"``).  Any backend failure is
    printed to stdout and reported as an empty list so the UI never
    crashes on a connectivity problem.
    """
    backend = provider or st.session_state.get("vector_store_provider", "chroma")

    try:
        if backend == "qdrant" and QDRANT_AVAILABLE:
            url = st.session_state.get("qdrant_url") or settings.qdrant_url
            key = st.session_state.get("qdrant_api_key") or settings.qdrant_api_key
            if not (url and key):
                # No credentials configured yet -- nothing to list.
                return []
            store = create_vector_store("qdrant", url=url, api_key=key)
        else:
            # ChromaDB path (also taken when Qdrant support is unavailable).
            store = ChromaDBManager(settings.chroma_persist_directory)
        return store.list_collections()
    except Exception as e:
        print(f"Error getting collections: {e}")
        return []
120
+
121
+
122
def main():
    """Main Streamlit application.

    Renders the sidebar configuration (LLM provider, vector store,
    collection loading/creation) and the main content area (either
    onboarding instructions or the Chat / Evaluation / History tabs,
    depending on whether a collection is loaded).
    """
    st.title("πŸ€– RAG Capstone Project")
    st.markdown("### Retrieval-Augmented Generation with TRACE Evaluation")

    # Show HuggingFace Spaces notice
    if IS_HUGGINGFACE_SPACE:
        st.info("πŸ€— Running on Hugging Face Spaces - Using Groq API (cloud-based LLM)")

    # Get available collections at startup
    available_collections = get_available_collections()
    st.session_state.available_collections = available_collections

    # Sidebar for configuration
    with st.sidebar:
        st.header("Configuration")

        # LLM Provider Selection - Disable Ollama on HuggingFace Spaces
        st.subheader("πŸ”Œ LLM Provider")

        if IS_HUGGINGFACE_SPACE:
            # Force Groq on HuggingFace Spaces (Ollama not available)
            st.caption("☁️ **Groq API** (Ollama unavailable on Spaces)")
            llm_provider = "groq"
            st.session_state.llm_provider = "groq"
        else:
            llm_provider = st.radio(
                "Choose LLM Provider:",
                options=["groq", "ollama"],
                index=0 if st.session_state.llm_provider == "groq" else 1,
                format_func=lambda x: "☁️ Groq API (Cloud)" if x == "groq" else "πŸ–₯️ Ollama (Local)",
                help="Groq: Cloud API with rate limits. Ollama: Local unlimited inference.",
                key="llm_provider_radio"
            )
            st.session_state.llm_provider = llm_provider

        # Provider-specific settings
        if llm_provider == "groq":
            st.caption("⚠️ Free tier: 30 requests/min")

            # On HuggingFace Spaces, check for API key in secrets first
            default_api_key = os.environ.get("GROQ_API_KEY", "") or settings.groq_api_key or ""

            # API Key input
            groq_api_key = st.text_input(
                "Groq API Key",
                type="password",
                value=default_api_key,
                help="Enter your Groq API key (or set GROQ_API_KEY in Spaces secrets)"
            )

            if IS_HUGGINGFACE_SPACE and not groq_api_key:
                st.warning("πŸ’‘ Tip: Add GROQ_API_KEY to your Space secrets for persistence")
        else:
            # Ollama settings (only available locally)
            st.caption("βœ… No rate limits - unlimited usage!")
            ollama_host = st.text_input(
                "Ollama Host",
                value=settings.ollama_host,
                help="Ollama server URL (default: http://localhost:11434)"
            )

            ollama_model = st.selectbox(
                "Select Ollama Model:",
                options=settings.ollama_models,
                index=settings.ollama_models.index(st.session_state.ollama_model) if st.session_state.ollama_model in settings.ollama_models else 0,
                key="ollama_model_selector"
            )
            st.session_state.ollama_model = ollama_model

            # Connection check button
            if st.button("πŸ” Check Ollama Connection"):
                try:
                    import requests
                    response = requests.get(f"{ollama_host}/api/tags", timeout=5)
                    if response.status_code == 200:
                        models = response.json().get("models", [])
                        model_names = [m["name"] for m in models]
                        st.success(f"βœ… Connected! Available models: {', '.join(model_names)}")
                    else:
                        st.error(f"❌ Connection failed: {response.status_code}")
                except Exception as e:
                    st.error(f"❌ Cannot connect to Ollama: {e}")
                    st.info("Make sure Ollama is running: `ollama serve`")

            # NOTE: groq_api_key is kept defined in both branches so the
            # button handlers below can reference it unconditionally.
            groq_api_key = ""  # Not needed for Ollama

        st.divider()

        # Vector Store Provider Selection
        st.subheader("πŸ’Ύ Vector Store")

        if IS_HUGGINGFACE_SPACE:
            # Qdrant listed first on Spaces, since local disk is ephemeral there.
            st.caption("☁️ Use **Qdrant Cloud** for persistent storage")
            vector_store_options = ["qdrant", "chroma"]
            default_idx = 0
        else:
            vector_store_options = ["chroma", "qdrant"]
            default_idx = 0

        vector_store_provider = st.radio(
            "Choose Vector Store:",
            options=vector_store_options,
            index=default_idx,
            format_func=lambda x: "☁️ Qdrant Cloud (Persistent)" if x == "qdrant" else "πŸ’Ύ ChromaDB (Local)",
            help="Qdrant: Cloud storage (persistent). ChromaDB: Local storage (ephemeral on Spaces).",
            key="vector_store_radio"
        )
        st.session_state.vector_store_provider = vector_store_provider

        # Qdrant settings
        if vector_store_provider == "qdrant":
            default_qdrant_url = os.environ.get("QDRANT_URL", "") or settings.qdrant_url
            default_qdrant_key = os.environ.get("QDRANT_API_KEY", "") or settings.qdrant_api_key

            qdrant_url = st.text_input(
                "Qdrant URL",
                value=default_qdrant_url,
                placeholder="https://xxx-xxx.aws.cloud.qdrant.io:6333",
                help="Your Qdrant Cloud cluster URL"
            )
            qdrant_api_key = st.text_input(
                "Qdrant API Key",
                type="password",
                value=default_qdrant_key,
                help="Your Qdrant API key"
            )
            st.session_state.qdrant_url = qdrant_url
            st.session_state.qdrant_api_key = qdrant_api_key

            if not qdrant_url or not qdrant_api_key:
                st.warning("⚠️ Get free Qdrant Cloud at: https://cloud.qdrant.io")

            # Test Qdrant connection
            if st.button("πŸ” Test Qdrant Connection"):
                if qdrant_url and qdrant_api_key:
                    try:
                        test_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
                        collections = test_store.list_collections()
                        st.success(f"βœ… Connected! Found {len(collections)} collection(s)")
                    except Exception as e:
                        st.error(f"❌ Connection failed: {e}")
                else:
                    st.error("Please enter Qdrant URL and API Key")

        st.divider()

        # Get available collections based on provider (re-queried here
        # because the provider choice above may differ from startup).
        available_collections = get_available_collections(vector_store_provider)
        st.session_state.available_collections = available_collections

        # Option 1: Use existing collection
        if available_collections:
            st.subheader("πŸ“š Existing Collections")
            st.write(f"Found {len(available_collections)} collection(s)")

            selected_collection = st.selectbox(
                "Or select existing collection:",
                available_collections,
                key="collection_selector"
            )

            if st.button("πŸ“– Load Existing Collection", type="secondary"):
                # Validate based on provider
                if llm_provider == "groq" and not groq_api_key:
                    st.error("Please enter your Groq API key")
                elif vector_store_provider == "qdrant" and (not st.session_state.get("qdrant_url") or not st.session_state.get("qdrant_api_key")):
                    st.error("Please enter Qdrant URL and API Key")
                else:
                    # ollama_host only exists in the Ollama branch; the
                    # conditional below never evaluates it under Groq.
                    load_existing_collection(
                        groq_api_key,
                        selected_collection,
                        llm_provider,
                        ollama_host if llm_provider == "ollama" else None,
                        vector_store_provider
                    )

            st.divider()

        # Option 2: Create new collection
        st.subheader("πŸ†• Create New Collection")

        # Dataset selection
        st.subheader("1. Dataset Selection")
        dataset_name = st.selectbox(
            "Choose Dataset",
            settings.ragbench_datasets,
            index=0
        )

        # Get dataset size dynamically
        if st.button("πŸ” Check Dataset Size", key="check_size"):
            with st.spinner("Checking dataset size..."):
                try:
                    from datasets import load_dataset

                    # Load dataset with download_mode to avoid cache issues
                    st.info(f"Fetching dataset info for '{dataset_name}'...")
                    ds = load_dataset(
                        "rungalileo/ragbench",
                        dataset_name,
                        split="train",
                        trust_remote_code=True,
                        download_mode="force_redownload"  # Force fresh download to avoid cache corruption
                    )
                    dataset_size = len(ds)

                    st.session_state.dataset_size = dataset_size
                    st.session_state.current_dataset = dataset_name
                    st.success(f"βœ… Dataset '{dataset_name}' has {dataset_size:,} samples available")
                except Exception as e:
                    st.error(f"❌ Error: {str(e)}")
                    st.exception(e)
                    st.warning(f"Could not determine dataset size. Using default of 10,000.")
                    st.session_state.dataset_size = 10000
                    st.session_state.current_dataset = dataset_name

        # Use stored dataset size or default
        max_samples_available = st.session_state.get('dataset_size', 10000)

        st.caption(f"Max available samples: {max_samples_available:,}")

        num_samples = st.slider(
            "Number of samples",
            min_value=10,
            max_value=max_samples_available,
            value=min(100, max_samples_available),
            step=50 if max_samples_available > 1000 else 10,
            help="Adjust slider to select number of samples"
        )

        load_all_samples = st.checkbox(
            "Load all available samples",
            value=False,
            help="Override slider and load entire dataset"
        )

        st.divider()

        # Chunking strategy
        st.subheader("2. Chunking Strategy")
        chunking_strategy = st.selectbox(
            "Choose Chunking Strategy",
            settings.chunking_strategies,
            index=0
        )

        chunk_size = st.slider(
            "Chunk Size",
            min_value=256,
            max_value=1024,
            value=512,
            step=128
        )

        overlap = st.slider(
            "Overlap",
            min_value=0,
            max_value=200,
            value=50,
            step=10
        )

        st.divider()

        # Embedding model
        st.subheader("3. Embedding Model")
        embedding_model = st.selectbox(
            "Choose Embedding Model",
            settings.embedding_models,
            index=0
        )

        st.divider()

        # LLM model selection for new collection
        st.subheader("4. LLM Model")
        if llm_provider == "groq":
            llm_model = st.selectbox(
                "Choose Groq LLM",
                settings.llm_models,
                index=1
            )
        else:
            llm_model = st.selectbox(
                "Choose Ollama Model",
                settings.ollama_models,
                index=settings.ollama_models.index(st.session_state.ollama_model) if st.session_state.ollama_model in settings.ollama_models else 0,
                key="llm_model_ollama"
            )

        st.divider()

        # Load data button
        if st.button("πŸš€ Load Data & Create Collection", type="primary"):
            # Validate based on provider
            if llm_provider == "groq" and not groq_api_key:
                st.error("Please enter your Groq API key")
            elif vector_store_provider == "qdrant" and (not st.session_state.get("qdrant_url") or not st.session_state.get("qdrant_api_key")):
                st.error("Please enter Qdrant URL and API Key")
            else:
                # Use None for num_samples if loading all data
                samples_to_load = None if load_all_samples else num_samples
                load_and_create_collection(
                    groq_api_key,
                    dataset_name,
                    samples_to_load,
                    chunking_strategy,
                    chunk_size,
                    overlap,
                    embedding_model,
                    llm_model,
                    llm_provider,
                    ollama_host if llm_provider == "ollama" else None,
                    vector_store_provider
                )

    # Main content area
    if not st.session_state.collection_loaded:
        st.info("πŸ‘ˆ Please configure and load a dataset from the sidebar to begin")

        # Show instructions
        with st.expander("πŸ“– How to Use", expanded=True):
            st.markdown("""
            1. **Enter your Groq API Key** in the sidebar
            2. **Select a dataset** from RAG Bench
            3. **Choose a chunking strategy** (dense, sparse, hybrid, re-ranking)
            4. **Select an embedding model** for document vectorization
            5. **Choose an LLM model** for response generation
            6. **Click "Load Data & Create Collection"** to initialize
            7. **Start chatting** in the chat interface
            8. **View retrieved documents** and evaluation metrics
            9. **Run TRACE evaluation** on test data
            """)

        # Show available options
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("πŸ“Š Available Datasets")
            for ds in settings.ragbench_datasets:
                st.markdown(f"- {ds}")

        with col2:
            st.subheader("πŸ€– Available Models")
            st.markdown("**Embedding Models:**")
            for em in settings.embedding_models:
                st.markdown(f"- {em}")

            st.markdown("**LLM Models:**")
            for lm in settings.llm_models:
                st.markdown(f"- {lm}")

    else:
        # Create tabs for different functionalities
        tab1, tab2, tab3 = st.tabs(["πŸ’¬ Chat", "πŸ“Š Evaluation", "πŸ“œ History"])

        with tab1:
            chat_interface()

        with tab2:
            evaluation_interface()

        with tab3:
            history_interface()
487
+
488
+
489
def load_existing_collection(api_key: str, collection_name: str, llm_provider: str = "groq", ollama_host: str = None, vector_store_provider: str = "chroma"):
    """Load an existing collection from vector store.

    Connects to the chosen vector-store backend, attaches to
    *collection_name*, builds an LLM client and RAG pipeline, stores
    everything in session state, and reruns the app so the chat tabs
    appear.

    Args:
        api_key: Groq API key (ignored for Ollama).
        collection_name: Name of the existing collection to attach to.
        llm_provider: "groq" or "ollama".
        ollama_host: Ollama server URL; falls back to settings when None.
        vector_store_provider: "chroma" or "qdrant".
    """
    with st.spinner(f"Loading collection '{collection_name}'..."):
        try:
            # Initialize vector store based on provider
            if vector_store_provider == "qdrant":
                qdrant_url = st.session_state.get("qdrant_url") or settings.qdrant_url
                qdrant_api_key = st.session_state.get("qdrant_api_key") or settings.qdrant_api_key
                vector_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
            else:
                vector_store = ChromaDBManager(settings.chroma_persist_directory)

            vector_store.get_collection(collection_name)

            # Extract dataset name from collection name (format: dataset_name_strategy_model)
            # Try to find which dataset this collection is based on
            dataset_name = None
            for ds in settings.ragbench_datasets:
                if collection_name.startswith(ds.replace("-", "_")):
                    dataset_name = ds
                    break

            if not dataset_name:
                dataset_name = collection_name.split("_")[0]  # Fallback: use first part

            # Prompt for LLM selection based on provider
            # NOTE(review): rendering a selectbox inside a button handler
            # followed by st.rerun() means this widget only exists for one
            # script run; the value read here is its default -- confirm
            # this is the intended UX.
            if llm_provider == "groq":
                st.session_state.current_llm = st.selectbox(
                    "Select Groq LLM for this collection:",
                    settings.llm_models,
                    key=f"llm_selector_{collection_name}"
                )
            else:
                st.session_state.current_llm = st.selectbox(
                    "Select Ollama Model for this collection:",
                    settings.ollama_models,
                    key=f"ollama_selector_{collection_name}"
                )

            # Initialize LLM client based on provider
            st.info(f"Initializing LLM client ({llm_provider})...")
            llm_client = create_llm_client(
                provider=llm_provider,
                api_key=api_key,
                api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
                model_name=st.session_state.current_llm,
                ollama_host=ollama_host or settings.ollama_host,
                max_rpm=settings.groq_rpm_limit,
                rate_limit_delay=settings.rate_limit_delay,
                max_retries=settings.max_retries,
                retry_delay=settings.retry_delay
            )

            # Create RAG pipeline with correct parameter names
            st.info("Creating RAG pipeline...")
            rag_pipeline = RAGPipeline(
                llm_client=llm_client,
                vector_store_manager=vector_store
            )

            # Store in session state so the chat/evaluation tabs can use them
            st.session_state.vector_store = vector_store
            st.session_state.rag_pipeline = rag_pipeline
            st.session_state.collection_loaded = True
            st.session_state.current_collection = collection_name
            st.session_state.selected_collection = collection_name
            st.session_state.groq_api_key = api_key
            st.session_state.dataset_name = dataset_name
            st.session_state.collection_name = collection_name
            st.session_state.llm_provider = llm_provider

            # Display system prompt and model info
            provider_icon = "☁️" if llm_provider == "groq" else "πŸ–₯️"
            st.success(f"βœ… Collection '{collection_name}' loaded successfully! {provider_icon} Using {llm_provider.upper()}")

            with st.expander("πŸ€– Model & System Prompt Information", expanded=False):
                col1, col2 = st.columns(2)
                with col1:
                    st.write(f"**Provider:** {provider_icon} {llm_provider.upper()}")
                    st.write(f"**Model:** {st.session_state.current_llm}")
                    st.write(f"**Collection:** {collection_name}")
                    st.write(f"**Dataset:** {dataset_name}")
                with col2:
                    st.write(f"**Temperature:** 0.0")
                    st.write(f"**Max Tokens:** 2048")
                    if llm_provider == "groq":
                        st.write(f"**Rate Limit:** {settings.groq_rpm_limit} RPM")
                    else:
                        st.write(f"**Rate Limit:** βœ… Unlimited (Local)")

                st.markdown("#### System Prompt")
                st.info("""
                You are a Fact-Checking and Citation Specialist. Your task is to perform a rigorous audit of a response against provided documents to determine its accuracy, relevance, and level of support.

                **Task:**
                1. Analyze the provided documents and identify information relevant to the user's question
                2. Evaluate the response sentence-by-sentence
                3. Verify each response sentence maps to supporting document sentences
                4. Identify which document sentences were actually used in the response
                """)

            st.rerun()

        except Exception as e:
            st.error(f"Error loading collection: {str(e)}")
            st.exception(e)
595
+
596
+
597
def load_and_create_collection(
    api_key: str,
    dataset_name: str,
    num_samples: Optional[int],
    chunking_strategy: str,
    chunk_size: int,
    overlap: int,
    embedding_model: str,
    llm_model: str,
    llm_provider: str = "groq",
    ollama_host: str = None,
    vector_store_provider: str = "chroma"
):
    """Load dataset and create vector collection.

    Downloads the RAGBench split, (re)creates a vector collection named
    ``{dataset}_{strategy}_{model}``, wires up the LLM client and RAG
    pipeline, and stores everything in session state.

    Args:
        api_key: Groq API key (ignored for Ollama).
        dataset_name: RAGBench subset to load.
        num_samples: Max samples to ingest; None means all available.
        chunking_strategy: One of ``settings.chunking_strategies``.
        chunk_size: Target chunk size in characters/tokens per strategy.
        overlap: Overlap between consecutive chunks.
        embedding_model: Embedding model identifier.
        llm_model: LLM model name for the chosen provider.
        llm_provider: "groq" or "ollama".
        ollama_host: Ollama server URL; falls back to settings when None.
        vector_store_provider: "chroma" or "qdrant".
    """
    with st.spinner("Loading dataset and creating collection..."):
        try:
            # Initialize dataset loader
            loader = RAGBenchLoader()

            # Load dataset
            if num_samples is None:
                st.info(f"Loading {dataset_name} dataset (all available samples)...")
            else:
                st.info(f"Loading {dataset_name} dataset ({num_samples} samples)...")
            dataset = loader.load_dataset(dataset_name, split="train", max_samples=num_samples)

            if not dataset:
                st.error("Failed to load dataset")
                return

            # Initialize vector store based on provider
            st.info(f"Initializing vector store ({vector_store_provider})...")
            if vector_store_provider == "qdrant":
                qdrant_url = st.session_state.get("qdrant_url") or settings.qdrant_url
                qdrant_api_key = st.session_state.get("qdrant_api_key") or settings.qdrant_api_key
                vector_store = create_vector_store("qdrant", url=qdrant_url, api_key=qdrant_api_key)
            else:
                vector_store = ChromaDBManager(settings.chroma_persist_directory)

            # Create collection name (sanitized: some backends reject '-' and '.')
            collection_name = f"{dataset_name}_{chunking_strategy}_{embedding_model.split('/')[-1]}"
            collection_name = collection_name.replace("-", "_").replace(".", "_")

            # Delete existing collection with same name (if exists)
            existing_collections = vector_store.list_collections()
            if collection_name in existing_collections:
                st.warning(f"Collection '{collection_name}' already exists. Deleting and recreating...")
                vector_store.delete_collection(collection_name)
                st.info("Old collection deleted. Creating new one...")

            # Load data into collection
            st.info(f"Creating collection with {chunking_strategy} chunking...")
            vector_store.load_dataset_into_collection(
                collection_name=collection_name,
                embedding_model_name=embedding_model,
                chunking_strategy=chunking_strategy,
                dataset_data=dataset,
                chunk_size=chunk_size,
                overlap=overlap
            )

            # Initialize LLM client based on provider
            st.info(f"Initializing LLM client ({llm_provider})...")
            llm_client = create_llm_client(
                provider=llm_provider,
                api_key=api_key,
                api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
                model_name=llm_model,
                ollama_host=ollama_host or settings.ollama_host,
                max_rpm=settings.groq_rpm_limit,
                rate_limit_delay=settings.rate_limit_delay,
                max_retries=settings.max_retries,
                retry_delay=settings.retry_delay
            )

            # Create RAG pipeline with correct parameter names
            rag_pipeline = RAGPipeline(
                llm_client=llm_client,
                vector_store_manager=vector_store
            )

            # Store in session state so the chat/evaluation tabs can use them
            st.session_state.vector_store = vector_store
            st.session_state.rag_pipeline = rag_pipeline
            st.session_state.collection_loaded = True
            st.session_state.current_collection = collection_name
            st.session_state.dataset_name = dataset_name
            st.session_state.dataset = dataset
            st.session_state.collection_name = collection_name
            st.session_state.embedding_model = embedding_model
            st.session_state.groq_api_key = api_key
            st.session_state.llm_provider = llm_provider
            st.session_state.vector_store_provider = vector_store_provider

            # (removed unused `vs_icon` local -- it was computed but never shown)
            provider_icon = "☁️" if llm_provider == "groq" else "πŸ–₯️"
            st.success(f"βœ… Collection '{collection_name}' created successfully! {provider_icon} Using {llm_provider.upper()}")
            st.rerun()

        except Exception as e:
            st.error(f"Error: {str(e)}")
            # Show full traceback, matching load_existing_collection's handler.
            st.exception(e)
698
+
699
+
700
+ def chat_interface():
701
+ """Chat interface tab."""
702
+ st.subheader("πŸ’¬ Chat Interface")
703
+
704
+ # Check if collection is loaded
705
+ if not st.session_state.collection_loaded:
706
+ st.warning("⚠️ No data loaded. Please use the configuration panel to load a dataset and create a collection.")
707
+ st.info("""
708
+ Steps:
709
+ 1. Select a dataset from the dropdown
710
+ 2. Click "Load Data & Create Collection" button
711
+ 3. Wait for the collection to be created
712
+ 4. Then you can start chatting
713
+ """)
714
+ return
715
+
716
+ # Display collection info and LLM selector
717
+ col1, col2, col3 = st.columns([2, 2, 1])
718
+ with col1:
719
+ provider_icon = "☁️" if st.session_state.get("llm_provider", "groq") == "groq" else "πŸ–₯️"
720
+ st.info(f"πŸ“š Collection: {st.session_state.current_collection} | {provider_icon} {st.session_state.get('llm_provider', 'groq').upper()}")
721
+
722
+ with col2:
723
+ # LLM selector for chat - based on provider
724
+ current_provider = st.session_state.get("llm_provider", "groq")
725
+ if current_provider == "groq":
726
+ model_options = settings.llm_models
727
+ try:
728
+ current_index = settings.llm_models.index(st.session_state.current_llm)
729
+ except ValueError:
730
+ current_index = 0
731
+ else:
732
+ model_options = settings.ollama_models
733
+ try:
734
+ current_index = settings.ollama_models.index(st.session_state.current_llm)
735
+ except ValueError:
736
+ current_index = 0
737
+
738
+ selected_llm = st.selectbox(
739
+ f"Select {'Groq' if current_provider == 'groq' else 'Ollama'} Model for chat:",
740
+ model_options,
741
+ index=current_index,
742
+ key="chat_llm_selector"
743
+ )
744
+
745
+ if selected_llm != st.session_state.current_llm:
746
+ st.session_state.current_llm = selected_llm
747
+ # Recreate LLM client with new model
748
+ llm_client = create_llm_client(
749
+ provider=current_provider,
750
+ api_key=st.session_state.groq_api_key if "groq_api_key" in st.session_state else "",
751
+ api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
752
+ model_name=selected_llm,
753
+ ollama_host=settings.ollama_host,
754
+ max_rpm=settings.groq_rpm_limit,
755
+ rate_limit_delay=settings.rate_limit_delay
756
+ )
757
+ st.session_state.rag_pipeline.llm = llm_client
758
+
759
+ with col3:
760
+ if st.button("πŸ—‘οΈ Clear History"):
761
+ st.session_state.chat_history = []
762
+ st.session_state.rag_pipeline.clear_history()
763
+ st.rerun()
764
+
765
+ # Show system prompt info in expandable section
766
+ with st.expander("πŸ€– System Prompt & Model Info", expanded=False):
767
+ current_provider = st.session_state.get("llm_provider", "groq")
768
+ col1, col2 = st.columns(2)
769
+ with col1:
770
+ provider_icon = "☁️" if current_provider == "groq" else "πŸ–₯️"
771
+ st.write(f"**Provider:** {provider_icon} {current_provider.upper()}")
772
+ st.write(f"**LLM Model:** {st.session_state.current_llm}")
773
+ st.write(f"**Temperature:** 0.0")
774
+ st.write(f"**Max Tokens:** 2048")
775
+ with col2:
776
+ st.write(f"**Collection:** {st.session_state.current_collection}")
777
+ st.write(f"**Dataset:** {st.session_state.get('dataset_name', 'N/A')}")
778
+ if current_provider == "groq":
779
+ st.write(f"**Rate Limit:** {settings.groq_rpm_limit} RPM")
780
+ else:
781
+ st.write(f"**Rate Limit:** βœ… Unlimited (Local)")
782
+
783
+ st.markdown("#### System Prompt Being Used")
784
+ system_prompt = """You are a Fact-Checking and Citation Specialist. Your task is to perform a rigorous audit of a response against provided documents to determine its accuracy, relevance, and level of support.
785
+
786
+ **TASK OVERVIEW**
787
+ 1. **Analyze Documents**: Review the provided documents and identify information relevant to the user's question.
788
+ 2. **Evaluate Response**: Review the provided answer sentence-by-sentence.
789
+ 3. **Verify Support**: Map each answer sentence to specific supporting sentences in the documents.
790
+ 4. **Identify Utilization**: Determine which document sentences were actually used (directly or implicitly) to form the answer."""
791
+ st.info(system_prompt)
792
+
793
+ # Chat container
794
+ chat_container = st.container()
795
+
796
+ # Display chat history
797
+ with chat_container:
798
+ for chat_idx, entry in enumerate(st.session_state.chat_history):
799
+ # User message
800
+ with st.chat_message("user"):
801
+ st.write(entry["query"])
802
+
803
+ # Assistant message
804
+ with st.chat_message("assistant"):
805
+ st.write(entry["response"])
806
+
807
+ # Show retrieved documents in expander
808
+ with st.expander("πŸ“„ Retrieved Documents"):
809
+ for doc_idx, doc in enumerate(entry["retrieved_documents"]):
810
+ st.markdown(f"**Document {doc_idx+1}** (Distance: {doc.get('distance', 'N/A'):.4f})")
811
+ st.text_area(
812
+ f"doc_{chat_idx}_{doc_idx}",
813
+ value=doc["document"],
814
+ height=100,
815
+ key=f"doc_area_{chat_idx}_{doc_idx}",
816
+ label_visibility="collapsed"
817
+ )
818
+ if doc.get("metadata"):
819
+ st.caption(f"Metadata: {doc['metadata']}")
820
+
821
+ # Chat input
822
+ query = st.chat_input("Ask a question...")
823
+
824
+ if query:
825
+ # Check if collection exists
826
+ if not st.session_state.rag_pipeline or not st.session_state.rag_pipeline.vector_store.current_collection:
827
+ st.error("❌ No data loaded. Please load a dataset first using the configuration panel.")
828
+ st.stop()
829
+
830
+ # Add user message
831
+ with chat_container:
832
+ with st.chat_message("user"):
833
+ st.write(query)
834
+
835
+ # Generate response
836
+ with st.spinner("Generating response..."):
837
+ try:
838
+ result = st.session_state.rag_pipeline.query(query)
839
+ except Exception as e:
840
+ st.error(f"❌ Error querying: {str(e)}")
841
+ st.info("Please load a dataset and create a collection first.")
842
+ st.stop()
843
+
844
+ # Add assistant message
845
+ with chat_container:
846
+ with st.chat_message("assistant"):
847
+ st.write(result["response"])
848
+
849
+ # Show retrieved documents
850
+ with st.expander("πŸ“„ Retrieved Documents"):
851
+ for doc_idx, doc in enumerate(result["retrieved_documents"]):
852
+ st.markdown(f"**Document {doc_idx+1}** (Distance: {doc.get('distance', 'N/A'):.4f})")
853
+ st.text_area(
854
+ f"doc_current_{doc_idx}",
855
+ value=doc["document"],
856
+ height=100,
857
+ key=f"doc_current_area_{doc_idx}",
858
+ label_visibility="collapsed"
859
+ )
860
+ if doc.get("metadata"):
861
+ st.caption(f"Metadata: {doc['metadata']}")
862
+
863
+ # Store in history
864
+ st.session_state.chat_history.append(result)
865
+ st.rerun()
866
+
867
+
868
def evaluation_interface() -> None:
    """Evaluation interface tab.

    Lets the user pick an evaluation method (TRACE heuristic, GPT
    labeling, or hybrid), an LLM model, and a sample count, then kicks
    off `run_evaluation` and renders whatever results are stored in
    ``st.session_state.evaluation_results`` (aggregate metrics, per-query
    details, RMSE/AUC-ROC comparisons, and JSON downloads).
    """
    st.subheader("πŸ“Š RAG Evaluation")

    # Check if collection is loaded; evaluation needs a populated vector store.
    if not st.session_state.collection_loaded:
        st.warning("⚠️ No data loaded. Please load a collection first.")
        return

    # Evaluation method selector
    eval_col1, eval_col2 = st.columns([2, 1])
    with eval_col1:
        evaluation_method = st.radio(
            "Evaluation Method:",
            options=["TRACE (Heuristic)", "GPT Labeling (LLM-based)", "Hybrid (Both)"],
            horizontal=True,
            help="TRACE is fast (no LLM). GPT Labeling is accurate but slower (requires LLM calls)."
        )

    # Map human-readable UI labels to internal method IDs.
    method_map = {
        "TRACE (Heuristic)": "trace",
        "GPT Labeling (LLM-based)": "gpt_labeling",
        "Hybrid (Both)": "hybrid"
    }
    selected_method = method_map[evaluation_method]

    # LLM selector for evaluation
    current_provider = st.session_state.get("llm_provider", "groq")
    col1, col2 = st.columns([3, 1])
    with col1:
        # Show provider-specific models (Groq cloud vs. local Ollama).
        if current_provider == "groq":
            model_options = settings.llm_models
            try:
                current_index = settings.llm_models.index(st.session_state.current_llm)
            except ValueError:
                # Current model not in this provider's list; default to first.
                current_index = 0
        else:
            model_options = settings.ollama_models
            try:
                current_index = settings.ollama_models.index(st.session_state.current_llm)
            except ValueError:
                current_index = 0

        selected_llm = st.selectbox(
            f"Select {'Groq' if current_provider == 'groq' else 'Ollama'} Model for evaluation:",
            model_options,
            index=current_index,
            key="eval_llm_selector"
        )

    # Show provider info (rate limit implications differ per provider).
    provider_icon = "☁️" if current_provider == "groq" else "πŸ–₯️"
    if current_provider == "ollama":
        st.caption(f"{provider_icon} Using local Ollama - **No rate limits!** Fast evaluation possible.")
    else:
        st.caption(f"{provider_icon} Using Groq API - Rate limited to {settings.groq_rpm_limit} RPM")

    # Show a description of the selected method (speed/cost trade-offs).
    method_descriptions = {
        "trace": """
    **TRACE Heuristic Method** (Fast, Rule-Based)
    - Utilization: How well the system uses retrieved documents
    - Relevance: Relevance of retrieved documents to the query
    - Adherence: How well the response adheres to the retrieved context
    - Completeness: How complete the response is in answering the query
    - ⚑ Speed: ~100ms per evaluation
    - πŸ’° Cost: Free (no API calls)
    """,
        "gpt_labeling": """
    **GPT Labeling Method** (Accurate, LLM-based)
    - Uses sentence-level LLM analysis (from RAGBench paper)
    - Context Relevance: Fraction of context relevant to query
    - Context Utilization: Fraction of relevant context used
    - Completeness: Fraction of relevant info covered
    - Adherence: Response supported by context (no hallucinations)
    - ⏱️ Speed: ~2-5 seconds per evaluation
    - πŸ’° Cost: ~$0.002-0.01 per evaluation
    """,
        "hybrid": """
    **Hybrid Method** (Comprehensive)
    - Runs both TRACE and GPT Labeling methods
    - Provides both fast and accurate evaluation metrics
    - Best for detailed analysis
    - ⏱️ Speed: ~3-6 seconds per evaluation
    - πŸ’° Cost: Same as GPT Labeling
    """
    }

    st.markdown(method_descriptions[selected_method])

    # Get maximum test samples available for current dataset.
    try:
        loader = RAGBenchLoader()
        max_test_samples = loader.get_test_data_size(st.session_state.dataset_name)
        st.caption(f"πŸ“Š Available test samples: {max_test_samples:,}")
    except Exception as e:
        # Fall back to a rough estimate if the loader cannot report a size.
        max_test_samples = 100
        st.caption(f"Available test samples: ~{max_test_samples} (estimated)")

    # Ensure min and max are reasonable.
    max_test_samples = max(5, min(max_test_samples, 500))  # Cap at 500 for performance

    num_test_samples = st.slider(
        "Number of test samples",
        min_value=5,
        max_value=max_test_samples,
        value=min(10, max_test_samples),
        step=5
    )

    # Show warning for GPT labeling (API cost) - only for Groq.
    if selected_method in ["gpt_labeling", "hybrid"]:
        current_provider = st.session_state.get("llm_provider", "groq")
        if current_provider == "groq":
            st.warning(f"⚠️ **{evaluation_method}** requires LLM API calls. This will incur costs and be slower due to rate limiting ({settings.groq_rpm_limit} RPM).")
        else:
            st.info(f"ℹ️ **{evaluation_method}** using local Ollama - **No rate limits!** Evaluation will be much faster.")

    if st.button("πŸ”¬ Run Evaluation", type="primary"):
        # Use selected LLM for evaluation; results land in session state.
        run_evaluation(num_test_samples, selected_llm, selected_method)

    # Display results (persisted across reruns via session state).
    if st.session_state.evaluation_results:
        results = st.session_state.evaluation_results

        st.success("βœ… Evaluation Complete!")
        st.divider()
        st.markdown("## πŸ“Š Evaluation Metrics")

        # Display aggregate scores - handle both TRACE and GPT Labeling metric names.
        st.markdown("### Main Metrics")
        col1, col2, col3, col4, col5 = st.columns(5)

        # Determine which metrics are available. NOTE(review): the `or`
        # fallback also triggers when the first key holds a legitimate 0.0
        # (falsy); in practice each method populates only one key set.
        utilization = results.get('utilization') or results.get('context_utilization', 0)
        relevance = results.get('relevance') or results.get('context_relevance', 0)
        adherence = results.get('adherence', 0)
        completeness = results.get('completeness', 0)
        average = results.get('average', 0)

        with col1:
            st.metric("πŸ“Š Utilization", f"{utilization:.3f}")
        with col2:
            st.metric("🎯 Relevance", f"{relevance:.3f}")
        with col3:
            st.metric("βœ… Adherence", f"{adherence:.3f}")
        with col4:
            st.metric("πŸ“ Completeness", f"{completeness:.3f}")
        with col5:
            st.metric("⭐ Average", f"{average:.3f}")

        # Detailed results summary - handle both metric types.
        if "individual_scores" in results:
            with st.expander("πŸ“‹ Summary Metrics by Query"):
                df = pd.DataFrame(results["individual_scores"])
                st.dataframe(df, use_container_width=True)

        # Detailed per-query results.
        # NOTE(review): st.expander instances are nested three deep here;
        # Streamlit historically disallows nested expanders and may raise
        # at render time -- verify against the installed Streamlit version.
        if "detailed_results" in results and results["detailed_results"]:
            with st.expander("πŸ” Detailed Per-Query Analysis"):
                for query_result in results.get("detailed_results", []):
                    with st.expander(f"Query {query_result['query_id']}: {query_result['question'][:60]}..."):
                        st.markdown("### Question")
                        st.write(query_result['question'])

                        st.markdown("### LLM Response")
                        st.write(query_result.get('llm_response', 'N/A'))

                        st.markdown("### Retrieved Documents")
                        for doc_idx, doc in enumerate(query_result.get('retrieved_documents', []), 1):
                            with st.expander(f"πŸ“„ Document {doc_idx}"):
                                st.write(doc)

                        if query_result.get('ground_truth'):
                            st.markdown("### Ground Truth")
                            st.write(query_result['ground_truth'])

                        # Display metrics with correct labels based on method.
                        metrics = query_result.get('metrics', {})
                        if metrics:
                            st.markdown("### Evaluation Metrics")
                            col1, col2, col3, col4, col5 = st.columns(5)

                            # Get metric values (handle both TRACE and GPT names).
                            util_val = metrics.get('utilization') or metrics.get('context_utilization', 0)
                            rel_val = metrics.get('relevance') or metrics.get('context_relevance', 0)
                            adh_val = metrics.get('adherence', 0)
                            comp_val = metrics.get('completeness', 0)
                            avg_val = metrics.get('average', 0)

                            with col1:
                                st.metric("Util", f"{util_val:.3f}")
                            with col2:
                                st.metric("Rel", f"{rel_val:.3f}")
                            with col3:
                                st.metric("Adh", f"{adh_val:.3f}")
                            with col4:
                                st.metric("Comp", f"{comp_val:.3f}")
                            with col5:
                                st.metric("Avg", f"{avg_val:.3f}")

        # For GPT Labeling and Hybrid methods, show additional metrics.
        method = results.get("method", "")
        if "gpt_labeling" in method or "hybrid" in method:
            # Show RMSE aggregation metrics (consistency across evaluations).
            if "rmse_metrics" in results:
                st.markdown("### πŸ“Š RMSE Aggregation (Metric Consistency)")
                rmse_data = results.get("rmse_metrics", {})

                rmse_cols = st.columns(4)
                with rmse_cols[0]:
                    rel_mean = rmse_data.get("context_relevance", {}).get("mean", 0)
                    rel_std = rmse_data.get("context_relevance", {}).get("std_dev", 0)
                    st.metric("Relevance", f"{rel_mean:.3f} Β±{rel_std:.3f}", help="Mean and Std Dev")
                with rmse_cols[1]:
                    util_mean = rmse_data.get("context_utilization", {}).get("mean", 0)
                    util_std = rmse_data.get("context_utilization", {}).get("std_dev", 0)
                    st.metric("Utilization", f"{util_mean:.3f} Β±{util_std:.3f}", help="Mean and Std Dev")
                with rmse_cols[2]:
                    comp_mean = rmse_data.get("completeness", {}).get("mean", 0)
                    comp_std = rmse_data.get("completeness", {}).get("std_dev", 0)
                    st.metric("Completeness", f"{comp_mean:.3f} Β±{comp_std:.3f}", help="Mean and Std Dev")
                with rmse_cols[3]:
                    adh_mean = rmse_data.get("adherence", {}).get("mean", 0)
                    adh_std = rmse_data.get("adherence", {}).get("std_dev", 0)
                    st.metric("Adherence", f"{adh_mean:.3f} Β±{adh_std:.3f}", help="Mean and Std Dev")

                # Show detailed RMSE statistics in expander.
                with st.expander("See detailed RMSE aggregation statistics"):
                    for metric_name, metric_data in rmse_data.items():
                        st.write(f"**{metric_name}**")
                        col1, col2, col3, col4 = st.columns(4)
                        with col1:
                            st.write(f"Mean: {metric_data.get('mean', 0):.4f}")
                        with col2:
                            st.write(f"Std Dev: {metric_data.get('std_dev', 0):.4f}")
                        with col3:
                            st.write(f"Min: {metric_data.get('min', 0):.4f}")
                        with col4:
                            st.write(f"Max: {metric_data.get('max', 0):.4f}")

            # Show per-metric statistics if available.
            if "per_metric_statistics" in results:
                st.markdown("### πŸ“ˆ Per-Metric Statistics (Distribution)")
                stats_data = results.get("per_metric_statistics", {})

                stats_cols = st.columns(4)
                with stats_cols[0]:
                    rel_stats = stats_data.get("context_relevance", {})
                    st.metric("Relevance Mean", f"{rel_stats.get('mean', 0):.3f}", help=f"Median: {rel_stats.get('median', 0):.3f}")
                with stats_cols[1]:
                    util_stats = stats_data.get("context_utilization", {})
                    st.metric("Utilization Mean", f"{util_stats.get('mean', 0):.3f}", help=f"Median: {util_stats.get('median', 0):.3f}")
                with stats_cols[2]:
                    comp_stats = stats_data.get("completeness", {})
                    st.metric("Completeness Mean", f"{comp_stats.get('mean', 0):.3f}", help=f"Median: {comp_stats.get('median', 0):.3f}")
                with stats_cols[3]:
                    adh_stats = stats_data.get("adherence", {})
                    st.metric("Adherence Mean", f"{adh_stats.get('mean', 0):.3f}", help=f"Median: {adh_stats.get('median', 0):.3f}")

                # Show detailed statistics.
                with st.expander("See detailed per-metric statistics"):
                    for metric_name, metric_stats in stats_data.items():
                        st.write(f"**{metric_name}**")
                        col1, col2 = st.columns(2)
                        with col1:
                            st.write(f"""
    - Mean: {metric_stats.get('mean', 0):.4f}
    - Median: {metric_stats.get('median', 0):.4f}
    - Std Dev: {metric_stats.get('std_dev', 0):.4f}
    - Min: {metric_stats.get('min', 0):.4f}
    - Max: {metric_stats.get('max', 0):.4f}
    """)
                        with col2:
                            st.write(f"""
    - 25th percentile: {metric_stats.get('percentile_25', 0):.4f}
    - 75th percentile: {metric_stats.get('percentile_75', 0):.4f}
    - Perfect (>=0.95): {metric_stats.get('perfect_count', 0)}
    - Poor (<0.3): {metric_stats.get('poor_count', 0)}
    - Samples: {metric_stats.get('sample_count', 0)}
    """)

            # Show RMSE vs RAGBench Ground Truth (per RAGBench paper requirement).
            if "rmse_vs_ground_truth" in results:
                st.markdown("### πŸ“‰ RMSE vs RAGBench Ground Truth")
                st.info("Compares predicted TRACE scores against original RAGBench dataset scores")
                rmse_gt = results.get("rmse_vs_ground_truth", {})
                per_metric_rmse = rmse_gt.get("per_metric_rmse", {})

                if per_metric_rmse:
                    rmse_gt_cols = st.columns(5)
                    with rmse_gt_cols[0]:
                        st.metric("Relevance RMSE", f"{per_metric_rmse.get('context_relevance', 0):.4f}",
                                  delta=None, help="Lower is better (0 = perfect match)")
                    with rmse_gt_cols[1]:
                        st.metric("Utilization RMSE", f"{per_metric_rmse.get('context_utilization', 0):.4f}")
                    with rmse_gt_cols[2]:
                        st.metric("Completeness RMSE", f"{per_metric_rmse.get('completeness', 0):.4f}")
                    with rmse_gt_cols[3]:
                        st.metric("Adherence RMSE", f"{per_metric_rmse.get('adherence', 0):.4f}")
                    with rmse_gt_cols[4]:
                        agg_rmse = rmse_gt.get("aggregated_rmse", 0)
                        consistency = rmse_gt.get("consistency_score", 0)
                        st.metric("Aggregated RMSE", f"{agg_rmse:.4f}",
                                  delta=f"Consistency: {consistency:.2%}", delta_color="normal")

            # Show AUCROC vs RAGBench Ground Truth (per RAGBench paper requirement).
            if "aucroc_vs_ground_truth" in results:
                st.markdown("### πŸ“Š AUC-ROC vs RAGBench Ground Truth")
                st.info("Area Under ROC Curve comparing predicted vs ground truth binary classifications")
                auc_gt = results.get("aucroc_vs_ground_truth", {})

                if auc_gt:
                    auc_cols = st.columns(5)
                    with auc_cols[0]:
                        st.metric("Relevance AUC", f"{auc_gt.get('context_relevance', 0):.4f}",
                                  help="Higher is better (1.0 = perfect classification)")
                    with auc_cols[1]:
                        st.metric("Utilization AUC", f"{auc_gt.get('context_utilization', 0):.4f}")
                    with auc_cols[2]:
                        st.metric("Completeness AUC", f"{auc_gt.get('completeness', 0):.4f}")
                    with auc_cols[3]:
                        st.metric("Adherence AUC", f"{auc_gt.get('adherence', 0):.4f}")
                    with auc_cols[4]:
                        avg_auc = auc_gt.get("average", 0)
                        st.metric("Average AUC", f"{avg_auc:.4f}")

        # Download results
        st.divider()
        st.markdown("## πŸ’Ύ Download Results")

        # Create a comprehensive download with all details.
        download_data = {
            "evaluation_metadata": {
                "timestamp": datetime.now().isoformat(),
                "dataset": st.session_state.dataset_name,
                # NOTE(review): other sections read results["method"]; this
                # reads results["evaluation_config"]["evaluation_method"] --
                # confirm which key the evaluators actually populate.
                "method": results.get("evaluation_config", {}).get("evaluation_method", "gpt_labeling_prompts"),
                "total_samples": results.get("num_samples", 0),
                "embedding_model": st.session_state.embedding_model,
            },
            "aggregate_metrics": {
                "context_relevance": results.get("context_relevance") or results.get("relevance", 0),
                "context_utilization": results.get("context_utilization") or results.get("utilization", 0),
                "completeness": results.get("completeness", 0),
                "adherence": results.get("adherence", 0),
                "average": results.get("average", 0),
            },
            "rmse_metrics": results.get("rmse_metrics", {}),
            "per_metric_statistics": results.get("per_metric_statistics", {}),
            "rmse_vs_ground_truth": results.get("rmse_vs_ground_truth", {}),
            "aucroc_vs_ground_truth": results.get("aucroc_vs_ground_truth", {}),
            "detailed_results": results.get("detailed_results", [])
        }

        # default=str makes non-JSON-serializable values stringify instead of raising.
        results_json = json.dumps(download_data, indent=2, default=str)

        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                label="πŸ“₯ Download Complete Results (JSON)",
                data=results_json,
                file_name=f"evaluation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json",
                help="Download all evaluation results including metrics and per-query details"
            )
        with col2:
            st.download_button(
                label="πŸ“‹ Download Metrics Only (JSON)",
                data=json.dumps(download_data["aggregate_metrics"], indent=2),
                file_name=f"evaluation_metrics_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
                mime="application/json",
                help="Download only the aggregate metrics"
            )
1244
+
1245
+
1246
def run_evaluation(num_samples: int, selected_llm: str = None, method: str = "trace"):
    """Run evaluation using selected method (TRACE, GPT Labeling, or Hybrid).

    Args:
        num_samples: Number of test samples to evaluate
        selected_llm: LLM model to use for evaluation
        method: Evaluation method ("trace", "gpt_labeling", or "hybrid")

    Side effects:
        Stores aggregate results in ``st.session_state.evaluation_results``
        and streams progress logs into the page. If a different LLM is
        selected just for this run, the pipeline's original LLM client is
        restored afterwards -- even when the evaluation fails.
    """
    with st.spinner(f"Running evaluation on {num_samples} samples..."):
        # Logging infrastructure is set up BEFORE the try-block so the
        # except-handler can always call add_log. (Previously add_log was
        # defined inside the try, so an early failure raised NameError in
        # the handler and masked the real error.)
        logs_list = []
        logs_placeholder = st.empty()

        def add_log(message: str):
            """Append a log message and re-render the full log list."""
            logs_list.append(message)
            with logs_placeholder.container():
                st.markdown("### πŸ“‹ Evaluation Logs:")
                for log_msg in logs_list:
                    st.caption(log_msg)

        # Tracks the swapped-out LLM client; restored in `finally` below.
        original_llm = None

        try:
            # Log evaluation start and configuration.
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            add_log(f"⏱️ Evaluation started at {timestamp}")
            add_log(f"πŸ“Š Dataset: {st.session_state.dataset_name}")
            add_log(f"πŸ“ˆ Total samples: {num_samples}")
            add_log(f"πŸ€– LLM Model: {selected_llm if selected_llm else st.session_state.current_llm}")
            add_log(f"πŸ”— Vector Store: {st.session_state.collection_name}")
            add_log(f"🧠 Embedding Model: {st.session_state.embedding_model}")

            # Map internal method IDs to display names.
            method_names = {
                "trace": "TRACE (Heuristic)",
                "gpt_labeling": "GPT Labeling (LLM-based)",
                "hybrid": "Hybrid (Both)"
            }
            add_log(f"πŸ”¬ Evaluation Method: {method_names.get(method, method)}")

            # Use selected LLM if provided - create with appropriate provider.
            eval_llm_client = None
            current_provider = st.session_state.get("llm_provider", "groq")

            if selected_llm and selected_llm != st.session_state.current_llm:
                add_log(f"πŸ”„ Switching LLM to {selected_llm} ({current_provider.upper()})...")
                groq_api_key = st.session_state.groq_api_key if "groq_api_key" in st.session_state else ""
                eval_llm_client = create_llm_client(
                    provider=current_provider,
                    api_key=groq_api_key,
                    api_keys=settings.groq_api_keys if settings.groq_api_keys else None,
                    model_name=selected_llm,
                    ollama_host=settings.ollama_host,
                    max_rpm=settings.groq_rpm_limit,
                    rate_limit_delay=settings.rate_limit_delay,
                    max_retries=settings.max_retries,
                    retry_delay=settings.retry_delay
                )
                # Temporarily replace the pipeline's LLM client; the original
                # is stashed so the finally-block can always put it back.
                original_llm = st.session_state.rag_pipeline.llm
                st.session_state.rag_pipeline.llm = eval_llm_client
            else:
                eval_llm_client = st.session_state.rag_pipeline.llm

            # Log provider info.
            provider_icon = "☁️" if current_provider == "groq" else "πŸ–₯️"
            add_log(f"{provider_icon} LLM Provider: {current_provider.upper()}")

            # Get test data.
            add_log("πŸ“₯ Loading test data...")
            loader = RAGBenchLoader()
            test_data = loader.get_test_data(
                st.session_state.dataset_name,
                num_samples
            )
            add_log(f"βœ… Loaded {len(test_data)} test samples")

            # Prepare test cases by querying the RAG system for each sample.
            test_cases = []

            progress_bar = st.progress(0)
            status_text = st.empty()

            add_log("πŸ” Processing samples...")
            for i, sample in enumerate(test_data):
                status_text.text(f"Processing sample {i+1}/{num_samples}")

                # Query the RAG system.
                result = st.session_state.rag_pipeline.query(
                    sample["question"],
                    n_results=5
                )

                # Prepare test case for the evaluator.
                test_cases.append({
                    "query": sample["question"],
                    "response": result["response"],
                    "retrieved_documents": [doc["document"] for doc in result["retrieved_documents"]],
                    "ground_truth": sample.get("answer", "")
                })

                # Update progress.
                progress_bar.progress((i + 1) / num_samples)

                # Log every 10 samples (and always on the last one).
                if (i + 1) % 10 == 0 or (i + 1) == num_samples:
                    add_log(f" βœ“ Processed {i + 1}/{num_samples} samples")

            status_text.text(f"Running {method_names.get(method, method)} evaluation...")
            add_log(f"πŸ“Š Running evaluation using {method_names.get(method, method)}...")

            # Extract chunking and embedding metadata from session state
            # (these were stored when the collection was loaded/created).
            chunking_strategy = st.session_state.vector_store.chunking_strategy if st.session_state.vector_store else None
            embedding_model = st.session_state.embedding_model
            chunk_size = st.session_state.vector_store.chunk_size if st.session_state.vector_store else None
            chunk_overlap = st.session_state.vector_store.chunk_overlap if st.session_state.vector_store else None

            # Log retrieval configuration.
            add_log(f"πŸ”§ Retrieval Configuration:")
            add_log(f" β€’ Chunking Strategy: {chunking_strategy or 'Unknown'}")
            add_log(f" β€’ Chunk Size: {chunk_size or 'Unknown'}")
            add_log(f" β€’ Chunk Overlap: {chunk_overlap or 'Unknown'}")
            add_log(f" β€’ Embedding Model: {embedding_model or 'Unknown'}")

            # Prefer the unified pipeline; fall back to TRACE if unavailable.
            try:
                from evaluation_pipeline import UnifiedEvaluationPipeline

                # Run evaluation with metadata using unified pipeline.
                pipeline = UnifiedEvaluationPipeline(
                    llm_client=eval_llm_client,
                    chunking_strategy=chunking_strategy,
                    embedding_model=embedding_model,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )

                # Run evaluation with selected method.
                results = pipeline.evaluate_batch(test_cases, method=method)

            except ImportError:
                # Fallback to TRACE only if evaluation_pipeline module not available.
                add_log("⚠️ evaluation_pipeline module not found, falling back to TRACE...")

                evaluator = TRACEEvaluator(
                    chunking_strategy=chunking_strategy,
                    embedding_model=embedding_model,
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                results = evaluator.evaluate_batch(test_cases)

            st.session_state.evaluation_results = results

            # Log evaluation results summary.
            add_log("βœ… Evaluation completed successfully!")

            # Display appropriate metrics based on method.
            if method == "trace":
                add_log(f" β€’ Utilization: {results.get('utilization', 0):.2%}")
                add_log(f" β€’ Relevance: {results.get('relevance', 0):.2%}")
                add_log(f" β€’ Adherence: {results.get('adherence', 0):.2%}")
                add_log(f" β€’ Completeness: {results.get('completeness', 0):.2%}")
                add_log(f" β€’ Average: {results.get('average', 0):.2%}")
            elif method == "gpt_labeling":
                if "context_relevance" in results:
                    add_log(f" β€’ Context Relevance: {results.get('context_relevance', 0):.2%}")
                    add_log(f" β€’ Context Utilization: {results.get('context_utilization', 0):.2%}")
                    add_log(f" β€’ Completeness: {results.get('completeness', 0):.2%}")
                    add_log(f" β€’ Adherence: {results.get('adherence', 0):.2%}")
                    add_log(f" β€’ Average: {results.get('average', 0):.2%}")
                # Display RMSE and AUCROC metrics if available.
                if "rmse_metrics" in results:
                    add_log(f"πŸ“ˆ RMSE Metrics (vs ground truth):")
                    rmse_metrics = results.get("rmse_metrics", {})
                    add_log(f" β€’ Context Relevance RMSE: {rmse_metrics.get('relevance', 0):.4f}")
                    add_log(f" β€’ Context Utilization RMSE: {rmse_metrics.get('utilization', 0):.4f}")
                    add_log(f" β€’ Completeness RMSE: {rmse_metrics.get('completeness', 0):.4f}")
                    add_log(f" β€’ Adherence RMSE: {rmse_metrics.get('adherence', 0):.4f}")
                    add_log(f" β€’ Average RMSE: {rmse_metrics.get('average', 0):.4f}")
                if "auc_metrics" in results:
                    add_log(f"πŸ“Š AUCROC Metrics (binary classification):")
                    auc_metrics = results.get("auc_metrics", {})
                    add_log(f" β€’ Context Relevance AUCROC: {auc_metrics.get('relevance', 0):.4f}")
                    add_log(f" β€’ Context Utilization AUCROC: {auc_metrics.get('utilization', 0):.4f}")
                    add_log(f" β€’ Completeness AUCROC: {auc_metrics.get('completeness', 0):.4f}")
                    add_log(f" β€’ Adherence AUCROC: {auc_metrics.get('adherence', 0):.4f}")
                    add_log(f" β€’ Average AUCROC: {auc_metrics.get('average', 0):.4f}")
            elif method == "hybrid":
                add_log(" πŸ“Š TRACE Metrics:")
                trace_res = results.get("trace_results", {})
                add_log(f" β€’ Utilization: {trace_res.get('utilization', 0):.2%}")
                add_log(f" β€’ Relevance: {trace_res.get('relevance', 0):.2%}")
                add_log(f" β€’ Adherence: {trace_res.get('adherence', 0):.2%}")
                add_log(f" β€’ Completeness: {trace_res.get('completeness', 0):.2%}")
                add_log(" 🧠 GPT Labeling Metrics:")
                gpt_res = results.get("gpt_results", {})
                add_log(f" β€’ Context Relevance: {gpt_res.get('context_relevance', 0):.2%}")
                add_log(f" β€’ Context Utilization: {gpt_res.get('context_utilization', 0):.2%}")
                add_log(f" β€’ Completeness: {gpt_res.get('completeness', 0):.2%}")
                add_log(f" β€’ Adherence: {gpt_res.get('adherence', 0):.2%}")

            add_log(f"⏱️ Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        except Exception as e:
            st.error(f"Error during evaluation: {str(e)}")
            add_log(f"❌ Error: {str(e)}")
        finally:
            # Always restore the pipeline's original LLM if it was swapped.
            # (Previously the restore only ran on success, so a failed
            # evaluation left the chat pipeline using the evaluation model.)
            if original_llm is not None:
                st.session_state.rag_pipeline.llm = original_llm
                add_log(f"πŸ”„ Restored original LLM")
1463
+
1464
+
1465
def history_interface():
    """History interface tab.

    Renders the saved conversations from ``st.session_state.chat_history``:
    offers a timestamped JSON export and shows each entry (query, response,
    timestamp, retrieved documents) inside an expander.
    """
    st.subheader("πŸ“œ Chat History")

    # Nothing recorded yet — point the user at the Chat tab and bail out.
    if not st.session_state.chat_history:
        st.info("No chat history yet. Start a conversation in the Chat tab!")
        return

    # Export button lives in the narrow right-hand column; the wide left
    # column is intentionally unused (layout spacer).
    _, export_col = st.columns([3, 1])
    with export_col:
        # ensure_ascii=False keeps emoji/non-ASCII text readable in the
        # exported file instead of \uXXXX escapes.
        history_json = json.dumps(
            st.session_state.chat_history, indent=2, ensure_ascii=False
        )
        st.download_button(
            label="πŸ’Ύ Export History",
            data=history_json,
            file_name=f"chat_history_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json",
        )

    # Display history. Use .get() throughout (timestamp already did) so a
    # single malformed entry cannot crash the whole tab with a KeyError.
    for i, entry in enumerate(st.session_state.chat_history):
        query = entry.get("query", "")
        with st.expander(f"πŸ’¬ Conversation {i+1}: {query[:50]}..."):
            st.markdown(f"**Query:** {query}")
            st.markdown(f"**Response:** {entry.get('response', '')}")
            st.markdown(f"**Timestamp:** {entry.get('timestamp', 'N/A')}")

            st.markdown("**Retrieved Documents:**")
            for j, doc in enumerate(entry.get("retrieved_documents", [])):
                st.text_area(
                    f"Document {j+1}",
                    value=doc.get("document", ""),
                    height=100,
                    # Unique widget key per (conversation, document) pair —
                    # Streamlit requires distinct keys for repeated widgets.
                    key=f"history_doc_{i}_{j}",
                )
1501
# Script entry point: run the Streamlit app's main() only when executed
# directly (e.g. via `streamlit run streamlit_app.py`), not on import.
if __name__ == "__main__":
    main()