Spaces:

aneeb15
/

Auto-FineTune-Ops

Configuration error

File size: 70,136 Bytes

"""

Auto-FineTune-Ops: Streamlit Dashboard

======================================

Premium interactive dashboard for ML fine-tuning pipeline.

"""

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import sys
import os
import json
import time
from datetime import datetime

# Put this script's own directory at the front of the import path so that
# packages living next to it (e.g. `preprocessing`) can be imported.
_APP_DIR = Path(__file__).parent
sys.path.insert(0, str(_APP_DIR))

# Browser-tab title/icon and overall layout. Streamlit requires this to be the
# first Streamlit command executed by the script.
_PAGE_SETTINGS = {
    "page_title": "Auto-FineTune-Ops",
    "page_icon": "🤖",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Global look-and-feel: inject one <style> block into the page.
# `unsafe_allow_html=True` is required, otherwise the tag would be escaped
# and shown as text. The rules cover two groups:
#   * Streamlit's own widgets (.stMetric, .stButton, .stProgress, .stTabs,
#     the sidebar section) -- these selectors depend on Streamlit's generated
#     markup, so re-check them after a Streamlit upgrade.
#   * Custom classes used by HTML snippets elsewhere in this file:
#     .gradient-header, .info-card, .success-badge, .warning-badge.
st.markdown("""

<style>

    /* Main container */

    .main .block-container {

        padding-top: 2rem;

        padding-bottom: 2rem;

    }

    

    /* Cards */

    .stMetric {

        background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);

        padding: 1rem;

        border-radius: 12px;

        border: 1px solid rgba(99, 102, 241, 0.2);

        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);

    }

    

    /* Gradient headers */

    .gradient-header {

        background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7);

        -webkit-background-clip: text;

        -webkit-text-fill-color: transparent;

        font-size: 2.5rem;

        font-weight: 700;

        margin-bottom: 1rem;

    }

    

    /* Info cards */

    .info-card {

        background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);

        padding: 1.5rem;

        border-radius: 16px;

        border: 1px solid rgba(99, 102, 241, 0.3);

        margin: 1rem 0;

    }

    

    /* Success badge */

    .success-badge {

        background: linear-gradient(90deg, #10b981, #059669);

        color: white;

        padding: 0.5rem 1rem;

        border-radius: 20px;

        font-weight: 600;

        display: inline-block;

    }

    

    /* Warning badge */

    .warning-badge {

        background: linear-gradient(90deg, #f59e0b, #d97706);

        color: white;

        padding: 0.5rem 1rem;

        border-radius: 20px;

        font-weight: 600;

        display: inline-block;

    }

    

    /* Sidebar styling */

    section[data-testid="stSidebar"] {

        background: linear-gradient(180deg, #0f0f23 0%, #1a1a2e 100%);

    }

    

    /* Button styling */

    .stButton > button {

        background: linear-gradient(90deg, #6366f1, #8b5cf6);

        color: white;

        border: none;

        border-radius: 8px;

        padding: 0.5rem 2rem;

        font-weight: 600;

        transition: all 0.3s ease;

    }

    

    .stButton > button:hover {

        transform: translateY(-2px);

        box-shadow: 0 4px 20px rgba(99, 102, 241, 0.4);

    }

    

    /* Progress bar */

    .stProgress > div > div {

        background: linear-gradient(90deg, #6366f1, #8b5cf6, #a855f7);

    }

    

    /* Tab styling */

    .stTabs [data-baseweb="tab-list"] {

        gap: 8px;

    }

    

    .stTabs [data-baseweb="tab"] {

        background: rgba(99, 102, 241, 0.1);

        border-radius: 8px;

        padding: 0.5rem 1rem;

    }

    

    .stTabs [aria-selected="true"] {

        background: linear-gradient(90deg, #6366f1, #8b5cf6);

    }

</style>

""", unsafe_allow_html=True)

# Seed st.session_state with first-run defaults. The script is re-executed on
# every interaction, so a key is only written when it is not already present;
# values set by earlier runs are left alone.
_SESSION_DEFAULTS = {
    'current_page': 'home',
    'uploaded_data': None,
    'processed_data_path': None,
    'model_path': None,
    'training_goal': None,
    # One entry per pipeline stage, shown in the sidebar status list.
    'pipeline_status': {
        'data': 'pending',
        'training': 'pending',
        'evaluation': 'pending',
        'deployment': 'pending'
    },
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Sidebar: brand header, one navigation button per page, pipeline status list
with st.sidebar:
    st.markdown(
        '<p class="gradient-header" style="font-size: 1.5rem;">🤖 Auto-FineTune-Ops</p>',
        unsafe_allow_html=True,
    )
    st.markdown("---")

    # Page key -> (icon, button label). The key is what gets stored in
    # st.session_state.current_page when the button is pressed.
    pages = dict(
        home=('🏠', 'Dashboard'),
        data=('📊', 'Data Upload'),
        process=('🧹', 'Processing'),
        training=('🚀', 'Training'),
        evaluation=('⚖️', 'Evaluation'),
        deploy=('🌐', 'Deploy'),
    )

    for page_key, page_meta in pages.items():
        nav_icon, nav_label = page_meta
        pressed = st.button(
            f"{nav_icon} {nav_label}",
            key=f"nav_{page_key}",
            use_container_width=True,
        )
        if pressed:
            st.session_state.current_page = page_key

    st.markdown("---")

    # One line per pipeline stage; unknown status values fall back to ⏳.
    st.markdown("### 📋 Pipeline Status")
    status_icons = {'pending': '⏳', 'running': '🔄', 'complete': '✅', 'error': '❌'}
    for stage_name, stage_state in st.session_state.pipeline_status.items():
        marker = status_icons.get(stage_state, '⏳')
        st.markdown(f"{marker} **{stage_name.title()}**: {stage_state}")

    st.markdown("---")
    st.markdown("*Built with ❤️ using Streamlit*")


# ============================================================================
# PAGE: HOME DASHBOARD
# ============================================================================
def _info_card_html(title, body):
    """Return the HTML for one ``.info-card`` as a single line.

    The snippet is deliberately emitted without blank or indented lines:
    in Markdown a blank line ends a ``<div>`` HTML block, and a following
    line indented by four or more spaces is then treated as a code block,
    which makes the inner ``<h4>``/``<p>`` show up as literal text.

    Args:
        title: Card heading (trusted, hard-coded text).
        body: Card description (trusted, hard-coded text).
    """
    return f'<div class="info-card"><h4>{title}</h4><p>{body}</p></div>'


def _render_output_listing(directory, pattern, icon, empty_message,
                           missing_message, limit=5):
    """List the most recently modified entries of ``directory`` in the UI.

    Args:
        directory: ``Path`` of the folder to inspect.
        pattern: Glob pattern selecting the entries to show.
        icon: Emoji placed in front of every entry name.
        empty_message: Info text shown when the folder has no matching entry.
        missing_message: Info text shown when the folder does not exist.
        limit: Maximum number of entries to display.
    """
    if not directory.exists():
        st.info(missing_message)
        return

    def _modified_at(path):
        # An entry can vanish between glob() and stat(); sort it last
        # instead of failing the whole page.
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    # glob() yields entries in arbitrary order, so sort newest-first to make
    # the truncated list show the most recent files.
    entries = sorted(directory.glob(pattern), key=_modified_at, reverse=True)
    if not entries:
        st.info(empty_message)
        return

    for entry in entries[:limit]:
        st.markdown(f"- {icon} `{entry.name}`")


def render_home():
    """Render the landing dashboard.

    Shows four status metrics (dataset + three pipeline stages read from
    ``st.session_state``), a four-step quick-start guide, and tabbed listings
    of the newest files under ``./output/models``, ``./output/reports`` and
    ``./output/logs``.
    """
    st.markdown('<p class="gradient-header">🏠 Pipeline Dashboard</p>', unsafe_allow_html=True)
    st.markdown("**One-click autonomous ML fine-tuning pipeline**")

    # Status cards
    dataset_loaded = st.session_state.uploaded_data is not None
    pipeline_status = st.session_state.pipeline_status

    dataset_col, *stage_cols = st.columns(4)
    with dataset_col:
        st.metric(
            label="📊 Dataset",
            value="Ready" if dataset_loaded else "Not Loaded",
            delta="Uploaded" if dataset_loaded else None,
        )

    # (metric label, key into pipeline_status) for the remaining three cards
    stage_cards = [
        ("🧹 Processing", 'data'),
        ("🚀 Training", 'training'),
        ("⚖️ Evaluation", 'evaluation'),
    ]
    for column, (label, stage) in zip(stage_cols, stage_cards):
        stage_state = pipeline_status[stage]
        with column:
            st.metric(
                label=label,
                value=stage_state.title(),
                delta="Complete" if stage_state == 'complete' else None,
            )

    st.markdown("---")

    # Quick start guide
    st.markdown("### 🚀 Quick Start Guide")

    left, right = st.columns(2)

    with left:
        st.markdown(
            _info_card_html(
                "📊 Step 1: Upload Data",
                "Upload your CSV/JSON dataset with instruction-response pairs.",
            ),
            unsafe_allow_html=True,
        )
        st.markdown(
            _info_card_html(
                "🧹 Step 2: Process Data",
                "The DataArchitectAgent will clean and format your data.",
            ),
            unsafe_allow_html=True,
        )

    with right:
        st.markdown(
            _info_card_html(
                "🚀 Step 3: Train Model",
                "Fine-tune with auto-configured hyperparameters.",
            ),
            unsafe_allow_html=True,
        )
        st.markdown(
            _info_card_html(
                "⚖️ Step 4: Evaluate",
                "Run Model Arena with LLM-as-Judge evaluation.",
            ),
            unsafe_allow_html=True,
        )

    # Recent output files
    st.markdown("---")
    st.markdown("### 📁 Output Files")

    output_dir = Path("./output")
    if not output_dir.exists():
        st.info("Output directory will be created when you run the pipeline.")
        return

    models_tab, reports_tab, logs_tab = st.tabs(["📂 Models", "📊 Reports", "📝 Logs"])

    with models_tab:
        _render_output_listing(
            output_dir / "models", "*", "🤖",
            empty_message="No trained models yet.",
            missing_message="Models directory not found.",
        )

    with reports_tab:
        _render_output_listing(
            output_dir / "reports", "*.json", "📊",
            empty_message="No evaluation reports yet.",
            missing_message="Reports directory not found.",
        )

    with logs_tab:
        _render_output_listing(
            output_dir / "logs", "*.yaml", "📝",
            empty_message="No log files yet.",
            missing_message="Logs directory not found.",
        )


# ============================================================================
# PAGE: DATA UPLOAD
# ============================================================================
def _first_matching_column(columns, patterns):
    """Return the first column whose lower-cased name contains any pattern.

    Column labels are converted with ``str()`` first because labels read from
    JSON can be non-string (e.g. integers).

    Args:
        columns: Iterable of column labels, checked in order.
        patterns: Lower-case substrings to look for.

    Returns:
        The matching column label, or ``None`` when nothing matches.
    """
    for column in columns:
        lowered = str(column).lower()
        if any(pattern in lowered for pattern in patterns):
            return column
    return None


def render_data_upload():
    """Render the data upload page.

    Lets the user upload a CSV/JSON/JSONL file (or merge an additional file
    with identical columns into the loaded dataset), then shows dataset
    statistics, auto-detected instruction/output columns, a scrollable
    preview, CSV/JSON download buttons and a per-column summary.

    Reads and writes ``st.session_state`` keys ``uploaded_data``,
    ``uploaded_filename``, ``processed_data_path``, ``pipeline_status``,
    ``show_add_file`` and ``upload_counter``.
    """
    import html  # local import: only used to escape column names in badges

    st.markdown('<p class="gradient-header">📊 Data Upload & Preview</p>', unsafe_allow_html=True)

    state = st.session_state

    # ── File Management Bar ──
    if state.uploaded_data is not None:
        fm1, fm2, fm3 = st.columns([3, 1, 1])
        with fm1:
            # `or` (not a .get default) because the key may exist with value None.
            current_name = state.get('uploaded_filename') or 'dataset'
            st.info(f"📂 Currently loaded: **{current_name}** ({len(state.uploaded_data):,} rows)")
        with fm2:
            if st.button("🗑️ Remove Dataset", type="secondary"):
                state.uploaded_data = None
                state.uploaded_filename = None
                state.processed_data_path = None
                state.pipeline_status['data'] = 'pending'
                st.rerun()
        with fm3:
            if st.button("📎 Add More Data"):
                state['show_add_file'] = True

    # ── File Uploader ──
    show_uploader = (state.uploaded_data is None) or state.get('show_add_file', False)

    if show_uploader:
        if state.uploaded_data is None:
            upload_label = "Upload your dataset (CSV, JSON, or JSONL)"
        else:
            upload_label = "Upload additional file to merge with current dataset"

        # The counter is part of the widget key so that bumping it after a
        # successful load gives a fresh, empty uploader on the next run.
        uploaded_file = st.file_uploader(
            upload_label,
            type=['csv', 'json', 'jsonl'],
            help="Your dataset should contain instruction-response pairs.",
            key=f"uploader_{state.get('upload_counter', 0)}"
        )

        if uploaded_file:
            try:
                # Compare the extension case-insensitively so "DATA.CSV" is
                # not mistaken for JSON.
                lowered_name = uploaded_file.name.lower()
                if lowered_name.endswith('.csv'):
                    new_df = pd.read_csv(uploaded_file)
                elif lowered_name.endswith('.jsonl'):
                    new_df = pd.read_json(uploaded_file, lines=True)
                else:
                    new_df = pd.read_json(uploaded_file)

                # Merge or replace
                if state.uploaded_data is not None and state.get('show_add_file', False):
                    existing_df = state.uploaded_data
                    if list(new_df.columns) == list(existing_df.columns):
                        state.uploaded_data = pd.concat([existing_df, new_df], ignore_index=True)
                        previous_name = state.get('uploaded_filename') or 'data'
                        state.uploaded_filename = f"{previous_name} + {uploaded_file.name}"
                        st.success(f"✅ Merged **{uploaded_file.name}** ({len(new_df):,} rows) → Total: **{len(state.uploaded_data):,}** rows")
                    else:
                        st.error(f"❌ Column mismatch! Existing: {list(existing_df.columns)} vs New: {list(new_df.columns)}")
                else:
                    state.uploaded_data = new_df
                    state.uploaded_filename = uploaded_file.name
                    st.success(f"✅ Successfully loaded **{uploaded_file.name}**")

                state['show_add_file'] = False
                state['upload_counter'] = state.get('upload_counter', 0) + 1

            except Exception as e:
                # UI boundary: any parse failure is reported to the user
                # instead of crashing the page.
                st.error(f"Error loading file: {str(e)}")

    # ── Data Display ──
    if state.uploaded_data is None:
        return

    df = state.uploaded_data

    # Dataset statistics
    st.markdown("### 📈 Dataset Statistics")
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Rows", f"{len(df):,}")
    with col2:
        st.metric("Total Columns", len(df.columns))
    with col3:
        total_bytes = df.memory_usage(deep=True).sum()
        st.metric("Memory Size", f"{total_bytes / 1024:.1f} KB")
    with col4:
        missing = int(df.isnull().sum().sum())
        st.metric("Missing Values", missing)

    st.markdown("---")

    # Column detection
    st.markdown("### 🔍 Auto-Detected Columns")
    instruction_patterns = ['instruction', 'prompt', 'question', 'query', 'user', 'input_text']
    output_patterns = ['output', 'response', 'answer', 'completion', 'assistant', 'target']

    detected_instruction = _first_matching_column(df.columns, instruction_patterns)
    detected_output = _first_matching_column(df.columns, output_patterns)

    # Column names come from the uploaded file and are rendered with
    # unsafe_allow_html, so they must be HTML-escaped.
    col1, col2 = st.columns(2)
    with col1:
        if detected_instruction is not None:
            safe_name = html.escape(str(detected_instruction))
            st.markdown(f'<span class="success-badge">Instruction: {safe_name}</span>', unsafe_allow_html=True)
        else:
            st.markdown('<span class="warning-badge">Instruction: Not detected</span>', unsafe_allow_html=True)
    with col2:
        if detected_output is not None:
            safe_name = html.escape(str(detected_output))
            st.markdown(f'<span class="success-badge">Output: {safe_name}</span>', unsafe_allow_html=True)
        else:
            st.markdown('<span class="warning-badge">Output: Not detected</span>', unsafe_allow_html=True)

    st.markdown("---")

    # Full data preview (scrollable)
    st.markdown("### 👀 Complete Data Preview")
    st.caption(f"Showing all **{len(df):,}** rows. Scroll to browse the full dataset.")
    st.dataframe(df, use_container_width=True, height=450)

    # Download raw data
    st.markdown("### 📥 Download Dataset")
    # Strip the last extension; fall back to 'dataset' when no name is stored.
    base_name = (state.get('uploaded_filename') or 'dataset').rsplit('.', 1)[0]
    dl1, dl2 = st.columns(2)
    with dl1:
        csv_data = df.to_csv(index=False).encode('utf-8')
        st.download_button("⬇️ Download as CSV", csv_data,
            file_name=f"{base_name}.csv",
            mime="text/csv")
    with dl2:
        json_data = df.to_json(orient='records', indent=2).encode('utf-8')
        st.download_button("⬇️ Download as JSON", json_data,
            file_name=f"{base_name}.json",
            mime="application/json")

    # Column summary
    st.markdown("### 📋 Column Summary")
    col_info = []
    for col in df.columns:
        series = df[col]
        # A zero-row dataset (e.g. header-only CSV) has no first value.
        sample = str(series.iloc[0]) if len(series) > 0 else ''
        if len(sample) > 80:
            sample = sample[:80] + '...'
        col_info.append({
            'Column': col,
            'Type': str(series.dtype),
            'Non-Null': series.notna().sum(),
            'Unique': series.nunique(),
            'Sample': sample
        })
    st.dataframe(pd.DataFrame(col_info), use_container_width=True)


# ============================================================================
# PAGE: DATA PROCESSING
# ============================================================================
def render_processing():
    st.markdown('<p class="gradient-header">🧹 Advanced Data Processing</p>', unsafe_allow_html=True)

    if st.session_state.uploaded_data is None:
        st.warning("⚠️ Please upload a dataset first!")
        if st.button("📊 Go to Data Upload"):
            st.session_state.current_page = 'data'
            st.rerun()
        return

    df = st.session_state.uploaded_data

    # ── Dataset Stats Header ──
    st.markdown("### 📈 Dataset Statistics")
    sc1, sc2, sc3, sc4 = st.columns(4)
    with sc1:
        st.metric("Total Rows", f"{len(df):,}")
    with sc2:
        st.metric("Columns", len(df.columns))
    with sc3:
        avg_len = int(df.iloc[:, 0].astype(str).str.len().mean()) if len(df) > 0 else 0
        st.metric("Avg Text Length", f"{avg_len:,} chars")
    with sc4:
        est_tokens = int(avg_len * len(df) / 4) if avg_len > 0 else 0
        st.metric("Est. Total Tokens", f"{est_tokens:,}")

    st.markdown("---")

    # ── Training Goal ──
    goal = st.text_input(
        "Training Goal",
        value=st.session_state.training_goal or "assistant",
        help="e.g., medical_assistant, customer_support, code_helper"
    )
    st.session_state.training_goal = goal

    # ── Column Mapping ──
    st.markdown("### 🎯 Column Mapping")
    instruction_patterns = ['instruction', 'prompt', 'question', 'query', 'user', 'input_text', 'human']
    output_patterns = ['output', 'response', 'answer', 'completion', 'assistant', 'target']
    input_patterns = ['context', 'input', 'background', 'reference']

    detected_instruction = detected_output = detected_input = None
    available_columns = list(df.columns)

    for col in available_columns:
        col_lower = col.lower()
        for p in instruction_patterns:
            if p in col_lower and not detected_instruction:
                detected_instruction = col
        for p in output_patterns:
            if p in col_lower and not detected_output:
                detected_output = col
        for p in input_patterns:
            if p in col_lower and not detected_input:
                detected_input = col

    mc1, mc2, mc3 = st.columns(3)
    with mc1:
        instruction_col = st.selectbox("Instruction Column *", options=available_columns,
            index=available_columns.index(detected_instruction) if detected_instruction else 0,
            help="Column containing instructions/prompts/questions")
    with mc2:
        output_col = st.selectbox("Output Column *", options=available_columns,
            index=available_columns.index(detected_output) if detected_output else (1 if len(available_columns) > 1 else 0),
            help="Column containing responses/answers/outputs")
    with mc3:
        input_col_options = ["None"] + available_columns
        default_input_idx = input_col_options.index(detected_input) if detected_input else 0
        input_col_selection = st.selectbox("Input/Context Column (Optional)", options=input_col_options,
            index=default_input_idx, help="Optional column containing additional context")
        input_col = None if input_col_selection == "None" else input_col_selection

    st.markdown("---")

    # ── Safe Preset Button ──
    if st.button("🛡️ Load Safe Preset", help="Apply recommended defaults for most datasets"):
        st.session_state['safe_preset'] = True
        st.rerun()

    use_safe = st.session_state.get('safe_preset', False)

    # ====================================================================
    # 1️⃣ Text Cleaning Controls
    # ====================================================================
    with st.expander("1️⃣ Text Cleaning Controls", expanded=False):
        tc1, tc2 = st.columns(2)
        with tc1:
            clean_html = st.checkbox("Remove HTML Tags", value=use_safe, help="Strip all HTML/XML tags from text")
            clean_urls = st.checkbox("Remove URLs", value=use_safe, help="Remove http/https/www links")
            clean_emojis = st.checkbox("Remove Emojis", value=False, help="Strip emoji characters")
            clean_whitespace = st.checkbox("Normalize Whitespace", value=True, help="Collapse multiple spaces/tabs into one")
        with tc2:
            clean_lowercase = st.checkbox("Lowercase All Text", value=False, help="Convert text to lowercase (disable to preserve case)")
            clean_special = st.checkbox("Remove Special Characters", value=False, help="Keep only alphanumeric + basic punctuation")
            clean_linebreaks = st.checkbox("Strip Extra Line Breaks", value=True, help="Reduce 3+ newlines to double newlines")

    # ====================================================================
    # 2️⃣ Tokenization Controls
    # ====================================================================
    with st.expander("2️⃣ Tokenization Controls", expanded=False):
        tk1, tk2 = st.columns(2)
        with tk1:
            tokenizer_choice = st.selectbox("Tokenizer", ["tiktoken", "HuggingFace"],
                help="tiktoken = OpenAI-compatible, HuggingFace = model-specific tokenizer")
            if tokenizer_choice == "HuggingFace":
                hf_model_name = st.text_input("HF Model Name", value="meta-llama/Llama-3-8b",
                    help="HuggingFace model name for tokenizer")
            else:
                hf_model_name = ""
            max_total_tokens = st.slider("Max Tokens per Sample", 128, 8192, 2048,
                help="Maximum total tokens allowed per sample")
        with tk2:
            truncate_long = st.checkbox("Truncate Long Samples", value=False,
                help="Cut text exceeding max tokens")
            split_long = st.checkbox("Split Long Samples into Chunks", value=False,
                help="Break long texts into overlapping chunks")
            if split_long:
                split_overlap = st.slider("Chunk Overlap Tokens", 0, 200, 50,
                    help="Number of overlapping tokens between chunks")
            else:
                split_overlap = 50

        # Token stats preview
        if st.button("📊 Show Token Stats Preview", key="token_stats_btn"):
            with st.spinner("Counting tokens..."):
                try:
                    from preprocessing.tokenization import TokenizationConfig, get_tokenizer, compute_token_stats
                    tk_cfg = TokenizationConfig(
                        tokenizer_name="tiktoken" if tokenizer_choice == "tiktoken" else hf_model_name,
                    )
                    tokenizer = get_tokenizer(tk_cfg)
                    is_tiktoken = tokenizer_choice == "tiktoken"
                    stats_cols = [c for c in [instruction_col, output_col] if c in df.columns]
                    stats = compute_token_stats(df.head(200), stats_cols, tokenizer, is_tiktoken)
                    for col_name, s in stats.items():
                        st.markdown(f"**{col_name}**: min={s['min']}, max={s['max']}, mean={s['mean']}, p95={s['p95']}")
                except Exception as e:
                    st.warning(f"Could not compute token stats: {e}")

    # ====================================================================
    # 3️⃣ System Prompt Configuration
    # ====================================================================
    with st.expander("3️⃣ System Prompt Configuration", expanded=False):
        system_prompt_text = st.text_area("Global System Prompt",
            value="You are a helpful AI assistant." if not use_safe else "You are a helpful AI assistant.",
            height=100, help="System prompt prepended to every sample in chat format")
        prepend_system = st.checkbox("Prepend System Prompt to All Samples", value=True,
            help="Include this system prompt in all formatted entries")

        if st.button("👁️ Preview Formatted Chat JSON", key="preview_chat_btn"):
            try:
                from preprocessing.system_prompt import preview_formatted_json
                preview = preview_formatted_json(df, system_prompt_text, instruction_col, output_col, input_col, n=2)
                st.code(preview, language="json")
            except Exception as e:
                st.warning(f"Preview error: {e}")

    # ====================================================================
    # 4️⃣ Dataset Balancing
    # ====================================================================
    with st.expander("4️⃣ Dataset Balancing (Classification)", expanded=False):
        balance_enabled = st.checkbox("Enable Class Balancing", value=False,
            help="Balance class distribution for classification tasks")
        if balance_enabled:
            label_col_options = available_columns
            label_col = st.selectbox("Label Column", options=label_col_options,
                help="Column containing class labels")
            balance_strategy = st.radio("Strategy", ["none", "oversample", "undersample"],
                help="Oversample = duplicate minority, Undersample = drop majority")

            # Show distribution chart
            if label_col in df.columns:
                from preprocessing.dataset_balancing import compute_label_distribution
                dist = compute_label_distribution(df, label_col)
                if dist:
                    fig = px.bar(x=list(dist.keys()), y=list(dist.values()),
                        labels={'x': 'Label', 'y': 'Count'}, title="Label Distribution")
                    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
                        font_color='#e2e8f0')
                    st.plotly_chart(fig, use_container_width=True)
        else:
            label_col = None
            balance_strategy = "none"

    # ====================================================================
    # 5️⃣ Quality Filters
    # ====================================================================
    with st.expander("5️⃣ Quality Filters", expanded=False):
        qf1, qf2 = st.columns(2)
        with qf1:
            min_words = st.number_input("Min Word Count", min_value=0, value=3 if use_safe else 0,
                help="Minimum words required per sample (0 = no filter)")
            max_words = st.number_input("Max Word Count", min_value=0, value=0,
                help="Maximum words allowed per sample (0 = no limit)")
            profanity_filter = st.checkbox("Profanity Filter", value=False,
                help="Remove samples containing profane language")
        with qf2:
            language_filter = st.checkbox("Language Detection Filter", value=False,
                help="Keep only samples in specified languages")
            if language_filter:
                allowed_langs = st.text_input("Allowed Languages (comma-separated)", value="en",
                    help="ISO 639-1 codes, e.g. en,fr,de")
            else:
                allowed_langs = "en"
            remove_low_quality = st.checkbox("Remove Low-Quality Responses", value=use_safe,
                help="Remove short / generic / placeholder responses")

    # ====================================================================
    # 6️⃣ Deduplication Advanced
    # ====================================================================
    with st.expander("6️⃣ Deduplication", expanded=False):
        dedup_exact = st.checkbox("Remove Exact Duplicates", value=True,
            help="Remove rows with identical instruction text")
        dedup_semantic = st.checkbox("Remove Semantic Duplicates", value=False,
            help="Use TF-IDF cosine similarity to find near-duplicates")
        if dedup_semantic:
            semantic_threshold = st.slider("Similarity Threshold", 0.5, 1.0, 0.90, 0.01,
                help="Cosine similarity above this threshold = duplicate (higher = stricter)")
        else:
            semantic_threshold = 0.90

    # ====================================================================
    # 7️⃣ Train / Validation Split
    # ====================================================================
    with st.expander("7️⃣ Train / Validation Split", expanded=False):
        split_enabled = st.checkbox("Enable Train/Val Split", value=True,
            help="Split dataset into training and validation sets")
        if split_enabled:
            train_ratio = st.slider("Train Ratio", 0.5, 0.95, 0.9 if use_safe else 0.8, 0.05,
                help="Proportion of data used for training")
            st.markdown(f"**Split**: {int(train_ratio*100)}% Train / {int((1-train_ratio)*100)}% Validation")
            random_seed = st.number_input("Random Seed", min_value=0, value=42,
                help="Seed for reproducible splits")
            shuffle_data = st.checkbox("Shuffle Before Split", value=True,
                help="Randomly shuffle data before splitting")
        else:
            train_ratio = 0.8
            random_seed = 42
            shuffle_data = True

    # ====================================================================
    # 8️⃣ Output Formatting
    # ====================================================================
    with st.expander("8️⃣ Output Formatting", expanded=False):
        format_type = st.selectbox("Export Format", ["openai_chat", "completion", "classification", "custom"],
            help="OpenAI Chat = messages format, Completion = prompt/completion, Classification = text/label")

        custom_schema = {}
        if format_type == "custom":
            st.markdown("**Define Custom Schema** (output_key → source_column)")
            num_fields = st.number_input("Number of Fields", 1, 10, 2)
            for i in range(int(num_fields)):
                fc1, fc2 = st.columns(2)
                with fc1:
                    key = st.text_input(f"Output Key {i+1}", value=f"field_{i+1}", key=f"ckey_{i}")
                with fc2:
                    val = st.selectbox(f"Source Column {i+1}", options=available_columns, key=f"cval_{i}")
                custom_schema[key] = val

    # ====================================================================
    # 9️⃣ Safety & PII Filtering
    # ====================================================================
    with st.expander("9️⃣ Safety & PII Filtering", expanded=False):
        pii1, pii2 = st.columns(2)
        with pii1:
            pii_emails = st.checkbox("Detect & Mask Emails", value=use_safe,
                help="Replace email addresses with [REDACTED]")
            pii_phones = st.checkbox("Detect & Mask Phone Numbers", value=use_safe,
                help="Replace phone numbers with [REDACTED]")
            pii_ids = st.checkbox("Detect & Mask CNIC/SSN", value=use_safe,
                help="Replace national ID / SSN patterns with [REDACTED]")
        with pii2:
            pii_keys = st.checkbox("Detect & Mask API Keys", value=use_safe,
                help="Replace long hex/base64 strings that look like secrets")
            pii_addresses = st.checkbox("Detect & Mask Addresses", value=False,
                help="Replace street addresses and zip codes")

    # ====================================================================
    # 🔟 Augmentation (Optional)
    # ====================================================================
    with st.expander("🔟 Augmentation (Optional)", expanded=False):
        aug_enabled = st.checkbox("Enable Data Augmentation", value=False,
            help="Generate synthetic variations of existing samples")
        if aug_enabled:
            ag1, ag2 = st.columns(2)
            with ag1:
                aug_paraphrase = st.checkbox("Paraphrase Instructions", value=True,
                    help="Synonym-based paraphrasing of instructions")
                aug_variations = st.checkbox("Generate Variations", value=False,
                    help="Minor text variations (punctuation, casing)")
            with ag2:
                aug_backtranslate = st.checkbox("Back Translation", value=False,
                    help="Simulate back-translation for diversity")
                aug_tone = st.checkbox("Tone Rewriting", value=False,
                    help="Rewrite instructions in different tones")
            aug_factor = st.slider("Augmentation Factor", 1, 5, 1,
                help="Number of augmented copies per original sample")
        else:
            aug_paraphrase = aug_variations = aug_backtranslate = aug_tone = False
            aug_factor = 1

    st.markdown("---")

    # ── Run Pipeline Button ──
    if st.button("🚀 Run Advanced Processing Pipeline", type="primary", use_container_width=True):
        st.session_state.pipeline_status['data'] = 'running'

        with st.spinner("Running preprocessing pipeline..."):
            progress_bar = st.progress(0)
            status_text = st.empty()

            try:
                from preprocessing.pipeline import PreprocessingPipeline, PreprocessingConfig
                from preprocessing.text_cleaning import TextCleaningConfig
                from preprocessing.tokenization import TokenizationConfig
                from preprocessing.system_prompt import SystemPromptConfig
                from preprocessing.dataset_balancing import BalancingConfig
                from preprocessing.quality_filters import QualityFilterConfig
                from preprocessing.deduplication import DeduplicationConfig
                from preprocessing.train_val_split import SplitConfig
                from preprocessing.output_formatter import OutputFormatConfig, format_dataset, export_jsonl, generate_preview
                from preprocessing.pii_filter import PIIFilterConfig
                from preprocessing.augmentation import AugmentationConfig

                # Build config from UI values
                config = PreprocessingConfig(
                    instruction_col=instruction_col,
                    output_col=output_col,
                    input_col=input_col,
                    label_col=label_col if balance_enabled else None,
                    text_cleaning=TextCleaningConfig(
                        remove_html=clean_html, remove_urls=clean_urls,
                        remove_emojis=clean_emojis, normalize_whitespace=clean_whitespace,
                        lowercase=clean_lowercase, remove_special_chars=clean_special,
                        strip_extra_linebreaks=clean_linebreaks,
                    ),
                    tokenization=TokenizationConfig(
                        tokenizer_name="tiktoken" if tokenizer_choice == "tiktoken" else hf_model_name,
                        max_total_tokens=max_total_tokens,
                        truncate_long=truncate_long, split_long=split_long,
                        split_overlap=split_overlap,
                    ),
                    system_prompt=SystemPromptConfig(
                        system_prompt=system_prompt_text,
                        prepend_to_all=prepend_system,
                    ),
                    balancing=BalancingConfig(
                        enabled=balance_enabled,
                        label_column=label_col if balance_enabled else "",
                        strategy=balance_strategy if balance_enabled else "none",
                    ),
                    quality_filters=QualityFilterConfig(
                        min_word_count=min_words, max_word_count=max_words,
                        profanity_filter=profanity_filter,
                        language_filter=language_filter,
                        allowed_languages=[l.strip() for l in allowed_langs.split(',')],
                        remove_low_quality=remove_low_quality,
                    ),
                    deduplication=DeduplicationConfig(
                        remove_exact=dedup_exact, remove_semantic=dedup_semantic,
                        semantic_threshold=semantic_threshold,
                    ),
                    split=SplitConfig(
                        enabled=split_enabled, train_ratio=train_ratio,
                        random_seed=int(random_seed), shuffle=shuffle_data,
                    ),
                    output_format=OutputFormatConfig(
                        format_type=format_type, custom_schema=custom_schema,
                    ),
                    pii_filter=PIIFilterConfig(
                        filter_emails=pii_emails, filter_phones=pii_phones,
                        filter_id_numbers=pii_ids, filter_api_keys=pii_keys,
                        filter_addresses=pii_addresses,
                    ),
                    augmentation=AugmentationConfig(
                        enabled=aug_enabled, paraphrase=aug_paraphrase,
                        generate_variations=aug_variations,
                        back_translate=aug_backtranslate,
                        tone_rewrite=aug_tone,
                        augmentation_factor=aug_factor,
                    ),
                )

                def progress_cb(stage_name, pct):
                    status_text.text(f"⚙️ {stage_name}...")
                    progress_bar.progress(min(pct, 100))

                pipeline = PreprocessingPipeline(config)
                train_df, val_df, logs = pipeline.run(df, progress_callback=progress_cb)

                # Format output
                sys_prompt = system_prompt_text if prepend_system else ""
                formatted_data = format_dataset(
                    train_df, config.output_format,
                    system_prompt=sys_prompt,
                    instruction_col=instruction_col,
                    output_col=output_col,
                    input_col=input_col,
                    label_col=label_col if balance_enabled else None,
                )

                # Export
                output_dir = Path("./output/processed_data")
                output_dir.mkdir(parents=True, exist_ok=True)
                train_path = export_jsonl(formatted_data, str(output_dir / f"{goal}_train.jsonl"))

                val_path = None
                if len(val_df) > 0:
                    val_formatted = format_dataset(
                        val_df, config.output_format,
                        system_prompt=sys_prompt,
                        instruction_col=instruction_col,
                        output_col=output_col,
                        input_col=input_col,
                        label_col=label_col if balance_enabled else None,
                    )
                    val_path = export_jsonl(val_formatted, str(output_dir / f"{goal}_val.jsonl"))

                progress_bar.progress(100)
                status_text.text("✅ Pipeline complete!")

                st.session_state.processed_data_path = train_path
                st.session_state.pipeline_status['data'] = 'complete'

                # ── Results ──
                st.success(f"✅ Training data saved to: `{train_path}`")
                if val_path:
                    st.success(f"✅ Validation data saved to: `{val_path}`")

                # Stats
                rc1, rc2, rc3, rc4 = st.columns(4)
                with rc1:
                    st.metric("Original Rows", f"{len(df):,}")
                with rc2:
                    st.metric("Train Samples", f"{len(train_df):,}")
                with rc3:
                    st.metric("Val Samples", f"{len(val_df):,}")
                with rc4:
                    removed = len(df) - len(train_df) - len(val_df)
                    st.metric("Removed", f"{max(0, removed):,}")

                # ── Pipeline Logs ──
                st.markdown("### 📋 Pipeline Logs")
                log_data = []
                for log in logs:
                    log_data.append({
                        'Stage': log.stage,
                        'Description': log.description,
                        'Rows Before': log.rows_before,
                        'Rows After': log.rows_after,
                        'Delta': log.rows_delta,
                        'Time (ms)': log.duration_ms,
                    })
                st.dataframe(pd.DataFrame(log_data), use_container_width=True)

                # ── Preview ──
                st.markdown("### 👁️ Output Preview")
                preview_json = generate_preview(formatted_data, n=3)
                st.code(preview_json, language="json")

                # ── Download ──
                st.markdown("### 📥 Download")
                dl1, dl2 = st.columns(2)
                with dl1:
                    with open(train_path, 'r', encoding='utf-8') as f:
                        st.download_button("⬇️ Download Train JSONL", f.read(),
                            file_name=f"{goal}_train.jsonl", mime="application/jsonl")
                with dl2:
                    if val_path and Path(val_path).exists():
                        with open(val_path, 'r', encoding='utf-8') as f:
                            st.download_button("⬇️ Download Val JSONL", f.read(),
                                file_name=f"{goal}_val.jsonl", mime="application/jsonl")

            except Exception as e:
                st.session_state.pipeline_status['data'] = 'error'
                st.error(f"❌ Pipeline Error: {str(e)}")
                import traceback
                st.code(traceback.format_exc())

    # Show previously processed data
    if st.session_state.processed_data_path:
        st.markdown("---")
        st.markdown("### 📂 Last Processed Data")
        try:
            processed_path = Path(st.session_state.processed_data_path)
            if processed_path.exists():
                with open(processed_path, encoding='utf-8') as f:
                    samples = [json.loads(line) for line in f.readlines()[:5]]
                for i, sample in enumerate(samples):
                    with st.expander(f"Sample {i+1}"):
                        st.json(sample)
        except Exception as e:
            st.warning(f"Could not load preview: {e}")


# ============================================================================
# PAGE: TRAINING
# ============================================================================
def render_training():
    """Render the Model Training page.

    Flow, in order:

    1. Stop early (with a shortcut button to the processing page) when
       ``st.session_state.processed_data_path`` is ``None``.
    2. Probe for a CUDA GPU through ``torch``; any failure counts as "no GPU".
    3. Offer the processed training file, and its sibling validation file if
       one exists, for download and count the non-empty lines of the
       training file.
    4. With a GPU, show the training form and run
       ``agents.training_pilot.TrainingPilot``. Without one, let the user pick
       between the Colab notebook instructions and uploading external results.
    5. Always show the upload widgets that store either an uploaded results
       file or a model folder path in ``st.session_state.model_path``.
    """
    st.markdown('<p class="gradient-header">🚀 Model Training</p>', unsafe_allow_html=True)

    # Check prerequisites
    if st.session_state.processed_data_path is None:
        st.warning("⚠️ Please process your data first!")
        if st.button("🧹 Go to Processing"):
            st.session_state.current_page = 'process'
            st.rerun()
        return

    # ── GPU Detection ──
    # Broad catch is deliberate: a missing or broken torch install must simply
    # be treated as "no GPU" instead of breaking the page.
    has_gpu = False
    try:
        import torch
        has_gpu = torch.cuda.is_available()
        if has_gpu:
            gpu_name = torch.cuda.get_device_name(0)
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            st.success(f"✅ GPU Available: **{gpu_name}** ({gpu_memory:.1f} GB)")
    except Exception:
        has_gpu = False

    # ── Download Preprocessed Data (always available) ──
    st.markdown("### 📥 Preprocessed Training Data")
    # Defined up front so the GPU form below can rely on it even when the
    # processed file has disappeared from disk.
    sample_count = 0
    processed_path = Path(st.session_state.processed_data_path)
    if processed_path.exists():
        with open(processed_path, 'r', encoding='utf-8') as f:
            processed_content = f.read()
        dl1, dl2 = st.columns(2)
        with dl1:
            st.download_button("⬇️ Download Training JSONL", processed_content,
                file_name=processed_path.name, mime="application/jsonl")
        with dl2:
            # Derive the validation file name by swapping only the LAST
            # "_train" marker, so names that contain "_train" elsewhere are
            # untouched and a name without the marker never resolves to the
            # training file itself.
            name_head, marker, name_tail = processed_path.name.rpartition('_train')
            if marker:
                val_path = processed_path.with_name(f"{name_head}_val{name_tail}")
                if val_path.exists():
                    with open(val_path, 'r', encoding='utf-8') as f:
                        st.download_button("⬇️ Download Validation JSONL", f.read(),
                            file_name=val_path.name, mime="application/jsonl")
        # One JSONL record per non-empty line.
        sample_count = sum(1 for line in processed_content.split('\n') if line.strip())
        st.info(f"📊 Dataset: **{sample_count:,}** samples ready for training")
    else:
        st.warning("Processed data file not found.")

    st.markdown("---")

    # ====================================================================
    # TWO PATHS: GPU Training OR Colab Notebook
    # ====================================================================
    if has_gpu:
        training_mode = "gpu"
    else:
        training_mode = st.radio("🖥️ Select Training Mode", [
            "☁️ Use Google Colab (Recommended – Free GPU)",
            "📤 Upload Fine-Tuned Model (Already trained externally)"
        ], help="No GPU detected on this machine. Choose how to proceed.")

    # ====================================================================
    # PATH A: GPU Training (local)
    # ====================================================================
    if training_mode == "gpu":
        st.markdown("### ⚙️ Training Configuration")

        col1, col2 = st.columns(2)
        with col1:
            model_source = st.radio("Model Source", ["Preset Models", "Custom HuggingFace Model"])
            if model_source == "Preset Models":
                base_model = st.selectbox("Base Model", [
                    "unsloth/llama-3-8b-bnb-4bit",
                    "unsloth/llama-3-70b-bnb-4bit",
                    "unsloth/mistral-7b-bnb-4bit",
                    "unsloth/gemma-7b-bnb-4bit",
                ])
            else:
                base_model = st.text_input("HuggingFace Model ID",
                    value="unsloth/llama-3-8b-bnb-4bit",
                    help="Enter any HuggingFace model ID, e.g. 'meta-llama/Llama-3-8b', 'mistralai/Mistral-7B-v0.1'")
            max_seq_length = st.slider("Max Sequence Length", 512, 4096, 2048)

        with col2:
            # Fall back to a nominal 1000 samples when the count is unknown.
            dataset_size = sample_count if sample_count > 0 else 1000
            if dataset_size < 1000:
                auto_rank, auto_alpha, auto_lr, auto_epochs = 8, 16, 2e-4, 5
                size_category = "Small"
            elif dataset_size < 10000:
                auto_rank, auto_alpha, auto_lr, auto_epochs = 16, 32, 1e-4, 3
                size_category = "Medium"
            else:
                auto_rank, auto_alpha, auto_lr, auto_epochs = 32, 64, 5e-5, 2
                size_category = "Large"
            st.success(f"Auto-configured for **{size_category}** dataset ({dataset_size:,} samples)")

        st.markdown("---")

        # NOTE(review): the values collected in this expander (and the imported
        # HyperParams class) are never handed to TrainingPilot below, so moving
        # these sliders currently has no effect on training. Wire them through
        # once the TrainingPilot/HyperParams API is confirmed.
        with st.expander("🔧 Advanced Hyperparameters"):
            hc1, hc2, hc3 = st.columns(3)
            with hc1:
                lora_rank = st.slider("LoRA Rank", 4, 64, auto_rank)
                lora_alpha = st.slider("LoRA Alpha", 8, 128, auto_alpha)
            with hc2:
                learning_rate = st.select_slider("Learning Rate",
                    options=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4], value=auto_lr)
                num_epochs = st.slider("Epochs", 1, 10, auto_epochs)
            with hc3:
                batch_size = st.slider("Batch Size", 1, 16, 4)
                gradient_accumulation = st.slider("Gradient Accumulation", 1, 8, 4)

        st.markdown("---")

        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            if st.button("🚀 Start Training", type="primary", use_container_width=True):
                st.session_state.pipeline_status['training'] = 'running'
                with st.spinner("Training in progress..."):
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    try:
                        from agents.training_pilot import TrainingPilot, HyperParams
                        status_text.text("📦 Loading model...")
                        progress_bar.progress(10)
                        pilot = TrainingPilot(
                            base_model=base_model,
                            max_seq_length=max_seq_length,
                            output_dir="./output/models"
                        )
                        status_text.text("🚀 Training...")
                        progress_bar.progress(30)
                        result = pilot.run(
                            data_path=st.session_state.processed_data_path,
                            output_name=st.session_state.training_goal
                        )
                        progress_bar.progress(100)
                        status_text.text("✅ Training complete!")
                        st.session_state.model_path = result.model_path
                        st.session_state.pipeline_status['training'] = 'complete'
                        st.success(f"✅ Model saved to: `{result.model_path}`")
                        rc1, rc2, rc3 = st.columns(3)
                        with rc1:
                            st.metric("Final Loss", f"{result.final_loss:.4f}")
                        with rc2:
                            st.metric("Training Time", f"{result.training_time:.1f}s")
                        with rc3:
                            st.metric("Total Steps", result.num_steps)
                    except Exception as e:
                        # Page-level boundary: report the failure in the UI
                        # instead of crashing the Streamlit script.
                        st.session_state.pipeline_status['training'] = 'error'
                        st.error(f"❌ Training failed: {str(e)}")
                        import traceback
                        st.code(traceback.format_exc())

    # ====================================================================
    # PATH B: Google Colab Notebook
    # ====================================================================
    elif "Colab" in training_mode:
        st.markdown("### ☁️ Train on Google Colab (Free GPU)")
        st.markdown("""

        Since no GPU was detected on this machine, you can fine-tune your model on Google Colab with a free GPU.

        Follow these steps:

        """)

        st.markdown("""

        **Step 1:** Download your preprocessed training data (above) ⬆️



        **Step 2:** Download or copy the Colab notebook below



        **Step 3:** Open [Google Colab](https://colab.research.google.com/) → Upload the notebook



        **Step 4:** Upload your training JSONL to Colab's file browser



        **Step 5:** Run all cells → Download the fine-tuned model



        **Step 6:** Come back here → Upload your fine-tuned model results for evaluation

        """)

        # Show / Download Colab notebook
        notebook_path = Path("./Auto_FineTune_Ops_Colab.ipynb")
        if notebook_path.exists():
            with open(notebook_path, 'r', encoding='utf-8') as f:
                notebook_content = f.read()

            st.download_button("📓 Download Colab Notebook (.ipynb)", notebook_content,
                file_name="Auto_FineTune_Ops_Colab.ipynb", mime="application/json",
                type="primary", use_container_width=True)

            with st.expander("👁️ View Notebook Code", expanded=False):
                try:
                    nb = json.loads(notebook_content)
                    for cell in nb.get('cells', []):
                        if cell.get('cell_type') == 'code':
                            source = ''.join(cell.get('source', []))
                            if source.strip():
                                st.code(source, language='python')
                        elif cell.get('cell_type') == 'markdown':
                            source = ''.join(cell.get('source', []))
                            st.markdown(source)
                except Exception:
                    # Best-effort preview: if the notebook cannot be parsed or
                    # rendered cell by cell, show the start of the raw file.
                    st.code(notebook_content[:5000], language='json')
        else:
            st.warning("⚠️ Colab notebook not found at `Auto_FineTune_Ops_Colab.ipynb`")

        st.markdown("---")
        st.markdown("### 📤 After Training on Colab")
        st.info("Once you've finished training on Colab, download your fine-tuned model outputs and upload them below for evaluation.")

    # ====================================================================
    # PATH C: Upload Fine-Tuned Model / Results
    # ====================================================================
    else:
        st.markdown("### 📤 Upload Fine-Tuned Model Results")
        st.markdown("Upload outputs from your externally trained model for evaluation.")

    # ── Upload Fine-Tuned Results (always shown at bottom) ──
    st.markdown("---")
    st.markdown("### 📦 Upload Fine-Tuned Results for Evaluation")
    st.caption("If you trained on Colab or another machine, upload your model outputs here.")

    upload_tab1, upload_tab2 = st.tabs(["📊 Upload Evaluation Results (JSONL)", "📁 Upload Model Folder Path"])

    with upload_tab1:
        ft_file = st.file_uploader("Upload fine-tuned model outputs (JSONL with predictions)",
            type=['jsonl', 'json'], key="ft_results_upload",
            help="JSONL file with model predictions/outputs from your fine-tuned model")
        if ft_file:
            try:
                ft_df = pd.read_json(ft_file, lines=ft_file.name.endswith('.jsonl'))
                st.success(f"✅ Loaded **{len(ft_df):,}** evaluation samples")
                st.dataframe(ft_df.head(5), use_container_width=True)

                # Save for evaluation. Only the base name of the uploaded file
                # is used so a crafted name cannot point outside the output
                # directory. The file is always written as JSON Lines.
                eval_output = Path("./output/eval_results")
                eval_output.mkdir(parents=True, exist_ok=True)
                safe_name = Path(ft_file.name).name
                eval_path = eval_output / f"finetuned_outputs_{safe_name}"
                ft_df.to_json(eval_path, orient='records', lines=True)

                st.session_state.model_path = str(eval_path)
                st.session_state.pipeline_status['training'] = 'complete'
                st.success("✅ Results saved! You can now proceed to **Evaluation** page.")

                if st.button("⚖️ Go to Evaluation"):
                    st.session_state.current_page = 'evaluation'
                    st.rerun()
            except Exception as e:
                st.error(f"Error loading file: {e}")

    with upload_tab2:
        model_folder = st.text_input("Model Folder Path",
            placeholder="e.g., ./output/models/my_finetuned_model or /path/to/model",
            help="Local path to the fine-tuned model directory (LoRA adapter or full model)")
        if model_folder and st.button("✅ Set Model Path"):
            if Path(model_folder).exists():
                st.session_state.model_path = model_folder
                st.session_state.pipeline_status['training'] = 'complete'
                st.success(f"✅ Model path set to: `{model_folder}`")
            else:
                st.error(f"❌ Path not found: `{model_folder}`")


# ============================================================================
# PAGE: EVALUATION
# ============================================================================
def render_evaluation():
    """Render the Model Evaluation page (LLM-as-judge comparison).

    The page lets the user pick a judge provider and model, enter an API key
    (also exported to the matching environment variable), and load evaluation
    data either from the results saved on the training page or from an
    uploaded file with ``instruction``, ``base_output`` and
    ``finetuned_output`` columns.

    Pressing the run button sends one comparison prompt per row to the judge,
    parses each JSON verdict, and stores the list of verdicts in
    ``st.session_state.eval_results``. Whenever stored verdicts exist, the
    page shows summary metrics, two charts, a verdict table and a JSON
    download of the raw verdicts.
    """
    st.markdown('<p class="gradient-header">⚖️ Model Evaluation</p>', unsafe_allow_html=True)

    # Initialize session state for results if not present
    if 'eval_results' not in st.session_state:
        st.session_state.eval_results = None

    # ── Judge Provider Selection ──
    st.markdown("### 🤖 Select AI Judge Provider")
    st.caption("Choose which LLM provider to use as the evaluation judge.")

    judge_provider = st.selectbox("AI Provider", [
        "OpenAI (GPT-4o, GPT-4-turbo, etc.)",
        "Anthropic (Claude 3.5, Claude 3 Opus, etc.)",
        "Groq (Llama 3, Mixtral, Gemma, etc.)",
        "Custom OpenAI-Compatible Endpoint"
    ], help="Select the AI provider whose model will act as the judge.")

    # Match on the label prefix: a substring test for "OpenAI" would also
    # match "Custom OpenAI-Compatible Endpoint" and make the custom branch
    # unreachable.
    is_openai = judge_provider.startswith("OpenAI")
    is_anthropic = judge_provider.startswith("Anthropic")
    is_groq = judge_provider.startswith("Groq")
    is_custom = not (is_openai or is_anthropic or is_groq)

    st.markdown("---")
    st.markdown("### 🔑 API Configuration")

    api_key = None
    base_url = None

    if is_openai:
        col1, col2 = st.columns(2)
        with col1:
            api_key = st.text_input("OpenAI API Key", type="password", key="openai_key_input")
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
        with col2:
            judge_model = st.selectbox("Judge Model", ["gpt-4o", "gpt-4-turbo", "gpt-3.5-turbo"])

    elif is_anthropic:
        col1, col2 = st.columns(2)
        with col1:
            api_key = st.text_input("Anthropic API Key", type="password", key="anthropic_key_input")
            if api_key:
                os.environ["ANTHROPIC_API_KEY"] = api_key
        with col2:
            judge_model = st.selectbox("Judge Model", ["claude-3-5-sonnet-20241022", "claude-3-opus-20240229", "claude-3-sonnet-20240229"])

    elif is_groq:
        col1, col2 = st.columns(2)
        with col1:
            api_key = st.text_input("Groq API Key", type="password", key="groq_key_input")
            if api_key:
                os.environ["GROQ_API_KEY"] = api_key
        with col2:
            judge_model = st.selectbox("Judge Model", ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"])
        base_url = "https://api.groq.com/openai/v1"

    else:  # Custom
        col1, col2 = st.columns(2)
        with col1:
            base_url = st.text_input("API Base URL", placeholder="https://api.your-provider.com/v1")
            api_key = st.text_input("API Key", type="password", key="custom_key_input")
            if api_key:
                os.environ["OPENAI_API_KEY"] = api_key
        with col2:
            judge_model = st.text_input("Model Name", placeholder="e.g., my-model")

    st.markdown("---")

    # ── Evaluation Data ──
    st.markdown("### 📊 Evaluation Data")

    # 1. Use data from training (if available)
    if st.session_state.model_path and "finetuned_outputs" in str(st.session_state.model_path):
        st.info(f"Using results from training: `{st.session_state.model_path}`")
        try:
            st.session_state['eval_data'] = pd.read_json(st.session_state.model_path, lines=True)
        except Exception as e:
            # Best-effort load: keep the page usable, but tell the user why
            # the training results were not picked up.
            st.warning(f"Could not load results from training: {e}")

    # 2. Upload new data
    eval_upload = st.file_uploader("Upload JSONL (Must contain: 'instruction', 'base_output', 'finetuned_output')",
        type=['jsonl', 'json'], key="eval_uploader")

    if eval_upload:
        try:
            df = pd.read_json(eval_upload, lines=eval_upload.name.endswith('.jsonl'))
            required_cols = ['instruction', 'base_output', 'finetuned_output']
            if all(col in df.columns for col in required_cols):
                st.session_state['eval_data'] = df
                st.success(f"✅ Loaded {len(df)} samples")
            else:
                st.error(f"❌ Missing columns! Found: {list(df.columns)}. Required: {required_cols}")
        except Exception as e:
            st.error(f"Error loading file: {e}")

    # Show Preview
    if st.session_state.get('eval_data') is not None:
        with st.expander("👁️ View Data Preview"):
            st.dataframe(st.session_state['eval_data'].head(3), use_container_width=True)

    st.markdown("---")

    # ── Run Evaluation ──
    if st.button("🚀 Run Dynamic Evaluation", type="primary", use_container_width=True):
        if not api_key:
            st.error("❌ Please provide an API Key above!")
            return

        if not judge_model:
            st.error("❌ Please provide a judge model name above!")
            return

        if is_custom and not base_url:
            # Without a base URL the client would fall back to its default
            # endpoint and send the custom key there.
            st.error("❌ Please provide the API Base URL for the custom endpoint!")
            return

        if st.session_state.get('eval_data') is None:
            st.error("❌ No evaluation data loaded!")
            return

        # Prepare Judge
        st.session_state.pipeline_status['evaluation'] = 'running'
        progress_bar = st.progress(0)
        status_text = st.empty()

        results = []
        df = st.session_state['eval_data']
        total = len(df)

        try:
            # Initialize Client
            client = None
            if is_anthropic:
                from anthropic import Anthropic
                client = Anthropic(api_key=api_key)
            else:
                from openai import OpenAI
                client = OpenAI(api_key=api_key, base_url=base_url)

            JUDGE_PROMPT = """You are an expert evaluator comparing two AI responses.

            

Query: {prompt}



Response A (Base Model):

{response_a}



Response B (Fine-tuned Model):

{response_b}



Compare them on: Accuracy, Helpfulness, Clarity.

Return a valid JSON object ONLY:

{{

    "winner": "A" or "B" or "TIE",

    "score_a": <1-10>,

    "score_b": <1-10>,

    "reasoning": "short explanation",

    "accuracy": {{"A": <1-10>, "B": <1-10>}},

    "helpfulness": {{"A": <1-10>, "B": <1-10>}},

    "clarity": {{"A": <1-10>, "B": <1-10>}}

}}

"""

            # Count rows by position instead of trusting the DataFrame index
            # labels, which need not be 0..n-1.
            for position, (_, row) in enumerate(df.iterrows(), start=1):
                status_text.text(f"Evaluating sample {position}/{total}...")

                prompt_text = JUDGE_PROMPT.format(
                    prompt=row['instruction'],
                    response_a=row['base_output'],
                    response_b=row['finetuned_output']
                )

                # Call API
                if is_anthropic:
                    resp = client.messages.create(
                        model=judge_model, max_tokens=1000,
                        messages=[{"role": "user", "content": prompt_text}]
                    ).content[0].text
                else:
                    resp = client.chat.completions.create(
                        model=judge_model, max_tokens=1000,
                        messages=[{"role": "user", "content": prompt_text}],
                        response_format={"type": "json_object"}
                    ).choices[0].message.content

                # Parse. The module-level ``json`` import is used on purpose:
                # importing it here would make ``json`` local to this function
                # and break the download button on reruns that skip this loop.
                try:
                    # Clean json string if needed (strip markdown code fences)
                    if "```json" in resp:
                        resp = resp.split("```json")[1].split("```")[0]
                    if "```" in resp:
                        resp = resp.split("```")[1]

                    data = json.loads(resp.strip())
                    if not isinstance(data, dict):
                        raise ValueError("judge response is not a JSON object")
                    data['instruction'] = row['instruction']
                    results.append(data)
                except Exception as e:
                    print(f"Parse error: {e}")
                    # Neutral placeholder verdict; it carries the instruction
                    # so the results table can still show which sample failed.
                    results.append({
                        "winner": "TIE", "score_a": 5, "score_b": 5,
                        "reasoning": "Error parsing judge response",
                        "instruction": row['instruction'],
                    })

                progress_bar.progress(position / total)

            st.session_state.eval_results = results
            st.session_state.pipeline_status['evaluation'] = 'complete'
            status_text.text("✅ Evaluation Complete!")

        except Exception as e:
            st.error(f"Evaluation Failed: {str(e)}")
            st.session_state.pipeline_status['evaluation'] = 'error'

    # ── Display Results ──
    if st.session_state.get('eval_results'):
        results = st.session_state.eval_results
        df_res = pd.DataFrame(results)

        # The judge's JSON is not guaranteed to contain every field; make sure
        # the columns read below exist so a sparse verdict cannot raise.
        for column in ('instruction', 'winner', 'score_a', 'score_b', 'reasoning'):
            if column not in df_res.columns:
                df_res[column] = None

        # Scores may come back as strings; non-numeric values become NaN.
        score_a = pd.to_numeric(df_res['score_a'], errors='coerce')
        score_b = pd.to_numeric(df_res['score_b'], errors='coerce')

        # Metrics
        wins_b = len(df_res[df_res['winner'] == 'B'])
        wins_a = len(df_res[df_res['winner'] == 'A'])
        ties = len(df_res[df_res['winner'] == 'TIE'])
        win_rate = (wins_b / len(df_res)) * 100

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Fine-tuned Win Rate", f"{win_rate:.1f}%")
        col2.metric("Fine-Tuned Wins", wins_b)
        col3.metric("Base Model Wins", wins_a)
        col4.metric("Avg Score Improvement", f"{score_b.mean() - score_a.mean():.2f}")

        # Charts
        c1, c2 = st.columns(2)
        with c1:
            fig = px.pie(values=[wins_b, wins_a, ties], names=['Fine-tuned', 'Base', 'Ties'],
                         title="Win Distribution", color_discrete_sequence=['#6366f1', '#ef4444', '#94a3b8'])
            st.plotly_chart(fig, use_container_width=True)

        with c2:
            avg_scores = pd.DataFrame({
                'Model': ['Base', 'Fine-tuned'],
                'Score': [score_a.mean(), score_b.mean()]
            })
            fig2 = px.bar(avg_scores, x='Model', y='Score', color='Model',
                          title="Average Overall Score", color_discrete_map={'Base': '#ef4444', 'Fine-tuned': '#6366f1'})
            st.plotly_chart(fig2, use_container_width=True)

        # Detailed Table
        st.markdown("### 📝 Detailed Verdicts")
        st.dataframe(df_res[['instruction', 'winner', 'score_a', 'score_b', 'reasoning']], use_container_width=True)

        # Download
        st.download_button("⬇️ Download Report (JSON)",
                           data=json.dumps(results, indent=2),
                           file_name="evaluation_report.json",
                           mime="application/json")


# ============================================================================
# PAGE: DEPLOYMENT
# ============================================================================
def render_deploy():
    """Render the "Model Deployment" page.

    The page is built from three parts:

    1. A selector listing the sub-directories of ``./output/models``.
    2. Two side-by-side option cards: one shows the command that starts the
       local FastAPI server, the other collects HuggingFace Hub details.
    3. Static API documentation with an example request.

    Nothing is launched or uploaded by this function; the server card only
    displays a command for the user to run in a terminal.
    """
    # Local import: shell quoting is only needed by this page.
    import shlex

    st.markdown('<p class="gradient-header">🌐 Model Deployment</p>', unsafe_allow_html=True)

    # Model selection
    st.markdown("### 📦 Select Model")

    models_dir = Path("./output/models")
    selected_model = None
    # is_dir() rather than exists(): a plain file at this path cannot be
    # iterated and should be reported like a missing directory.
    if models_dir.is_dir():
        # iterdir() yields entries in arbitrary order; sort so the selectbox
        # options keep a stable order across reruns.
        models = sorted(d.name for d in models_dir.iterdir() if d.is_dir())
        if models:
            selected_model = st.selectbox("Trained Models", models)
            model_path = models_dir / selected_model
            st.info(f"📂 Model path: `{model_path}`")
        else:
            st.warning("No trained models found.")
    else:
        st.warning("Models directory not found.")

    st.markdown("---")

    # Deployment options
    st.markdown("### 🚀 Deployment Options")

    col1, col2 = st.columns(2)

    with col1:
        # The HTML must not contain blank lines: a blank line ends a raw HTML
        # block in Markdown and the indented children would render as code.
        st.markdown("""
        <div class="info-card">
            <h4>🖥️ Local FastAPI Server</h4>
            <p>Deploy as a REST API on your local machine.</p>
        </div>
        """, unsafe_allow_html=True)

        port = st.number_input("Port", value=8000, min_value=1000, max_value=65535)

        if st.button("🚀 Start Server", disabled=not selected_model):
            # Quote the path so directory names containing spaces or shell
            # metacharacters still produce a command that is safe to paste.
            # Names made of ordinary characters are left untouched.
            quoted_model = shlex.quote(f"./output/models/{selected_model}")
            st.code(f"python scripts/deploy.py --model {quoted_model} --port {port}")
            st.info("Run the command above in your terminal to start the server.")

    with col2:
        st.markdown("""
        <div class="info-card">
            <h4>☁️ HuggingFace Hub</h4>
            <p>Push your model to HuggingFace for sharing.</p>
        </div>
        """, unsafe_allow_html=True)

        hf_token = st.text_input("HuggingFace Token", type="password")
        repo_name = st.text_input("Repository Name", value=f"my-finetuned-{selected_model}" if selected_model else "")

        # NOTE(review): the button only shows a status message; no upload is
        # performed in this function and repo_name is not consumed yet.
        if st.button("☁️ Push to Hub", disabled=not selected_model or not hf_token):
            st.info("Pushing to HuggingFace Hub...")

    st.markdown("---")

    # API documentation
    st.markdown("### 📚 API Documentation")

    # Table rows must be contiguous: a blank line between the header and the
    # delimiter row prevents Markdown from recognising the table.
    st.markdown("""
    Once deployed, your API will have these endpoints:

    | Endpoint | Method | Description |
    |----------|--------|-------------|
    | `/` | GET | API info |
    | `/health` | GET | Health check |
    | `/generate` | POST | Generate text |
    | `/generate/batch` | POST | Batch generation |
    """)

    with st.expander("📝 Example Request"):
        st.code("""

import requests



response = requests.post("http://localhost:8000/generate", json={

    "prompt": "What are the symptoms of the common cold?",

    "max_tokens": 256,

    "temperature": 0.7

})

print(response.json()["generated_text"])

        """, language="python")


# ============================================================================
# MAIN ROUTER
# ============================================================================
def main():
    """Dispatch to the renderer for the page stored in session state.

    Reads ``st.session_state.current_page`` and calls the matching
    ``render_*`` function. Any value without a dedicated renderer falls back
    to the home page.
    """
    current = st.session_state.current_page

    # Page key -> renderer; unknown keys are handled by the .get() default.
    routes = {
        'home': render_home,
        'data': render_data_upload,
        'process': render_processing,
        'training': render_training,
        'evaluation': render_evaluation,
        'deploy': render_deploy,
    }

    render_page = routes.get(current, render_home)
    render_page()


# Script entry point: run the page router when this file is executed directly.
if __name__ == "__main__":
    main()