"""ModelAtlas Community Dashboard.

Streamlit app that surfaces community-contributed model-architecture
analyses stored in the ``RadicalNotionAI/community-analyses`` dataset:
overview stats, a filterable model browser, technical charts, and
contribution / access-control docs.
"""

import streamlit as st
import pandas as pd
from datasets import load_dataset
from datetime import datetime
import json
import plotly.express as px
import plotly.graph_objects as go
from collections import defaultdict, Counter

# Set page config
st.set_page_config(
    page_title="πŸ—ΊοΈ ModelAtlas Community Dashboard",
    page_icon="πŸ—ΊοΈ",
    layout="wide",
    initial_sidebar_state="collapsed"
)


# Load community data
@st.cache_data(ttl=300)  # Cache for 5 minutes
def load_community_data():
    """Fetch the community analyses dataset as a DataFrame.

    Returns:
        pd.DataFrame: the ``train`` split of the community dataset, or an
        empty DataFrame on any load/conversion failure so the UI degrades
        gracefully instead of crashing.
    """
    try:
        dataset = load_dataset("RadicalNotionAI/community-analyses", split="train")
        df = dataset.to_pandas()
        # Ensure we have a proper DataFrame (to_pandas can return other types
        # for some dataset layouts).
        if not isinstance(df, pd.DataFrame):
            return pd.DataFrame()
        return df
    except Exception as e:
        # Broad on purpose: network/auth/schema errors should all surface as
        # a visible message, not a stack trace.
        st.error(f"Error loading dataset: {e}")
        return pd.DataFrame()


# Main header
st.title("πŸ—ΊοΈ ModelAtlas Community Dashboard")
st.subheader("Collaborative Intelligence for AI Model Architecture Analysis")

# Load data
df = load_community_data()

# Status badge based on actual data
if not df.empty:
    st.success(f"βœ… Live with {len(df)} Community Models!")
else:
    st.info("⏳ Waiting for First Contributions")

# Create tabs
tab1, tab2, tab3, tab4 = st.tabs(
    ["πŸ“Š Overview", "πŸ—‚οΈ Models", "πŸ”¬ Technical Details", "πŸ” Access & Contributing"]
)

with tab1:
    st.header("πŸ“Š Community Overview")

    if not df.empty:
        # Real statistics from the community data; every column access is
        # guarded because the dataset schema is community-contributed.
        total_models = len(df)
        organizations = df['organization'].nunique() if 'organization' in df.columns else 0
        model_types = len(df['model_type'].unique()) if 'model_type' in df.columns else 0
        latest_analysis = df['analyzed_at'].max() if 'analyzed_at' in df.columns else None

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("πŸ—ΊοΈ Models Analyzed", total_models)
        with col2:
            st.metric("🏒 Organizations", organizations)
        with col3:
            st.metric("πŸ”§ Model Types", model_types)
        with col4:
            if latest_analysis:
                # analyzed_at is assumed ISO-8601, so the first 10 chars are
                # the date portion (YYYY-MM-DD).
                latest_date = (
                    latest_analysis[:10]
                    if isinstance(latest_analysis, str)
                    else str(latest_analysis)[:10]
                )
                st.metric("πŸ“… Latest Analysis", latest_date)
            else:
                st.metric("πŸ“… Latest Analysis", "Unknown")

        st.subheader("πŸ—„οΈ Community Dataset")
        st.write("**Location:** [RadicalNotionAI/community-analyses](https://huggingface.co/datasets/RadicalNotionAI/community-analyses)")
        st.write(f"**Status:** Live with {total_models} community contributions!")

        # Top organizations and model types
        col1, col2 = st.columns(2)
        with col1:
            if 'organization' in df.columns:
                st.subheader("🏒 Top Organizations")
                org_counts = df['organization'].value_counts().head(5)
                for org, count in org_counts.items():
                    st.write(f"β€’ **{org}**: {count} models")
        with col2:
            if 'model_type' in df.columns:
                st.subheader("πŸ”§ Popular Model Types")
                type_counts = df['model_type'].value_counts().head(5)
                for model_type, count in type_counts.items():
                    st.write(f"β€’ **{model_type}**: {count} models")

        # Recent models
        st.subheader("πŸ†• Recent Contributions")
        if 'analyzed_at' in df.columns:
            # Sort by analyzed_at as string (works for ISO format dates)
            recent_df = df.sort_values('analyzed_at', ascending=False).head(5)[
                ['model_id', 'organization', 'analyzed_at']
            ]
        else:
            recent_df = df.head(5)[['model_id', 'organization']]

        for _, row in recent_df.iterrows():
            # `in` on a Series checks index labels, so this is a safe
            # presence test for the optional analyzed_at field.
            analysis_date = f" ({row['analyzed_at'][:10]})" if 'analyzed_at' in row else ""
            st.write(f"β€’ `{row['model_id']}` - {row['organization']}{analysis_date}")
    else:
        st.write("**Status:** The ModelAtlas community platform is live and ready for contributions!")

        col1, col2, col3 = st.columns(3)
        with col1:
            st.subheader("πŸ—„οΈ Community Dataset")
            st.write("Central repository for model architecture analyses")
            st.write("**Location:** [RadicalNotionAI/community-analyses](https://huggingface.co/datasets/RadicalNotionAI/community-analyses)")
            st.write("**Status:** Live and accepting contributions")
        with col2:
            st.subheader("πŸ“ˆ Features Available")
            st.write("Ready for your contributions:")
            st.write("β€’ Community model browser")
            st.write("β€’ Innovation timeline analytics")
            st.write("β€’ Cross-organizational insights")
            st.write("β€’ Technique adoption tracking")
        with col3:
            st.subheader("πŸš€ Getting Started")
            st.write("Ready to contribute? Follow these steps:")
            st.write("1. Install ModelAtlas CLI")
            st.write("2. Setup: `python atlas.py contribute --setup`")
            st.write("3. Analyze: `python model_test.py model/name`")
            st.write("4. Submit: `python atlas.py contribute --submit`")

with tab2:
    st.header("πŸ—‚οΈ Community Model Browser")

    if not df.empty:
        st.success(f"**Community Models:** {len(df)} models available from the community!")

        # Filter options
        col1, col2 = st.columns(2)
        selected_org = 'All'
        selected_type = 'All'
        with col1:
            if 'organization' in df.columns:
                orgs = ['All'] + sorted(df['organization'].unique().tolist())
                selected_org = st.selectbox("Filter by Organization", orgs)
        with col2:
            if 'model_type' in df.columns:
                types = ['All'] + sorted(df['model_type'].unique().tolist())
                selected_type = st.selectbox("Filter by Model Type", types)

        # Apply filters
        filtered_df = df.copy()
        if 'organization' in df.columns and selected_org != 'All':
            filtered_df = filtered_df[filtered_df['organization'] == selected_org]
        if 'model_type' in df.columns and selected_type != 'All':
            filtered_df = filtered_df[filtered_df['model_type'] == selected_type]

        st.write(f"**Showing {len(filtered_df)} of {len(df)} models**")

        # Display models
        display_columns = ['model_id', 'organization', 'model_type', 'analyzed_at']
        available_columns = [col for col in display_columns if col in filtered_df.columns]

        if available_columns:
            # Sort by date (string format works for ISO dates) or model_id
            sort_column = 'analyzed_at' if 'analyzed_at' in filtered_df.columns else 'model_id'
            display_df = filtered_df[available_columns].sort_values(
                sort_column, ascending=False
            ).head(50)  # Limit to 50 most recent
            st.dataframe(display_df, use_container_width=True)
        else:
            st.warning("Model data structure is not as expected. Please check dataset format.")
    else:
        st.info("**Current Status:** Waiting for first contributions to populate the dataset.")

        st.subheader("What You'll See Here")
        st.write("β€’ βœ… Community-contributed model analyses")
        st.write("β€’ βœ… Architectural comparisons and insights")
        st.write("β€’ βœ… Technique evolution tracking")
        st.write("β€’ βœ… Cross-organizational innovation patterns")

        st.subheader("Example Models to Analyze")
        examples = [
            ("Qwen/Qwen3-8B", "Advanced architecture with RoPE scaling"),
            ("deepseek-ai/DeepSeek-V3", "Large-scale MoE architecture"),
            ("THUDM/glm-4-9b", "GLM architecture innovations"),
            ("meta-llama/Llama-3.1-8B", "Llama 3.1 improvements")
        ]
        for model, description in examples:
            st.code(model)
            st.write(f"*{description}*")

        st.write("**Start contributing to see your analyses here!**")


# Helper functions for technical analysis
def parse_json_field(field_value):
    """Safely parse a JSON-encoded field from the dataset.

    Args:
        field_value: a JSON string, an already-parsed object, or a
            falsy placeholder.

    Returns:
        The parsed/passed-through object, or ``{}`` when the value is
        falsy or cannot be parsed.
    """
    if isinstance(field_value, str):
        try:
            return json.loads(field_value)
        # Narrowed from a bare `except:` so real errors (e.g. interrupts)
        # are not swallowed.
        except (json.JSONDecodeError, TypeError):
            return {}
    return field_value if field_value else {}


def extract_architecture_metrics(df):
    """Extract per-model architecture metrics from the raw dataset.

    Args:
        df: community dataset with at least a ``model_id`` column and
            optional JSON-encoded ``config`` / ``techniques`` columns.

    Returns:
        pd.DataFrame: one row per model with numeric architecture fields
        defaulted to 0 and categorical fields defaulted to 'Unknown'.
    """
    metrics = []
    for _, row in df.iterrows():
        config = parse_json_field(row.get('config', '{}'))
        techniques = parse_json_field(row.get('techniques', '{}'))

        # `or 0` / `or 'Unknown'` collapse explicit JSON nulls to defaults.
        metric = {
            'model_id': row['model_id'],
            'organization': row.get('organization', 'Unknown'),
            'model_type': row.get('model_type', 'Unknown'),
            'hidden_size': config.get('hidden_size', 0) or 0,
            'num_layers': config.get('num_hidden_layers', config.get('num_layers', 0)) or 0,
            'max_position': config.get('max_position_embeddings', 0) or 0,
            'vocab_size': config.get('vocab_size', 0) or 0,
            'intermediate_size': config.get('intermediate_size', 0) or 0,
            'rope_type': techniques.get('rope_type') or techniques.get('positional_encoding') or 'Unknown',
            'attention_type': techniques.get('attention_implementation', 'Unknown') or 'Unknown',
            'sliding_window': techniques.get('sliding_window_size', 0) or 0
        }
        metrics.append(metric)
    return pd.DataFrame(metrics)


with tab3:
    st.header("πŸ”¬ Technical Architecture Analysis")

    if not df.empty:
        # Extract architecture data
        arch_df = extract_architecture_metrics(df)

        # Filter out rows with missing critical data
        valid_arch_df = arch_df[(arch_df['hidden_size'] > 0) & (arch_df['num_layers'] > 0)]

        if not valid_arch_df.empty:
            st.subheader("πŸ—οΈ Architecture Parameter Distribution")

            col1, col2 = st.columns(2)
            with col1:
                # Model size scatter plot
                fig_size = px.scatter(
                    valid_arch_df,
                    x='hidden_size',
                    y='num_layers',
                    color='organization',
                    size='max_position',
                    hover_data=['model_id', 'vocab_size'],
                    title="Model Architecture: Hidden Size vs Layers",
                    labels={'hidden_size': 'Hidden Size', 'num_layers': 'Number of Layers'}
                )
                fig_size.update_layout(height=400)
                st.plotly_chart(fig_size, use_container_width=True)
            with col2:
                # Context length distribution
                context_data = valid_arch_df[valid_arch_df['max_position'] > 0]
                if not context_data.empty:
                    fig_context = px.histogram(
                        context_data,
                        x='max_position',
                        color='organization',
                        title="Context Length Distribution",
                        labels={'max_position': 'Max Position Embeddings', 'count': 'Number of Models'}
                    )
                    fig_context.update_layout(height=400)
                    st.plotly_chart(fig_context, use_container_width=True)

            st.subheader("⚑ Technique Adoption Analysis")

            col1, col2 = st.columns(2)
            with col1:
                # RoPE type distribution (pie is pointless with one slice)
                rope_counts = valid_arch_df['rope_type'].value_counts()
                if len(rope_counts) > 1:
                    fig_rope = px.pie(
                        values=rope_counts.values,
                        names=rope_counts.index,
                        title="Positional Encoding Types"
                    )
                    fig_rope.update_layout(height=300)
                    st.plotly_chart(fig_rope, use_container_width=True)
            with col2:
                # Attention implementation
                attention_counts = valid_arch_df[
                    valid_arch_df['attention_type'] != 'Unknown'
                ]['attention_type'].value_counts()
                if len(attention_counts) > 0:
                    fig_attention = px.bar(
                        x=attention_counts.index,
                        y=attention_counts.values,
                        title="Attention Implementation Types",
                        labels={'x': 'Attention Type', 'y': 'Model Count'}
                    )
                    fig_attention.update_layout(height=300)
                    st.plotly_chart(fig_attention, use_container_width=True)

            st.subheader("πŸ“Š Organization Innovation Patterns")

            # Organization vs technique matrix
            org_techniques = []
            for _, row in df.iterrows():
                techniques = parse_json_field(row.get('techniques', '{}'))
                org = row.get('organization', 'Unknown')

                # Extract key techniques (with None safety)
                rope_type = techniques.get('rope_type') or techniques.get('positional_encoding') or 'standard'
                sliding_window_size = techniques.get('sliding_window_size', 0)
                has_sliding_window = sliding_window_size is not None and sliding_window_size > 0
                attention_impl = techniques.get('attention_implementation') or 'standard'

                # Safe string operations
                rope_type_str = str(rope_type).lower() if rope_type else 'standard'
                attention_impl_str = str(attention_impl).lower() if attention_impl else 'standard'

                org_techniques.append({
                    'Organization': org,
                    'RoPE_Advanced': 'yes' if 'yarn' in rope_type_str or 'scaled' in rope_type_str else 'no',
                    'Sliding_Window': 'yes' if has_sliding_window else 'no',
                    'Flash_Attention': 'yes' if 'flash' in attention_impl_str else 'no'
                })

            org_tech_df = pd.DataFrame(org_techniques)

            # Create technique adoption heatmap data
            if not org_tech_df.empty:
                heatmap_data = org_tech_df.groupby('Organization').agg({
                    'RoPE_Advanced': lambda x: (x == 'yes').sum(),
                    'Sliding_Window': lambda x: (x == 'yes').sum(),
                    'Flash_Attention': lambda x: (x == 'yes').sum()
                }).reset_index()

                if len(heatmap_data) > 1:
                    fig_heatmap = px.imshow(
                        heatmap_data.set_index('Organization').T,
                        title="Advanced Technique Adoption by Organization",
                        labels={'x': 'Organization', 'y': 'Technique', 'color': 'Models Using Technique'},
                        aspect='auto'
                    )
                    fig_heatmap.update_layout(height=300)
                    st.plotly_chart(fig_heatmap, use_container_width=True)

            st.subheader("πŸ” Model Architecture Comparison")

            # Model selection for comparison
            model_options = valid_arch_df['model_id'].tolist()
            if len(model_options) >= 2:
                selected_models = st.multiselect(
                    "Select models to compare (max 4):",
                    model_options,
                    default=model_options[:2],
                    max_selections=4
                )

                if selected_models:
                    # .copy() so the est_params_b assignment below writes to
                    # an owned frame, not a view (SettingWithCopyWarning).
                    comparison_df = valid_arch_df[
                        valid_arch_df['model_id'].isin(selected_models)
                    ].copy()

                    # Create comparison table
                    comparison_cols = ['model_id', 'organization', 'hidden_size', 'num_layers',
                                       'max_position', 'vocab_size', 'rope_type', 'attention_type']
                    display_comparison = comparison_df[comparison_cols]
                    st.dataframe(display_comparison, use_container_width=True)

                    # Parameter efficiency chart
                    if len(comparison_df) > 1:
                        # Calculate rough parameter estimate
                        comparison_df['est_params_b'] = (
                            comparison_df['hidden_size'] * comparison_df['num_layers'] *
                            comparison_df['vocab_size'] / 1e9
                        ).round(2)

                        fig_efficiency = px.bar(
                            comparison_df,
                            x='model_id',
                            y='est_params_b',
                            title="Estimated Model Size Comparison (Billions of Parameters)",
                            labels={'est_params_b': 'Estimated Parameters (B)'}
                        )
                        fig_efficiency.update_layout(height=300)
                        st.plotly_chart(fig_efficiency, use_container_width=True)
        else:
            st.warning("Insufficient architecture data for analysis. Models need valid config information.")
    else:
        st.info("**Technical analysis will appear when community data is available!**")
        st.markdown("""
### πŸ”¬ What You'll See Here:

**πŸ—οΈ Architecture Analysis**
- Parameter distribution patterns across organizations
- Model scaling relationships (size vs capabilities)
- Context length and vocabulary trends

**⚑ Innovation Tracking**
- Technique adoption timelines (RoPE, Flash Attention, etc.)
- Cross-organizational innovation patterns
- Emerging architecture components

**🧬 Model Lineage**
- Base model relationships and fine-tuning chains
- Architecture family evolution
- Research paper connections

**βš–οΈ Comparative Analysis**
- Side-by-side technical specifications
- Parameter efficiency patterns
- Architecture similarity clustering
""")

with tab4:
    st.header("πŸ” Access Control & Contributing")

    st.write("ModelAtlas implements **responsible tiered access** for ablation research:")

    # Public Access
    with st.expander("🌍 PUBLIC Access", expanded=True):
        st.write("β€’ βœ… View model architectures and configurations")
        st.write("β€’ βœ… Compare techniques across models")
        st.write("β€’ βœ… Analyze innovation timelines")
        st.write("β€’ ❌ No ablation/intervention access")

    # Contributor Access
    with st.expander("πŸ“Š CONTRIBUTOR Access"):
        st.info("**Requirements:** 3+ contributions, 0.8+ quality score, 7+ days active")
        st.write("β€’ βœ… All public features")
        st.write("β€’ βœ… Basic intervention mapping")
        st.write("β€’ βœ… Ablation compatibility analysis")
        st.write("β€’ βœ… Cross-model intervention insights")

    # Heretic Access
    with st.expander("πŸ”₯ HERETIC Access"):
        st.error("**Requirements:** 10+ contributions, 0.9+ quality score, manual approval + community vouching")
        st.write("β€’ βœ… All contributor features")
        st.write("β€’ βœ… Advanced ablation strategies")
        st.write("β€’ βœ… Cross-model transfer analysis")
        st.write("β€’ βœ… Strategic research methodologies")
        st.write("β€’ βœ… Heretic community research notes")

    st.subheader("πŸš€ CLI Commands")
    commands = """
# Setup community access
python atlas.py contribute --setup

# Check your access level
python atlas.py contribute --status

# Submit analyses
python atlas.py contribute --submit

# Request access upgrades
python atlas.py contribute --request-access contributor
python atlas.py contribute --request-access heretic

# Test access control (requires contributor+)
python atlas.py interventions Qwen/Qwen3-8B
"""
    st.code(commands, language="bash")

    st.subheader("πŸ›‘οΈ Why Access Control?")
    st.write("β€’ **Protects Innovation:** Sensitive ablation research within trusted community")
    st.write("β€’ **Rewards Quality:** Contributors earn access through meaningful work")
    st.write("β€’ **Builds Trust:** Community vouching creates research networks")
    st.write("β€’ **Enables Progress:** Heretic community advances boundaries responsibly")

# Footer
st.markdown("---")
# NOTE: dataset link corrected to community-analyses to match the dataset
# used everywhere else in this app.
st.markdown("""
**Community Links:**
[πŸ“Š Dataset](https://huggingface.co/datasets/RadicalNotionAI/community-analyses) |
[πŸš€ Dashboard](https://huggingface.co/spaces/RadicalNotionAI/modelatlas-dashboard) |
[πŸ’» CLI Tool](https://github.com/your-org/ModelAtlas)

*Built with ModelAtlas - Architectural Intelligence for AI Research*
""")