import streamlit as st
import pandas as pd
from huggingface_hub import HfApi
import requests
import re
import concurrent.futures

# Page-level settings: browser tab title, icon and a full-width layout.
st.set_page_config(
    page_title="Heretic Models Explorer",
    page_icon="🔥",
    layout="wide",
)

# Page heading followed by a short introduction rendered as Markdown.
st.title("🔥 Heretic Models Explorer")
intro_text = (
    "This space lists all models on Hugging Face tagged with "
    "[`heretic`](https://huggingface.co/models?other=heretic). "
    "It automatically fetches their model cards to extract **KL Divergence** and **Refusals**, "
    "allowing you to sort and compare them easily. Click on any column header to sort!"
)
st.markdown(intro_text)

# Captures the first number that follows "KL divergence" on the same line,
# e.g. "KL divergence | 0.0033". The number must contain at least one digit
# (so runs of dots or a sentence-ending period are never captured) and may use
# scientific notation such as "1.2e-05". The trailing lookahead rejects
# version-like strings ("1.2.3") instead of silently reading a prefix of them.
_KL_DIVERGENCE_RE = re.compile(
    r"(?i)KL\s*divergence[^a-zA-Z\d\n]*?"
    r"((?:\d+(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?)(?!\.?\d)"
)

# Matches fractions such as "15/100" or "15 / 100".
_FRACTION_RE = re.compile(r"(\d+)\s*/\s*(\d+)")


def _parse_kl_divergence(readme_text):
    """Return the KL divergence found in *readme_text* as a float, or None."""
    match = _KL_DIVERGENCE_RE.search(readme_text)
    return float(match.group(1)) if match else None


def _parse_refusals(readme_text):
    """Extract refusal counts from the lines of *readme_text* mentioning "refusals".

    Returns a tuple ``(refusals_str, refusal_rate, initial_refusals_str)``:

    * ``refusals_str`` - first fraction ("N/M") on the first "refusals" line
      that does not mention "initial".
    * ``refusal_rate`` - that fraction as a percentage, or None when it cannot
      be computed (for example a zero denominator).
    * ``initial_refusals_str`` - first fraction on a line mentioning "initial",
      or, failing that, the second fraction on the line that supplied
      ``refusals_str``.

    Any element is None when no matching text was found.
    """
    refusals_str = None
    refusal_rate = None
    initial_refusals_str = None

    for line in readme_text.split("\n"):
        line_lower = line.lower()
        if "refusals" not in line_lower:
            continue

        fractions = _FRACTION_RE.findall(line)
        if not fractions:
            continue

        if "initial" in line_lower:
            if initial_refusals_str is None:
                initial_refusals_str = "/".join(fractions[0])
        elif refusals_str is None:
            numerator, denominator = fractions[0]
            refusals_str = f"{numerator}/{denominator}"
            try:
                refusal_rate = (int(numerator) / int(denominator)) * 100
            except (ArithmeticError, ValueError):
                # "N/0" or absurdly large numbers: keep the textual fraction
                # but leave the rate undefined instead of aborting the parse.
                refusal_rate = None

            # A line with two fractions is read as "current | initial".
            if len(fractions) >= 2 and initial_refusals_str is None:
                initial_refusals_str = "/".join(fractions[1])

        # Nothing further can change once both values are known.
        if refusals_str is not None and initial_refusals_str is not None:
            break

    return refusals_str, refusal_rate, initial_refusals_str


def fetch_model_info(model):
    """Fetch a model's README.md and extract its KL divergence and refusal metrics.

    Args:
        model: Object describing a Hub model. Its ``id`` attribute is required;
            ``likes`` and ``downloads`` are read when present (default 0).

    Returns:
        A dict with the keys "Model ID", "KL Divergence", "Initial Refusals",
        "Refusal Rate (%)", "Refusals", "Likes", "Downloads" and "URL", or
        None when the README could not be downloaded or does not contain both
        a KL divergence and a refusals fraction.
    """
    model_id = model.id

    # Download the README.md via its raw URL rather than through the Hub API.
    url = f"https://huggingface.co/{model_id}/raw/main/README.md"
    try:
        response = requests.get(url, timeout=5)
    except requests.RequestException:
        # Best effort: a model whose card cannot be fetched is simply skipped.
        return None
    if response.status_code != 200:
        return None

    readme_text = response.text
    kl_div = _parse_kl_divergence(readme_text)
    refusals_str, refusal_rate, initial_refusals_str = _parse_refusals(readme_text)

    # Both metrics are required for a model to be listed.
    if kl_div is None or refusals_str is None:
        return None

    return {
        "Model ID": model_id,
        "KL Divergence": kl_div,
        "Initial Refusals": initial_refusals_str,
        "Refusal Rate (%)": refusal_rate,
        "Refusals": refusals_str,
        "Likes": getattr(model, 'likes', 0),
        "Downloads": getattr(model, 'downloads', 0),
        "URL": f"https://huggingface.co/{model_id}"
    }

@st.cache_data(ttl=3600, show_spinner=False)  # results are reused for one hour
def get_heretic_models():
    """Collect the metric rows for every listed model matching the `heretic` filter.

    Models whose ID matches the quantization/format pattern below are skipped
    before any README is requested. The remaining models are passed to
    ``fetch_model_info`` on a thread pool.

    Returns:
        A list of the dicts returned by ``fetch_model_info``, in the order the
        models were listed; models for which it returned None are left out.
    """
    hub = HfApi()

    # Ask the Hub for every model matching the "heretic" filter.
    listed = hub.list_models(filter="heretic")

    # IDs containing one of these markers (GGUF, MLX, AWQ, GPTQ, ...) are excluded.
    quantized_id = re.compile(r'(?i)(gguf|mlx|awq|nvfp4|gptq|exl[23]|-quant|int8|int4|oq[1-8]|mxfp[48])')
    candidates = [entry for entry in listed if quantized_id.search(entry.id) is None]

    # Download and parse the model cards in parallel, capped at 10 workers.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        rows = [
            row
            for row in pool.map(fetch_model_info, candidates)
            if row is not None
        ]

    return rows

# Main execution: load the (cached) rows while showing a spinner.
with st.spinner("Fetching heretic models and parsing model cards... This might take a moment on the first run."):
    models_data = get_heretic_models()

if models_data:
    df = pd.DataFrame(models_data)

    # The "Model" column carries the full URL; the LinkColumn configured below
    # renders it as a link and uses its regex to pick the text that is shown.
    df["Model"] = df["URL"]

    # Columns to show, in display order.
    visible_columns = [
        "Model",
        "KL Divergence",
        "Initial Refusals",
        "Refusals",
        "Refusal Rate (%)",
        "Likes",
        "Downloads",
    ]
    display_df = df[visible_columns]

    st.markdown(f"**Found {len(display_df)} models.**")

    # Per-column rendering: link, number formats and plain text.
    column_settings = {
        "Model": st.column_config.LinkColumn("Model", display_text=r"https://huggingface\.co/(.*)"),
        "KL Divergence": st.column_config.NumberColumn("KL Divergence", format="%.4f"),
        "Initial Refusals": st.column_config.TextColumn("Initial Refusals"),
        "Refusals": st.column_config.TextColumn("Refusals"),
        "Refusal Rate (%)": st.column_config.NumberColumn("Refusal Rate (%)", format="%.2f%%"),
        "Likes": st.column_config.NumberColumn("Likes"),
        "Downloads": st.column_config.NumberColumn("Downloads"),
    }

    # Interactive table; clicking a column header sorts by that column.
    st.dataframe(
        display_df,
        column_config=column_settings,
        hide_index=True,
        use_container_width=True,
    )
else:
    # Nothing to show: the loader returned no rows.
    st.warning("No models found with the 'heretic' tag.")