"""Streamlit dashboard tracking annotation progress in Label Studio.

Pulls per-project task/annotation data from the Label Studio API (with a
pickle disk cache keyed on task and submission counts), then renders weekly
statistics, category totals, and a cumulative-progress projection chart.
"""

import hashlib
import os
import pickle
import re
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go
import requests
import streamlit as st

# Word-count targets for the whole corpus and per category.
GOAL_WORDS = 2_200_000
CATEGORY_GOAL = 1_100_000

# Label Studio project ids that belong to our team; everything else is "Others".
OUR_TEAM_PROJECT_IDS = {29, 30, 31, 32, 33, 37}

# States counted as fully annotated ("ready") vs. states counted toward the goal.
ANNOTATED_STATES = ["Acceptable", "No Rating"]
GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]

# Fixed chart colors per (anonymized) team member.
TEAM_COLORS = {
    "A.K. (22)": "#0066cc",
    "J.Š. (23)": "#00cccc",
    "J.Š. (24)": "#00cc00",
    "G.Z. (25)": "#ff9900",
    "L.M. (26)": "#9933ff",
    "M.M. (27)": "#cc0000",
}

# Cache file location (persists between runs)
CACHE_FILE = Path(".cache.pkl")

st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")


def fetch_project_data(proj, url, headers):
    """Fetch all tasks of one project (designed for parallel execution).

    Args:
        proj: Project dict from the Label Studio ``/api/projects`` listing.
        url: Base URL of the Label Studio instance (no trailing slash).
        headers: HTTP headers including the ``Authorization`` token.

    Returns:
        ``(project_id, task_count, submitted_count, rows)`` where ``rows`` is a
        list of flat dicts, one per task, describing its annotation state,
        word count, date and category.
    """
    pid = proj["id"]
    name = proj.get("title", f"Project {pid}")
    task_count = proj.get("task_number", 0)
    group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
    rows = []
    submitted_count = 0  # Track submitted (annotated) tasks
    page = 1
    while True:
        resp = requests.get(
            f"{url}/api/projects/{pid}/tasks",
            headers=headers,
            params={"page": page, "page_size": 100},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()
        # The endpoint may return a bare list or a {"tasks": [...]} envelope.
        tasks = data if isinstance(data, list) else data.get("tasks", [])
        if not tasks:
            break
        for task in tasks:
            task_data = task.get("data", {})
            # Prefer an explicit word count; fall back to whitespace tokens.
            # The "or" guards against "text" being present but None.
            words = task_data.get("words") or len((task_data.get("text") or "").split())
            category = task_data.get("category")
            annots = [a for a in task.get("annotations", []) if not a.get("was_cancelled")]
            if not annots:
                rows.append(
                    {
                        "project_id": pid,
                        "project": name,
                        "project_group": group,
                        "date": None,
                        "state": "Not Annotated",
                        "words": int(words),
                        "category": category,
                    }
                )
                continue
            # Task has annotations - count as submitted
            submitted_count += 1
            ann = annots[0]
            # Guard against "created_at" being present but None.
            date = (ann.get("created_at") or "")[:10] or None
            rating = None
            for item in ann.get("result", []):
                if item.get("type") == "choices" and item.get("from_name") == "text_rating":
                    rating = item.get("value", {}).get("choices", [None])[0]
                    break
            has_entities = any(i.get("type") == "labels" for i in ann.get("result", []))
            if rating is None:
                state = "No Rating"
            elif rating == "Requires Attention":
                state = f"ReqAttn ({'entities' if has_entities else 'empty'})"
            elif rating == "Unacceptable":
                state = f"Unacceptable ({'entities' if has_entities else 'empty'})"
            else:
                state = "Acceptable"
            rows.append(
                {
                    "project_id": pid,
                    "project": name,
                    "project_group": group,
                    "date": date,
                    "state": state,
                    "words": int(words),
                    "category": category,
                }
            )
        # Stop when the last page was short (list API) or no "next" link (dict API).
        if isinstance(data, list) and len(data) < 100:
            break
        if isinstance(data, dict) and not data.get("next"):
            break
        page += 1
    return pid, task_count, submitted_count, rows


def _get_credentials():
    """Read the Label Studio URL and API key from Streamlit secrets or env vars.

    Returns:
        ``(url, key)``; either may be an empty string when unconfigured.
    """
    try:
        url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
        key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
    except (KeyError, FileNotFoundError, AttributeError):
        # st.secrets raises when no secrets file exists; fall back to the environment.
        url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
        key = os.getenv("LABEL_STUDIO_API_KEY", "")
    return url, key


@st.cache_data(ttl=300)
def load_data(projects_hash):
    """Load annotation data from Label Studio with disk cache.

    Args:
        projects_hash: Hash of project states to invalidate Streamlit cache
            when projects change.

    Returns:
        DataFrame with one row per task plus derived boolean columns
        ``is_annotated`` and ``is_goal_state``.
    """
    url, key = _get_credentials()
    if not url or not key:
        st.error("Missing credentials. Set LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY.")
        st.stop()
    headers = {"Authorization": f"Token {key}"}

    # Fetch all projects
    resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
    resp.raise_for_status()
    projects = resp.json().get("results", [])

    # Load cache
    cache = {}
    if CACHE_FILE.exists():
        try:
            with open(CACHE_FILE, "rb") as f:
                # NOTE: pickle is only safe because this is a local, app-written file.
                cache = pickle.load(f)
        except Exception:
            cache = {}

    # Check which projects need updating
    projects_to_fetch = []
    all_rows = []
    for proj in projects:
        pid = proj["id"]
        task_count = proj.get("task_number", 0)
        # Get submitted task count from Label Studio API
        api_submitted_count = proj.get("num_tasks_with_annotations", 0)
        cache_key = f"project_{pid}"
        # Invalidate cache if:
        # 1. No cache exists for this project
        # 2. Total task count changed (new tasks added/removed)
        # 3. Submitted task count changed (new annotations/submissions)
        use_cache = False
        if cache_key in cache:
            cached = cache[cache_key]
            # Use cache only if BOTH counts match
            if (cached.get("task_count") == task_count
                    and cached.get("submitted_count") == api_submitted_count):
                use_cache = True
        if use_cache:
            all_rows.extend(cache[cache_key]["rows"])
        else:
            projects_to_fetch.append(proj)

    # Fetch updated projects in parallel
    if projects_to_fetch:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(fetch_project_data, proj, url, headers)
                       for proj in projects_to_fetch]
            progress = st.progress(0, text=f"Loading {len(projects_to_fetch)} projects...")
            for i, future in enumerate(futures):
                pid, task_count, submitted_count, rows = future.result()
                all_rows.extend(rows)
                cache[f"project_{pid}"] = {
                    "task_count": task_count,
                    "submitted_count": submitted_count,
                    "rows": rows,
                }
                progress.progress((i + 1) / len(futures),
                                  text=f"Loaded {i + 1}/{len(futures)} projects")
            progress.empty()

        # Save cache (best effort; a read-only disk must not break the app)
        try:
            with open(CACHE_FILE, "wb") as f:
                pickle.dump(cache, f)
        except Exception:
            pass

    # Create dataframe. An explicit column schema keeps the expected columns
    # present even when no rows came back, so downstream access cannot KeyError.
    columns = ["project_id", "project", "project_group", "date", "state", "words", "category"]
    df = pd.DataFrame(all_rows, columns=columns)
    df["words"] = df["words"].astype(int)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["is_annotated"] = df["state"].isin(ANNOTATED_STATES)
    df["is_goal_state"] = df["state"].isin(GOAL_STATES)
    return df


def get_projects_hash():
    """Fetch projects and return a hash of their states for cache invalidation."""
    url, key = _get_credentials()
    if not url or not key:
        return "no-credentials"
    headers = {"Authorization": f"Token {key}"}
    resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
    resp.raise_for_status()
    projects = resp.json().get("results", [])

    # Create hash from project states (id, task_number, num_tasks_with_annotations)
    parts = []
    for proj in projects:
        pid = proj["id"]
        task_count = proj.get("task_number", 0)
        submitted_count = proj.get("num_tasks_with_annotations", 0)
        parts.append(f"{pid}:{task_count}:{submitted_count};")
    return hashlib.md5("".join(parts).encode()).hexdigest()


def anonymize(name):
    """Convert '26 [Name Lastname]' to 'N.L. (26)'; pass other names through."""
    if name == "Others":
        return "Others"
    match = re.match(r"(\d+)\s+\[(.+?)\]", name)
    if match:
        num, full = match.groups()
        parts = full.split()
        if len(parts) >= 2:
            return f"{parts[0][0]}.{parts[-1][0]}. ({num})"
    return name


st.title("📊 Annotation Progress Dashboard")
st.markdown("---")

# Load data
with st.spinner("Loading..."):
    projects_hash = get_projects_hash()
    df = load_data(projects_hash)

# Overview metrics
total = df[df["is_goal_state"]]["words"].sum()
remaining = GOAL_WORDS - total
progress = total / GOAL_WORDS * 100
col1, col2 = st.columns(2)
col1.metric("Progress toward 2.2M", f"{total:,}", f"{progress:.1f}%")
col2.metric("Remaining", f"{remaining:,}", f"{100 - progress:.1f}%")
st.markdown("---")

# Tabs
tab1, tab2 = st.tabs(["📊 Weekly Stats", "⏱️ Pacing"])

# ============== TAB 1: Weekly Stats ==============
with tab1:
    st.caption("Goal states (Acceptable + No Rating + ReqAttn with entities)")
    cutoff_date = pd.Timestamp("2025-12-22")

    # Filter data - use GOAL_STATES to match progress metrics
    df_week = df[df["is_goal_state"] & df["date"].notna()].copy()
    # Monday of each task's week.
    df_week["week_start"] = df_week["date"] - pd.to_timedelta(df_week["date"].dt.dayofweek, unit="d")
    df_week["member"] = df_week.apply(
        lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others",
        axis=1,
    )

    # Weekly pivot (all data)
    weekly_all = df_week.pivot_table(index="week_start", columns="member", values="words",
                                     aggfunc="sum", fill_value=0).astype(int)

    # Split into before and after cutoff
    weekly_before = weekly_all[weekly_all.index < cutoff_date]
    weekly_after = weekly_all[weekly_all.index >= cutoff_date]

    # Ensure consistent columns
    all_members = set(weekly_all.columns)
    if "Others" not in all_members:
        all_members.add("Others")
    for member in all_members:
        if member not in weekly_after.columns:
            weekly_after[member] = 0
        if member not in weekly_before.columns:
            weekly_before[member] = 0

    # Sort columns by total contribution
    totals = weekly_all.sum().sort_values(ascending=False)
    weekly_after = weekly_after[totals.index]
    weekly_after["Total"] = weekly_after.sum(axis=1)

    # Calculate "Before" summary row
    before_totals = weekly_before[totals.index].sum()
    before_totals["Total"] = before_totals.sum()

    # Format weekly data for display
    display = weekly_after.reset_index()
    display["Week"] = (display["week_start"].dt.strftime("%Y-%m-%d") + " - "
                       + (display["week_start"] + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d"))
    display = display.drop("week_start", axis=1)
    display = display[["Week"] + list(totals.index) + ["Total"]]

    # Add "Before" row at the beginning
    before_row = pd.DataFrame([{"Week": f"Before {cutoff_date.strftime('%Y-%m-%d')}", **before_totals}])
    display = pd.concat([before_row, display], ignore_index=True)

    # Add TOTAL row at the end
    all_totals = weekly_all[totals.index].sum()
    all_totals["Total"] = all_totals.sum()
    total_row = pd.DataFrame([{"Week": "TOTAL", **all_totals}])
    display = pd.concat([display, total_row], ignore_index=True)

    # Format numbers
    for col in display.columns:
        if col != "Week":
            display[col] = display[col].apply(lambda x: f"{int(x):,}" if pd.notna(x) else "")

    # Style and show
    def style_row(row):
        """Bold the TOTAL row and italicize the pre-cutoff summary row."""
        if row["Week"] == "TOTAL":
            return ["font-weight: bold; background-color: #f0f0f0;"] * len(row)
        elif row["Week"].startswith("Before"):
            return ["font-style: italic; background-color: #f9f9f9;"] * len(row)
        return [""] * len(row)

    styled = display.style.apply(style_row, axis=1).set_properties(
        subset=["Total"], **{"font-weight": "bold"})
    st.dataframe(styled, hide_index=True, use_container_width=True)

# ============== TAB 2: Pacing ==============
with tab2:
    st.subheader("Category Breakdown")
    st.caption("Requirement: 1.1M words from each category")

    # Split by status: Ready vs Needs Fixing
    df_ready = df[df["is_annotated"]]  # Acceptable + No Rating
    df_needs_fixing = df[df["state"] == "ReqAttn (entities)"]

    # Calculate by category
    mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
    mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
    mok_total = mok_ready + mok_fixing
    zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
    zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
    zin_total = zin_ready + zin_fixing
    total_ready = mok_ready + zin_ready
    total_fixing = mok_fixing + zin_fixing
    total_all = total_ready + total_fixing

    cat_df = pd.DataFrame(
        {
            "Category": ["mokslinis", "ziniasklaida", "TOTAL"],
            "Ready": [f"{mok_ready:,}", f"{zin_ready:,}", f"{total_ready:,}"],
            "Needs Fixing": [f"{mok_fixing:,}", f"{zin_fixing:,}", f"{total_fixing:,}"],
            "Total": [f"{mok_total:,}", f"{zin_total:,}", f"{total_all:,}"],
            "Goal": [f"{CATEGORY_GOAL:,}", f"{CATEGORY_GOAL:,}", f"{GOAL_WORDS:,}"],
            "Progress": [
                f"{mok_total / CATEGORY_GOAL * 100:.1f}%",
                f"{zin_total / CATEGORY_GOAL * 100:.1f}%",
                f"{total_all / GOAL_WORDS * 100:.1f}%",
            ],
        }
    )
    st.dataframe(cat_df, hide_index=True, use_container_width=True)
    st.markdown("---")
    st.header("Cumulative Progress & Projection")

    # Cumulative data
    df_cum = df[df["is_goal_state"] & df["date"].notna()].copy()
    df_cum["member"] = df_cum.apply(
        lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others",
        axis=1,
    )
    daily = df_cum.groupby(["date", "member"])["words"].sum().reset_index()
    pivot = daily.pivot_table(index="date", columns="member", values="words", fill_value=0)
    cumulative = pivot.sort_index().cumsum()
    cumulative["Total"] = cumulative.sum(axis=1)
    cumulative = cumulative[cumulative.index >= pd.Timestamp("2025-12-18")]

    if cumulative.empty:
        # Nothing dated in the charted window yet - avoid iloc[-1] IndexError.
        st.info("No annotated data to chart yet.")
    else:
        # Projection calculation
        last_date = cumulative.index[-1]
        current = cumulative["Total"].iloc[-1]

        # Calculate rate from last 14 days
        lookback = cumulative[cumulative.index >= last_date - pd.Timedelta(days=14)]
        if len(lookback) >= 2:
            days = (last_date - lookback.index[0]).days or 1  # avoid division by zero
            rate = (current - lookback["Total"].iloc[0]) / days
            days_left = (GOAL_WORDS - current) / rate if rate > 0 else 0
            completion = last_date + pd.Timedelta(days=days_left)
            weekly_rate = rate * 7
        else:
            rate = completion = weekly_rate = None

        # Chart
        fig = go.Figure()

        # Goal lines
        fig.add_hline(y=1_100_000, line_dash="dot", line_color="orange",
                      annotation_text="Midpoint: 1.1M", annotation_position="top left")
        fig.add_hline(y=GOAL_WORDS, line_dash="dot", line_color="red",
                      annotation_text="Goal: 2.2M", annotation_position="top left")

        # Members, strongest contributor first
        members = [c for c in cumulative.columns if c not in ["Total", "Others"]]
        members = sorted(members, key=lambda x: cumulative[x].iloc[-1], reverse=True)
        if "Others" in cumulative.columns:
            fig.add_trace(
                go.Scatter(
                    x=cumulative.index,
                    y=cumulative["Others"],
                    name=f"Others: {cumulative['Others'].iloc[-1]:,.0f}",
                    mode="lines",
                    line=dict(width=2, color="#7f8c8d"),
                )
            )
        for m in members:
            color = TEAM_COLORS.get(m, "#34495e")
            fig.add_trace(
                go.Scatter(x=cumulative.index, y=cumulative[m],
                           name=f"{m}: {cumulative[m].iloc[-1]:,.0f}",
                           mode="lines", line=dict(width=2, color=color))
            )

        # Total
        fig.add_trace(
            go.Scatter(
                x=cumulative.index,
                y=cumulative["Total"],
                name=f"Total: {cumulative['Total'].iloc[-1]:,.0f}",
                mode="lines",
                line=dict(width=3, color="#d4af37"),
                fill="tozeroy",
                fillcolor="rgba(212, 175, 55, 0.1)",
            )
        )

        # Projection
        if completion:
            proj_dates = pd.date_range(last_date, completion, freq="D")
            proj_vals = current + rate * (proj_dates - last_date).days
            fig.add_trace(
                go.Scatter(x=proj_dates, y=proj_vals,
                           name=f"Projection ({int(weekly_rate):,}/wk)",
                           mode="lines", line=dict(width=3, color="#d4af37", dash="dot"))
            )
            fig.add_trace(
                go.Scatter(
                    x=[completion],
                    y=[GOAL_WORDS],
                    mode="markers+text",
                    marker=dict(size=14, color="#d4af37", symbol="diamond"),
                    text=[completion.strftime("%b %d")],
                    textposition="top center",
                    showlegend=False,
                )
            )
            title = f"Cumulative Progress → Est. {completion.strftime('%B %d, %Y')}"
        else:
            title = "Cumulative Progress"

        fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words",
                          height=600, hovermode="x unified", template="plotly_white")
        fig.update_yaxes(tickformat=".2s")
        st.plotly_chart(fig, use_container_width=True)

        # Metrics
        if completion:
            st.markdown("### Pacing Estimates")
            c1, c2, c3 = st.columns(3)
            c1.metric("Per Week Rate", f"{int(weekly_rate):,} words")
            c2.metric("Weeks Remaining", f"{days_left / 7:.1f} weeks")
            c3.metric("Est. Completion", completion.strftime("%Y-%m-%d"))

# Footer
st.markdown("---")
st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 5 min | Press 'R' to refresh")