import json
import os
import re
from collections import Counter
from pathlib import Path

import gradio as gr
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from wordcloud import WordCloud

# Identifier handed to both ``from_pretrained`` calls below (tokenizer and model).
MODEL_ID = "seedflora/ev-sentiment"
# Excel workbook read by ``load_dataset`` for the analytics tab.
DATA_PATH = "data.xlsx"
# Columns that must both be present in the workbook for it to be used.
TEXT_COL = "clean_text_formal"
LABEL_COL = "label"
# CSV read by ``load_results``; the comparison plot uses its
# "model", "val_f1" and "test_f1" columns.
RESULTS_PATH = "results.csv"


def load_label_map(model_dir: Path):
    """Return the parsed contents of ``label_map.json`` inside *model_dir*.

    Args:
        model_dir: Directory expected to contain ``label_map.json``.

    Returns:
        Whatever JSON value the file holds, or ``None`` when the file does
        not exist.
    """
    candidate = model_dir / "label_map.json"
    if not candidate.exists():
        return None
    return json.loads(candidate.read_text(encoding="utf-8"))


# Tokenizer and classifier are loaded once, at import time, and shared by
# every call to ``predict``.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
# Inference only: switch the module to evaluation mode.
MODEL.eval()

# Class-index -> label-name mapping taken from the model's own config.
ID2LABEL = MODEL.config.id2label

# Words that ``_tokenize`` discards before word frequencies are computed.
STOPWORDS = {
    # Indonesian connectives, prepositions and demonstratives
    "yang", "dan", "di", "ke", "dari", "untuk", "pada", "ini", "itu", "atau",
    "juga", "dengan", "karena", "bahwa", "jadi", "agar", "sebagai", "seperti",
    # Indonesian negations, aspect/modal and degree words
    "sudah", "belum", "tidak", "bukan", "lebih", "paling", "saja", "masih",
    "bisa", "dapat", "akan",
    # Indonesian pronouns and the "nya" suffix
    "kami", "kita", "saya", "anda", "mereka", "aku", "dia", "kamu", "nya",
    # English function words
    "the", "a", "an", "is", "are", "of", "to", "in", "for", "on", "it",
}


def load_dataset():
    """Load the labelled dataset and derive display names for its labels.

    Reads the workbook at ``DATA_PATH``, keeps only ``TEXT_COL`` and
    ``LABEL_COL``, drops rows with a missing value in either column and
    casts the text column to ``str``.

    Returns:
        A ``(df, label_name)`` tuple. ``label_name`` maps each label value
        to the name shown in the UI: ``{0, 2}`` and ``{0, 1}`` label sets
        become "Negatif"/"Positif", anything else becomes ``"Label <value>"``.
        ``(None, {})`` is returned when the file is missing, cannot be
        parsed, or lacks one of the two required columns.
    """
    import warnings

    path = Path(DATA_PATH)
    if not path.exists():
        return None, {}
    try:
        df = pd.read_excel(path)
    except Exception as exc:
        # Best effort, like load_results(): a corrupt workbook or a missing
        # Excel engine should disable the analytics tab, not stop the whole
        # app from starting. Warn so the cause is still discoverable.
        warnings.warn(f"Could not read {path}: {exc}")
        return None, {}
    if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
        return None, {}

    # Take an explicit copy so the column assignment below writes to our own
    # frame instead of a selection derived from the frame that was read.
    df = df[[TEXT_COL, LABEL_COL]].dropna().copy()
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    labels = sorted(df[LABEL_COL].unique().tolist())
    if set(labels) == {0, 2}:
        label_name = {0: "Negatif", 2: "Positif"}
    elif set(labels) == {0, 1}:
        label_name = {0: "Negatif", 1: "Positif"}
    else:
        label_name = {val: f"Label {val}" for val in labels}
    return df, label_name


def load_results():
    """Read the model-comparison table from ``RESULTS_PATH``.

    Returns:
        The DataFrame parsed by ``pd.read_csv``, or ``None`` when the file
        does not exist or reading it raises any exception.
    """
    results_file = Path(RESULTS_PATH)
    if results_file.exists():
        try:
            table = pd.read_csv(results_file)
        except Exception:
            # Deliberately best effort: an unreadable file is treated the
            # same as a missing one.
            table = None
        return table
    return None


# Loaded once at import time; the analytics helpers below read these globals.
# DATA_DF is None (and LABEL_NAME empty) when the dataset could not be used;
# RESULTS_DF is None when the results file is missing or unreadable.
DATA_DF, LABEL_NAME = load_dataset()
RESULTS_DF = load_results()


def predict(text):
    """Classify *text* and return per-label probabilities.

    Args:
        text: Raw input string from the UI.

    Returns:
        A dict mapping each label name in ``ID2LABEL`` to its softmax
        probability, or an empty dict when *text* is empty or contains only
        whitespace.
    """
    if not text or not text.strip():
        return {}

    inputs = TOKENIZER(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = MODEL(**inputs).logits

    # Select the single batch row explicitly. A bare ``.squeeze()`` would
    # also collapse the class axis when the model has only one output,
    # turning ``tolist()`` into a plain float and breaking the mapping below.
    probs = torch.softmax(logits, dim=-1)[0].tolist()
    return {ID2LABEL[idx]: float(prob) for idx, prob in enumerate(probs)}


def _tokenize(text: str):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    tokens = [t for t in text.split() if t and t not in STOPWORDS and len(t) > 2]
    return tokens


def _filter_df(label_choice: str):
    """Return the rows of ``DATA_DF`` that belong to the chosen label.

    Args:
        label_choice: A display name from ``LABEL_NAME``, or "Semua" for
            no filtering.

    Returns:
        ``None`` when no dataset is loaded; the whole dataset for "Semua"
        or for a name that matches no entry in ``LABEL_NAME``; otherwise
        only the rows whose ``LABEL_COL`` equals the matching label value.
    """
    if DATA_DF is None:
        return None
    if label_choice == "Semua":
        return DATA_DF

    # Reverse lookup: first label value whose display name was selected.
    selected = next(
        (value for value, name in LABEL_NAME.items() if name == label_choice),
        None,
    )
    if selected is None:
        return DATA_DF
    return DATA_DF[DATA_DF[LABEL_COL] == selected]


def build_distribution_plot():
    """Build a bar chart of how many rows each label has in ``DATA_DF``.

    Returns:
        A matplotlib ``Figure``. When no dataset is loaded the figure only
        carries a "Dataset tidak ditemukan" placeholder text.
    """
    if DATA_DF is None:
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "Dataset tidak ditemukan", ha="center", va="center")
    else:
        counts = DATA_DF[LABEL_COL].value_counts().sort_index()
        labels = [LABEL_NAME.get(val, str(val)) for val in counts.index.tolist()]
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.bar(labels, counts.values, color=["#ef4444", "#22c55e"])
        ax.set_title("Distribusi Label")
        ax.set_ylabel("Jumlah")
        ax.grid(axis="y", linestyle="--", alpha=0.4)
    # pyplot keeps a reference to every figure it creates. This function runs
    # on each dashboard refresh, so unregister the figure to avoid piling
    # them up; the returned Figure object itself stays usable.
    plt.close(fig)
    return fig


def build_top_words_plot(label_choice: str, top_n: int = 20):
    """Build a horizontal bar chart of the most frequent tokens.

    Args:
        label_choice: Label display name used to filter the dataset
            ("Semua" for all rows).
        top_n: Number of most common tokens to show.

    Returns:
        A matplotlib ``Figure``. It carries a "Data kosong" placeholder when
        there are no rows, or "Token kosong" when tokenisation leaves
        nothing.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        if df is None or df.empty:
            ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
            return fig
        # Count per document instead of first materialising one list with
        # every token of the dataset.
        counts = Counter()
        for text in df[TEXT_COL]:
            counts.update(_tokenize(text))
        if not counts:
            ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
            return fig
        common = counts.most_common(top_n)
        # Reverse so the most frequent word ends up at the top of the chart.
        words = [word for word, _ in common][::-1]
        freqs = [freq for _, freq in common][::-1]
        ax.barh(words, freqs, color="#3b82f6")
        ax.set_title(f"Top {top_n} Kata - {label_choice}")
        return fig
    finally:
        # Unregister from pyplot on every exit path so repeated refreshes do
        # not accumulate open figures; the Figure object remains usable.
        plt.close(fig)


def build_wordcloud(label_choice: str):
    """Render a word cloud of the tokens in the filtered dataset.

    Args:
        label_choice: Label display name used to filter the dataset
            ("Semua" for all rows).

    Returns:
        A matplotlib ``Figure`` with the axes hidden. It carries a
        "Data kosong" placeholder when there are no rows, or "Token kosong"
        when tokenisation leaves nothing.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(7, 4.5))
    try:
        if df is None or df.empty:
            ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
            ax.axis("off")
            return fig
        tokens = []
        for text in df[TEXT_COL]:
            tokens.extend(_tokenize(text))
        if not tokens:
            ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
            ax.axis("off")
            return fig
        wc = WordCloud(
            width=900,
            height=500,
            background_color="white",
            collocations=False,
        )
        wc.generate(" ".join(tokens))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        ax.set_title(f"Word Cloud - {label_choice}")
        return fig
    finally:
        # Unregister from pyplot on every exit path so repeated refreshes do
        # not accumulate open figures; the Figure object remains usable.
        plt.close(fig)


def build_model_comparison_plot():
    """Build a grouped bar chart of validation vs. test F1 per model.

    Reads ``RESULTS_DF`` and expects the columns "model", "val_f1" and
    "test_f1". Models are ordered by descending validation F1.

    Returns:
        A matplotlib ``Figure``. It carries a placeholder text when no
        results are loaded or when a required column is missing.
    """
    required = ("model", "val_f1", "test_f1")
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        if RESULTS_DF is None or RESULTS_DF.empty:
            ax.text(0.5, 0.5, "results.csv tidak ditemukan", ha="center", va="center")
            return fig
        missing = [col for col in required if col not in RESULTS_DF.columns]
        if missing:
            # A malformed results file must not raise KeyError here: that
            # would fail the whole analytics callback, not just this chart.
            ax.text(
                0.5,
                0.5,
                "Kolom results.csv tidak lengkap: " + ", ".join(missing),
                ha="center",
                va="center",
            )
            return fig
        # sort_values returns a new frame, so RESULTS_DF itself is untouched.
        data = RESULTS_DF.sort_values("val_f1", ascending=False)
        models = data["model"].tolist()
        val = data["val_f1"].tolist()
        test = data["test_f1"].tolist()
        x = range(len(models))
        ax.bar(x, val, width=0.4, label="Val F1", color="#22c55e")
        # Shift the second series by one bar width so the pairs sit side by
        # side, and centre each tick between the two bars.
        ax.bar([i + 0.4 for i in x], test, width=0.4, label="Test F1", color="#3b82f6")
        ax.set_xticks([i + 0.2 for i in x])
        ax.set_xticklabels(models, rotation=45, ha="right")
        ax.set_ylim(0, 1.0)
        ax.set_title("Perbandingan Model (F1)")
        ax.legend()
        fig.tight_layout()
        return fig
    finally:
        # Unregister from pyplot on every exit path so repeated refreshes do
        # not accumulate open figures; the Figure object remains usable.
        plt.close(fig)


def analytics(label_choice):
    """Produce every output of the analytics tab for one label filter.

    Args:
        label_choice: Label display name selected in the dropdown.

    Returns:
        A 5-tuple: distribution figure, top-words figure, word-cloud figure,
        model-comparison figure and a summary DataFrame with "metric" and
        "value" columns (total row count plus one row per label).
    """
    figures = (
        build_distribution_plot(),
        build_top_words_plot(label_choice),
        build_wordcloud(label_choice),
        build_model_comparison_plot(),
    )

    if DATA_DF is None:
        rows = [{"metric": "rows", "value": 0}]
    else:
        rows = [{"metric": "rows", "value": len(DATA_DF)}]
        per_label = DATA_DF[LABEL_COL].value_counts().to_dict()
        for label, count in per_label.items():
            rows.append(
                {"metric": f"label_{LABEL_NAME.get(label, label)}", "value": count}
            )
    summary = pd.DataFrame(rows)

    return (*figures, summary)


# UI definition: one tab for single-text prediction, one for dataset analytics.
with gr.Blocks(title="Klasifikasi Sentimen EV") as app:
    gr.Markdown("# Klasifikasi Sentimen EV")
    gr.Markdown("Prediksi sentimen + dashboard analitik (word cloud & distribusi label).")

    with gr.Tab("Prediksi"):
        inp = gr.Textbox(lines=4, label="Teks")
        out = gr.Label(num_top_classes=2, label="Prediksi")
        btn = gr.Button("Prediksi")
        btn.click(predict, inputs=inp, outputs=out)

    with gr.Tab("Analitik"):
        # "Semua" (no filter) is always offered; label names are appended
        # only when a dataset was loaded.
        label_options = ["Semua"]
        if LABEL_NAME:
            label_options.extend(LABEL_NAME.values())
        label_choice = gr.Dropdown(label_options, value="Semua", label="Filter Label")
        dist_plot = gr.Plot(label="Distribusi Label")
        top_plot = gr.Plot(label="Top Kata")
        wc_plot = gr.Plot(label="Word Cloud")
        model_plot = gr.Plot(label="Perbandingan Model")
        summary_tbl = gr.Dataframe(label="Ringkasan Dataset", interactive=False)
        run_btn = gr.Button("Generate")

    # The same callback refreshes the dashboard on button click, on dropdown
    # change and on initial page load.
    analytics_outputs = [dist_plot, top_plot, wc_plot, model_plot, summary_tbl]
    for register in (run_btn.click, label_choice.change, app.load):
        register(analytics, inputs=label_choice, outputs=analytics_outputs)


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()