import gradio as gr
import pandas as pd
import numpy as np
import pickle
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
warnings.filterwarnings("ignore")

# ── Exact 30 features the model was trained on ────────────────────────────────
# Column order is significant: build_feature_row() selects the columns in
# exactly this order before the frame is passed to the models' predict().
FEATURE_COLS = [
    # Text length and structure
    "title_length", "title_word_count",
    "description_length", "description_word_count",
    "description_density", "title_desc_ratio",
    # Salary
    "salary_midpoint", "salary_range", "has_salary_info", "salary_log",
    # Interaction terms (first group)
    "desc_salary_interaction", "senior_salary", "weekend_remote",
    # Role keyword flags
    "is_senior_role", "is_entry_role", "is_software_role", "is_data_role",
    "is_manager_role", "is_sales_role", "is_marketing_role",
    # Interaction terms (second group)
    "title_desc_word_interaction", "salary_density_interaction",
    "salary_description_interaction", "title_density_interaction",
    # One-hot cluster membership: cluster_0 … cluster_5
    *(f"cluster_{k}" for k in range(6)),
]

# Dropdown captions for the six clusters. Every caption starts with
# "Cluster <n> —"; build_feature_row() reads <n> back out of that prefix, so
# the prefix format has to stay as it is.
CLUSTER_LABELS = [
    f"Cluster {number} — {summary}"
    for number, summary in enumerate((
        "General / Mixed roles",
        "High-salary specialist roles",
        "Tech & software roles",
        "Entry-level / high-volume roles",
        "Contract & flexible roles",
        "Senior leadership roles",
    ))
]

# ── Load models ───────────────────────────────────────────────────────────────
# Bind every name up front so that `reg_model`, `clf_model`, `MODELS_LOADED`
# and `MODEL_ERROR` always exist, whichever way loading goes. MODEL_ERROR is
# None when loading succeeded, otherwise the text of the exception.
reg_model = None
clf_model = None
MODELS_LOADED = False
MODEL_ERROR = None

try:
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file. Only ship model files that come from a trusted source.
    with open("linkedin_regression_model.pkl", "rb") as f:
        reg_model = pickle.load(f)
    with open("linkedin_classification_model.pkl", "rb") as f:
        clf_model = pickle.load(f)
except Exception as e:
    # Deliberately broad: a missing file, a corrupt pickle or a missing
    # dependency must not stop the UI from starting. The failure is recorded
    # here and predict_engagement() refuses to run while MODELS_LOADED is False.
    MODEL_ERROR = str(e)
else:
    MODELS_LOADED = True

# ── Feature builder ───────────────────────────────────────────────────────────
def _parse_cluster_index(cluster_choice, n_clusters=6):
    """Extract the cluster number from a label such as ``"Cluster 2 — …"``.

    Args:
        cluster_choice: Label text that starts with ``Cluster <n>``. A bare
            number such as ``3`` or ``"3"`` is accepted as well.
        n_clusters: Number of valid clusters; valid indices run from 0 to
            ``n_clusters - 1``.

    Returns:
        The cluster index as an ``int``.

    Raises:
        ValueError: If no integer can be read from the label (this includes
            ``None`` and the empty string) or the index is out of range.
    """
    prefix = str(cluster_choice).split("—")[0]
    try:
        index = int(prefix.replace("Cluster", "").strip())
    except ValueError:
        raise ValueError(
            f"Unrecognised cluster choice: {cluster_choice!r}"
        ) from None
    if not 0 <= index < n_clusters:
        raise ValueError(
            f"Cluster index {index} is outside 0–{n_clusters - 1}: "
            f"{cluster_choice!r}"
        )
    return index


def build_feature_row(
    title_word_count,
    description_word_count,
    has_salary,
    salary_midpoint,
    is_senior,
    is_entry,
    is_software,
    is_data,
    is_manager,
    is_sales,
    is_marketing,
    is_remote,
    posting_dayofweek,
    cluster_choice,
):
    """Turn the form inputs into a one-row DataFrame with the FEATURE_COLS columns.

    Args:
        title_word_count: Number of words in the title (cast to ``int``).
        description_word_count: Number of words in the description (cast to
            ``int``).
        has_salary: Truthy when salary information is disclosed; when falsy
            the salary midpoint is treated as 0.
        salary_midpoint: Salary midpoint; only read when ``has_salary`` is
            truthy.
        is_senior, is_entry, is_software, is_data, is_manager, is_sales,
        is_marketing: Role flags, each stored as 0/1.
        is_remote: Remote flag; only used for the ``weekend_remote`` feature.
        posting_dayofweek: Day index (the UI labels it 0=Mon … 6=Sun); values
            of 5 or more count as weekend.
        cluster_choice: Cluster label beginning with ``Cluster <n>``.

    Returns:
        ``pandas.DataFrame`` with a single row whose columns are selected in
        ``FEATURE_COLS`` order.

    Raises:
        ValueError: If ``cluster_choice`` does not name a cluster in 0–5.
    """
    # Validate the cluster first so a bad selection fails with a clear message
    # instead of an AttributeError (None) or a silent all-zero one-hot.
    cluster_idx = _parse_cluster_index(cluster_choice)

    salary_midpoint_val = float(salary_midpoint) if has_salary else 0.0
    salary_log = np.log1p(salary_midpoint_val)
    # NOTE(review): range is derived as 30% of the midpoint — presumably a
    # stand-in because the form does not collect min/max salary; confirm.
    salary_range = salary_midpoint_val * 0.3

    twc = int(title_word_count)
    dwc = int(description_word_count)
    # NOTE(review): character lengths are derived from word counts (x6 for the
    # title, x5 for the description) — presumably average word lengths; confirm
    # against the training pipeline.
    title_length = twc * 6
    description_length = dwc * 5
    # Because description_length is dwc * 5, this evaluates to 20.0 for every
    # dwc > 0 (and 0.0 when dwc == 0).
    description_density = (dwc / max(description_length, 1)) * 100
    title_desc_ratio = twc / max(dwc, 1)

    posting_weekend = int(int(posting_dayofweek) >= 5)
    senior_salary = int(is_senior) * salary_log
    weekend_remote = posting_weekend * int(is_remote)

    # NOTE(review): desc_salary_interaction and salary_description_interaction
    # are the same product (dwc * salary_log) here — confirm that matches how
    # the two columns were built for training.
    desc_salary_interaction = dwc * salary_log
    title_desc_word_interaction = twc * dwc
    salary_density_interaction = salary_log * description_density
    salary_description_interaction = salary_log * dwc
    title_density_interaction = twc * description_density

    row = {
        "title_length": title_length,
        "title_word_count": twc,
        "description_length": description_length,
        "description_word_count": dwc,
        "description_density": description_density,
        "title_desc_ratio": title_desc_ratio,
        "salary_midpoint": salary_midpoint_val,
        "salary_range": salary_range,
        "has_salary_info": int(has_salary),
        "salary_log": salary_log,
        "desc_salary_interaction": desc_salary_interaction,
        "senior_salary": senior_salary,
        "weekend_remote": weekend_remote,
        "is_senior_role": int(is_senior),
        "is_entry_role": int(is_entry),
        "is_software_role": int(is_software),
        "is_data_role": int(is_data),
        "is_manager_role": int(is_manager),
        "is_sales_role": int(is_sales),
        "is_marketing_role": int(is_marketing),
        "title_desc_word_interaction": title_desc_word_interaction,
        "salary_density_interaction": salary_density_interaction,
        "salary_description_interaction": salary_description_interaction,
        "title_density_interaction": title_density_interaction,
    }
    # One-hot cluster membership: exactly one of cluster_0 … cluster_5 is 1.
    row.update({f"cluster_{i}": int(i == cluster_idx) for i in range(6)})

    # Selecting by FEATURE_COLS fixes the column order and raises KeyError if
    # the row and the feature list ever drift apart.
    return pd.DataFrame([row])[FEATURE_COLS]


# ── Prediction ────────────────────────────────────────────────────────────────
def _engagement_tips(has_salary, description_word_count, posting_dayofweek):
    """Return the advisory tip strings that apply to the given form inputs."""
    word_count = int(description_word_count)
    tips = []
    if not has_salary:
        tips.append("💡 Adding salary info is associated with ~90% more views.")
    if word_count < 250:
        tips.append("💡 Descriptions of 250–500 words tend to perform best.")
    if word_count > 750:
        tips.append("💡 Very long descriptions can reduce engagement — try trimming to under 750 words.")
    if int(posting_dayofweek) >= 5:
        tips.append("💡 Posting mid-week (Tue–Thu) gets more views than weekends.")
    return tips


def _render_gauge(views_pred, is_high, engagement_label, full_scale_views=1500):
    """Draw the horizontal gauge figure for one prediction and return it.

    The bar is filled to ``views_pred / full_scale_views``, clamped to 0–1.
    The figure is removed from pyplot's registry before it is returned, so
    repeated predictions do not accumulate open figures; the returned Figure
    object itself stays usable for rendering.
    """
    background = "#0f0f0f"
    bar_color = "#3a9e6e" if is_high else "#c0392b"
    fill_pct = min(max(views_pred / full_scale_views, 0.0), 1.0)

    fig, ax = plt.subplots(figsize=(5, 2.5))
    try:
        fig.patch.set_facecolor(background)
        ax.set_facecolor(background)
        ax.barh(0, 1,        color="#1e1e1e", height=0.4, edgecolor="none")
        ax.barh(0, fill_pct, color=bar_color, height=0.4, edgecolor="none")
        ax.set_xlim(0, 1)
        ax.set_ylim(-0.6, 0.6)
        ax.axis("off")
        # Captions are positioned in data coordinates: y=0.55 sits above the
        # bar (which spans -0.2 … 0.2) and y=-0.35 below it. In axes-fraction
        # coordinates the same numbers put the bar-coloured label on top of
        # the bar, where it disappears into the fill.
        ax.text(0.5, 0.55, engagement_label,
                ha="center", va="center", color=bar_color,
                fontsize=12, fontweight="bold")
        ax.text(0.5, -0.35, f"Predicted: {views_pred:,} views",
                ha="center", va="center", color="#e8e8e8",
                fontsize=13, fontweight="bold")
        # Lay out this figure explicitly rather than pyplot's "current" one.
        fig.tight_layout(pad=0.3)
    finally:
        plt.close(fig)
    return fig


def predict_engagement(
    title_word_count, description_word_count,
    has_salary, salary_midpoint,
    is_senior, is_entry, is_software, is_data,
    is_manager, is_sales, is_marketing,
    is_remote, posting_dayofweek,
    cluster_choice,
):
    """Run both models on the form inputs and build the four UI outputs.

    The parameters mirror build_feature_row(), which turns them into the
    feature frame passed to ``reg_model.predict`` and ``clf_model.predict``.

    Returns:
        A 4-tuple ``(views_text, engagement_label, detail_text, figure)``.
        ``figure`` is a matplotlib Figure on success and ``None`` when the
        models are not loaded or an error occurred. On error the first element
        is ``"Error: <message>"`` and the third is the formatted traceback.
    """
    if not MODELS_LOADED:
        return "⚠️ Models not loaded.", "⚠️ Error", "Upload both .pkl files to the Space.", None

    try:
        X = build_feature_row(
            title_word_count, description_word_count,
            has_salary, salary_midpoint,
            is_senior, is_entry, is_software, is_data,
            is_manager, is_sales, is_marketing,
            is_remote, posting_dayofweek,
            cluster_choice,
        )

        log_views_pred = reg_model.predict(X)[0]
        # The regression output is inverted with expm1 (i.e. treated as
        # log1p(views)); clamp at 0 so a negative log-scale prediction can
        # never be displayed as a negative view count.
        views_pred = max(0, int(np.expm1(log_views_pred)))
        is_high = clf_model.predict(X)[0] == 1

        if is_high:
            engagement_label = "🟢 HIGH ENGAGEMENT"
            engagement_detail = (
                "This posting is predicted to land in the top 25% for views on LinkedIn."
            )
        else:
            engagement_label = "🔴 NORMAL ENGAGEMENT"
            engagement_detail = (
                "This posting is predicted to receive average or below-average views."
            )

        tips = _engagement_tips(has_salary, description_word_count, posting_dayofweek)
        if tips:
            engagement_detail += "\n\n" + "\n".join(tips)

        fig = _render_gauge(views_pred, is_high, engagement_label)

        return f"~{views_pred:,} views", engagement_label, engagement_detail, fig

    except Exception as e:
        # UI boundary: report the failure in the output widgets instead of
        # letting the callback raise.
        # NOTE(review): the full traceback is shown to the end user — fine for
        # a demo, reconsider for a public deployment.
        import traceback
        return f"Error: {e}", "Error", traceback.format_exc(), None


# ── EDA charts ────────────────────────────────────────────────────────────────
# Shared palette for the dark-themed EDA charts.

# Surfaces
DARK_BG = "#0f0f0f"  # figure background (set in style_ax)
CARD_BG = "#141414"  # axes background (set in style_ax) and legend background

# Accent colours for bars and legend patches
AMBER = "#f5a623"
GREEN = "#3a9e6e"
RED = "#c0392b"
BLUE = "#3498db"

# Text
TEXT = "#e8e8e8"  # titles and value annotations
MUTED = "#666"  # ticks and axis labels (also used as a neutral bar colour)

def style_ax(ax, fig):
    """Apply the shared dark theme to *fig* and *ax* in place.

    Sets the figure and axes backgrounds, mutes the tick and axis-label
    colours, colours the title and darkens every spine. Returns ``None``.
    """
    # Backgrounds
    fig.patch.set_facecolor(DARK_BG)
    ax.set_facecolor(CARD_BG)

    # Ticks and axis labels share the muted colour
    ax.tick_params(colors=MUTED, labelsize=10)
    for axis_label in (ax.xaxis.label, ax.yaxis.label):
        axis_label.set_color(MUTED)

    ax.title.set_color(TEXT)

    # Frame lines blend into the background
    for frame_line in ax.spines.values():
        frame_line.set_edgecolor("#1e1e1e")

def eda_salary_chart():
    """Build the Q1 bar chart: average views with vs. without salary info.

    The plotted values are hard-coded in this function. The figure is closed
    in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    categories = ["No Salary Info", "Has Salary Info"]
    avg_views = [180, 340]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(categories, avg_views, color=[RED, GREEN], height=0.45)
    # Annotate each bar just to the right of its end.
    for bar, val in zip(bars, avg_views):
        ax.text(val + 5, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q1 — Salary Transparency vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 440)
    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Salary disclosed"),
                       mpatches.Patch(color=RED,   label="No salary")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig

def eda_description_chart():
    """Build the Q2 bar chart: mean log(views+1) per description-length bucket.

    The plotted values are hard-coded in this function. The figure is closed
    in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    buckets = ["<100", "100–250", "250–500★", "500–750", "750–1000", ">1000"]
    values = [2.1, 2.8, 3.6, 3.3, 3.0, 2.5]
    colors = [RED, AMBER, GREEN, BLUE, BLUE, RED]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(buckets, values, color=colors, width=0.55)
    # Print each value centred just above its bar.
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.05,
                f"{val}", ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("Mean log(views+1)", color=MUTED)
    ax.set_title("Q2 — Description Length vs Engagement", color=TEXT, pad=10)
    ax.tick_params(axis="x", rotation=15)
    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Sweet spot: 250–500 words")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig

def eda_dayofweek_chart():
    """Build the Q3 bar chart: average views per posting day of the week.

    The plotted values are hard-coded in this function. The figure is closed
    in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    days = ["Mon", "Tue★", "Wed", "Thu", "Fri", "Sat", "Sun"]
    values = [220, 245, 235, 228, 210, 148, 132]
    colors = [BLUE, GREEN, BLUE, BLUE, BLUE, RED, RED]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(days, values, color=colors, width=0.55)
    # Print each value centred just above its bar.
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 3,
                str(val), ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("Average Views", color=MUTED)
    ax.set_title("Q3 — Day of Week vs Engagement", color=TEXT, pad=10)
    ax.legend(handles=[mpatches.Patch(color=BLUE, label="Weekday"),
                       mpatches.Patch(color=RED,  label="Weekend")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig

def eda_seniority_chart():
    """Build the Q4 bar chart: average views per seniority level.

    The plotted values are hard-coded in this function. The figure is closed
    in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    levels = ["Entry-level★", "Other / Mid", "Senior-level"]
    avg_views = [290, 210, 175]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(levels, avg_views, color=[GREEN, AMBER, RED], height=0.45)
    # Annotate each bar just to the right of its end.
    for bar, val in zip(bars, avg_views):
        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q4 — Seniority Level vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 370)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig

def eda_worktype_chart():
    """Build the Q5 bar chart: average views per work type.

    The plotted values are hard-coded in this function. The figure is closed
    in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    work_types = ["Contract★", "Internship", "Part-time", "Full-time", "Temporary"]
    avg_views = [310, 275, 235, 205, 185]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(work_types, avg_views,
                   color=[GREEN, BLUE, AMBER, MUTED, RED], height=0.45)
    # Annotate each bar just to the right of its end.
    for bar, val in zip(bars, avg_views):
        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q5 — Work Type vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 390)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig

def eda_model_chart():
    """Build the regression-model comparison chart (RMSE_log per model).

    The plotted values are hard-coded in this function; the last entry is
    highlighted in amber and named as the winner in the legend. The figure is
    closed in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    models = ["Mean Baseline", "PCA+Linear", "Lasso", "RidgeCV",
              "Linear+Features", "Gradient Boosting", "RF Controlled", "RF Tuned★"]
    rmse = [0.871, 0.844, 0.843, 0.842, 0.842, 0.837, 0.8349, 0.8347]
    # Every bar is muted except the final (highlighted) one.
    bar_colors = [MUTED] * (len(models) - 1) + [AMBER]

    fig, ax = plt.subplots(figsize=(6, 3.8))
    style_ax(ax, fig)
    bars = ax.barh(models, rmse, color=bar_colors, height=0.55)
    # Annotate each bar just to the right of its end.
    for bar, val in zip(bars, rmse):
        ax.text(val + 0.0005, bar.get_y() + bar.get_height() / 2,
                f"{val:.4f}", va="center", color=TEXT, fontsize=9)
    ax.set_xlabel("RMSE_log  (lower = better)", color=MUTED)
    ax.set_title("Regression Model Comparison", color=TEXT, pad=10)
    # Narrow x-range so the small differences between models are visible.
    ax.set_xlim(0.820, 0.886)
    ax.legend(handles=[mpatches.Patch(color=AMBER, label="Winner: RF Tuned")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig

def eda_cluster_chart():
    """Build the cluster-size chart: share of postings in each of 6 clusters.

    The plotted percentages are hard-coded in this function. The figure is
    closed in pyplot before it is returned so that repeated page loads do not
    accumulate open figures; the returned Figure can still be rendered.
    """
    labels = [f"Cluster {i}" for i in range(6)]
    sizes = [28, 18, 22, 12, 10, 10]
    colors = [AMBER, BLUE, GREEN, RED, MUTED, "#9b59b6"]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(labels, sizes, color=colors, width=0.55)
    # Print each percentage centred just above its bar.
    for bar, val in zip(bars, sizes):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.4,
                f"{val}%", ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("% of training postings", color=MUTED)
    ax.set_title("KMeans k=6 — Cluster Size Distribution", color=TEXT, pad=10)
    ax.set_ylim(0, 36)
    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    plt.close(fig)
    return fig


# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Custom CSS handed to gr.Blocks(css=...): dark page background, amber primary
# button (with a darker hover shade) and small grey form labels.
# NOTE(review): the `.gr-button-primary` selectors assume Gradio emits that
# class name for primary buttons — confirm against the Gradio version in use.
css = """
body, .gradio-container { background-color: #0a0a0a !important;
    font-family: 'Segoe UI', system-ui, sans-serif; }
.gr-button-primary { background-color: #f5a623 !important;
    color: #000 !important; font-weight: 700 !important; border: none !important; }
.gr-button-primary:hover { background-color: #d4941f !important; }
label { color: #aaa !important; font-size: 13px !important; }
"""

# Top-level app object. The layout is declared by nesting context managers
# (Blocks → Tabs → Tab → Row → Column); three tabs are defined below:
# the predictor form, the static EDA charts, and the About page.
with gr.Blocks(css=css, title="LinkedIn Engagement Dashboard",
               theme=gr.themes.Base(primary_hue="orange", neutral_hue="gray")) as demo:

    gr.Markdown(
        """
        # 📊 LinkedIn Job Posting Engagement Dashboard
        **Which posting characteristics predict candidate engagement?**
        *Assignment 2 — Regression, Classification & Clustering · LinkedIn Job Postings*
        """
    )

    with gr.Tabs():

        # ── TAB 1: PREDICTOR ─────────────────────────────────────────────────
        with gr.Tab("🎯 Engagement Predictor"):
            gr.Markdown(
                """
                ### Predict engagement for a new job posting
                Fill in the posting details — the model will estimate views and classify
                whether it is likely to reach **high engagement** (top 25%).
                """
            )

            # Input form: three equal-width columns (content, role flags,
            # timing / location / cluster).
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### 📝 Content")
                    title_word_count       = gr.Slider(1, 20, value=5, step=1,
                                                       label="Title word count")
                    description_word_count = gr.Slider(10, 1200, value=350, step=10,
                                                       label="Description word count")
                    has_salary             = gr.Checkbox(value=True,
                                                         label="Salary information disclosed")
                    salary_midpoint        = gr.Slider(0, 300000, value=85000, step=5000,
                                                       label="Salary midpoint ($)")

                with gr.Column(scale=1):
                    gr.Markdown("#### 🏷️ Role Type")
                    is_senior    = gr.Checkbox(value=False, label="Senior role")
                    is_entry     = gr.Checkbox(value=False, label="Entry-level role")
                    is_software  = gr.Checkbox(value=False, label="Software / Engineering role")
                    is_data      = gr.Checkbox(value=False, label="Data / Analytics role")
                    is_manager   = gr.Checkbox(value=False, label="Manager / Director role")
                    is_sales     = gr.Checkbox(value=False, label="Sales role")
                    is_marketing = gr.Checkbox(value=False, label="Marketing role")

                with gr.Column(scale=1):
                    gr.Markdown("#### 📅 Timing, Location & Cluster")
                    is_remote         = gr.Checkbox(value=False, label="Remote role")
                    posting_dayofweek = gr.Slider(0, 6, value=1, step=1,
                                                  label="Day posted (0=Mon … 6=Sun)")
                    gr.Markdown(
                        "<small style='color:#555'>**Cluster** — pick the segment that best "
                        "describes this posting. If unsure, use Cluster 0.</small>"
                    )
                    # The choice strings come from CLUSTER_LABELS; the selected
                    # string is passed on to predict_engagement unchanged.
                    cluster_choice = gr.Dropdown(
                        choices=CLUSTER_LABELS,
                        value=CLUSTER_LABELS[0],
                        label="Posting cluster (KMeans k=6)",
                    )

            predict_btn = gr.Button("🔍 Predict Engagement", variant="primary", size="lg")

            # Output widgets, filled from predict_engagement's 4-tuple return.
            with gr.Row():
                views_out = gr.Textbox(label="📈 Predicted Views",     interactive=False)
                label_out = gr.Textbox(label="🏷️ Engagement Class",    interactive=False)

            detail_out = gr.Textbox(label="📋 Interpretation & Tips",  interactive=False, lines=5)
            gauge_out  = gr.Plot(label="Result")

            # `inputs` is listed in the same order as predict_engagement's
            # parameters, and `outputs` in the order of its returned tuple.
            predict_btn.click(
                fn=predict_engagement,
                inputs=[
                    title_word_count, description_word_count,
                    has_salary, salary_midpoint,
                    is_senior, is_entry, is_software, is_data,
                    is_manager, is_sales, is_marketing,
                    is_remote, posting_dayofweek,
                    cluster_choice,
                ],
                outputs=[views_out, label_out, detail_out, gauge_out],
            )

            gr.Markdown(
                "> **Note:** Predictions use posting-level features only. "
                "Platform factors (LinkedIn algorithm, sponsored status, company followers) "
                "are unobservable and account for most real-world variance."
            )

        # ── TAB 2: EDA DASHBOARD ─────────────────────────────────────────────
        with gr.Tab("📊 EDA Dashboard"):
            gr.Markdown("### Key Findings from Exploratory Data Analysis")

            # Empty plot placeholders, two per row; they are filled by the
            # demo.load() handler at the end of this tab.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q1 — Salary transparency vs views**")
                    salary_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q2 — Description length vs views**")
                    desc_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q3 — Day of week vs views**")
                    day_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q4 — Seniority level vs views**")
                    seniority_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q5 — Work type vs views**")
                    worktype_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**KMeans k=6 cluster distribution**")
                    cluster_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Regression model comparison**")
                    model_plot = gr.Plot()

            gr.Markdown(
                """
                ---
                **Key takeaways:**
                💰 Salary transparency → ~90% more views &nbsp;|&nbsp;
                📝 Sweet spot: 250–500 words &nbsp;|&nbsp;
                📅 Post Tue–Thu &nbsp;|&nbsp;
                👶 Entry-level = larger candidate pool &nbsp;|&nbsp;
                📋 Contract roles outperform full-time
                """
            )

            # Build all seven charts in one load handler. The tuple order must
            # match `outputs` (note: cluster chart comes before model chart).
            demo.load(
                fn=lambda: (
                    eda_salary_chart(), eda_description_chart(),
                    eda_dayofweek_chart(), eda_seniority_chart(),
                    eda_worktype_chart(), eda_cluster_chart(),
                    eda_model_chart(),
                ),
                outputs=[salary_plot, desc_plot, day_plot,
                         seniority_plot, worktype_plot, cluster_plot,
                         model_plot],
            )

        # ── TAB 3: ABOUT ─────────────────────────────────────────────────────
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
                ## About This Project

                **Dataset:** LinkedIn Job Postings — arshkon/linkedin-job-postings (Kaggle)
                **Sample:** 30,000 rows from 123,850 · `random_state=42`
                **Target:** `views` — job posting view count

                ---
                ### Models
                | Model | Type | Metric |
                |---|---|---|
                | Random Forest (Tuned) | Regression | RMSE_log 0.8347 · R² 0.081 |
                | Decision Tree | Classification | Highest F1 + Recall for Class 1 |

                ### All 30 features
                | Group | Features |
                |---|---|
                | Text length | title_length, title_word_count, description_length, description_word_count |
                | Text structure | description_density, title_desc_ratio |
                | Salary | salary_midpoint, salary_range, has_salary_info, salary_log |
                | Role keywords | is_senior_role, is_entry_role, is_software_role, is_data_role, is_manager_role, is_sales_role, is_marketing_role |
                | Interactions | desc_salary_interaction, senior_salary, weekend_remote, title_desc_word_interaction, salary_density_interaction, salary_description_interaction, title_density_interaction |
                | Clustering | cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 |

                ---
                ### Clustering
                KMeans k=6 · Silhouette score 0.289 · Fit on training data only
                k=7/8 rejected — produced near-singleton clusters

                ---
                ### Limitations
                - R²≈0.08 reflects unobservable platform factors
                - Associations shown, not causal relationships
                - High-engagement threshold is a business rule (top 25%), not a natural label
                - Cluster labels in the predictor are interpretive approximations
                """
            )

# Start the Gradio server.
# NOTE(review): this runs whenever the module is executed or imported — there
# is no `if __name__ == "__main__":` guard.
demo.launch()