"""Gradio dashboard that predicts engagement for LinkedIn job postings.

This part of the module holds the imports, the feature schema, the pickled
model loading and the helper that turns UI inputs into a model-ready row.
"""

import pickle
import warnings

import gradio as gr
import matplotlib

# Select the non-interactive backend before pyplot is imported (same order
# as the original file): the app only renders figures to images.
matplotlib.use("Agg")

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# ── Exact 30 features the model was trained on ────────────────────────────────
FEATURE_COLS = [
    "title_length",
    "title_word_count",
    "description_length",
    "description_word_count",
    "description_density",
    "title_desc_ratio",
    "salary_midpoint",
    "salary_range",
    "has_salary_info",
    "salary_log",
    "desc_salary_interaction",
    "senior_salary",
    "weekend_remote",
    "is_senior_role",
    "is_entry_role",
    "is_software_role",
    "is_data_role",
    "is_manager_role",
    "is_sales_role",
    "is_marketing_role",
    "title_desc_word_interaction",
    "salary_density_interaction",
    "salary_description_interaction",
    "title_density_interaction",
    "cluster_0",
    "cluster_1",
    "cluster_2",
    "cluster_3",
    "cluster_4",
    "cluster_5",
]

CLUSTER_LABELS = [
    "Cluster 0 — General / Mixed roles",
    "Cluster 1 — High-salary specialist roles",
    "Cluster 2 — Tech & software roles",
    "Cluster 3 — Entry-level / high-volume roles",
    "Cluster 4 — Contract & flexible roles",
    "Cluster 5 — Senior leadership roles",
]

# ── Load models ───────────────────────────────────────────────────────────────
# NOTE(security): pickle.load runs arbitrary code stored in the file. Only ship
# .pkl files that were produced by this project's own training run.
#
# All four names are bound up front so later code can reference them on every
# path (previously MODEL_ERROR only existed after a failed load, and the model
# names only after a successful one).
MODELS_LOADED = False
MODEL_ERROR = ""
reg_model = None
clf_model = None
try:
    with open("linkedin_regression_model.pkl", "rb") as f:
        reg_model = pickle.load(f)
    with open("linkedin_classification_model.pkl", "rb") as f:
        clf_model = pickle.load(f)
    MODELS_LOADED = True
except Exception as e:  # top-level boundary: the UI reports the failure instead of crashing
    MODELS_LOADED = False
    # Include the exception type: str(e) alone is empty for some exceptions.
    MODEL_ERROR = f"{type(e).__name__}: {e}"


# ── Feature builder ───────────────────────────────────────────────────────────
def _cluster_index(cluster_choice):
    """Return the 0-based cluster id encoded in a ``CLUSTER_LABELS`` entry.

    Accepts a label such as ``"Cluster 2 — Tech & software roles"`` or a plain
    integer. ``None`` or an empty string falls back to cluster 0.

    Raises:
        ValueError: if no integer id can be parsed, or the id is outside
            ``range(len(CLUSTER_LABELS))``. An out-of-range id would otherwise
            produce an all-zero one-hot vector.
    """
    if cluster_choice is None or cluster_choice == "":
        return 0
    if isinstance(cluster_choice, (int, np.integer)):
        idx = int(cluster_choice)
    else:
        # Labels look like "Cluster <n> — <description>"; keep the part before the dash.
        head = str(cluster_choice).split("—")[0]
        idx = int(head.replace("Cluster", "").strip())
    if not 0 <= idx < len(CLUSTER_LABELS):
        raise ValueError(
            f"cluster index {idx} is outside 0..{len(CLUSTER_LABELS) - 1}"
        )
    return idx


def build_feature_row(
    title_word_count,
    description_word_count,
    has_salary,
    salary_midpoint,
    is_senior,
    is_entry,
    is_software,
    is_data,
    is_manager,
    is_sales,
    is_marketing,
    is_remote,
    posting_dayofweek,
    cluster_choice,
):
    """Build the single-row feature frame expected by the pickled models.

    Several features are derived from the word counts with fixed multipliers
    rather than measured from real text: ``title_length`` is 6 × title words,
    ``description_length`` is 5 × description words and ``salary_range`` is
    30 % of the salary midpoint.

    Args:
        title_word_count: number of words in the title (coerced with ``int``).
        description_word_count: number of words in the description.
        has_salary: truthy when salary information is disclosed; when falsy
            the salary midpoint is treated as 0.
        salary_midpoint: salary midpoint; ``None`` and negative values are
            treated as 0 so ``log1p`` stays finite.
        is_senior, is_entry, is_software, is_data, is_manager, is_sales,
        is_marketing: role flags, stored as 0/1.
        is_remote: remote flag, only used for the ``weekend_remote`` interaction.
        posting_dayofweek: day index; values >= 5 count as weekend.
        cluster_choice: an entry of ``CLUSTER_LABELS`` (or an integer id).

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are exactly
        ``FEATURE_COLS``, in that order.

    Raises:
        ValueError: if ``cluster_choice`` does not identify a known cluster.
    """
    twc = int(title_word_count)
    dwc = int(description_word_count)
    salary_disclosed = bool(has_salary)

    # Salary block: everything collapses to zero when salary is not disclosed.
    if salary_disclosed:
        salary_midpoint_val = max(float(salary_midpoint or 0.0), 0.0)
    else:
        salary_midpoint_val = 0.0
    salary_log = np.log1p(salary_midpoint_val)
    salary_range = salary_midpoint_val * 0.3

    # Text-shape proxies derived from the word counts.
    title_length = twc * 6
    description_length = dwc * 5
    description_density = (dwc / max(description_length, 1)) * 100
    title_desc_ratio = twc / max(dwc, 1)

    posting_weekend = int(int(posting_dayofweek) >= 5)

    # Cluster one-hot — exactly one cluster is active.
    cluster_idx = _cluster_index(cluster_choice)
    cluster_flags = {
        f"cluster_{i}": int(i == cluster_idx) for i in range(len(CLUSTER_LABELS))
    }

    row = {
        "title_length": title_length,
        "title_word_count": twc,
        "description_length": description_length,
        "description_word_count": dwc,
        "description_density": description_density,
        "title_desc_ratio": title_desc_ratio,
        "salary_midpoint": salary_midpoint_val,
        "salary_range": salary_range,
        "has_salary_info": int(salary_disclosed),
        "salary_log": salary_log,
        "desc_salary_interaction": dwc * salary_log,
        "senior_salary": int(is_senior) * salary_log,
        "weekend_remote": posting_weekend * int(is_remote),
        "is_senior_role": int(is_senior),
        "is_entry_role": int(is_entry),
        "is_software_role": int(is_software),
        "is_data_role": int(is_data),
        "is_manager_role": int(is_manager),
        "is_sales_role": int(is_sales),
        "is_marketing_role": int(is_marketing),
        "title_desc_word_interaction": twc * dwc,
        "salary_density_interaction": salary_log * description_density,
        "salary_description_interaction": salary_log * dwc,
        "title_density_interaction": twc * description_density,
        **cluster_flags,
    }
    # Selecting by FEATURE_COLS fixes the column order and fails loudly
    # (KeyError) if the schema and the row ever drift apart.
    return pd.DataFrame([row])[FEATURE_COLS]
# ── Palette ───────────────────────────────────────────────────────────────────
# Shared by the prediction gauge and the EDA charts.
DARK_BG = "#0f0f0f"
CARD_BG = "#141414"
AMBER = "#f5a623"
GREEN = "#3a9e6e"
RED = "#c0392b"
BLUE = "#3498db"
TEXT = "#e8e8e8"
MUTED = "#666"

# Predicted view count at which the gauge bar is completely filled.
VIEWS_GAUGE_MAX = 1500


def _finish_figure(fig, **layout_kwargs):
    """Apply tight layout, unregister *fig* from pyplot and return it.

    ``plt.subplots`` keeps every figure alive in pyplot's global registry
    until it is closed. This app creates figures on every prediction and on
    every page load, so leaving them registered grows memory without bound.
    The returned ``Figure`` object itself stays usable for rendering.
    """
    fig.tight_layout(**layout_kwargs)
    plt.close(fig)
    return fig


# ── Prediction ────────────────────────────────────────────────────────────────
def _engagement_tips(has_salary, description_word_count, posting_dayofweek):
    """Return the list of improvement tips that apply to the given inputs."""
    words = int(description_word_count)
    tips = []
    if not has_salary:
        tips.append("💡 Adding salary info is associated with ~90% more views.")
    if words < 250:
        tips.append("💡 Descriptions of 250–500 words tend to perform best.")
    if words > 750:
        tips.append(
            "💡 Very long descriptions can reduce engagement — try trimming to under 750 words."
        )
    if int(posting_dayofweek) >= 5:
        tips.append("💡 Posting mid-week (Tue–Thu) gets more views than weekends.")
    return tips


def _engagement_gauge(views_pred, engagement_label, bar_color):
    """Draw the horizontal gauge shown next to a prediction and return the figure."""
    fill_pct = min(max(views_pred, 0) / VIEWS_GAUGE_MAX, 1.0)

    fig, ax = plt.subplots(figsize=(5, 2.5))
    fig.patch.set_facecolor(DARK_BG)
    ax.set_facecolor(DARK_BG)
    ax.barh(0, 1, color="#1e1e1e", height=0.4, edgecolor="none")  # empty track
    ax.barh(0, fill_pct, color=bar_color, height=0.4, edgecolor="none")
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.6, 0.6)
    ax.axis("off")

    # Text is positioned in DATA coordinates: with ylim (-0.6, 0.6) and a bar
    # spanning -0.2..0.2, y=0.55 sits above the bar and y=-0.35 below it.
    # Using axes coordinates here put the label on top of the bar in the
    # bar's own colour, hiding it whenever the bar was filled past the middle.
    ax.text(
        0.5, 0.55, engagement_label,
        ha="center", va="center", color=bar_color,
        fontsize=12, fontweight="bold",
    )
    ax.text(
        0.5, -0.35, f"Predicted: {views_pred:,} views",
        ha="center", va="center", color="#e8e8e8",
        fontsize=13, fontweight="bold",
    )
    return _finish_figure(fig, pad=0.3)


def predict_engagement(
    title_word_count,
    description_word_count,
    has_salary,
    salary_midpoint,
    is_senior,
    is_entry,
    is_software,
    is_data,
    is_manager,
    is_sales,
    is_marketing,
    is_remote,
    posting_dayofweek,
    cluster_choice,
):
    """Run both models on the UI inputs and format the four UI outputs.

    The regression model's output is passed through ``expm1`` to obtain a
    view count (clamped at 0); the classifier's output ``1`` is reported as
    high engagement.

    Returns:
        A 4-tuple ``(views_text, engagement_label, detail_text, figure)``.
        ``figure`` is ``None`` when the models are not loaded or when an
        error occurs; in the error case ``detail_text`` holds the traceback.
    """
    if not MODELS_LOADED:
        detail = "Upload both .pkl files to the Space."
        # Surface the recorded load failure: the cause is not always a
        # missing file (e.g. an unpickling error).
        if MODEL_ERROR:
            detail += f"\n\nLoad error: {MODEL_ERROR}"
        return "⚠️ Models not loaded.", "⚠️ Error", detail, None

    try:
        X = build_feature_row(
            title_word_count, description_word_count, has_salary, salary_midpoint,
            is_senior, is_entry, is_software, is_data, is_manager, is_sales,
            is_marketing, is_remote, posting_dayofweek, cluster_choice,
        )

        log_views_pred = reg_model.predict(X)[0]
        # expm1 of a negative log-prediction is negative; a view count cannot be.
        views_pred = max(int(np.expm1(log_views_pred)), 0)
        clf_pred = clf_model.predict(X)[0]
        is_high = clf_pred == 1

        if is_high:
            engagement_label = "🟢 HIGH ENGAGEMENT"
            engagement_detail = (
                "This posting is predicted to land in the top 25% for views on LinkedIn."
            )
            bar_color = GREEN
        else:
            engagement_label = "🔴 NORMAL ENGAGEMENT"
            engagement_detail = (
                "This posting is predicted to receive average or below-average views."
            )
            bar_color = RED

        tips = _engagement_tips(has_salary, description_word_count, posting_dayofweek)
        if tips:
            engagement_detail += "\n\n" + "\n".join(tips)

        fig = _engagement_gauge(views_pred, engagement_label, bar_color)
        return f"~{views_pred:,} views", engagement_label, engagement_detail, fig

    except Exception as e:  # UI boundary: show the failure instead of crashing the app
        import traceback

        return f"Error: {e}", "Error", traceback.format_exc(), None


# ── EDA charts ────────────────────────────────────────────────────────────────
# The numbers plotted below are literals embedded in this file; nothing is
# recomputed from the dataset at runtime.
def style_ax(ax, fig):
    """Apply the dashboard's dark theme to *ax* and its parent *fig*."""
    fig.patch.set_facecolor(DARK_BG)
    ax.set_facecolor(CARD_BG)
    ax.tick_params(colors=MUTED, labelsize=10)
    ax.xaxis.label.set_color(MUTED)
    ax.yaxis.label.set_color(MUTED)
    ax.title.set_color(TEXT)
    for spine in ax.spines.values():
        spine.set_edgecolor("#1e1e1e")


def _new_chart(figsize=(6, 3.2)):
    """Create a themed figure/axes pair."""
    fig, ax = plt.subplots(figsize=figsize)
    style_ax(ax, fig)
    return fig, ax


def _add_legend(ax, *entries):
    """Add a themed legend built from ``(color, label)`` pairs."""
    handles = [mpatches.Patch(color=color, label=label) for color, label in entries]
    ax.legend(handles=handles, facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)


def _barh_avg_views(ax, labels, values, colors, label_offset):
    """Draw horizontal bars, each annotated with "~<value> avg views"."""
    bars = ax.barh(labels, values, color=colors, height=0.45)
    for bar, val in zip(bars, values):
        ax.text(
            val + label_offset, bar.get_y() + bar.get_height() / 2,
            f"~{val} avg views", va="center", color=TEXT, fontsize=11,
        )


def _bar_with_value_labels(ax, labels, values, colors, label_offset, label_fmt="{}"):
    """Draw vertical bars, each annotated above with its formatted value."""
    bars = ax.bar(labels, values, color=colors, width=0.55)
    for bar, val in zip(bars, values):
        ax.text(
            bar.get_x() + bar.get_width() / 2, val + label_offset,
            label_fmt.format(val), ha="center", va="bottom", color=TEXT, fontsize=10,
        )


def eda_salary_chart():
    """Q1: average views with vs. without salary information."""
    fig, ax = _new_chart()
    _barh_avg_views(
        ax, ["No Salary Info", "Has Salary Info"], [180, 340], [RED, GREEN], 5
    )
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q1 — Salary Transparency vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 440)
    _add_legend(ax, (GREEN, "Salary disclosed"), (RED, "No salary"))
    return _finish_figure(fig)


def eda_description_chart():
    """Q2: mean log(views+1) per description-length bucket."""
    fig, ax = _new_chart()
    buckets = ["<100", "100–250", "250–500★", "500–750", "750–1000", ">1000"]
    values = [2.1, 2.8, 3.6, 3.3, 3.0, 2.5]
    colors = [RED, AMBER, GREEN, BLUE, BLUE, RED]
    _bar_with_value_labels(ax, buckets, values, colors, 0.05)
    ax.set_ylabel("Mean log(views+1)", color=MUTED)
    ax.set_title("Q2 — Description Length vs Engagement", color=TEXT, pad=10)
    ax.tick_params(axis="x", rotation=15)
    _add_legend(ax, (GREEN, "Sweet spot: 250–500 words"))
    return _finish_figure(fig)


def eda_dayofweek_chart():
    """Q3: average views per posting weekday."""
    fig, ax = _new_chart()
    days = ["Mon", "Tue★", "Wed", "Thu", "Fri", "Sat", "Sun"]
    values = [220, 245, 235, 228, 210, 148, 132]
    colors = [BLUE, GREEN, BLUE, BLUE, BLUE, RED, RED]
    _bar_with_value_labels(ax, days, values, colors, 3)
    ax.set_ylabel("Average Views", color=MUTED)
    ax.set_title("Q3 — Day of Week vs Engagement", color=TEXT, pad=10)
    _add_legend(ax, (BLUE, "Weekday"), (RED, "Weekend"))
    return _finish_figure(fig)


def eda_seniority_chart():
    """Q4: average views per seniority level."""
    fig, ax = _new_chart()
    _barh_avg_views(
        ax,
        ["Entry-level★", "Other / Mid", "Senior-level"],
        [290, 210, 175],
        [GREEN, AMBER, RED],
        4,
    )
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q4 — Seniority Level vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 370)
    return _finish_figure(fig)


def eda_worktype_chart():
    """Q5: average views per work type."""
    fig, ax = _new_chart()
    _barh_avg_views(
        ax,
        ["Contract★", "Internship", "Part-time", "Full-time", "Temporary"],
        [310, 275, 235, 205, 185],
        [GREEN, BLUE, AMBER, MUTED, RED],
        4,
    )
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q5 — Work Type vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 390)
    return _finish_figure(fig)


def eda_model_chart():
    """RMSE_log of each regression model tried; the last entry is highlighted."""
    fig, ax = _new_chart(figsize=(6, 3.8))
    models = [
        "Mean Baseline", "PCA+Linear", "Lasso", "RidgeCV", "Linear+Features",
        "Gradient Boosting", "RF Controlled", "RF Tuned★",
    ]
    rmse = [0.871, 0.844, 0.843, 0.842, 0.842, 0.837, 0.8349, 0.8347]
    bars = ax.barh(models, rmse, color=[MUTED] * 7 + [AMBER], height=0.55)
    for bar, val in zip(bars, rmse):
        ax.text(
            val + 0.0005, bar.get_y() + bar.get_height() / 2,
            f"{val:.4f}", va="center", color=TEXT, fontsize=9,
        )
    ax.set_xlabel("RMSE_log (lower = better)", color=MUTED)
    ax.set_title("Regression Model Comparison", color=TEXT, pad=10)
    # Narrow x-range so the small differences between models stay visible.
    ax.set_xlim(0.820, 0.886)
    _add_legend(ax, (AMBER, "Winner: RF Tuned"))
    return _finish_figure(fig)


def eda_cluster_chart():
    """Share of training postings assigned to each of the six clusters."""
    fig, ax = _new_chart()
    labels = [f"Cluster {i}" for i in range(6)]
    sizes = [28, 18, 22, 12, 10, 10]
    colors = [AMBER, BLUE, GREEN, RED, MUTED, "#9b59b6"]
    _bar_with_value_labels(ax, labels, sizes, colors, 0.4, "{}%")
    ax.set_ylabel("% of training postings", color=MUTED)
    ax.set_title("KMeans k=6 — Cluster Size Distribution", color=TEXT, pad=10)
    ax.set_ylim(0, 36)
    return _finish_figure(fig)


def _render_eda_charts():
    """Build every EDA figure, in the order of the ``demo.load`` outputs."""
    return (
        eda_salary_chart(),
        eda_description_chart(),
        eda_dayofweek_chart(),
        eda_seniority_chart(),
        eda_worktype_chart(),
        eda_cluster_chart(),
        eda_model_chart(),
    )


# ── Gradio UI ─────────────────────────────────────────────────────────────────
# NOTE(review): line breaks inside the multi-line strings below and the exact
# nesting of the layout containers were reconstructed from a flattened copy of
# this file — confirm against the deployed app.
css = """
body, .gradio-container { background-color: #0a0a0a !important; font-family: 'Segoe UI', system-ui, sans-serif; }
.gr-button-primary { background-color: #f5a623 !important; color: #000 !important; font-weight: 700 !important; border: none !important; }
.gr-button-primary:hover { background-color: #d4941f !important; }
label { color: #aaa !important; font-size: 13px !important; }
"""

# Markdown blocks are kept at column 0 so they render the same whether or not
# the Markdown component strips leading indentation.
_HEADER_MD = """
# 📊 LinkedIn Job Posting Engagement Dashboard
**Which posting characteristics predict candidate engagement?**
*Assignment 2 — Regression, Classification & Clustering · LinkedIn Job Postings*
"""

_PREDICTOR_INTRO_MD = """
### Predict engagement for a new job posting
Fill in the posting details — the model will estimate views and classify whether it
is likely to reach **high engagement** (top 25%).
"""

_TAKEAWAYS_MD = """
---
**Key takeaways:**
💰 Salary transparency → ~90% more views  |  📝 Sweet spot: 250–500 words  |  📅 Post Tue–Thu  |  👶 Entry-level = larger candidate pool  |  📋 Contract roles outperform full-time
"""

_ABOUT_MD = """
## About This Project
**Dataset:** LinkedIn Job Postings — arshkon/linkedin-job-postings (Kaggle)
**Sample:** 30,000 rows from 123,850 · `random_state=42`
**Target:** `views` — job posting view count

---

### Models

| Model | Type | Metric |
|---|---|---|
| Random Forest (Tuned) | Regression | RMSE_log 0.8347 · R² 0.081 |
| Decision Tree | Classification | Highest F1 + Recall for Class 1 |

### All 30 features

| Group | Features |
|---|---|
| Text length | title_length, title_word_count, description_length, description_word_count |
| Text structure | description_density, title_desc_ratio |
| Salary | salary_midpoint, salary_range, has_salary_info, salary_log |
| Role keywords | is_senior_role, is_entry_role, is_software_role, is_data_role, is_manager_role, is_sales_role, is_marketing_role |
| Interactions | desc_salary_interaction, senior_salary, weekend_remote, title_desc_word_interaction, salary_density_interaction, salary_description_interaction, title_density_interaction |
| Clustering | cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 |

---

### Clustering
KMeans k=6 · Silhouette score 0.289 · Fit on training data only
k=7/8 rejected — produced near-singleton clusters

---

### Limitations
- R²≈0.08 reflects unobservable platform factors
- Associations shown, not causal relationships
- High-engagement threshold is a business rule (top 25%), not a natural label
- Cluster labels in the predictor are interpretive approximations
"""

with gr.Blocks(
    css=css,
    title="LinkedIn Engagement Dashboard",
    theme=gr.themes.Base(primary_hue="orange", neutral_hue="gray"),
) as demo:

    gr.Markdown(_HEADER_MD)

    with gr.Tabs():

        # ── TAB 1: PREDICTOR ─────────────────────────────────────────────────
        with gr.Tab("🎯 Engagement Predictor"):
            gr.Markdown(_PREDICTOR_INTRO_MD)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### 📝 Content")
                    title_word_count = gr.Slider(
                        1, 20, value=5, step=1, label="Title word count"
                    )
                    description_word_count = gr.Slider(
                        10, 1200, value=350, step=10, label="Description word count"
                    )
                    has_salary = gr.Checkbox(
                        value=True, label="Salary information disclosed"
                    )
                    salary_midpoint = gr.Slider(
                        0, 300000, value=85000, step=5000, label="Salary midpoint ($)"
                    )

                with gr.Column(scale=1):
                    gr.Markdown("#### 🏷️ Role Type")
                    is_senior = gr.Checkbox(value=False, label="Senior role")
                    is_entry = gr.Checkbox(value=False, label="Entry-level role")
                    is_software = gr.Checkbox(
                        value=False, label="Software / Engineering role"
                    )
                    is_data = gr.Checkbox(value=False, label="Data / Analytics role")
                    is_manager = gr.Checkbox(
                        value=False, label="Manager / Director role"
                    )
                    is_sales = gr.Checkbox(value=False, label="Sales role")
                    is_marketing = gr.Checkbox(value=False, label="Marketing role")

                with gr.Column(scale=1):
                    gr.Markdown("#### 📅 Timing, Location & Cluster")
                    is_remote = gr.Checkbox(value=False, label="Remote role")
                    posting_dayofweek = gr.Slider(
                        0, 6, value=1, step=1, label="Day posted (0=Mon … 6=Sun)"
                    )
                    gr.Markdown(
                        "**Cluster** — pick the segment that best "
                        "describes this posting. If unsure, use Cluster 0."
                    )
                    cluster_choice = gr.Dropdown(
                        choices=CLUSTER_LABELS,
                        value=CLUSTER_LABELS[0],
                        label="Posting cluster (KMeans k=6)",
                    )

            predict_btn = gr.Button(
                "🔍 Predict Engagement", variant="primary", size="lg"
            )

            with gr.Row():
                views_out = gr.Textbox(label="📈 Predicted Views", interactive=False)
                label_out = gr.Textbox(label="🏷️ Engagement Class", interactive=False)

            detail_out = gr.Textbox(
                label="📋 Interpretation & Tips", interactive=False, lines=5
            )
            gauge_out = gr.Plot(label="Result")

            # Input order must match predict_engagement's parameter order.
            predict_btn.click(
                fn=predict_engagement,
                inputs=[
                    title_word_count, description_word_count, has_salary,
                    salary_midpoint, is_senior, is_entry, is_software, is_data,
                    is_manager, is_sales, is_marketing, is_remote,
                    posting_dayofweek, cluster_choice,
                ],
                outputs=[views_out, label_out, detail_out, gauge_out],
            )

            gr.Markdown(
                "> **Note:** Predictions use posting-level features only. "
                "Platform factors (LinkedIn algorithm, sponsored status, company followers) "
                "are unobservable and account for most real-world variance."
            )

        # ── TAB 2: EDA DASHBOARD ─────────────────────────────────────────────
        with gr.Tab("📊 EDA Dashboard"):
            gr.Markdown("### Key Findings from Exploratory Data Analysis")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q1 — Salary transparency vs views**")
                    salary_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q2 — Description length vs views**")
                    desc_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q3 — Day of week vs views**")
                    day_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q4 — Seniority level vs views**")
                    seniority_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q5 — Work type vs views**")
                    worktype_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**KMeans k=6 cluster distribution**")
                    cluster_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Regression model comparison**")
                    model_plot = gr.Plot()

            gr.Markdown(_TAKEAWAYS_MD)

            # Output order must match the tuple returned by _render_eda_charts.
            demo.load(
                fn=_render_eda_charts,
                outputs=[
                    salary_plot, desc_plot, day_plot, seniority_plot,
                    worktype_plot, cluster_plot, model_plot,
                ],
            )

        # ── TAB 3: ABOUT ─────────────────────────────────────────────────────
        with gr.Tab("ℹ️ About"):
            gr.Markdown(_ABOUT_MD)

demo.launch()