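# Source: Hugging Face file view of "app.py" (commit 8c0e1ad, verified, 24.5 kB),
# uploaded by MichaelYitzchak. Viewer navigation text (Raw / History / Blame /
# Contribute / Delete) is page chrome and not part of the program.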
import gradio as gr
import pandas as pd
import numpy as np
import pickle
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
warnings.filterwarnings("ignore")
# ── Exact 30 features the model was trained on ────────────────────────────────
# Column names, in the order build_feature_row() uses to select the columns of
# the single-row DataFrame handed to both models. Keep names and order in sync
# with the `row` dict built there.
FEATURE_COLS = [
# Text length / structure
"title_length",
"title_word_count",
"description_length",
"description_word_count",
"description_density",
"title_desc_ratio",
# Salary
"salary_midpoint",
"salary_range",
"has_salary_info",
"salary_log",
# Interactions
"desc_salary_interaction",
"senior_salary",
"weekend_remote",
# Role keyword flags
"is_senior_role",
"is_entry_role",
"is_software_role",
"is_data_role",
"is_manager_role",
"is_sales_role",
"is_marketing_role",
# More interactions
"title_desc_word_interaction",
"salary_density_interaction",
"salary_description_interaction",
"title_density_interaction",
# One-hot cluster membership
"cluster_0",
"cluster_1",
"cluster_2",
"cluster_3",
"cluster_4",
"cluster_5",
]
# Human-readable choices for the "Posting cluster" dropdown. build_feature_row()
# extracts the integer cluster index from the text that follows "Cluster" in the
# chosen label, so every entry must keep its "Cluster <n>" prefix.
# NOTE(review): the separator after the index looks like a mis-encoded em dash
# (UTF-8 read through a single-byte codepage) — confirm the file encoding. Any
# change to it must be checked against the label parsing in build_feature_row().
CLUSTER_LABELS = [
"Cluster 0 β€” General / Mixed roles",
"Cluster 1 β€” High-salary specialist roles",
"Cluster 2 β€” Tech & software roles",
"Cluster 3 β€” Entry-level / high-volume roles",
"Cluster 4 β€” Contract & flexible roles",
"Cluster 5 β€” Senior leadership roles",
]
# ── Load models ───────────────────────────────────────────────────────────────
# Both estimators are unpickled once, at import time, from the current working
# directory. All four module-level names below are always defined afterwards:
#   reg_model / clf_model  -> the loaded estimators, or None if loading failed
#   MODELS_LOADED          -> True only when BOTH files loaded
#   MODEL_ERROR            -> text of the load failure ("" on success)
# NOTE(security): pickle.load() executes code embedded in the file. Only ship
# .pkl files that you produced yourself.
reg_model = None
clf_model = None
MODELS_LOADED = False
MODEL_ERROR = ""
try:
    with open("linkedin_regression_model.pkl", "rb") as f:
        reg_model = pickle.load(f)
    with open("linkedin_classification_model.pkl", "rb") as f:
        clf_model = pickle.load(f)
    MODELS_LOADED = True
except Exception as e:  # startup boundary: a bad/missing model must not stop the UI
    # Drop a half-loaded pair so callers never see one model without the other.
    reg_model = None
    clf_model = None
    MODEL_ERROR = str(e) or type(e).__name__
# ── Feature builder ───────────────────────────────────────────────────────────
def build_feature_row(
    title_word_count,
    description_word_count,
    has_salary,
    salary_midpoint,
    is_senior,
    is_entry,
    is_software,
    is_data,
    is_manager,
    is_sales,
    is_marketing,
    is_remote,
    posting_dayofweek,
    cluster_choice,
):
    """Build the one-row feature DataFrame that both models consume.

    The UI only collects word counts, flags, a salary midpoint, a weekday and
    a cluster label; the remaining columns are derived here:

    * character lengths are approximated from word counts (6 characters per
      title word, 5 per description word);
    * ``salary_range`` is taken as 30% of the midpoint;
    * salary is forced to 0 when ``has_salary`` is false;
    * a posting counts as "weekend" when ``posting_dayofweek`` is 5 or 6.

    Args:
        title_word_count: Number of words in the title (int-convertible).
        description_word_count: Number of words in the description.
        has_salary: Truthy when salary information is disclosed.
        salary_midpoint: Salary midpoint; ignored unless ``has_salary``.
        is_senior, is_entry, is_software, is_data, is_manager, is_sales,
        is_marketing, is_remote: Boolean role/location flags.
        posting_dayofweek: Weekday index, 0 = Monday ... 6 = Sunday.
        cluster_choice: Cluster label such as ``"Cluster 2 ..."`` (any text
            may follow the index), or a bare index such as ``2`` / ``"2"``.

    Returns:
        pandas.DataFrame with exactly one row and the columns of
        ``FEATURE_COLS``, in that order.

    Raises:
        ValueError: if no cluster index can be read from ``cluster_choice`` or
            the index is outside ``0..5``.
    """
    import re  # local import keeps this block self-contained

    n_clusters = 6

    salary_midpoint_val = float(salary_midpoint) if has_salary else 0.0
    salary_log = np.log1p(salary_midpoint_val)
    salary_range = salary_midpoint_val * 0.3

    twc = int(title_word_count)
    dwc = int(description_word_count)
    title_length = twc * 6
    description_length = dwc * 5
    # max(..., 1) guards the divisions against a zero word count.
    description_density = (dwc / max(description_length, 1)) * 100
    title_desc_ratio = twc / max(dwc, 1)

    posting_weekend = int(int(posting_dayofweek) >= 5)
    senior_salary = int(is_senior) * salary_log
    weekend_remote = posting_weekend * int(is_remote)
    desc_salary_interaction = dwc * salary_log
    title_desc_word_interaction = twc * dwc
    salary_density_interaction = salary_log * description_density
    salary_description_interaction = salary_log * dwc
    title_density_interaction = twc * description_density

    # Cluster one-hot: exactly one cluster is active. The index is read from
    # the "Cluster <n>" prefix so the parse does not depend on whichever
    # separator character (or encoding of it) follows the number.
    found = re.match(r"\s*(?:Cluster\s*)?(\d+)", str(cluster_choice))
    if found is None:
        raise ValueError(f"Cannot read a cluster index from {cluster_choice!r}")
    cluster_idx = int(found.group(1))
    if not 0 <= cluster_idx < n_clusters:
        raise ValueError(
            f"Cluster index {cluster_idx} is outside 0..{n_clusters - 1}"
        )
    cluster_vals = [1 if i == cluster_idx else 0 for i in range(n_clusters)]

    row = {
        "title_length": title_length,
        "title_word_count": twc,
        "description_length": description_length,
        "description_word_count": dwc,
        "description_density": description_density,
        "title_desc_ratio": title_desc_ratio,
        "salary_midpoint": salary_midpoint_val,
        "salary_range": salary_range,
        "has_salary_info": int(has_salary),
        "salary_log": salary_log,
        "desc_salary_interaction": desc_salary_interaction,
        "senior_salary": senior_salary,
        "weekend_remote": weekend_remote,
        "is_senior_role": int(is_senior),
        "is_entry_role": int(is_entry),
        "is_software_role": int(is_software),
        "is_data_role": int(is_data),
        "is_manager_role": int(is_manager),
        "is_sales_role": int(is_sales),
        "is_marketing_role": int(is_marketing),
        "title_desc_word_interaction": title_desc_word_interaction,
        "salary_density_interaction": salary_density_interaction,
        "salary_description_interaction": salary_description_interaction,
        "title_density_interaction": title_density_interaction,
    }
    row.update({f"cluster_{i}": flag for i, flag in enumerate(cluster_vals)})

    # Select by FEATURE_COLS so the column order always matches training.
    return pd.DataFrame([row])[FEATURE_COLS]
# ── Prediction ────────────────────────────────────────────────────────────────
_GAUGE_FULL_SCALE_VIEWS = 1500  # predicted views at which the gauge bar is full


def _engagement_tips(has_salary, description_word_count, posting_dayofweek):
    """Return the improvement tips that apply to the given posting inputs."""
    words = int(description_word_count)
    tips = []
    if not has_salary:
        tips.append("💡 Adding salary info is associated with ~90% more views.")
    if words < 250:
        tips.append("💡 Descriptions of 250–500 words tend to perform best.")
    if words > 750:
        tips.append("💡 Very long descriptions can reduce engagement — try trimming to under 750 words.")
    if int(posting_dayofweek) >= 5:
        tips.append("💡 Posting mid-week (Tue–Thu) gets more views than weekends.")
    return tips


def _render_gauge(views_pred, is_high, engagement_label):
    """Draw the horizontal gauge for one prediction and return the Figure."""
    dark = "#0f0f0f"
    bar_color = "#3a9e6e" if is_high else "#c0392b"
    fill_pct = min(views_pred / _GAUGE_FULL_SCALE_VIEWS, 1.0)

    fig, ax = plt.subplots(figsize=(5, 2.5))
    fig.patch.set_facecolor(dark)
    ax.set_facecolor(dark)
    ax.barh(0, 1, color="#1e1e1e", height=0.4, edgecolor="none")  # track
    ax.barh(0, fill_pct, color=bar_color, height=0.4, edgecolor="none")  # fill
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.6, 0.6)
    ax.axis("off")
    ax.text(0.5, 0.55, engagement_label,
            ha="center", va="center", color=bar_color,
            fontsize=12, fontweight="bold", transform=ax.transAxes)
    ax.text(0.5, -0.35, f"Predicted: {views_pred:,} views",
            ha="center", va="center", color="#e8e8e8",
            fontsize=13, fontweight="bold", transform=ax.transAxes)
    # Lay out this figure explicitly instead of pyplot's global "current
    # figure", then unregister it from pyplot so repeated predictions do not
    # accumulate open figures. The Figure object stays usable by the caller.
    fig.tight_layout(pad=0.3)
    plt.close(fig)
    return fig


def predict_engagement(
    title_word_count, description_word_count,
    has_salary, salary_midpoint,
    is_senior, is_entry, is_software, is_data,
    is_manager, is_sales, is_marketing,
    is_remote, posting_dayofweek,
    cluster_choice,
):
    """Predict views and engagement class for one posting.

    The arguments mirror build_feature_row(). The regression model's output
    is treated as log1p(views) and converted back with expm1; the classifier's
    output ``1`` means "high engagement".

    Returns:
        A 4-tuple ``(views_text, engagement_label, detail_text, figure)``.
        ``figure`` is a matplotlib Figure, or ``None`` when the models are not
        loaded or the prediction failed. In those cases the first three items
        carry the error information and nothing is raised.
    """
    import traceback

    if not MODELS_LOADED:
        detail = "Upload both .pkl files to the Space."
        if MODEL_ERROR:
            # Surface the recorded load failure so it can be diagnosed.
            detail += f"\n\nLoad error: {MODEL_ERROR}"
        return "⚠️ Models not loaded.", "⚠️ Error", detail, None

    try:
        X = build_feature_row(
            title_word_count, description_word_count,
            has_salary, salary_midpoint,
            is_senior, is_entry, is_software, is_data,
            is_manager, is_sales, is_marketing,
            is_remote, posting_dayofweek,
            cluster_choice,
        )
        log_views_pred = reg_model.predict(X)[0]
        # A view count cannot be negative; clamp in case the model under-shoots.
        views_pred = max(int(np.expm1(log_views_pred)), 0)
        is_high = clf_model.predict(X)[0] == 1

        if is_high:
            engagement_label = "🟢 HIGH ENGAGEMENT"
            engagement_detail = (
                "This posting is predicted to land in the top 25% for views on LinkedIn."
            )
        else:
            engagement_label = "🔴 NORMAL ENGAGEMENT"
            engagement_detail = (
                "This posting is predicted to receive average or below-average views."
            )

        tips = _engagement_tips(has_salary, description_word_count, posting_dayofweek)
        if tips:
            engagement_detail += "\n\n" + "\n".join(tips)

        fig = _render_gauge(views_pred, is_high, engagement_label)
        return f"~{views_pred:,} views", engagement_label, engagement_detail, fig
    except Exception as e:  # UI boundary: report the failure instead of crashing the handler
        return f"Error: {e}", "Error", traceback.format_exc(), None
# ── EDA charts ────────────────────────────────────────────────────────────────
# Shared colour palette for the static EDA charts and style_ax() below.
DARK_BG = "#0f0f0f"  # figure background
CARD_BG = "#141414"  # axes background and legend face colour
AMBER = "#f5a623"  # highlight colour (e.g. the winning model bar)
GREEN = "#3a9e6e"
RED = "#c0392b"
BLUE = "#3498db"
TEXT = "#e8e8e8"  # titles and value annotations
MUTED = "#666"  # tick marks and axis labels
def style_ax(ax, fig):
    """Apply the dashboard's dark theme to ``ax`` and ``fig`` in place.

    Sets the figure and axes backgrounds, mutes the ticks and both axis
    labels, colours the title, and darkens every spine. Returns ``None``.
    """
    # Backgrounds first: figure canvas, then the plotting area.
    fig.patch.set_facecolor(DARK_BG)
    ax.set_facecolor(CARD_BG)

    # Ticks and axis labels share the muted colour.
    ax.tick_params(colors=MUTED, labelsize=10)
    for axis in (ax.xaxis, ax.yaxis):
        axis.label.set_color(MUTED)

    ax.title.set_color(TEXT)

    for border in ax.spines.values():
        border.set_edgecolor("#1e1e1e")
def eda_salary_chart():
    """Return the Q1 chart: average views with vs. without salary info.

    The plotted numbers are fixed summary values embedded here; nothing is
    recomputed from data. The figure is closed in pyplot before it is
    returned so repeated page loads do not accumulate open figures.
    """
    categories = ["No Salary Info", "Has Salary Info"]
    avg_views = [180, 340]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(categories, avg_views, color=[RED, GREEN], height=0.45)
    for bar, val in zip(bars, avg_views):
        ax.text(val + 5, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q1 — Salary Transparency vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 440)
    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Salary disclosed"),
                       mpatches.Patch(color=RED, label="No salary")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Lay out this figure (not pyplot's global current figure), then release
    # pyplot's reference to it; the Figure object itself remains usable.
    fig.tight_layout()
    plt.close(fig)
    return fig
def eda_description_chart():
    """Return the Q2 chart: mean log(views+1) per description-length bucket.

    Uses fixed summary values embedded here. The figure is closed in pyplot
    before it is returned so repeated page loads do not accumulate figures.
    """
    buckets = ["<100", "100–250", "250–500★", "500–750", "750–1000", ">1000"]
    values = [2.1, 2.8, 3.6, 3.3, 3.0, 2.5]
    colors = [RED, AMBER, GREEN, BLUE, BLUE, RED]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(buckets, values, color=colors, width=0.55)
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.05,
                f"{val}", ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("Mean log(views+1)", color=MUTED)
    ax.set_title("Q2 — Description Length vs Engagement", color=TEXT, pad=10)
    ax.tick_params(axis="x", rotation=15)
    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Sweet spot: 250–500 words")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Explicit per-figure layout, then drop pyplot's reference to the figure.
    fig.tight_layout()
    plt.close(fig)
    return fig
def eda_dayofweek_chart():
    """Return the Q3 chart: average views by day of week.

    Uses fixed summary values embedded here. The figure is closed in pyplot
    before it is returned so repeated page loads do not accumulate figures.
    """
    days = ["Mon", "Tue★", "Wed", "Thu", "Fri", "Sat", "Sun"]
    values = [220, 245, 235, 228, 210, 148, 132]
    colors = [BLUE, GREEN, BLUE, BLUE, BLUE, RED, RED]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(days, values, color=colors, width=0.55)
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 3,
                str(val), ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("Average Views", color=MUTED)
    ax.set_title("Q3 — Day of Week vs Engagement", color=TEXT, pad=10)
    ax.legend(handles=[mpatches.Patch(color=BLUE, label="Weekday"),
                       mpatches.Patch(color=RED, label="Weekend")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Explicit per-figure layout, then drop pyplot's reference to the figure.
    fig.tight_layout()
    plt.close(fig)
    return fig
def eda_seniority_chart():
    """Return the Q4 chart: average views by seniority level.

    Uses fixed summary values embedded here. The figure is closed in pyplot
    before it is returned so repeated page loads do not accumulate figures.
    """
    levels = ["Entry-level★", "Other / Mid", "Senior-level"]
    avg_views = [290, 210, 175]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(levels, avg_views, color=[GREEN, AMBER, RED], height=0.45)
    for bar, val in zip(bars, avg_views):
        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q4 — Seniority Level vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 370)
    # Explicit per-figure layout, then drop pyplot's reference to the figure.
    fig.tight_layout()
    plt.close(fig)
    return fig
def eda_worktype_chart():
    """Return the Q5 chart: average views by work type.

    Uses fixed summary values embedded here. The figure is closed in pyplot
    before it is returned so repeated page loads do not accumulate figures.
    """
    work_types = ["Contract★", "Internship", "Part-time", "Full-time", "Temporary"]
    avg_views = [310, 275, 235, 205, 185]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(work_types, avg_views,
                   color=[GREEN, BLUE, AMBER, MUTED, RED], height=0.45)
    for bar, val in zip(bars, avg_views):
        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q5 — Work Type vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 390)
    # Explicit per-figure layout, then drop pyplot's reference to the figure.
    fig.tight_layout()
    plt.close(fig)
    return fig
def eda_model_chart():
    """Return the regression model comparison chart (RMSE on the log scale).

    Uses fixed RMSE values embedded here; the last entry is drawn in the
    highlight colour. The figure is closed in pyplot before it is returned so
    repeated page loads do not accumulate figures.
    """
    models = ["Mean Baseline", "PCA+Linear", "Lasso", "RidgeCV",
              "Linear+Features", "Gradient Boosting", "RF Controlled", "RF Tuned★"]
    rmse = [0.871, 0.844, 0.843, 0.842, 0.842, 0.837, 0.8349, 0.8347]
    # Every bar is muted except the final (winning) one.
    bar_colors = [MUTED] * (len(models) - 1) + [AMBER]

    fig, ax = plt.subplots(figsize=(6, 3.8))
    style_ax(ax, fig)
    bars = ax.barh(models, rmse, color=bar_colors, height=0.55)
    for bar, val in zip(bars, rmse):
        ax.text(val + 0.0005, bar.get_y() + bar.get_height() / 2,
                f"{val:.4f}", va="center", color=TEXT, fontsize=9)
    ax.set_xlabel("RMSE_log (lower = better)", color=MUTED)
    ax.set_title("Regression Model Comparison", color=TEXT, pad=10)
    # Narrow x-range so the small differences between models stay visible.
    ax.set_xlim(0.820, 0.886)
    ax.legend(handles=[mpatches.Patch(color=AMBER, label="Winner: RF Tuned")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    # Explicit per-figure layout, then drop pyplot's reference to the figure.
    fig.tight_layout()
    plt.close(fig)
    return fig
def eda_cluster_chart():
    """Return the KMeans (k=6) cluster size distribution chart.

    Uses fixed percentage values embedded here. The figure is closed in
    pyplot before it is returned so repeated page loads do not accumulate
    figures.
    """
    sizes = [28, 18, 22, 12, 10, 10]
    labels = [f"Cluster {i}" for i in range(6)]
    colors = [AMBER, BLUE, GREEN, RED, MUTED, "#9b59b6"]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(labels, sizes, color=colors, width=0.55)
    for bar, val in zip(bars, sizes):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.4,
                f"{val}%", ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("% of training postings", color=MUTED)
    ax.set_title("KMeans k=6 — Cluster Size Distribution", color=TEXT, pad=10)
    ax.set_ylim(0, 36)
    # Explicit per-figure layout, then drop pyplot's reference to the figure.
    fig.tight_layout()
    plt.close(fig)
    return fig
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=...): dark page background, amber
# primary button (with a darker hover state) and small grey labels.
css = """
body, .gradio-container { background-color: #0a0a0a !important;
font-family: 'Segoe UI', system-ui, sans-serif; }
.gr-button-primary { background-color: #f5a623 !important;
color: #000 !important; font-weight: 700 !important; border: none !important; }
.gr-button-primary:hover { background-color: #d4941f !important; }
label { color: #aaa !important; font-size: 13px !important; }
"""
# Three-tab dashboard: (1) interactive predictor, (2) static EDA charts,
# (3) project description.
# NOTE(review): the nesting of Rows/Columns below was reconstructed from
# context because the indentation was lost — confirm the intended layout.
# NOTE(review): several labels and Markdown strings contain what looks like
# mis-encoded text (UTF-8 read through a single-byte codepage); they are kept
# exactly as found — confirm the file encoding. Text inside triple-quoted
# strings is left flush-left so the string contents are unchanged.
with gr.Blocks(css=css, title="LinkedIn Engagement Dashboard",
               theme=gr.themes.Base(primary_hue="orange", neutral_hue="gray")) as demo:
    gr.Markdown(
        """
# πŸ“Š LinkedIn Job Posting Engagement Dashboard
**Which posting characteristics predict candidate engagement?**
*Assignment 2 β€” Regression, Classification & Clustering Β· LinkedIn Job Postings*
"""
    )
    with gr.Tabs():
        # ── TAB 1: PREDICTOR ─────────────────────────────────────────────────
        with gr.Tab("🎯 Engagement Predictor"):
            gr.Markdown(
                """
### Predict engagement for a new job posting
Fill in the posting details β€” the model will estimate views and classify
whether it is likely to reach **high engagement** (top 25%).
"""
            )
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### πŸ“ Content")
                    title_word_count = gr.Slider(1, 20, value=5, step=1,
                                                 label="Title word count")
                    description_word_count = gr.Slider(10, 1200, value=350, step=10,
                                                       label="Description word count")
                    has_salary = gr.Checkbox(value=True,
                                             label="Salary information disclosed")
                    salary_midpoint = gr.Slider(0, 300000, value=85000, step=5000,
                                                label="Salary midpoint ($)")
                with gr.Column(scale=1):
                    gr.Markdown("#### 🏷️ Role Type")
                    is_senior = gr.Checkbox(value=False, label="Senior role")
                    is_entry = gr.Checkbox(value=False, label="Entry-level role")
                    is_software = gr.Checkbox(value=False, label="Software / Engineering role")
                    is_data = gr.Checkbox(value=False, label="Data / Analytics role")
                    is_manager = gr.Checkbox(value=False, label="Manager / Director role")
                    is_sales = gr.Checkbox(value=False, label="Sales role")
                    is_marketing = gr.Checkbox(value=False, label="Marketing role")
                with gr.Column(scale=1):
                    gr.Markdown("#### πŸ“… Timing, Location & Cluster")
                    is_remote = gr.Checkbox(value=False, label="Remote role")
                    posting_dayofweek = gr.Slider(0, 6, value=1, step=1,
                                                  label="Day posted (0=Mon … 6=Sun)")
                    gr.Markdown(
                        "<small style='color:#555'>**Cluster** β€” pick the segment that best "
                        "describes this posting. If unsure, use Cluster 0.</small>"
                    )
                    cluster_choice = gr.Dropdown(
                        choices=CLUSTER_LABELS,
                        value=CLUSTER_LABELS[0],
                        label="Posting cluster (KMeans k=6)",
                    )
            predict_btn = gr.Button("πŸ” Predict Engagement", variant="primary", size="lg")
            with gr.Row():
                views_out = gr.Textbox(label="πŸ“ˆ Predicted Views", interactive=False)
                label_out = gr.Textbox(label="🏷️ Engagement Class", interactive=False)
            detail_out = gr.Textbox(label="πŸ“‹ Interpretation & Tips", interactive=False, lines=5)
            gauge_out = gr.Plot(label="Result")
            # The input order must match predict_engagement's positional
            # parameters; the four outputs match its 4-tuple return value.
            predict_btn.click(
                fn=predict_engagement,
                inputs=[
                    title_word_count, description_word_count,
                    has_salary, salary_midpoint,
                    is_senior, is_entry, is_software, is_data,
                    is_manager, is_sales, is_marketing,
                    is_remote, posting_dayofweek,
                    cluster_choice,
                ],
                outputs=[views_out, label_out, detail_out, gauge_out],
            )
            gr.Markdown(
                "> **Note:** Predictions use posting-level features only. "
                "Platform factors (LinkedIn algorithm, sponsored status, company followers) "
                "are unobservable and account for most real-world variance."
            )
        # ── TAB 2: EDA DASHBOARD ─────────────────────────────────────────────
        with gr.Tab("πŸ“Š EDA Dashboard"):
            gr.Markdown("### Key Findings from Exploratory Data Analysis")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q1 β€” Salary transparency vs views**")
                    salary_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q2 β€” Description length vs views**")
                    desc_plot = gr.Plot()
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q3 β€” Day of week vs views**")
                    day_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q4 β€” Seniority level vs views**")
                    seniority_plot = gr.Plot()
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q5 β€” Work type vs views**")
                    worktype_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**KMeans k=6 cluster distribution**")
                    cluster_plot = gr.Plot()
            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Regression model comparison**")
                    model_plot = gr.Plot()
            gr.Markdown(
                """
---
**Key takeaways:**
πŸ’° Salary transparency β†’ ~90% more views &nbsp;|&nbsp;
πŸ“ Sweet spot: 250–500 words &nbsp;|&nbsp;
πŸ“… Post Tue–Thu &nbsp;|&nbsp;
πŸ‘Ά Entry-level = larger candidate pool &nbsp;|&nbsp;
πŸ“‹ Contract roles outperform full-time
"""
            )
            # Render all seven static charts through the load event. The tuple
            # order must match the `outputs` list (cluster chart comes before
            # the model chart).
            demo.load(
                fn=lambda: (
                    eda_salary_chart(), eda_description_chart(),
                    eda_dayofweek_chart(), eda_seniority_chart(),
                    eda_worktype_chart(), eda_cluster_chart(),
                    eda_model_chart(),
                ),
                outputs=[salary_plot, desc_plot, day_plot,
                         seniority_plot, worktype_plot, cluster_plot,
                         model_plot],
            )
        # ── TAB 3: ABOUT ─────────────────────────────────────────────────────
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
## About This Project
**Dataset:** LinkedIn Job Postings β€” arshkon/linkedin-job-postings (Kaggle)
**Sample:** 30,000 rows from 123,850 Β· `random_state=42`
**Target:** `views` β€” job posting view count
---
### Models
| Model | Type | Metric |
|---|---|---|
| Random Forest (Tuned) | Regression | RMSE_log 0.8347 Β· RΒ² 0.081 |
| Decision Tree | Classification | Highest F1 + Recall for Class 1 |
### All 30 features
| Group | Features |
|---|---|
| Text length | title_length, title_word_count, description_length, description_word_count |
| Text structure | description_density, title_desc_ratio |
| Salary | salary_midpoint, salary_range, has_salary_info, salary_log |
| Role keywords | is_senior_role, is_entry_role, is_software_role, is_data_role, is_manager_role, is_sales_role, is_marketing_role |
| Interactions | desc_salary_interaction, senior_salary, weekend_remote, title_desc_word_interaction, salary_density_interaction, salary_description_interaction, title_density_interaction |
| Clustering | cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 |
---
### Clustering
KMeans k=6 Β· Silhouette score 0.289 Β· Fit on training data only
k=7/8 rejected β€” produced near-singleton clusters
---
### Limitations
- RΒ²β‰ˆ0.08 reflects unobservable platform factors
- Associations shown, not causal relationships
- High-engagement threshold is a business rule (top 25%), not a natural label
- Cluster labels in the predictor are interpretive approximations
"""
            )
# Start the web server only when this file is executed as a script, so that
# importing the module (for example from a test) does not launch a server.
if __name__ == "__main__":
    demo.launch()