Spaces:

MichaelYitzchak
/

linkedin_Job_Engagement

Sleeping

App Files Files Community

linkedin_Job_Engagement / app.py

MichaelYitzchak

Upload app.py

8c0e1ad verified about 1 month ago

Raw

History Blame Contribute Delete

24.5 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import pickle
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import matplotlib.patches as mpatches
	import warnings
	warnings.filterwarnings("ignore")

	# ── Exact 30 features the model was trained on ────────────────────────────────
	# Column names, in the exact order used to index the one-row DataFrame that
	# build_feature_row() returns. Grouping below is for readability only; the
	# concatenated order is what matters.
	FEATURE_COLS = (
	    # Text length and structure
	    [
	        "title_length",
	        "title_word_count",
	        "description_length",
	        "description_word_count",
	        "description_density",
	        "title_desc_ratio",
	    ]
	    # Salary
	    + [
	        "salary_midpoint",
	        "salary_range",
	        "has_salary_info",
	        "salary_log",
	    ]
	    # First batch of interaction terms
	    + [
	        "desc_salary_interaction",
	        "senior_salary",
	        "weekend_remote",
	    ]
	    # Role keyword flags
	    + [
	        "is_senior_role",
	        "is_entry_role",
	        "is_software_role",
	        "is_data_role",
	        "is_manager_role",
	        "is_sales_role",
	        "is_marketing_role",
	    ]
	    # Second batch of interaction terms
	    + [
	        "title_desc_word_interaction",
	        "salary_density_interaction",
	        "salary_description_interaction",
	        "title_density_interaction",
	    ]
	    # One-hot cluster membership: cluster_0 … cluster_5
	    + [f"cluster_{k}" for k in range(6)]
	)

	# Dropdown labels for the six clusters. The leading "Cluster <n>" part is
	# parsed back into an index by build_feature_row(), so the prefix format
	# must stay "Cluster <n> — <description>".
	CLUSTER_LABELS = [
	    f"Cluster {idx} — {blurb}"
	    for idx, blurb in enumerate(
	        (
	            "General / Mixed roles",
	            "High-salary specialist roles",
	            "Tech & software roles",
	            "Entry-level / high-volume roles",
	            "Contract & flexible roles",
	            "Senior leadership roles",
	        )
	    )
	]

	# ── Load models ───────────────────────────────────────────────────────────────
	# Load both pickled estimators once at import time.
	#
	# The names below are always defined, so later code can reference them
	# whether or not loading succeeded. MODELS_LOADED tells the prediction
	# handler which case applies; MODEL_ERROR holds the failure text ("" on
	# success).
	#
	# NOTE(security): pickle.load executes code embedded in the file. Only ship
	# .pkl files produced by a trusted training run.
	reg_model = None
	clf_model = None
	MODEL_ERROR = ""
	try:
	    with open("linkedin_regression_model.pkl", "rb") as f:
	        reg_model = pickle.load(f)
	    with open("linkedin_classification_model.pkl", "rb") as f:
	        clf_model = pickle.load(f)
	    MODELS_LOADED = True
	except Exception as e:
	    # Deliberately broad: a missing or incompatible pickle must not stop the
	    # UI from starting. The reason is logged instead of being dropped.
	    MODELS_LOADED = False
	    MODEL_ERROR = str(e)
	    print(f"Model loading failed: {MODEL_ERROR}", flush=True)

	# ── Feature builder ───────────────────────────────────────────────────────────
	def build_feature_row(
	    title_word_count,
	    description_word_count,
	    has_salary,
	    salary_midpoint,
	    is_senior,
	    is_entry,
	    is_software,
	    is_data,
	    is_manager,
	    is_sales,
	    is_marketing,
	    is_remote,
	    posting_dayofweek,
	    cluster_choice,
	):
	    """Turn the raw UI inputs into a one-row feature DataFrame.

	    Character lengths are not collected from the user; they are approximated
	    from the word counts (6 characters per title word, 5 per description
	    word). The salary range is taken as 30% of the midpoint, and the salary
	    is forced to 0 when ``has_salary`` is falsy.

	    ``cluster_choice`` is a label of the form ``"Cluster <n> — ..."``; the
	    integer ``<n>`` selects which ``cluster_<n>`` column is set to 1.

	    Returns a ``pandas.DataFrame`` with a single row whose columns are
	    selected and ordered by ``FEATURE_COLS``.
	    """
	    # Salary terms (zeroed when no salary is disclosed).
	    midpoint = float(salary_midpoint) if has_salary else 0.0
	    log_salary = np.log1p(midpoint)

	    # Text-size terms derived from the two word counts.
	    n_title = int(title_word_count)
	    n_desc = int(description_word_count)
	    approx_desc_chars = n_desc * 5
	    density = (n_desc / max(approx_desc_chars, 1)) * 100

	    # Day index 5 and 6 count as weekend.
	    on_weekend = int(int(posting_dayofweek) >= 5)
	    senior_flag = int(is_senior)
	    remote_flag = int(is_remote)

	    # "Cluster 3 — ..." -> 3
	    label_prefix = cluster_choice.split("—")[0]
	    active_cluster = int(label_prefix.replace("Cluster", "").strip())

	    features = {
	        "title_length": n_title * 6,
	        "title_word_count": n_title,
	        "description_length": approx_desc_chars,
	        "description_word_count": n_desc,
	        "description_density": density,
	        "title_desc_ratio": n_title / max(n_desc, 1),
	        "salary_midpoint": midpoint,
	        "salary_range": midpoint * 0.3,
	        "has_salary_info": int(has_salary),
	        "salary_log": log_salary,
	        "desc_salary_interaction": n_desc * log_salary,
	        "senior_salary": senior_flag * log_salary,
	        "weekend_remote": on_weekend * remote_flag,
	        "is_senior_role": senior_flag,
	        "is_entry_role": int(is_entry),
	        "is_software_role": int(is_software),
	        "is_data_role": int(is_data),
	        "is_manager_role": int(is_manager),
	        "is_sales_role": int(is_sales),
	        "is_marketing_role": int(is_marketing),
	        "title_desc_word_interaction": n_title * n_desc,
	        "salary_density_interaction": log_salary * density,
	        "salary_description_interaction": log_salary * n_desc,
	        "title_density_interaction": n_title * density,
	    }
	    # One-hot encode the chosen cluster across cluster_0 … cluster_5.
	    for k in range(6):
	        features[f"cluster_{k}"] = 1 if k == active_cluster else 0

	    frame = pd.DataFrame([features])
	    return frame[FEATURE_COLS]


	# ── Prediction ────────────────────────────────────────────────────────────────
	def predict_engagement(
	    title_word_count, description_word_count,
	    has_salary, salary_midpoint,
	    is_senior, is_entry, is_software, is_data,
	    is_manager, is_sales, is_marketing,
	    is_remote, posting_dayofweek,
	    cluster_choice,
	):
	    """Predict views and engagement class for one posting and build the UI outputs.

	    The inputs are forwarded unchanged to ``build_feature_row``. The
	    regression output is mapped back with ``expm1`` and truncated to an int;
	    the classifier output is compared against 1 to pick the label.

	    Returns a 4-tuple ``(views_text, class_label, detail_text, figure)``.
	    ``figure`` is a matplotlib Figure on success and ``None`` when the models
	    are not loaded or when any step raised (in which case the text fields
	    carry the error and traceback).
	    """
	    if not MODELS_LOADED:
	        return "⚠️ Models not loaded.", "⚠️ Error", "Upload both .pkl files to the Space.", None

	    fig = None
	    try:
	        X = build_feature_row(
	            title_word_count, description_word_count,
	            has_salary, salary_midpoint,
	            is_senior, is_entry, is_software, is_data,
	            is_manager, is_sales, is_marketing,
	            is_remote, posting_dayofweek,
	            cluster_choice,
	        )

	        log_views_pred = reg_model.predict(X)[0]
	        views_pred = int(np.expm1(log_views_pred))
	        clf_pred = clf_model.predict(X)[0]

	        engagement_label = "🟢 HIGH ENGAGEMENT" if clf_pred == 1 else "🔴 NORMAL ENGAGEMENT"
	        engagement_detail = (
	            "This posting is predicted to land in the top 25% for views on LinkedIn."
	            if clf_pred == 1 else
	            "This posting is predicted to receive average or below-average views."
	        )

	        # Rule-based tips appended below the interpretation text.
	        tips = []
	        if not has_salary:
	            tips.append("💡 Adding salary info is associated with ~90% more views.")
	        if int(description_word_count) < 250:
	            tips.append("💡 Descriptions of 250–500 words tend to perform best.")
	        if int(description_word_count) > 750:
	            tips.append("💡 Very long descriptions can reduce engagement — try trimming to under 750 words.")
	        if int(posting_dayofweek) >= 5:
	            tips.append("💡 Posting mid-week (Tue–Thu) gets more views than weekends.")
	        if tips:
	            engagement_detail += "\n\n" + "\n".join(tips)

	        # Gauge chart: a grey track with a coloured fill that saturates at
	        # 1500 predicted views.
	        DARK = "#0f0f0f"
	        bar_color = "#3a9e6e" if clf_pred == 1 else "#c0392b"
	        fill_pct = min(views_pred / 1500, 1.0)

	        fig, ax = plt.subplots(figsize=(5, 2.5))
	        fig.patch.set_facecolor(DARK)
	        ax.set_facecolor(DARK)
	        ax.barh(0, 1, color="#1e1e1e", height=0.4, edgecolor="none")
	        ax.barh(0, fill_pct, color=bar_color, height=0.4, edgecolor="none")
	        ax.set_xlim(0, 1)
	        ax.set_ylim(-0.6, 0.6)
	        ax.axis("off")
	        ax.text(0.5, 0.55, engagement_label,
	                ha="center", va="center", color=bar_color,
	                fontsize=12, fontweight="bold", transform=ax.transAxes)
	        ax.text(0.5, -0.35, f"Predicted: {views_pred:,} views",
	                ha="center", va="center", color="#e8e8e8",
	                fontsize=13, fontweight="bold", transform=ax.transAxes)
	        # Lay out this figure explicitly instead of pyplot's global
	        # "current figure", which another request may have changed.
	        fig.tight_layout(pad=0.3)

	        return f"~{views_pred:,} views", engagement_label, engagement_detail, fig

	    except Exception as e:
	        import traceback
	        return f"Error: {e}", "Error", traceback.format_exc(), None

	    finally:
	        # pyplot keeps a reference to every figure it creates until it is
	        # closed. Closing only unregisters it; the returned Figure object can
	        # still be rendered by the caller.
	        if fig is not None:
	            plt.close(fig)


	# ── EDA charts ────────────────────────────────────────────────────────────────
	# Shared palette for the EDA charts.
	# Backgrounds: figure canvas and axes panel.
	DARK_BG, CARD_BG = "#0f0f0f", "#141414"
	# Bar / accent colours.
	AMBER, GREEN, RED, BLUE = "#f5a623", "#3a9e6e", "#c0392b", "#3498db"
	# Text colours: primary (titles, value labels) and muted (ticks, axis labels).
	TEXT, MUTED = "#e8e8e8", "#666"

	def style_ax(ax, fig):
	    """Apply the dark dashboard palette to ``fig`` and its axes ``ax`` in place.

	    Sets the figure and axes backgrounds, colours the title, axis labels and
	    ticks, and recolours every spine. Returns ``None``.
	    """
	    # Backgrounds
	    fig.patch.set_facecolor(DARK_BG)
	    ax.set_facecolor(CARD_BG)

	    # Text elements
	    ax.title.set_color(TEXT)
	    for axis_label in (ax.xaxis.label, ax.yaxis.label):
	        axis_label.set_color(MUTED)
	    ax.tick_params(colors=MUTED, labelsize=10)

	    # Borders
	    for border in ax.spines.values():
	        border.set_edgecolor("#1e1e1e")

	def eda_salary_chart():
	    """Build the Q1 horizontal bar chart: average views with vs. without salary info.

	    The plotted values are constants hard-coded in this function; nothing is
	    computed from data at runtime.

	    Returns the matplotlib Figure. It is closed in pyplot before returning so
	    repeated calls do not accumulate open figures; the Figure object itself
	    remains renderable.
	    """
	    categories = ["No Salary Info", "Has Salary Info"]
	    avg_views = [180, 340]

	    fig, ax = plt.subplots(figsize=(6, 3.2))
	    style_ax(ax, fig)
	    bars = ax.barh(categories, avg_views, color=[RED, GREEN], height=0.45)
	    # Value label just to the right of each bar.
	    for bar, val in zip(bars, avg_views):
	        ax.text(val + 5, bar.get_y() + bar.get_height() / 2,
	                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
	    ax.set_xlabel("Average Views", color=MUTED)
	    ax.set_title("Q1 — Salary Transparency vs Engagement", color=TEXT, pad=10)
	    ax.set_xlim(0, 440)
	    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Salary disclosed"),
	                       mpatches.Patch(color=RED, label="No salary")],
	              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig

	def eda_description_chart():
	    """Build the Q2 bar chart: mean log(views+1) per description word-count bucket.

	    The plotted values are constants hard-coded in this function.

	    Returns the matplotlib Figure, already closed in pyplot so repeated calls
	    do not accumulate open figures.
	    """
	    buckets = ["<100", "100–250", "250–500★", "500–750", "750–1000", ">1000"]
	    values = [2.1, 2.8, 3.6, 3.3, 3.0, 2.5]
	    colors = [RED, AMBER, GREEN, BLUE, BLUE, RED]

	    fig, ax = plt.subplots(figsize=(6, 3.2))
	    style_ax(ax, fig)
	    bars = ax.bar(buckets, values, color=colors, width=0.55)
	    # Value label centred above each bar.
	    for bar, val in zip(bars, values):
	        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.05,
	                f"{val}", ha="center", va="bottom", color=TEXT, fontsize=10)
	    ax.set_ylabel("Mean log(views+1)", color=MUTED)
	    ax.set_title("Q2 — Description Length vs Engagement", color=TEXT, pad=10)
	    ax.tick_params(axis="x", rotation=15)
	    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Sweet spot: 250–500 words")],
	              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig

	def eda_dayofweek_chart():
	    """Build the Q3 bar chart: average views per posting weekday.

	    The plotted values are constants hard-coded in this function.

	    Returns the matplotlib Figure, already closed in pyplot so repeated calls
	    do not accumulate open figures.
	    """
	    days = ["Mon", "Tue★", "Wed", "Thu", "Fri", "Sat", "Sun"]
	    values = [220, 245, 235, 228, 210, 148, 132]
	    colors = [BLUE, GREEN, BLUE, BLUE, BLUE, RED, RED]

	    fig, ax = plt.subplots(figsize=(6, 3.2))
	    style_ax(ax, fig)
	    bars = ax.bar(days, values, color=colors, width=0.55)
	    # Value label centred above each bar.
	    for bar, val in zip(bars, values):
	        ax.text(bar.get_x() + bar.get_width() / 2, val + 3,
	                str(val), ha="center", va="bottom", color=TEXT, fontsize=10)
	    ax.set_ylabel("Average Views", color=MUTED)
	    ax.set_title("Q3 — Day of Week vs Engagement", color=TEXT, pad=10)
	    ax.legend(handles=[mpatches.Patch(color=BLUE, label="Weekday"),
	                       mpatches.Patch(color=RED, label="Weekend")],
	              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig

	def eda_seniority_chart():
	    """Build the Q4 horizontal bar chart: average views per seniority level.

	    The plotted values are constants hard-coded in this function.

	    Returns the matplotlib Figure, already closed in pyplot so repeated calls
	    do not accumulate open figures.
	    """
	    levels = ["Entry-level★", "Other / Mid", "Senior-level"]
	    avg_views = [290, 210, 175]

	    fig, ax = plt.subplots(figsize=(6, 3.2))
	    style_ax(ax, fig)
	    bars = ax.barh(levels, avg_views, color=[GREEN, AMBER, RED], height=0.45)
	    # Value label just to the right of each bar.
	    for bar, val in zip(bars, avg_views):
	        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
	                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
	    ax.set_xlabel("Average Views", color=MUTED)
	    ax.set_title("Q4 — Seniority Level vs Engagement", color=TEXT, pad=10)
	    ax.set_xlim(0, 370)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig

	def eda_worktype_chart():
	    """Build the Q5 horizontal bar chart: average views per work type.

	    The plotted values are constants hard-coded in this function.

	    Returns the matplotlib Figure, already closed in pyplot so repeated calls
	    do not accumulate open figures.
	    """
	    work_types = ["Contract★", "Internship", "Part-time", "Full-time", "Temporary"]
	    avg_views = [310, 275, 235, 205, 185]

	    fig, ax = plt.subplots(figsize=(6, 3.2))
	    style_ax(ax, fig)
	    bars = ax.barh(work_types, avg_views,
	                   color=[GREEN, BLUE, AMBER, MUTED, RED], height=0.45)
	    # Value label just to the right of each bar.
	    for bar, val in zip(bars, avg_views):
	        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
	                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
	    ax.set_xlabel("Average Views", color=MUTED)
	    ax.set_title("Q5 — Work Type vs Engagement", color=TEXT, pad=10)
	    ax.set_xlim(0, 390)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig

	def eda_model_chart():
	    """Build the horizontal bar chart comparing RMSE_log across regression models.

	    The plotted values are constants hard-coded in this function. The last
	    entry is drawn in amber; all others are muted.

	    Returns the matplotlib Figure, already closed in pyplot so repeated calls
	    do not accumulate open figures.
	    """
	    models = ["Mean Baseline", "PCA+Linear", "Lasso", "RidgeCV",
	              "Linear+Features", "Gradient Boosting", "RF Controlled", "RF Tuned★"]
	    rmse = [0.871, 0.844, 0.843, 0.842, 0.842, 0.837, 0.8349, 0.8347]
	    # Highlight only the final bar.
	    bar_colors = [MUTED] * (len(models) - 1) + [AMBER]

	    fig, ax = plt.subplots(figsize=(6, 3.8))
	    style_ax(ax, fig)
	    bars = ax.barh(models, rmse, color=bar_colors, height=0.55)
	    # Value label just to the right of each bar.
	    for bar, val in zip(bars, rmse):
	        ax.text(val + 0.0005, bar.get_y() + bar.get_height() / 2,
	                f"{val:.4f}", va="center", color=TEXT, fontsize=9)
	    ax.set_xlabel("RMSE_log (lower = better)", color=MUTED)
	    ax.set_title("Regression Model Comparison", color=TEXT, pad=10)
	    # Narrow x-range so the small differences between models are visible.
	    ax.set_xlim(0.820, 0.886)
	    ax.legend(handles=[mpatches.Patch(color=AMBER, label="Winner: RF Tuned")],
	              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig

	def eda_cluster_chart():
	    """Build the bar chart of cluster sizes (percentage per cluster, 0–5).

	    The plotted values are constants hard-coded in this function.

	    Returns the matplotlib Figure, already closed in pyplot so repeated calls
	    do not accumulate open figures.
	    """
	    labels = [f"Cluster {i}" for i in range(6)]
	    sizes = [28, 18, 22, 12, 10, 10]
	    colors = [AMBER, BLUE, GREEN, RED, MUTED, "#9b59b6"]

	    fig, ax = plt.subplots(figsize=(6, 3.2))
	    style_ax(ax, fig)
	    bars = ax.bar(labels, sizes, color=colors, width=0.55)
	    # Percentage label centred above each bar.
	    for bar, val in zip(bars, sizes):
	        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.4,
	                f"{val}%", ha="center", va="bottom", color=TEXT, fontsize=10)
	    ax.set_ylabel("% of training postings", color=MUTED)
	    ax.set_title("KMeans k=6 — Cluster Size Distribution", color=TEXT, pad=10)
	    ax.set_ylim(0, 36)
	    # Target this figure directly rather than pyplot's global current figure.
	    fig.tight_layout()
	    plt.close(fig)
	    return fig


	# ── Gradio UI ─────────────────────────────────────────────────────────────────
	# Custom stylesheet handed to gr.Blocks(css=...) below: dark page background,
	# amber primary button (with a darker hover shade), and muted form labels.
	# NOTE(review): the `.gr-button-primary` selector depends on Gradio's
	# generated class names — confirm it still matches in the installed version.
	css = """
	body, .gradio-container { background-color: #0a0a0a !important;
	font-family: 'Segoe UI', system-ui, sans-serif; }
	.gr-button-primary { background-color: #f5a623 !important;
	color: #000 !important; font-weight: 700 !important; border: none !important; }
	.gr-button-primary:hover { background-color: #d4941f !important; }
	label { color: #aaa !important; font-size: 13px !important; }
	"""

	# Three-tab Gradio app:
	#   1. Engagement Predictor — inputs wired to predict_engagement()
	#   2. EDA Dashboard        — seven static charts filled in by demo.load()
	#   3. About                — project notes
	# Markdown tables and separators use plain "|" characters: a backslash before
	# the pipe is an invalid Python escape sequence and, in Markdown, escapes the
	# pipe so the table would not render.
	with gr.Blocks(css=css, title="LinkedIn Engagement Dashboard",
	               theme=gr.themes.Base(primary_hue="orange", neutral_hue="gray")) as demo:

	    gr.Markdown(
	        """
	        # 📊 LinkedIn Job Posting Engagement Dashboard
	        Which posting characteristics predict candidate engagement?
	        Assignment 2 — Regression, Classification & Clustering · LinkedIn Job Postings
	        """
	    )

	    with gr.Tabs():

	        # ── TAB 1: PREDICTOR ─────────────────────────────────────────────────
	        with gr.Tab("🎯 Engagement Predictor"):
	            gr.Markdown(
	                """
	                ### Predict engagement for a new job posting
	                Fill in the posting details — the model will estimate views and classify
	                whether it is likely to reach high engagement (top 25%).
	                """
	            )

	            with gr.Row():
	                with gr.Column(scale=1):
	                    gr.Markdown("#### 📝 Content")
	                    title_word_count = gr.Slider(1, 20, value=5, step=1,
	                                                 label="Title word count")
	                    description_word_count = gr.Slider(10, 1200, value=350, step=10,
	                                                       label="Description word count")
	                    has_salary = gr.Checkbox(value=True,
	                                             label="Salary information disclosed")
	                    salary_midpoint = gr.Slider(0, 300000, value=85000, step=5000,
	                                                label="Salary midpoint ($)")

	                with gr.Column(scale=1):
	                    gr.Markdown("#### 🏷️ Role Type")
	                    is_senior = gr.Checkbox(value=False, label="Senior role")
	                    is_entry = gr.Checkbox(value=False, label="Entry-level role")
	                    is_software = gr.Checkbox(value=False, label="Software / Engineering role")
	                    is_data = gr.Checkbox(value=False, label="Data / Analytics role")
	                    is_manager = gr.Checkbox(value=False, label="Manager / Director role")
	                    is_sales = gr.Checkbox(value=False, label="Sales role")
	                    is_marketing = gr.Checkbox(value=False, label="Marketing role")

	                with gr.Column(scale=1):
	                    gr.Markdown("#### 📅 Timing, Location & Cluster")
	                    is_remote = gr.Checkbox(value=False, label="Remote role")
	                    posting_dayofweek = gr.Slider(0, 6, value=1, step=1,
	                                                  label="Day posted (0=Mon … 6=Sun)")
	                    gr.Markdown(
	                        "<small style='color:#555'>Cluster — pick the segment that best "
	                        "describes this posting. If unsure, use Cluster 0.</small>"
	                    )
	                    cluster_choice = gr.Dropdown(
	                        choices=CLUSTER_LABELS,
	                        value=CLUSTER_LABELS[0],
	                        label="Posting cluster (KMeans k=6)",
	                    )

	            predict_btn = gr.Button("🔍 Predict Engagement", variant="primary", size="lg")

	            with gr.Row():
	                views_out = gr.Textbox(label="📈 Predicted Views", interactive=False)
	                label_out = gr.Textbox(label="🏷️ Engagement Class", interactive=False)

	            detail_out = gr.Textbox(label="📋 Interpretation & Tips", interactive=False, lines=5)
	            gauge_out = gr.Plot(label="Result")

	            # Input order must match predict_engagement's parameter order.
	            predict_btn.click(
	                fn=predict_engagement,
	                inputs=[
	                    title_word_count, description_word_count,
	                    has_salary, salary_midpoint,
	                    is_senior, is_entry, is_software, is_data,
	                    is_manager, is_sales, is_marketing,
	                    is_remote, posting_dayofweek,
	                    cluster_choice,
	                ],
	                outputs=[views_out, label_out, detail_out, gauge_out],
	            )

	            gr.Markdown(
	                "> Note: Predictions use posting-level features only. "
	                "Platform factors (LinkedIn algorithm, sponsored status, company followers) "
	                "are unobservable and account for most real-world variance."
	            )

	        # ── TAB 2: EDA DASHBOARD ─────────────────────────────────────────────
	        with gr.Tab("📊 EDA Dashboard"):
	            gr.Markdown("### Key Findings from Exploratory Data Analysis")

	            with gr.Row():
	                with gr.Column():
	                    gr.Markdown("Q1 — Salary transparency vs views")
	                    salary_plot = gr.Plot()
	                with gr.Column():
	                    gr.Markdown("Q2 — Description length vs views")
	                    desc_plot = gr.Plot()

	            with gr.Row():
	                with gr.Column():
	                    gr.Markdown("Q3 — Day of week vs views")
	                    day_plot = gr.Plot()
	                with gr.Column():
	                    gr.Markdown("Q4 — Seniority level vs views")
	                    seniority_plot = gr.Plot()

	            with gr.Row():
	                with gr.Column():
	                    gr.Markdown("Q5 — Work type vs views")
	                    worktype_plot = gr.Plot()
	                with gr.Column():
	                    gr.Markdown("KMeans k=6 cluster distribution")
	                    cluster_plot = gr.Plot()

	            with gr.Row():
	                with gr.Column():
	                    gr.Markdown("Regression model comparison")
	                    model_plot = gr.Plot()

	            gr.Markdown(
	                """
	                ---
	                Key takeaways:
	                💰 Salary transparency → ~90% more views  |
	                📝 Sweet spot: 250–500 words  |
	                📅 Post Tue–Thu  |
	                👶 Entry-level = larger candidate pool  |
	                📋 Contract roles outperform full-time
	                """
	            )

	            # The tuple order below must match the `outputs` list order.
	            demo.load(
	                fn=lambda: (
	                    eda_salary_chart(), eda_description_chart(),
	                    eda_dayofweek_chart(), eda_seniority_chart(),
	                    eda_worktype_chart(), eda_cluster_chart(),
	                    eda_model_chart(),
	                ),
	                outputs=[salary_plot, desc_plot, day_plot,
	                         seniority_plot, worktype_plot, cluster_plot,
	                         model_plot],
	            )

	        # ── TAB 3: ABOUT ─────────────────────────────────────────────────────
	        with gr.Tab("ℹ️ About"):
	            gr.Markdown(
	                """
	                ## About This Project

	                Dataset: LinkedIn Job Postings — arshkon/linkedin-job-postings (Kaggle)
	                Sample: 30,000 rows from 123,850 · `random_state=42`
	                Target: `views` — job posting view count

	                ---
	                ### Models
	                | Model | Type | Metric |
	                |---|---|---|
	                | Random Forest (Tuned) | Regression | RMSE_log 0.8347 · R² 0.081 |
	                | Decision Tree | Classification | Highest F1 + Recall for Class 1 |

	                ### All 30 features
	                | Group | Features |
	                |---|---|
	                | Text length | title_length, title_word_count, description_length, description_word_count |
	                | Text structure | description_density, title_desc_ratio |
	                | Salary | salary_midpoint, salary_range, has_salary_info, salary_log |
	                | Role keywords | is_senior_role, is_entry_role, is_software_role, is_data_role, is_manager_role, is_sales_role, is_marketing_role |
	                | Interactions | desc_salary_interaction, senior_salary, weekend_remote, title_desc_word_interaction, salary_density_interaction, salary_description_interaction, title_density_interaction |
	                | Clustering | cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 |

	                ---
	                ### Clustering
	                KMeans k=6 · Silhouette score 0.289 · Fit on training data only
	                k=7/8 rejected — produced near-singleton clusters

	                ---
	                ### Limitations
	                - R²≈0.08 reflects unobservable platform factors
	                - Associations shown, not causal relationships
	                - High-engagement threshold is a business rule (top 25%), not a natural label
	                - Cluster labels in the predictor are interpretive approximations
	                """
	            )

	demo.launch()