import pickle
import re
import traceback
import warnings

import gradio as gr
import matplotlib

matplotlib.use("Agg")  # select the non-interactive backend before pyplot is imported
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
|
|
| |
# Column order of the single-row frame that build_feature_row() returns and
# predict_engagement() passes to both models.
FEATURE_COLS = [
    # Text length / structure
    "title_length",
    "title_word_count",
    "description_length",
    "description_word_count",
    "description_density",
    "title_desc_ratio",
    # Salary
    "salary_midpoint",
    "salary_range",
    "has_salary_info",
    "salary_log",
    # Interactions (first group)
    "desc_salary_interaction",
    "senior_salary",
    "weekend_remote",
    # Role keyword flags
    "is_senior_role",
    "is_entry_role",
    "is_software_role",
    "is_data_role",
    "is_manager_role",
    "is_sales_role",
    "is_marketing_role",
    # Interactions (second group)
    "title_desc_word_interaction",
    "salary_density_interaction",
    "salary_description_interaction",
    "title_density_interaction",
    # One-hot cluster membership
    "cluster_0",
    "cluster_1",
    "cluster_2",
    "cluster_3",
    "cluster_4",
    "cluster_5",
]
|
|
# Dropdown labels for the six clusters. Each label starts with
# "Cluster <index>", which is how the chosen index is recovered later.
# NOTE(review): the separator was mis-encoded in the source; restored as an
# em dash from context — confirm against the original.
CLUSTER_LABELS = [
    "Cluster 0 — General / Mixed roles",
    "Cluster 1 — High-salary specialist roles",
    "Cluster 2 — Tech & software roles",
    "Cluster 3 — Entry-level / high-volume roles",
    "Cluster 4 — Contract & flexible roles",
    "Cluster 5 — Senior leadership roles",
]
|
|
| |
# Load both pickled models once at import time. Every name is initialised up
# front so that none of them is undefined, whichever branch runs.
reg_model = None
clf_model = None
MODEL_ERROR = None
try:
    # NOTE(security): pickle.load can execute arbitrary code; only ship model
    # files that come from a trusted source.
    with open("linkedin_regression_model.pkl", "rb") as f:
        reg_model = pickle.load(f)
    with open("linkedin_classification_model.pkl", "rb") as f:
        clf_model = pickle.load(f)
    MODELS_LOADED = True
except Exception as e:  # best-effort: keep the app importable without models
    MODELS_LOADED = False
    MODEL_ERROR = str(e)
|
|
| |
def build_feature_row(
    title_word_count: float,
    description_word_count: float,
    has_salary: bool,
    salary_midpoint: float,
    is_senior: bool,
    is_entry: bool,
    is_software: bool,
    is_data: bool,
    is_manager: bool,
    is_sales: bool,
    is_marketing: bool,
    is_remote: bool,
    posting_dayofweek: float,
    cluster_choice: str,
) -> pd.DataFrame:
    """Build the one-row feature frame for a single posting.

    Only word counts are supplied, so the character-length features are
    derived from them with fixed factors (6 characters per title word, 5 per
    description word), and the salary range is taken as 30% of the midpoint.

    Args:
        title_word_count: Words in the title (cast to int).
        description_word_count: Words in the description (cast to int).
        has_salary: Whether salary is disclosed; when false the midpoint is
            ignored and treated as 0.
        salary_midpoint: Salary midpoint, used only when ``has_salary`` is true.
        is_senior, is_entry, is_software, is_data, is_manager, is_sales,
            is_marketing: Role flags, stored as 0/1.
        is_remote: Remote flag; only feeds the ``weekend_remote`` interaction.
        posting_dayofweek: Day index; values >= 5 count as weekend.
        cluster_choice: Label containing the cluster number,
            e.g. ``"Cluster 3 ..."``.

    Returns:
        A single-row DataFrame whose columns are ``FEATURE_COLS``, in order.

    Raises:
        ValueError: If ``cluster_choice`` contains no number, or the number
            is outside 0-5.
    """
    n_clusters = 6

    salary_midpoint_val = float(salary_midpoint) if has_salary else 0.0
    salary_log = np.log1p(salary_midpoint_val)

    twc = int(title_word_count)
    dwc = int(description_word_count)
    description_length = dwc * 5
    description_density = (dwc / max(description_length, 1)) * 100
    posting_weekend = int(int(posting_dayofweek) >= 5)

    # Read the index from the first number in the label instead of splitting
    # on the separator glyph, so a re-encoded or reworded label still parses.
    match = re.search(r"\d+", cluster_choice)
    if match is None:
        raise ValueError(f"No cluster number found in label: {cluster_choice!r}")
    cluster_idx = int(match.group())
    if not 0 <= cluster_idx < n_clusters:
        # An unknown index would otherwise yield an all-zero one-hot vector.
        raise ValueError(f"Cluster index out of range: {cluster_idx}")

    row = {
        "title_length": twc * 6,
        "title_word_count": twc,
        "description_length": description_length,
        "description_word_count": dwc,
        "description_density": description_density,
        "title_desc_ratio": twc / max(dwc, 1),
        "salary_midpoint": salary_midpoint_val,
        "salary_range": salary_midpoint_val * 0.3,
        "has_salary_info": int(has_salary),
        "salary_log": salary_log,
        "desc_salary_interaction": dwc * salary_log,
        "senior_salary": int(is_senior) * salary_log,
        "weekend_remote": posting_weekend * int(is_remote),
        "is_senior_role": int(is_senior),
        "is_entry_role": int(is_entry),
        "is_software_role": int(is_software),
        "is_data_role": int(is_data),
        "is_manager_role": int(is_manager),
        "is_sales_role": int(is_sales),
        "is_marketing_role": int(is_marketing),
        "title_desc_word_interaction": twc * dwc,
        "salary_density_interaction": salary_log * description_density,
        # Same value as desc_salary_interaction; both columns are kept
        # because both appear in FEATURE_COLS.
        "salary_description_interaction": salary_log * dwc,
        "title_density_interaction": twc * description_density,
    }
    row.update({f"cluster_{i}": int(i == cluster_idx) for i in range(n_clusters)})

    return pd.DataFrame([row])[FEATURE_COLS]
|
|
|
|
| |
def _engagement_tips(has_salary, description_word_count, posting_dayofweek):
    """Return the improvement hints that apply to the submitted posting."""
    words = int(description_word_count)
    tips = []
    if not has_salary:
        tips.append("💡 Adding salary info is associated with ~90% more views.")
    if words < 250:
        tips.append("💡 Descriptions of 250–500 words tend to perform best.")
    if words > 750:
        tips.append("💡 Very long descriptions can reduce engagement — try trimming to under 750 words.")
    if int(posting_dayofweek) >= 5:
        tips.append("💡 Posting mid-week (Tue–Thu) gets more views than weekends.")
    return tips


def _render_gauge(views_pred, engagement_label, bar_color):
    """Draw the horizontal gauge; the bar is full at 1,500 predicted views."""
    dark = "#0f0f0f"
    fill_pct = min(views_pred / 1500, 1.0)

    fig, ax = plt.subplots(figsize=(5, 2.5))
    fig.patch.set_facecolor(dark)
    ax.set_facecolor(dark)
    ax.barh(0, 1, color="#1e1e1e", height=0.4, edgecolor="none")  # track
    ax.barh(0, fill_pct, color=bar_color, height=0.4, edgecolor="none")  # fill
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.6, 0.6)
    ax.axis("off")
    ax.text(0.5, 0.55, engagement_label,
            ha="center", va="center", color=bar_color,
            fontsize=12, fontweight="bold", transform=ax.transAxes)
    ax.text(0.5, -0.35, f"Predicted: {views_pred:,} views",
            ha="center", va="center", color="#e8e8e8",
            fontsize=13, fontweight="bold", transform=ax.transAxes)
    # Lay out this figure explicitly instead of pyplot's "current" figure.
    fig.tight_layout(pad=0.3)
    # Unregister from pyplot so repeated predictions do not accumulate open
    # figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig


def predict_engagement(
    title_word_count, description_word_count,
    has_salary, salary_midpoint,
    is_senior, is_entry, is_software, is_data,
    is_manager, is_sales, is_marketing,
    is_remote, posting_dayofweek,
    cluster_choice,
):
    """Predict views and engagement class for one posting.

    The arguments are forwarded unchanged to ``build_feature_row``. The
    regressor's output is passed through ``expm1`` (i.e. treated as
    log(views + 1)); the classifier's output ``1`` means high engagement.

    Returns:
        A 4-tuple ``(views_text, engagement_label, detail_text, figure)``.
        When the models are not loaded or prediction fails, the first three
        items carry an error description and ``figure`` is ``None``.
    """
    if not MODELS_LOADED:
        return "⚠️ Models not loaded.", "⚠️ Error", "Upload both .pkl files to the Space.", None

    try:
        X = build_feature_row(
            title_word_count, description_word_count,
            has_salary, salary_midpoint,
            is_senior, is_entry, is_software, is_data,
            is_manager, is_sales, is_marketing,
            is_remote, posting_dayofweek,
            cluster_choice,
        )

        log_views_pred = reg_model.predict(X)[0]
        # Clamp at zero: a negative log-prediction must not become a negative
        # view count or a negative gauge bar.
        views_pred = max(int(np.expm1(log_views_pred)), 0)
        is_high = clf_model.predict(X)[0] == 1

        if is_high:
            engagement_label = "🟢 HIGH ENGAGEMENT"
            engagement_detail = "This posting is predicted to land in the top 25% for views on LinkedIn."
            bar_color = "#3a9e6e"
        else:
            engagement_label = "🔴 NORMAL ENGAGEMENT"
            engagement_detail = "This posting is predicted to receive average or below-average views."
            bar_color = "#c0392b"

        tips = _engagement_tips(has_salary, description_word_count, posting_dayofweek)
        if tips:
            engagement_detail += "\n\n" + "\n".join(tips)

        fig = _render_gauge(views_pred, engagement_label, bar_color)
        return f"~{views_pred:,} views", engagement_label, engagement_detail, fig

    except Exception as e:  # UI boundary: report the failure instead of raising
        return f"Error: {e}", "Error", traceback.format_exc(), None
|
|
|
|
| |
# Shared colour palette for the EDA charts.
DARK_BG = "#0f0f0f"  # figure background
CARD_BG = "#141414"  # axes background and legend face
AMBER = "#f5a623"
GREEN = "#3a9e6e"
RED = "#c0392b"
BLUE = "#3498db"
TEXT = "#e8e8e8"  # titles and value labels
MUTED = "#666"  # ticks and axis labels
|
|
def style_ax(ax, fig):
    """Apply the dark palette to ``fig`` and ``ax`` in place.

    Sets the figure and axes backgrounds, tick and axis-label colours, the
    title colour, and the colour of every spine.
    """
    fig.patch.set_facecolor(DARK_BG)
    ax.set_facecolor(CARD_BG)
    ax.tick_params(colors=MUTED, labelsize=10)
    ax.xaxis.label.set_color(MUTED)
    ax.yaxis.label.set_color(MUTED)
    ax.title.set_color(TEXT)
    for spine in ax.spines.values():
        spine.set_edgecolor("#1e1e1e")
|
|
def eda_salary_chart():
    """Return the Q1 chart: average views with and without salary info.

    The plotted values are fixed in this function, not computed at runtime.
    """
    labels = ["No Salary Info", "Has Salary Info"]
    avg_views = [180, 340]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(labels, avg_views, color=[RED, GREEN], height=0.45)
    for bar, val in zip(bars, avg_views):
        ax.text(val + 5, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q1 — Salary Transparency vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 440)
    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Salary disclosed"),
                       mpatches.Patch(color=RED, label="No salary")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
def eda_description_chart():
    """Return the Q2 chart: mean log(views+1) per description-length bucket.

    The plotted values are fixed in this function, not computed at runtime.
    """
    # NOTE(review): the marker after "250–500" was mis-encoded in the source;
    # restored as a star from context — confirm against the original.
    buckets = ["<100", "100–250", "250–500★", "500–750", "750–1000", ">1000"]
    values = [2.1, 2.8, 3.6, 3.3, 3.0, 2.5]
    colors = [RED, AMBER, GREEN, BLUE, BLUE, RED]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(buckets, values, color=colors, width=0.55)
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.05,
                f"{val}", ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("Mean log(views+1)", color=MUTED)
    ax.set_title("Q2 — Description Length vs Engagement", color=TEXT, pad=10)
    ax.tick_params(axis="x", rotation=15)
    ax.legend(handles=[mpatches.Patch(color=GREEN, label="Sweet spot: 250–500 words")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
def eda_dayofweek_chart():
    """Return the Q3 chart: average views per posting day of the week.

    The plotted values are fixed in this function, not computed at runtime.
    """
    # NOTE(review): the marker after "Tue" was mis-encoded in the source;
    # restored as a star from context — confirm against the original.
    days = ["Mon", "Tue★", "Wed", "Thu", "Fri", "Sat", "Sun"]
    values = [220, 245, 235, 228, 210, 148, 132]
    colors = [BLUE, GREEN, BLUE, BLUE, BLUE, RED, RED]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(days, values, color=colors, width=0.55)
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 3,
                str(val), ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("Average Views", color=MUTED)
    ax.set_title("Q3 — Day of Week vs Engagement", color=TEXT, pad=10)
    ax.legend(handles=[mpatches.Patch(color=BLUE, label="Weekday"),
                       mpatches.Patch(color=RED, label="Weekend")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
def eda_seniority_chart():
    """Return the Q4 chart: average views per seniority level.

    The plotted values are fixed in this function, not computed at runtime.
    """
    # NOTE(review): the marker after "Entry-level" was mis-encoded in the
    # source; restored as a star from context — confirm against the original.
    levels = ["Entry-level★", "Other / Mid", "Senior-level"]
    avg_views = [290, 210, 175]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(levels, avg_views, color=[GREEN, AMBER, RED], height=0.45)
    for bar, val in zip(bars, avg_views):
        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q4 — Seniority Level vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 370)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
def eda_worktype_chart():
    """Return the Q5 chart: average views per work type.

    The plotted values are fixed in this function, not computed at runtime.
    """
    # NOTE(review): the marker after "Contract" was mis-encoded in the
    # source; restored as a star from context — confirm against the original.
    work_types = ["Contract★", "Internship", "Part-time", "Full-time", "Temporary"]
    avg_views = [310, 275, 235, 205, 185]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.barh(work_types, avg_views,
                   color=[GREEN, BLUE, AMBER, MUTED, RED], height=0.45)
    for bar, val in zip(bars, avg_views):
        ax.text(val + 4, bar.get_y() + bar.get_height() / 2,
                f"~{val} avg views", va="center", color=TEXT, fontsize=11)
    ax.set_xlabel("Average Views", color=MUTED)
    ax.set_title("Q5 — Work Type vs Engagement", color=TEXT, pad=10)
    ax.set_xlim(0, 390)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
def eda_model_chart():
    """Return the regression-model comparison chart (RMSE_log per model).

    The plotted values are fixed in this function, not computed at runtime.
    The last entry is drawn in amber and named as the winner in the legend.
    """
    # NOTE(review): the marker after "RF Tuned" was mis-encoded in the
    # source; restored as a star from context — confirm against the original.
    models = ["Mean Baseline", "PCA+Linear", "Lasso", "RidgeCV",
              "Linear+Features", "Gradient Boosting", "RF Controlled", "RF Tuned★"]
    rmse = [0.871, 0.844, 0.843, 0.842, 0.842, 0.837, 0.8349, 0.8347]
    colors = [MUTED] * (len(models) - 1) + [AMBER]

    fig, ax = plt.subplots(figsize=(6, 3.8))
    style_ax(ax, fig)
    bars = ax.barh(models, rmse, color=colors, height=0.55)
    for bar, val in zip(bars, rmse):
        ax.text(val + 0.0005, bar.get_y() + bar.get_height() / 2,
                f"{val:.4f}", va="center", color=TEXT, fontsize=9)
    ax.set_xlabel("RMSE_log (lower = better)", color=MUTED)
    ax.set_title("Regression Model Comparison", color=TEXT, pad=10)
    ax.set_xlim(0.820, 0.886)
    ax.legend(handles=[mpatches.Patch(color=AMBER, label="Winner: RF Tuned")],
              facecolor=CARD_BG, labelcolor=TEXT, fontsize=9)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
def eda_cluster_chart():
    """Return the cluster-size chart: share of training postings per cluster.

    The plotted values are fixed in this function, not computed at runtime.
    """
    labels = [f"Cluster {i}" for i in range(6)]
    sizes = [28, 18, 22, 12, 10, 10]
    colors = [AMBER, BLUE, GREEN, RED, MUTED, "#9b59b6"]

    fig, ax = plt.subplots(figsize=(6, 3.2))
    style_ax(ax, fig)
    bars = ax.bar(labels, sizes, color=colors, width=0.55)
    for bar, val in zip(bars, sizes):
        ax.text(bar.get_x() + bar.get_width() / 2, val + 0.4,
                f"{val}%", ha="center", va="bottom", color=TEXT, fontsize=10)
    ax.set_ylabel("% of training postings", color=MUTED)
    ax.set_title("KMeans k=6 — Cluster Size Distribution", color=TEXT, pad=10)
    ax.set_ylim(0, 36)
    fig.tight_layout()
    # Unregister from pyplot so repeated calls do not accumulate open
    # figures; the returned Figure can still be rendered.
    plt.close(fig)
    return fig
|
|
|
|
| |
# Custom CSS passed to gr.Blocks: page background and font, primary button
# colours (normal and hover), and label text.
css = """
body, .gradio-container { background-color: #0a0a0a !important;
    font-family: 'Segoe UI', system-ui, sans-serif; }
.gr-button-primary { background-color: #f5a623 !important;
    color: #000 !important; font-weight: 700 !important; border: none !important; }
.gr-button-primary:hover { background-color: #d4941f !important; }
label { color: #aaa !important; font-size: 13px !important; }
"""
|
|
# Long Markdown bodies are kept at column 0 so their rendering does not depend
# on how deeply the code that uses them is nested.
# NOTE(review): the source text was mis-encoded (UTF-8 apparently read as a
# Greek single-byte code page). Dashes, "·", "²", "≈", "…" and the emoji whose
# trailing bytes survived were restored; the remaining decorative emoji
# (📊 📝 📅 🔍 📈 📋 📄) are best-effort — confirm against the original.
_HEADER_MD = """
# 📊 LinkedIn Job Posting Engagement Dashboard
**Which posting characteristics predict candidate engagement?**
*Assignment 2 — Regression, Classification & Clustering · LinkedIn Job Postings*
"""

_PREDICTOR_INTRO_MD = """
### Predict engagement for a new job posting
Fill in the posting details — the model will estimate views and classify
whether it is likely to reach **high engagement** (top 25%).
"""

_TAKEAWAYS_MD = """
---
**Key takeaways:**
💰 Salary transparency → ~90% more views |
📝 Sweet spot: 250–500 words |
📅 Post Tue–Thu |
👶 Entry-level = larger candidate pool |
📄 Contract roles outperform full-time
"""

_ABOUT_MD = """
## About This Project

**Dataset:** LinkedIn Job Postings — arshkon/linkedin-job-postings (Kaggle)
**Sample:** 30,000 rows from 123,850 · `random_state=42`
**Target:** `views` — job posting view count

---
### Models
| Model | Type | Metric |
|---|---|---|
| Random Forest (Tuned) | Regression | RMSE_log 0.8347 · R² 0.081 |
| Decision Tree | Classification | Highest F1 + Recall for Class 1 |

### All 30 features
| Group | Features |
|---|---|
| Text length | title_length, title_word_count, description_length, description_word_count |
| Text structure | description_density, title_desc_ratio |
| Salary | salary_midpoint, salary_range, has_salary_info, salary_log |
| Role keywords | is_senior_role, is_entry_role, is_software_role, is_data_role, is_manager_role, is_sales_role, is_marketing_role |
| Interactions | desc_salary_interaction, senior_salary, weekend_remote, title_desc_word_interaction, salary_density_interaction, salary_description_interaction, title_density_interaction |
| Clustering | cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 |

---
### Clustering
KMeans k=6 · Silhouette score 0.289 · Fit on training data only
k=7/8 rejected — produced near-singleton clusters

---
### Limitations
- R²≈0.08 reflects unobservable platform factors
- Associations shown, not causal relationships
- High-engagement threshold is a business rule (top 25%), not a natural label
- Cluster labels in the predictor are interpretive approximations
"""


def _render_eda_charts():
    """Build all seven EDA figures in the order of the Plot outputs they fill."""
    return (
        eda_salary_chart(), eda_description_chart(),
        eda_dayofweek_chart(), eda_seniority_chart(),
        eda_worktype_chart(), eda_cluster_chart(),
        eda_model_chart(),
    )


with gr.Blocks(css=css, title="LinkedIn Engagement Dashboard",
               theme=gr.themes.Base(primary_hue="orange", neutral_hue="gray")) as demo:

    gr.Markdown(_HEADER_MD)

    with gr.Tabs():

        # Tab 1: interactive predictor
        with gr.Tab("🎯 Engagement Predictor"):
            gr.Markdown(_PREDICTOR_INTRO_MD)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### 📝 Content")
                    title_word_count = gr.Slider(1, 20, value=5, step=1,
                                                 label="Title word count")
                    description_word_count = gr.Slider(10, 1200, value=350, step=10,
                                                       label="Description word count")
                    has_salary = gr.Checkbox(value=True,
                                             label="Salary information disclosed")
                    salary_midpoint = gr.Slider(0, 300000, value=85000, step=5000,
                                                label="Salary midpoint ($)")

                with gr.Column(scale=1):
                    gr.Markdown("#### 🏷️ Role Type")
                    is_senior = gr.Checkbox(value=False, label="Senior role")
                    is_entry = gr.Checkbox(value=False, label="Entry-level role")
                    is_software = gr.Checkbox(value=False, label="Software / Engineering role")
                    is_data = gr.Checkbox(value=False, label="Data / Analytics role")
                    is_manager = gr.Checkbox(value=False, label="Manager / Director role")
                    is_sales = gr.Checkbox(value=False, label="Sales role")
                    is_marketing = gr.Checkbox(value=False, label="Marketing role")

                with gr.Column(scale=1):
                    gr.Markdown("#### 📅 Timing, Location & Cluster")
                    is_remote = gr.Checkbox(value=False, label="Remote role")
                    posting_dayofweek = gr.Slider(0, 6, value=1, step=1,
                                                  label="Day posted (0=Mon … 6=Sun)")
                    gr.Markdown(
                        "<small style='color:#555'>**Cluster** — pick the segment that best "
                        "describes this posting. If unsure, use Cluster 0.</small>"
                    )
                    cluster_choice = gr.Dropdown(
                        choices=CLUSTER_LABELS,
                        value=CLUSTER_LABELS[0],
                        label="Posting cluster (KMeans k=6)",
                    )

            predict_btn = gr.Button("🔍 Predict Engagement", variant="primary", size="lg")

            with gr.Row():
                views_out = gr.Textbox(label="📈 Predicted Views", interactive=False)
                label_out = gr.Textbox(label="🏷️ Engagement Class", interactive=False)

            detail_out = gr.Textbox(label="📋 Interpretation & Tips", interactive=False, lines=5)
            gauge_out = gr.Plot(label="Result")

            # Input order must match predict_engagement's positional parameters.
            predict_btn.click(
                fn=predict_engagement,
                inputs=[
                    title_word_count, description_word_count,
                    has_salary, salary_midpoint,
                    is_senior, is_entry, is_software, is_data,
                    is_manager, is_sales, is_marketing,
                    is_remote, posting_dayofweek,
                    cluster_choice,
                ],
                outputs=[views_out, label_out, detail_out, gauge_out],
            )

            gr.Markdown(
                "> **Note:** Predictions use posting-level features only. "
                "Platform factors (LinkedIn algorithm, sponsored status, company followers) "
                "are unobservable and account for most real-world variance."
            )

        # Tab 2: static EDA charts
        with gr.Tab("📊 EDA Dashboard"):
            gr.Markdown("### Key Findings from Exploratory Data Analysis")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q1 — Salary transparency vs views**")
                    salary_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q2 — Description length vs views**")
                    desc_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q3 — Day of week vs views**")
                    day_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**Q4 — Seniority level vs views**")
                    seniority_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Q5 — Work type vs views**")
                    worktype_plot = gr.Plot()
                with gr.Column():
                    gr.Markdown("**KMeans k=6 cluster distribution**")
                    cluster_plot = gr.Plot()

            with gr.Row():
                with gr.Column():
                    gr.Markdown("**Regression model comparison**")
                    model_plot = gr.Plot()

            gr.Markdown(_TAKEAWAYS_MD)

            # Fill the seven plots via the app's load event; the output order
            # matches the tuple returned by _render_eda_charts().
            demo.load(
                fn=_render_eda_charts,
                outputs=[salary_plot, desc_plot, day_plot,
                         seniority_plot, worktype_plot, cluster_plot,
                         model_plot],
            )

        # Tab 3: project description
        with gr.Tab("ℹ️ About"):
            gr.Markdown(_ABOUT_MD)


demo.launch()
|
|