import os, re, json, time, pickle, random, traceback
import numpy as np
import pandas as pd
import gradio as gr
import plotly.graph_objects as go
import requests as req
from pathlib import Path
from typing import Tuple
# huggingface_hub is optional: when the import fails the LLM features are
# switched off below and the chat falls back to keyword replies.
try:
    from huggingface_hub import InferenceClient
except Exception:
    InferenceClient = None

# =========================================================
# CONFIG
# =========================================================


def _env(name, default=""):
    """Return environment variable *name* (or *default*), whitespace-stripped."""
    return os.environ.get(name, default).strip()


# Directory containing this file; data files and models are resolved against it.
BASE_DIR = Path(__file__).resolve().parent

HF_API_KEY = _env("HF_API_KEY")
MODEL_NAME = _env("MODEL_NAME", "deepseek-ai/DeepSeek-R1")
HF_PROVIDER = _env("HF_PROVIDER", "novita")

# Webhook endpoints; an empty value disables the corresponding automation.
N8N_SALARY_PREDICTION_URL = _env("N8N_SALARY_PREDICTION_URL")
N8N_HR_FEEDBACK_URL = _env("N8N_HR_FEEDBACK_URL")
N8N_ANOMALY_DETECTION_URL = _env("N8N_ANOMALY_DETECTION_URL")

# The LLM is usable only with both an API key and the client library.
LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
if LLM_ENABLED:
    llm_client = InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
else:
    llm_client = None

# Shared chart palette and accent colours.
P = [
    "#45FFCA",
    "#D09CFA",
    "#FF9B9B",
    "#F875AA",
    "#3EDBF0",
    "#F2C637",
    "#7c5cbf",
    "#2ec4a0",
]
GOLD = "#F2C637"
PURPLE = "#28096d"
# =========================================================
# DATA
# =========================================================
# Process-wide cache of loaded/generated DataFrames, keyed by short names.
_CACHE = {}


def _load_csv(f):
    """Read ``BASE_DIR / f`` as CSV.

    Args:
        f: File name relative to ``BASE_DIR``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file does not
        exist or cannot be read. Callers use ``.empty`` to decide whether to
        fall back to another source.
    """
    p = BASE_DIR / f
    if not p.exists():
        return pd.DataFrame()
    try:
        return pd.read_csv(p)
    except Exception as exc:
        # Best-effort load: keep the empty-frame fallback, but say why the
        # file was skipped instead of swallowing the error silently.
        print(f"[data] could not read {p.name}: {exc}")
        return pd.DataFrame()
def _gen_base(n=500):
    """Generate a reproducible synthetic employee dataset.

    All randomness comes from one generator seeded with 42, so repeated calls
    with the same *n* return identical frames (the ``salary_growth`` column
    previously used the unseeded global NumPy RNG and differed on every call).

    Args:
        n: Number of employee rows to generate.

    Returns:
        DataFrame with columns ``Age``, ``Gender``, ``Education Level``,
        ``Job Title``, ``Years of Experience``, ``Salary``, ``career_tier``
        and ``salary_growth``.
    """
    rng = np.random.default_rng(42)
    edu_levels = ["Bachelor's", "Master's", "PhD"]
    job_pool = {
        "Bachelor's": ["Data Analyst", "Junior Engineer", "Business Analyst", "HR Specialist", "Marketing Manager"],
        "Master's": ["Data Scientist", "Senior Engineer", "Product Manager", "Financial Analyst", "ML Engineer"],
        "PhD": ["Director of Analytics", "Research Scientist", "VP of Engineering", "Chief Data Officer", "AI Researcher"],
    }
    base_by_edu = {"Bachelor's": 52000, "Master's": 68000, "PhD": 88000}
    edu_rank = {"Bachelor's": 1, "Master's": 2, "PhD": 3}
    columns = ["Age", "Gender", "Education Level", "Job Title", "Years of Experience", "Salary"]

    rows = []
    for _ in range(n):
        # The order of the draws is kept stable so the seeded base columns
        # do not change.
        edu = rng.choice(edu_levels, p=[0.55, 0.35, 0.10])
        exp = int(rng.integers(0, 35))
        age = int(np.clip(22 + exp + rng.integers(-2, 5), 22, 65))
        gender = rng.choice(["Male", "Female"], p=[0.52, 0.48])
        job = rng.choice(job_pool[edu])
        sal = base_by_edu[edu] + exp * 3100 + (age - 22) * 180 + float(rng.normal(0, 6000))
        if gender == "Male":
            sal *= float(rng.uniform(1.0, 1.06))
        sal = max(25000, int(sal))  # salary floor
        rows.append({
            "Age": age,
            "Gender": gender,
            "Education Level": edu,
            "Job Title": job,
            "Years of Experience": exp,
            "Salary": sal,
        })
    df = pd.DataFrame(rows, columns=columns)

    def tier(edu, exp):
        s = edu_rank.get(edu, 1)
        if exp >= 15 and s >= 2:
            return "senior"
        if exp >= 7 or s == 3:
            return "mid"
        return "junior"

    df["career_tier"] = [
        tier(edu, exp)
        for edu, exp in zip(df["Education Level"], df["Years of Experience"])
    ]

    # Growth is a tier-dependent share of salary plus noise from the seeded rng.
    share = df["career_tier"].map({"senior": 0.15, "mid": 0.30, "junior": 0.50})
    noise = rng.normal(0, 2000, size=len(df))
    df["salary_growth"] = (df["Salary"] * share + noise).astype(int)
    return df
def _gen_progression(base_df):
    """Build a synthetic 2020-2024 salary history for every row of *base_df*.

    Each employee ramps linearly from a tier-dependent share of the current
    salary (senior 0.85, mid 0.70, otherwise 0.50) up to 1.0 across five
    years, with Gaussian noise from the global NumPy RNG and a 20000 floor.

    Args:
        base_df: Frame with ``career_tier`` and ``Salary`` columns.

    Returns:
        DataFrame with ``career_tier``, ``year`` and ``salary_that_year``.
    """
    starting_share = {"senior": 0.85, "mid": 0.70}
    calendar_years = range(2020, 2025)
    records = []
    for _, person in base_df.iterrows():
        tier_name = person["career_tier"]
        anchor = person["Salary"]
        curve = np.linspace(starting_share.get(tier_name, 0.50), 1.0, 5)
        for share, year in zip(curve, calendar_years):
            noisy = anchor * (share + np.random.normal(0, 0.03))
            records.append({
                "career_tier": tier_name,
                "year": year,
                "salary_that_year": int(np.clip(noisy, 20000, None)),
            })
    return pd.DataFrame(records)
def _gen_feedback(base_df):
    """Attach two randomly sampled canned feedback comments to each employee.

    Args:
        base_df: Frame with ``career_tier`` and ``Salary`` columns.

    Returns:
        DataFrame with ``career_tier``, ``feedback_comment`` and ``Salary``;
        two rows per input row, comments drawn from that tier's pool.
    """
    comment_pools = {
        "senior": [
            "Consistently exceeds expectations.",
            "Key leader and mentor.",
            "Delivers high-impact results.",
            "Trusted advisor to management.",
        ],
        "mid": [
            "Solid contributor, meets targets.",
            "Shows initiative and grows.",
            "Dependable team member.",
            "Takes ownership of projects.",
        ],
        "junior": [
            "Eager learner, developing skills.",
            "Shows promise under coaching.",
            "Making good progress.",
            "Improving steadily with support.",
        ],
    }
    records = []
    for _, person in base_df.iterrows():
        tier_name = person["career_tier"]
        picked = random.sample(comment_pools[tier_name], 2)
        records.extend(
            {"career_tier": tier_name, "feedback_comment": text, "Salary": person["Salary"]}
            for text in picked
        )
    return pd.DataFrame(records)
def get_base_df():
    """Return the cached base employee DataFrame, building it on first use.

    Sources are tried in order: ``employee_analysis_ready.csv``,
    ``Salary_Data.csv``, then the synthetic generator. A ``career_tier``
    column is derived from education and experience when the source lacks it.
    """
    if "base" in _CACHE:
        return _CACHE["base"]

    frame = _load_csv("employee_analysis_ready.csv")
    if frame.empty:
        frame = _load_csv("Salary_Data.csv")
    if frame.empty:
        frame = _gen_base()

    if "career_tier" not in frame.columns:
        education_rank = {"Bachelor's": 1, "Master's": 2, "PhD": 3}

        def _tier_of(record):
            rank = education_rank.get(record.get("Education Level", ""), 1)
            experience = record.get("Years of Experience", 0)
            if experience >= 15 and rank >= 2:
                return "senior"
            if experience >= 7 or rank == 3:
                return "mid"
            return "junior"

        frame["career_tier"] = frame.apply(_tier_of, axis=1)

    _CACHE["base"] = frame
    return frame
def get_prog_df():
    """Return the cached salary-progression DataFrame.

    Loaded from ``synthetic_salary_progression.csv`` when that yields a
    non-empty frame, otherwise generated from the base employee data.
    """
    if "prog" in _CACHE:
        return _CACHE["prog"]
    frame = _load_csv("synthetic_salary_progression.csv")
    if frame.empty:
        frame = _gen_progression(get_base_df())
    _CACHE["prog"] = frame
    return frame
def get_feed_df():
    """Return the cached employee-feedback DataFrame.

    Loaded from ``synthetic_employee_feedback.csv`` when that yields a
    non-empty frame, otherwise generated from the base employee data.
    """
    if "feed" in _CACHE:
        return _CACHE["feed"]
    frame = _load_csv("synthetic_employee_feedback.csv")
    if frame.empty:
        frame = _gen_feedback(get_base_df())
    _CACHE["feed"] = frame
    return frame
# =========================================================
# HELPERS
# =========================================================
def get_tier(age, exp, edu):
    """Classify a profile as ``"junior"``, ``"mid"`` or ``"senior"``.

    Args:
        age: Accepted for call-site symmetry; not used by the rule.
        exp: Years of experience.
        edu: Education level; unknown values rank like a Bachelor's.

    Returns:
        ``"senior"`` for 15+ years with at least a Master's, ``"mid"`` for
        7+ years or any PhD, otherwise ``"junior"``.
    """
    rank = {"Bachelor's": 1, "Master's": 2, "PhD": 3}.get(edu, 1)
    is_senior = exp >= 15 and rank >= 2
    is_mid = exp >= 7 or rank == 3
    if is_senior:
        return "senior"
    return "mid" if is_mid else "junior"
def get_exp_group(exp):
    """Map years of experience to its bucket label.

    Buckets are inclusive on the upper bound: ``"0-5 years"``,
    ``"6-10 years"``, ``"11-15 years"``, ``"16-20 years"``; anything above
    20 is ``"20+"``.
    """
    buckets = (
        (5, "0-5 years"),
        (10, "6-10 years"),
        (15, "11-15 years"),
        (20, "16-20 years"),
    )
    for upper_bound, label in buckets:
        if exp <= upper_bound:
            return label
    return "20+"
def _layout(**kw):
    """Return the shared Plotly layout settings, overridden by *kw*.

    Overrides replace whole top-level keys (a passed ``title`` dict replaces
    the default title entry rather than being merged into it).
    """
    defaults = {
        "template": "plotly_white",
        "paper_bgcolor": "rgba(255,255,255,0.97)",
        "plot_bgcolor": "rgba(255,255,255,0.99)",
        "font": {"family": "system-ui,sans-serif", "color": "#1a0a3d", "size": 12},
        "margin": {"l": 50, "r": 20, "t": 55, "b": 45},
        "legend": {
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
            "bgcolor": "rgba(255,255,255,0.9)",
            "bordercolor": "rgba(40,9,109,0.15)",
            "borderwidth": 1,
        },
        "title": {"font": {"size": 14, "color": "#28096d"}},
    }
    return {**defaults, **kw}
def _n8n_as_dict(data):
    """Normalise a decoded webhook body to the dict shape callers expect.

    A non-empty list is unwrapped to its first element; anything that is
    still not a dict becomes an ``{"error": ...}`` result so callers can keep
    using ``"error" in result`` and ``result.get(...)`` safely.
    """
    if isinstance(data, list) and data:
        data = data[0]
    if isinstance(data, dict):
        return data
    return {"error": f"Unexpected response type: {type(data).__name__}"}


def _n8n(url, payload):
    """POST *payload* as JSON to a webhook and return its decoded reply.

    Args:
        url: Webhook URL; a falsy value means the automation is not configured.
        payload: JSON-serialisable request body.

    Returns:
        ``None`` when *url* is falsy; otherwise a dict. Network failures,
        HTTP error statuses, undecodable bodies and unexpected body shapes
        are all reported as ``{"error": "<message>"}`` instead of raising.
    """
    if not url:
        return None
    try:
        r = req.post(url, json=payload, timeout=15)
        # Without this an HTTP 4xx/5xx JSON body would be treated as a
        # successful result by the callers.
        r.raise_for_status()
        data = r.json()
    except Exception as e:
        return {"error": str(e)}
    return _n8n_as_dict(data)
# =========================================================
# PREDICTION MODEL
# =========================================================
def _load_model():
    """Load the pickled model and its training column list, if present.

    Looks for ``rf_model.pkl`` with a sibling ``train_columns.pkl`` in
    ``BASE_DIR`` and then in ``BASE_DIR / "model"``.

    Returns:
        ``(model, train_columns)`` from the first location where both files
        exist and unpickle successfully, otherwise ``(None, None)``.
    """
    for mp in (BASE_DIR / "rf_model.pkl", BASE_DIR / "model" / "rf_model.pkl"):
        cp = mp.parent / "train_columns.pkl"
        if not (mp.exists() and cp.exists()):
            continue
        try:
            # NOTE(security): pickle.load can execute arbitrary code; these
            # files must only ever come from a trusted source.
            with open(mp, "rb") as f:
                m = pickle.load(f)
            with open(cp, "rb") as f:
                c = pickle.load(f)
        except Exception as exc:
            # Keep the fallback behaviour, but report why this candidate was
            # skipped instead of hiding the failure.
            print(f"[model] could not load {mp}: {exc}")
            continue
        return m, c
    return None, None


RF_MODEL, TRAIN_COLS = _load_model()
def _build_model_input(age, exp, edu, tier, exp_group, train_cols):
    """Return a one-row feature frame with exactly the columns in *train_cols*.

    The categorical fields are one-hot encoded WITHOUT ``drop_first``: on a
    single row every category is "first", so ``drop_first=True`` removed all
    dummy columns and the tier / experience-group features were always 0.
    Columns absent from *train_cols* are discarded by the reindex and
    training columns not produced here are filled with 0.
    """
    raw = pd.DataFrame([{
        "Age": age,
        "Years of Experience": exp,
        "Education Level_Master's": 1 if edu == "Master's" else 0,
        "Education Level_PhD": 1 if edu == "PhD" else 0,
        "salary_growth": exp * 3500,
        "vader_score": 0.3 if tier == "senior" else 0.15 if tier == "mid" else 0.05,
        "Experience Group": exp_group,
        "career_tier": tier,
    }])
    encoded = pd.get_dummies(raw, columns=["Experience Group", "career_tier"], dtype=int)
    return encoded.reindex(columns=list(train_cols), fill_value=0)


def predict_local(age, exp, edu, gender):
    """Estimate a salary with the loaded model, or a formula when none is loaded.

    Args:
        age: Age in years.
        exp: Years of experience.
        edu: Education level label.
        gender: Accepted for interface compatibility; not used here.

    Returns:
        Dict with ``predicted``, ``low`` (92%), ``high`` (108%), ``tier``,
        ``exp_group`` and ``source``. A random jitter of up to ±1% is applied
        to the estimate, so repeated calls differ slightly.
    """
    tier = get_tier(age, exp, edu)
    exp_group = get_exp_group(exp)
    # Explicit None/length checks: truth-testing a pandas Index or NumPy
    # array of column names raises ValueError.
    have_model = (
        RF_MODEL is not None
        and TRAIN_COLS is not None
        and len(TRAIN_COLS) > 0
    )
    if have_model:
        inp = _build_model_input(age, exp, edu, tier, exp_group, TRAIN_COLS)
        predicted = float(RF_MODEL.predict(inp)[0])
        source = "Random Forest Model"
    else:
        base = {"Bachelor's": 55000, "Master's": 72000, "PhD": 92000}.get(edu, 55000)
        predicted = base + (exp * 3200) + ((age - 22) * 250)
        predicted = max(28000, predicted)
        source = "Formula Estimate"
    predicted += predicted * random.uniform(-0.01, 0.01)
    return {
        "predicted": round(predicted),
        "low": round(predicted * 0.92),
        "high": round(predicted * 1.08),
        "tier": tier,
        "exp_group": exp_group,
        "source": source,
    }
def project_salary(base_salary, exp, edu, tier, years=5):
    """Compound a salary forward year by year, with periodic promotion bumps.

    Each year applies the tier's growth rate; every third projected year adds
    a promotion bump. Tier changes take effect from the following year's
    rates: a junior with 7+ years of experience moves to mid in a promotion
    year, and a mid with 15+ years moves to senior.

    Args:
        base_salary: Starting salary.
        exp: Current years of experience.
        edu: Accepted for interface compatibility; not used here.
        tier: Starting career tier.
        years: Number of years to project.

    Returns:
        List of dicts with ``year`` (2025 + offset), rounded ``salary``,
        ``tier`` after that year's transition, and a ``promotion`` flag.
    """
    annual_growth = {"senior": 0.04, "mid": 0.06, "junior": 0.09}
    promotion_raise = {"junior": 0.12, "mid": 0.10, "senior": 0.07}

    timeline = []
    salary = base_salary
    level = tier
    for offset in range(1, years + 1):
        tenure = exp + offset
        review_year = offset % 3 == 0
        # Rates are read before any tier change made in this same year.
        rate = annual_growth.get(level, 0.05)
        bump = promotion_raise.get(level, 0.08) if review_year else 0
        if review_year and level == "junior" and tenure >= 7:
            level = "mid"
        elif level == "mid" and tenure >= 15:
            level = "senior"
        salary = salary * (1 + rate + bump)
        timeline.append({
            "year": 2025 + offset,
            "salary": round(salary),
            "tier": level,
            "promotion": bump > 0,
        })
    return timeline
# =========================================================
# TAB 1: SALARY PREDICTOR
# =========================================================
def run_predictor(age, exp, edu, gender, job_title, name, feedback):
    """Predict a salary, apply the optional n8n automations and build Tab 1 outputs.

    Flow: local prediction -> Automation 1 may replace the figure ->
    Automation 2 may apply a percentage adjustment from feedback sentiment ->
    5-year projection -> charts and summary strings.

    Values coming back from the webhooks are coerced to numbers (falling back
    to the local defaults) so a missing, ``null`` or string-typed field does
    not crash the UI callback.

    Returns:
        ``(fig_gauge, fig_proj, result_html, proj_table)``.
    """

    def _num(value, default):
        """Coerce a webhook value to float, or return *default*."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _usable(result):
        """True for a non-empty dict reply that carries no "error" key."""
        return isinstance(result, dict) and bool(result) and "error" not in result

    age, exp = int(age), int(exp)
    res = predict_local(age, exp, edu, gender)
    predicted = res["predicted"]
    low, high = res["low"], res["high"]
    tier = res["tier"]

    # Call Automation 1 (n8n salary prediction)
    auto1_result = _n8n(N8N_SALARY_PREDICTION_URL, {
        "age": age, "years_of_experience": exp,
        "education_level": edu, "gender": gender, "job_title": job_title,
    })
    if isinstance(auto1_result, dict):
        remote_salary = _num(auto1_result.get("predicted_salary"), None)
        if remote_salary is not None:
            predicted = remote_salary
            low = round(predicted * 0.92)
            high = round(predicted * 1.08)

    # Call Automation 2 (HR feedback)
    auto2_result = _n8n(N8N_HR_FEEDBACK_URL, {
        "name": name or "Employee", "career_tier": tier, "salary": predicted,
        "years_of_experience": exp, "education_level": edu,
        "feedback": feedback or "No feedback provided",
    })
    sentiment_adjustment = 0
    sentiment_label = "Neutral"
    sentiment_score = 0.5
    if _usable(auto2_result):
        sentiment_adjustment = _num(auto2_result.get("salary_adjustment", 0), 0)
        sentiment_score = _num(auto2_result.get("sentiment_score", 0.5), 0.5)
        alert = str(auto2_result.get("alert") or "").upper()
        if "POSITIVE" in alert:
            sentiment_label = "Positive 😊"
        elif "NEGATIVE" in alert:
            sentiment_label = "Negative 😟"
        else:
            sentiment_label = "Neutral 😐"
        if sentiment_adjustment:
            # The adjustment is a percentage of the current prediction.
            predicted = round(predicted * (1 + sentiment_adjustment / 100))
            low = round(predicted * 0.92)
            high = round(predicted * 1.08)

    # Salary projection
    projections = project_salary(predicted, exp, edu, tier)

    # Gauge chart
    fig_gauge = go.Figure(go.Indicator(
        mode="gauge+number", value=predicted,
        number={"prefix": "$", "valueformat": ",.0f", "font": {"size": 32, "color": PURPLE}},
        gauge={
            "axis": {"range": [25000, 250000], "tickprefix": "$", "tickformat": ",.0f", "tickfont": {"size": 9}},
            "bar": {"color": GOLD, "thickness": 0.3},
            "bgcolor": "white",
            "borderwidth": 0,
            "steps": [
                {"range": [25000, 80000], "color": "#e8f5f0"},
                {"range": [80000, 140000], "color": "#d4ecff"},
                {"range": [140000, 250000], "color": "#ede8ff"},
            ],
            "threshold": {"line": {"color": "#FF9B9B", "width": 3}, "thickness": 0.8, "value": high},
        },
        title={"text": "Predicted Annual Salary", "font": {"size": 13, "color": PURPLE}},
    ))
    fig_gauge.update_layout(height=280, paper_bgcolor="rgba(255,255,255,0.97)", margin=dict(l=30, r=30, t=60, b=10))

    # Projection chart
    years = [p["year"] for p in projections]
    salaries = [p["salary"] for p in projections]
    promotions = [p["promotion"] for p in projections]
    fig_proj = go.Figure()
    fig_proj.add_trace(go.Scatter(
        x=[2025] + years, y=[predicted] + salaries,
        mode="lines", name="Salary Trajectory",
        line=dict(color="#45FFCA", width=3),
        fill="tozeroy", fillcolor="rgba(69,255,202,0.1)",
    ))
    for yr, sal, promo in zip(years, salaries, promotions):
        if promo:
            fig_proj.add_trace(go.Scatter(
                x=[yr], y=[sal], mode="markers+text",
                marker=dict(color="#F2C637", size=14, symbol="star"),
                text=["Promotion!"], textposition="top center",
                textfont=dict(size=10, color="#F2C637"),
                showlegend=False,
                # Plotly hover text uses <br> for line breaks; the literal
                # was previously split across physical lines.
                hovertemplate=f"Year: {yr}<br>Salary: ${sal:,.0f}<br>⭐ Promotion",
            ))
    fig_proj.add_annotation(x=2025, y=predicted, text=f"Now: ${predicted:,.0f}",
                            showarrow=True, arrowhead=2, font=dict(size=10, color=PURPLE))
    fig_proj.update_layout(**_layout(height=300, showlegend=False,
                                     title=dict(text="📈 5-Year Salary Projection"),
                                     xaxis=dict(tickformat="d"), yaxis=dict(tickprefix="$", tickformat=",.0f")))

    # Build automation status text
    tier_emoji = {"junior": "🌱", "mid": "📈", "senior": "⭐"}.get(tier, "")
    adj_text = f"{sentiment_adjustment:+.1f}%" if sentiment_adjustment else "0%"
    if _usable(auto1_result):
        auto1_status = "✅ Connected"
    elif auto1_result:
        detail = auto1_result.get("error", "Error") if isinstance(auto1_result, dict) else "Unexpected response"
        auto1_status = "⚠️ " + str(detail)[:30]
    else:
        auto1_status = "⚙️ Not configured"
    if _usable(auto2_result):
        auto2_status = "✅ " + sentiment_label
    elif auto2_result:
        auto2_status = "⚠️ Error"
    else:
        auto2_status = "⚙️ Not configured"

    # NOTE(review): this text and the forecast below are rendered by gr.HTML
    # components but contain no markup — it looks like the HTML tags were
    # lost from these templates; confirm against the intended design.
    result_html = f"""
Career Tier
{tier_emoji} {tier.title()}
Range
${low:,.0f} – ${high:,.0f}
💬 Feedback Sentiment Impact
{sentiment_label}
Salary adj: {adj_text}
Score: {sentiment_score:.2f}
🔗 n8n Automation Status
AUTOMATION 1 — Salary Prediction
{auto1_status}
AUTOMATION 2 — HR Feedback Alert
{auto2_status}
"""

    # 5-year table: one line per projected year.
    proj_rows = "".join(
        f'| {p["year"]} | '
        f'${p["salary"]:,.0f} | '
        f'{p["tier"].title()} | '
        f'{"⭐ Promotion!" if p["promotion"] else "—"} | '
        "\n"
        for p in projections
    )
    proj_table = f"""
📅 5-Year Salary Forecast
| Year |
Projected Salary |
Tier |
Event |
{proj_rows}
"""
    return fig_gauge, fig_proj, result_html, proj_table
# =========================================================
# TAB 2: SALARY ANALYZER
# =========================================================
def _classify_deviation(dev):
    """Return ``(flag, severity)`` for a percentage deviation from expected pay.

    Beyond ±40% is ``"high"``, beyond ±20% is ``"medium"``, anything else is
    ``"normal"``.
    """
    if dev > 40:
        return "OVERPAID", "high"
    if dev > 20:
        return "SLIGHTLY OVERPAID", "medium"
    if dev < -40:
        return "UNDERPAID", "high"
    if dev < -20:
        return "SLIGHTLY UNDERPAID", "medium"
    return "NORMAL RANGE", "normal"


def run_analyzer(salary, age, exp, edu, gender, job_title):
    """Compare a salary with the expected figure and build the Tab 2 outputs.

    Automation 3 (n8n anomaly detection) is used when it returns a usable
    reply; otherwise the deviation is classified locally. Numeric fields from
    the webhook are coerced and fall back to the local calculation when
    missing or malformed.

    Args:
        salary: Current annual salary; must be provided.
        age, exp, edu: Profile used for the expected-salary formula.
        gender, job_title: Accepted from the UI; not used here.

    Returns:
        ``(fig_dev, fig_compare, result_html)``.

    Raises:
        gr.Error: When *salary* is empty (previously a bare ``TypeError``).
    """

    def _num(value, default):
        """Coerce a webhook value to float, or return *default*."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    if salary is None:
        raise gr.Error("Please enter your current annual salary.")
    salary = float(salary)
    age, exp = int(age), int(exp)

    # Call Automation 3
    auto3_result = _n8n(N8N_ANOMALY_DETECTION_URL, {
        "salary": salary, "age": age,
        "years_of_experience": exp, "education_level": edu,
    })

    # Local anomaly calculation as fallback
    base_expected = {"Bachelor's": 55000, "Master's": 72000, "PhD": 92000}.get(edu, 55000)
    expected = base_expected + (exp * 3200) + ((age - 22) * 250)
    deviation = ((salary - expected) / expected) * 100

    if isinstance(auto3_result, dict) and auto3_result and "error" not in auto3_result:
        flag = auto3_result.get("flag", "NORMAL RANGE")
        severity = auto3_result.get("severity", "normal")
        dev = _num(auto3_result.get("deviation_pct"), round(deviation, 1))
        expected_sal = _num(auto3_result.get("expected_salary"), round(expected))
        auto3_status = "✅ Connected"
    else:
        dev = round(deviation, 1)
        expected_sal = round(expected)
        flag, severity = _classify_deviation(dev)
        auto3_status = "⚙️ Using local calculation" if not auto3_result else "⚠️ Error"

    # Color theme based on result
    sev_color = {"high": "#FF9B9B", "medium": "#F2C637", "normal": "#45FFCA"}.get(severity, "#45FFCA")
    sev_icon = {"high": "🚨", "medium": "⚠️", "normal": "✅"}.get(severity, "✅")

    # Comparison bars
    fig_compare = go.Figure()
    fig_compare.add_trace(go.Bar(
        x=["Your Salary", "Market Expected"],
        y=[salary, expected_sal],
        marker_color=[sev_color, "#45FFCA"],
        text=[f"${salary:,.0f}", f"${expected_sal:,.0f}"],
        textposition="outside",
        hovertemplate="%{x}: $%{y:,.0f}",
    ))
    fig_compare.update_layout(**_layout(height=320, showlegend=False,
                                        title=dict(text="Your Salary vs Market Expected"),
                                        yaxis=dict(tickprefix="$", tickformat=",.0f")))

    # Deviation gauge
    fig_dev = go.Figure(go.Indicator(
        mode="gauge+number+delta", value=dev,
        number={"suffix": "%", "valueformat": ".1f", "font": {"size": 28, "color": PURPLE}},
        delta={"reference": 0, "valueformat": ".1f", "suffix": "%"},
        gauge={
            "axis": {"range": [-60, 60], "ticksuffix": "%"},
            "bar": {"color": sev_color, "thickness": 0.3},
            "bgcolor": "white",
            "borderwidth": 0,
            "steps": [
                {"range": [-60, -20], "color": "rgba(255,155,155,0.3)"},
                {"range": [-20, 20], "color": "rgba(69,255,202,0.2)"},
                {"range": [20, 60], "color": "rgba(208,156,250,0.3)"},
            ],
            "threshold": {"line": {"color": "red", "width": 3}, "thickness": 0.8, "value": 40 if dev > 0 else -40},
        },
        title={"text": "Salary Deviation from Market", "font": {"size": 13, "color": PURPLE}},
    ))
    fig_dev.update_layout(height=280, paper_bgcolor="rgba(255,255,255,0.97)", margin=dict(l=30, r=30, t=60, b=10))

    # Percentile estimation: share of dataset salaries strictly below this one.
    df = get_base_df()
    if "Salary" in df.columns:
        percentile = round((df["Salary"] < salary).mean() * 100)
    else:
        percentile = 50

    if dev > 30:
        recommendation = 'Your salary is significantly above market. Ensure your performance justifies this compensation.'
    elif dev > 10:
        recommendation = 'Your salary is slightly above market — you are in a strong position.'
    elif abs(dev) <= 10:
        recommendation = 'Your salary is within market norms. You are fairly compensated.'
    elif dev > -30:
        recommendation = 'Your salary is slightly below market. Consider negotiating a raise.'
    else:
        recommendation = 'Your salary is significantly below market. You may be underpaid — seek negotiation or new opportunities.'

    # NOTE(review): rendered by a gr.HTML component but contains no markup —
    # the tags appear to have been lost from this template; confirm.
    result_html = f"""
{sev_icon}
{flag}
Salary anomaly detection result
Your Salary
${salary:,.0f}
Market Expected
${expected_sal:,.0f}
Percentile
Top {100-percentile}%
💡 Recommendation
{recommendation}
🔗 AUTOMATION 3 — Anomaly Detection
{auto3_status}
"""
    return fig_dev, fig_compare, result_html
# =========================================================
# TAB 3: INSIGHTS (Dashboard + AI)
# =========================================================
def build_edu_chart():
    """Bar chart of mean salary per education level (Bachelor's, Master's, PhD).

    Returns an empty figure when the base data lacks the ``Education Level``
    or ``Salary`` column.
    """
    df = get_base_df()
    if "Education Level" not in df.columns or "Salary" not in df.columns:
        return go.Figure()
    grp = df.groupby("Education Level")["Salary"].mean().reset_index()
    order = ["Bachelor's", "Master's", "PhD"]
    # An ordered categorical makes the sort follow degree order, not alphabet.
    grp["Education Level"] = pd.Categorical(grp["Education Level"], categories=order, ordered=True)
    grp = grp.sort_values("Education Level")
    fig = go.Figure(go.Bar(
        x=grp["Education Level"],
        y=grp["Salary"],
        marker_color=P[:3],
        text=[f"${v:,.0f}" for v in grp["Salary"]],
        textposition="outside",
        # <br> is Plotly's hover line break; the literal used to be split
        # across two physical lines.
        hovertemplate="%{x}<br>$%{y:,.0f}",
    ))
    fig.update_layout(**_layout(height=320, showlegend=False,
                                title=dict(text="Avg Salary by Education"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig
def build_exp_chart():
    """Bar chart of mean salary per experience bucket.

    Returns an empty figure when the base data lacks the
    ``Years of Experience`` or ``Salary`` column.
    """
    df = get_base_df().copy()  # copy: a helper column is added below
    if "Years of Experience" not in df.columns or "Salary" not in df.columns:
        return go.Figure()
    df["eg"] = df["Years of Experience"].apply(get_exp_group)
    grp = df.groupby("eg")["Salary"].mean().reset_index()
    order = ["0-5 years", "6-10 years", "11-15 years", "16-20 years", "20+"]
    grp["eg"] = pd.Categorical(grp["eg"], categories=order, ordered=True)
    grp = grp.sort_values("eg")
    fig = go.Figure(go.Bar(
        x=grp["eg"],
        y=grp["Salary"],
        marker_color=P[:5],
        text=[f"${v:,.0f}" for v in grp["Salary"]],
        textposition="outside",
        # <br> is Plotly's hover line break; the literal used to be split
        # across two physical lines.
        hovertemplate="%{x}<br>$%{y:,.0f}",
    ))
    fig.update_layout(**_layout(height=320, showlegend=False,
                                title=dict(text="Avg Salary by Experience"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig
def build_gender_chart():
    """Bar chart of mean salary per gender.

    Returns an empty figure when the base data lacks the ``Gender`` or
    ``Salary`` column.
    """
    df = get_base_df()
    if "Gender" not in df.columns or "Salary" not in df.columns:
        return go.Figure()
    grp = df.groupby("Gender")["Salary"].mean().reset_index()
    fig = go.Figure(go.Bar(
        x=grp["Gender"],
        y=grp["Salary"],
        marker_color=[P[0], P[2]],
        text=[f"${v:,.0f}" for v in grp["Salary"]],
        textposition="outside",
        # <br> is Plotly's hover line break; the literal used to be split
        # across two physical lines.
        hovertemplate="%{x}<br>$%{y:,.0f}",
    ))
    fig.update_layout(**_layout(height=320, showlegend=False,
                                title=dict(text="Avg Salary by Gender"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig
def build_tier_chart():
    """Bar chart of mean salary per career tier (junior, mid, senior).

    Returns an empty figure when the base data lacks the ``career_tier`` or
    ``Salary`` column.
    """
    df = get_base_df()
    if "career_tier" not in df.columns or "Salary" not in df.columns:
        return go.Figure()
    grp = df.groupby("career_tier")["Salary"].mean().reset_index()
    order = ["junior", "mid", "senior"]
    grp["career_tier"] = pd.Categorical(grp["career_tier"], categories=order, ordered=True)
    grp = grp.sort_values("career_tier")
    fig = go.Figure(go.Bar(
        x=grp["career_tier"].str.title(),
        y=grp["Salary"],
        marker_color=[P[0], P[1], P[2]],
        text=[f"${v:,.0f}" for v in grp["Salary"]],
        textposition="outside",
        # <br> is Plotly's hover line break; the literal used to be split
        # across two physical lines.
        hovertemplate="%{x}<br>$%{y:,.0f}",
    ))
    fig.update_layout(**_layout(height=320, showlegend=False,
                                title=dict(text="Avg Salary by Career Tier"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig
def build_prog_chart():
    """Line chart of mean yearly salary per career tier.

    Returns an empty figure when the progression data is empty or has no
    ``year`` column.
    """
    df = get_prog_df()
    if df.empty or "year" not in df.columns:
        return go.Figure()
    avg = df.groupby(["year", "career_tier"])["salary_that_year"].mean().reset_index()
    fig = go.Figure()
    for i, t in enumerate(["junior", "mid", "senior"]):
        sub = avg[avg["career_tier"] == t]
        fig.add_trace(go.Scatter(
            x=sub["year"],
            y=sub["salary_that_year"],
            name=t.title(),
            mode="lines+markers",
            line=dict(color=P[i], width=2.5),
            marker=dict(size=7),
            # Doubled braces keep Plotly's %{x}/%{y} placeholders literal
            # inside the f-string; <br> is Plotly's hover line break.
            hovertemplate=f"{t.title()}<br>%{{x}}: $%{{y:,.0f}}",
        ))
    fig.update_layout(**_layout(height=340,
                                title=dict(text="Salary Progression by Career Tier (2020–2024)"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f"),
                                xaxis=dict(tickformat="d")))
    return fig
def build_sentiment_chart():
    """Stacked bar chart of feedback sentiment share (%) per career tier.

    Comments are scored with VADER's compound score (>= 0.05 positive,
    <= -0.05 negative, otherwise neutral). When VADER is unavailable or
    scoring fails, sentiment is assigned from the tier instead.
    Returns an empty figure when there is no feedback data.
    """
    df = get_feed_df().copy()
    if df.empty:
        return go.Figure()
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        ana = SentimentIntensityAnalyzer()
        df["score"] = df["feedback_comment"].apply(lambda x: ana.polarity_scores(str(x))["compound"])
        df["sentiment"] = df["score"].apply(
            lambda s: "positive" if s >= 0.05 else ("negative" if s <= -0.05 else "neutral")
        )
    except Exception:
        # Deliberate best-effort fallback (e.g. vaderSentiment not installed);
        # `Exception` rather than a bare except so Ctrl-C / SystemExit are
        # not swallowed.
        df["sentiment"] = df["career_tier"].map(
            {"senior": "positive", "mid": "positive", "junior": "neutral"}
        ).fillna("neutral")

    labels = ["negative", "neutral", "positive"]
    counts = df.groupby(["career_tier", "sentiment"]).size().unstack(fill_value=0).reset_index()
    for c in labels:
        if c not in counts.columns:
            counts[c] = 0
    # Convert raw counts to per-tier percentages.
    tot = counts[labels].sum(axis=1)
    for c in labels:
        counts[c] = (counts[c] / tot * 100).round(1)

    order = ["junior", "mid", "senior"]
    counts["career_tier"] = pd.Categorical(counts["career_tier"], categories=order, ordered=True)
    counts = counts.sort_values("career_tier")
    colors = {"negative": "#FF9B9B", "neutral": "#D3D1C7", "positive": "#45FFCA"}
    fig = go.Figure()
    for s in labels:
        fig.add_trace(go.Bar(
            x=counts["career_tier"].str.title(),
            y=counts[s],
            name=s.title(),
            marker_color=colors[s],
            text=counts[s].apply(lambda v: f"{v:.0f}%"),
            textposition="inside",
            hovertemplate=f"{s.title()}: %{{y:.1f}}%",
        ))
    fig.update_layout(**_layout(height=320, barmode="stack",
                                title=dict(text="Feedback Sentiment by Career Tier (%)"),
                                yaxis=dict(ticksuffix="%", range=[0, 100])))
    return fig
def build_scatter_chart():
    """Scatter of salary against years of experience, one trace per tier.

    Returns an empty figure when the base data lacks any of the
    ``Years of Experience``, ``Salary`` or ``career_tier`` columns.
    """
    df = get_base_df()
    required = ("Years of Experience", "Salary", "career_tier")
    if any(col not in df.columns for col in required):
        return go.Figure()
    tc = {"junior": P[0], "mid": P[1], "senior": P[2]}
    fig = go.Figure()
    for t in ["junior", "mid", "senior"]:
        sub = df[df["career_tier"] == t]
        fig.add_trace(go.Scatter(
            x=sub["Years of Experience"],
            y=sub["Salary"],
            mode="markers",
            name=t.title(),
            marker=dict(color=tc[t], size=5, opacity=0.65),
            # <br> is Plotly's hover line break; the literal used to be
            # split across two physical lines.
            hovertemplate="Exp: %{x}y<br>$%{y:,.0f}",
        ))
    fig.update_layout(**_layout(height=320,
                                title=dict(text="Experience vs Salary by Career Tier"),
                                xaxis=dict(title="Years of Experience"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig
# Chart names the assistant may request, mapped to their builder functions.
CHART_MAP = {
    "salary_education": build_edu_chart,
    "salary_experience": build_exp_chart,
    "gender_salary": build_gender_chart,
    "career_tier": build_tier_chart,
    "salary_progression": build_prog_chart,
    "sentiment": build_sentiment_chart,
    "scatter": build_scatter_chart,
}
def render_kpis():
    """Build the KPI summary string shown at the top of the Insights tab.

    Reports employee count, mean salary, mean experience and the number of
    career tiers; missing columns are reported as 0.
    """
    df = get_base_df()
    n = len(df)
    avg = df["Salary"].mean() if "Salary" in df.columns else 0
    avg_exp = df["Years of Experience"].mean() if "Years of Experience" in df.columns else 0

    def card(icon, label, value, color):
        # The literals here were split across physical lines (a syntax
        # error); they are rejoined with explicit "\n" escapes.
        # NOTE(review): `color` is unused and the output has no markup even
        # though it feeds a gr.HTML component — the card's HTML appears to
        # have been lost; confirm the intended template.
        return (
            f"\n{icon}\n"
            f"\n{label}\n"
            f"\n{value}\n"
        )

    html = (
        card("👥", "Employees", f"{n:,}", GOLD)
        + card("💰", "Avg Salary", f"${avg:,.0f}", "#45FFCA")
        + card("📅", "Avg Experience", f"{avg_exp:.1f} yrs", "#D09CFA")
        + card("🎯", "Career Tiers", "3", "#FF9B9B")
        + "\n"
    )
    return html
# System prompt sent as the first message of every LLM chat request built in
# ai_chat. It asks the model to finish with a fenced JSON object whose "chart"
# value names one of the CHART_MAP keys (or "none").
# NOTE(review): the example payload in the prompt has an empty "chart" value —
# it looks like a placeholder was lost from the text; confirm the intended
# example before changing it, since this string is sent to the model as-is.
AI_SYSTEM = """You are an AI HR analytics assistant for the F2 Salary Prediction project at ESCP Business School.
Dataset: Age, Gender, Education Level (Bachelor's/Master's/PhD), Job Title, Years of Experience, Salary, career_tier (junior/mid/senior), synthetic salary progression 2020-2024, VADER sentiment.
Key findings: PhD earns ~35% more than Bachelor's. Senior tier = highest stable salaries. Junior grows fastest. Gender gap ~4%. RF model ~92-95% accuracy.
Answer in 2-4 sentences with specific numbers. End with: ```json {"chart": ""}```
Chart options: salary_education | salary_experience | gender_salary | career_tier | salary_progression | sentiment | scatter | none"""
def _keyword_reply(msg):
    """Return a canned ``(reply, chart_name)`` chosen by keyword matching.

    Rules are checked in order against the lower-cased message using plain
    substring tests; the first rule with a matching keyword wins. The
    experience rule is skipped when the message also contains ``"prog"``.
    With no match, a help message and the chart name ``"none"`` are returned.
    """
    text = msg.lower()
    # (keywords, substring that vetoes the rule or None, reply, chart name)
    rules = (
        (("education", "degree", "bachelor", "master", "phd"), None,
         "PhD holders earn ~35% more than Bachelor's graduates. Education is one of the top salary predictors.",
         "salary_education"),
        (("gender", "male", "female", "gap"), None,
         "There is a ~4% gender pay gap, with male employees earning slightly more on average.",
         "gender_salary"),
        (("experience", "years", "exp"), "prog",
         "Employees with 20+ years earn nearly 3× those with 0-5 years. The steepest growth is in the first 10 years.",
         "salary_experience"),
        (("tier", "career", "junior", "mid", "senior"), None,
         "Senior employees earn the most and have stable salaries. Junior employees show the fastest growth rate.",
         "career_tier"),
        (("progress", "growth", "time", "2020", "trend"), None,
         "Junior employees grow from 50% to 100% of base salary over 5 years. Seniors maintain high stable salaries.",
         "salary_progression"),
        (("sentiment", "feedback", "comment", "review"), None,
         "Senior employees receive 90%+ positive feedback. There is a moderate positive correlation between sentiment and salary.",
         "sentiment"),
        (("scatter", "correlation", "relationship"), None,
         "Experience correlates strongly with salary across all tiers. The scatter shows clear tier separation.",
         "scatter"),
    )
    for keywords, veto, reply, chart in rules:
        if veto is not None and veto in text:
            continue
        if any(word in text for word in keywords):
            return (reply, chart)
    return (
        "Ask me about: education vs salary, experience trends, gender pay gap, career tier distributions, salary progression, or feedback sentiment.",
        "none",
    )
def ai_chat(user_msg, history):
    """Answer a chat message and optionally pick a chart to show.

    Uses the LLM when it is enabled; falls back to keyword replies when the
    LLM is disabled, fails, or returns no visible text.

    Args:
        user_msg: The user's message; blank input is a no-op.
        history: Prior chat messages; only dict entries with a ``"role"``
            key are kept.

    Returns:
        ``(updated_history, "", chart_or_None)`` — the empty string clears
        the input textbox.
    """
    if not user_msg or not user_msg.strip():
        return history or [], "", None

    safe_history = [item for item in (history or []) if isinstance(item, dict) and "role" in item]
    reply, chart_name = None, "none"
    if LLM_ENABLED:
        try:
            # Only the last 10 history entries are sent with the request.
            msgs = (
                [{"role": "system", "content": AI_SYSTEM}]
                + safe_history[-10:]
                + [{"role": "user", "content": user_msg}]
            )
            r = llm_client.chat_completion(model=MODEL_NAME, messages=msgs, temperature=0.3, max_tokens=400, stream=False)
            raw = r["choices"][0]["message"]["content"] if isinstance(r, dict) else r.choices[0].message.content
            raw = raw or ""
            hit = re.search(r'```json\s*(\{.*?\})\s*```', raw, re.DOTALL)
            if hit:
                chart_name = json.loads(hit.group(1)).get("chart", "none")
            reply = re.sub(r'```json.*?```', '', raw, flags=re.DOTALL).strip()
        except Exception:
            # Any LLM/parsing failure falls through to the keyword reply.
            # `Exception` (not a bare except) so interrupts still propagate.
            reply = None
    if not reply:
        # Covers: LLM disabled, LLM error, or a reply with nothing left after
        # the JSON block is stripped (which would show an empty message).
        reply, chart_name = _keyword_reply(user_msg)

    # The model controls this value; anything but a string is treated as "none".
    if not isinstance(chart_name, str):
        chart_name = "none"
    chart_fn = CHART_MAP.get(chart_name)
    chart_out = chart_fn() if chart_fn else None
    new_history = safe_history + [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": reply},
    ]
    return new_history, "", chart_out
def refresh_insights():
    """Rebuild the KPI summary and all seven dashboard charts.

    Returns:
        Tuple ordered as KPI, education, gender, experience, tier,
        progression, sentiment, scatter.
    """
    builders = (
        render_kpis,
        build_edu_chart,
        build_gender_chart,
        build_exp_chart,
        build_tier_chart,
        build_prog_chart,
        build_sentiment_chart,
        build_scatter_chart,
    )
    return tuple(build() for build in builders)
# =========================================================
# PIPELINE RUNNER
# =========================================================
def run_notebook_safe(env_key, default):
    """Execute a notebook with papermill and return a status message.

    The notebook name comes from environment variable *env_key* (falling back
    to *default*) and is resolved against ``BASE_DIR``. The executed copy is
    written under ``BASE_DIR / "runs"``. On success the data cache is cleared
    so freshly written CSVs are picked up. Never raises: failures, including
    a missing papermill install, are returned as text.
    """
    nb_name = os.environ.get(env_key, default).strip()
    notebook_path = BASE_DIR / nb_name
    if not notebook_path.exists():
        return f"❌ {nb_name} not found.\nUpload the notebook to your HuggingFace Space Files tab."

    try:
        import papermill as pm

        run_path = BASE_DIR / "runs" / f"run_{time.strftime('%Y%m%d-%H%M%S')}_{nb_name}"
        run_path.parent.mkdir(exist_ok=True)
        pm.execute_notebook(
            str(notebook_path),
            str(run_path),
            cwd=str(BASE_DIR),
            log_output=True,
            progress_bar=False,
            execution_timeout=1800,
        )
        _CACHE.clear()
        return f"✅ {nb_name} completed.\nCSVs: {[p.name for p in BASE_DIR.glob('*.csv')]}"
    except Exception as e:
        # Only the tail of the traceback is returned to keep the log short.
        return f"❌ FAILED: {e}\n\n{traceback.format_exc()[-1500:]}"
def run_nb1():
    """Run the data-extraction notebook (name overridable via env var ``NB1``)."""
    return run_notebook_safe("NB1", "F2_Data_Extraction_and_synthetic_enrichment.ipynb")


def run_nb2():
    """Run the analysis/prediction notebook (name overridable via env var ``NB2``)."""
    return run_notebook_safe("NB2", "F2_quantitative_and_qualitative_analysis___prediction.ipynb")


def run_both():
    """Run notebook 1 then notebook 2 and return both logs under banners."""
    rule = "=" * 44
    first_log = run_nb1()
    second_log = run_nb2()
    return f"{rule}\nNOTEBOOK 1\n{rule}\n{first_log}\n\n{rule}\nNOTEBOOK 2\n{rule}\n{second_log}"
def load_css():
    """Return the app CSS: ``style.css`` from ``BASE_DIR`` (if present) plus built-in overrides."""
    builtin_rules = (
        "\n"
        ".gr-button-primary{background:rgb(40,9,109)!important;border:none!important;}\n"
        ".tab-nav button{font-weight:700!important;font-size:14px!important;}\n"
    )
    stylesheet = BASE_DIR / "style.css"
    file_css = stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""
    return file_css + builtin_rules
# =========================================================
# UI
# =========================================================
# Builds the four-tab Gradio interface and wires each control to its callback.
# NOTE(review): the nesting of the layout containers below is reconstructed
# from the order of the `with` statements — confirm it matches the intended
# layout.
with gr.Blocks(title="F2 Salary Intelligence — ESCP") as demo:
    gr.Markdown("# F2 Salary Intelligence Platform\n*AI-powered salary prediction, analysis & insights — ESCP Big Data Project*", elem_id="escp_title")
    # ── TAB 1: SALARY PREDICTOR ─────────────────────────────
    with gr.Tab("🎯 Salary Predictor"):
        gr.Markdown("### Predict your salary and future trajectory\n*Enter your profile — our AI predicts your salary, adjusts for feedback sentiment via n8n, and forecasts your next 5 years.*")
        with gr.Row():
            # Left column: profile inputs and the predict button.
            with gr.Column(scale=1):
                gr.Markdown("#### 👤 Your Profile")
                p_name = gr.Textbox(label="Name (optional)", placeholder="e.g. Jane Doe")
                p_age = gr.Slider(18,70,value=30,step=1,label="Age")
                p_exp = gr.Slider(0,40,value=5,step=1,label="Years of Experience")
                p_edu = gr.Dropdown(["Bachelor's","Master's","PhD"],value="Bachelor's",label="Education Level",interactive=True)
                p_job = gr.Textbox(label="Job Title",value="Data Analyst")
                p_gender = gr.Dropdown(["Male","Female"],value="Male",label="Gender",interactive=True)
                gr.Markdown("#### 💬 Feedback & Sentiment")
                gr.Markdown("*Your feedback influences salary via **Automation 2 (HR Alert)***")
                p_feedback = gr.Textbox(label="Performance Feedback",placeholder='e.g. "Consistently exceeds expectations and shows strong leadership..."',lines=3,interactive=True)
                btn_pred = gr.Button("🚀 Predict My Salary", variant="primary", size="lg")
            # Right column: gauge and textual summary.
            with gr.Column(scale=1):
                gr.Markdown("#### 📊 Prediction Results")
                out_gauge = gr.Plot(label="Salary Estimate")
                out_result = gr.HTML()
        gr.Markdown("#### 📈 5-Year Salary Trajectory")
        with gr.Row():
            out_proj_chart = gr.Plot(label="Salary Projection")
            out_proj_table = gr.HTML()
        gr.Markdown("#### 🗂️ Example Profiles")
        # Example rows follow the order of `inputs` below:
        # name, age, experience, education, job title, gender, feedback.
        gr.Examples(
            examples=[
                ["",28,3,"Bachelor's","Junior Data Analyst","Female","Shows promise and responds well to coaching."],
                ["",35,10,"Master's","Senior Data Scientist","Male","Consistently exceeds expectations with strong leadership."],
                ["",45,20,"PhD","Director of Analytics","Female","Outstanding mentor who drives cross-team innovation."],
                ["",22,1,"Bachelor's","Intern","Male","Still ramping up but shows enthusiasm."],
                ["",50,25,"Master's","VP of Engineering","Male","Completely fails deadlines and shows poor leadership."],
            ],
            inputs=[p_name,p_age,p_exp,p_edu,p_job,p_gender,p_feedback],
        )
        # Input order here matches run_predictor's parameter order (which
        # differs from the Examples order above); outputs match its return
        # order (gauge, projection chart, result, table).
        btn_pred.click(run_predictor,inputs=[p_age,p_exp,p_edu,p_gender,p_job,p_name,p_feedback],outputs=[out_gauge,out_proj_chart,out_result,out_proj_table])
    # ── TAB 2: SALARY ANALYZER ──────────────────────────────
    with gr.Tab("🔍 Salary Analyzer"):
        gr.Markdown("### Is your salary fair?\n*Enter your current salary and profile — **Automation 3** detects anomalies and compares you to market benchmarks.*")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("#### 💰 Your Salary & Profile")
                a_salary = gr.Number(label="Your Current Annual Salary ($)",value=75000)
                a_age = gr.Slider(18,70,value=30,step=1,label="Age")
                a_exp = gr.Slider(0,40,value=5,step=1,label="Years of Experience")
                a_edu = gr.Dropdown(["Bachelor's","Master's","PhD"],value="Bachelor's",label="Education Level",interactive=True)
                a_gender = gr.Dropdown(["Male","Female"],value="Male",label="Gender",interactive=True)
                a_job = gr.Textbox(label="Job Title",value="Data Analyst")
                btn_ana = gr.Button("🔍 Analyze My Salary", variant="primary", size="lg")
            with gr.Column(scale=1):
                gr.Markdown("#### 📊 Analysis Results")
                out_dev = gr.Plot(label="Deviation Gauge")
                out_ana_res = gr.HTML()
        gr.Markdown("#### 📊 Market Comparison")
        out_compare = gr.Plot(label="Salary Comparison")
        # run_analyzer returns (deviation gauge, comparison chart, result).
        btn_ana.click(run_analyzer,inputs=[a_salary,a_age,a_exp,a_edu,a_gender,a_job],outputs=[out_dev,out_compare,out_ana_res])
    # ── TAB 3: INSIGHTS ─────────────────────────────────────
    with gr.Tab("📊 Insights"):
        gr.Markdown("### Data Insights & AI Analysis\n*Explore our findings — interactive charts + AI assistant*")
        # The KPI value is passed as a callable (render_kpis), not a string.
        kpi_html = gr.HTML(value=render_kpis)
        ref_btn = gr.Button("🔄 Refresh Data", variant="secondary", size="sm")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("#### 💬 Ask our AI")
                chatbot = gr.Chatbot(label="AI Analytics Assistant",height=380)
                user_input = gr.Textbox(label="Ask about the data",placeholder='e.g. "How does education affect salary?"',lines=1)
                gr.Markdown("**Quick questions:**")
                with gr.Row():
                    ex1=gr.Button("📚 Education",size="sm"); ex2=gr.Button("📈 Career tiers",size="sm"); ex3=gr.Button("⚧ Gender gap",size="sm")
                with gr.Row():
                    ex4=gr.Button("📅 Progression",size="sm"); ex5=gr.Button("💬 Sentiment",size="sm"); ex6=gr.Button("🔗 Scatter",size="sm")
            with gr.Column(scale=1):
                ai_chart = gr.Plot(label="Interactive Chart")
        gr.Markdown("#### 📊 Key Charts")
        with gr.Row():
            c_edu=gr.Plot(label="Education"); c_gender=gr.Plot(label="Gender")
        with gr.Row():
            c_exp=gr.Plot(label="Experience"); c_tier=gr.Plot(label="Career Tier")
        with gr.Row():
            c_prog=gr.Plot(label="Progression"); c_sent=gr.Plot(label="Sentiment")
        c_scatter=gr.Plot(label="Scatter")
        # Quick-question buttons only write the question into the textbox;
        # the chat request itself is wired to the textbox's submit event below.
        ex1.click(lambda: "How does education level affect salary?",outputs=[user_input])
        ex2.click(lambda: "Show career tier salary differences",outputs=[user_input])
        ex3.click(lambda: "Is there a gender pay gap?",outputs=[user_input])
        ex4.click(lambda: "Show salary progression over time",outputs=[user_input])
        ex5.click(lambda: "Do senior employees get more positive feedback?",outputs=[user_input])
        ex6.click(lambda: "Show experience vs salary scatter",outputs=[user_input])
        user_input.submit(ai_chat,inputs=[user_input,chatbot],outputs=[chatbot,user_input,ai_chart])
        def on_refresh():
            # Thin wrapper: unpacks refresh_insights() and returns the values
            # in the same order as the outputs list of ref_btn.click.
            kpi,edu,gender,exp,tier,prog,sent,scatter=refresh_insights()
            return kpi,edu,gender,exp,tier,prog,sent,scatter
        ref_btn.click(on_refresh,outputs=[kpi_html,c_edu,c_gender,c_exp,c_tier,c_prog,c_sent,c_scatter])
    # ── TAB 4: PIPELINE (minimal) ───────────────────────────
    with gr.Tab("⚙️ Pipeline"):
        gr.Markdown("### Data Pipeline\n*Regenerate synthetic data and retrain the model.*")
        with gr.Row():
            btn_nb1=gr.Button("▶ Step 1: Data Creation",variant="secondary")
            btn_nb2=gr.Button("▶ Step 2: Analysis & Training",variant="secondary")
            btn_all=gr.Button("⚡ Run Full Pipeline",variant="primary")
        pipe_log=gr.Textbox(label="Log",lines=15,interactive=False)
        # Each runner returns its status text, which is shown in the log box.
        btn_nb1.click(run_nb1,outputs=[pipe_log])
        btn_nb2.click(run_nb2,outputs=[pipe_log])
        btn_all.click(run_both,outputs=[pipe_log])
# Start the app at import time, passing the combined CSS and allowing file
# access under BASE_DIR.
demo.launch(css=load_css(), allowed_paths=[str(BASE_DIR)])