import json
import os
import pickle
import random
import re
import time
import traceback
from pathlib import Path
from typing import Tuple

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests as req

try:
    from huggingface_hub import InferenceClient
except Exception:
    InferenceClient = None

# =========================================================
# CONFIG
# =========================================================
BASE_DIR = Path(__file__).resolve().parent
HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()
N8N_SALARY_PREDICTION_URL = os.environ.get("N8N_SALARY_PREDICTION_URL", "").strip()
N8N_HR_FEEDBACK_URL = os.environ.get("N8N_HR_FEEDBACK_URL", "").strip()
N8N_ANOMALY_DETECTION_URL = os.environ.get("N8N_ANOMALY_DETECTION_URL", "").strip()
# The LLM path is only enabled when both an API key and the client library exist.
LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
llm_client = (InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY) if LLM_ENABLED else None)
# Shared chart palette.
P = ["#45FFCA", "#D09CFA", "#FF9B9B", "#F875AA", "#3EDBF0", "#F2C637", "#7c5cbf", "#2ec4a0"]
GOLD = "#F2C637"
PURPLE = "#28096d"

# =========================================================
# DATA
# =========================================================
# Memoised DataFrames keyed by "base" / "prog" / "feed".
_CACHE = {}


def _load_csv(f):
    """Read ``BASE_DIR / f`` as CSV.

    Returns an empty DataFrame when the file is missing or cannot be read, so
    callers can fall back to another source by testing ``df.empty``.
    """
    path = BASE_DIR / f
    if not path.exists():
        return pd.DataFrame()
    try:
        return pd.read_csv(path)
    except (OSError, ValueError):
        # pandas' ParserError/EmptyDataError and UnicodeDecodeError are all
        # ValueError subclasses; loading stays best-effort without also
        # swallowing KeyboardInterrupt/SystemExit like a bare ``except`` would.
        return pd.DataFrame()


def _derive_tiers(df):
    """Return one career tier per row of *df* using :func:`get_tier`.

    Missing "Education Level" / "Years of Experience" columns fall back to
    ``""`` and ``0`` respectively, which yields the "junior" tier.
    """
    n = len(df)
    edu = df["Education Level"] if "Education Level" in df.columns else [""] * n
    exp = df["Years of Experience"] if "Years of Experience" in df.columns else [0] * n
    return [get_tier(None, e, d) for e, d in zip(exp, edu)]


def _gen_base(n=500):
    """Generate *n* synthetic employee rows (seeded, so reproducible).

    Columns: Age, Gender, Education Level, Job Title, Years of Experience,
    Salary, career_tier, salary_growth.
    """
    rng = np.random.default_rng(42)
    edu_levels = ["Bachelor's", "Master's", "PhD"]
    job_pool = {
        "Bachelor's": ["Data Analyst", "Junior Engineer", "Business Analyst", "HR Specialist", "Marketing Manager"],
        "Master's": ["Data Scientist", "Senior Engineer", "Product Manager", "Financial Analyst", "ML Engineer"],
        "PhD": ["Director of Analytics", "Research Scientist", "VP of Engineering", "Chief Data Officer", "AI Researcher"],
    }
    base_by_edu = {"Bachelor's": 52000, "Master's": 68000, "PhD": 88000}
    rows = []
    for _ in range(n):
        edu = str(rng.choice(edu_levels, p=[0.55, 0.35, 0.10]))
        exp = int(rng.integers(0, 35))
        age = int(np.clip(22 + exp + rng.integers(-2, 5), 22, 65))
        gender = str(rng.choice(["Male", "Female"], p=[0.52, 0.48]))
        job = str(rng.choice(job_pool[edu]))
        sal = base_by_edu[edu] + exp * 3100 + (age - 22) * 180 + float(rng.normal(0, 6000))
        if gender == "Male":
            sal *= float(rng.uniform(1.0, 1.06))
        sal = max(25000, int(sal))
        rows.append({"Age": age, "Gender": gender, "Education Level": edu, "Job Title": job,
                     "Years of Experience": exp, "Salary": sal})
    # Explicit columns keep the frame well-formed even when n == 0.
    df = pd.DataFrame(rows, columns=["Age", "Gender", "Education Level", "Job Title",
                                     "Years of Experience", "Salary"])
    df["career_tier"] = _derive_tiers(df)
    growth_share = df["career_tier"].map({"senior": 0.15, "mid": 0.30, "junior": 0.50})
    # Noise comes from the seeded generator (the global np.random state used
    # before made the "seeded" dataset differ on every start).
    noise = rng.normal(0, 2000, size=len(df))
    df["salary_growth"] = (df["Salary"] * growth_share + noise).astype(int)
    return df


def _gen_progression(base_df):
    """Build a synthetic 2020-2024 salary history for every row of *base_df*.

    Each employee ramps linearly from a tier-dependent fraction of the current
    salary (0.85 senior / 0.70 mid / 0.50 otherwise) up to 1.0, with small
    random noise and a 20000 floor. Rows without a usable salary are skipped.
    """
    columns = ["career_tier", "year", "salary_that_year"]
    if "Salary" not in base_df.columns or "career_tier" not in base_df.columns:
        return pd.DataFrame(columns=columns)
    rows = []
    for _, row in base_df.iterrows():
        t = row["career_tier"]
        base = row["Salary"]
        if pd.isna(base):
            continue  # int(NaN) would raise; CSV data may contain gaps
        start = 0.85 if t == "senior" else 0.70 if t == "mid" else 0.50
        growth = np.linspace(start, 1.0, 5)
        for i, yr in enumerate(range(2020, 2025)):
            sal = int(np.clip(base * (growth[i] + np.random.normal(0, 0.03)), 20000, None))
            rows.append({"career_tier": t, "year": yr, "salary_that_year": sal})
    return pd.DataFrame(rows, columns=columns)


def _gen_feedback(base_df):
    """Attach two randomly sampled canned feedback comments to every employee.

    Rows whose ``career_tier`` is not junior/mid/senior are skipped instead of
    raising ``KeyError``.
    """
    pools = {
        "senior": ["Consistently exceeds expectations.", "Key leader and mentor.",
                   "Delivers high-impact results.", "Trusted advisor to management."],
        "mid": ["Solid contributor, meets targets.", "Shows initiative and grows.",
                "Dependable team member.", "Takes ownership of projects."],
        "junior": ["Eager learner, developing skills.", "Shows promise under coaching.",
                   "Making good progress.", "Improving steadily with support."],
    }
    columns = ["career_tier", "feedback_comment", "Salary"]
    if "Salary" not in base_df.columns or "career_tier" not in base_df.columns:
        return pd.DataFrame(columns=columns)
    rows = []
    for _, row in base_df.iterrows():
        pool = pools.get(row["career_tier"])
        if pool is None:
            continue
        for comment in random.sample(pool, 2):
            rows.append({"career_tier": row["career_tier"], "feedback_comment": comment,
                         "Salary": row["Salary"]})
    return pd.DataFrame(rows, columns=columns)


def get_base_df():
    """Return the cached employee table.

    Source priority: ``employee_analysis_ready.csv``, then ``Salary_Data.csv``,
    then the synthetic generator. A ``career_tier`` column is derived when the
    loaded data lacks one.
    """
    if "base" not in _CACHE:
        df = _load_csv("employee_analysis_ready.csv")
        if df.empty:
            df = _load_csv("Salary_Data.csv")
        if df.empty:
            df = _gen_base()
        if "career_tier" not in df.columns:
            df["career_tier"] = _derive_tiers(df)
        _CACHE["base"] = df
    return _CACHE["base"]


def get_prog_df():
    """Return the cached salary-progression table (CSV if present, else synthetic)."""
    if "prog" not in _CACHE:
        df = _load_csv("synthetic_salary_progression.csv")
        if df.empty:
            df = _gen_progression(get_base_df())
        _CACHE["prog"] = df
    return _CACHE["prog"]


def get_feed_df():
    """Return the cached feedback table (CSV if present, else synthetic)."""
    if "feed" not in _CACHE:
        df = _load_csv("synthetic_employee_feedback.csv")
        if df.empty:
            df = _gen_feedback(get_base_df())
        _CACHE["feed"] = df
    return _CACHE["feed"]


# =========================================================
# HELPERS
# =========================================================
def get_tier(age, exp, edu):
    """Classify a profile as "senior", "mid" or "junior".

    *age* is accepted for interface symmetry but does not influence the result;
    unknown education values are treated like a Bachelor's degree.
    """
    s = {"Bachelor's": 1, "Master's": 2, "PhD": 3}.get(edu, 1)
    if exp >= 15 and s >= 2:
        return "senior"
    if exp >= 7 or s == 3:
        return "mid"
    return "junior"


def get_exp_group(exp):
    """Map years of experience to its bucket label."""
    if exp <= 5:
        return "0-5 years"
    if exp <= 10:
        return "6-10 years"
    if exp <= 15:
        return "11-15 years"
    if exp <= 20:
        return "16-20 years"
    return "20+"


def _layout(**kw):
    """Return the shared Plotly layout with *kw* applied on top.

    Dict-valued overrides are merged one level deep into dict-valued defaults,
    so ``title=dict(text=...)`` keeps the default title font instead of
    replacing the whole ``title`` entry.
    """
    layout = dict(
        template="plotly_white",
        paper_bgcolor="rgba(255,255,255,0.97)",
        plot_bgcolor="rgba(255,255,255,0.99)",
        font=dict(family="system-ui,sans-serif", color="#1a0a3d", size=12),
        margin=dict(l=50, r=20, t=55, b=45),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
                    bgcolor="rgba(255,255,255,0.9)", bordercolor="rgba(40,9,109,0.15)",
                    borderwidth=1),
        title=dict(font=dict(size=14, color="#28096d")),
    )
    for key, value in kw.items():
        if isinstance(value, dict) and isinstance(layout.get(key), dict):
            layout[key] = {**layout[key], **value}
        else:
            layout[key] = value
    return layout


def _n8n(url, payload):
    """POST *payload* as JSON to an n8n webhook.

    Returns ``None`` when *url* is empty (automation not configured), the
    decoded response dict on success, or ``{"error": "<message>"}`` on any
    failure (network error, non-2xx status, invalid or non-object JSON).
    """
    if not url:
        return None
    try:
        r = req.post(url, json=payload, timeout=15)
        r.raise_for_status()  # an HTTP error page must not pass as a result
        data = r.json()
    except Exception as e:
        return {"error": str(e)}
    # NOTE(review): presumably some workflows answer with a list of items;
    # use the first item so callers can keep relying on dict access.
    if isinstance(data, list) and data and isinstance(data[0], dict):
        data = data[0]
    if not isinstance(data, dict):
        return {"error": f"Unexpected response type: {type(data).__name__}"}
    return data


# =========================================================
# PREDICTION MODEL
# =========================================================
def _load_model():
    """Load the pickled model and its training column list.

    Looks for ``rf_model.pkl`` with a sibling ``train_columns.pkl`` in
    ``BASE_DIR`` and then ``BASE_DIR/model``. Returns ``(model, columns)`` or
    ``(None, None)`` when no loadable pair exists.
    """
    for model_path in (BASE_DIR / "rf_model.pkl", BASE_DIR / "model" / "rf_model.pkl"):
        cols_path = model_path.parent / "train_columns.pkl"
        if not (model_path.exists() and cols_path.exists()):
            continue
        try:
            # SECURITY: pickle.load runs arbitrary code from the file; only
            # ship model files produced by this project's own pipeline.
            with open(model_path, "rb") as fh:
                model = pickle.load(fh)
            with open(cols_path, "rb") as fh:
                columns = pickle.load(fh)
        except Exception:
            continue  # unreadable/incompatible pickle: try the next location
        return model, columns
    return None, None


RF_MODEL, TRAIN_COLS = _load_model()


def _to_float(value, default):
    """Return ``float(value)``, or *default* when it is not a finite-looking number."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return number if number == number else default  # NaN != NaN


def _n8n_ok(result):
    """True when *result* is a non-empty webhook dict without an "error" key."""
    return isinstance(result, dict) and bool(result) and "error" not in result


def predict_local(age, exp, edu, gender):
    """Predict a salary locally.

    Uses the pickled model when available, otherwise a linear formula. A
    random +/-1% jitter is applied to the estimate. Returns a dict with keys
    ``predicted``, ``low``, ``high``, ``tier``, ``exp_group`` and ``source``.
    *gender* is accepted but not used as a feature here.
    """
    tier = get_tier(age, exp, edu)
    exp_group = get_exp_group(exp)
    if RF_MODEL is not None and TRAIN_COLS is not None and len(TRAIN_COLS) > 0:
        features = {
            "Age": age,
            "Years of Experience": exp,
            "Education Level_Master's": 1 if edu == "Master's" else 0,
            "Education Level_PhD": 1 if edu == "PhD" else 0,
            "salary_growth": exp * 3500,
            "vader_score": 0.3 if tier == "senior" else 0.15 if tier == "mid" else 0.05,
            # One-hot indicators are set by hand: get_dummies(drop_first=True)
            # on a single row drops the only level, so these features were
            # always 0 before.
            f"Experience Group_{exp_group}": 1,
            f"career_tier_{tier}": 1,
        }
        # reindex aligns with the training schema: unknown columns become 0
        # and indicators for the reference level dropped in training vanish.
        inp = pd.DataFrame([features]).reindex(columns=list(TRAIN_COLS), fill_value=0)
        predicted = float(RF_MODEL.predict(inp)[0])
        source = "Random Forest Model"
    else:
        base = {"Bachelor's": 55000, "Master's": 72000, "PhD": 92000}.get(edu, 55000)
        predicted = base + (exp * 3200) + ((age - 22) * 250)
        predicted = max(28000, predicted)
        source = "Formula Estimate"
    predicted += predicted * random.uniform(-0.01, 0.01)
    return {"predicted": round(predicted), "low": round(predicted * 0.92),
            "high": round(predicted * 1.08), "tier": tier, "exp_group": exp_group,
            "source": source}


def project_salary(base_salary, exp, edu, tier, years=5):
    """Project salary for the next *years* years.

    Every year applies a tier-dependent growth rate; every third year adds a
    promotion bump. Returns a list of dicts with ``year`` (2025 + offset),
    ``salary``, ``tier`` and ``promotion``. *edu* is currently unused.
    """
    growth_by_tier = {"senior": 0.04, "mid": 0.06, "junior": 0.09}
    bump_by_tier = {"junior": 0.12, "mid": 0.10, "senior": 0.07}
    projections = []
    current = base_salary
    current_exp = exp
    for yr in range(1, years + 1):
        current_exp += 1
        growth_rate = growth_by_tier.get(tier, 0.05)
        promotion_year = yr % 3 == 0
        # Rates are looked up before the tier changes, so the bump of a
        # promotion year is the one of the tier being left.
        promotion_bump = bump_by_tier.get(tier, 0.08) if promotion_year else 0
        if promotion_year and tier == "junior" and current_exp >= 7:
            tier = "mid"
        elif tier == "mid" and current_exp >= 15:
            tier = "senior"
        current = current * (1 + growth_rate + promotion_bump)
        projections.append({
            "year": 2025 + yr,
            "salary": round(current),
            "tier": tier,
            "promotion": promotion_bump > 0,
        })
    return projections


# =========================================================
# TAB 1: SALARY PREDICTOR
# =========================================================
def _salary_gauge(predicted, high):
    """Build the gauge figure for the predicted salary (threshold at *high*)."""
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=predicted,
        number={"prefix": "$", "valueformat": ",.0f", "font": {"size": 32, "color": PURPLE}},
        gauge={
            "axis": {"range": [25000, 250000], "tickprefix": "$", "tickformat": ",.0f",
                     "tickfont": {"size": 9}},
            "bar": {"color": GOLD, "thickness": 0.3},
            "bgcolor": "white",
            "borderwidth": 0,
            "steps": [
                {"range": [25000, 80000], "color": "#e8f5f0"},
                {"range": [80000, 140000], "color": "#d4ecff"},
                {"range": [140000, 250000], "color": "#ede8ff"},
            ],
            "threshold": {"line": {"color": "#FF9B9B", "width": 3}, "thickness": 0.8,
                          "value": high},
        },
        title={"text": "Predicted Annual Salary", "font": {"size": 13, "color": PURPLE}},
    ))
    fig.update_layout(height=280, paper_bgcolor="rgba(255,255,255,0.97)",
                      margin=dict(l=30, r=30, t=60, b=10))
    return fig


def _projection_chart(predicted, projections):
    """Build the 5-year trajectory line with a star marker on promotion years."""
    years = [p["year"] for p in projections]
    salaries = [p["salary"] for p in projections]
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=[2025] + years, y=[predicted] + salaries,
        mode="lines", name="Salary Trajectory",
        line=dict(color="#45FFCA", width=3),
        fill="tozeroy", fillcolor="rgba(69,255,202,0.1)",
    ))
    for p in projections:
        if not p["promotion"]:
            continue
        yr, sal = p["year"], p["salary"]
        fig.add_trace(go.Scatter(
            x=[yr], y=[sal], mode="markers+text",
            marker=dict(color="#F2C637", size=14, symbol="star"),
            text=["Promotion!"], textposition="top center",
            textfont=dict(size=10, color="#F2C637"),
            showlegend=False,
            hovertemplate=f"Year: {yr}\nSalary: ${sal:,.0f}\n⭐ Promotion",
        ))
    fig.add_annotation(x=2025, y=predicted, text=f"Now: ${predicted:,.0f}",
                       showarrow=True, arrowhead=2, font=dict(size=10, color=PURPLE))
    fig.update_layout(**_layout(height=300, showlegend=False,
                                title=dict(text="📈 5-Year Salary Projection"),
                                xaxis=dict(tickformat="d"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig


def run_predictor(age, exp, edu, gender, job_title, name, feedback):
    """Handle the "Predict My Salary" button.

    Combines the local prediction with the optional n8n salary and HR-feedback
    automations, then returns ``(gauge_figure, projection_figure, result_html,
    projection_table_html)``. Malformed webhook values fall back to the local
    estimate instead of raising.
    """
    res = predict_local(int(age), int(exp), edu, gender)
    predicted = res["predicted"]
    low, high = res["low"], res["high"]
    tier, exp_group, source = res["tier"], res["exp_group"], res["source"]

    # Call Automation 1 (n8n salary prediction)
    auto1_result = _n8n(N8N_SALARY_PREDICTION_URL, {
        "age": int(age), "years_of_experience": int(exp),
        "education_level": edu, "gender": gender, "job_title": job_title,
    })
    if isinstance(auto1_result, dict) and "predicted_salary" in auto1_result:
        remote = _to_float(auto1_result["predicted_salary"], None)
        if remote is not None:  # ignore null / non-numeric payloads
            predicted = round(remote)
            low = round(predicted * 0.92)
            high = round(predicted * 1.08)

    # Call Automation 2 (HR feedback)
    auto2_result = _n8n(N8N_HR_FEEDBACK_URL, {
        "name": name or "Employee", "career_tier": tier, "salary": predicted,
        "years_of_experience": int(exp), "education_level": edu,
        "feedback": feedback or "No feedback provided",
    })
    sentiment_adjustment = 0
    sentiment_label = "Neutral"
    sentiment_score = 0.5
    auto2_ok = _n8n_ok(auto2_result)
    if auto2_ok:
        sentiment_adjustment = _to_float(auto2_result.get("salary_adjustment", 0), 0)
        sentiment_score = _to_float(auto2_result.get("sentiment_score", 0.5), 0.5)
        alert = str(auto2_result.get("alert") or "").upper()
        if "POSITIVE" in alert:
            sentiment_label = "Positive 😊"
        elif "NEGATIVE" in alert:
            sentiment_label = "Negative 😟"
        else:
            sentiment_label = "Neutral 😐"
        if sentiment_adjustment:
            predicted = round(predicted * (1 + sentiment_adjustment / 100))
            low = round(predicted * 0.92)
            high = round(predicted * 1.08)

    # Salary projection
    projections = project_salary(predicted, int(exp), edu, tier)

    fig_gauge = _salary_gauge(predicted, high)
    fig_proj = _projection_chart(predicted, projections)

    # Build automation status HTML
    tier_emoji = {"junior": "🌱", "mid": "📈", "senior": "⭐"}.get(tier, "")
    # NOTE(review): sent_color is not referenced by the template below; the
    # template looks like it lost its markup, so the value is kept — confirm.
    sent_color = "#45FFCA" if "Positive" in sentiment_label else "#FF9B9B" if "Negative" in sentiment_label else "#D3D1C7"
    adj_text = f"{sentiment_adjustment:+.1f}%" if sentiment_adjustment else "0%"
    if _n8n_ok(auto1_result):
        auto1_status = "✅ Connected"
    elif auto1_result:
        error_text = auto1_result.get("error", "Error") if isinstance(auto1_result, dict) else "Error"
        auto1_status = "⚠️ " + str(error_text)[:30]
    else:
        auto1_status = "⚙️ Not configured"
    if auto2_ok:
        auto2_status = "✅ " + sentiment_label
    else:
        auto2_status = "⚠️ Error" if auto2_result else "⚙️ Not configured"

    result_html = f"""
Career Tier
{tier_emoji} {tier.title()}
Experience
{exp_group}
Range
${low:,.0f} – ${high:,.0f}
Model
{source}
💬 Feedback Sentiment Impact
{sentiment_label} Salary adj: {adj_text} Score: {sentiment_score:.2f}
🔗 n8n Automation Status
AUTOMATION 1 — Salary Prediction
{auto1_status}
AUTOMATION 2 — HR Feedback Alert
{auto2_status}
"""

    # 5-year table
    proj_rows = "".join([
        f''
        f'{p["year"]}'
        f'${p["salary"]:,.0f}'
        f'{p["tier"].title()}'
        f'{"⭐ Promotion!" if p["promotion"] else "—"}'
        f''
        for p in projections
    ])
    proj_table = f"""
📅 5-Year Salary Forecast
{proj_rows}
Year Projected Salary Tier Event
"""
    return fig_gauge, fig_proj, result_html, proj_table


# =========================================================
# TAB 2: SALARY ANALYZER
# =========================================================
def run_analyzer(salary, age, exp, edu, gender, job_title):
    """Handle the "Analyze My Salary" button.

    Compares *salary* with an expected market value, preferring the n8n
    anomaly automation and falling back to a local formula. Returns
    ``(deviation_gauge, comparison_bar_chart, result_html)``. *gender* and
    *job_title* are accepted but not used in the calculation.
    """
    salary = float(salary or 0)  # an emptied number field arrives as None

    # Call Automation 3
    auto3_result = _n8n(N8N_ANOMALY_DETECTION_URL, {
        "salary": salary, "age": int(age),
        "years_of_experience": int(exp), "education_level": edu,
    })

    # Local anomaly calculation as fallback
    base_expected = {"Bachelor's": 55000, "Master's": 72000, "PhD": 92000}.get(edu, 55000)
    expected = base_expected + (int(exp) * 3200) + ((int(age) - 22) * 250)
    deviation = ((salary - expected) / expected) * 100

    if _n8n_ok(auto3_result):
        flag = str(auto3_result.get("flag", "NORMAL RANGE"))
        severity = auto3_result.get("severity", "normal")
        # Numeric fields are coerced so a string/null payload cannot break
        # the number formatting below.
        dev = _to_float(auto3_result.get("deviation_pct"), round(deviation, 1))
        expected_sal = _to_float(auto3_result.get("expected_salary"), round(expected))
        auto3_status = "✅ Connected"
    else:
        dev = round(deviation, 1)
        expected_sal = round(expected)
        if dev > 40:
            flag, severity = "OVERPAID", "high"
        elif dev > 20:
            flag, severity = "SLIGHTLY OVERPAID", "medium"
        elif dev < -40:
            flag, severity = "UNDERPAID", "high"
        elif dev < -20:
            flag, severity = "SLIGHTLY UNDERPAID", "medium"
        else:
            flag, severity = "NORMAL RANGE", "normal"
        auto3_status = "⚙️ Using local calculation" if not auto3_result else "⚠️ Error"

    # Color theme based on result
    sev_color = {"high": "#FF9B9B", "medium": "#F2C637", "normal": "#45FFCA"}.get(severity, "#45FFCA")
    sev_icon = {"high": "🚨", "medium": "⚠️", "normal": "✅"}.get(severity, "✅")

    # Comparison chart
    fig_compare = go.Figure()
    fig_compare.add_trace(go.Bar(
        x=["Your Salary", "Market Expected"],
        y=[salary, expected_sal],
        marker_color=[sev_color, "#45FFCA"],
        text=[f"${salary:,.0f}", f"${expected_sal:,.0f}"],
        textposition="outside",
        hovertemplate="%{x}: $%{y:,.0f}",
    ))
    fig_compare.update_layout(**_layout(height=320, showlegend=False,
                                        title=dict(text="Your Salary vs Market Expected"),
                                        yaxis=dict(tickprefix="$", tickformat=",.0f")))

    # Deviation gauge
    fig_dev = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=dev,
        number={"suffix": "%", "valueformat": ".1f", "font": {"size": 28, "color": PURPLE}},
        delta={"reference": 0, "valueformat": ".1f", "suffix": "%"},
        gauge={
            "axis": {"range": [-60, 60], "ticksuffix": "%"},
            "bar": {"color": sev_color, "thickness": 0.3},
            "bgcolor": "white",
            "borderwidth": 0,
            "steps": [
                {"range": [-60, -20], "color": "rgba(255,155,155,0.3)"},
                {"range": [-20, 20], "color": "rgba(69,255,202,0.2)"},
                {"range": [20, 60], "color": "rgba(208,156,250,0.3)"},
            ],
            "threshold": {"line": {"color": "red", "width": 3}, "thickness": 0.8,
                          "value": 40 if dev > 0 else -40},
        },
        title={"text": "Salary Deviation from Market", "font": {"size": 13, "color": PURPLE}},
    ))
    fig_dev.update_layout(height=280, paper_bgcolor="rgba(255,255,255,0.97)",
                          margin=dict(l=30, r=30, t=60, b=10))

    # Percentile estimation
    df = get_base_df()
    if "Salary" in df.columns and len(df) > 0:
        percentile = round((df["Salary"] < salary).mean() * 100)
    else:
        percentile = 50  # no reference data: assume the median

    if dev > 30:
        recommendation = 'Your salary is significantly above market. Ensure your performance justifies this compensation.'
    elif dev > 10:
        recommendation = 'Your salary is slightly above market — you are in a strong position.'
    elif abs(dev) <= 10:
        recommendation = 'Your salary is within market norms. You are fairly compensated.'
    elif dev > -30:
        recommendation = 'Your salary is slightly below market. Consider negotiating a raise.'
    else:
        recommendation = 'Your salary is significantly below market. You may be underpaid — seek negotiation or new opportunities.'

    # Build result HTML
    result_html = f"""
{sev_icon}
{flag}
Salary anomaly detection result
Your Salary
${salary:,.0f}
Market Expected
${expected_sal:,.0f}
Deviation
{dev:+.1f}%
Percentile
Top {100-percentile}%
💡 Recommendation
{recommendation}
🔗 AUTOMATION 3 — Anomaly Detection
{auto3_status}
"""
    return fig_dev, fig_compare, result_html


# =========================================================
# TAB 3: INSIGHTS (Dashboard + AI)
# =========================================================
def _mean_salary_bar(x, y, colors, title):
    """Build the shared "average salary per category" bar chart."""
    fig = go.Figure(go.Bar(
        x=x, y=y, marker_color=colors,
        text=[f"${v:,.0f}" for v in y],
        textposition="outside",
        hovertemplate="%{x}\n$%{y:,.0f}",
    ))
    fig.update_layout(**_layout(height=320, showlegend=False, title=dict(text=title),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig


def _mean_salary_by(df, column, order=None):
    """Mean salary per value of *column*, optionally sorted by *order*."""
    grp = df.groupby(column)["Salary"].mean().reset_index()
    if order is not None:
        grp[column] = pd.Categorical(grp[column], categories=order, ordered=True)
        grp = grp.sort_values(column)
    return grp


def _has_columns(df, *columns):
    """True when every name in *columns* is a column of *df*."""
    return all(c in df.columns for c in columns)


def build_edu_chart():
    """Average salary by education level (empty figure when data is missing)."""
    df = get_base_df()
    if not _has_columns(df, "Education Level", "Salary"):
        return go.Figure()
    grp = _mean_salary_by(df, "Education Level", ["Bachelor's", "Master's", "PhD"])
    return _mean_salary_bar(grp["Education Level"], grp["Salary"], P[:3], "Avg Salary by Education")


def build_exp_chart():
    """Average salary by experience bucket (empty figure when data is missing)."""
    df = get_base_df().copy()  # copy: a helper column is added below
    if not _has_columns(df, "Years of Experience", "Salary"):
        return go.Figure()
    df["eg"] = df["Years of Experience"].apply(get_exp_group)
    grp = _mean_salary_by(df, "eg", ["0-5 years", "6-10 years", "11-15 years", "16-20 years", "20+"])
    return _mean_salary_bar(grp["eg"], grp["Salary"], P[:5], "Avg Salary by Experience")


def build_gender_chart():
    """Average salary by gender (empty figure when data is missing)."""
    df = get_base_df()
    if not _has_columns(df, "Gender", "Salary"):
        return go.Figure()
    grp = _mean_salary_by(df, "Gender")
    return _mean_salary_bar(grp["Gender"], grp["Salary"], [P[0], P[2]], "Avg Salary by Gender")


def build_tier_chart():
    """Average salary by career tier (empty figure when data is missing)."""
    df = get_base_df()
    if not _has_columns(df, "career_tier", "Salary"):
        return go.Figure()
    grp = _mean_salary_by(df, "career_tier", ["junior", "mid", "senior"])
    return _mean_salary_bar(grp["career_tier"].str.title(), grp["Salary"],
                            [P[0], P[1], P[2]], "Avg Salary by Career Tier")


def build_prog_chart():
    """Mean yearly salary per career tier as one line per tier."""
    df = get_prog_df()
    if df.empty or not _has_columns(df, "year", "career_tier", "salary_that_year"):
        return go.Figure()
    avg = df.groupby(["year", "career_tier"])["salary_that_year"].mean().reset_index()
    fig = go.Figure()
    for i, t in enumerate(["junior", "mid", "senior"]):
        sub = avg[avg["career_tier"] == t]
        fig.add_trace(go.Scatter(
            x=sub["year"], y=sub["salary_that_year"], name=t.title(),
            mode="lines+markers", line=dict(color=P[i], width=2.5), marker=dict(size=7),
            hovertemplate=f"{t.title()}\n%{{x}}: $%{{y:,.0f}}",
        ))
    fig.update_layout(**_layout(height=340,
                                title=dict(text="Salary Progression by Career Tier (2020–2024)"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f"),
                                xaxis=dict(tickformat="d")))
    return fig


def build_sentiment_chart():
    """Stacked percentage bars of feedback sentiment per career tier.

    Scores comments with VADER when that package is importable; otherwise
    falls back to a fixed tier-to-sentiment mapping.
    """
    df = get_feed_df().copy()
    if df.empty:
        return go.Figure()
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        ana = SentimentIntensityAnalyzer()
        df["score"] = df["feedback_comment"].apply(lambda x: ana.polarity_scores(str(x))["compound"])
        df["sentiment"] = df["score"].apply(
            lambda s: "positive" if s >= 0.05 else ("negative" if s <= -0.05 else "neutral"))
    except Exception:
        # VADER missing or comments unusable: approximate from the tier.
        df["sentiment"] = df["career_tier"].map(
            {"senior": "positive", "mid": "positive", "junior": "neutral"}).fillna("neutral")
    labels = ["negative", "neutral", "positive"]
    counts = df.groupby(["career_tier", "sentiment"]).size().unstack(fill_value=0).reset_index()
    for c in labels:
        if c not in counts.columns:
            counts[c] = 0
    tot = counts[labels].sum(axis=1)
    for c in labels:
        counts[c] = (counts[c] / tot * 100).round(1)
    counts["career_tier"] = pd.Categorical(counts["career_tier"],
                                           categories=["junior", "mid", "senior"], ordered=True)
    counts = counts.sort_values("career_tier")
    colors = {"negative": "#FF9B9B", "neutral": "#D3D1C7", "positive": "#45FFCA"}
    fig = go.Figure()
    for s in labels:
        fig.add_trace(go.Bar(
            x=counts["career_tier"].str.title(), y=counts[s], name=s.title(),
            marker_color=colors[s],
            text=counts[s].apply(lambda v: f"{v:.0f}%"),
            textposition="inside",
            hovertemplate=f"{s.title()}: %{{y:.1f}}%",
        ))
    fig.update_layout(**_layout(height=320, barmode="stack",
                                title=dict(text="Feedback Sentiment by Career Tier (%)"),
                                yaxis=dict(ticksuffix="%", range=[0, 100])))
    return fig


def build_scatter_chart():
    """Experience vs. salary scatter, one trace per career tier."""
    df = get_base_df()
    if not _has_columns(df, "Years of Experience", "Salary", "career_tier"):
        return go.Figure()
    tc = {"junior": P[0], "mid": P[1], "senior": P[2]}
    fig = go.Figure()
    for t in ["junior", "mid", "senior"]:
        sub = df[df["career_tier"] == t]
        fig.add_trace(go.Scatter(
            x=sub["Years of Experience"], y=sub["Salary"], mode="markers", name=t.title(),
            marker=dict(color=tc[t], size=5, opacity=0.65),
            hovertemplate="Exp: %{x}y\n$%{y:,.0f}",
        ))
    fig.update_layout(**_layout(height=320,
                                title=dict(text="Experience vs Salary by Career Tier"),
                                xaxis=dict(title="Years of Experience"),
                                yaxis=dict(tickprefix="$", tickformat=",.0f")))
    return fig


# Chart names the assistant may request -> builder function.
CHART_MAP = {
    "salary_education": build_edu_chart,
    "salary_experience": build_exp_chart,
    "gender_salary": build_gender_chart,
    "career_tier": build_tier_chart,
    "salary_progression": build_prog_chart,
    "sentiment": build_sentiment_chart,
    "scatter": build_scatter_chart,
}


def render_kpis():
    """Render the KPI strip (employee count, average salary, average experience)."""
    df = get_base_df()
    n = len(df)
    avg = df["Salary"].mean() if "Salary" in df.columns else 0
    avg_exp = df["Years of Experience"].mean() if "Years of Experience" in df.columns else 0

    def card(icon, label, value, color):
        # NOTE(review): `color` is not used by the visible template; the
        # template looks like it lost its markup in extraction — confirm.
        return f"\n\n{icon}\n\n{label}\n\n{value}\n"

    html = ("\n"
            + card("👥", "Employees", f"{n:,}", GOLD)
            + card("💰", "Avg Salary", f"${avg:,.0f}", "#45FFCA")
            + card("📅", "Avg Experience", f"{avg_exp:.1f} yrs", "#D09CFA")
            + card("🎯", "Career Tiers", "3", "#FF9B9B")
            + "\n")
    return html


AI_SYSTEM = """You are an AI HR analytics assistant for the F2 Salary Prediction project at ESCP Business School. Dataset: Age, Gender, Education Level (Bachelor's/Master's/PhD), Job Title, Years of Experience, Salary, career_tier (junior/mid/senior), synthetic salary progression 2020-2024, VADER sentiment. Key findings: PhD earns ~35% more than Bachelor's. Senior tier = highest stable salaries. Junior grows fastest. Gender gap ~4%. RF model ~92-95% accuracy. Answer in 2-4 sentences with specific numbers. End with: ```json {"chart": ""}``` Chart options: salary_education | salary_experience | gender_salary | career_tier | salary_progression | sentiment | scatter | none"""


def _keyword_reply(msg):
    """Offline fallback for the assistant: return ``(reply_text, chart_name)``.

    Matching is by substring on the lower-cased message. Sentiment words and
    an explicit "scatter" are tested first, because such questions usually
    also contain generic words like "senior" or "experience" that would
    otherwise select the wrong chart.
    """
    m = msg.lower()

    def mentions(*words):
        return any(w in m for w in words)

    sentiment = ("Senior employees receive 90%+ positive feedback. There is a moderate positive correlation between sentiment and salary.", "sentiment")
    scatter = ("Experience correlates strongly with salary across all tiers. The scatter shows clear tier separation.", "scatter")

    if mentions("sentiment", "feedback", "comment", "review"):
        return sentiment
    if mentions("scatter"):
        return scatter
    if mentions("education", "degree", "bachelor", "master", "phd"):
        return ("PhD holders earn ~35% more than Bachelor's graduates. Education is one of the top salary predictors.", "salary_education")
    if mentions("gender", "male", "female", "gap"):
        return ("There is a ~4% gender pay gap, with male employees earning slightly more on average.", "gender_salary")
    if mentions("experience", "years", "exp") and "prog" not in m:
        return ("Employees with 20+ years earn nearly 3× those with 0-5 years. The steepest growth is in the first 10 years.", "salary_experience")
    if mentions("tier", "career", "junior", "mid", "senior"):
        return ("Senior employees earn the most and have stable salaries. Junior employees show the fastest growth rate.", "career_tier")
    if mentions("progress", "growth", "time", "2020", "trend"):
        return ("Junior employees grow from 50% to 100% of base salary over 5 years. Seniors maintain high stable salaries.", "salary_progression")
    if mentions("correlation", "relationship"):
        return scatter
    return ("Ask me about: education vs salary, experience trends, gender pay gap, career tier distributions, salary progression, or feedback sentiment.", "none")


def _split_llm_reply(raw):
    """Split an LLM answer into ``(visible_text, chart_name)``.

    The chart name comes from the trailing fenced JSON object; a missing or
    malformed object yields ``"none"`` without discarding the text.
    """
    raw = raw or ""
    chart = "none"
    hit = re.search(r'```json\s*(\{.*?\})\s*```', raw, re.DOTALL)
    if hit:
        try:
            parsed = json.loads(hit.group(1))
        except ValueError:
            parsed = None
        if isinstance(parsed, dict) and isinstance(parsed.get("chart"), str):
            chart = parsed["chart"]
    text = re.sub(r'```json.*?```', '', raw, flags=re.DOTALL).strip()
    return text, chart


def ai_chat(user_msg, history):
    """Answer a chat message.

    Returns ``(new_history, "", chart_or_None)``: the updated message list,
    an empty string that clears the input box, and the chart the answer
    refers to. Uses the hosted LLM when enabled and falls back to keyword
    matching when it is disabled, fails, or returns no text.
    """
    if not user_msg or not user_msg.strip():
        return history or [], "", None
    safe_history = [item for item in (history or []) if isinstance(item, dict) and "role" in item]
    reply, chart_name = None, "none"
    if LLM_ENABLED:
        try:
            # Forward only role/content of plain-text turns to the API.
            context = [{"role": item["role"], "content": item["content"]}
                       for item in safe_history[-10:]
                       if isinstance(item.get("content"), str)]
            msgs = [{"role": "system", "content": AI_SYSTEM}] + context + [{"role": "user", "content": user_msg}]
            r = llm_client.chat_completion(model=MODEL_NAME, messages=msgs, temperature=0.3,
                                           max_tokens=400, stream=False)
            raw = r["choices"][0]["message"]["content"] if isinstance(r, dict) else r.choices[0].message.content
            reply, chart_name = _split_llm_reply(raw)
        except Exception:
            reply = None  # any API failure degrades to the keyword fallback
    if not reply:
        reply, chart_name = _keyword_reply(user_msg)
    chart_fn = CHART_MAP.get(chart_name)
    chart_out = chart_fn() if chart_fn else None
    new_history = safe_history + [{"role": "user", "content": user_msg},
                                  {"role": "assistant", "content": reply}]
    return new_history, "", chart_out


def refresh_insights():
    """Rebuild the KPI strip and all seven dashboard charts."""
    return (render_kpis(), build_edu_chart(), build_gender_chart(),
            build_exp_chart(), build_tier_chart(), build_prog_chart(),
            build_sentiment_chart(), build_scatter_chart())


# =========================================================
# PIPELINE RUNNER
# =========================================================
def run_notebook_safe(env_key, default):
    """Execute a notebook with papermill and return a log message.

    The notebook name is read from environment variable *env_key* (falling
    back to *default*) and resolved against ``BASE_DIR``. On success the data
    cache is cleared and the model is reloaded so later requests see the new
    artifacts. Never raises; failures are returned as text.
    """
    global RF_MODEL, TRAIN_COLS
    nb_name = os.environ.get(env_key, default).strip()
    nb_in = BASE_DIR / nb_name
    if not nb_in.exists():
        return f"❌ {nb_name} not found.\nUpload the notebook to your HuggingFace Space Files tab."
    try:
        import papermill as pm
        out = BASE_DIR / "runs" / f"run_{time.strftime('%Y%m%d-%H%M%S')}_{nb_name}"
        out.parent.mkdir(parents=True, exist_ok=True)
        pm.execute_notebook(str(nb_in), str(out), cwd=str(BASE_DIR), log_output=True,
                            progress_bar=False, execution_timeout=1800)
        _CACHE.clear()
        RF_MODEL, TRAIN_COLS = _load_model()  # pick up a retrained model
        return f"✅ {nb_name} completed.\nCSVs: {[p.name for p in BASE_DIR.glob('*.csv')]}"
    except Exception as e:
        return f"❌ FAILED: {e}\n\n{traceback.format_exc()[-1500:]}"


def run_nb1():
    """Run the data-extraction notebook (name overridable via env var NB1)."""
    return run_notebook_safe("NB1", "F2_Data_Extraction_and_synthetic_enrichment.ipynb")


def run_nb2():
    """Run the analysis/training notebook (name overridable via env var NB2)."""
    return run_notebook_safe("NB2", "F2_quantitative_and_qualitative_analysis___prediction.ipynb")


def run_both():
    """Run both notebooks in order and return the combined log."""
    return f"{'='*44}\nNOTEBOOK 1\n{'='*44}\n{run_nb1()}\n\n{'='*44}\nNOTEBOOK 2\n{'='*44}\n{run_nb2()}"


def load_css():
    """Return the contents of ``style.css`` (if present) plus built-in overrides."""
    p = BASE_DIR / "style.css"
    extra = " .gr-button-primary{background:rgb(40,9,109)!important;border:none!important;} .tab-nav button{font-weight:700!important;font-size:14px!important;} "
    return (p.read_text(encoding="utf-8") if p.exists() else "") + extra


# =========================================================
# UI
# =========================================================
# NOTE(review): the nesting of rows/columns below is reconstructed from the
# statement order — confirm against the intended layout.
with gr.Blocks(title="F2 Salary Intelligence — ESCP") as demo:
    gr.Markdown("# F2 Salary Intelligence Platform\n*AI-powered salary prediction, analysis & insights — ESCP Big Data Project*", elem_id="escp_title")

    # ── TAB 1: SALARY PREDICTOR ─────────────────────────────
    with gr.Tab("🎯 Salary Predictor"):
        gr.Markdown("### Predict your salary and future trajectory\n*Enter your profile — our AI predicts your salary, adjusts for feedback sentiment via n8n, and forecasts your next 5 years.*")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("#### 👤 Your Profile")
                p_name = gr.Textbox(label="Name (optional)", placeholder="e.g. Jane Doe")
                p_age = gr.Slider(18, 70, value=30, step=1, label="Age")
                p_exp = gr.Slider(0, 40, value=5, step=1, label="Years of Experience")
                p_edu = gr.Dropdown(["Bachelor's", "Master's", "PhD"], value="Bachelor's", label="Education Level", interactive=True)
                p_job = gr.Textbox(label="Job Title", value="Data Analyst")
                p_gender = gr.Dropdown(["Male", "Female"], value="Male", label="Gender", interactive=True)
                gr.Markdown("#### 💬 Feedback & Sentiment")
                gr.Markdown("*Your feedback influences salary via **Automation 2 (HR Alert)***")
                p_feedback = gr.Textbox(label="Performance Feedback", placeholder='e.g. "Consistently exceeds expectations and shows strong leadership..."', lines=3, interactive=True)
                btn_pred = gr.Button("🚀 Predict My Salary", variant="primary", size="lg")
            with gr.Column(scale=1):
                gr.Markdown("#### 📊 Prediction Results")
                out_gauge = gr.Plot(label="Salary Estimate")
                out_result = gr.HTML()
        gr.Markdown("#### 📈 5-Year Salary Trajectory")
        with gr.Row():
            out_proj_chart = gr.Plot(label="Salary Projection")
            out_proj_table = gr.HTML()
        gr.Markdown("#### 🗂️ Example Profiles")
        gr.Examples(
            examples=[
                ["", 28, 3, "Bachelor's", "Junior Data Analyst", "Female", "Shows promise and responds well to coaching."],
                ["", 35, 10, "Master's", "Senior Data Scientist", "Male", "Consistently exceeds expectations with strong leadership."],
                ["", 45, 20, "PhD", "Director of Analytics", "Female", "Outstanding mentor who drives cross-team innovation."],
                ["", 22, 1, "Bachelor's", "Intern", "Male", "Still ramping up but shows enthusiasm."],
                ["", 50, 25, "Master's", "VP of Engineering", "Male", "Completely fails deadlines and shows poor leadership."],
            ],
            inputs=[p_name, p_age, p_exp, p_edu, p_job, p_gender, p_feedback],
        )
        btn_pred.click(run_predictor,
                       inputs=[p_age, p_exp, p_edu, p_gender, p_job, p_name, p_feedback],
                       outputs=[out_gauge, out_proj_chart, out_result, out_proj_table])

    # ── TAB 2: SALARY ANALYZER ──────────────────────────────
    with gr.Tab("🔍 Salary Analyzer"):
        gr.Markdown("### Is your salary fair?\n*Enter your current salary and profile — **Automation 3** detects anomalies and compares you to market benchmarks.*")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("#### 💰 Your Salary & Profile")
                a_salary = gr.Number(label="Your Current Annual Salary ($)", value=75000)
                a_age = gr.Slider(18, 70, value=30, step=1, label="Age")
                a_exp = gr.Slider(0, 40, value=5, step=1, label="Years of Experience")
                a_edu = gr.Dropdown(["Bachelor's", "Master's", "PhD"], value="Bachelor's", label="Education Level", interactive=True)
                a_gender = gr.Dropdown(["Male", "Female"], value="Male", label="Gender", interactive=True)
                a_job = gr.Textbox(label="Job Title", value="Data Analyst")
                btn_ana = gr.Button("🔍 Analyze My Salary", variant="primary", size="lg")
            with gr.Column(scale=1):
                gr.Markdown("#### 📊 Analysis Results")
                out_dev = gr.Plot(label="Deviation Gauge")
                out_ana_res = gr.HTML()
        gr.Markdown("#### 📊 Market Comparison")
        out_compare = gr.Plot(label="Salary Comparison")
        btn_ana.click(run_analyzer,
                      inputs=[a_salary, a_age, a_exp, a_edu, a_gender, a_job],
                      outputs=[out_dev, out_compare, out_ana_res])

    # ── TAB 3: INSIGHTS ─────────────────────────────────────
    with gr.Tab("📊 Insights"):
        gr.Markdown("### Data Insights & AI Analysis\n*Explore our findings — interactive charts + AI assistant*")
        kpi_html = gr.HTML(value=render_kpis)
        ref_btn = gr.Button("🔄 Refresh Data", variant="secondary", size="sm")
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("#### 💬 Ask our AI")
                chatbot = gr.Chatbot(label="AI Analytics Assistant", height=380)
                user_input = gr.Textbox(label="Ask about the data", placeholder='e.g. "How does education affect salary?"', lines=1)
                gr.Markdown("**Quick questions:**")
                with gr.Row():
                    ex1 = gr.Button("📚 Education", size="sm")
                    ex2 = gr.Button("📈 Career tiers", size="sm")
                    ex3 = gr.Button("⚧ Gender gap", size="sm")
                with gr.Row():
                    ex4 = gr.Button("📅 Progression", size="sm")
                    ex5 = gr.Button("💬 Sentiment", size="sm")
                    ex6 = gr.Button("🔗 Scatter", size="sm")
            with gr.Column(scale=1):
                ai_chart = gr.Plot(label="Interactive Chart")
        gr.Markdown("#### 📊 Key Charts")
        with gr.Row():
            c_edu = gr.Plot(label="Education")
            c_gender = gr.Plot(label="Gender")
        with gr.Row():
            c_exp = gr.Plot(label="Experience")
            c_tier = gr.Plot(label="Career Tier")
        with gr.Row():
            c_prog = gr.Plot(label="Progression")
            c_sent = gr.Plot(label="Sentiment")
        c_scatter = gr.Plot(label="Scatter")

        # Quick-question buttons only pre-fill the input box.
        ex1.click(lambda: "How does education level affect salary?", outputs=[user_input])
        ex2.click(lambda: "Show career tier salary differences", outputs=[user_input])
        ex3.click(lambda: "Is there a gender pay gap?", outputs=[user_input])
        ex4.click(lambda: "Show salary progression over time", outputs=[user_input])
        ex5.click(lambda: "Do senior employees get more positive feedback?", outputs=[user_input])
        ex6.click(lambda: "Show experience vs salary scatter", outputs=[user_input])
        user_input.submit(ai_chat, inputs=[user_input, chatbot], outputs=[chatbot, user_input, ai_chart])

        def on_refresh():
            kpi, edu, gender, exp, tier, prog, sent, scatter = refresh_insights()
            return kpi, edu, gender, exp, tier, prog, sent, scatter

        insight_outputs = [kpi_html, c_edu, c_gender, c_exp, c_tier, c_prog, c_sent, c_scatter]
        ref_btn.click(on_refresh, outputs=insight_outputs)
        # Fill the key charts on page load; otherwise they stay blank until
        # the refresh button is pressed.
        demo.load(on_refresh, outputs=insight_outputs)

    # ── TAB 4: PIPELINE (minimal) ───────────────────────────
    with gr.Tab("⚙️ Pipeline"):
        gr.Markdown("### Data Pipeline\n*Regenerate synthetic data and retrain the model.*")
        with gr.Row():
            btn_nb1 = gr.Button("▶ Step 1: Data Creation", variant="secondary")
            btn_nb2 = gr.Button("▶ Step 2: Analysis & Training", variant="secondary")
            btn_all = gr.Button("⚡ Run Full Pipeline", variant="primary")
        pipe_log = gr.Textbox(label="Log", lines=15, interactive=False)
        btn_nb1.click(run_nb1, outputs=[pipe_log])
        btn_nb2.click(run_nb2, outputs=[pipe_log])
        btn_all.click(run_both, outputs=[pipe_log])

# NOTE(review): allowed_paths covers the whole application directory (model
# pickles, CSVs, notebooks) — confirm that this exposure is intended.
demo.launch(css=load_css(), allowed_paths=[str(BASE_DIR)])