""" CS1 Group 14 — Job Description Risk Analyzer Built for Gradio 4.44 / Hugging Face Spaces """ import os import re import json from pathlib import Path from typing import Dict, List, Tuple import pandas as pd import gradio as gr import plotly.graph_objects as go import plotly.express as px # ========================================================= # CONFIG # ========================================================= BASE_DIR = Path(__file__).resolve().parent DATA_FILE = BASE_DIR / "job_description_data.xlsx" N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip() # ========================================================= # RED FLAG TAXONOMY # ========================================================= RED_FLAGS = [ ("high responsibility early", +10, ["full ownership", "lead the", "responsible for", "drive the", "own the", "manage the team", "take charge"]), ("high autonomy / ownership", +10, ["autonomous", "self-starter", "work independently", "minimal supervision", "own initiative"]), ("adaptability / flexibility demand", +8, ["flexible", "adaptable", "fast-paced", "changing priorities", "wear many hats"]), ("cross-functional / many stakeholders",+8, ["cross-functional", "multiple stakeholders", "various teams", "coordinate with", "liaise"]), ("customer-facing emotional labor", +6, ["customer-facing", "client-facing", "handle complaints", "difficult customers"]), ("technical complexity", +6, ["python", "sql", "machine learning", "api", "data pipeline", "advanced", "complex systems"]), ("on-site only / no remote", +5, ["on-site only", "no remote", "in-office", "fully on-site", "presence required"]), ("travel / mobility", +5, ["travel required", "mobility", "frequent travel", "willing to travel"]), ("pressure / deadlines", +5, ["tight deadlines", "high pressure", "fast deadlines", "demanding schedule"]), ("broad / unclear scope", +5, ["other duties", "as needed", "various tasks", "wide range of responsibilities"]), ("multitasking / many hats", +5, ["multitask", "juggle", "multiple roles"]), ("training / support provided", -8, ["training provided", "mentorship", "onboarding", "support and training", "we will train"]), ("salary clearly specified", -6, ["salary:", "compensation:", "annual salary", "monthly salary"]), ("clear role structure", -5, ["responsibilities include", "your missions", "main tasks", "key responsibilities"]), ("benefits clearly mentioned", -4, ["health insurance", "paid leave", "meal vouchers", "transport", "benefits include", "profit-sharing"]), ] # ========================================================= # DATA LOADING # ========================================================= def load_dataset(): if not DATA_FILE.exists(): return pd.DataFrame() try: return pd.read_excel(DATA_FILE) except Exception: return pd.DataFrame() DF = load_dataset() def extract_flag_labels(red_flags_cell): if not isinstance(red_flags_cell, str): return [] out = [] for part in re.split(r",\s*(?=[a-zA-Z])", red_flags_cell): m = re.match(r"(.+?)\s*\(([+-]\d+)\)", part.strip()) if m: out.append((m.group(1).strip(), int(m.group(2)))) return out # ========================================================= # CORE: ANALYZE # ========================================================= def classify_risk(score): if score < 12: return "Low", "đŸŸĸ" if score < 25: return "Medium", "🟡" return "High", "🔴" def analyze_job(text): if not text or len(text.strip()) < 30: return "âš ī¸ Please paste a real job description (at least 30 characters).", 0, "—", _empty_chart("Paste a job description above") lower = text.lower() detected = [] score = 0 for label, weight, patterns in RED_FLAGS: if any(p in lower for p in patterns): detected.append((label, weight)) score += weight risk, emoji = classify_risk(score) md = "## " + emoji + " Risk: **" + risk + "** | Score: **" + str(score) + "**\n\n" if not detected: md += "_No clear red or positive signals detected._" else: bad = [(l, w) for l, w in detected if w > 0] good = [(l, w) for l, w in detected if w < 0] if bad: md += "### 🚩 Red flags detected\n" for l, w in bad: md += "- **" + l + "** `(+" + str(w) + ")`\n" if good: md += "\n### ✅ Positive signals detected\n" for l, w in good: md += "- **" + l + "** `(" + str(w) + ")`\n" if detected: cdf = pd.DataFrame(detected, columns=["Signal", "Weight"]) cdf["Type"] = cdf["Weight"].apply(lambda w: "Red flag" if w > 0 else "Positive") fig = px.bar(cdf, x="Weight", y="Signal", color="Type", orientation="h", color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"}, title="Signal breakdown") fig.update_layout(**_styled_layout(height=420)) else: fig = _empty_chart("No signals to chart") return md, score, risk, fig # ========================================================= # CHARTS # ========================================================= def _styled_layout(**kwargs): defaults = dict( template="plotly_white", paper_bgcolor="#fdfaf3", plot_bgcolor="#fdfaf3", font=dict(family="system-ui, sans-serif", color="#1a2238", size=12), margin=dict(l=60, r=20, t=70, b=70), ) defaults.update(kwargs) return defaults def _empty_chart(title): fig = go.Figure() fig.update_layout( title=title, height=420, template="plotly_white", paper_bgcolor="#fdfaf3", plot_bgcolor="#fdfaf3", annotations=[dict(text="(no data)", x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False, font=dict(size=14, color="#8a9099"))], ) return fig def build_flag_frequency_chart(): if DF.empty or "Red Flags" not in DF.columns: return _empty_chart("Dataset not loaded") all_flags = [] for cell in DF["Red Flags"].dropna(): all_flags.extend(label for label, _ in extract_flag_labels(str(cell))) counts = pd.Series(all_flags).value_counts().head(12) fig = go.Figure(go.Bar( y=counts.index[::-1], x=counts.values[::-1], orientation="h", marker=dict(color="#e85a4f"), )) fig.update_layout(**_styled_layout(height=460, title="Most Common Signals Across Analyzed Jobs")) return fig def build_risk_distribution_chart(): if DF.empty or "Risk Level" not in DF.columns: return _empty_chart("Dataset not loaded") counts = DF["Risk Level"].value_counts() colors_map = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"} fig = go.Figure(go.Pie( labels=counts.index, values=counts.values, marker=dict(colors=[colors_map.get(l, "#888") for l in counts.index]), hole=0.4, )) fig.update_layout(**_styled_layout(height=400, title="Risk Level Distribution")) return fig def build_score_distribution_chart(): if DF.empty or "Score" not in DF.columns: return _empty_chart("Dataset not loaded") scores = DF["Score"].dropna() fig = go.Figure(go.Histogram(x=scores, nbinsx=15, marker_color="#e85a4f")) fig.update_layout(**_styled_layout(height=380, title="Risk Score Distribution")) return fig # ========================================================= # KPI CARDS # ========================================================= def render_kpi_cards(): if DF.empty: return '
No dataset loaded.
' total_jobs = len(DF) avg_score = DF["Score"].dropna().mean() if "Score" in DF.columns else 0 risk_counts = DF["Risk Level"].value_counts() if "Risk Level" in DF.columns else pd.Series() high_pct = (risk_counts.get("High", 0) / total_jobs * 100) if total_jobs else 0 all_flags = [] if "Red Flags" in DF.columns: for cell in DF["Red Flags"].dropna(): all_flags.extend(label for label, _ in extract_flag_labels(str(cell))) top_flag = pd.Series(all_flags).value_counts().index[0] if all_flags else "—" def card(label, value, sub, color): return ( '
' '
' + label + '
' '
' + str(value) + '
' '
' + sub + '
' '
' ) cards = [ card("Total.Jobs", total_jobs, "real labeled postings", "#e85a4f"), card("Avg.Score", str(round(avg_score, 1)), "weighted across dataset", "#2a9d8f"), card("High.Risk %", str(round(high_pct)) + "%", str(risk_counts.get("High", 0)) + " postings flagged", "#c53030"), card("Top.Signal", top_flag.split(' ')[0].title() if top_flag != "—" else "—", top_flag if top_flag != "—" else "no data", "#7d4e8a"), ] return ('
' + "".join(cards) + '
') # ========================================================= # CHAT (n8n -> keyword fallback) # ========================================================= def keyword_fallback(msg): m = msg.lower() if any(w in m for w in ["common", "frequent", "most", "top"]): return ("The most common signals in our dataset are 'high responsibility early', " "'technical complexity', and 'clear role structure'. These appear in over 60% of postings."), "flag_frequency" if any(w in m for w in ["risk", "distribution", "level"]): return ("Most jobs land in the Medium risk tier (scores 12-24). High-risk postings combine " "multiple red flags like vague scope, on-site-only, and missing salary information."), "risk_distribution" if any(w in m for w in ["score", "histogram", "spread"]): return ("Risk scores cluster between 10-25 in our dataset. Anything above 25 signals " "a problematic posting."), "score_distribution" if any(w in m for w in ["how", "work", "explain", "method"]): return ("The analyzer scans for 15 weighted signal categories. Red flags add to the score, " "positive signals subtract. The total maps to Low/Medium/High risk."), "none" return ("Try asking: most common red flags, risk distribution, score spread, or how it works."), "none" def call_n8n(msg): import requests try: r = requests.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=15) data = r.json() return data.get("answer", "n8n returned no answer."), data.get("chart", "none") except Exception as e: fb_text, fb_chart = keyword_fallback(msg) return "(n8n unavailable, using local logic)\n\n" + fb_text, fb_chart def ai_chat(user_msg, history): if not user_msg or not user_msg.strip(): return history or [], "", None if N8N_WEBHOOK_URL: reply, chart_key = call_n8n(user_msg) else: reply, chart_key = keyword_fallback(user_msg) builders = { "flag_frequency": build_flag_frequency_chart, "risk_distribution": build_risk_distribution_chart, "score_distribution": build_score_distribution_chart, } chart_out = builders[chart_key]() if chart_key in builders else None new_history = (history or []) + [(user_msg, reply)] return new_history, "", chart_out # ========================================================= # CSS LOADER # ========================================================= def load_css(): css_path = BASE_DIR / "style.css" if css_path.exists(): return css_path.read_text(encoding="utf-8") return "" # ========================================================= # UI # ========================================================= CSS = load_css() with gr.Blocks(title="Job Risk Analyzer", css=CSS) as demo: gr.Markdown( "# Job Risk Analyzer\n" "Detect hidden risk patterns in job postings using a weighted signal taxonomy " "calibrated on 47 real labeled descriptions.", elem_id="escp_title", ) with gr.Tab("🔍 Analyze a Job"): gr.Markdown("Paste any job description below to detect red flags and estimate risk.") with gr.Row(): with gr.Column(): inp = gr.Textbox(label="Job description", lines=15, placeholder="Paste the full job posting here...") btn = gr.Button("Analyze", variant="primary") with gr.Column(): out_md = gr.Markdown() with gr.Row(): out_score = gr.Number(label="Score", precision=0) out_risk = gr.Textbox(label="Risk Level") out_chart = gr.Plot(label="Signal breakdown") btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart]) with gr.Tab("📊 Dataset Dashboard"): gr.HTML(value=render_kpi_cards()) gr.Markdown("### Insights from labeled job postings") gr.Plot(value=build_flag_frequency_chart(), label="Most common signals") with gr.Row(): gr.Plot(value=build_risk_distribution_chart(), label="Risk distribution") gr.Plot(value=build_score_distribution_chart(), label="Score distribution") if not DF.empty: display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns] if display_cols: gr.Markdown("### Raw labeled dataset") gr.Dataframe(DF[display_cols], wrap=True, interactive=False) with gr.Tab('"AI" Dashboard'): status = ("Connected to **n8n workflow**." if N8N_WEBHOOK_URL else "Using **keyword matching** (set `N8N_WEBHOOK_URL` to upgrade).") gr.Markdown("### Ask questions, get visualizations\n\n" + status) with gr.Row(): with gr.Column(): chatbot = gr.Chatbot(label="Conversation", height=380) user_input = gr.Textbox(label="Ask about the dataset", placeholder="e.g. What are the most common red flags?") gr.Examples( examples=[ "What are the most common red flags?", "Show me the risk level distribution", "How is the score spread across jobs?", "How does the analyzer work?", ], inputs=user_input, ) with gr.Column(): ai_chart = gr.Plot(label="Visualization") user_input.submit(ai_chat, inputs=[user_input, chatbot], outputs=[chatbot, user_input, ai_chart]) with gr.Tab("â„šī¸ About"): gr.Markdown(""" ### How it works This app uses a **weighted red-flag taxonomy** built from 47 real labeled job postings. Each detected signal contributes to a total score that maps to Low / Medium / High risk. - đŸŸĸ **Low** (< 12): Healthy posting with clear structure and benefits - 🟡 **Medium** (12–24): Some warning signs worth investigating - 🔴 **High** (â‰Ĩ 25): Multiple concerning patterns ### Team — CS1 Group 14 - **Gaspard** — UX Designer + Content Specialist (HF Space, Gradio app, n8n workflow, testing) - **Person 3** — Data Analyst (extraction, analysis, charts) - **Person 4** — Project Manager (final report, coordination) ### Iterations - **v1** — Keyword matching with hard-coded weights from labeled dataset - **v2** — Refined keyword patterns after user testing - **v3** — Integrated n8n workflow for smarter conversational responses """) demo.launch(server_name="0.0.0.0", server_port=7860)