Spaces:
Paused
Paused
| """ | |
| CS1 Group 14 — Job Description Risk Analyzer | |
| Built for Gradio 4.44 / Hugging Face Spaces | |
| """ | |
| import os | |
| import re | |
| import json | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple | |
| import pandas as pd | |
| import gradio as gr | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| # ========================================================= | |
| # CONFIG | |
| # ========================================================= | |
# Directory that contains this script; bundled assets are resolved against it.
BASE_DIR = Path(__file__).resolve().parent
# Spreadsheet of labeled job postings that feeds the dataset dashboard.
DATA_FILE = BASE_DIR / "job_description_data.xlsx"
# Optional n8n webhook endpoint taken from the environment. An empty string
# makes the chat tab answer with local keyword matching instead.
N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
| # ========================================================= | |
| # RED FLAG TAXONOMY | |
| # ========================================================= | |
# Signal taxonomy used by the analyzer. Each entry is a tuple of
#   (label, weight, patterns)
# where `patterns` are lowercase phrases looked up in the lower-cased job
# text. A category contributes its weight at most once per text: positive
# weights raise the risk score, negative weights (reassuring signals) lower it.
RED_FLAGS = [
    ("high responsibility early", +10, ["full ownership", "lead the", "responsible for", "drive the", "own the", "manage the team", "take charge"]),
    ("high autonomy / ownership", +10, ["autonomous", "self-starter", "work independently", "minimal supervision", "own initiative"]),
    ("adaptability / flexibility demand", +8, ["flexible", "adaptable", "fast-paced", "changing priorities", "wear many hats"]),
    ("cross-functional / many stakeholders", +8, ["cross-functional", "multiple stakeholders", "various teams", "coordinate with", "liaise"]),
    ("customer-facing emotional labor", +6, ["customer-facing", "client-facing", "handle complaints", "difficult customers"]),
    ("technical complexity", +6, ["python", "sql", "machine learning", "api", "data pipeline", "advanced", "complex systems"]),
    ("on-site only / no remote", +5, ["on-site only", "no remote", "in-office", "fully on-site", "presence required"]),
    ("travel / mobility", +5, ["travel required", "mobility", "frequent travel", "willing to travel"]),
    ("pressure / deadlines", +5, ["tight deadlines", "high pressure", "fast deadlines", "demanding schedule"]),
    ("broad / unclear scope", +5, ["other duties", "as needed", "various tasks", "wide range of responsibilities"]),
    ("multitasking / many hats", +5, ["multitask", "juggle", "multiple roles"]),
    ("training / support provided", -8, ["training provided", "mentorship", "onboarding", "support and training", "we will train"]),
    ("salary clearly specified", -6, ["salary:", "compensation:", "annual salary", "monthly salary"]),
    ("clear role structure", -5, ["responsibilities include", "your missions", "main tasks", "key responsibilities"]),
    ("benefits clearly mentioned", -4, ["health insurance", "paid leave", "meal vouchers", "transport", "benefits include", "profit-sharing"]),
]
| # ========================================================= | |
| # DATA LOADING | |
| # ========================================================= | |
def load_dataset():
    """Load the labeled job-postings spreadsheet.

    Returns:
        The contents of ``DATA_FILE`` as a ``pandas.DataFrame``. An empty
        DataFrame is returned when the file is missing or cannot be read, so
        the app can still start without the dataset.
    """
    import logging

    log = logging.getLogger(__name__)

    if not DATA_FILE.exists():
        log.warning("Dataset file not found: %s", DATA_FILE)
        return pd.DataFrame()

    try:
        return pd.read_excel(DATA_FILE)
    except Exception:
        # Best-effort on purpose: a broken spreadsheet must not prevent the
        # UI from starting, but the reason is recorded instead of vanishing.
        log.exception("Could not read dataset file: %s", DATA_FILE)
        return pd.DataFrame()
# Dataset is loaded once at import time and shared by every chart/KPI builder;
# it is an empty DataFrame when the spreadsheet could not be loaded.
DF = load_dataset()
def extract_flag_labels(red_flags_cell):
    """Parse a "Red Flags" cell into ``(label, weight)`` pairs.

    The cell is expected to look like ``"label a (+10), label b (-8)"``.
    Entries are separated by a comma that is followed by a letter, and each
    entry must end in a signed integer in parentheses; anything else is
    skipped.

    Args:
        red_flags_cell: Raw cell value. Non-string values yield ``[]``.

    Returns:
        A list of ``(label, weight)`` tuples in the order they appear.
    """
    if not isinstance(red_flags_cell, str):
        return []

    chunks = re.split(r",\s*(?=[a-zA-Z])", red_flags_cell)
    hits = (re.match(r"(.+?)\s*\(([+-]\d+)\)", chunk.strip()) for chunk in chunks)
    return [(hit.group(1).strip(), int(hit.group(2))) for hit in hits if hit]
| # ========================================================= | |
| # CORE: ANALYZE | |
| # ========================================================= | |
def classify_risk(score):
    """Map a numeric score to a ``(risk level, icon)`` pair.

    Scores below 12 are "Low", scores below 25 are "Medium", and everything
    else is "High".
    """
    # NOTE(review): the icon literals look mis-encoded (mojibake) in this
    # listing; they are reproduced as-is, confirm the intended glyphs.
    tiers = (
        (12, "Low", "π’"),
        (25, "Medium", "π‘"),
    )
    for upper_bound, level, icon in tiers:
        if score < upper_bound:
            return level, icon
    return "High", "π΄"
def _detect_signals(text):
    """Return the ``(label, weight)`` pairs of RED_FLAGS categories found in *text*."""
    lowered = text.lower()
    found = []
    for label, weight, patterns in RED_FLAGS:
        # A pattern must start at a word boundary so that "api" does not fire
        # on "rapid" and "own the" does not fire on "down the". Suffixes are
        # still allowed ("multitask" keeps matching "multitasking").
        if any(re.search(r"(?<!\w)" + re.escape(p), lowered) for p in patterns):
            found.append((label, weight))
    return found


def _signals_markdown(score, risk, emoji, detected):
    """Build the Markdown summary shown next to the score."""
    parts = ["## " + emoji + " Risk: **" + risk + "** | Score: **" + str(score) + "**\n\n"]
    if not detected:
        parts.append("_No clear red or positive signals detected._")
        return "".join(parts)

    bad = [(label, weight) for label, weight in detected if weight > 0]
    good = [(label, weight) for label, weight in detected if weight < 0]
    if bad:
        parts.append("### π© Red flags detected\n")
        parts.extend("- **" + label + "** `(+" + str(weight) + ")`\n" for label, weight in bad)
    if good:
        parts.append("\n### β Positive signals detected\n")
        parts.extend("- **" + label + "** `(" + str(weight) + ")`\n" for label, weight in good)
    return "".join(parts)


def analyze_job(text):
    """Score a job description against the RED_FLAGS taxonomy.

    Each category whose pattern occurs in the text contributes its weight
    once; the total is mapped to a risk level by ``classify_risk``.

    Args:
        text: The pasted job description.

    Returns:
        A 4-tuple ``(markdown, score, risk, figure)`` matching the Gradio
        outputs. Inputs shorter than 30 characters after stripping (or empty)
        get a prompt message, score 0 and a placeholder chart.
    """
    # NOTE(review): several literals below look mis-encoded (mojibake) in this
    # listing; they are reproduced as-is, confirm the intended glyphs.
    if not text or len(text.strip()) < 30:
        return (
            "β οΈ Please paste a real job description (at least 30 characters).",
            0,
            "β",
            _empty_chart("Paste a job description above"),
        )

    detected = _detect_signals(text)
    score = sum(weight for _, weight in detected)
    risk, emoji = classify_risk(score)
    md = _signals_markdown(score, risk, emoji, detected)

    if not detected:
        return md, score, risk, _empty_chart("No signals to chart")

    cdf = pd.DataFrame(detected, columns=["Signal", "Weight"])
    cdf["Type"] = cdf["Weight"].apply(lambda w: "Red flag" if w > 0 else "Positive")
    fig = px.bar(
        cdf, x="Weight", y="Signal", color="Type", orientation="h",
        color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"},
        title="Signal breakdown",
    )
    fig.update_layout(**_styled_layout(height=420))
    return md, score, risk, fig
| # ========================================================= | |
| # CHARTS | |
| # ========================================================= | |
| def _styled_layout(**kwargs): | |
| defaults = dict( | |
| template="plotly_white", | |
| paper_bgcolor="#fdfaf3", | |
| plot_bgcolor="#fdfaf3", | |
| font=dict(family="system-ui, sans-serif", color="#1a2238", size=12), | |
| margin=dict(l=60, r=20, t=70, b=70), | |
| ) | |
| defaults.update(kwargs) | |
| return defaults | |
def _empty_chart(title):
    """Return a placeholder figure that shows *title* and a "(no data)" note.

    The figure goes through ``_styled_layout`` so placeholders share the same
    theme (background, font, margins) as the real charts instead of keeping a
    private copy of the colour constants.
    """
    placeholder = dict(
        text="(no data)", x=0.5, y=0.5, xref="paper", yref="paper",
        showarrow=False, font=dict(size=14, color="#8a9099"),
    )
    fig = go.Figure()
    fig.update_layout(**_styled_layout(title=title, height=420, annotations=[placeholder]))
    return fig
def build_flag_frequency_chart():
    """Build a horizontal bar chart of the 12 most frequent dataset signals.

    Returns:
        A Plotly figure. A placeholder figure is returned when the dataset is
        not loaded, has no "Red Flags" column, or contains no parsable labels.
    """
    if DF.empty or "Red Flags" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    all_flags = [
        label
        for cell in DF["Red Flags"].dropna()
        for label, _ in extract_flag_labels(str(cell))
    ]
    if not all_flags:
        # Without this guard the user gets blank axes with no explanation.
        return _empty_chart("No signals found in dataset")

    counts = pd.Series(all_flags).value_counts().head(12)
    fig = go.Figure(go.Bar(
        # Reversed so the most frequent signal is drawn at the top.
        y=counts.index[::-1], x=counts.values[::-1], orientation="h",
        marker=dict(color="#e85a4f"),
    ))
    fig.update_layout(**_styled_layout(height=460, title="Most Common Signals Across Analyzed Jobs"))
    return fig
def build_risk_distribution_chart():
    """Build a donut chart of how many postings fall into each risk level.

    Returns:
        A Plotly figure, or a placeholder figure when the dataset is empty or
        has no "Risk Level" column.
    """
    if DF.empty or "Risk Level" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    level_counts = DF["Risk Level"].value_counts()
    palette = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"}

    # Levels outside the palette fall back to neutral grey.
    slice_colors = []
    for level in level_counts.index:
        slice_colors.append(palette.get(level, "#888"))

    donut = go.Pie(
        labels=level_counts.index,
        values=level_counts.values,
        marker=dict(colors=slice_colors),
        hole=0.4,
    )
    fig = go.Figure(donut)
    layout = _styled_layout(height=400, title="Risk Level Distribution")
    fig.update_layout(**layout)
    return fig
def build_score_distribution_chart():
    """Build a histogram of the dataset's "Score" column.

    Returns:
        A Plotly figure, or a placeholder figure when the dataset is empty or
        has no "Score" column.
    """
    if DF.empty or "Score" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    valid_scores = DF["Score"].dropna()
    histogram = go.Histogram(x=valid_scores, nbinsx=15, marker_color="#e85a4f")
    fig = go.Figure(histogram)
    layout = _styled_layout(height=380, title="Risk Score Distribution")
    fig.update_layout(**layout)
    return fig
| # ========================================================= | |
| # KPI CARDS | |
| # ========================================================= | |
def render_kpi_cards():
    """Render the dashboard KPI cards as an HTML grid.

    The cards show the number of postings, the average score, the share of
    "High" risk postings and the most frequent signal label.

    Returns:
        An HTML string; a single "No dataset loaded." box when the dataset is
        empty.
    """
    import html

    if DF.empty:
        return '<div style="background:#fdfaf3;padding:32px;text-align:center;border-radius:12px;border:1px solid #d9cfb9;color:#4a5475;">No dataset loaded.</div>'

    total_jobs = len(DF)

    # Coerce to numbers so stray text cells cannot break the mean, and fall
    # back to 0 (same as a missing column) instead of displaying "nan".
    if "Score" in DF.columns:
        scores = pd.to_numeric(DF["Score"], errors="coerce").dropna()
    else:
        scores = pd.Series(dtype="float64")
    avg_score = float(scores.mean()) if not scores.empty else 0

    if "Risk Level" in DF.columns:
        risk_counts = DF["Risk Level"].value_counts()
    else:
        # Explicit dtype: a bare pd.Series() warns on older pandas versions.
        risk_counts = pd.Series(dtype="int64")
    high_count = risk_counts.get("High", 0)
    high_pct = (high_count / total_jobs * 100) if total_jobs else 0

    all_flags = []
    if "Red Flags" in DF.columns:
        for cell in DF["Red Flags"].dropna():
            all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))

    # NOTE(review): the "β" placeholder looks mis-encoded (mojibake) in this
    # listing; it is reproduced as-is, confirm the intended glyph.
    if all_flags:
        top_flag = pd.Series(all_flags).value_counts().index[0]
        top_value, top_sub = top_flag.split(" ")[0].title(), top_flag
    else:
        top_value, top_sub = "β", "no data"

    def card(label, value, sub, color):
        # Text fields can come from the spreadsheet, so they are HTML-escaped;
        # `color` is always one of the constants passed below.
        return (
            '<div style="background:#fdfaf3;border:1px solid #d9cfb9;border-radius:12px;'
            'padding:20px 22px;box-shadow:0 2px 8px rgba(26,34,56,0.04);">'
            '<div style="font-family:monospace;color:' + color + ';font-size:11px;font-weight:600;'
            'text-transform:uppercase;letter-spacing:0.08em;margin-bottom:14px;">' + html.escape(label) + '</div>'
            '<div style="color:#1a2238;font-size:34px;font-weight:700;line-height:1;'
            'letter-spacing:-0.03em;margin-bottom:10px;">' + html.escape(str(value)) + '</div>'
            '<div style="font-family:monospace;font-size:11px;color:#4a5475;">' + html.escape(sub) + '</div>'
            '</div>'
        )

    cards = [
        card("Total.Jobs", total_jobs, "real labeled postings", "#e85a4f"),
        card("Avg.Score", str(round(avg_score, 1)), "weighted across dataset", "#2a9d8f"),
        card("High.Risk %", str(round(high_pct)) + "%", str(high_count) + " postings flagged", "#c53030"),
        card("Top.Signal", top_value, top_sub, "#7d4e8a"),
    ]
    return ('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));'
            'gap:12px;margin-bottom:24px;">' + "".join(cards) + '</div>')
| # ========================================================= | |
| # CHAT (n8n -> keyword fallback) | |
| # ========================================================= | |
def keyword_fallback(msg):
    """Answer a dataset question with canned text chosen by keyword.

    Rules are tried in order and the first one with a trigger word contained
    in the lower-cased message wins.

    Args:
        msg: The user's question.

    Returns:
        ``(answer, chart_key)`` where ``chart_key`` is one of
        ``"flag_frequency"``, ``"risk_distribution"``,
        ``"score_distribution"`` or ``"none"``.
    """
    lowered = msg.lower()
    rules = (
        (
            ("common", "frequent", "most", "top"),
            "The most common signals in our dataset are 'high responsibility early', "
            "'technical complexity', and 'clear role structure'. These appear in over 60% of postings.",
            "flag_frequency",
        ),
        (
            ("risk", "distribution", "level"),
            "Most jobs land in the Medium risk tier (scores 12-24). High-risk postings combine "
            "multiple red flags like vague scope, on-site-only, and missing salary information.",
            "risk_distribution",
        ),
        (
            ("score", "histogram", "spread"),
            "Risk scores cluster between 10-25 in our dataset. Anything above 25 signals "
            "a problematic posting.",
            "score_distribution",
        ),
        (
            ("how", "work", "explain", "method"),
            "The analyzer scans for 15 weighted signal categories. Red flags add to the score, "
            "positive signals subtract. The total maps to Low/Medium/High risk.",
            "none",
        ),
    )
    for triggers, answer, chart_key in rules:
        for trigger in triggers:
            if trigger in lowered:
                return answer, chart_key
    return "Try asking: most common red flags, risk distribution, score spread, or how it works.", "none"
def call_n8n(msg):
    """Ask the n8n workflow to answer *msg*, falling back to local keywords.

    The webhook is expected to reply with a JSON object that may contain an
    ``answer`` and a ``chart`` key.

    Args:
        msg: The user's question.

    Returns:
        ``(answer, chart_key)`` as strings. On any failure (network error,
        HTTP error status, invalid or unexpected JSON) the result of
        ``keyword_fallback`` is returned, prefixed with a notice.
    """
    import logging

    import requests

    try:
        r = requests.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=15)
        # Treat 4xx/5xx as a failure instead of parsing an error payload.
        r.raise_for_status()
        data = r.json()
        # `or` also covers keys that are present but null/empty; str() keeps
        # the chart key hashable for the caller's lookup table.
        answer = data.get("answer") or "n8n returned no answer."
        chart = data.get("chart") or "none"
        return str(answer), str(chart)
    except Exception:
        # Deliberate best-effort boundary: the chat must keep working, but
        # the cause is logged rather than silently dropped.
        logging.getLogger(__name__).exception("n8n webhook call failed")
        fb_text, fb_chart = keyword_fallback(msg)
        return "(n8n unavailable, using local logic)\n\n" + fb_text, fb_chart
def ai_chat(user_msg, history):
    """Handle one chat submission.

    Args:
        user_msg: Text typed by the user.
        history: Previous ``(user, reply)`` pairs, or a falsy value.

    Returns:
        ``(new_history, "", chart)``: the updated conversation, an empty
        string that clears the input box, and a figure or ``None``. A blank
        message leaves the history unchanged.
    """
    past_turns = history or []
    if not (user_msg and user_msg.strip()):
        return past_turns, "", None

    # Use the n8n workflow when a webhook is configured, else local keywords.
    responder = call_n8n if N8N_WEBHOOK_URL else keyword_fallback
    reply, chart_key = responder(user_msg)

    chart_builders = {
        "flag_frequency": build_flag_frequency_chart,
        "risk_distribution": build_risk_distribution_chart,
        "score_distribution": build_score_distribution_chart,
    }
    builder = chart_builders.get(chart_key)
    chart_out = builder() if builder is not None else None

    return past_turns + [(user_msg, reply)], "", chart_out
| # ========================================================= | |
| # CSS LOADER | |
| # ========================================================= | |
def load_css():
    """Return the contents of ``style.css`` next to this script, or ``""`` if absent."""
    stylesheet = BASE_DIR / "style.css"
    return stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""
| # ========================================================= | |
| # UI | |
| # ========================================================= | |
# Custom stylesheet text; an empty string when style.css is not present.
CSS = load_css()

# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm widget placement (e.g. which column holds the button and the plot).
# NOTE(review): some tab labels and Markdown glyphs look mis-encoded
# (mojibake); they are reproduced as-is, confirm the intended characters.
with gr.Blocks(title="Job Risk Analyzer", css=CSS) as demo:
    gr.Markdown(
        "# Job Risk Analyzer\n"
        "Detect hidden risk patterns in job postings using a weighted signal taxonomy "
        "calibrated on 47 real labeled descriptions.",
        elem_id="escp_title",
    )

    # Tab 1: paste a job description and score it with analyze_job.
    with gr.Tab("π Analyze a Job"):
        gr.Markdown("Paste any job description below to detect red flags and estimate risk.")
        with gr.Row():
            with gr.Column():
                inp = gr.Textbox(label="Job description", lines=15,
                                 placeholder="Paste the full job posting here...")
                btn = gr.Button("Analyze", variant="primary")
            with gr.Column():
                out_md = gr.Markdown()
                with gr.Row():
                    out_score = gr.Number(label="Score", precision=0)
                    out_risk = gr.Textbox(label="Risk Level")
                out_chart = gr.Plot(label="Signal breakdown")
        # Output order must match analyze_job's (markdown, score, risk, figure).
        btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart])

    # Tab 2: static dashboard; KPI cards and charts are computed once, when
    # the UI is built, from the module-level DF.
    with gr.Tab("π Dataset Dashboard"):
        gr.HTML(value=render_kpi_cards())
        gr.Markdown("### Insights from labeled job postings")
        gr.Plot(value=build_flag_frequency_chart(), label="Most common signals")
        with gr.Row():
            gr.Plot(value=build_risk_distribution_chart(), label="Risk distribution")
            gr.Plot(value=build_score_distribution_chart(), label="Score distribution")
        if not DF.empty:
            # Only show the columns that actually exist in the spreadsheet.
            display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns]
            if display_cols:
                gr.Markdown("### Raw labeled dataset")
                gr.Dataframe(DF[display_cols], wrap=True, interactive=False)

    # Tab 3: chat about the dataset; answered by n8n when a webhook URL is
    # configured, otherwise by keyword matching (see ai_chat).
    with gr.Tab('"AI" Dashboard'):
        status = ("Connected to **n8n workflow**." if N8N_WEBHOOK_URL
                  else "Using **keyword matching** (set `N8N_WEBHOOK_URL` to upgrade).")
        gr.Markdown("### Ask questions, get visualizations\n\n" + status)
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(label="Conversation", height=380)
                user_input = gr.Textbox(label="Ask about the dataset",
                                        placeholder="e.g. What are the most common red flags?")
                gr.Examples(
                    examples=[
                        "What are the most common red flags?",
                        "Show me the risk level distribution",
                        "How is the score spread across jobs?",
                        "How does the analyzer work?",
                    ],
                    inputs=user_input,
                )
            with gr.Column():
                ai_chart = gr.Plot(label="Visualization")
        # ai_chat returns (history, "", figure): the empty string clears the box.
        user_input.submit(ai_chat, inputs=[user_input, chatbot],
                          outputs=[chatbot, user_input, ai_chart])

    # Tab 4: static project description.
    with gr.Tab("βΉοΈ About"):
        gr.Markdown("""
### How it works
This app uses a **weighted red-flag taxonomy** built from 47 real labeled job postings.
Each detected signal contributes to a total score that maps to Low / Medium / High risk.
- π’ **Low** (< 12): Healthy posting with clear structure and benefits
- π‘ **Medium** (12β24): Some warning signs worth investigating
- π΄ **High** (β₯ 25): Multiple concerning patterns
### Team β CS1 Group 14
- **Gaspard** β UX Designer + Content Specialist (HF Space, Gradio app, n8n workflow, testing)
- **Person 3** β Data Analyst (extraction, analysis, charts)
- **Person 4** β Project Manager (final report, coordination)
### Iterations
- **v1** β Keyword matching with hard-coded weights from labeled dataset
- **v2** β Refined keyword patterns after user testing
- **v3** β Integrated n8n workflow for smarter conversational responses
""")

# Bind to all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)