Spaces:

ESCP
/

CS1_Group_14

Paused

File size: 16,664 Bytes

"""
CS1 Group 14 — Job Description Risk Analyzer
Built for Gradio 4.44 / Hugging Face Spaces
"""
import os
import re
import json
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
import gradio as gr
import plotly.graph_objects as go
import plotly.express as px

# =========================================================
# CONFIG
# =========================================================

# Directory containing this script; the data file and style.css are resolved against it.
BASE_DIR = Path(__file__).resolve().parent
# Excel workbook that load_dataset() reads for the dashboard charts and KPI cards.
DATA_FILE = BASE_DIR / "job_description_data.xlsx"
# Optional chat webhook URL. An unset variable yields "", which makes ai_chat
# answer with keyword_fallback instead of calling n8n.
N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()

# =========================================================
# RED FLAG TAXONOMY
# =========================================================

# Each entry is (label, weight, patterns).
# analyze_job adds `weight` to the score once per entry when any of its
# patterns is found in the lower-cased posting, so patterns must be lower-case.
# Positive weights are reported as red flags, negative weights as positive signals.
RED_FLAGS = [
    ("high responsibility early",          +10, ["full ownership", "lead the", "responsible for", "drive the", "own the", "manage the team", "take charge"]),
    ("high autonomy / ownership",          +10, ["autonomous", "self-starter", "work independently", "minimal supervision", "own initiative"]),
    ("adaptability / flexibility demand",   +8, ["flexible", "adaptable", "fast-paced", "changing priorities", "wear many hats"]),
    ("cross-functional / many stakeholders",+8, ["cross-functional", "multiple stakeholders", "various teams", "coordinate with", "liaise"]),
    ("customer-facing emotional labor",     +6, ["customer-facing", "client-facing", "handle complaints", "difficult customers"]),
    ("technical complexity",                +6, ["python", "sql", "machine learning", "api", "data pipeline", "advanced", "complex systems"]),
    ("on-site only / no remote",            +5, ["on-site only", "no remote", "in-office", "fully on-site", "presence required"]),
    ("travel / mobility",                   +5, ["travel required", "mobility", "frequent travel", "willing to travel"]),
    ("pressure / deadlines",                +5, ["tight deadlines", "high pressure", "fast deadlines", "demanding schedule"]),
    ("broad / unclear scope",               +5, ["other duties", "as needed", "various tasks", "wide range of responsibilities"]),
    ("multitasking / many hats",            +5, ["multitask", "juggle", "multiple roles"]),
    # Entries below carry negative weights: they lower the score.
    ("training / support provided",         -8, ["training provided", "mentorship", "onboarding", "support and training", "we will train"]),
    ("salary clearly specified",            -6, ["salary:", "compensation:", "annual salary", "monthly salary"]),
    ("clear role structure",                -5, ["responsibilities include", "your missions", "main tasks", "key responsibilities"]),
    ("benefits clearly mentioned",          -4, ["health insurance", "paid leave", "meal vouchers", "transport", "benefits include", "profit-sharing"]),
]

# =========================================================
# DATA LOADING
# =========================================================

def load_dataset():
    """Read the labeled dataset from ``DATA_FILE`` into a DataFrame.

    Loading is best-effort: when the file is missing or cannot be parsed the
    function returns an empty DataFrame so the app can still start. The reason
    is logged so an empty dashboard can be diagnosed.

    Returns:
        The workbook contents, or an empty ``pd.DataFrame`` on any failure.
    """
    import logging  # local import keeps the file's top-level import block untouched

    log = logging.getLogger(__name__)

    if not DATA_FILE.exists():
        log.warning("Dataset file not found: %s", DATA_FILE)
        return pd.DataFrame()
    try:
        return pd.read_excel(DATA_FILE)
    except Exception:
        # Deliberately broad: any read/parse failure (corrupt workbook, missing
        # Excel engine, permissions) degrades to "no dataset", not a crash.
        log.exception("Could not read dataset file %s", DATA_FILE)
        return pd.DataFrame()

# Loaded once at import time and shared by the chart builders and KPI cards;
# an empty DataFrame when the file is missing or unreadable.
DF = load_dataset()


def extract_flag_labels(red_flags_cell):
    """Parse a ``"label (+N), label (-N)"`` cell into ``(label, weight)`` pairs.

    The cell is split on commas that are followed by a letter. Fragments that
    do not start with ``<label> (<signed int>)`` are skipped. Anything that is
    not a string yields an empty list.
    """
    if not isinstance(red_flags_cell, str):
        return []
    fragments = re.split(r",\s*(?=[a-zA-Z])", red_flags_cell)
    hits = (re.match(r"(.+?)\s*\(([+-]\d+)\)", fragment.strip()) for fragment in fragments)
    return [(hit.group(1).strip(), int(hit.group(2))) for hit in hits if hit]


# =========================================================
# CORE: ANALYZE
# =========================================================

def classify_risk(score):
    """Map a numeric score to a ``(risk level, emoji)`` pair.

    Scores below 12 are "Low", scores below 25 are "Medium", and everything
    else is "High".
    """
    tiers = (
        (12, "Low", "🟢"),
        (25, "Medium", "🟡"),
    )
    for upper_bound, level, icon in tiers:
        if score < upper_bound:
            return level, icon
    return "High", "🔴"


def analyze_job(text):
    """Score a job description against the ``RED_FLAGS`` taxonomy.

    Each taxonomy entry contributes its weight at most once, when any of its
    patterns is found in the lower-cased text.

    Args:
        text: The pasted job posting.

    Returns:
        A 4-tuple ``(markdown, score, risk, figure)``: a markdown summary, the
        summed weight, the risk label from ``classify_risk`` and a Plotly
        figure. Input that is empty or shorter than 30 characters after
        stripping yields a prompt message, score ``0``, risk ``"—"`` and a
        placeholder chart.
    """
    if not text or len(text.strip()) < 30:
        return (
            "⚠️ Please paste a real job description (at least 30 characters).",
            0,
            "—",
            _empty_chart("Paste a job description above"),
        )

    lower = text.lower()

    def mentions(phrase):
        # Anchor the START of the phrase at a word boundary so short patterns
        # do not fire inside unrelated words ("api" in "rapid", "own the" in
        # "down the"). The end is left open on purpose so inflected forms
        # still match ("multitask" -> "multitasking").
        return re.search(r"(?<!\w)" + re.escape(phrase), lower) is not None

    detected = [
        (label, weight)
        for label, weight, patterns in RED_FLAGS
        if any(mentions(p) for p in patterns)
    ]
    score = sum(weight for _, weight in detected)

    risk, emoji = classify_risk(score)
    parts = [f"## {emoji} Risk: **{risk}** | Score: **{score}**\n\n"]
    if not detected:
        parts.append("_No clear red or positive signals detected._")
    else:
        bad = [(label, weight) for label, weight in detected if weight > 0]
        good = [(label, weight) for label, weight in detected if weight < 0]
        if bad:
            parts.append("### 🚩 Red flags detected\n")
            parts.extend(f"- **{label}** `(+{weight})`\n" for label, weight in bad)
        if good:
            parts.append("\n### ✅ Positive signals detected\n")
            # Negative weights already carry their own "-" sign.
            parts.extend(f"- **{label}** `({weight})`\n" for label, weight in good)
    md = "".join(parts)

    if detected:
        cdf = pd.DataFrame(detected, columns=["Signal", "Weight"])
        cdf["Type"] = cdf["Weight"].apply(lambda w: "Red flag" if w > 0 else "Positive")
        fig = px.bar(cdf, x="Weight", y="Signal", color="Type", orientation="h",
                     color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"},
                     title="Signal breakdown")
        fig.update_layout(**_styled_layout(height=420))
    else:
        fig = _empty_chart("No signals to chart")

    return md, score, risk, fig


# =========================================================
# CHARTS
# =========================================================

def _styled_layout(**kwargs):
    defaults = dict(
        template="plotly_white",
        paper_bgcolor="#fdfaf3",
        plot_bgcolor="#fdfaf3",
        font=dict(family="system-ui, sans-serif", color="#1a2238", size=12),
        margin=dict(l=60, r=20, t=70, b=70),
    )
    defaults.update(kwargs)
    return defaults


def _empty_chart(title):
    """Return a trace-less figure titled *title* with a centred "(no data)" note."""
    placeholder_note = dict(
        text="(no data)",
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=14, color="#8a9099"),
    )
    layout_settings = dict(
        title=title,
        height=420,
        template="plotly_white",
        paper_bgcolor="#fdfaf3",
        plot_bgcolor="#fdfaf3",
        annotations=[placeholder_note],
    )
    blank = go.Figure()
    blank.update_layout(**layout_settings)
    return blank


def build_flag_frequency_chart():
    """Build a horizontal bar chart of the 12 most frequent signal labels in ``DF``.

    Labels are parsed from the "Red Flags" column with ``extract_flag_labels``.
    A placeholder chart is returned when ``DF`` is empty or lacks that column.
    """
    if DF.empty or "Red Flags" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    labels = [
        name
        for cell in DF["Red Flags"].dropna()
        for name, _weight in extract_flag_labels(str(cell))
    ]
    top = pd.Series(labels).value_counts().head(12)

    # Reversed so the most frequent label is drawn at the top of the chart.
    bars = go.Bar(
        y=top.index[::-1],
        x=top.values[::-1],
        orientation="h",
        marker=dict(color="#e85a4f"),
    )
    chart = go.Figure(bars)
    layout = _styled_layout(height=460, title="Most Common Signals Across Analyzed Jobs")
    chart.update_layout(**layout)
    return chart


def build_risk_distribution_chart():
    """Build a donut chart of how many rows of ``DF`` fall in each "Risk Level".

    A placeholder chart is returned when ``DF`` is empty or lacks that column.
    """
    if DF.empty or "Risk Level" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    palette = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"}
    tally = DF["Risk Level"].value_counts()
    # Levels outside the palette fall back to grey.
    slice_colors = [palette.get(level, "#888") for level in tally.index]

    donut = go.Pie(
        labels=tally.index,
        values=tally.values,
        marker=dict(colors=slice_colors),
        hole=0.4,
    )
    chart = go.Figure(donut)
    chart.update_layout(**_styled_layout(height=400, title="Risk Level Distribution"))
    return chart


def build_score_distribution_chart():
    """Build a histogram of the non-null values in the "Score" column of ``DF``.

    A placeholder chart is returned when ``DF`` is empty or lacks that column.
    """
    if DF.empty or "Score" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    known_scores = DF["Score"].dropna()
    histogram = go.Histogram(x=known_scores, nbinsx=15, marker_color="#e85a4f")
    chart = go.Figure(histogram)
    layout = _styled_layout(height=380, title="Risk Score Distribution")
    chart.update_layout(**layout)
    return chart


# =========================================================
# KPI CARDS
# =========================================================

def render_kpi_cards():
    """Return the dashboard KPI cards as a single HTML string.

    Reads the module-level ``DF`` and reports four figures: the row count, the
    mean of the "Score" column, the share of rows whose "Risk Level" is
    "High", and the most frequent label parsed from "Red Flags". Missing
    columns are tolerated. When ``DF`` is empty, a single "No dataset loaded."
    panel is returned instead.
    """
    if DF.empty:
        return '<div style="background:#fdfaf3;padding:32px;text-align:center;border-radius:12px;border:1px solid #d9cfb9;color:#4a5475;">No dataset loaded.</div>'

    # DF is non-empty here, so total_jobs is > 0 and safe to divide by.
    total_jobs = len(DF)

    # Coerce before averaging: text in the column would otherwise make
    # .mean() raise, and an all-blank column would render as "nan".
    if "Score" in DF.columns:
        scores = pd.to_numeric(DF["Score"], errors="coerce").dropna()
    else:
        scores = pd.Series(dtype="float64")
    avg_text = str(round(float(scores.mean()), 1)) if not scores.empty else "—"

    if "Risk Level" in DF.columns:
        risk_counts = DF["Risk Level"].value_counts()
    else:
        # Explicit dtype: a bare pd.Series() warns on older pandas versions.
        risk_counts = pd.Series(dtype="int64")
    # Plain ints keep the rendered text free of numpy formatting ("43%", not "43.0%").
    high_count = int(risk_counts.get("High", 0))
    high_pct = high_count / total_jobs * 100

    all_flags = []
    if "Red Flags" in DF.columns:
        for cell in DF["Red Flags"].dropna():
            all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))
    top_flag = pd.Series(all_flags).value_counts().index[0] if all_flags else "—"

    def card(label, value, sub, color):
        # One KPI tile: coloured caption, large value, small sub-caption.
        return (
            '<div style="background:#fdfaf3;border:1px solid #d9cfb9;border-radius:12px;'
            'padding:20px 22px;box-shadow:0 2px 8px rgba(26,34,56,0.04);">'
            '<div style="font-family:monospace;color:' + color + ';font-size:11px;font-weight:600;'
            'text-transform:uppercase;letter-spacing:0.08em;margin-bottom:14px;">' + label + '</div>'
            '<div style="color:#1a2238;font-size:34px;font-weight:700;line-height:1;'
            'letter-spacing:-0.03em;margin-bottom:10px;">' + str(value) + '</div>'
            '<div style="font-family:monospace;font-size:11px;color:#4a5475;">' + sub + '</div>'
            '</div>'
        )

    cards = [
        card("Total.Jobs", total_jobs, "real labeled postings", "#e85a4f"),
        card("Avg.Score", avg_text, "weighted across dataset", "#2a9d8f"),
        card("High.Risk %", str(round(high_pct)) + "%", str(high_count) + " postings flagged", "#c53030"),
        # Headline shows the first word of the top label; the full label goes in the sub-caption.
        card("Top.Signal", top_flag.split(' ')[0].title() if top_flag != "—" else "—",
             top_flag if top_flag != "—" else "no data", "#7d4e8a"),
    ]
    return ('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));'
            'gap:12px;margin-bottom:24px;">' + "".join(cards) + '</div>')


# =========================================================
# CHAT (n8n -> keyword fallback)
# =========================================================

def keyword_fallback(msg):
    """Answer a chat question with canned text chosen by keyword.

    Rules are tried in order and the first rule with any keyword occurring as
    a substring of the lower-cased message wins. Returns ``(reply, chart_key)``
    where ``chart_key`` is ``"flag_frequency"``, ``"risk_distribution"``,
    ``"score_distribution"`` or ``"none"``.
    """
    rules = (
        (
            ("common", "frequent", "most", "top"),
            "The most common signals in our dataset are 'high responsibility early', "
            "'technical complexity', and 'clear role structure'. These appear in over 60% of postings.",
            "flag_frequency",
        ),
        (
            ("risk", "distribution", "level"),
            "Most jobs land in the Medium risk tier (scores 12-24). High-risk postings combine "
            "multiple red flags like vague scope, on-site-only, and missing salary information.",
            "risk_distribution",
        ),
        (
            ("score", "histogram", "spread"),
            "Risk scores cluster between 10-25 in our dataset. Anything above 25 signals "
            "a problematic posting.",
            "score_distribution",
        ),
        (
            ("how", "work", "explain", "method"),
            "The analyzer scans for 15 weighted signal categories. Red flags add to the score, "
            "positive signals subtract. The total maps to Low/Medium/High risk.",
            "none",
        ),
    )

    lowered = msg.lower()
    for keywords, answer, chart_key in rules:
        for keyword in keywords:
            if keyword in lowered:
                return answer, chart_key
    # No rule matched: suggest the questions that are understood.
    return "Try asking: most common red flags, risk distribution, score spread, or how it works.", "none"


def call_n8n(msg):
    """Ask the n8n webhook to answer *msg*, falling back to local keyword logic.

    The question is POSTed as ``{"question": msg}`` to ``N8N_WEBHOOK_URL``. The
    response is expected to be a JSON object with an ``answer`` and an optional
    ``chart`` key.

    Returns:
        ``(reply, chart_key)``, both strings. On any failure (missing
        ``requests`` package, network error, non-2xx status, invalid or
        non-object JSON) the reply comes from ``keyword_fallback``, prefixed
        with a note that n8n was unavailable.
    """
    try:
        # Imported inside the try so a missing package degrades to the
        # fallback instead of crashing the chat handler.
        import requests

        r = requests.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=15)
        # Without this, a JSON error body from a 4xx/5xx response would be
        # treated as a normal answer.
        r.raise_for_status()
        data = r.json()
        if not isinstance(data, dict):
            raise ValueError("unexpected n8n payload type: " + type(data).__name__)
    except Exception:
        # Broad on purpose: every failure mode takes the same local fallback.
        fb_text, fb_chart = keyword_fallback(msg)
        return "(n8n unavailable, using local logic)\n\n" + fb_text, fb_chart

    answer = data.get("answer")
    if answer is None or answer == "":
        reply = "n8n returned no answer."
    else:
        reply = str(answer)

    # The chart key is later used as a dict key; a non-string (e.g. a list)
    # would raise there, so anything unexpected becomes "none".
    chart = data.get("chart", "none")
    if not isinstance(chart, str):
        chart = "none"
    return reply, chart


def ai_chat(user_msg, history):
    """Handle one chat turn and return ``(history, textbox value, chart)``.

    Blank messages leave the history unchanged. Otherwise the reply comes from
    ``call_n8n`` when ``N8N_WEBHOOK_URL`` is set and from ``keyword_fallback``
    when it is not. The returned chart key selects one of the dataset chart
    builders; any other key yields ``None`` for the chart.
    """
    past_turns = history or []
    if not user_msg or not user_msg.strip():
        return past_turns, "", None

    responder = call_n8n if N8N_WEBHOOK_URL else keyword_fallback
    reply, chart_key = responder(user_msg)

    builders = {
        "flag_frequency": build_flag_frequency_chart,
        "risk_distribution": build_risk_distribution_chart,
        "score_distribution": build_score_distribution_chart,
    }
    builder = builders.get(chart_key)
    chart_out = builder() if builder else None

    # The empty string clears the input textbox after the turn.
    return past_turns + [(user_msg, reply)], "", chart_out


# =========================================================
# CSS LOADER
# =========================================================

def load_css():
    """Return the text of ``style.css`` next to this script, or ``""`` if it does not exist."""
    stylesheet = BASE_DIR / "style.css"
    return stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""


# =========================================================
# UI
# =========================================================

# Stylesheet text passed to gr.Blocks; empty string when style.css is absent.
CSS = load_css()

# The whole interface: four tabs (analyzer, dataset dashboard, chat, about).
with gr.Blocks(title="Job Risk Analyzer", css=CSS) as demo:

    gr.Markdown(
        "# Job Risk Analyzer\n"
        "Detect hidden risk patterns in job postings using a weighted signal taxonomy "
        "calibrated on 47 real labeled descriptions.",
        elem_id="escp_title",
    )

    # Tab 1: paste a posting, get markdown summary + score + risk label + chart.
    with gr.Tab("🔍 Analyze a Job"):
        gr.Markdown("Paste any job description below to detect red flags and estimate risk.")
        with gr.Row():
            with gr.Column():
                inp = gr.Textbox(label="Job description", lines=15,
                                 placeholder="Paste the full job posting here...")
                btn = gr.Button("Analyze", variant="primary")
            with gr.Column():
                out_md = gr.Markdown()
                with gr.Row():
                    out_score = gr.Number(label="Score", precision=0)
                    out_risk = gr.Textbox(label="Risk Level")
                out_chart = gr.Plot(label="Signal breakdown")
        # Output order must match analyze_job's return tuple: (md, score, risk, fig).
        btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart])

    # Tab 2: static dashboard. The KPI HTML and the three charts are computed
    # here, once, while the UI is being built.
    with gr.Tab("📊 Dataset Dashboard"):
        gr.HTML(value=render_kpi_cards())
        gr.Markdown("### Insights from labeled job postings")
        gr.Plot(value=build_flag_frequency_chart(), label="Most common signals")
        with gr.Row():
            gr.Plot(value=build_risk_distribution_chart(), label="Risk distribution")
            gr.Plot(value=build_score_distribution_chart(), label="Score distribution")
        if not DF.empty:
            # Show only the columns that actually exist in the loaded workbook.
            display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns]
            if display_cols:
                gr.Markdown("### Raw labeled dataset")
                gr.Dataframe(DF[display_cols], wrap=True, interactive=False)

    # Tab 3: chat about the dataset; replies come from n8n or the keyword fallback.
    with gr.Tab('"AI" Dashboard'):
        status = ("Connected to **n8n workflow**." if N8N_WEBHOOK_URL
                  else "Using **keyword matching** (set `N8N_WEBHOOK_URL` to upgrade).")
        gr.Markdown("### Ask questions, get visualizations\n\n" + status)

        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(label="Conversation", height=380)
                user_input = gr.Textbox(label="Ask about the dataset",
                                        placeholder="e.g. What are the most common red flags?")
                gr.Examples(
                    examples=[
                        "What are the most common red flags?",
                        "Show me the risk level distribution",
                        "How is the score spread across jobs?",
                        "How does the analyzer work?",
                    ],
                    inputs=user_input,
                )
            with gr.Column():
                ai_chart = gr.Plot(label="Visualization")

        # Output order must match ai_chat's return tuple: (history, textbox value, chart).
        user_input.submit(ai_chat, inputs=[user_input, chatbot],
                          outputs=[chatbot, user_input, ai_chart])

    # Tab 4: static description; the thresholds listed mirror classify_risk.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
### How it works

This app uses a **weighted red-flag taxonomy** built from 47 real labeled job postings.
Each detected signal contributes to a total score that maps to Low / Medium / High risk.

- 🟢 **Low** (< 12): Healthy posting with clear structure and benefits
- 🟡 **Medium** (12–24): Some warning signs worth investigating
- 🔴 **High** (≥ 25): Multiple concerning patterns

### Team — CS1 Group 14

- **Gaspard** — UX Designer + Content Specialist (HF Space, Gradio app, n8n workflow, testing)
- **Person 3** — Data Analyst (extraction, analysis, charts)
- **Person 4** — Project Manager (final report, coordination)

### Iterations

- **v1** — Keyword matching with hard-coded weights from labeled dataset
- **v2** — Refined keyword patterns after user testing
- **v3** — Integrated n8n workflow for smarter conversational responses
        """)

# Launch only when executed as a script, so importing this module (for tests
# or reuse of the analyzer functions) does not start a web server.
if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)