CS1_Group_14 / app.py
grasepard2's picture
Update app.py
b466904 verified
"""
CS1 Group 14 — Job Description Risk Analyzer
Built for Gradio 4.44 / Hugging Face Spaces
"""
import os
import re
import json
from pathlib import Path
from typing import Dict, List, Tuple
import pandas as pd
import gradio as gr
import plotly.graph_objects as go
import plotly.express as px
# =========================================================
# CONFIG
# =========================================================
# Directory containing this file; the data and CSS paths are resolved against it.
BASE_DIR = Path(__file__).resolve().parent
# Excel workbook read by load_dataset().
DATA_FILE = BASE_DIR / "job_description_data.xlsx"
# Optional n8n webhook endpoint taken from the environment. An empty string
# makes the chat tab answer with keyword_fallback() instead of calling n8n.
N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
# =========================================================
# RED FLAG TAXONOMY
# =========================================================
# Each entry is (label, weight, patterns):
#   label    - human-readable signal name shown in the report and chart
#   weight   - added to the score once when the signal is detected;
#              positive weights are red flags, negative weights are positive signals
#   patterns - lowercase phrases; analyze_job() treats the signal as detected
#              when any of them occurs as a substring of the lowercased text
RED_FLAGS = [
    ("high responsibility early", +10, ["full ownership", "lead the", "responsible for", "drive the", "own the", "manage the team", "take charge"]),
    ("high autonomy / ownership", +10, ["autonomous", "self-starter", "work independently", "minimal supervision", "own initiative"]),
    ("adaptability / flexibility demand", +8, ["flexible", "adaptable", "fast-paced", "changing priorities", "wear many hats"]),
    ("cross-functional / many stakeholders",+8, ["cross-functional", "multiple stakeholders", "various teams", "coordinate with", "liaise"]),
    ("customer-facing emotional labor", +6, ["customer-facing", "client-facing", "handle complaints", "difficult customers"]),
    ("technical complexity", +6, ["python", "sql", "machine learning", "api", "data pipeline", "advanced", "complex systems"]),
    ("on-site only / no remote", +5, ["on-site only", "no remote", "in-office", "fully on-site", "presence required"]),
    ("travel / mobility", +5, ["travel required", "mobility", "frequent travel", "willing to travel"]),
    ("pressure / deadlines", +5, ["tight deadlines", "high pressure", "fast deadlines", "demanding schedule"]),
    ("broad / unclear scope", +5, ["other duties", "as needed", "various tasks", "wide range of responsibilities"]),
    ("multitasking / many hats", +5, ["multitask", "juggle", "multiple roles"]),
    # Entries below carry negative weights: they lower the score when present.
    ("training / support provided", -8, ["training provided", "mentorship", "onboarding", "support and training", "we will train"]),
    ("salary clearly specified", -6, ["salary:", "compensation:", "annual salary", "monthly salary"]),
    ("clear role structure", -5, ["responsibilities include", "your missions", "main tasks", "key responsibilities"]),
    ("benefits clearly mentioned", -4, ["health insurance", "paid leave", "meal vouchers", "transport", "benefits include", "profit-sharing"]),
]
# =========================================================
# DATA LOADING
# =========================================================
def load_dataset():
    """Load the dataset workbook into a DataFrame.

    Returns:
        The contents of ``DATA_FILE`` as a ``pandas.DataFrame``. When the file
        is missing or cannot be read, an empty DataFrame is returned so the
        app can still start; the reason is written to the log.
    """
    import logging  # local import: keeps this block independent of the file's import list

    log = logging.getLogger(__name__)
    if not DATA_FILE.exists():
        log.warning("Dataset file not found: %s", DATA_FILE)
        return pd.DataFrame()
    try:
        return pd.read_excel(DATA_FILE)
    except Exception:
        # Deliberate best-effort boundary: any read failure (corrupt file,
        # missing Excel engine, ...) degrades to "no dataset", but is logged
        # with its traceback instead of being hidden.
        log.exception("Could not read dataset file %s", DATA_FILE)
        return pd.DataFrame()
# Loaded once at import time; the chart builders and render_kpi_cards() read
# this module-level frame (it is empty when the workbook could not be loaded).
DF = load_dataset()
def extract_flag_labels(red_flags_cell):
    """Parse one "Red Flags" cell into a list of ``(label, weight)`` tuples.

    The cell is read as comma-separated entries, each shaped like
    ``"some label (+10)"`` or ``"some label (-8)"``. Entries without a signed
    integer in parentheses are dropped. Anything that is not a string
    yields an empty list.
    """
    if not isinstance(red_flags_cell, str):
        return []
    # Only split on commas that are followed by a letter, i.e. commas that
    # start a new entry.
    entries = re.split(r",\s*(?=[a-zA-Z])", red_flags_cell)
    hits = (re.match(r"(.+?)\s*\(([+-]\d+)\)", entry.strip()) for entry in entries)
    return [(hit.group(1).strip(), int(hit.group(2))) for hit in hits if hit]
# =========================================================
# CORE: ANALYZE
# =========================================================
def classify_risk(score):
    """Map a numeric risk score to a ``(level, emoji)`` pair.

    Scores below 12 are "Low", scores from 12 up to (but not including) 25
    are "Medium", and scores of 25 or more are "High".

    The emoji literals were mis-encoded (UTF-8 bytes decoded with a Greek
    code page); they are restored to the intended green / yellow / red circles.
    """
    if score < 12:
        return "Low", "🟢"
    if score < 25:
        return "Medium", "🟡"
    return "High", "🔴"
def analyze_job(text):
    """Score a job description against the ``RED_FLAGS`` taxonomy.

    Args:
        text: Raw job-posting text entered by the user.

    Returns:
        A 4-tuple ``(markdown_report, score, risk_level, figure)``:
        the Markdown summary, the summed weight of all detected signals,
        the level name from ``classify_risk``, and a Plotly figure (a bar
        chart of detected signals, or a placeholder when there are none).
        Input shorter than 30 non-blank characters yields a warning message,
        score 0, a dash as the level and a placeholder chart.

    The warning, dash, flag and check-mark characters were mis-encoded in the
    string literals and are restored here.
    """
    if not text or len(text.strip()) < 30:
        return (
            "⚠️ Please paste a real job description (at least 30 characters).",
            0,
            "—",
            _empty_chart("Paste a job description above"),
        )

    lower = text.lower()
    # NOTE(review): patterns are matched as plain substrings, so short ones
    # such as "api" also match inside longer words (e.g. "rapid"). Left
    # unchanged here because tightening it would shift existing scores.
    detected = [
        (label, weight)
        for label, weight, patterns in RED_FLAGS
        if any(p in lower for p in patterns)
    ]
    score = sum(weight for _label, weight in detected)
    risk, emoji = classify_risk(score)
    md = f"## {emoji} Risk: **{risk}** | Score: **{score}**\n\n"

    if not detected:
        md += "_No clear red or positive signals detected._"
        return md, score, risk, _empty_chart("No signals to chart")

    bad = [(label, weight) for label, weight in detected if weight > 0]
    good = [(label, weight) for label, weight in detected if weight < 0]
    if bad:
        md += "### 🚩 Red flags detected\n"
        md += "".join(f"- **{label}** `(+{weight})`\n" for label, weight in bad)
    if good:
        md += "\n### ✅ Positive signals detected\n"
        md += "".join(f"- **{label}** `({weight})`\n" for label, weight in good)

    # One horizontal bar per detected signal, coloured by its sign.
    cdf = pd.DataFrame(detected, columns=["Signal", "Weight"])
    cdf["Type"] = cdf["Weight"].apply(lambda w: "Red flag" if w > 0 else "Positive")
    fig = px.bar(
        cdf, x="Weight", y="Signal", color="Type", orientation="h",
        color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"},
        title="Signal breakdown",
    )
    fig.update_layout(**_styled_layout(height=420))
    return md, score, risk, fig
# =========================================================
# CHARTS
# =========================================================
def _styled_layout(**kwargs):
defaults = dict(
template="plotly_white",
paper_bgcolor="#fdfaf3",
plot_bgcolor="#fdfaf3",
font=dict(family="system-ui, sans-serif", color="#1a2238", size=12),
margin=dict(l=60, r=20, t=70, b=70),
)
defaults.update(kwargs)
return defaults
def _empty_chart(title):
    """Build a blank placeholder figure titled ``title`` with a centred "(no data)" note."""
    # Annotation positioned in paper coordinates, i.e. the middle of the figure.
    no_data_note = dict(
        text="(no data)",
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=14, color="#8a9099"),
    )
    placeholder = go.Figure()
    placeholder.update_layout(
        title=title,
        height=420,
        template="plotly_white",
        paper_bgcolor="#fdfaf3",
        plot_bgcolor="#fdfaf3",
        annotations=[no_data_note],
    )
    return placeholder
def build_flag_frequency_chart():
    """Build a horizontal bar chart of the 12 most frequent signals in the dataset.

    Returns:
        A Plotly figure. A placeholder chart is returned when the dataset is
        empty, has no "Red Flags" column, or when no cell in that column
        contains a parseable ``label (+N)`` entry.
    """
    if DF.empty or "Red Flags" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    all_flags = []
    for cell in DF["Red Flags"].dropna():
        all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))
    if not all_flags:
        # Without this guard the figure would be a set of blank axes with no
        # explanation (and pandas would be asked to build a dtype-less empty Series).
        return _empty_chart("No signals found in dataset")

    counts = pd.Series(all_flags).value_counts().head(12)
    # Reverse so the most frequent signal ends up at the top of the horizontal chart.
    fig = go.Figure(go.Bar(
        y=counts.index[::-1], x=counts.values[::-1], orientation="h",
        marker=dict(color="#e85a4f"),
    ))
    fig.update_layout(**_styled_layout(height=460, title="Most Common Signals Across Analyzed Jobs"))
    return fig
def build_risk_distribution_chart():
    """Build a donut chart of how many dataset rows fall into each "Risk Level".

    Returns a placeholder chart when the dataset is empty or lacks the column.
    """
    if DF.empty or "Risk Level" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    palette = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"}
    level_counts = DF["Risk Level"].value_counts()
    levels = level_counts.index
    # Levels outside the palette fall back to a neutral grey.
    slice_colors = [palette.get(level, "#888") for level in levels]
    donut = go.Pie(
        labels=levels,
        values=level_counts.values,
        marker=dict(colors=slice_colors),
        hole=0.4,
    )
    fig = go.Figure(donut)
    layout = _styled_layout(height=400, title="Risk Level Distribution")
    fig.update_layout(**layout)
    return fig
def build_score_distribution_chart():
    """Build a histogram of the dataset's non-null "Score" values.

    Returns a placeholder chart when the dataset is empty or lacks the column.
    """
    if DF.empty or "Score" not in DF.columns:
        return _empty_chart("Dataset not loaded")

    known_scores = DF["Score"].dropna()
    histogram = go.Histogram(x=known_scores, nbinsx=15, marker_color="#e85a4f")
    fig = go.Figure(histogram)
    layout = _styled_layout(height=380, title="Risk Score Distribution")
    fig.update_layout(**layout)
    return fig
# =========================================================
# KPI CARDS
# =========================================================
def render_kpi_cards():
    """Render the dashboard KPI summary as an HTML grid of four cards.

    The cards show: number of rows in the dataset, mean of the "Score"
    column, share of rows whose "Risk Level" is "High", and the most frequent
    label parsed from the "Red Flags" column. Missing columns degrade to
    zero / a dash rather than raising.

    Returns:
        An HTML string; a single "No dataset loaded." box when ``DF`` is empty.
    """
    import html  # local import: keeps this block independent of the file's import list

    dash = "—"  # placeholder for "no value" (was mis-encoded in the original literal)
    if DF.empty:
        return '<div style="background:#fdfaf3;padding:32px;text-align:center;border-radius:12px;border:1px solid #d9cfb9;color:#4a5475;">No dataset loaded.</div>'

    total_jobs = len(DF)

    if "Score" in DF.columns:
        # Coerce so stray text cells become NaN instead of breaking mean().
        avg_score = pd.to_numeric(DF["Score"], errors="coerce").mean()
    else:
        avg_score = 0
    # An all-empty Score column gives NaN; show a dash rather than "nan".
    avg_text = dash if pd.isna(avg_score) else str(round(avg_score, 1))

    if "Risk Level" in DF.columns:
        high_count = int((DF["Risk Level"] == "High").sum())
    else:
        high_count = 0
    high_pct = (high_count / total_jobs * 100) if total_jobs else 0

    all_flags = []
    if "Red Flags" in DF.columns:
        for cell in DF["Red Flags"].dropna():
            all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))
    top_flag = pd.Series(all_flags).value_counts().index[0] if all_flags else dash

    def card(label, value, sub, color):
        """Return the HTML for one KPI card; text fields are HTML-escaped."""
        # label/value/sub may carry text that originates from the spreadsheet,
        # so escape them before embedding in markup. `color` is a constant
        # supplied by this function.
        return (
            '<div style="background:#fdfaf3;border:1px solid #d9cfb9;border-radius:12px;'
            'padding:20px 22px;box-shadow:0 2px 8px rgba(26,34,56,0.04);">'
            '<div style="font-family:monospace;color:' + color + ';font-size:11px;font-weight:600;'
            'text-transform:uppercase;letter-spacing:0.08em;margin-bottom:14px;">' + html.escape(str(label)) + '</div>'
            '<div style="color:#1a2238;font-size:34px;font-weight:700;line-height:1;'
            'letter-spacing:-0.03em;margin-bottom:10px;">' + html.escape(str(value)) + '</div>'
            '<div style="font-family:monospace;font-size:11px;color:#4a5475;">' + html.escape(str(sub)) + '</div>'
            '</div>'
        )

    cards = [
        card("Total.Jobs", total_jobs, "real labeled postings", "#e85a4f"),
        card("Avg.Score", avg_text, "weighted across dataset", "#2a9d8f"),
        card("High.Risk %", str(round(high_pct)) + "%", str(high_count) + " postings flagged", "#c53030"),
        # Headline is the first word of the top label; the full label goes in the subtitle.
        card("Top.Signal", top_flag.split(' ')[0].title() if top_flag != dash else dash,
             top_flag if top_flag != dash else "no data", "#7d4e8a"),
    ]
    return ('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));'
            'gap:12px;margin-bottom:24px;">' + "".join(cards) + '</div>')
# =========================================================
# CHAT (n8n -> keyword fallback)
# =========================================================
def keyword_fallback(msg):
    """Answer a chat question locally by keyword lookup.

    The lowercased message is checked against each rule in order; the first
    rule with any keyword occurring as a substring wins.

    Returns:
        ``(reply_text, chart_key)`` where ``chart_key`` is one of
        "flag_frequency", "risk_distribution", "score_distribution" or "none".
    """
    # (keywords, reply, chart key) - order matters, earlier rules take priority.
    rules = (
        (
            ("common", "frequent", "most", "top"),
            "The most common signals in our dataset are 'high responsibility early', "
            "'technical complexity', and 'clear role structure'. These appear in over 60% of postings.",
            "flag_frequency",
        ),
        (
            ("risk", "distribution", "level"),
            "Most jobs land in the Medium risk tier (scores 12-24). High-risk postings combine "
            "multiple red flags like vague scope, on-site-only, and missing salary information.",
            "risk_distribution",
        ),
        (
            ("score", "histogram", "spread"),
            "Risk scores cluster between 10-25 in our dataset. Anything above 25 signals "
            "a problematic posting.",
            "score_distribution",
        ),
        (
            ("how", "work", "explain", "method"),
            "The analyzer scans for 15 weighted signal categories. Red flags add to the score, "
            "positive signals subtract. The total maps to Low/Medium/High risk.",
            "none",
        ),
    )
    question = msg.lower()
    for keywords, reply, chart_key in rules:
        for keyword in keywords:
            if keyword in question:
                return reply, chart_key
    # Nothing matched: suggest the supported questions.
    return "Try asking: most common red flags, risk distribution, score spread, or how it works.", "none"
def call_n8n(msg):
    """Ask the n8n webhook to answer ``msg``, falling back to local keyword logic.

    Args:
        msg: The user's question; sent as ``{"question": msg}`` in a JSON POST
            to ``N8N_WEBHOOK_URL`` with a 15 second timeout.

    Returns:
        ``(reply_text, chart_key)``. On a transport error, a non-2xx status,
        a non-JSON body or a JSON body that is not an object, the result of
        ``keyword_fallback(msg)`` is returned with an "(n8n unavailable ...)"
        prefix on the text.
    """
    import requests

    try:
        response = requests.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=15)
        # Treat HTTP error statuses as failures instead of parsing their body.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # RequestException: connection/timeout/HTTP errors; ValueError: body is not JSON.
        data = None

    # Some n8n webhook response modes return a JSON array of items rather
    # than a single object; use the first item in that case.
    if isinstance(data, list) and data:
        data = data[0]

    if not isinstance(data, dict):
        fb_text, fb_chart = keyword_fallback(msg)
        return "(n8n unavailable, using local logic)\n\n" + fb_text, fb_chart

    chart_key = data.get("chart", "none")
    if not isinstance(chart_key, str):
        # The caller uses this as a dict key; anything else means "no chart".
        chart_key = "none"
    return data.get("answer", "n8n returned no answer."), chart_key
def ai_chat(user_msg, history):
    """Handle one chat submission.

    Args:
        user_msg: Text from the chat input box.
        history: Existing list of ``(user, bot)`` message pairs, or None.

    Returns:
        ``(updated_history, "", chart)`` - the conversation with the new
        exchange appended, an empty string to clear the input box, and the
        figure for the reply's chart key (None when there is no chart).
        A blank message leaves the history unchanged.
    """
    past_turns = history or []
    if not (user_msg and user_msg.strip()):
        return past_turns, "", None

    # Use the n8n workflow when a webhook is configured, local keywords otherwise.
    responder = call_n8n if N8N_WEBHOOK_URL else keyword_fallback
    reply, chart_key = responder(user_msg)

    chart_builders = {
        "flag_frequency": build_flag_frequency_chart,
        "risk_distribution": build_risk_distribution_chart,
        "score_distribution": build_score_distribution_chart,
    }
    build = chart_builders.get(chart_key)
    chart_out = None if build is None else build()
    return past_turns + [(user_msg, reply)], "", chart_out
# =========================================================
# CSS LOADER
# =========================================================
def load_css():
    """Return the contents of ``style.css`` located next to this file.

    Yields an empty string when the stylesheet does not exist.
    """
    stylesheet = BASE_DIR / "style.css"
    return stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""
# =========================================================
# UI
# =========================================================
# Build the Gradio interface: four tabs (analyzer, dataset dashboard, chat,
# about) inside one Blocks app, then start the server.
# The emoji and dash characters in the labels and About text were mis-encoded
# in the original literals and are restored here.
# NOTE(review): the source had no indentation, so the nesting of components
# below is reconstructed from the statement order - confirm the layout.
CSS = load_css()
with gr.Blocks(title="Job Risk Analyzer", css=CSS) as demo:
    gr.Markdown(
        "# Job Risk Analyzer\n"
        "Detect hidden risk patterns in job postings using a weighted signal taxonomy "
        "calibrated on 47 real labeled descriptions.",
        elem_id="escp_title",
    )

    # --- Tab 1: score a pasted job description -------------------------
    with gr.Tab("🔍 Analyze a Job"):
        gr.Markdown("Paste any job description below to detect red flags and estimate risk.")
        with gr.Row():
            with gr.Column():
                inp = gr.Textbox(label="Job description", lines=15,
                                 placeholder="Paste the full job posting here...")
                btn = gr.Button("Analyze", variant="primary")
            with gr.Column():
                out_md = gr.Markdown()
                with gr.Row():
                    out_score = gr.Number(label="Score", precision=0)
                    out_risk = gr.Textbox(label="Risk Level")
                out_chart = gr.Plot(label="Signal breakdown")
        # analyze_job returns (markdown, score, risk, figure) in this output order.
        btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart])

    # --- Tab 2: static charts computed once from the loaded dataset ----
    with gr.Tab("📊 Dataset Dashboard"):
        gr.HTML(value=render_kpi_cards())
        gr.Markdown("### Insights from labeled job postings")
        gr.Plot(value=build_flag_frequency_chart(), label="Most common signals")
        with gr.Row():
            gr.Plot(value=build_risk_distribution_chart(), label="Risk distribution")
            gr.Plot(value=build_score_distribution_chart(), label="Score distribution")
        if not DF.empty:
            # Show only the expected columns that the workbook actually contains.
            display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns]
            if display_cols:
                gr.Markdown("### Raw labeled dataset")
                gr.Dataframe(DF[display_cols], wrap=True, interactive=False)

    # --- Tab 3: chat that answers with text plus an optional chart -----
    with gr.Tab('"AI" Dashboard'):
        status = ("Connected to **n8n workflow**." if N8N_WEBHOOK_URL
                  else "Using **keyword matching** (set `N8N_WEBHOOK_URL` to upgrade).")
        gr.Markdown("### Ask questions, get visualizations\n\n" + status)
        with gr.Row():
            with gr.Column():
                chatbot = gr.Chatbot(label="Conversation", height=380)
                user_input = gr.Textbox(label="Ask about the dataset",
                                        placeholder="e.g. What are the most common red flags?")
                gr.Examples(
                    examples=[
                        "What are the most common red flags?",
                        "Show me the risk level distribution",
                        "How is the score spread across jobs?",
                        "How does the analyzer work?",
                    ],
                    inputs=user_input,
                )
            with gr.Column():
                ai_chart = gr.Plot(label="Visualization")
        # ai_chat returns (history, cleared input text, figure or None).
        user_input.submit(ai_chat, inputs=[user_input, chatbot],
                          outputs=[chatbot, user_input, ai_chart])

    # --- Tab 4: project description ------------------------------------
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
### How it works
This app uses a **weighted red-flag taxonomy** built from 47 real labeled job postings.
Each detected signal contributes to a total score that maps to Low / Medium / High risk.
- 🟢 **Low** (< 12): Healthy posting with clear structure and benefits
- 🟡 **Medium** (12–24): Some warning signs worth investigating
- 🔴 **High** (≥ 25): Multiple concerning patterns
### Team — CS1 Group 14
- **Gaspard** — UX Designer + Content Specialist (HF Space, Gradio app, n8n workflow, testing)
- **Person 3** — Data Analyst (extraction, analysis, charts)
- **Person 4** — Project Manager (final report, coordination)
### Iterations
- **v1** — Keyword matching with hard-coded weights from labeled dataset
- **v2** — Refined keyword patterns after user testing
- **v3** — Integrated n8n workflow for smarter conversational responses
""")

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)