Spaces:

ESCP
/

CS1_Group_14

Paused

App Files Files Community

CS1_Group_14 / app.py

grasepard2

Update app.py

b466904 verified about 1 month ago

raw

history blame contribute delete

16.7 kB

	"""
	CS1 Group 14 — Job Description Risk Analyzer
	Built for Gradio 4.44 / Hugging Face Spaces
	"""
	import html
	import json
	import logging
	import os
	import re
	from pathlib import Path
	from typing import Dict, List, Tuple

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go

	# =========================================================
	# CONFIG
	# =========================================================

	# Folder that holds this script; the data and CSS files are looked up next to it.
	BASE_DIR = Path(__file__).resolve().parents[0]
	# Excel workbook read by load_dataset().
	DATA_FILE = BASE_DIR.joinpath("job_description_data.xlsx")
	# Optional n8n webhook endpoint; an empty string means "not configured".
	N8N_WEBHOOK_URL = os.getenv("N8N_WEBHOOK_URL", "").strip()

	# =========================================================
	# RED FLAG TAXONOMY
	# =========================================================

	# Signal taxonomy as (label, weight, phrases) entries. analyze_job() adds
	# `weight` to the score once per entry when any of its phrases occurs in the
	# lower-cased posting. Positive weights raise the score (red flags);
	# negative weights lower it (reassuring signals).
	RED_FLAGS = [
	    (
	        "high responsibility early",
	        10,
	        ["full ownership", "lead the", "responsible for", "drive the",
	         "own the", "manage the team", "take charge"],
	    ),
	    (
	        "high autonomy / ownership",
	        10,
	        ["autonomous", "self-starter", "work independently",
	         "minimal supervision", "own initiative"],
	    ),
	    (
	        "adaptability / flexibility demand",
	        8,
	        ["flexible", "adaptable", "fast-paced", "changing priorities",
	         "wear many hats"],
	    ),
	    (
	        "cross-functional / many stakeholders",
	        8,
	        ["cross-functional", "multiple stakeholders", "various teams",
	         "coordinate with", "liaise"],
	    ),
	    (
	        "customer-facing emotional labor",
	        6,
	        ["customer-facing", "client-facing", "handle complaints",
	         "difficult customers"],
	    ),
	    (
	        "technical complexity",
	        6,
	        ["python", "sql", "machine learning", "api", "data pipeline",
	         "advanced", "complex systems"],
	    ),
	    (
	        "on-site only / no remote",
	        5,
	        ["on-site only", "no remote", "in-office", "fully on-site",
	         "presence required"],
	    ),
	    (
	        "travel / mobility",
	        5,
	        ["travel required", "mobility", "frequent travel",
	         "willing to travel"],
	    ),
	    (
	        "pressure / deadlines",
	        5,
	        ["tight deadlines", "high pressure", "fast deadlines",
	         "demanding schedule"],
	    ),
	    (
	        "broad / unclear scope",
	        5,
	        ["other duties", "as needed", "various tasks",
	         "wide range of responsibilities"],
	    ),
	    (
	        "multitasking / many hats",
	        5,
	        ["multitask", "juggle", "multiple roles"],
	    ),
	    (
	        "training / support provided",
	        -8,
	        ["training provided", "mentorship", "onboarding",
	         "support and training", "we will train"],
	    ),
	    (
	        "salary clearly specified",
	        -6,
	        ["salary:", "compensation:", "annual salary", "monthly salary"],
	    ),
	    (
	        "clear role structure",
	        -5,
	        ["responsibilities include", "your missions", "main tasks",
	         "key responsibilities"],
	    ),
	    (
	        "benefits clearly mentioned",
	        -4,
	        ["health insurance", "paid leave", "meal vouchers", "transport",
	         "benefits include", "profit-sharing"],
	    ),
	]

	# =========================================================
	# DATA LOADING
	# =========================================================

	def load_dataset():
	    """Load the labeled job-posting workbook into a DataFrame.

	    Returns:
	        The contents of ``DATA_FILE`` as a ``pandas.DataFrame``, or an empty
	        DataFrame when the file is missing or cannot be read. The empty
	        frame is a deliberate best-effort fallback: the chart and KPI
	        builders check ``DF.empty`` and render placeholders instead.
	    """
	    log = logging.getLogger(__name__)
	    if not DATA_FILE.exists():
	        log.warning("Dataset file not found: %s", DATA_FILE)
	        return pd.DataFrame()
	    try:
	        return pd.read_excel(DATA_FILE)
	    except Exception:
	        # Keep the fallback, but record the cause so an unreadable workbook
	        # is distinguishable from a missing one.
	        log.exception("Could not read dataset file: %s", DATA_FILE)
	        return pd.DataFrame()


	# Loaded once at import time and shared by every dashboard builder below.
	DF = load_dataset()


	def extract_flag_labels(red_flags_cell):
	    """Parse a "Red Flags" cell into ``(label, weight)`` pairs.

	    The cell is read as entries of the form ``label (+N)`` or ``label (-N)``.
	    Entries are split on a comma that is followed, after optional whitespace,
	    by an ASCII letter, so a comma inside a label that precedes a non-letter
	    does not split it. An entry without a signed integer in parentheses is
	    skipped.

	    Args:
	        red_flags_cell: Raw cell value; anything that is not a ``str``
	            yields an empty list.

	    Returns:
	        A list of ``(label, weight)`` tuples in order of appearance, with
	        ``weight`` converted to ``int``.
	    """
	    if not isinstance(red_flags_cell, str):
	        return []
	    chunks = re.split(r",\s*(?=[a-zA-Z])", red_flags_cell)
	    hits = (re.match(r"(.+?)\s*\(([+-]\d+)\)", chunk.strip()) for chunk in chunks)
	    return [(hit.group(1).strip(), int(hit.group(2))) for hit in hits if hit]


	# =========================================================
	# CORE: ANALYZE
	# =========================================================

	def classify_risk(score):
	    """Map a numeric score to a ``(risk level, emoji)`` pair.

	    Scores below 12 are "Low", scores from 12 up to but not including 25 are
	    "Medium", and everything else is "High".
	    """
	    # (exclusive upper bound, level, emoji), checked in ascending order.
	    tiers = ((12, "Low", "🟢"), (25, "Medium", "🟡"))
	    for upper_bound, level, emoji in tiers:
	        if score < upper_bound:
	            return level, emoji
	    return "High", "🔴"


	def _format_signal_report(risk, emoji, score, detected):
	    """Render the Markdown summary for one analyzed posting."""
	    # The backslash is written as an explicit "\\": a bare "\|" in a normal
	    # string literal is an invalid escape sequence and triggers a warning on
	    # recent Python versions. The emitted text is unchanged.
	    parts = [f"## {emoji} Risk: {risk} \\| Score: {score}\n\n"]
	    if not detected:
	        parts.append("_No clear red or positive signals detected._")
	        return "".join(parts)

	    red = [(label, weight) for label, weight in detected if weight > 0]
	    positive = [(label, weight) for label, weight in detected if weight < 0]
	    if red:
	        parts.append("### 🚩 Red flags detected\n")
	        parts.extend(f"- {label} `(+{weight})`\n" for label, weight in red)
	    if positive:
	        parts.append("\n### ✅ Positive signals detected\n")
	        # Negative weights already carry their own "-" sign.
	        parts.extend(f"- {label} `({weight})`\n" for label, weight in positive)
	    return "".join(parts)


	def analyze_job(text):
	    """Score a job description against the ``RED_FLAGS`` taxonomy.

	    Args:
	        text: The pasted job posting.

	    Returns:
	        A 4-tuple ``(markdown_report, score, risk_level, figure)``. When
	        ``text`` is empty or shorter than 30 characters after stripping, the
	        report is a warning message, the score is 0, the risk level is "—"
	        and the figure is a placeholder chart.
	    """
	    if not text or len(text.strip()) < 30:
	        return "⚠️ Please paste a real job description (at least 30 characters).", 0, "—", _empty_chart("Paste a job description above")

	    lower = text.lower()
	    # Each category counts at most once, however many of its phrases occur.
	    # NOTE(review): this is plain substring matching, so a short phrase such
	    # as "api" also matches inside longer words (e.g. "rapid"). Kept as-is
	    # because the thresholds were presumably tuned with this behavior;
	    # confirm before tightening.
	    detected = [
	        (label, weight)
	        for label, weight, patterns in RED_FLAGS
	        if any(p in lower for p in patterns)
	    ]
	    score = sum(weight for _, weight in detected)

	    risk, emoji = classify_risk(score)
	    md = _format_signal_report(risk, emoji, score, detected)

	    if detected:
	        cdf = pd.DataFrame(detected, columns=["Signal", "Weight"])
	        cdf["Type"] = cdf["Weight"].apply(lambda w: "Red flag" if w > 0 else "Positive")
	        fig = px.bar(cdf, x="Weight", y="Signal", color="Type", orientation="h",
	                     color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"},
	                     title="Signal breakdown")
	        fig.update_layout(**_styled_layout(height=420))
	    else:
	        fig = _empty_chart("No signals to chart")

	    return md, score, risk, fig


	# =========================================================
	# CHARTS
	# =========================================================

	def _styled_layout(**kwargs):
	    """Return Plotly layout keyword arguments carrying the app's theme.

	    Args:
	        **kwargs: Extra layout settings; a key that matches a themed default
	            replaces that default.

	    Returns:
	        A new ``dict`` suitable for ``fig.update_layout(**result)``.
	    """
	    theme = {
	        "template": "plotly_white",
	        "paper_bgcolor": "#fdfaf3",
	        "plot_bgcolor": "#fdfaf3",
	        "font": {"family": "system-ui, sans-serif", "color": "#1a2238", "size": 12},
	        "margin": {"l": 60, "r": 20, "t": 70, "b": 70},
	    }
	    return {**theme, **kwargs}


	def _empty_chart(title):
	    """Build a placeholder figure with a centered "(no data)" note.

	    Args:
	        title: Text used as the figure title.

	    Returns:
	        A trace-less ``go.Figure`` that uses the same template and background
	        colors as the styled charts.
	    """
	    # Paper-relative coordinates put the note in the middle of the figure.
	    placeholder_note = dict(
	        text="(no data)",
	        x=0.5,
	        y=0.5,
	        xref="paper",
	        yref="paper",
	        showarrow=False,
	        font=dict(size=14, color="#8a9099"),
	    )
	    layout = dict(
	        title=title,
	        height=420,
	        template="plotly_white",
	        paper_bgcolor="#fdfaf3",
	        plot_bgcolor="#fdfaf3",
	        annotations=[placeholder_note],
	    )
	    figure = go.Figure()
	    figure.update_layout(**layout)
	    return figure


	def build_flag_frequency_chart():
	    """Draw a horizontal bar chart of the 12 most frequent dataset signals.

	    Returns:
	        A placeholder chart when ``DF`` is empty or has no "Red Flags"
	        column; otherwise a bar chart of label counts parsed from that
	        column.
	    """
	    dataset_ready = not DF.empty and "Red Flags" in DF.columns
	    if not dataset_ready:
	        return _empty_chart("Dataset not loaded")

	    labels = [
	        label
	        for cell in DF["Red Flags"].dropna()
	        for label, _ in extract_flag_labels(str(cell))
	    ]
	    top_counts = pd.Series(labels).value_counts().head(12)

	    # Both axes are reversed together; presumably so the most frequent label
	    # ends up at the top of the horizontal chart.
	    bars = go.Bar(
	        y=top_counts.index[::-1],
	        x=top_counts.values[::-1],
	        orientation="h",
	        marker=dict(color="#e85a4f"),
	    )
	    fig = go.Figure(bars)
	    fig.update_layout(**_styled_layout(height=460, title="Most Common Signals Across Analyzed Jobs"))
	    return fig


	def build_risk_distribution_chart():
	    """Draw a donut chart of how many postings carry each risk level.

	    Returns:
	        A placeholder chart when ``DF`` is empty or has no "Risk Level"
	        column; otherwise a pie chart with a 0.4 hole.
	    """
	    if DF.empty or "Risk Level" not in DF.columns:
	        return _empty_chart("Dataset not loaded")

	    level_counts = DF["Risk Level"].value_counts()
	    palette = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"}
	    # Levels outside the palette fall back to grey.
	    slice_colors = [palette.get(level, "#888") for level in level_counts.index]

	    donut = go.Pie(
	        labels=level_counts.index,
	        values=level_counts.values,
	        marker=dict(colors=slice_colors),
	        hole=0.4,
	    )
	    fig = go.Figure(donut)
	    fig.update_layout(**_styled_layout(height=400, title="Risk Level Distribution"))
	    return fig


	def build_score_distribution_chart():
	    """Draw a histogram of the dataset's risk scores.

	    Returns:
	        A placeholder chart when ``DF`` is empty or has no "Score" column;
	        otherwise a histogram of the non-missing scores with ``nbinsx=15``.
	    """
	    if DF.empty or "Score" not in DF.columns:
	        return _empty_chart("Dataset not loaded")

	    histogram = go.Histogram(
	        x=DF["Score"].dropna(),
	        nbinsx=15,
	        marker_color="#e85a4f",
	    )
	    fig = go.Figure(histogram)
	    fig.update_layout(**_styled_layout(height=380, title="Risk Score Distribution"))
	    return fig


	# =========================================================
	# KPI CARDS
	# =========================================================

	def render_kpi_cards():
	    """Build the HTML strip of KPI cards for the dataset dashboard.

	    Returns:
	        An HTML string. When ``DF`` is empty it is a single "No dataset
	        loaded." panel; otherwise it is a grid of four cards: number of
	        postings, average score, share of "High" risk postings, and the most
	        frequent signal label.
	    """
	    if DF.empty:
	        return '<div style="background:#fdfaf3;padding:32px;text-align:center;border-radius:12px;border:1px solid #d9cfb9;color:#4a5475;">No dataset loaded.</div>'

	    total_jobs = len(DF)  # > 0 here, because an empty frame returned above

	    if "Score" in DF.columns:
	        # Coerce so a stray non-numeric cell is skipped instead of breaking
	        # the mean; mean() ignores the resulting NaN values.
	        avg_score = pd.to_numeric(DF["Score"], errors="coerce").mean()
	    else:
	        avg_score = 0
	    # With no numeric score at all the mean is NaN; show a dash, not "nan".
	    avg_text = "—" if pd.isna(avg_score) else str(round(avg_score, 1))

	    if "Risk Level" in DF.columns:
	        risk_counts = DF["Risk Level"].value_counts()
	    else:
	        # Explicit dtype: a bare pd.Series() warns on some pandas versions.
	        risk_counts = pd.Series(dtype="int64")
	    high_count = risk_counts.get("High", 0)
	    high_pct = high_count / total_jobs * 100

	    all_flags = []
	    if "Red Flags" in DF.columns:
	        for cell in DF["Red Flags"].dropna():
	            all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))
	    top_flag = pd.Series(all_flags).value_counts().index[0] if all_flags else "—"

	    def card(label, value, sub, color):
	        # Text is escaped because some of it comes from spreadsheet cells;
	        # `color` is always one of the literals passed below.
	        return (
	            '<div style="background:#fdfaf3;border:1px solid #d9cfb9;border-radius:12px;'
	            'padding:20px 22px;box-shadow:0 2px 8px rgba(26,34,56,0.04);">'
	            '<div style="font-family:monospace;color:' + color + ';font-size:11px;font-weight:600;'
	            'text-transform:uppercase;letter-spacing:0.08em;margin-bottom:14px;">' + html.escape(label) + '</div>'
	            '<div style="color:#1a2238;font-size:34px;font-weight:700;line-height:1;'
	            'letter-spacing:-0.03em;margin-bottom:10px;">' + html.escape(str(value)) + '</div>'
	            '<div style="font-family:monospace;font-size:11px;color:#4a5475;">' + html.escape(sub) + '</div>'
	            '</div>'
	        )

	    cards = [
	        card("Total.Jobs", total_jobs, "real labeled postings", "#e85a4f"),
	        card("Avg.Score", avg_text, "weighted across dataset", "#2a9d8f"),
	        card("High.Risk %", str(round(high_pct)) + "%", str(high_count) + " postings flagged", "#c53030"),
	        # Headline shows the first word of the label; the full label is the subtitle.
	        card("Top.Signal", top_flag.split(' ')[0].title() if top_flag != "—" else "—",
	             top_flag if top_flag != "—" else "no data", "#7d4e8a"),
	    ]
	    return ('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));'
	            'gap:12px;margin-bottom:24px;">' + "".join(cards) + '</div>')


	# =========================================================
	# CHAT (n8n -> keyword fallback)
	# =========================================================

	def keyword_fallback(msg):
	    """Answer a dashboard question by keyword matching.

	    Rules are tried in order and the first one whose trigger word occurs
	    anywhere in the lower-cased message wins.

	    Args:
	        msg: The user's question.

	    Returns:
	        ``(reply_text, chart_key)`` where ``chart_key`` is "flag_frequency",
	        "risk_distribution", "score_distribution" or "none".
	    """
	    rules = (
	        (
	            ("common", "frequent", "most", "top"),
	            "The most common signals in our dataset are 'high responsibility early', "
	            "'technical complexity', and 'clear role structure'. These appear in over 60% of postings.",
	            "flag_frequency",
	        ),
	        (
	            ("risk", "distribution", "level"),
	            "Most jobs land in the Medium risk tier (scores 12-24). High-risk postings combine "
	            "multiple red flags like vague scope, on-site-only, and missing salary information.",
	            "risk_distribution",
	        ),
	        (
	            ("score", "histogram", "spread"),
	            "Risk scores cluster between 10-25 in our dataset. Anything above 25 signals "
	            "a problematic posting.",
	            "score_distribution",
	        ),
	        (
	            ("how", "work", "explain", "method"),
	            "The analyzer scans for 15 weighted signal categories. Red flags add to the score, "
	            "positive signals subtract. The total maps to Low/Medium/High risk.",
	            "none",
	        ),
	    )
	    lowered = msg.lower()
	    for triggers, answer, chart_key in rules:
	        if any(trigger in lowered for trigger in triggers):
	            return answer, chart_key
	    return "Try asking: most common red flags, risk distribution, score spread, or how it works.", "none"


	def call_n8n(msg):
	    """Ask the n8n webhook to answer ``msg``, falling back to keyword matching.

	    The question is POSTed as ``{"question": msg}`` to ``N8N_WEBHOOK_URL``
	    with a 15 second timeout. The response is read as a JSON object with
	    optional "answer" and "chart" keys.

	    Args:
	        msg: The user's question.

	    Returns:
	        ``(reply_text, chart_key)``. If the request fails, returns an HTTP
	        error status, or does not yield a JSON object, the reply is the
	        ``keyword_fallback`` answer prefixed with a notice.
	    """
	    # Imported here, not at module level, so it is only needed when this
	    # function is actually called.
	    import requests

	    try:
	        response = requests.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=15)
	        # Without this, an error page with a JSON body would be treated as a
	        # valid (answer-less) reply.
	        response.raise_for_status()
	        data = response.json()
	        if not isinstance(data, dict):
	            raise ValueError("n8n response is not a JSON object")
	    except Exception:
	        # Deliberately broad: any failure degrades to the local logic, but
	        # the cause is logged instead of being discarded.
	        logging.getLogger(__name__).exception("n8n request failed; using keyword fallback")
	        fb_text, fb_chart = keyword_fallback(msg)
	        return "(n8n unavailable, using local logic)\n\n" + fb_text, fb_chart

	    answer = data.get("answer")
	    if answer is None:
	        answer = "n8n returned no answer."
	    elif not isinstance(answer, str):
	        answer = str(answer)

	    # The caller uses the chart key for a dict lookup, so anything that is
	    # not a string (e.g. a list, which is unhashable) is treated as "none".
	    chart = data.get("chart", "none")
	    if not isinstance(chart, str):
	        chart = "none"
	    return answer, chart


	def ai_chat(user_msg, history):
	    """Handle one chat submission.

	    Args:
	        user_msg: Text typed by the user.
	        history: Previous chat turns, or a falsy value when there are none.

	    Returns:
	        ``(updated_history, "", chart)``. The empty string is the new value
	        for the input box; ``chart`` is a figure when the reply names a known
	        chart and ``None`` otherwise. A blank message returns the history
	        unchanged.
	    """
	    past_turns = history or []
	    if not (user_msg and user_msg.strip()):
	        return past_turns, "", None

	    # Use the webhook only when one is configured.
	    answer_fn = call_n8n if N8N_WEBHOOK_URL else keyword_fallback
	    reply, chart_key = answer_fn(user_msg)

	    chart_builders = {
	        "flag_frequency": build_flag_frequency_chart,
	        "risk_distribution": build_risk_distribution_chart,
	        "score_distribution": build_score_distribution_chart,
	    }
	    builder = chart_builders.get(chart_key)
	    chart_out = builder() if builder is not None else None

	    return past_turns + [(user_msg, reply)], "", chart_out


	# =========================================================
	# CSS LOADER
	# =========================================================

	def load_css():
	    """Return the contents of ``style.css`` next to this script.

	    Returns:
	        The stylesheet text decoded as UTF-8, or an empty string when the
	        file does not exist.
	    """
	    stylesheet = BASE_DIR / "style.css"
	    return stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""


	# =========================================================
	# UI
	# =========================================================

	# Custom stylesheet text (empty string when style.css is absent).
	CSS = load_css()

	# NOTE(review): the nesting of the `with` blocks below was reconstructed from
	# a copy of this file whose indentation was lost; confirm the layout
	# (in particular which column/row each component sits in) against the
	# running app.
	with gr.Blocks(title="Job Risk Analyzer", css=CSS) as demo:

	    # Page header.
	    gr.Markdown(
	        "# Job Risk Analyzer\n"
	        "Detect hidden risk patterns in job postings using a weighted signal taxonomy "
	        "calibrated on 47 real labeled descriptions.",
	        elem_id="escp_title",
	    )

	    # Tab 1: paste a posting, get report + score + risk level + chart.
	    with gr.Tab("🔍 Analyze a Job"):
	        gr.Markdown("Paste any job description below to detect red flags and estimate risk.")
	        with gr.Row():
	            with gr.Column():
	                inp = gr.Textbox(label="Job description", lines=15,
	                                 placeholder="Paste the full job posting here...")
	                btn = gr.Button("Analyze", variant="primary")
	            with gr.Column():
	                out_md = gr.Markdown()
	                with gr.Row():
	                    out_score = gr.Number(label="Score", precision=0)
	                    out_risk = gr.Textbox(label="Risk Level")
	                out_chart = gr.Plot(label="Signal breakdown")
	        # The four outputs match analyze_job's 4-tuple return, in order.
	        btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart])

	    # Tab 2: static dashboard; every value is computed once, while the UI is built.
	    with gr.Tab("📊 Dataset Dashboard"):
	        gr.HTML(value=render_kpi_cards())
	        gr.Markdown("### Insights from labeled job postings")
	        gr.Plot(value=build_flag_frequency_chart(), label="Most common signals")
	        with gr.Row():
	            gr.Plot(value=build_risk_distribution_chart(), label="Risk distribution")
	            gr.Plot(value=build_score_distribution_chart(), label="Score distribution")
	        if not DF.empty:
	            # Show only the expected columns that are actually present.
	            display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns]
	            if display_cols:
	                gr.Markdown("### Raw labeled dataset")
	                gr.Dataframe(DF[display_cols], wrap=True, interactive=False)

	    # Tab 3: chat that answers via n8n when configured, else keyword matching.
	    with gr.Tab('"AI" Dashboard'):
	        status = ("Connected to n8n workflow." if N8N_WEBHOOK_URL
	                  else "Using keyword matching (set `N8N_WEBHOOK_URL` to upgrade).")
	        gr.Markdown("### Ask questions, get visualizations\n\n" + status)

	        with gr.Row():
	            with gr.Column():
	                chatbot = gr.Chatbot(label="Conversation", height=380)
	                user_input = gr.Textbox(label="Ask about the dataset",
	                                        placeholder="e.g. What are the most common red flags?")
	                gr.Examples(
	                    examples=[
	                        "What are the most common red flags?",
	                        "Show me the risk level distribution",
	                        "How is the score spread across jobs?",
	                        "How does the analyzer work?",
	                    ],
	                    inputs=user_input,
	                )
	            with gr.Column():
	                ai_chart = gr.Plot(label="Visualization")

	        # ai_chat returns (history, "", chart): the empty string clears the input box.
	        user_input.submit(ai_chat, inputs=[user_input, chatbot],
	                          outputs=[chatbot, user_input, ai_chart])

	    # Tab 4: static description of the method and the team.
	    with gr.Tab("ℹ️ About"):
	        gr.Markdown("""
	### How it works

	This app uses a weighted red-flag taxonomy built from 47 real labeled job postings.
	Each detected signal contributes to a total score that maps to Low / Medium / High risk.

	- 🟢 Low (< 12): Healthy posting with clear structure and benefits
	- 🟡 Medium (12–24): Some warning signs worth investigating
	- 🔴 High (≥ 25): Multiple concerning patterns

	### Team — CS1 Group 14

	- Gaspard — UX Designer + Content Specialist (HF Space, Gradio app, n8n workflow, testing)
	- Person 3 — Data Analyst (extraction, analysis, charts)
	- Person 4 — Project Manager (final report, coordination)

	### Iterations

	- v1 — Keyword matching with hard-coded weights from labeled dataset
	- v2 — Refined keyword patterns after user testing
	- v3 — Integrated n8n workflow for smarter conversational responses
	""")

	# Listen on all interfaces on port 7860.
	demo.launch(server_name="0.0.0.0", server_port=7860)