grasepard2 committed on
Commit
30f1124
·
verified ·
1 Parent(s): 272e88d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -299
app.py CHANGED
@@ -1,43 +1,28 @@
 
 
 
 
1
  import os
2
  import re
3
  import json
4
- import traceback
5
  from pathlib import Path
6
- from typing import Dict, Any, List, Tuple
7
 
8
  import pandas as pd
9
  import gradio as gr
10
  import plotly.graph_objects as go
11
  import plotly.express as px
12
 
13
- # Optional LLM (HuggingFace Inference API)
14
- try:
15
- from huggingface_hub import InferenceClient
16
- except Exception:
17
- InferenceClient = None
18
-
19
  # =========================================================
20
  # CONFIG
21
  # =========================================================
22
 
23
  BASE_DIR = Path(__file__).resolve().parent
24
  DATA_FILE = BASE_DIR / "job_description_data.xlsx"
25
-
26
- HF_API_KEY = os.environ.get("HF_API_KEY", "").strip()
27
- MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-R1").strip()
28
- HF_PROVIDER = os.environ.get("HF_PROVIDER", "novita").strip()
29
  N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
30
 
31
- LLM_ENABLED = bool(HF_API_KEY) and InferenceClient is not None
32
- llm_client = (
33
- InferenceClient(provider=HF_PROVIDER, api_key=HF_API_KEY)
34
- if LLM_ENABLED
35
- else None
36
- )
37
-
38
  # =========================================================
39
- # RED FLAG TAXONOMY (extracted from labeled dataset)
40
- # Positive weights = red flags; negative weights = positive signals
41
  # =========================================================
42
 
43
  RED_FLAGS = [
@@ -53,19 +38,16 @@ RED_FLAGS = [
53
  ("broad / unclear scope", +5, ["other duties", "as needed", "various tasks", "wide range of responsibilities"]),
54
  ("multitasking / many hats", +5, ["multitask", "juggle", "multiple roles"]),
55
  ("training / support provided", -8, ["training provided", "mentorship", "onboarding", "support and training", "we will train"]),
56
- ("salary clearly specified", -6, ["salary:", "€", "$", "compensation:", "annual salary", "monthly salary"]),
57
  ("clear role structure", -5, ["responsibilities include", "your missions", "main tasks", "key responsibilities"]),
58
  ("benefits clearly mentioned", -4, ["health insurance", "paid leave", "meal vouchers", "transport", "benefits include", "profit-sharing"]),
59
  ]
60
 
61
- CHART_PALETTE = ["#34d399", "#60a5fa", "#f472b6", "#fbbf24", "#a78bfa",
62
- "#22d3ee", "#fb7185", "#84cc16", "#f97316", "#e879f9"]
63
-
64
  # =========================================================
65
  # DATA LOADING
66
  # =========================================================
67
 
68
- def load_dataset() -> pd.DataFrame:
69
  if not DATA_FILE.exists():
70
  return pd.DataFrame()
71
  try:
@@ -73,12 +55,10 @@ def load_dataset() -> pd.DataFrame:
73
  except Exception:
74
  return pd.DataFrame()
75
 
76
-
77
  DF = load_dataset()
78
 
79
 
80
- def extract_flag_labels(red_flags_cell: str) -> List[Tuple[str, int]]:
81
- """Parse 'label (+10), label2 (-5)' into [(label, weight)]."""
82
  if not isinstance(red_flags_cell, str):
83
  return []
84
  out = []
@@ -90,10 +70,10 @@ def extract_flag_labels(red_flags_cell: str) -> List[Tuple[str, int]]:
90
 
91
 
92
  # =========================================================
93
- # CORE: ANALYZE A JOB DESCRIPTION
94
  # =========================================================
95
 
96
- def classify_risk(score: float) -> Tuple[str, str]:
97
  if score < 12:
98
  return "Low", "🟢"
99
  if score < 25:
@@ -101,10 +81,9 @@ def classify_risk(score: float) -> Tuple[str, str]:
101
  return "High", "🔴"
102
 
103
 
104
- def analyze_job(text: str) -> Tuple[str, int, str, go.Figure]:
105
  if not text or len(text.strip()) < 30:
106
- return ("⚠️ Please paste a real job description (at least 30 characters).",
107
- 0, "—", _empty_chart("Paste a job description above"))
108
 
109
  lower = text.lower()
110
  detected = []
@@ -115,21 +94,20 @@ def analyze_job(text: str) -> Tuple[str, int, str, go.Figure]:
115
  score += weight
116
 
117
  risk, emoji = classify_risk(score)
118
-
119
- md = f"## {emoji} Risk: **{risk}** &nbsp;|&nbsp; Score: **{score}**\n\n"
120
  if not detected:
121
- md += "_No clear red or positive signals detected. Description may be too short or vague._"
122
  else:
123
  bad = [(l, w) for l, w in detected if w > 0]
124
  good = [(l, w) for l, w in detected if w < 0]
125
  if bad:
126
  md += "### 🚩 Red flags detected\n"
127
  for l, w in bad:
128
- md += f"- **{l}** `(+{w})`\n"
129
  if good:
130
  md += "\n### ✅ Positive signals detected\n"
131
  for l, w in good:
132
- md += f"- **{l}** `({w})`\n"
133
 
134
  if detected:
135
  cdf = pd.DataFrame(detected, columns=["Signal", "Weight"])
@@ -137,7 +115,7 @@ def analyze_job(text: str) -> Tuple[str, int, str, go.Figure]:
137
  fig = px.bar(cdf, x="Weight", y="Signal", color="Type", orientation="h",
138
  color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"},
139
  title="Signal breakdown")
140
- fig.update_layout(**_styled_layout(height=420, showlegend=True))
141
  else:
142
  fig = _empty_chart("No signals to chart")
143
 
@@ -145,50 +123,33 @@ def analyze_job(text: str) -> Tuple[str, int, str, go.Figure]:
145
 
146
 
147
  # =========================================================
148
- # CHART STYLE HELPERS (preserved from template)
149
  # =========================================================
150
 
151
- def _styled_layout(**kwargs) -> dict:
152
  defaults = dict(
153
  template="plotly_white",
154
  paper_bgcolor="#fdfaf3",
155
  plot_bgcolor="#fdfaf3",
156
- font=dict(family="Geist, system-ui, -apple-system, sans-serif", color="#1a2238", size=12),
157
  margin=dict(l=60, r=20, t=70, b=70),
158
- legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1,
159
- bgcolor="rgba(253,250,243,0.9)",
160
- bordercolor="#d9cfb9", borderwidth=1,
161
- font=dict(color="#4a5475", size=11)),
162
- title=dict(font=dict(size=14, color="#1a2238", family="Geist, system-ui, sans-serif")),
163
- xaxis=dict(gridcolor="#e6dcc7", zerolinecolor="#d9cfb9",
164
- tickfont=dict(color="#4a5475", size=11),
165
- title=dict(font=dict(color="#4a5475", size=12))),
166
- yaxis=dict(gridcolor="#e6dcc7", zerolinecolor="#d9cfb9",
167
- tickfont=dict(color="#4a5475", size=11),
168
- title=dict(font=dict(color="#4a5475", size=12))),
169
  )
170
  defaults.update(kwargs)
171
  return defaults
172
 
173
 
174
- def _empty_chart(title: str) -> go.Figure:
175
  fig = go.Figure()
176
  fig.update_layout(
177
  title=title, height=420, template="plotly_white",
178
- paper_bgcolor="#fdfaf3",
179
- plot_bgcolor="#fdfaf3",
180
- font=dict(color="#1a2238", family="Geist, system-ui, sans-serif"),
181
- annotations=[dict(text="(no data available)", x=0.5, y=0.5, xref="paper", yref="paper",
182
- showarrow=False, font=dict(size=13, color="#8a9099"))],
183
  )
184
  return fig
185
 
186
 
187
- # =========================================================
188
- # DATASET INSIGHTS (charts from labeled XLSX)
189
- # =========================================================
190
-
191
- def build_flag_frequency_chart() -> go.Figure:
192
  if DF.empty or "Red Flags" not in DF.columns:
193
  return _empty_chart("Dataset not loaded")
194
  all_flags = []
@@ -197,46 +158,32 @@ def build_flag_frequency_chart() -> go.Figure:
197
  counts = pd.Series(all_flags).value_counts().head(12)
198
  fig = go.Figure(go.Bar(
199
  y=counts.index[::-1], x=counts.values[::-1], orientation="h",
200
- marker=dict(color=counts.values[::-1],
201
- colorscale=[[0, "#f4b8b1"], [1, "#e85a4f"]]),
202
- hovertemplate="<b>%{y}</b><br>Detected in %{x} jobs<extra></extra>",
203
  ))
204
- fig.update_layout(**_styled_layout(
205
- height=460, title=dict(text="Most Common Signals Across 47 Analyzed Jobs"),
206
- showlegend=False))
207
- fig.update_xaxes(title="Number of postings")
208
  return fig
209
 
210
 
211
- def build_risk_distribution_chart() -> go.Figure:
212
  if DF.empty or "Risk Level" not in DF.columns:
213
  return _empty_chart("Dataset not loaded")
214
  counts = DF["Risk Level"].value_counts()
215
- colors = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"}
216
  fig = go.Figure(go.Pie(
217
  labels=counts.index, values=counts.values,
218
- marker=dict(colors=[colors.get(l, "#888") for l in counts.index]),
219
- hole=0.4, textinfo="label+percent",
220
  ))
221
- fig.update_layout(**_styled_layout(
222
- height=400, title=dict(text="Risk Level Distribution in Dataset")))
223
  return fig
224
 
225
 
226
- def build_score_distribution_chart() -> go.Figure:
227
  if DF.empty or "Score" not in DF.columns:
228
  return _empty_chart("Dataset not loaded")
229
  scores = DF["Score"].dropna()
230
- fig = go.Figure(go.Histogram(
231
- x=scores, nbinsx=15, marker_color="#e85a4f",
232
- marker_line_color="#c53030", marker_line_width=1,
233
- hovertemplate="Score range: %{x}<br>Jobs: %{y}<extra></extra>",
234
- ))
235
- fig.update_layout(**_styled_layout(
236
- height=380, title=dict(text="Risk Score Distribution"),
237
- bargap=0.05))
238
- fig.update_xaxes(title="Risk score")
239
- fig.update_yaxes(title="Number of jobs")
240
  return fig
241
 
242
 
@@ -244,17 +191,9 @@ def build_score_distribution_chart() -> go.Figure:
244
  # KPI CARDS
245
  # =========================================================
246
 
247
- def render_kpi_cards() -> str:
248
  if DF.empty:
249
- return ('<div style="background:#fdfaf3;padding:32px;text-align:center;'
250
- 'border-radius:12px;border:1px solid #d9cfb9;">'
251
- '<div style="font-family:\'Geist Mono\',monospace;font-size:11px;'
252
- 'color:#e85a4f;letter-spacing:0.08em;text-transform:uppercase;margin-bottom:12px;font-weight:600;">No Data</div>'
253
- '<div style="color:#4a5475;font-size:14px;">'
254
- 'Upload <code style="background:#f1ebe0;color:#7d4e8a;padding:2px 6px;border-radius:4px;'
255
- 'font-family:\'Geist Mono\',monospace;font-size:0.85em;border:1px solid #e6dcc7;">'
256
- 'job_description_data.xlsx</code> to populate metrics.'
257
- '</div></div>')
258
 
259
  total_jobs = len(DF)
260
  avg_score = DF["Score"].dropna().mean() if "Score" in DF.columns else 0
@@ -267,167 +206,99 @@ def render_kpi_cards() -> str:
267
  all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))
268
  top_flag = pd.Series(all_flags).value_counts().index[0] if all_flags else "β€”"
269
 
270
- def card(label, value, delta_text, accent_color="#e85a4f"):
271
- return f"""
272
- <div style="background:#fdfaf3;border:1px solid #d9cfb9;border-radius:12px;
273
- padding:20px 22px;position:relative;overflow:hidden;
274
- box-shadow:0 1px 0 rgba(255,255,255,0.7) inset, 0 2px 8px rgba(26, 34, 56, 0.04);
275
- transition:border-color 0.15s, transform 0.15s;">
276
- <div style="font-family:'Geist Mono','SF Mono',monospace;
277
- color:{accent_color};font-size:11px;font-weight:600;
278
- text-transform:uppercase;letter-spacing:0.08em;margin-bottom:14px;">
279
- {label}
280
- </div>
281
- <div style="color:#1a2238;font-size:34px;font-weight:700;line-height:1;
282
- letter-spacing:-0.03em;margin-bottom:10px;
283
- font-family:'Geist',-apple-system,system-ui,sans-serif;">
284
- {value}
285
- </div>
286
- <div style="display:flex;align-items:center;gap:6px;
287
- font-family:'Geist Mono',monospace;font-size:11px;color:#4a5475;">
288
- <span style="display:inline-block;width:6px;height:6px;border-radius:50%;
289
- background:{accent_color};box-shadow:0 0 8px {accent_color}80;"></span>
290
- <span>{delta_text}</span>
291
- </div>
292
- </div>"""
293
 
294
  cards = [
295
- card("Total.Jobs", f"{total_jobs}", "real labeled postings", "#e85a4f"),
296
- card("Avg.Score", f"{avg_score:.1f}", "weighted across dataset", "#2a9d8f"),
297
- card("High.Risk %", f"{high_pct:.0f}%", f"{risk_counts.get('High', 0)} postings flagged", "#c53030"),
298
- card("Top.Signal", top_flag.split(' ')[0].title() if top_flag != "β€”" else "β€”",
299
  top_flag if top_flag != "β€”" else "no data", "#7d4e8a"),
300
  ]
301
  return ('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));'
302
- 'gap:12px;margin-bottom:32px;">' + "".join(cards) + "</div>")
303
 
304
 
305
  # =========================================================
306
- # AI CHAT (n8n > LLM > keyword fallback)
307
  # =========================================================
308
 
309
- DASHBOARD_SYSTEM = """You are an AI assistant for a job description risk analyzer app.
310
- You help users understand patterns in job postings — red flags, risk levels, common warning signs.
311
-
312
- DATASET CONTEXT:
313
- - 47 real job postings labeled by humans
314
- - Each scored on 15 weighted signals (positive = red flag, negative = good signal)
315
- - Risk levels: Low (<12), Medium (12-24), High (>=25)
316
- - Top categories: high responsibility early, technical complexity, autonomy demands
317
-
318
- YOUR JOB:
319
- Answer the user's question conversationally in 2-4 sentences. At the END, output a JSON block:
320
- ```json
321
- {"show": "flag_frequency"|"risk_distribution"|"score_distribution"|"none"}
322
- ```
323
- Pick the chart most relevant to their question, or "none" if no chart fits.
324
- """
325
-
326
- JSON_BLOCK_RE = re.compile(r"```json\s*(\{.*?\})\s*```", re.DOTALL)
327
-
328
-
329
- def _parse_directive(text: str) -> Dict[str, str]:
330
- m = JSON_BLOCK_RE.search(text)
331
- if m:
332
- try:
333
- return json.loads(m.group(1))
334
- except json.JSONDecodeError:
335
- pass
336
- return {"show": "none"}
337
-
338
-
339
- def _clean_response(text: str) -> str:
340
- return JSON_BLOCK_RE.sub("", text).strip()
341
-
342
-
343
- def _n8n_call(msg: str) -> Tuple[str, Dict]:
344
- import requests as req
345
  try:
346
- resp = req.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=20)
347
- data = resp.json()
348
- answer = data.get("answer", "No response from n8n workflow.")
349
- chart = data.get("chart", "none")
350
- return answer, {"show": chart}
351
  except Exception as e:
352
- return f"n8n error: {e}", None
 
353
 
354
 
355
- def _keyword_fallback(msg: str) -> Tuple[str, Dict]:
356
- m = msg.lower()
357
- if any(w in m for w in ["common", "frequent", "most", "top flag", "patterns"]):
358
- return ("The most common signals across our 47 analyzed postings are below. "
359
- "Notice how 'high responsibility early', 'technical complexity', and "
360
- "'clear role structure' dominate — they appear in nearly every posting.",
361
- {"show": "flag_frequency"})
362
- if any(w in m for w in ["risk", "distribution", "level", "low", "medium", "high"]):
363
- return ("Here is the risk-level breakdown across our dataset. "
364
- "Most jobs land in the Medium tier; a smaller share are flagged High.",
365
- {"show": "risk_distribution"})
366
- if any(w in m for w in ["score", "histogram", "spread", "average"]):
367
- return ("Risk scores cluster mostly between 10 and 30. "
368
- "Anything above 25 is classified as High-risk.",
369
- {"show": "score_distribution"})
370
- if any(w in m for w in ["how", "work", "method", "explain"]):
371
- return ("The app detects 15 weighted signals in any pasted job description. "
372
- "Red flags add to the score (e.g. +10 for 'high responsibility early'), "
373
- "positive signals subtract (e.g. -8 for 'training provided'). "
374
- "The total maps to Low / Medium / High risk.",
375
- {"show": "none"})
376
- return ("Try asking about: **most common red flags**, **risk distribution**, "
377
- "**score spread**, or **how the analyzer works**.",
378
- {"show": "none"})
379
-
380
-
381
- def ai_chat(user_msg: str, history: list):
382
  if not user_msg or not user_msg.strip():
383
- return history, "", None
384
 
385
  if N8N_WEBHOOK_URL:
386
- reply, directive = _n8n_call(user_msg)
387
- if directive is None:
388
- reply_fb, directive = _keyword_fallback(user_msg)
389
- reply += "\n\n" + reply_fb
390
- elif LLM_ENABLED:
391
- msgs = [{"role": "system", "content": DASHBOARD_SYSTEM}]
392
- for user_turn, bot_turn in (history or [])[-3:]:
393
- msgs.append({"role": "user", "content": user_turn})
394
- msgs.append({"role": "assistant", "content": bot_turn})
395
- msgs.append({"role": "user", "content": user_msg})
396
- try:
397
- r = llm_client.chat_completion(model=MODEL_NAME, messages=msgs,
398
- temperature=0.3, max_tokens=500, stream=False)
399
- raw = (r["choices"][0]["message"]["content"]
400
- if isinstance(r, dict) else r.choices[0].message.content)
401
- directive = _parse_directive(raw)
402
- reply = _clean_response(raw)
403
- except Exception as e:
404
- reply = f"LLM error: {e}"
405
- reply_fb, directive = _keyword_fallback(user_msg)
406
- reply += "\n\n" + reply_fb
407
  else:
408
- reply, directive = _keyword_fallback(user_msg)
409
 
410
- chart_builders = {
411
  "flag_frequency": build_flag_frequency_chart,
412
  "risk_distribution": build_risk_distribution_chart,
413
  "score_distribution": build_score_distribution_chart,
414
  }
415
- chart_out = chart_builders[directive["show"]]() if directive.get("show") in chart_builders else None
416
 
417
  new_history = (history or []) + [(user_msg, reply)]
418
  return new_history, "", chart_out
419
 
420
 
421
  # =========================================================
422
- # UI
423
  # =========================================================
424
 
425
- def load_css() -> str:
426
  css_path = BASE_DIR / "style.css"
427
- return css_path.read_text(encoding="utf-8") if css_path.exists() else ""
 
 
428
 
429
 
430
- with gr.Blocks(title="Job Risk Analyzer — CS1 Group 14", css=load_css()) as demo:
 
 
 
 
 
 
431
 
432
  gr.Markdown(
433
  "# Job Risk Analyzer\n"
@@ -436,17 +307,14 @@ with gr.Blocks(title="Job Risk Analyzer β€” CS1 Group 14", css=load_css()) as de
436
  elem_id="escp_title",
437
  )
438
 
439
- # ===========================================================
440
- # TAB 1 -- Live analyzer (the main feature)
441
- # ===========================================================
442
  with gr.Tab("🔍 Analyze a Job"):
443
- gr.Markdown("Paste any job description below to detect red flags and estimate hidden risk.")
444
  with gr.Row():
445
- with gr.Column(scale=1):
446
- inp = gr.Textbox(label="Job description", lines=18,
447
  placeholder="Paste the full job posting here...")
448
- btn = gr.Button("Analyze", variant="primary", size="lg")
449
- with gr.Column(scale=1):
450
  out_md = gr.Markdown()
451
  with gr.Row():
452
  out_score = gr.Number(label="Score", precision=0)
@@ -454,46 +322,29 @@ with gr.Blocks(title="Job Risk Analyzer β€” CS1 Group 14", css=load_css()) as de
454
  out_chart = gr.Plot(label="Signal breakdown")
455
  btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart])
456
 
457
- # ===========================================================
458
- # TAB 2 -- Dataset Dashboard
459
- # ===========================================================
460
  with gr.Tab("📊 Dataset Dashboard"):
461
- kpi_html = gr.HTML(value=render_kpi_cards)
462
- refresh_btn = gr.Button("Refresh Dashboard", variant="primary")
463
-
464
- gr.Markdown("#### Insights from 47 labeled job postings")
465
- chart_freq = gr.Plot(label="Most common signals", value=build_flag_frequency_chart)
466
  with gr.Row():
467
- chart_risk = gr.Plot(label="Risk distribution", value=build_risk_distribution_chart)
468
- chart_score = gr.Plot(label="Score distribution", value=build_score_distribution_chart)
469
-
470
- gr.Markdown("#### Raw labeled dataset")
471
  if not DF.empty:
472
  display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns]
473
- gr.Dataframe(DF[display_cols], wrap=True, interactive=False)
474
-
475
- def _on_refresh():
476
- return (render_kpi_cards(), build_flag_frequency_chart(),
477
- build_risk_distribution_chart(), build_score_distribution_chart())
478
 
479
- refresh_btn.click(_on_refresh,
480
- outputs=[kpi_html, chart_freq, chart_risk, chart_score])
481
-
482
- # ===========================================================
483
- # TAB 3 -- AI Dashboard
484
- # ===========================================================
485
  with gr.Tab('"AI" Dashboard'):
486
- _status = ("Connected to **n8n workflow**." if N8N_WEBHOOK_URL
487
- else "**LLM active.**" if LLM_ENABLED
488
- else "Using **keyword matching**. Set `N8N_WEBHOOK_URL` or `HF_API_KEY` in Space settings to upgrade.")
489
- gr.Markdown(f"### Ask questions, get visualizations\n\n{_status}")
490
-
491
- with gr.Row(equal_height=True):
492
- with gr.Column(scale=1):
493
- chatbot = gr.Chatbot(label="Conversation", height=380, type="messages")
494
  user_input = gr.Textbox(label="Ask about the dataset",
495
- placeholder="e.g. What are the most common red flags?",
496
- lines=1)
497
  gr.Examples(
498
  examples=[
499
  "What are the most common red flags?",
@@ -503,44 +354,35 @@ with gr.Blocks(title="Job Risk Analyzer β€” CS1 Group 14", css=load_css()) as de
503
  ],
504
  inputs=user_input,
505
  )
506
- with gr.Column(scale=1):
507
  ai_chart = gr.Plot(label="Visualization")
508
 
509
  user_input.submit(ai_chat, inputs=[user_input, chatbot],
510
  outputs=[chatbot, user_input, ai_chart])
511
 
512
- # ===========================================================
513
- # TAB 4 -- About / Iterations
514
- # ===========================================================
515
  with gr.Tab("ℹ️ About"):
516
  gr.Markdown("""
517
- ### How it works
518
- This app uses a **weighted red-flag taxonomy** built from analyzing 47 real job postings.
519
- Each detected signal contributes to a total score; the score determines risk level.
520
-
521
- - 🟒 **Low** (< 12): Healthy posting with clear structure and benefits
522
- - 🟑 **Medium** (12–24): Some warning signs worth investigating
523
- - πŸ”΄ **High** (β‰₯ 25): Multiple concerning patterns β€” proceed with caution
524
-
525
- ### Team — CS1 Group 14
526
- **Gaspard** — Technical Lead (Hugging Face Space + Gradio app)
527
- **Person 3** — Data Analysis & Insights
528
- **Person 4** — Testing & Iterations
529
- **Person 5** — Report & Coordination
530
-
531
- ### Iterations
532
- **v1** — Keyword matching with hard-coded weights from labeled dataset
533
- **v2** — _(to be filled by Person 4 after testing)_
534
- **v3** — _(future: integrate LLM for semantic detection beyond keywords)_
535
-
536
- ### Data source
537
- 47 real job postings (mostly French market) manually labeled by the team
538
- with 15 weighted signal categories.
539
- """)
540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
 
542
- demo.queue().launch(
543
- server_name="0.0.0.0",
544
- server_port=7860,
545
- allowed_paths=[str(BASE_DIR)]
546
- )
 
1
+ """
2
+ CS1 Group 14 — Job Description Risk Analyzer
3
+ Built for Gradio 4.44 / Hugging Face Spaces
4
+ """
5
  import os
6
  import re
7
  import json
 
8
  from pathlib import Path
9
+ from typing import Dict, List, Tuple
10
 
11
  import pandas as pd
12
  import gradio as gr
13
  import plotly.graph_objects as go
14
  import plotly.express as px
15
 
 
 
 
 
 
 
16
  # =========================================================
17
  # CONFIG
18
  # =========================================================
19
 
20
  BASE_DIR = Path(__file__).resolve().parent
21
  DATA_FILE = BASE_DIR / "job_description_data.xlsx"
 
 
 
 
22
  N8N_WEBHOOK_URL = os.environ.get("N8N_WEBHOOK_URL", "").strip()
23
 
 
 
 
 
 
 
 
24
  # =========================================================
25
+ # RED FLAG TAXONOMY
 
26
  # =========================================================
27
 
28
  RED_FLAGS = [
 
38
  ("broad / unclear scope", +5, ["other duties", "as needed", "various tasks", "wide range of responsibilities"]),
39
  ("multitasking / many hats", +5, ["multitask", "juggle", "multiple roles"]),
40
  ("training / support provided", -8, ["training provided", "mentorship", "onboarding", "support and training", "we will train"]),
41
+ ("salary clearly specified", -6, ["salary:", "compensation:", "annual salary", "monthly salary"]),
42
  ("clear role structure", -5, ["responsibilities include", "your missions", "main tasks", "key responsibilities"]),
43
  ("benefits clearly mentioned", -4, ["health insurance", "paid leave", "meal vouchers", "transport", "benefits include", "profit-sharing"]),
44
  ]
45
 
 
 
 
46
  # =========================================================
47
  # DATA LOADING
48
  # =========================================================
49
 
50
+ def load_dataset():
51
  if not DATA_FILE.exists():
52
  return pd.DataFrame()
53
  try:
 
55
  except Exception:
56
  return pd.DataFrame()
57
 
 
58
  DF = load_dataset()
59
 
60
 
61
+ def extract_flag_labels(red_flags_cell):
 
62
  if not isinstance(red_flags_cell, str):
63
  return []
64
  out = []
 
70
 
71
 
72
  # =========================================================
73
+ # CORE: ANALYZE
74
  # =========================================================
75
 
76
+ def classify_risk(score):
77
  if score < 12:
78
  return "Low", "🟢"
79
  if score < 25:
 
81
  return "High", "🔴"
82
 
83
 
84
+ def analyze_job(text):
85
  if not text or len(text.strip()) < 30:
86
+ return "⚠️ Please paste a real job description (at least 30 characters).", 0, "β€”", _empty_chart("Paste a job description above")
 
87
 
88
  lower = text.lower()
89
  detected = []
 
94
  score += weight
95
 
96
  risk, emoji = classify_risk(score)
97
+ md = "## " + emoji + " Risk: **" + risk + "** | Score: **" + str(score) + "**\n\n"
 
98
  if not detected:
99
+ md += "_No clear red or positive signals detected._"
100
  else:
101
  bad = [(l, w) for l, w in detected if w > 0]
102
  good = [(l, w) for l, w in detected if w < 0]
103
  if bad:
104
  md += "### 🚩 Red flags detected\n"
105
  for l, w in bad:
106
+ md += "- **" + l + "** `(+" + str(w) + ")`\n"
107
  if good:
108
  md += "\n### βœ… Positive signals detected\n"
109
  for l, w in good:
110
+ md += "- **" + l + "** `(" + str(w) + ")`\n"
111
 
112
  if detected:
113
  cdf = pd.DataFrame(detected, columns=["Signal", "Weight"])
 
115
  fig = px.bar(cdf, x="Weight", y="Signal", color="Type", orientation="h",
116
  color_discrete_map={"Red flag": "#c53030", "Positive": "#2f855a"},
117
  title="Signal breakdown")
118
+ fig.update_layout(**_styled_layout(height=420))
119
  else:
120
  fig = _empty_chart("No signals to chart")
121
 
 
123
 
124
 
125
  # =========================================================
126
+ # CHARTS
127
  # =========================================================
128
 
129
+ def _styled_layout(**kwargs):
130
  defaults = dict(
131
  template="plotly_white",
132
  paper_bgcolor="#fdfaf3",
133
  plot_bgcolor="#fdfaf3",
134
+ font=dict(family="system-ui, sans-serif", color="#1a2238", size=12),
135
  margin=dict(l=60, r=20, t=70, b=70),
 
 
 
 
 
 
 
 
 
 
 
136
  )
137
  defaults.update(kwargs)
138
  return defaults
139
 
140
 
141
+ def _empty_chart(title):
142
  fig = go.Figure()
143
  fig.update_layout(
144
  title=title, height=420, template="plotly_white",
145
+ paper_bgcolor="#fdfaf3", plot_bgcolor="#fdfaf3",
146
+ annotations=[dict(text="(no data)", x=0.5, y=0.5, xref="paper", yref="paper",
147
+ showarrow=False, font=dict(size=14, color="#8a9099"))],
 
 
148
  )
149
  return fig
150
 
151
 
152
+ def build_flag_frequency_chart():
 
 
 
 
153
  if DF.empty or "Red Flags" not in DF.columns:
154
  return _empty_chart("Dataset not loaded")
155
  all_flags = []
 
158
  counts = pd.Series(all_flags).value_counts().head(12)
159
  fig = go.Figure(go.Bar(
160
  y=counts.index[::-1], x=counts.values[::-1], orientation="h",
161
+ marker=dict(color="#e85a4f"),
 
 
162
  ))
163
+ fig.update_layout(**_styled_layout(height=460, title="Most Common Signals Across Analyzed Jobs"))
 
 
 
164
  return fig
165
 
166
 
167
+ def build_risk_distribution_chart():
168
  if DF.empty or "Risk Level" not in DF.columns:
169
  return _empty_chart("Dataset not loaded")
170
  counts = DF["Risk Level"].value_counts()
171
+ colors_map = {"Low": "#2a9d8f", "Medium": "#e9a23b", "High": "#c53030"}
172
  fig = go.Figure(go.Pie(
173
  labels=counts.index, values=counts.values,
174
+ marker=dict(colors=[colors_map.get(l, "#888") for l in counts.index]),
175
+ hole=0.4,
176
  ))
177
+ fig.update_layout(**_styled_layout(height=400, title="Risk Level Distribution"))
 
178
  return fig
179
 
180
 
181
+ def build_score_distribution_chart():
182
  if DF.empty or "Score" not in DF.columns:
183
  return _empty_chart("Dataset not loaded")
184
  scores = DF["Score"].dropna()
185
+ fig = go.Figure(go.Histogram(x=scores, nbinsx=15, marker_color="#e85a4f"))
186
+ fig.update_layout(**_styled_layout(height=380, title="Risk Score Distribution"))
 
 
 
 
 
 
 
 
187
  return fig
188
 
189
 
 
191
  # KPI CARDS
192
  # =========================================================
193
 
194
+ def render_kpi_cards():
195
  if DF.empty:
196
+ return '<div style="background:#fdfaf3;padding:32px;text-align:center;border-radius:12px;border:1px solid #d9cfb9;color:#4a5475;">No dataset loaded.</div>'
 
 
 
 
 
 
 
 
197
 
198
  total_jobs = len(DF)
199
  avg_score = DF["Score"].dropna().mean() if "Score" in DF.columns else 0
 
206
  all_flags.extend(label for label, _ in extract_flag_labels(str(cell)))
207
  top_flag = pd.Series(all_flags).value_counts().index[0] if all_flags else "β€”"
208
 
209
+ def card(label, value, sub, color):
210
+ return (
211
+ '<div style="background:#fdfaf3;border:1px solid #d9cfb9;border-radius:12px;'
212
+ 'padding:20px 22px;box-shadow:0 2px 8px rgba(26,34,56,0.04);">'
213
+ '<div style="font-family:monospace;color:' + color + ';font-size:11px;font-weight:600;'
214
+ 'text-transform:uppercase;letter-spacing:0.08em;margin-bottom:14px;">' + label + '</div>'
215
+ '<div style="color:#1a2238;font-size:34px;font-weight:700;line-height:1;'
216
+ 'letter-spacing:-0.03em;margin-bottom:10px;">' + str(value) + '</div>'
217
+ '<div style="font-family:monospace;font-size:11px;color:#4a5475;">' + sub + '</div>'
218
+ '</div>'
219
+ )
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
  cards = [
222
+ card("Total.Jobs", total_jobs, "real labeled postings", "#e85a4f"),
223
+ card("Avg.Score", str(round(avg_score, 1)), "weighted across dataset", "#2a9d8f"),
224
+ card("High.Risk %", str(round(high_pct)) + "%", str(risk_counts.get("High", 0)) + " postings flagged", "#c53030"),
225
+ card("Top.Signal", top_flag.split(' ')[0].title() if top_flag != "β€”" else "β€”",
226
  top_flag if top_flag != "β€”" else "no data", "#7d4e8a"),
227
  ]
228
  return ('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));'
229
+ 'gap:12px;margin-bottom:24px;">' + "".join(cards) + '</div>')
230
 
231
 
232
  # =========================================================
233
+ # CHAT (n8n -> keyword fallback)
234
  # =========================================================
235
 
236
def keyword_fallback(msg):
    """Answer a dataset question locally via ordered keyword rules.

    The lower-cased message is tested against each rule in turn; a rule
    fires when any of its trigger substrings occurs anywhere in the
    message. The first rule that fires decides the answer.

    Args:
        msg: The user's question.

    Returns:
        A ``(reply_text, chart_key)`` tuple, where ``chart_key`` is
        ``"flag_frequency"``, ``"risk_distribution"``,
        ``"score_distribution"`` or ``"none"``.
    """
    # Rule order matters: earlier rules take priority over later ones.
    rules = (
        (
            ("common", "frequent", "most", "top"),
            "The most common signals in our dataset are 'high responsibility early', "
            "'technical complexity', and 'clear role structure'. These appear in over 60% of postings.",
            "flag_frequency",
        ),
        (
            ("risk", "distribution", "level"),
            "Most jobs land in the Medium risk tier (scores 12-24). High-risk postings combine "
            "multiple red flags like vague scope, on-site-only, and missing salary information.",
            "risk_distribution",
        ),
        (
            ("score", "histogram", "spread"),
            "Risk scores cluster between 10-25 in our dataset. Anything above 25 signals "
            "a problematic posting.",
            "score_distribution",
        ),
        (
            ("how", "work", "explain", "method"),
            "The analyzer scans for 15 weighted signal categories. Red flags add to the score, "
            "positive signals subtract. The total maps to Low/Medium/High risk.",
            "none",
        ),
    )

    lowered = msg.lower()
    for triggers, answer, chart in rules:
        if not any(trigger in lowered for trigger in triggers):
            continue
        return answer, chart

    # Nothing matched: suggest the kinds of questions that are understood.
    return "Try asking: most common red flags, risk distribution, score spread, or how it works.", "none"
251
+
252
+
253
def call_n8n(msg):
    """Forward a question to the n8n webhook, falling back to keyword logic.

    Posts ``{"question": msg}`` as JSON to ``N8N_WEBHOOK_URL`` and reads the
    ``answer`` and ``chart`` fields of the JSON response. On any failure
    (``requests`` not installed, network error or timeout, non-2xx status,
    non-JSON or unexpectedly shaped body) the reply from
    ``keyword_fallback`` is returned instead, prefixed with a short notice.

    Args:
        msg: The user's question.

    Returns:
        A ``(reply_text, chart_key)`` tuple of strings; ``chart_key`` is
        ``"none"`` when the response carries no usable chart name.
    """
    try:
        # Imported lazily and inside the ``try`` so that a missing
        # ``requests`` package degrades to the local fallback.
        import requests

        response = requests.post(N8N_WEBHOOK_URL, json={"question": msg}, timeout=15)
        # An HTTP error status is a failure, not an answer to be parsed.
        response.raise_for_status()
        payload = response.json()

        # NOTE(review): depending on the Webhook node's response settings the
        # body may be a list of items rather than a single object; the first
        # item is used in that case - confirm against the actual workflow.
        if isinstance(payload, list) and payload:
            payload = payload[0]
        if not isinstance(payload, dict):
            raise ValueError("unexpected n8n response shape")

        answer = payload.get("answer")
        chart = payload.get("chart")
        # A null/empty answer or a non-string chart name is replaced by the
        # defaults so callers always receive two strings.
        reply = "n8n returned no answer." if answer in (None, "") else str(answer)
        chart_key = chart if isinstance(chart, str) and chart else "none"
        return reply, chart_key
    except Exception:
        # Deliberate best-effort boundary: whatever went wrong, the user
        # still gets a reply from the local keyword matcher.
        fb_text, fb_chart = keyword_fallback(msg)
        return "(n8n unavailable, using local logic)\n\n" + fb_text, fb_chart
262
 
263
 
264
def ai_chat(user_msg, history):
    """Handle one chat turn: produce a reply and, optionally, a chart.

    The reply comes from the n8n webhook when ``N8N_WEBHOOK_URL`` is set and
    from ``keyword_fallback`` otherwise.

    Args:
        user_msg: Text submitted by the user; blank input is ignored.
        history: Conversation so far as a list of ``(user, reply)`` pairs,
            or a falsy value for an empty conversation.

    Returns:
        A ``(new_history, "", chart)`` tuple: the updated conversation, an
        empty string (the UI wires it to the input textbox, clearing it),
        and the figure built for the reply or ``None`` when no chart applies.
    """
    conversation = history or []
    if not user_msg or not user_msg.strip():
        return conversation, "", None

    if N8N_WEBHOOK_URL:
        reply, chart_key = call_n8n(user_msg)
    else:
        reply, chart_key = keyword_fallback(user_msg)

    # The reply may originate from an external webhook; make sure the chat
    # history only ever receives a string.
    if not isinstance(reply, str):
        reply = "n8n returned no answer." if reply is None else str(reply)

    builders = {
        "flag_frequency": build_flag_frequency_chart,
        "risk_distribution": build_risk_distribution_chart,
        "score_distribution": build_score_distribution_chart,
    }
    # Only string keys are looked up: an unhashable value (list/dict) coming
    # back from the webhook would make a dict membership test raise TypeError.
    builder = builders.get(chart_key) if isinstance(chart_key, str) else None
    chart_out = builder() if builder is not None else None

    return conversation + [(user_msg, reply)], "", chart_out
282
 
283
 
284
  # =========================================================
285
+ # CSS LOADER
286
  # =========================================================
287
 
288
def load_css():
    """Return the text of ``style.css`` located in ``BASE_DIR``.

    Returns:
        The stylesheet contents decoded as UTF-8, or an empty string when
        the file does not exist.
    """
    stylesheet = BASE_DIR / "style.css"
    return stylesheet.read_text(encoding="utf-8") if stylesheet.exists() else ""
293
 
294
 
295
+ # =========================================================
296
+ # UI
297
+ # =========================================================
298
+
299
+ CSS = load_css()
300
+
301
+ with gr.Blocks(title="Job Risk Analyzer", css=CSS) as demo:
302
 
303
  gr.Markdown(
304
  "# Job Risk Analyzer\n"
 
307
  elem_id="escp_title",
308
  )
309
 
 
 
 
310
  with gr.Tab("πŸ” Analyze a Job"):
311
+ gr.Markdown("Paste any job description below to detect red flags and estimate risk.")
312
  with gr.Row():
313
+ with gr.Column():
314
+ inp = gr.Textbox(label="Job description", lines=15,
315
  placeholder="Paste the full job posting here...")
316
+ btn = gr.Button("Analyze", variant="primary")
317
+ with gr.Column():
318
  out_md = gr.Markdown()
319
  with gr.Row():
320
  out_score = gr.Number(label="Score", precision=0)
 
322
  out_chart = gr.Plot(label="Signal breakdown")
323
  btn.click(analyze_job, inputs=[inp], outputs=[out_md, out_score, out_risk, out_chart])
324
 
 
 
 
325
  with gr.Tab("πŸ“Š Dataset Dashboard"):
326
+ gr.HTML(value=render_kpi_cards())
327
+ gr.Markdown("### Insights from labeled job postings")
328
+ gr.Plot(value=build_flag_frequency_chart(), label="Most common signals")
 
 
329
  with gr.Row():
330
+ gr.Plot(value=build_risk_distribution_chart(), label="Risk distribution")
331
+ gr.Plot(value=build_score_distribution_chart(), label="Score distribution")
 
 
332
  if not DF.empty:
333
  display_cols = [c for c in ["Job title", "company", "Score", "Risk Level"] if c in DF.columns]
334
+ if display_cols:
335
+ gr.Markdown("### Raw labeled dataset")
336
+ gr.Dataframe(DF[display_cols], wrap=True, interactive=False)
 
 
337
 
 
 
 
 
 
 
338
  with gr.Tab('"AI" Dashboard'):
339
+ status = ("Connected to **n8n workflow**." if N8N_WEBHOOK_URL
340
+ else "Using **keyword matching** (set `N8N_WEBHOOK_URL` to upgrade).")
341
+ gr.Markdown("### Ask questions, get visualizations\n\n" + status)
342
+
343
+ with gr.Row():
344
+ with gr.Column():
345
+ chatbot = gr.Chatbot(label="Conversation", height=380)
 
346
  user_input = gr.Textbox(label="Ask about the dataset",
347
+ placeholder="e.g. What are the most common red flags?")
 
348
  gr.Examples(
349
  examples=[
350
  "What are the most common red flags?",
 
354
  ],
355
  inputs=user_input,
356
  )
357
+ with gr.Column():
358
  ai_chart = gr.Plot(label="Visualization")
359
 
360
  user_input.submit(ai_chat, inputs=[user_input, chatbot],
361
  outputs=[chatbot, user_input, ai_chart])
362
 
 
 
 
363
  with gr.Tab("ℹ️ About"):
364
  gr.Markdown("""
365
+ ### How it works
366
+
367
+ This app uses a **weighted red-flag taxonomy** built from 47 real labeled job postings.
368
+ Each detected signal contributes to a total score that maps to Low / Medium / High risk.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
+ - 🟢 **Low** (< 12): Healthy posting with clear structure and benefits
371
+ - 🟡 **Medium** (12–24): Some warning signs worth investigating
372
+ - 🔴 **High** (≥ 25): Multiple concerning patterns
373
+
374
+ ### Team — CS1 Group 14
375
+
376
+ - **Gaspard** — UX Designer + Content Specialist (HF Space, Gradio app, n8n workflow, testing)
377
+ - **Person 3** — Data Analyst (extraction, analysis, charts)
378
+ - **Person 4** — Project Manager (final report, coordination)
379
+
380
+ ### Iterations
381
+
382
+ - **v1** — Keyword matching with hard-coded weights from labeled dataset
383
+ - **v2** — Refined keyword patterns after user testing
384
+ - **v3** — Integrated n8n workflow for smarter conversational responses
385
+ """)
386
 
387
+ if __name__ == "__main__":
388
+ demo.launch(server_name="0.0.0.0", server_port=7860)