urchade committed on
Commit
abf90c4
·
verified ·
1 Parent(s): 86f265c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -74
app.py CHANGED
@@ -1,7 +1,8 @@
1
- """GLiGuard prompt moderation demo built with Gradio."""
2
 
3
  import html
4
  import os
 
5
 
6
  import gradio as gr
7
  from gliner2 import GLiNER2
@@ -10,6 +11,7 @@ from huggingface_hub import login
10
  MODEL_ID = "fastino/gliguard-LLMGuardrails-300M"
11
  MODEL_NAME = "GLiGuard LLM Guardrails 300M"
12
  DEFAULT_THRESHOLD = 0.5
 
13
 
14
  SAFETY_LABELS = ["safe", "unsafe"]
15
  REFUSAL_LABELS = ["refusal", "compliance"]
@@ -50,18 +52,18 @@ TASKS = {
50
  "prompt_toxicity": {
51
  "labels": TOXICITY_LABELS,
52
  "multi_label": True,
53
- "cls_threshold": 0.4,
54
  },
55
  "jailbreak_detection": {
56
  "labels": JAILBREAK_LABELS,
57
  "multi_label": True,
58
- "cls_threshold": 0.4,
59
  },
60
  "response_safety": SAFETY_LABELS,
61
  "response_toxicity": {
62
  "labels": TOXICITY_LABELS,
63
  "multi_label": True,
64
- "cls_threshold": 0.4,
65
  },
66
  "response_refusal": REFUSAL_LABELS,
67
  }
@@ -75,6 +77,9 @@ TASK_OPTIONS = [
75
  ("Response Refusal", "response_refusal"),
76
  ]
77
 
 
 
 
78
  DISPLAY_NAMES = {
79
  "safe": "Safe",
80
  "unsafe": "Unsafe",
@@ -124,19 +129,57 @@ EXAMPLES = [
124
  ]
125
 
126
  HF_TOKEN = os.environ.get("HF_TOKEN")
127
- if HF_TOKEN:
128
- login(token=HF_TOKEN)
129
- print("Logged in to Hugging Face Hub")
130
 
131
- print(f"Loading model: {MODEL_ID}")
132
- model = GLiNER2.from_pretrained(MODEL_ID)
133
- print("Model loaded")
 
 
 
134
 
135
 
136
  def _format_label(label: str) -> str:
137
  return DISPLAY_NAMES.get(label, label.replace("_", " ").title())
138
 
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  def _extract_single_label(value):
141
  if isinstance(value, dict):
142
  return value.get("label", "unknown"), float(value.get("confidence", 0.0))
@@ -183,18 +226,36 @@ def _render_group(title: str, subtitle: str, items: list[tuple[str, float]], acc
183
  )
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
186
  def _empty_state_html() -> str:
187
  return """
188
  <div class="empty-state">
189
  <div class="empty-icon">🛡️</div>
190
- <div class="empty-title">Run GLiGuard on prompts or responses</div>
191
  <div class="empty-copy">
192
- Select prompt-side and/or response-side tasks, then analyze the text with the GLiGuard checkpoint.
193
  </div>
194
  </div>
195
  """
196
 
197
 
 
 
 
 
 
 
 
198
  def _build_overview_card(title: str, value: str, subtitle: str) -> str:
199
  return (
200
  "<div class='stat-card'>"
@@ -205,7 +266,22 @@ def _build_overview_card(title: str, value: str, subtitle: str) -> str:
205
  )
206
 
207
 
208
- def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  selected_task_set = set(selected_tasks)
210
  has_safety = "prompt_safety" in selected_task_set and "prompt_safety" in result
211
  has_toxicity = "prompt_toxicity" in selected_task_set and "prompt_toxicity" in result
@@ -244,26 +320,42 @@ def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
244
  if has_response_refusal:
245
  response_refusal_label, response_refusal_conf = _extract_single_label(result.get("response_refusal"))
246
 
247
- is_unsafe = (
248
- (has_safety and safety_label == "unsafe")
249
- or bool(toxicity_hits)
250
- or bool(jailbreak_hits)
251
- or (has_response_safety and response_safety_label == "unsafe" and response_refusal_label != "refusal")
252
- or bool(response_toxicity_hits)
253
- )
254
  status_key = "unsafe" if is_unsafe else "safe"
255
  status = STATUS_STYLES[status_key]
256
 
257
- summary = "GLiGuard found one or more harmful signals in the selected prompt or response tasks."
258
- if status_key == "safe":
259
- summary = "No selected task produced a harmful signal above the chosen threshold."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  prompt_task_count = sum([has_safety, has_toxicity, has_jailbreak])
262
  response_task_count = sum([has_response_safety, has_response_toxicity, has_response_refusal])
263
  if prompt_task_count and not response_task_count:
264
- summary = "This run evaluates prompt-side safety only."
265
  elif response_task_count and not prompt_task_count:
266
- summary = "This run evaluates response-side safety only."
267
 
268
  top_risk = "None"
269
  if toxicity_hits:
@@ -277,11 +369,15 @@ def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
277
 
278
  stats_cards = [
279
  _build_overview_card("Tasks Run", str(len(selected_tasks)), "Selected at inference time"),
280
- _build_overview_card("Prompt Tasks", str(prompt_task_count), "Prompt-side analyses executed"),
281
- _build_overview_card("Response Tasks", str(response_task_count), "Response-side analyses executed"),
282
  ]
283
 
284
  prompt_cards = []
 
 
 
 
285
  if has_safety:
286
  prompt_cards.append(
287
  _build_overview_card("Prompt Safety", _format_label(safety_label), f"Confidence {safety_confidence:.1%}")
@@ -296,6 +392,13 @@ def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
296
  )
297
 
298
  response_cards = []
 
 
 
 
 
 
 
299
  if has_response_safety:
300
  response_cards.append(
301
  _build_overview_card("Response Safety", _format_label(response_safety_label), f"Confidence {response_safety_conf:.1%}")
@@ -309,6 +412,27 @@ def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
309
  _build_overview_card("Response Refusal", _format_label(response_refusal_label), f"Confidence {response_refusal_conf:.1%}")
310
  )
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  result_sections = []
313
  if prompt_cards:
314
  result_sections.append(
@@ -329,6 +453,9 @@ def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
329
  result_sections.append(
330
  _render_group("Response Toxicity", "Multi-label response harm classification", response_toxicity_hits, "#2563eb")
331
  )
 
 
 
332
 
333
  return f"""
334
  <div class="results-shell">
@@ -349,69 +476,144 @@ def _build_result_html(result: dict, selected_tasks: list[str]) -> str:
349
  """
350
 
351
 
352
def classify_prompt(prompt_text: str, response_text: str, threshold: float, selected_tasks: list[str]) -> str:
    """Run the selected GLiGuard tasks and return the rendered result HTML.

    Args:
        prompt_text: User prompt; required when any prompt-side task is selected.
        response_text: Model response; required when any response-side task is
            selected.
        threshold: Global threshold forwarded to ``model.classify_text``.
        selected_tasks: Task names chosen in the UI; names missing from
            ``TASKS`` are left out of the schema sent to the model.

    Returns:
        An HTML string: either an empty-state panel explaining what input is
        missing, or the result view built by ``_build_result_html``.
    """
    prompt_text = (prompt_text or "").strip()
    response_text = (response_text or "").strip()

    if not prompt_text and not response_text:
        return _empty_state_html()
    if not selected_tasks:
        return """
        <div class="empty-state">
            <div class="empty-icon">🧭</div>
            <div class="empty-title">Select at least one task</div>
            <div class="empty-copy">
                Choose one or more GLiGuard tasks before running inference.
            </div>
        </div>
        """

    tasks = {task_name: TASKS[task_name] for task_name in selected_tasks if task_name in TASKS}
    # "jailbreak_detection" has no "prompt_" prefix but still reads the prompt.
    has_prompt_task = any(task.startswith("prompt_") or task == "jailbreak_detection" for task in selected_tasks)
    has_response_task = any(task.startswith("response_") for task in selected_tasks)

    if has_prompt_task and not prompt_text:
        return """
        <div class="empty-state">
            <div class="empty-icon">✍️</div>
            <div class="empty-title">Add a prompt</div>
            <div class="empty-copy">
                Prompt-side tasks require a prompt in the first text box.
            </div>
        </div>
        """

    if has_response_task and not response_text:
        return """
        <div class="empty-state">
            <div class="empty-icon">💬</div>
            <div class="empty-title">Add a response</div>
            <div class="empty-copy">
                Response-side tasks require a model response in the response text box.
            </div>
        </div>
        """

    if has_prompt_task and not has_response_task:
        # Prompt-only runs send the raw prompt without a "Prompt:" prefix.
        inference_text = prompt_text
    else:
        # The original branched on prompt_text here but appended the same
        # "Response: ..." line in both arms; a single append is equivalent.
        inference_parts = []
        if has_prompt_task and prompt_text:
            inference_parts.append(f"Prompt: {prompt_text}")
        if has_response_task and response_text:
            inference_parts.append(f"Response: {response_text}")
        inference_text = "\n".join(inference_parts)

    result = model.classify_text(
        text=inference_text,
        tasks=tasks,
        threshold=threshold,
        include_confidence=True,
    )
    return _build_result_html(result, selected_tasks)
415
 
416
 
417
  DESCRIPTION = f"""
@@ -610,9 +812,9 @@ with gr.Blocks(title="GLiGuard Demo") as demo:
610
  )
611
  task_selector = gr.CheckboxGroup(
612
  choices=TASK_OPTIONS,
613
- value=[task_value for _, task_value in TASK_OPTIONS],
614
  label="Tasks to run",
615
- info="Select any mix of prompt-side and response-side GLiGuard tasks.",
616
  )
617
  with gr.Row():
618
  classify_btn = gr.Button("Analyze Content", variant="primary", size="lg")
@@ -628,6 +830,12 @@ with gr.Blocks(title="GLiGuard Demo") as demo:
628
  examples_per_page=8,
629
  )
630
 
 
 
 
 
 
 
631
  classify_btn.click(
632
  fn=classify_prompt,
633
  inputs=[prompt_input, response_input, threshold_slider, task_selector],
@@ -644,9 +852,9 @@ with gr.Blocks(title="GLiGuard Demo") as demo:
644
  outputs=[result_html],
645
  )
646
  clear_btn.click(
647
- fn=lambda: ("", "", [task_value for _, task_value in TASK_OPTIONS], _empty_state_html()),
648
  outputs=[prompt_input, response_input, task_selector, result_html],
649
  )
650
 
651
  if __name__ == "__main__":
652
- demo.launch(theme=THEME, css=CUSTOM_CSS)
 
1
+ """GLiGuard demo built with Gradio."""
2
 
3
  import html
4
  import os
5
+ from functools import lru_cache
6
 
7
  import gradio as gr
8
  from gliner2 import GLiNER2
 
11
  MODEL_ID = "fastino/gliguard-LLMGuardrails-300M"
12
  MODEL_NAME = "GLiGuard LLM Guardrails 300M"
13
  DEFAULT_THRESHOLD = 0.5
14
+ MULTI_LABEL_THRESHOLD = 0.4
15
 
16
  SAFETY_LABELS = ["safe", "unsafe"]
17
  REFUSAL_LABELS = ["refusal", "compliance"]
 
52
  "prompt_toxicity": {
53
  "labels": TOXICITY_LABELS,
54
  "multi_label": True,
55
+ "cls_threshold": MULTI_LABEL_THRESHOLD,
56
  },
57
  "jailbreak_detection": {
58
  "labels": JAILBREAK_LABELS,
59
  "multi_label": True,
60
+ "cls_threshold": MULTI_LABEL_THRESHOLD,
61
  },
62
  "response_safety": SAFETY_LABELS,
63
  "response_toxicity": {
64
  "labels": TOXICITY_LABELS,
65
  "multi_label": True,
66
+ "cls_threshold": MULTI_LABEL_THRESHOLD,
67
  },
68
  "response_refusal": REFUSAL_LABELS,
69
  }
 
77
  ("Response Refusal", "response_refusal"),
78
  ]
79
 
80
+ PROMPT_TASK_VALUES = ["prompt_safety", "prompt_toxicity", "jailbreak_detection"]
81
+ ALL_TASK_VALUES = [task_value for _, task_value in TASK_OPTIONS]
82
+
83
  DISPLAY_NAMES = {
84
  "safe": "Safe",
85
  "unsafe": "Unsafe",
 
129
  ]
130
 
131
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
 
 
132
 
133
+
134
@lru_cache(maxsize=1)
def _load_model() -> GLiNER2:
    """Return the shared GLiGuard model, loading it on the first call.

    When ``HF_TOKEN`` is set, log in to the Hugging Face Hub before fetching
    the ``MODEL_ID`` checkpoint. ``lru_cache`` keeps the loaded instance, so
    later calls reuse it instead of loading the checkpoint again.
    """
    token = HF_TOKEN
    if token:
        login(token=token)
    checkpoint = GLiNER2.from_pretrained(MODEL_ID)
    return checkpoint
139
 
140
 
141
  def _format_label(label: str) -> str:
142
  return DISPLAY_NAMES.get(label, label.replace("_", " ").title())
143
 
144
 
145
+ def _runtime_status_html(title: str, copy: str, tone: str = "info", details: str | None = None) -> str:
146
+ tones = {
147
+ "info": {"accent": "#2563eb", "bg": "#eff6ff", "badge": "Info"},
148
+ "ready": {"accent": "#16a34a", "bg": "#f0fdf4", "badge": "Ready"},
149
+ "warning": {"accent": "#d97706", "bg": "#fffbeb", "badge": "Check"},
150
+ "error": {"accent": "#dc2626", "bg": "#fef2f2", "badge": "Error"},
151
+ }
152
+ style = tones.get(tone, tones["info"])
153
+ detail_html = ""
154
+ if details:
155
+ detail_html = f"<div class='runtime-detail'>{html.escape(details)}</div>"
156
+
157
+ return (
158
+ "<div class='runtime-status' "
159
+ f"style='border-color:{style['accent']}33;background:{style['bg']};'>"
160
+ f"<div class='runtime-badge' style='background:{style['accent']};'>{style['badge']}</div>"
161
+ "<div class='runtime-copy'>"
162
+ f"<div class='runtime-title' style='color:{style['accent']};'>{html.escape(title)}</div>"
163
+ f"<div class='runtime-subtitle'>{html.escape(copy)}</div>"
164
+ f"{detail_html}"
165
+ "</div>"
166
+ "</div>"
167
+ )
168
+
169
+
170
def _idle_status_html() -> str:
    """Return the info-toned status banner shown before the model is loaded."""
    heading = "Model loads on first analysis"
    explanation = (
        "The first run may take longer while the GLiGuard checkpoint is initialized through the GLiNER2 interface."
    )
    return _runtime_status_html(heading, explanation, tone="info")
176
+
177
+
178
+ def _format_exception(exc: Exception) -> str:
179
+ detail = str(exc).strip() or exc.__class__.__name__
180
+ return detail.splitlines()[0][:280]
181
+
182
+
183
  def _extract_single_label(value):
184
  if isinstance(value, dict):
185
  return value.get("label", "unknown"), float(value.get("confidence", 0.0))
 
226
  )
227
 
228
 
229
+ def _render_notes(title: str, subtitle: str, items: list[str]) -> str:
230
+ body = "".join(f"<li>{html.escape(item)}</li>" for item in items)
231
+ return (
232
+ "<div class='result-card'>"
233
+ f"<div class='eyebrow'>{html.escape(title)}</div>"
234
+ f"<div class='subtle'>{html.escape(subtitle)}</div>"
235
+ f"<ul class='note-list'>{body}</ul>"
236
+ "</div>"
237
+ )
238
+
239
+
240
  def _empty_state_html() -> str:
241
  return """
242
  <div class="empty-state">
243
  <div class="empty-icon">🛡️</div>
244
+ <div class="empty-title">Run schema-driven GLiGuard moderation</div>
245
  <div class="empty-copy">
246
+ Choose any mix of prompt-side and response-side tasks, then run the GLiGuard checkpoint in one composed moderation pass.
247
  </div>
248
  </div>
249
  """
250
 
251
 
252
def _auto_select_tasks(response_text: str):
    """Choose the default task selection for the current response box content.

    Returns a ``gr.update`` that selects ``ALL_TASK_VALUES`` when the response
    text is non-blank, and ``PROMPT_TASK_VALUES`` otherwise.
    """
    has_response = bool((response_text or "").strip())
    chosen = ALL_TASK_VALUES if has_response else PROMPT_TASK_VALUES
    return gr.update(value=chosen)
257
+
258
+
259
  def _build_overview_card(title: str, value: str, subtitle: str) -> str:
260
  return (
261
  "<div class='stat-card'>"
 
266
  )
267
 
268
 
269
+ def _build_inference_text(
270
+ prompt_text: str,
271
+ response_text: str,
272
+ has_prompt_task: bool,
273
+ has_response_task: bool,
274
+ ) -> tuple[str, str]:
275
+ if has_prompt_task and not has_response_task:
276
+ return prompt_text, "Raw prompt"
277
+ if has_response_task and not has_prompt_task:
278
+ if prompt_text:
279
+ return f"Prompt: {prompt_text}\nResponse: {response_text}", "Prompt + Response pair"
280
+ return f"Response: {response_text}", "Response only"
281
+ return f"Prompt: {prompt_text}\nResponse: {response_text}", "Prompt + Response pair"
282
+
283
+
284
+ def _build_result_html(result: dict, selected_tasks: list[str], threshold: float, input_format: str) -> str:
285
  selected_task_set = set(selected_tasks)
286
  has_safety = "prompt_safety" in selected_task_set and "prompt_safety" in result
287
  has_toxicity = "prompt_toxicity" in selected_task_set and "prompt_toxicity" in result
 
320
  if has_response_refusal:
321
  response_refusal_label, response_refusal_conf = _extract_single_label(result.get("response_refusal"))
322
 
323
+ prompt_flagged = (has_safety and safety_label == "unsafe") or bool(toxicity_hits) or bool(jailbreak_hits)
324
+ response_unsafe_signal = has_response_safety and response_safety_label == "unsafe"
325
+ refusal_override = response_unsafe_signal and response_refusal_label == "refusal"
326
+ response_flagged = (response_unsafe_signal and not refusal_override) or bool(response_toxicity_hits)
327
+ is_unsafe = prompt_flagged or response_flagged
 
 
328
  status_key = "unsafe" if is_unsafe else "safe"
329
  status = STATUS_STYLES[status_key]
330
 
331
+ signal_phrases = []
332
+ if has_safety and safety_label == "unsafe":
333
+ signal_phrases.append("prompt safety predicted unsafe")
334
+ if toxicity_hits:
335
+ signal_phrases.append(f"{len(toxicity_hits)} prompt toxicity signal(s)")
336
+ if jailbreak_hits:
337
+ signal_phrases.append(f"{len(jailbreak_hits)} jailbreak signal(s)")
338
+ if response_unsafe_signal and not refusal_override:
339
+ signal_phrases.append("response safety predicted unsafe without a refusal")
340
+ if response_toxicity_hits:
341
+ signal_phrases.append(f"{len(response_toxicity_hits)} response toxicity signal(s)")
342
+
343
+ if signal_phrases:
344
+ summary = "GLiGuard flagged this run because " + ", ".join(signal_phrases) + "."
345
+ elif refusal_override:
346
+ summary = (
347
+ "Response safety predicted unsafe, but response refusal predicted a refusal, "
348
+ "which overrides unsafe in the benchmark-style response verdict."
349
+ )
350
+ else:
351
+ summary = "No selected task produced a harmful signal above the configured cutoffs."
352
 
353
  prompt_task_count = sum([has_safety, has_toxicity, has_jailbreak])
354
  response_task_count = sum([has_response_safety, has_response_toxicity, has_response_refusal])
355
  if prompt_task_count and not response_task_count:
356
+ summary = summary + " This run only used prompt-side moderation tasks."
357
  elif response_task_count and not prompt_task_count:
358
+ summary = summary + " This run only used response-side moderation tasks."
359
 
360
  top_risk = "None"
361
  if toxicity_hits:
 
369
 
370
  stats_cards = [
371
  _build_overview_card("Tasks Run", str(len(selected_tasks)), "Selected at inference time"),
372
+ _build_overview_card("Input Format", input_format, "Formatting passed into GLiGuard"),
373
+ _build_overview_card("Global Threshold", f"{threshold:.2f}", "Forwarded to classify_text"),
374
  ]
375
 
376
  prompt_cards = []
377
+ if prompt_task_count:
378
+ prompt_cards.append(
379
+ _build_overview_card("Prompt Verdict", "Flagged" if prompt_flagged else "Clear", "Unsafe if any prompt-side harmful signal fires")
380
+ )
381
  if has_safety:
382
  prompt_cards.append(
383
  _build_overview_card("Prompt Safety", _format_label(safety_label), f"Confidence {safety_confidence:.1%}")
 
392
  )
393
 
394
  response_cards = []
395
+ if response_task_count:
396
+ verdict_subtitle = "Benchmark-style response verdict"
397
+ if refusal_override:
398
+ verdict_subtitle = "Refusal overrides the unsafe response-safety signal"
399
+ response_cards.append(
400
+ _build_overview_card("Response Verdict", "Flagged" if response_flagged else "Clear", verdict_subtitle)
401
+ )
402
  if has_response_safety:
403
  response_cards.append(
404
  _build_overview_card("Response Safety", _format_label(response_safety_label), f"Confidence {response_safety_conf:.1%}")
 
412
  _build_overview_card("Response Refusal", _format_label(response_refusal_label), f"Confidence {response_refusal_conf:.1%}")
413
  )
414
 
415
+ decision_notes = [
416
+ f"Prompt-only runs use the raw prompt, while response-side runs use {input_format.lower()} formatting.",
417
+ (
418
+ f"Multi-label tasks keep the README default cls_threshold={MULTI_LABEL_THRESHOLD:.1f}, "
419
+ f"and the global threshold for this run was {threshold:.2f}."
420
+ ),
421
+ ]
422
+ if prompt_task_count:
423
+ decision_notes.append(
424
+ "Prompt verdict becomes unsafe when prompt safety predicts unsafe or any non-benign prompt toxicity or jailbreak label appears."
425
+ )
426
+ if response_task_count:
427
+ if refusal_override:
428
+ decision_notes.append(
429
+ "Response safety fired, but refusal overrode that signal, so the response verdict stayed clear unless response toxicity also fired."
430
+ )
431
+ else:
432
+ decision_notes.append(
433
+ "Response verdict becomes unsafe when response safety predicts unsafe without a refusal, or when response toxicity returns non-benign labels."
434
+ )
435
+
436
  result_sections = []
437
  if prompt_cards:
438
  result_sections.append(
 
453
  result_sections.append(
454
  _render_group("Response Toxicity", "Multi-label response harm classification", response_toxicity_hits, "#2563eb")
455
  )
456
+ result_sections.append(
457
+ _render_notes("Decision Logic", "How the demo aggregated the selected GLiGuard tasks", decision_notes)
458
+ )
459
 
460
  return f"""
461
  <div class="results-shell">
 
476
  """
477
 
478
 
479
def _message_state_html(icon: str, title: str, copy: str) -> str:
    """Render an ``empty-state`` panel with the given icon, title and copy."""
    return f"""
    <div class="empty-state">
        <div class="empty-icon">{icon}</div>
        <div class="empty-title">{html.escape(title)}</div>
        <div class="empty-copy">
            {html.escape(copy)}
        </div>
    </div>
    """


def classify_prompt(
    prompt_text: str,
    response_text: str,
    threshold: float,
    selected_tasks: list[str],
    progress=gr.Progress(track_tqdm=False),
) -> tuple[str, str]:
    """Validate the inputs, run the selected GLiGuard tasks and render the outcome.

    Args:
        prompt_text: User prompt; required when any prompt-side task is selected.
        response_text: Model response; required when any response-side task is
            selected.
        threshold: Global threshold forwarded to ``classify_text``.
        selected_tasks: Task names chosen in the UI. Names that are not keys of
            ``TASKS`` are ignored.
        progress: Gradio progress tracker, called with a fraction and a
            description before each stage.

    Returns:
        ``(result_html, status_html)``: the main panel (results or an
        empty-state message) and the runtime status banner.
    """
    prompt_text = (prompt_text or "").strip()
    response_text = (response_text or "").strip()

    if not prompt_text and not response_text:
        return _empty_state_html(), _idle_status_html()

    # Work only with recognised task names so the validation below, the schema
    # sent to the model and the reported task count all describe the same set.
    known_tasks = [task_name for task_name in (selected_tasks or []) if task_name in TASKS]
    if not known_tasks:
        return (
            _message_state_html(
                "🧭",
                "Select at least one task",
                "Choose one or more GLiGuard tasks before running inference.",
            ),
            _runtime_status_html(
                "Task selection needed",
                "Pick at least one prompt-side or response-side GLiGuard task before analyzing text.",
                tone="warning",
            ),
        )

    tasks = {task_name: TASKS[task_name] for task_name in known_tasks}
    # "jailbreak_detection" has no "prompt_" prefix but still reads the prompt.
    has_prompt_task = any(task.startswith("prompt_") or task == "jailbreak_detection" for task in known_tasks)
    has_response_task = any(task.startswith("response_") for task in known_tasks)

    if has_prompt_task and not prompt_text:
        return (
            _message_state_html(
                "✍️",
                "Add a prompt",
                "Prompt-side tasks require a prompt in the first text box.",
            ),
            _runtime_status_html(
                "Prompt required",
                "Prompt safety, prompt toxicity, and jailbreak detection all require prompt text.",
                tone="warning",
            ),
        )

    if has_response_task and not response_text:
        return (
            _message_state_html(
                "💬",
                "Add a response",
                "Response-side tasks require a model response in the response text box.",
            ),
            _runtime_status_html(
                "Response required",
                "Response safety, response toxicity, and response refusal need assistant output in the response box.",
                tone="warning",
            ),
        )

    inference_text, input_format = _build_inference_text(
        prompt_text=prompt_text,
        response_text=response_text,
        has_prompt_task=has_prompt_task,
        has_response_task=has_response_task,
    )

    progress(0.15, desc="Preparing GLiGuard schema")
    progress(0.4, desc="Loading GLiGuard model")
    try:
        model = _load_model()
    except Exception as exc:
        # UI boundary: any load failure is reported in the page instead of
        # propagating out of the event handler.
        return (
            _message_state_html(
                "⚠️",
                "GLiGuard could not load",
                "The demo could not initialize the checkpoint. Check your Hugging Face access and local model setup, then try again.",
            ),
            _runtime_status_html(
                "Model load failed",
                "The checkpoint did not initialize successfully.",
                tone="error",
                details=_format_exception(exc),
            ),
        )

    progress(0.8, desc="Running moderation")
    try:
        result = model.classify_text(
            text=inference_text,
            tasks=tasks,
            threshold=threshold,
            include_confidence=True,
        )
    except Exception as exc:
        # UI boundary: report the inference failure rather than raising.
        return (
            _message_state_html(
                "⚠️",
                "Inference did not complete",
                "GLiGuard loaded, but this moderation request failed before results could be rendered.",
            ),
            _runtime_status_html(
                "Inference failed",
                "The model was available, but this specific request raised an error.",
                tone="error",
                details=_format_exception(exc),
            ),
        )

    progress(1.0, desc="Rendering results")
    return (
        _build_result_html(result, known_tasks, threshold, input_format),
        _runtime_status_html(
            "Model ready",
            f"Ran {len(known_tasks)} task(s) using {input_format.lower()} formatting.",
            tone="ready",
        ),
    )
 
617
 
618
 
619
  DESCRIPTION = f"""
 
812
  )
813
  task_selector = gr.CheckboxGroup(
814
  choices=TASK_OPTIONS,
815
+ value=PROMPT_TASK_VALUES,
816
  label="Tasks to run",
817
+ info="Tasks auto-switch based on whether a response is present. You can still adjust them manually.",
818
  )
819
  with gr.Row():
820
  classify_btn = gr.Button("Analyze Content", variant="primary", size="lg")
 
830
  examples_per_page=8,
831
  )
832
 
833
+ response_input.change(
834
+ fn=_auto_select_tasks,
835
+ inputs=[response_input],
836
+ outputs=[task_selector],
837
+ )
838
+
839
  classify_btn.click(
840
  fn=classify_prompt,
841
  inputs=[prompt_input, response_input, threshold_slider, task_selector],
 
852
  outputs=[result_html],
853
  )
854
  clear_btn.click(
855
+ fn=lambda: ("", "", PROMPT_TASK_VALUES, _empty_state_html()),
856
  outputs=[prompt_input, response_input, task_selector, result_html],
857
  )
858
 
859
  if __name__ == "__main__":
860
+ demo.launch(theme=THEME, css=CUSTOM_CSS)