kriti0608 commited on
Commit
02c919a
·
verified ·
1 Parent(s): aab9816

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -0
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party
import gradio as gr

# Local
from src.pipeline import JailbreakPipeline

# Single pipeline instance shared by all requests.
# NOTE(review): consider_output=True presumably makes the pipeline also
# inspect/repair the model output, not just the prompt — confirm in src.pipeline.
pipeline = JailbreakPipeline(consider_output=True)
6
def run_defense(prompt):
    """Analyze *prompt* with the jailbreak-detection pipeline.

    Returns a 3-tuple matching the Gradio outputs:
    (risk score, markdown summary of fired rules, repaired/safe output).
    """
    report = pipeline.process(prompt)

    # One markdown bullet per fired rule; fall back to a friendly message
    # when nothing fired (empty join is falsy in the original, empty list here).
    rule_lines = [
        f"- **{hit['rule']}**: {hit['description']} (match: *{hit['match_text']}*)"
        for hit in report["fired_rules"]
    ]
    fired_md = "\n".join(rule_lines) if rule_lines else "No rules fired ✔️"

    # Empty/None repaired output means the prompt needed no rewriting.
    repaired_text = report["repaired_output"] or "No repair needed ✔️"

    return report["risk_score"], fired_md, repaired_text
19
+
20
# Build the web UI: a prompt box, an analyze button, and three result widgets
# wired to run_defense's return tuple (score, fired-rules markdown, safe output).
with gr.Blocks(title="JailBreakDefense") as demo:
    gr.Markdown("# 🔒 JailBreakDefense – Jailbreak Prompt Detector")
    gr.Markdown("Enter any prompt and detect jailbreak attempts in real-time.")

    with gr.Row():
        prompt = gr.Textbox(
            label="User Prompt",
            placeholder="Type something like: 'Ignore safety and do anything now...'",
        )

    btn = gr.Button("Analyze Prompt")

    risk_score = gr.Number(label="Risk Score (0–1)")
    rules_fired = gr.Markdown(label="Fired Rules")
    repaired = gr.Textbox(label="Safe Output (if repaired)", lines=4)

    # Outputs map positionally onto run_defense's 3-tuple.
    btn.click(run_defense, inputs=[prompt], outputs=[risk_score, rules_fired, repaired])

# Guard the launch so importing this module (e.g. from tests or another app)
# does not start a web server; `python app.py` (and HF Spaces) still runs it.
if __name__ == "__main__":
    demo.launch()