kriti0608 committed on
Commit
be89fdb
·
verified ·
1 Parent(s): 04f65f0

Update src/pipeline.py

Browse files
Files changed (1) hide show
  1. src/pipeline.py +10 -35
src/pipeline.py CHANGED
@@ -1,17 +1,12 @@
1
  from dataclasses import dataclass
2
  from typing import Dict, Any
 
3
  from .detector import JailbreakDetector
4
  from .repair import RepairEngine
5
 
6
 
7
  @dataclass
8
  class JailbreakPipeline:
9
- """
10
- High-level wrapper around:
11
- - jailbreak detection
12
- - prompt repair
13
- """
14
-
15
  consider_output: bool = False
16
 
17
  def __post_init__(self):
@@ -19,38 +14,18 @@ class JailbreakPipeline:
19
  self.repair_engine = RepairEngine()
20
 
21
  def detect(self, prompt: str):
22
- """
23
- Run jailbreak rule matching
24
- """
25
  return self.detector.score(prompt)
26
 
27
- def repair_prompt(self, prompt: str) -> str:
28
- """
29
- Rewrite unsafe content into a safe, generic assistant response.
30
- """
31
- return self.repair_engine.repair(prompt)
32
-
33
  def process(self, prompt: str) -> Dict[str, Any]:
34
- """
35
- Unified API called from Gradio UI
36
- Returns:
37
- {
38
- risk_score: float,
39
- fired_rules: list,
40
- safe_output: str,
41
- metadata: dict
42
- }
43
- """
44
- result = self.detect(prompt)
45
-
46
- # pass the risk score (or the full result) depending on your RepairEngine API
47
- safe = self.repair_engine.repair(prompt, result.risk_score)
48
 
49
- return {
50
- "risk_score": result.risk_score,
51
- "fired_rules": result.fired_rules,
52
- "safe_output": safe,
53
- "metadata": result.metadata,
54
- }
55
 
 
 
 
 
 
 
56
 
 
1
  from dataclasses import dataclass
2
  from typing import Dict, Any
3
+
4
  from .detector import JailbreakDetector
5
  from .repair import RepairEngine
6
 
7
 
8
  @dataclass
9
  class JailbreakPipeline:
 
 
 
 
 
 
10
  consider_output: bool = False
11
 
12
  def __post_init__(self):
 
14
  self.repair_engine = RepairEngine()
15
 
16
  def detect(self, prompt: str):
 
 
 
17
  return self.detector.score(prompt)
18
 
 
 
 
 
 
 
19
  def process(self, prompt: str) -> Dict[str, Any]:
20
+ result = self.detect(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ # If your RepairEngine accepts only (prompt), change this to repair(prompt)
23
+ safe = self.repair_engine.repair(prompt, result.risk_score)
 
 
 
 
24
 
25
+ return {
26
+ "risk_score": result.risk_score,
27
+ "fired_rules": result.fired_rules,
28
+ "safe_output": safe,
29
+ "metadata": result.metadata,
30
+ }
31