Spaces:

rithwik-ravikumar
/

OpenEnv-Dynamic-Guardrails

Sleeping

App Files Files Community

Rithwik Ravi committed on Apr 25

Commit

005d862

1 Parent(s): f421da5

fix: anchor env/ in gitignore to prevent excluding src/env package

Browse files

Files changed (5) hide show

.gitignore +1 -1
src/env/__init__.py +0 -0
src/env/guardrail.py +99 -0
src/env/models.py +46 -0
src/env/reward.py +14 -0

.gitignore CHANGED Viewed

@@ -1,7 +1,7 @@
 # Environments
 .venv/
 venv/
-env/
 # Python Cache
 __pycache__/

 # Environments
 .venv/
 venv/
+/env/
 # Python Cache
 __pycache__/

src/env/__init__.py ADDED Viewed

File without changes

src/env/guardrail.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import json
+import re
+import math
+from typing import List, Any
+from collections import Counter
+from pydantic import ValidationError
+from src.env.models import GuardrailGraph, Action, Observation, extract_and_clean_json
def calculate_entropy(text: str) -> float:
    """Return the Shannon entropy of *text*, in bits per character.

    The entropy is computed over the frequency of each distinct character.

    Args:
        text: The string to measure.

    Returns:
        ``0.0`` for an empty string or a string made of one repeated
        character; otherwise a positive number of bits.
    """
    if not text:
        return 0.0
    total = len(text)
    entropy = -sum(
        (count / total) * math.log2(count / total)
        for count in Counter(text).values()
    )
    # A single repeated symbol sums to 0.0, and negating that gives -0.0.
    # Adding +0.0 normalizes the sign without changing any non-zero result.
    return entropy + 0.0
def _numeric_threshold(value) -> float:
    """Return *value* as a numeric threshold, or ``+inf`` if it is not numeric.

    Numbers are returned unchanged. Strings that spell a number (for example
    ``"50"``) are converted, because the graph schema also admits string
    values and a quoted number would otherwise disable the filter silently.
    """
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            return float(value)
        except ValueError:
            pass
    # An unusable threshold can never be exceeded, so the filter never fires.
    return float('inf')


def evaluate_node(node: dict, text: str) -> bool:
    """Evaluate one guardrail graph node against *text*.

    A node is treated as a leaf filter when it has a ``"filter_type"`` key and
    as a logic node (``"operator"`` + ``"children"``) otherwise.

    Args:
        node: Plain-dict form of a filter or logic node.
        text: The sample the node is checked against.

    Returns:
        ``True`` when the node matches *text*. Unknown filter types, unknown
        operators, logic nodes without children, and unusable regex patterns
        all evaluate to ``False``.
    """
    # Check if it's a SemanticFilter
    if "filter_type" in node:
        filter_type = node["filter_type"]
        val = node.get("value")
        if filter_type == "substring":
            return str(val).lower() in text.lower()
        if filter_type == "regex_pattern":
            try:
                return bool(re.search(str(val), text))
            except (re.error, OverflowError, RecursionError):
                # Besides re.error, the pattern compiler raises OverflowError
                # for oversized repeat counts and may hit the recursion limit
                # on deeply nested groups; all mean "pattern unusable".
                # NOTE(review): patterns come from model output, so
                # catastrophic backtracking is still possible here.
                return False
        if filter_type == "length_limit":
            return len(text) > _numeric_threshold(val)
        if filter_type == "entropy_threshold":
            return calculate_entropy(text) > _numeric_threshold(val)
        if filter_type == "keyword_match":
            # Whole-token match on whitespace-split words; punctuation stays
            # attached to the token it touches.
            return str(val).lower() in text.lower().split()
        return False

    # Else it's a LogicNode
    operator = node.get("operator")
    children = node.get("children", [])
    if not children:
        # Every operator evaluates to False when it has nothing to combine.
        return False
    if operator == "AND":
        return all(evaluate_node(child, text) for child in children)
    if operator == "OR":
        return any(evaluate_node(child, text) for child in children)
    if operator == "NOT":
        # Only the first child is negated; any further children are ignored.
        return not evaluate_node(children[0], text)
    return False
try:
    from openenv import BaseEnvironment
except ImportError:
    # Fall back to an empty base class so this module still imports when
    # the ``openenv`` package cannot be imported.
    class BaseEnvironment:
        """Empty stand-in base class used when ``openenv`` is unavailable."""
class GuardrailEnvironment(BaseEnvironment):
    """Environment that scores a guardrail graph against labelled samples.

    ``reset`` stores the adversarial and benign samples; ``step`` parses the
    graph proposed by an action and measures how many samples of each kind it
    flags.
    """

    def __init__(self):
        super().__init__()
        # Holds the current Observation; None until reset() has been called.
        self.state = None

    def reset(self, adversarial_samples: List[str], benign_samples: List[str]) -> Observation:
        """Store the sample sets for the next evaluation and return them.

        Args:
            adversarial_samples: Texts the guardrail should flag.
            benign_samples: Texts the guardrail should let through.

        Returns:
            The new observation, which is also kept in ``self.state``.
        """
        self.state = Observation(
            adversarial_samples=adversarial_samples,
            benign_samples=benign_samples
        )
        return self.state

    @staticmethod
    def _hit_rate(root_node: dict, samples: List[str]) -> float:
        """Return the fraction of *samples* matched by the graph (0.0 if empty)."""
        if not samples:
            return 0.0
        hits = sum(1 for text in samples if evaluate_node(root_node, text))
        return hits / len(samples)

    def step(self, action: Action) -> tuple[float, float, bool]:
        """
        Returns (recall, fpr, syntax_error)

        ``recall`` is the fraction of adversarial samples the graph flags and
        ``fpr`` the fraction of benign samples it flags. When the action's
        JSON cannot be parsed or validated, or the graph is nested too deeply
        to process, the result is ``(0.0, 0.0, True)``.

        Raises:
            RuntimeError: If a valid graph is submitted before ``reset()``
                has supplied any samples.
        """
        try:
            clean_json = extract_and_clean_json(action.ast_json)
            parsed_ast = json.loads(clean_json)
            ast_wrapper = GuardrailGraph.model_validate(parsed_ast)
        except (json.JSONDecodeError, ValidationError, RecursionError):
            # json.loads raises RecursionError (not JSONDecodeError) on
            # extremely deep nesting; treat that as an invalid graph too.
            return 0.0, 0.0, True

        if self.state is None:
            raise RuntimeError("step() called before reset(); no samples to evaluate")

        # We evaluate against raw dict to avoid recursive pydantic object overhead
        root_node = ast_wrapper.model_dump().get("root", {})

        try:
            # Recall: true-positive rate on the adversarial samples.
            recall = self._hit_rate(root_node, self.state.adversarial_samples)
            # FPR: false-positive rate on the benign samples.
            fpr = self._hit_rate(root_node, self.state.benign_samples)
        except RecursionError:
            # A graph that parses but is too deep to walk cannot be scored.
            return 0.0, 0.0, True
        return recall, fpr, False

src/env/models.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from typing import List, Union, Literal, Optional
+from pydantic import BaseModel, ConfigDict
+import json
+import re
class SemanticFilter(BaseModel):
    """Leaf node of a guardrail graph: a single check with one operand."""

    # Which kind of check this leaf performs; limited to the listed names.
    filter_type: Literal[
        "substring",
        "regex_pattern",
        "length_limit",
        "entropy_threshold",
        "keyword_match",
    ]
    # Operand for the check: text for string-based filters, a number for
    # the length and entropy thresholds.
    value: Union[str, int, float]
class LogicNode(BaseModel):
    """Interior node of a guardrail graph combining child nodes with a boolean operator."""

    # Boolean operator applied to the children.
    operator: Literal["AND", "OR", "NOT"]
    # Operands: nested logic nodes and/or leaf filters. The string forward
    # reference lets the model refer to itself.
    children: List[
        Union["LogicNode", SemanticFilter]
    ]
class GuardrailGraph(BaseModel):
    """Top-level guardrail document: identifying metadata plus the root logic node."""

    # Identifier for this graph.
    graph_id: str
    # Free-text description of the graph.
    description: str
    # Root of the filter tree; always a logic node, never a bare filter.
    root: LogicNode


# Resolve the self-referencing "LogicNode" forward reference in its children field.
LogicNode.model_rebuild()
class Observation(BaseModel):
    """The two labelled sets of text samples a guardrail is scored against."""

    # Samples used to measure recall (each one should be flagged).
    adversarial_samples: List[str]
    # Samples used to measure the false-positive rate (none should be flagged).
    benign_samples: List[str]
class Action(BaseModel):
    """A proposed guardrail, carried as raw JSON text."""

    # The model outputs a JSON string representing the GuardrailGraph
    ast_json: str
    # Optional second JSON string; presumably a baseline graph to compare
    # against -- TODO confirm, no code visible here reads it.
    baseline_ast_json: Optional[str] = None
class StepResult(BaseModel):
    """Bundle of observation, reward, done flag and info for one step."""

    # Observation associated with the step.
    observation: Observation
    # Scalar reward for the step.
    reward: float
    # Whether the episode has finished.
    done: bool
    # Free-form extra data; no schema is enforced.
    info: dict
# First fenced markdown block, with or without a "json" language tag.
_FENCED_BLOCK_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL)

# Either a complete JSON string literal (group 1) or a comma followed only by
# whitespace and a closing brace/bracket (group 2 holds that closer).
_STRING_OR_TRAILING_COMMA_RE = re.compile(r'("(?:\\.|[^"\\])*")|,\s*([}\]])')


def extract_and_clean_json(text: str) -> str:
    """Pull JSON text out of a model response and drop trailing commas.

    If *text* contains a fenced markdown block, only the contents of the
    first one are kept. Commas that directly precede a closing ``}`` or ``]``
    are then removed.

    String literals are copied through untouched, so values that merely
    contain ``,}`` or ``,]`` (for example the regex ``\\d{2,}``) are not
    altered.

    Args:
        text: Raw text that should contain a JSON document.

    Returns:
        The cleaned, whitespace-stripped JSON text. It is not parsed or
        validated here.
    """
    text = text.strip()
    fenced = _FENCED_BLOCK_RE.search(text)
    if fenced:
        text = fenced.group(1)

    def _keep_string_or_closer(match):
        # Group 1 is a whole string literal (never empty: it includes its
        # quotes); otherwise emit just the closer, dropping the comma.
        return match.group(1) or match.group(2)

    cleaned = _STRING_OR_TRAILING_COMMA_RE.sub(_keep_string_or_closer, text)
    return cleaned.strip()

src/env/reward.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import math
class LogBarrierReward:
    """Reward that credits recall and penalizes false positives logarithmically.

    With the default settings the reward is
    ``(1.0 * recall) - (2.0 * math.log1p(fpr))``, and a syntax error yields a
    flat ``-10.0``.
    """

    # Class-level defaults, used when no overrides are passed to __init__.
    recall_weight: float = 1.0
    fpr_weight: float = 2.0
    syntax_penalty: float = -10.0

    def __init__(
        self,
        recall_weight: float = 1.0,
        fpr_weight: float = 2.0,
        syntax_penalty: float = -10.0,
    ) -> None:
        """Configure the reward.

        Args:
            recall_weight: Multiplier applied to recall.
            fpr_weight: Multiplier applied to ``log1p(fpr)``.
            syntax_penalty: Reward returned when the action had a syntax error.
        """
        self.recall_weight = recall_weight
        self.fpr_weight = fpr_weight
        self.syntax_penalty = syntax_penalty

    def calculate(self, recall: float, fpr: float, syntax_error: bool = False) -> float:
        """
        Recall = True Positive Rate on adversarial
        FPR = False Positive Rate on benign
        Reward = (recall_weight * Recall) - (fpr_weight * math.log1p(FPR))

        Returns ``syntax_penalty`` instead when ``syntax_error`` is true;
        ``recall`` and ``fpr`` are ignored in that case.
        """
        if syntax_error:
            return float(self.syntax_penalty)
        reward = (self.recall_weight * recall) - (self.fpr_weight * math.log1p(fpr))
        return float(reward)