# phi4-guardrail / configuration_sentinel.py
# Uploaded with huggingface_hub (commit 3a0dc65, verified), author: shri-ads
from transformers import PretrainedConfig
class SentinelConfig(PretrainedConfig):
    """
    Configuration for SentinelGuardedPhi.

    Holds source IDs for both sub-models and guardrail hyperparameters.
    Neither model's weights are stored in this repo — they are pulled from
    HuggingFace Hub at load time using these IDs.
    """

    # Registered under this key so AutoConfig can dispatch to this class.
    model_type = "sentinel_guarded_phi"

    def __init__(
        self,
        phi_model_id: str = "microsoft/Phi-4-mini-instruct",
        guard_model_id: str = "meta-llama/Llama-Prompt-Guard-2-86M",
        guard_threshold: float = 0.5,
        blocked_response: str = "I'm not able to assist with that.",
        **kwargs,
    ):
        """
        Args:
            phi_model_id: HF repo ID for the base Phi-4-mini-instruct model.
            guard_model_id: HF repo ID for the Llama-Prompt-Guard-2-86M classifier.
            guard_threshold: Probability threshold above which a prompt is blocked.
                A float in [0, 1]. Lower = stricter.
            blocked_response: Static string returned when the guardrail fires.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.

        Raises:
            ValueError: If ``guard_threshold`` is outside [0, 1].
        """
        # Enforce the documented contract: an out-of-range threshold would
        # silently make the guardrail always-block or never-block.
        if not 0.0 <= guard_threshold <= 1.0:
            raise ValueError(
                f"guard_threshold must be a float in [0, 1], got {guard_threshold!r}"
            )
        self.phi_model_id = phi_model_id
        self.guard_model_id = guard_model_id
        self.guard_threshold = guard_threshold
        self.blocked_response = blocked_response
        super().__init__(**kwargs)