from transformers import PretrainedConfig


class SentinelConfig(PretrainedConfig):
    """
    Configuration for SentinelGuardedPhi.

    Holds source IDs for both sub-models and guardrail hyperparameters.
    Neither model's weights are stored in this repo — they are pulled from
    HuggingFace Hub at load time using these IDs.
    """

    # Registered model type string used by transformers' AutoConfig mapping.
    model_type = "sentinel_guarded_phi"

    def __init__(
        self,
        phi_model_id: str = "microsoft/Phi-4-mini-instruct",
        guard_model_id: str = "meta-llama/Llama-Prompt-Guard-2-86M",
        guard_threshold: float = 0.5,
        blocked_response: str = "I'm not able to assist with that.",
        **kwargs,
    ):
        """
        Args:
            phi_model_id: HF repo ID for the base Phi-4-mini-instruct model.
            guard_model_id: HF repo ID for the Llama-Prompt-Guard-2-86M classifier.
            guard_threshold: Probability threshold above which a prompt is blocked.
                A float in [0, 1]. Lower = stricter.
            blocked_response: Static string returned when the guardrail fires.
            **kwargs: Forwarded to ``PretrainedConfig`` (e.g. serialization
                metadata supplied when reloading from a saved config).
        """
        self.phi_model_id = phi_model_id
        self.guard_model_id = guard_model_id
        self.guard_threshold = guard_threshold
        self.blocked_response = blocked_response
        # HF convention: set custom attributes first, then delegate the
        # remaining kwargs to the base config constructor.
        super().__init__(**kwargs)