# opsgate / hyperparameters.py
# SidraMiconi's picture
# deploy OpsGate environment
# 5567ff6
"""
OpsGate Hyperparameters
All training, environment, and scoring config in one place.
Mirrors the centralized config pattern from MADDPG hyperparameters.py
and the weighted scoring system from RoboGraph safety_score.py.
Adjust these before each training run.
"""
# ═══════════════════════════════════════════════════════════════
# Environment
# ═══════════════════════════════════════════════════════════════
# Hard cap on tool calls; the episode ends once this many have been made.
MAX_STEPS_PER_EPISODE: int = 15

# Charged on every tool call, to push the agent toward efficiency.
TOOL_CALL_PENALTY: float = -0.05

# Charged for malformed arguments or an unknown tool.
INVALID_TOOL_PENALTY: float = -0.1

# Charged for breaking a business rule.
POLICY_VIOLATION_PENALTY: float = -0.5
# ═══════════════════════════════════════════════════════════════
# Safety Score — Weighted Multi-Metric Scoring (100 pts total)
# Modeled after RoboGraph's _compute_score() system
# ═══════════════════════════════════════════════════════════════
# Per-metric scoring table. Every entry carries "max_points" and a
# human-readable "description"; some also carry metric-specific penalty
# parameters. The "max_points" values add up to 100 (30+20+15+15+10+10).
SCORE_WEIGHTS = {
    # Correctness of the final state.
    "task_completion": {
        "max_points": 30,
        "description": "Correct final state across all tools",
    },
    # Business-rule adherence; a per-violation deduction is configured.
    "policy_compliance": {
        "max_points": 20,
        "penalty_per_violation": 10,
        "description": "No business rule violations",
    },
    # Call-count efficiency relative to an optimal number of calls.
    "tool_efficiency": {
        "max_points": 15,
        "optimal_calls": 4,
        "penalty_per_extra": 3,
        "description": "Fewest tool calls needed to complete task",
    },
    # Delivery of stakeholder notifications.
    "notification_completeness": {
        "max_points": 15,
        "description": "All stakeholder notifications delivered",
    },
    # Field-level correctness of the final state.
    "state_accuracy": {
        "max_points": 10,
        "description": "Precise field-level correctness in final state",
    },
    # Well-formedness of calls; a per-invalid-call deduction is configured.
    "action_hygiene": {
        "max_points": 10,
        "penalty_per_invalid": 5,
        "description": "No malformed or invalid calls",
    },
}
# Letter grade -> score cutoff, listed from highest to lowest.
# NOTE(review): presumably each value is the minimum score for that grade —
# confirm against the scoring code.
GRADE_THRESHOLDS = {
    "A": 90,
    "B": 80,
    "C": 70,
    "D": 60,
    "F": 0,
}

# Letter grade -> colour name, one entry per grade in GRADE_THRESHOLDS.
GRADE_COLORS = {
    "A": "emerald",
    "B": "blue",
    "C": "yellow",
    "D": "orange",
    "F": "red",
}
# Score cutoffs for the three-way verdict (PASS / HOLD / BLOCK).
# NOTE(review): presumably a score at or above "pass_min_score" is PASS, at or
# above "hold_min_score" is HOLD, and anything lower is BLOCK — confirm the
# exact comparison against the scoring code.
VERDICT_THRESHOLDS = {
    "pass_min_score": 90,
    "hold_min_score": 60,
}
# ═══════════════════════════════════════════════════════════════
# RL Reward Mapping
# ═══════════════════════════════════════════════════════════════
# Scalar rewards, one per verdict name (PASS / HOLD / BLOCK).
REWARD_PASS: float = 1.0
REWARD_HOLD: float = 0.3
REWARD_BLOCK: float = -0.5  # the only negative reward of the three
# ═══════════════════════════════════════════════════════════════
# Model
# ═══════════════════════════════════════════════════════════════
# Base model identifier.
MODEL_NAME: str = "unsloth/Llama-3.1-8B-Instruct"

# Maximum sequence length setting for the model.
MAX_SEQ_LENGTH: int = 4096

# LoRA adapter settings; alpha is set to twice the rank.
LORA_RANK: int = 16
LORA_ALPHA: int = 32

# Names of the modules the LoRA adapters target.
LORA_TARGETS = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
# ═══════════════════════════════════════════════════════════════
# GRPO Training
# ═══════════════════════════════════════════════════════════════
# GRPO training settings. The trainer that reads these is not part of this
# module, so the notes below describe the values only.

# Optimisation.
LEARNING_RATE: float = 5e-6
BATCH_SIZE: int = 4
GRADIENT_ACCUMULATION_STEPS: int = 4
NUM_TRAIN_EPOCHS: int = 3

# Sampling.
# NOTE(review): NUM_GENERATIONS is presumably the number of completions
# sampled per prompt — confirm against the trainer configuration.
NUM_GENERATIONS: int = 4
MAX_COMPLETION_LENGTH: int = 256
TEMPERATURE: float = 0.7

# Checkpointing and logging cadence, in steps.
SAVE_STEPS: int = 200
LOGGING_STEPS: int = 10
# ═══════════════════════════════════════════════════════════════
# Inference
# ═══════════════════════════════════════════════════════════════
# Evaluation-time generation settings: sampling temperature and token budget.
EVAL_TEMPERATURE: float = 0.1
EVAL_MAX_TOKENS: int = 256
# ═══════════════════════════════════════════════════════════════
# Paths
# ═══════════════════════════════════════════════════════════════
# Output locations, written relative to the current directory ("./").
CHECKPOINT_DIR: str = "./opsgate_checkpoints"
FINAL_MODEL_DIR: str = "./opsgate_final"

# Weights & Biases project name.
WANDB_PROJECT: str = "opsgate"