# Spaces: Running Running  (page-chrome residue captured during extraction; not part of the module)
"""
OpsGate Hyperparameters

All training, environment, and scoring config in one place.
Mirrors the centralized config pattern from MADDPG hyperparameters.py
and the weighted scoring system from RoboGraph safety_score.py.
Adjust these before each training run.
"""
# ───────────────────────────────────────────────────────────────
# Environment
# ───────────────────────────────────────────────────────────────
# Episode budget and per-event penalties. Every penalty is a negative
# number; the larger the magnitude, the more severe the event.
MAX_STEPS_PER_EPISODE = 15        # Max tool calls before episode ends
TOOL_CALL_PENALTY = -0.05         # Per tool call (forces efficiency)
INVALID_TOOL_PENALTY = -0.1       # Malformed args or unknown tool
POLICY_VIOLATION_PENALTY = -0.5   # Breaking a business rule
# ───────────────────────────────────────────────────────────────
# Safety Score — Weighted Multi-Metric Scoring (100 pts total)
# Modeled after RoboGraph's _compute_score() system
# ───────────────────────────────────────────────────────────────
# One entry per metric. "max_points" is the metric's share of the total;
# the six shares add up to 100 (30 + 20 + 15 + 15 + 10 + 10).
# The "penalty_per_*" and "optimal_calls" fields are consumed by the scorer,
# which is not in this file.
# NOTE(review): presumably each penalty is deducted from that metric's
# max_points per occurrence — confirm against the scoring code.
SCORE_WEIGHTS = {
    "task_completion": {
        "max_points": 30,
        "description": "Correct final state across all tools",
    },
    "policy_compliance": {
        "max_points": 20,
        "penalty_per_violation": 10,
        "description": "No business rule violations",
    },
    "tool_efficiency": {
        "max_points": 15,
        "optimal_calls": 4,
        "penalty_per_extra": 3,
        "description": "Fewest tool calls needed to complete task",
    },
    "notification_completeness": {
        "max_points": 15,
        "description": "All stakeholder notifications delivered",
    },
    "state_accuracy": {
        "max_points": 10,
        "description": "Precise field-level correctness in final state",
    },
    "action_hygiene": {
        "max_points": 10,
        "penalty_per_invalid": 5,
        "description": "No malformed or invalid calls",
    },
}
# Letter grades, listed from best to worst, each mapped to a score value.
# NOTE(review): presumably the value is the minimum score that earns the
# grade — confirm against the grading code.
GRADE_THRESHOLDS = {"A": 90, "B": 80, "C": 70, "D": 60, "F": 0}

# Color name associated with each letter grade (same keys as GRADE_THRESHOLDS).
GRADE_COLORS = {"A": "emerald", "B": "blue", "C": "yellow", "D": "orange", "F": "red"}
# 3-way verdict: PASS / HOLD / BLOCK
# Two cut-offs split the score range into the three verdicts.
# NOTE(review): presumably score >= pass_min_score is PASS, score >=
# hold_min_score is HOLD, anything lower is BLOCK — confirm against the caller.
VERDICT_THRESHOLDS = {
    "pass_min_score": 90,
    "hold_min_score": 60,
}
# ───────────────────────────────────────────────────────────────
# RL Reward Mapping
# ───────────────────────────────────────────────────────────────
# One scalar reward per verdict name (PASS / HOLD / BLOCK).
REWARD_PASS = 1.0
REWARD_HOLD = 0.3
REWARD_BLOCK = -0.5
# ───────────────────────────────────────────────────────────────
# Model
# ───────────────────────────────────────────────────────────────
# Base model identifier and LoRA adapter settings.
MODEL_NAME = "unsloth/Llama-3.1-8B-Instruct"
MAX_SEQ_LENGTH = 4096
LORA_RANK = 16
LORA_ALPHA = 32
# Names of the modules targeted by LoRA.
# NOTE(review): these look like the attention and MLP projection layers of
# the base model — confirm against the model's module names.
LORA_TARGETS = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
# ───────────────────────────────────────────────────────────────
# GRPO Training
# ───────────────────────────────────────────────────────────────
# Optimizer, batching, and sampling settings for the GRPO training run.
LEARNING_RATE = 5e-6
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
NUM_GENERATIONS = 4
NUM_TRAIN_EPOCHS = 3
SAVE_STEPS = 200
LOGGING_STEPS = 10
MAX_COMPLETION_LENGTH = 256
TEMPERATURE = 0.7
# ───────────────────────────────────────────────────────────────
# Inference
# ───────────────────────────────────────────────────────────────
# Sampling settings used at evaluation time.
EVAL_TEMPERATURE = 0.1
EVAL_MAX_TOKENS = 256
# ───────────────────────────────────────────────────────────────
# Paths
# ───────────────────────────────────────────────────────────────
# Output directories are written relative to the working directory ("./").
CHECKPOINT_DIR = "./opsgate_checkpoints"
FINAL_MODEL_DIR = "./opsgate_final"
# A project name rather than a filesystem path.
WANDB_PROJECT = "opsgate"