""" Shared utilities for the ShadowOps Qwen3 SFT + GRPO training pipeline. This module keeps dataset generation, action parsing, reward shaping, baseline evaluation, oracle checks, smoke tests, and report generation on one code path so SFT, GRPO, and final validation cannot drift apart. """ from __future__ import annotations import argparse import contextlib import copy import importlib import importlib.abc import importlib.machinery import importlib.util import inspect import json import math import random import re import statistics import subprocess import sys import time import warnings from collections import Counter from dataclasses import dataclass, field from importlib import import_module from importlib.metadata import PackageNotFoundError, version as package_version from pathlib import Path from typing import Any, Iterable, Optional from packaging.version import Version BACKEND_DIR = Path(__file__).resolve().parents[1] TRAINING_DIR = BACKEND_DIR / "training" CHECKPOINT_DIR = TRAINING_DIR / "checkpoints" if str(BACKEND_DIR) not in sys.path: sys.path.insert(0, str(BACKEND_DIR)) from shadowops_env import ( # noqa: E402 ACTIONS, OBS_DIM, ScenarioGenerator, build_llama_prompt, extract_features, ) MODEL_OPTIONS = { "4b": "unsloth/Qwen3-4B-Base", "1.7b": "unsloth/Qwen3-1.7B", "8b": "unsloth/Qwen3-8B-Base", } VALID_ACTIONS = tuple(ACTIONS.values()) VALID_ACTION_SET = set(VALID_ACTIONS) ACTION_RE = re.compile(r"\b(ALLOW|BLOCK|FORK|QUARANTINE)\b", re.IGNORECASE) ACTION_OR_SYNONYM_RE = re.compile( r"\b(ALLOW|BLOCK|FORK|QUARANTINE|APPROVE|APPROVED|DENY|DENIED|REJECT|REJECTED|HOLD|ISOLATE|ESCALATE|REVIEW|HUMAN_REVIEW|HUMAN REVIEW)\b", re.IGNORECASE, ) ACTION_PREFIX_RE = re.compile(r"^\s*(?:action|decision|supervisor decision)\s*[:=\-]\s*", re.IGNORECASE) ACTION_SYNONYMS = { "APPROVE": "ALLOW", "APPROVED": "ALLOW", "DENY": "BLOCK", "DENIED": "BLOCK", "REJECT": "BLOCK", "REJECTED": "BLOCK", "HOLD": "QUARANTINE", "ISOLATE": "QUARANTINE", "ESCALATE": "FORK", "REVIEW": "FORK", "HUMAN REVIEW": "FORK", "HUMAN_REVIEW": "FORK", } THINK_BLOCK_RE = re.compile(r".*?", re.IGNORECASE | re.DOTALL) SAFE_ACTION_PREFIX_RE = re.compile( r"^(?:i\s+recommend|recommended\s+action(?:\s+is)?|decision(?:\s+is)?|" r"the\s+decision\s+is|action(?:\s+is)?|choose|result)\b", re.IGNORECASE, ) JSON_ACTION_KEYS = ( "action", "decision", "supervisor_action", "supervisor_decision", "recommended_action", ) DEFAULT_QAWARE_POLICY_CONFIG = { "version": 1, "medium_ambiguous_network_to_quarantine": False, "ambiguous_network_quarantine_terms": [ "security group", "allow ", "open port", "opened on", "whitelist", "alert suppression", "vendor claims", "monitoring", "port 3389", "port 6379", "port 27017", "port 4444", ], "min_reward_improvement": 0.0, "safety_floor": 1.0, "max_unsafe_decision_rate": 0.0, "max_false_positive_rate": 0.01, } _QAWARE_POLICY_CONFIG_CACHE: Optional[dict[str, Any]] = None DEFAULT_TRAIN_SIZE = 500 DEFAULT_VAL_SIZE = 100 DEFAULT_TRAIN_SEED = 42 DEFAULT_VAL_SEED = 10_042 DEFAULT_SFT_OUTPUT_DIR = TRAINING_DIR / "checkpoints" / "qwen3_sft_adapter" DEFAULT_GRPO_OUTPUT_DIR = TRAINING_DIR / "checkpoints" / "qwen3_sft_grpo_adapter" DEFAULT_SFT_METRICS_PATH = TRAINING_DIR / "sft_metrics.json" DEFAULT_DATASET_AUDIT_PATH = TRAINING_DIR / "dataset_audit.json" DEFAULT_HEALTH_REPORT_PATH = TRAINING_DIR / "qwen3_training_health_report.json" DEFAULT_FINAL_REPORT_JSON = TRAINING_DIR / "final_training_report.json" DEFAULT_FINAL_REPORT_MD = TRAINING_DIR / "final_training_report.md" DEFAULT_DEMO_BENCHMARK_JSON = TRAINING_DIR / "demo_benchmark_report.json" DEFAULT_DEMO_BENCHMARK_MD = TRAINING_DIR / "demo_benchmark_report.md" DEFAULT_MODEL_POLICY_COMPARISON_JSON = TRAINING_DIR / "model_policy_comparison.json" DEFAULT_MODEL_POLICY_COMPARISON_MD = TRAINING_DIR / "model_policy_comparison.md" DEFAULT_MODEL_EVAL_JSON = TRAINING_DIR / "model_eval_report.json" DEFAULT_MODEL_EVAL_MD = TRAINING_DIR / "model_eval_report.md" DEFAULT_QAWARE_POLICY_CONFIG_JSON = TRAINING_DIR / "qaware_policy_config.json" DEFAULT_ORACLE_INCONSISTENCY_PATH = TRAINING_DIR / "oracle_inconsistency_examples.json" DEFAULT_TRAIN_DATASET_PATH = TRAINING_DIR / "qwen3_train_dataset.json" DEFAULT_VAL_DATASET_PATH = TRAINING_DIR / "qwen3_val_dataset.json" DEFAULT_HARD_NEGATIVE_PATH = TRAINING_DIR / "hard_negative_scenarios.json" DEFAULT_CLOUD_SCRIPT_PATH = TRAINING_DIR / "run_full_training_colab.py" DEFAULT_CLOUD_PS1_PATH = TRAINING_DIR / "run_full_training.ps1" SFT_SYSTEM_PROMPT = ( "You are ShadowOps, a cybersecurity decision policy." ) ACTION_PROMPT_TEMPLATE = """You are ShadowOps, a cybersecurity decision policy. Choose exactly one action: ALLOW, BLOCK, FORK, QUARANTINE. Rules: ALLOW = safe benign activity. BLOCK = clearly malicious or high-risk activity. FORK = suspicious or uncertain activity that needs investigation. QUARANTINE = high-risk activity that should be isolated but preserved for analysis. Return only one action. Do not explain. Do not use tags. Observation: {observation} Answer: """ CLOUD_SFT_COMMAND = ( "python training/train_qwen3_sft.py " "--model-name unsloth/Qwen3-1.7B " "--sft-epochs 2 " "--batch-size 1 " "--grad-accum 8 " "--max-seq-len 256 " "--learning-rate 2e-4 " "--sft-output-dir training/checkpoints/qwen3_sft_adapter" ) CLOUD_GRPO_COMMAND = ( "python training/train_qwen3_grpo.py " "--model-name unsloth/Qwen3-1.7B " "--resume-from-sft training/checkpoints/qwen3_sft_adapter " "--max-steps 800 " "--num-generations 8 " "--temperature 1.0 " "--top-p 0.95 " "--top-k 50 " "--max-new-tokens 8 " "--batch-size 1 " "--grad-accum 4 " "--val-eval-eps 100 " "--eval-batch-size 4 " "--learning-rate 1e-5 " "--output-dir training/checkpoints/qwen3_sft_grpo_adapter" ) CLOUD_FALLBACK_COMMAND = ( "python training/train_qwen3_grpo.py " "--model-name unsloth/Qwen3-1.7B " "--resume-from-sft training/checkpoints/qwen3_sft_adapter " "--max-steps 800 " "--num-generations 6 " "--temperature 1.0 " "--top-p 0.95 " "--top-k 50 " "--max-new-tokens 8 " "--batch-size 1 " "--grad-accum 8 " "--val-eval-eps 50 " "--eval-batch-size 4 " "--learning-rate 1e-5 " "--output-dir training/checkpoints/qwen3_sft_grpo_adapter" ) BROKEN_ITEMS = [ "Action parsing accepted noisy outputs inconsistently and let tags leak into metrics.", "Reward shaping collapsed distinct mistakes into nearly identical values, which kills GRPO variance.", "Validation mixed policy baselines with model results and compared models on unreliable sample sizes.", "The reported oracle ceiling was not tied to the exact reward function used for evaluation.", "Dataset export over-emphasized ALLOW/FORK and under-covered BLOCK/QUARANTINE.", "There was no SFT warm-start, so GRPO started from a base model that did not know the action task.", "Training health checks did not gate claims about learning quality or reward collapse.", ] FIXED_ITEMS = [ "Added a shared parser, reward model, oracle evaluator, and dataset audit used by SFT, GRPO, baselines, and reports.", "Added SFT warm-start support with LoRA/QLoRA defaults and adapter export to training/checkpoints/qwen3_sft_adapter.", "Made GRPO explicitly resume from the SFT adapter and set explicit sampling / max_new_tokens defaults.", "Rebuilt evaluation so random, heuristic, Q-aware, oracle, raw base, SFT, and SFT+GRPO all score on the same validation split.", "Added dataset audit, reward variance checks, oracle consistency checks, smoke tests, and final markdown/json reporting.", "Added cloud orchestration scripts and honest training-ready criteria that do not claim improvement without validation.", ] HARD_ATTACK_CUES = ( "curl http://", "| bash", "socket.socket", "grant admin", "disable 2fa", "0.0.0.0/0", "administratoraccess", "public-read", "export all secrets", "authorize-security-group-ingress", "rogue", "backdoor", "hacker_admin", ) SOFT_ATTACK_CUES = ( "[system", "[override", "ignore policy", "ignore previous", "ignore security checks", "approve without review", "bypass standard review process", "disable encryption", "disable_firewall", "ignore alerts", "suppress all monitoring", "create new iam user", "set webhook url to http://", "