"""
Shared utilities for the ShadowOps Qwen3 SFT + GRPO training pipeline.
This module keeps dataset generation, action parsing, reward shaping, baseline
evaluation, oracle checks, smoke tests, and report generation on one code path
so SFT, GRPO, and final validation cannot drift apart.
"""
from __future__ import annotations
import argparse
import contextlib
import copy
import importlib
import importlib.abc
import importlib.machinery
import importlib.util
import inspect
import json
import math
import random
import re
import statistics
import subprocess
import sys
import time
import warnings
from collections import Counter
from dataclasses import dataclass, field
from importlib import import_module
from importlib.metadata import PackageNotFoundError, version as package_version
from pathlib import Path
from typing import Any, Iterable, Optional
from packaging.version import Version
# Project directories, resolved relative to this file: BACKEND_DIR is the parent
# of the directory that contains this module.
BACKEND_DIR = Path(__file__).resolve().parents[1]
TRAINING_DIR = BACKEND_DIR / "training"
CHECKPOINT_DIR = TRAINING_DIR / "checkpoints"

# Put the backend root on sys.path (once) before the `shadowops_env` import that
# follows; the guard avoids inserting a duplicate entry on re-import.
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))
from shadowops_env import ( # noqa: E402
ACTIONS,
OBS_DIM,
ScenarioGenerator,
build_llama_prompt,
extract_features,
)
# Short size alias -> model identifier string.
# Note the 1.7B entry has no "-Base" suffix, unlike the 4B and 8B entries.
MODEL_OPTIONS = {
    "4b": "unsloth/Qwen3-4B-Base",
    "1.7b": "unsloth/Qwen3-1.7B",
    "8b": "unsloth/Qwen3-8B-Base",
}
# Action labels taken from the environment's ACTIONS mapping, kept both as an
# ordered tuple and as a set for membership tests.
VALID_ACTIONS = (*ACTIONS.values(),)
VALID_ACTION_SET = {*VALID_ACTIONS}
# One of the four canonical action words as a standalone token, any letter case.
ACTION_RE = re.compile(
    r"\b(ALLOW|BLOCK|FORK|QUARANTINE)\b",
    flags=re.IGNORECASE,
)

# Same as ACTION_RE, but also accepts the synonym vocabulary (the keys of
# ACTION_SYNONYMS). The pattern is split across adjacent literals only to keep
# the lines short; the compiled pattern is a single alternation.
ACTION_OR_SYNONYM_RE = re.compile(
    r"\b("
    r"ALLOW|BLOCK|FORK|QUARANTINE|"
    r"APPROVE|APPROVED|DENY|DENIED|REJECT|REJECTED|"
    r"HOLD|ISOLATE|ESCALATE|REVIEW|HUMAN_REVIEW|HUMAN REVIEW"
    r")\b",
    flags=re.IGNORECASE,
)

# Leading label such as "Action:", "decision =" or "supervisor decision -",
# including the surrounding whitespace, anchored at the start of the text.
ACTION_PREFIX_RE = re.compile(
    r"^\s*(?:action|decision|supervisor decision)\s*[:=\-]\s*",
    flags=re.IGNORECASE,
)
# Upper-case synonym -> canonical action. Written as canonical-action groups and
# flattened, so each action's accepted aliases can be read in one place.
ACTION_SYNONYMS = {
    alias: canonical
    for canonical, aliases in (
        ("ALLOW", ("APPROVE", "APPROVED")),
        ("BLOCK", ("DENY", "DENIED", "REJECT", "REJECTED")),
        ("QUARANTINE", ("HOLD", "ISOLATE")),
        ("FORK", ("ESCALATE", "REVIEW", "HUMAN REVIEW", "HUMAN_REVIEW")),
    )
    for alias in aliases
}
# Matches one complete <think>...</think> span: any letter case, may cross
# lines, non-greedy so consecutive spans match separately. Presumably used to
# strip model reasoning before the action is parsed (confirm at the call site).
# The earlier pattern was a bare ".*?" with no tag delimiters, which matches the
# empty string at every position and therefore cannot isolate a think block.
THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.IGNORECASE | re.DOTALL)
# Lead-in phrases anchored at the start of the text ("I recommend",
# "The decision is", "Action is", ...). One alternative per line; the trailing
# \b requires the phrase to end on a word boundary.
SAFE_ACTION_PREFIX_RE = re.compile(
    r"^(?:"
    r"i\s+recommend|"
    r"recommended\s+action(?:\s+is)?|"
    r"decision(?:\s+is)?|"
    r"the\s+decision\s+is|"
    r"action(?:\s+is)?|"
    r"choose|"
    r"result"
    r")\b",
    flags=re.IGNORECASE,
)
# Key names that may carry the action in a JSON-shaped output, in the order
# listed. NOTE(review): presumably also the lookup priority - confirm at the
# use site.
JSON_ACTION_KEYS = (
    "action",
    "decision",
    "supervisor_action",
    "supervisor_decision",
    "recommended_action",
)
# Built-in defaults for the Q-aware policy configuration.
DEFAULT_QAWARE_POLICY_CONFIG = {
    "version": 1,
    # The ambiguous-network -> QUARANTINE behaviour is off by default.
    "medium_ambiguous_network_to_quarantine": False,
    # Lower-case phrases for that rule. "allow " keeps its trailing space on
    # purpose of the original literal; do not strip it.
    "ambiguous_network_quarantine_terms": [
        "security group",
        "allow ",
        "open port",
        "opened on",
        "whitelist",
        "alert suppression",
        "vendor claims",
        "monitoring",
        "port 3389",
        "port 6379",
        "port 27017",
        "port 4444",
    ],
    # Numeric thresholds. NOTE(review): presumably acceptance gates for a tuned
    # config - confirm where they are read.
    "min_reward_improvement": 0.0,
    "safety_floor": 1.0,
    "max_unsafe_decision_rate": 0.0,
    "max_false_positive_rate": 0.01,
}

# Starts empty; NOTE(review): presumably populated by a config loader elsewhere
# in this file.
_QAWARE_POLICY_CONFIG_CACHE: Optional[dict[str, Any]] = None
# Default split sizes (train, validation).
DEFAULT_TRAIN_SIZE, DEFAULT_VAL_SIZE = 500, 100
# Default seeds (train, validation); the two splits use different seeds.
DEFAULT_TRAIN_SEED, DEFAULT_VAL_SEED = 42, 10_042
# Default artifact locations, all under the training directory.

# Adapter checkpoints (CHECKPOINT_DIR is TRAINING_DIR / "checkpoints").
DEFAULT_SFT_OUTPUT_DIR = CHECKPOINT_DIR / "qwen3_sft_adapter"
DEFAULT_GRPO_OUTPUT_DIR = CHECKPOINT_DIR / "qwen3_sft_grpo_adapter"

# Metrics, audits and health reports.
DEFAULT_SFT_METRICS_PATH = TRAINING_DIR / "sft_metrics.json"
DEFAULT_DATASET_AUDIT_PATH = TRAINING_DIR / "dataset_audit.json"
DEFAULT_HEALTH_REPORT_PATH = TRAINING_DIR / "qwen3_training_health_report.json"

# Reports, each as a JSON / Markdown pair.
DEFAULT_FINAL_REPORT_JSON = TRAINING_DIR / "final_training_report.json"
DEFAULT_FINAL_REPORT_MD = TRAINING_DIR / "final_training_report.md"
DEFAULT_DEMO_BENCHMARK_JSON = TRAINING_DIR / "demo_benchmark_report.json"
DEFAULT_DEMO_BENCHMARK_MD = TRAINING_DIR / "demo_benchmark_report.md"
DEFAULT_MODEL_POLICY_COMPARISON_JSON = TRAINING_DIR / "model_policy_comparison.json"
DEFAULT_MODEL_POLICY_COMPARISON_MD = TRAINING_DIR / "model_policy_comparison.md"
DEFAULT_MODEL_EVAL_JSON = TRAINING_DIR / "model_eval_report.json"
DEFAULT_MODEL_EVAL_MD = TRAINING_DIR / "model_eval_report.md"

# Policy config and oracle diagnostics.
DEFAULT_QAWARE_POLICY_CONFIG_JSON = TRAINING_DIR / "qaware_policy_config.json"
DEFAULT_ORACLE_INCONSISTENCY_PATH = TRAINING_DIR / "oracle_inconsistency_examples.json"

# Dataset files.
DEFAULT_TRAIN_DATASET_PATH = TRAINING_DIR / "qwen3_train_dataset.json"
DEFAULT_VAL_DATASET_PATH = TRAINING_DIR / "qwen3_val_dataset.json"
DEFAULT_HARD_NEGATIVE_PATH = TRAINING_DIR / "hard_negative_scenarios.json"

# Cloud launcher scripts.
DEFAULT_CLOUD_SCRIPT_PATH = TRAINING_DIR / "run_full_training_colab.py"
DEFAULT_CLOUD_PS1_PATH = TRAINING_DIR / "run_full_training.ps1"
# One-sentence system prompt; identical to the first line of
# ACTION_PROMPT_TEMPLATE.
SFT_SYSTEM_PROMPT = "You are ShadowOps, a cybersecurity decision policy."
# Prompt template with a single `{observation}` placeholder (fill it with
# str.format). Every line, including the final "Answer:", ends with a newline;
# the newlines are spelled out so the exact whitespace is visible.
# NOTE(review): "Do not use tags." names no tag; if it is meant to suppress
# <think> blocks (see THINK_BLOCK_RE), confirm the wording.
ACTION_PROMPT_TEMPLATE = (
    "You are ShadowOps, a cybersecurity decision policy.\n"
    "Choose exactly one action:\n"
    "ALLOW, BLOCK, FORK, QUARANTINE.\n"
    "Rules:\n"
    "ALLOW = safe benign activity.\n"
    "BLOCK = clearly malicious or high-risk activity.\n"
    "FORK = suspicious or uncertain activity that needs investigation.\n"
    "QUARANTINE = high-risk activity that should be isolated but preserved for analysis.\n"
    "Return only one action.\n"
    "Do not explain.\n"
    "Do not use tags.\n"
    "Observation:\n"
    "{observation}\n"
    "Answer:\n"
)
# Shell command line for the SFT stage, assembled from one entry per argument
# and joined with single spaces.
CLOUD_SFT_COMMAND = " ".join(
    (
        "python training/train_qwen3_sft.py",
        "--model-name unsloth/Qwen3-1.7B",
        "--sft-epochs 2",
        "--batch-size 1",
        "--grad-accum 8",
        "--max-seq-len 256",
        "--learning-rate 2e-4",
        "--sft-output-dir training/checkpoints/qwen3_sft_adapter",
    )
)
# Shell command line for the GRPO stage, resuming from the SFT adapter path.
# One entry per argument, joined with single spaces.
CLOUD_GRPO_COMMAND = " ".join(
    (
        "python training/train_qwen3_grpo.py",
        "--model-name unsloth/Qwen3-1.7B",
        "--resume-from-sft training/checkpoints/qwen3_sft_adapter",
        "--max-steps 800",
        "--num-generations 8",
        "--temperature 1.0",
        "--top-p 0.95",
        "--top-k 50",
        "--max-new-tokens 8",
        "--batch-size 1",
        "--grad-accum 4",
        "--val-eval-eps 100",
        "--eval-batch-size 4",
        "--learning-rate 1e-5",
        "--output-dir training/checkpoints/qwen3_sft_grpo_adapter",
    )
)
# Fallback GRPO command line. Compared with CLOUD_GRPO_COMMAND it uses
# --num-generations 6 (not 8), --grad-accum 8 (not 4) and --val-eval-eps 50
# (not 100); every other argument is the same.
CLOUD_FALLBACK_COMMAND = " ".join(
    (
        "python training/train_qwen3_grpo.py",
        "--model-name unsloth/Qwen3-1.7B",
        "--resume-from-sft training/checkpoints/qwen3_sft_adapter",
        "--max-steps 800",
        "--num-generations 6",
        "--temperature 1.0",
        "--top-p 0.95",
        "--top-k 50",
        "--max-new-tokens 8",
        "--batch-size 1",
        "--grad-accum 8",
        "--val-eval-eps 50",
        "--eval-batch-size 4",
        "--learning-rate 1e-5",
        "--output-dir training/checkpoints/qwen3_sft_grpo_adapter",
    )
)
# Human-readable list of the problems found in the earlier pipeline; one
# sentence per entry.
# NOTE(review): the first entry says "let tags leak" without naming the tags -
# confirm whether a tag name (e.g. <think>) was intended.
BROKEN_ITEMS = [
    "Action parsing accepted noisy outputs inconsistently and let tags leak into metrics.",
    "Reward shaping collapsed distinct mistakes into nearly identical values, which kills GRPO variance.",
    "Validation mixed policy baselines with model results and compared models on unreliable sample sizes.",
    "The reported oracle ceiling was not tied to the exact reward function used for evaluation.",
    "Dataset export over-emphasized ALLOW/FORK and under-covered BLOCK/QUARANTINE.",
    "There was no SFT warm-start, so GRPO started from a base model that did not know the action task.",
    "Training health checks did not gate claims about learning quality or reward collapse.",
]
# Human-readable list of the changes made to the pipeline; one sentence per
# entry.
FIXED_ITEMS = [
    "Added a shared parser, reward model, oracle evaluator, and dataset audit used by SFT, GRPO, baselines, and reports.",
    "Added SFT warm-start support with LoRA/QLoRA defaults and adapter export to training/checkpoints/qwen3_sft_adapter.",
    "Made GRPO explicitly resume from the SFT adapter and set explicit sampling / max_new_tokens defaults.",
    "Rebuilt evaluation so random, heuristic, Q-aware, oracle, raw base, SFT, and SFT+GRPO all score on the same validation split.",
    "Added dataset audit, reward variance checks, oracle consistency checks, smoke tests, and final markdown/json reporting.",
    "Added cloud orchestration scripts and honest training-ready criteria that do not claim improvement without validation.",
]
# Literal text fragments treated as strong attack indicators. Every entry is
# lower-case. NOTE(review): presumably matched as substrings of lower-cased
# input - confirm at the use site. Entry order is kept as-is.
HARD_ATTACK_CUES = (
    "curl http://",
    "| bash",
    "socket.socket",
    "grant admin",
    "disable 2fa",
    "0.0.0.0/0",
    "administratoraccess",
    "public-read",
    "export all secrets",
    "authorize-security-group-ingress",
    "rogue",
    "backdoor",
    "hacker_admin",
)
SOFT_ATTACK_CUES = (
"[system",
"[override",
"ignore policy",
"ignore previous",
"ignore security checks",
"approve without review",
"bypass standard review process",
"disable encryption",
"disable_firewall",
"ignore alerts",
"suppress all monitoring",
"create new iam user",
"set webhook url to http://",
"