""" EA Digital Twin Simulation Environment for DRL training. State: 10 capability scores + 3 budget/timeline/risk scalars + 7 domain flags = 20 dims Action: priority ordering of top-10 capabilities (multinomial sampling) Reward: business value - dependency violations - budget overrun - risk penalty """ import numpy as np import random from typing import NamedTuple class EAScenario(NamedTuple): cap_business_values: np.ndarray # shape (10,) — 0..1 cap_effort_scores: np.ndarray # shape (10,) — 0..1 cap_risk_scores: np.ndarray # shape (10,) — 0..1 dependency_matrix: np.ndarray # shape (10, 10) — dep_matrix[i,j]=1 means i must precede j budget_capacity: float # 0..1 normalised timeline_score: float # months/36 risk_tolerance: float # 0..1 # 10 EA capability archetypes (represent real patterns in the graph) ARCHETYPE_NAMES = [ "Data Platform", "API Management", "Customer Portal", "Advanced Analytics", "Security & Compliance", "Process Automation", "Cloud Migration", "AI/ML Platform", "ERP Integration", "DevOps Pipeline", ] # Base business values per archetype (will be perturbed per episode) BASE_BUSINESS_VALUES = np.array([0.90, 0.80, 0.70, 0.85, 0.95, 0.75, 0.70, 0.80, 0.65, 0.70]) BASE_EFFORT_SCORES = np.array([0.80, 0.50, 0.60, 0.70, 0.60, 0.65, 0.90, 0.85, 0.75, 0.50]) BASE_RISK_SCORES = np.array([0.40, 0.30, 0.30, 0.35, 0.20, 0.40, 0.60, 0.50, 0.55, 0.25]) # Dependency rules: prerequisite → dependent (indices) BASE_DEPENDENCIES = [ (0, 1), # Data Platform → API Management (1, 2), # API Management → Customer Portal (0, 3), # Data Platform → Analytics (1, 5), # API Management → Process Automation (6, 7), # Cloud Migration → AI/ML Platform (0, 7), # Data Platform → AI/ML Platform ] class EAEnvironment: """Simulated Enterprise Architecture environment for REINFORCE training.""" STATE_DIM = 20 ACTION_DIM = 10 def __init__(self, noise_scale: float = 0.1, seed: int | None = None): self._rng = np.random.default_rng(seed) self._noise = noise_scale self.scenario: EAScenario | None = None self.current_step = 0 def reset(self) -> np.ndarray: """Generate a new randomised EA scenario and return initial state vector.""" noise = self._rng.uniform(-self._noise, self._noise, size=10) bv = np.clip(BASE_BUSINESS_VALUES + noise, 0.1, 1.0) ef = np.clip(BASE_EFFORT_SCORES + self._rng.uniform(-self._noise, self._noise, 10), 0.1, 1.0) ri = np.clip(BASE_RISK_SCORES + self._rng.uniform(-self._noise / 2, self._noise / 2, 10), 0.05, 0.9) # Randomise dep matrix from base dep_matrix = np.zeros((10, 10), dtype=float) for (i, j) in BASE_DEPENDENCIES: if self._rng.random() > 0.2: # 80% chance to include each dependency dep_matrix[i, j] = 1.0 budget_capacity = float(self._rng.choice([0.4, 0.6, 0.8, 1.0])) timeline_score = float(self._rng.choice([6, 12, 18, 24, 36])) / 36.0 risk_tolerance = float(self._rng.choice([0.33, 0.67, 1.0])) self.scenario = EAScenario(bv, ef, ri, dep_matrix, budget_capacity, timeline_score, risk_tolerance) self.current_step = 0 return self.get_state_vector() def get_state_vector(self) -> np.ndarray: """Build 20-dim state vector from current scenario.""" s = self.scenario # 7 domain flags — simulate which of 7 EA domain categories are in this scenario domain_flags = (s.cap_business_values[:7] > 0.6).astype(float) state = np.concatenate([ s.cap_business_values, # 10 dims [s.budget_capacity], # 1 [s.timeline_score], # 1 [s.risk_tolerance], # 1 domain_flags, # 7 ]).astype(np.float32) return state def step(self, action_indices: np.ndarray) -> tuple[np.ndarray, float, bool]: """ action_indices: ordered priority list of capability indices (len=10) Returns (next_state, reward, done) """ s = self.scenario # Base reward: value-weighted rank score base_reward = 0.0 for rank, idx in enumerate(action_indices): rank_fraction = rank / len(action_indices) base_reward += s.cap_business_values[idx] * (1.0 - rank_fraction) base_reward /= len(action_indices) # normalise to 0..1 # Dependency penalty dep_violations = 0 for i, dep_i in enumerate(action_indices): for j, dep_j in enumerate(action_indices): if s.dependency_matrix[dep_j, dep_i] == 1.0 and j < i: dep_violations += 1 dep_penalty = dep_violations * 0.15 # Budget penalty — cumulative effort of top-N capped by budget cum_effort = 0.0 budget_penalty = 0.0 for idx in action_indices: cum_effort += s.cap_effort_scores[idx] / 10.0 if cum_effort > s.budget_capacity: budget_penalty += 0.05 # Risk penalty — high-risk caps in top-3 positions risk_penalty = 0.0 for idx in action_indices[:3]: if s.cap_risk_scores[idx] > s.risk_tolerance: risk_penalty += s.cap_risk_scores[idx] * 0.2 reward = float(base_reward - dep_penalty - budget_penalty - risk_penalty) reward = max(-1.0, min(2.0, reward)) self.current_step += 1 done = True # single-step environment (one full ordering per episode) next_state = self.get_state_vector() return next_state, reward, done def sample_action(self) -> np.ndarray: """Random action for baseline / exploration.""" return self._rng.permutation(self.ACTION_DIM).astype(np.int64)