# Source: EA_strat_optimizer/backend/drl/environment.py
# Uploaded by: TheQuantEd
# Commit 6252f54 — deploy: AMD EA Strategy Optimizer — Neo4j + FastAPI + Streamlit
"""
EA Digital Twin Simulation Environment for DRL training.
State: 10 capability scores + 3 budget/timeline/risk scalars + 7 domain flags = 20 dims
Action: priority ordering of top-10 capabilities (multinomial sampling)
Reward: business value - dependency violations - budget overrun - risk penalty
"""
import numpy as np
import random
from typing import NamedTuple
class EAScenario(NamedTuple):
    """Immutable snapshot of one simulated EA planning scenario.

    The three ``cap_*`` arrays and both axes of ``dependency_matrix`` are
    indexed by capability index (0..9).
    """

    # Business value of each capability; shape (10,), values in 0..1.
    cap_business_values: np.ndarray
    # Delivery effort of each capability; shape (10,), values in 0..1.
    cap_effort_scores: np.ndarray
    # Delivery risk of each capability; shape (10,), values in 0..1.
    cap_risk_scores: np.ndarray
    # Shape (10, 10); dependency_matrix[i, j] == 1 means capability i must
    # precede capability j.
    dependency_matrix: np.ndarray
    # Available budget, normalised to 0..1.
    budget_capacity: float
    # Planning horizon expressed as months / 36.
    timeline_score: float
    # Risk appetite in 0..1.
    risk_tolerance: float
# The ten EA capability archetypes (stand-ins for real patterns in the graph).
# A capability's position in this list is its index in every array below.
ARCHETYPE_NAMES = [
    "Data Platform",          # 0
    "API Management",         # 1
    "Customer Portal",        # 2
    "Advanced Analytics",     # 3
    "Security & Compliance",  # 4
    "Process Automation",     # 5
    "Cloud Migration",        # 6
    "AI/ML Platform",         # 7
    "ERP Integration",        # 8
    "DevOps Pipeline",        # 9
]

# Per-archetype baselines; each episode perturbs them with random noise.
BASE_BUSINESS_VALUES = np.array([
    0.90, 0.80, 0.70, 0.85, 0.95,
    0.75, 0.70, 0.80, 0.65, 0.70,
])
BASE_EFFORT_SCORES = np.array([
    0.80, 0.50, 0.60, 0.70, 0.60,
    0.65, 0.90, 0.85, 0.75, 0.50,
])
BASE_RISK_SCORES = np.array([
    0.40, 0.30, 0.30, 0.35, 0.20,
    0.40, 0.60, 0.50, 0.55, 0.25,
])

# Dependency rules as (prerequisite index, dependent index) pairs.
BASE_DEPENDENCIES = [
    (0, 1),  # Data Platform   -> API Management
    (1, 2),  # API Management  -> Customer Portal
    (0, 3),  # Data Platform   -> Advanced Analytics
    (1, 5),  # API Management  -> Process Automation
    (6, 7),  # Cloud Migration -> AI/ML Platform
    (0, 7),  # Data Platform   -> AI/ML Platform
]
class EAEnvironment:
"""Simulated Enterprise Architecture environment for REINFORCE training."""
STATE_DIM = 20
ACTION_DIM = 10
def __init__(self, noise_scale: float = 0.1, seed: int | None = None):
self._rng = np.random.default_rng(seed)
self._noise = noise_scale
self.scenario: EAScenario | None = None
self.current_step = 0
def reset(self) -> np.ndarray:
"""Generate a new randomised EA scenario and return initial state vector."""
noise = self._rng.uniform(-self._noise, self._noise, size=10)
bv = np.clip(BASE_BUSINESS_VALUES + noise, 0.1, 1.0)
ef = np.clip(BASE_EFFORT_SCORES + self._rng.uniform(-self._noise, self._noise, 10), 0.1, 1.0)
ri = np.clip(BASE_RISK_SCORES + self._rng.uniform(-self._noise / 2, self._noise / 2, 10), 0.05, 0.9)
# Randomise dep matrix from base
dep_matrix = np.zeros((10, 10), dtype=float)
for (i, j) in BASE_DEPENDENCIES:
if self._rng.random() > 0.2: # 80% chance to include each dependency
dep_matrix[i, j] = 1.0
budget_capacity = float(self._rng.choice([0.4, 0.6, 0.8, 1.0]))
timeline_score = float(self._rng.choice([6, 12, 18, 24, 36])) / 36.0
risk_tolerance = float(self._rng.choice([0.33, 0.67, 1.0]))
self.scenario = EAScenario(bv, ef, ri, dep_matrix, budget_capacity, timeline_score, risk_tolerance)
self.current_step = 0
return self.get_state_vector()
def get_state_vector(self) -> np.ndarray:
"""Build 20-dim state vector from current scenario."""
s = self.scenario
# 7 domain flags — simulate which of 7 EA domain categories are in this scenario
domain_flags = (s.cap_business_values[:7] > 0.6).astype(float)
state = np.concatenate([
s.cap_business_values, # 10 dims
[s.budget_capacity], # 1
[s.timeline_score], # 1
[s.risk_tolerance], # 1
domain_flags, # 7
]).astype(np.float32)
return state
def step(self, action_indices: np.ndarray) -> tuple[np.ndarray, float, bool]:
"""
action_indices: ordered priority list of capability indices (len=10)
Returns (next_state, reward, done)
"""
s = self.scenario
# Base reward: value-weighted rank score
base_reward = 0.0
for rank, idx in enumerate(action_indices):
rank_fraction = rank / len(action_indices)
base_reward += s.cap_business_values[idx] * (1.0 - rank_fraction)
base_reward /= len(action_indices) # normalise to 0..1
# Dependency penalty
dep_violations = 0
for i, dep_i in enumerate(action_indices):
for j, dep_j in enumerate(action_indices):
if s.dependency_matrix[dep_j, dep_i] == 1.0 and j < i:
dep_violations += 1
dep_penalty = dep_violations * 0.15
# Budget penalty — cumulative effort of top-N capped by budget
cum_effort = 0.0
budget_penalty = 0.0
for idx in action_indices:
cum_effort += s.cap_effort_scores[idx] / 10.0
if cum_effort > s.budget_capacity:
budget_penalty += 0.05
# Risk penalty — high-risk caps in top-3 positions
risk_penalty = 0.0
for idx in action_indices[:3]:
if s.cap_risk_scores[idx] > s.risk_tolerance:
risk_penalty += s.cap_risk_scores[idx] * 0.2
reward = float(base_reward - dep_penalty - budget_penalty - risk_penalty)
reward = max(-1.0, min(2.0, reward))
self.current_step += 1
done = True # single-step environment (one full ordering per episode)
next_state = self.get_state_vector()
return next_state, reward, done
def sample_action(self) -> np.ndarray:
"""Random action for baseline / exploration."""
return self._rng.permutation(self.ACTION_DIM).astype(np.int64)