import numpy as np

from _algos import GreedyBandit, ThompsonBandit
from _config import CONFIG

RNG = np.random.default_rng()
ARMS = range(len(CONFIG["PROBS"]))


def _bernoulli(p):
    """Draw a single Bernoulli(p) reward."""
    return RNG.binomial(1, p)


def _policy(preds):
    """Pick the arm with the highest predicted value."""
    return np.argmax(preds)


def sim(e, dynamic_pct, policy_rewards_A, policy_rewards_B, shocks):
    """Run one episode of N_STEPS bandit rounds.

    A greedy agent (A) and a Thompson-sampling agent (B) play the same
    non-stationary Bernoulli bandit: with probability `dynamic_pct` per
    step, the arm probabilities are reshuffled (an environment shock).
    Returns the updated running reward tallies and shock count.
    """
    probs = CONFIG["PROBS"].copy()
    models_A = [GreedyBandit() for _ in ARMS]
    models_B = [ThompsonBandit() for _ in ARMS]
    actions_A, actions_B = [], []
    last_shock = (0, 0, 0.0)  # (episode, step, uniform draw) of most recent shock

    for k in range(CONFIG["N_STEPS"]):
        o = RNG.uniform()
        if o < dynamic_pct:  # env shock: permute the arm probabilities in place
            RNG.shuffle(probs)
            shocks += 1
            last_shock = (e, k, o)
        all_rewards = [_bernoulli(p) for p in probs]  # Bernoulli reward per arm

        ### PREDICT ###
        predictions_A = [m.predict() for m in models_A]
        predictions_B = [m.predict() for m in models_B]
        action_A = _policy(predictions_A)
        action_B = _policy(predictions_B)
        actions_A.append(action_A)
        actions_B.append(action_B)

        ### EVAL ###
        rA, rB = all_rewards[action_A], all_rewards[action_B]  # rewards for chosen actions (partial info!)
        mA, mB = models_A[action_A], models_B[action_B]  # models for chosen actions
        policy_rewards_A += rA
        policy_rewards_B += rB

        ### UPDATE ###
        mA.update(rA)  # only the pulled arm's model sees its reward
        mB.update(rB)

        ### OUTPUT ###
        mod = CONFIG["LOG_STEPS"]
        if e % mod == k % mod == 0:  # log only when both episode and step hit the cadence
            print('\ne, k, dyn =', (e, k, dynamic_pct))
            print('probs =', probs)
            print('predictions_A =', [round(pA, 2) for pA in predictions_A])
            print('predictions_B =', [round(pB, 2) for pB in predictions_B])
            print('shocks =', shocks)
            print('last shock =', last_shock)
            print('all_rewards =', all_rewards)
            print('action_A, action_B =', (action_A, action_B))
            print('rA, rB =', (rA, rB))
            print('policy_rewards_A =', policy_rewards_A)
            print('policy_rewards_B =', policy_rewards_B)
            if policy_rewards_A > 0:  # guard against division by zero
                print('path RL outperformance =', round(policy_rewards_B / policy_rewards_A, 2))

    return policy_rewards_A, policy_rewards_B, shocks
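
# ---------------------------------------------------------------------------
# The `_algos` and `_config` modules are not shown in this file. The sketch
# below is an ASSUMPTION reconstructed purely from the interface used above
# (predict() with no arguments, update(reward), and the CONFIG keys "PROBS",
# "N_STEPS", "LOG_STEPS") — a minimal stand-in, not the project's actual code.
#
#   # _config.py (assumed shape)
#   CONFIG = {
#       "PROBS": [0.1, 0.5, 0.9],  # per-arm Bernoulli success probabilities
#       "N_STEPS": 1_000,          # bandit rounds per episode
#       "LOG_STEPS": 500,          # logging cadence
#   }
#
#   # _algos.py (assumed shape)
#   import numpy as np
#   _rng = np.random.default_rng()
#
#   class GreedyBandit:
#       """Empirical-mean estimate for one arm; argmax over arms = greedy."""
#       def __init__(self):
#           self.n, self.total = 0, 0.0
#       def predict(self):
#           return self.total / self.n if self.n else 0.5  # prior guess before any pull
#       def update(self, reward):
#           self.n += 1
#           self.total += reward
#
#   class ThompsonBandit:
#       """Beta(alpha, beta) posterior for one arm; argmax over posterior
#       samples across arms = Thompson sampling."""
#       def __init__(self, alpha=1.0, beta=1.0):
#           self.alpha, self.beta = alpha, beta
#       def predict(self):
#           return _rng.beta(self.alpha, self.beta)  # one posterior sample
#       def update(self, reward):
#           self.alpha += reward
#           self.beta += 1 - reward
# ---------------------------------------------------------------------------


if __name__ == "__main__":
    # Driver sketch (also an assumption — the original entry point is not
    # shown): run several episodes, threading the running tallies through
    # `sim`. N_EPISODES and DYNAMIC_PCT are hypothetical values.
    N_EPISODES = 10
    DYNAMIC_PCT = 0.001  # per-step probability of an environment shock
    rewards_A = rewards_B = n_shocks = 0
    for episode in range(N_EPISODES):
        rewards_A, rewards_B, n_shocks = sim(
            episode, DYNAMIC_PCT, rewards_A, rewards_B, n_shocks
        )
    if rewards_A > 0:
        print('\nfinal RL outperformance =', round(rewards_B / rewards_A, 2))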