import numpy as np
# Module-wide random generator shared by all simulation helpers below.
RNG = np.random.default_rng()
# Project-local bandit models: one greedy, one Thompson-sampling.
from _algos import GreedyBandit, ThompsonBandit
# Project config; this module reads CONFIG["PROBS"], CONFIG["N_STEPS"],
# and CONFIG["LOG_STEPS"].
from _config import CONFIG
# One bandit arm per configured reward probability.
ARMS = range(len(CONFIG["PROBS"]))
def _bernoulli(p):
    """Return one Bernoulli draw: 1 with probability *p*, else 0."""
    draw = RNG.binomial(n=1, p=p)
    return draw
def _policy(preds):
return np.argmax(preds)
def sim(e, dynamic_pct, policy_rewards_A, policy_rewards_B, shocks):
    """Run one episode comparing a greedy vs. a Thompson-sampling bandit.

    Each of CONFIG["N_STEPS"] steps draws a Bernoulli reward for every arm,
    lets both policies pick an arm from their models' predictions, credits
    only the chosen arm's reward (partial information), and updates only the
    chosen arm's model.  With probability ``dynamic_pct`` per step the arm
    probabilities are shuffled in place ("environment shock").

    Parameters
    ----------
    e : int
        Episode index; used for the logging cadence and shock bookkeeping.
    dynamic_pct : float
        Per-step probability of an environment shock.
    policy_rewards_A, policy_rewards_B : int
        Running reward totals for the greedy / Thompson policies, carried
        across episodes by the caller.
    shocks : int
        Running count of environment shocks, carried across episodes.

    Returns
    -------
    tuple
        Updated ``(policy_rewards_A, policy_rewards_B, shocks)``.
    """
    probs = CONFIG["PROBS"].copy()
    models_A = [GreedyBandit() for _ in ARMS]
    models_B = [ThompsonBandit() for _ in ARMS]
    actions_A, actions_B = list(), list()
    # Fix: initialize with the same (episode, step, draw) arity that the
    # shock branch assigns, so the logged value is always a 3-tuple.
    last_shock = (0, 0, 0.0)
    # Loop-invariant: hoisted out of the step loop.
    mod = CONFIG["LOG_STEPS"]
    for k in range(CONFIG["N_STEPS"]):
        o = RNG.uniform()
        if o < dynamic_pct:
            RNG.shuffle(probs)  # env shock: permute arm probabilities in place
            shocks += 1
            last_shock = (e, k, o)
        all_rewards = [_bernoulli(p) for p in probs]  # Bernoulli rewards
        ### PREDICT ###
        predictions_A = [m.predict() for m in models_A]
        predictions_B = [m.predict() for m in models_B]
        action_A = _policy(predictions_A)
        action_B = _policy(predictions_B)
        actions_A.append(action_A)
        actions_B.append(action_B)
        ### EVAL ###
        rA, rB = all_rewards[action_A], all_rewards[action_B]  # rewards for chosen actions (partial info!)
        mA, mB = models_A[action_A], models_B[action_B]  # models for chosen actions
        policy_rewards_A += rA
        policy_rewards_B += rB
        ### UPDATE ###
        mA.update(rA)
        mB.update(rB)
        ### OUTPUT ###
        if e % mod == k % mod == 0:
            print('\ne, k, dyn =', (e, k, dynamic_pct))
            print('probs =', probs)
            print('predictions_A =', [round(pA, 2) for pA in predictions_A])
            print('predictions_B =', [round(pB, 2) for pB in predictions_B])
            print('shocks =', shocks)
            print('last shock =', last_shock)
            print('all_rewards =', all_rewards)
            print('action_A, action_B =', (action_A, action_B))
            print('rA, rB =', (rA, rB))
            print('policy_rewards_A =', policy_rewards_A)
            print('policy_rewards_B =', policy_rewards_B)
            # Fix: also require a non-zero denominator — `k > 0` alone does
            # not rule out policy_rewards_A == 0 (ZeroDivisionError).
            if k > 0 and policy_rewards_A:
                print('path RL outperformance =', round(policy_rewards_B / policy_rewards_A, 2))
    return policy_rewards_A, policy_rewards_B, shocks