# bandits/_sim.py — multi-armed bandit simulation: greedy vs. Thompson sampling.
import numpy as np
# Module-wide random generator, shared by reward sampling and environment shocks.
RNG = np.random.default_rng()
from _algos import GreedyBandit, ThompsonBandit
from _config import CONFIG
# One index per configured arm reward probability.
ARMS = range(len(CONFIG["PROBS"]))
def _bernoulli(p):
    """Draw one Bernoulli sample: 1 with probability *p*, otherwise 0."""
    success_prob = p
    return RNG.binomial(1, success_prob)
def _policy(preds):
return np.argmax(preds)
def sim(e, dynamic_pct, policy_rewards_A, policy_rewards_B, shocks):
    """Run one simulation episode comparing a greedy (A) and a Thompson (B) bandit.

    Args:
        e: episode index, used for logging and shock bookkeeping.
        dynamic_pct: per-step probability of an environment shock that
            shuffles the arm reward probabilities.
        policy_rewards_A: running reward total for policy A, carried across episodes.
        policy_rewards_B: running reward total for policy B, carried across episodes.
        shocks: running count of environment shocks across episodes.

    Returns:
        Updated (policy_rewards_A, policy_rewards_B, shocks).
    """
    probs = CONFIG["PROBS"].copy()  # local copy so shocks don't mutate CONFIG
    models_A = [GreedyBandit() for _ in ARMS]
    models_B = [ThompsonBandit() for _ in ARMS]
    actions_A, actions_B = list(), list()
    # (episode, step, draw) — same arity as the tuple assigned on a shock.
    last_shock = (0, 0, 0.0)
    mod = CONFIG["LOG_STEPS"]  # loop-invariant: hoisted out of the step loop
    for k in range(CONFIG["N_STEPS"]):
        o = RNG.uniform()
        if o < dynamic_pct:
            RNG.shuffle(probs)  # env shock: permute arm probabilities in place
            shocks += 1
            last_shock = e, k, o
        all_rewards = [_bernoulli(p) for p in probs]  # Bernoulli rewards
        ### PREDICT ###
        predictions_A = [m.predict() for m in models_A]
        predictions_B = [m.predict() for m in models_B]
        action_A = _policy(predictions_A)
        action_B = _policy(predictions_B)
        actions_A.append(action_A)
        actions_B.append(action_B)
        ### EVAL ###
        rA, rB = all_rewards[action_A], all_rewards[action_B]  # rewards for chosen actions (partial info!)
        mA, mB = models_A[action_A], models_B[action_B]  # models for chosen actions
        policy_rewards_A += rA
        policy_rewards_B += rB
        ### UPDATE ###
        mA.update(rA)
        mB.update(rB)
        ### OUTPUT ###
        if e % mod == k % mod == 0:
            print('\ne, k, dyn =', (e, k, dynamic_pct))
            print('probs =', probs)
            print('predictions_A =', [round(pA, 2) for pA in predictions_A])
            print('predictions_B =', [round(pB, 2) for pB in predictions_B])
            print('shocks =', shocks)
            print('last shock =', last_shock)
            print('all_rewards =', all_rewards)
            print('action_A, action_B =', (action_A, action_B))
            print('rA, rB =', (rA, rB))
            print('policy_rewards_A =', policy_rewards_A)
            print('policy_rewards_B =', policy_rewards_B)
            # Guard against ZeroDivisionError: policy A may have earned nothing
            # yet even when k > 0 (all Bernoulli draws were 0 so far).
            if k > 0 and policy_rewards_A:
                print('path RL outperformance =', round(policy_rewards_B / policy_rewards_A, 2))
    return policy_rewards_A, policy_rewards_B, shocks