import numpy as np

# Module-wide random generator shared by every helper in this file.
RNG = np.random.default_rng()

from _algos import GreedyBandit, ThompsonBandit
from _config import CONFIG

# One bandit arm per configured success probability.
ARMS = range(len(CONFIG["PROBS"]))
| |
|
def _bernoulli(p):
    """Draw one Bernoulli sample: 1 with probability *p*, otherwise 0."""
    draw = RNG.binomial(1, p)
    return draw
| |
|
| | def _policy(preds): |
| | return np.argmax(preds) |
| | |
def sim(e, dynamic_pct, policy_rewards_A, policy_rewards_B, shocks):
    """Run one episode comparing a greedy and a Thompson-sampling policy.

    Parameters
    ----------
    e : int
        Episode index (used for logging and shock bookkeeping).
    dynamic_pct : float
        Per-step probability that the arm probabilities are shuffled
        (an environment "shock").
    policy_rewards_A, policy_rewards_B : int
        Running reward totals for policy A (greedy) and policy B
        (Thompson); updated totals are returned for accumulation.
    shocks : int
        Running shock count across episodes; updated and returned.

    Returns
    -------
    tuple
        ``(policy_rewards_A, policy_rewards_B, shocks)`` after this episode.
    """
    probs = CONFIG["PROBS"].copy()

    # One independent model per arm for each policy.
    models_A = [GreedyBandit() for _ in ARMS]
    models_B = [ThompsonBandit() for _ in ARMS]

    # Initialized as a 3-tuple for consistency with the (e, k, o) value
    # assigned on a shock (the original mixed in a 2-tuple initializer).
    last_shock = 0, 0, 0

    # Loop-invariant config lookup, hoisted out of the step loop.
    mod = CONFIG["LOG_STEPS"]

    for k in range(CONFIG["N_STEPS"]):
        # With probability dynamic_pct, shock the environment by permuting
        # the arm probabilities in place (RNG.shuffle returns None).
        o = RNG.uniform()
        if o < dynamic_pct:
            RNG.shuffle(probs)
            shocks += 1
            last_shock = e, k, o

        # Realized reward for every arm this step.
        all_rewards = [_bernoulli(p) for p in probs]

        # Each policy picks the arm with the highest predicted value.
        predictions_A = [m.predict() for m in models_A]
        predictions_B = [m.predict() for m in models_B]
        action_A = _policy(predictions_A)
        action_B = _policy(predictions_B)

        # Credit the chosen arms and update only the chosen models.
        rA, rB = all_rewards[action_A], all_rewards[action_B]
        policy_rewards_A += rA
        policy_rewards_B += rB
        models_A[action_A].update(rA)
        models_B[action_B].update(rB)

        # NOTE(review): this logs only when e AND k are both multiples of
        # LOG_STEPS (kept as-is); confirm that was intended rather than
        # logging every LOG_STEPS steps of every episode.
        if e % mod == k % mod == 0:
            print('\ne, k, dyn =', (e, k, dynamic_pct))
            print('probs =', probs)
            print('predictions_A =', [round(pA, 2) for pA in predictions_A])
            print('predictions_B =', [round(pB, 2) for pB in predictions_B])
            print('shocks =', shocks)
            print('last shock =', last_shock)
            print('all_rewards =', all_rewards)
            print('action_A, action_B =', (action_A, action_B))
            print('rA, rB =', (rA, rB))
            print('policy_rewards_A =', policy_rewards_A)
            print('policy_rewards_B =', policy_rewards_B)
            # Guard against ZeroDivisionError when policy A has earned
            # nothing yet (the original only checked k > 0).
            if k > 0 and policy_rewards_A:
                print('path RL outperformance =', round(policy_rewards_B / policy_rewards_A, 2))

    return policy_rewards_A, policy_rewards_B, shocks