import math
import random
from random import randrange
from collections import namedtuple, deque
from itertools import count

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

import gymnasium as gym
from gymnasium import spaces

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Campaign constants: total budget in dollars, plus the maxima used to
# normalize impression counts and prices in the raw data.
budget = 10
impression_max = 11.888
price_max = 0.118

# Interactive matplotlib setup (inline display when running under IPython).
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# Prefer CUDA, then Apple MPS, then fall back to the CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)


def _normalize_vector(vector):
    """Scale a vector so it sums to 1; near-zero vectors are returned unchanged."""
    vector_np = np.asarray(vector, dtype=np.float32)
    total = np.sum(vector_np)
    if total < 1e-8:
        return vector_np
    return vector_np / total


def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
    """
    KL divergence KL(p || q), smoothed with a small epsilon to avoid log(0).
    Both p and q must be valid probability distributions.
    """
    epsilon = 1e-5
    return np.sum(p * np.log((p + epsilon) / (q + epsilon)))


def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """
    Compute the Jensen-Shannon divergence between two 1D probability vectors.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence (bounded between 0 and log(2)).
    """
    p = _normalize_vector(p)
    q = _normalize_vector(q)

    m = 0.5 * (p + q)
    js = 0.5 * _safe_kl(p, m) + 0.5 * _safe_kl(q, m)
    return float(js)
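

# Illustrative sanity check (not part of the original pipeline; safe to drop):
# the divergence of a distribution with itself is exactly 0, while fully
# disjoint distributions approach the log(2) upper bound.
_p_demo = np.array([0.5, 0.3, 0.2], dtype=np.float32)
assert _jensen_shannon_divergence(_p_demo, _p_demo) < 1e-4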


# Screen ids and per-screen hourly inventory (demographics, impressions,
# price, publisher and venue-type shares).
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv"
df_screen_ids = pd.read_csv(file_screen_ids)
screen_ids = list(df_screen_ids['screen'])

file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv"
df_inventory = pd.read_csv(file_inventory_last)

weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))

# Re-index the inventory as a nested dict: screen -> weekday -> hour -> row values.
cols = ['screen', 'weekday', 'hour']
screens_dict = {}
for (a, b, c), values in df_inventory.set_index(cols).apply(list, axis=1).to_dict().items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
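
# For example, screens_dict[screen_ids[0]]['MONDAY'][14] holds the raw column
# values for that screen at 2pm on Mondays (household/income mix, impressions,
# price, publisher and venue-type shares), assuming the CSV follows the column
# layout unpacked in generate_bid_requests below.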


def random_screen():
    return random.choice(screen_ids)


def generate_bid_requests(num_weeks):
    """Generate synthetic bid requests from the screen inventory."""
    bid_requests = []
    for week_index in range(num_weeks):
        for weekday_index in range(7):
            weekday = weekdays[weekday_index]

            for hour in hours:

                # Ten bid requests per hour, each from a randomly chosen screen.
                for bid_index in range(10):
                    screen_index = randrange(len(screen_ids))
                    screen_id = screen_ids[screen_index]

                    data = screens_dict[screen_id][weekday][hour]

                    householdSmall = data[0]
                    householdAverage = data[1]
                    householdLarge = data[2]
                    incomeLow = data[3]
                    incomeAverage = data[4]
                    incomeHigh = data[5]
                    impressionHour = data[7]
                    price = data[8]

                    publisher_1 = data[9]
                    publisher_2 = data[10]
                    publisher_3 = data[11]
                    venue_type_1 = data[12]
                    venue_type_2 = data[13]
                    venue_type_3 = data[14]

                    bid_request = {
                        "features": np.array([
                            impressionHour,
                        ], dtype=np.float32),
                        "household": np.array([
                            householdSmall,
                            householdAverage,
                            householdLarge,
                        ], dtype=np.float32),
                        "income": np.array([
                            incomeLow,
                            incomeAverage,
                            incomeHigh,
                        ], dtype=np.float32),
                        "publisher": np.array([
                            publisher_1,
                            publisher_2,
                            publisher_3,
                        ], dtype=np.float32),
                        "venue_type": np.array([
                            venue_type_1,
                            venue_type_2,
                            venue_type_3,
                        ], dtype=np.float32),
                        "price": price,
                    }
                    bid_requests.append(bid_request)
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
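

# Note: generate_bid_requests(2) yields 2 weeks * 7 days * 24 hours * 10 bids
# = 3,360 requests per episode, which is the 3360 that appears in the
# EPS_DECAY constant below.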


class DspCampaign100Env(gym.Env):
    """
    Minimal DSP RL environment:
    - One episode = one campaign
    - One step = one bid request
    """

    metadata = {"render_modes": []}

    def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
        super().__init__()

        self.bid_requests = bid_requests

        # Normalize each target distribution (without mutating the caller's
        # dict) and track the total flattened length.
        self.distribution_dim = 0
        self.desired_distributions = {}
        for key, dist in desired_distributions.items():
            normalized = _normalize_vector(dist)
            self.desired_distributions[key] = normalized
            self.distribution_dim += len(normalized)

        self.initial_budget = budget
        self.impression_max = impression_max
        self.price_max = price_max

        # Two actions: 0 = skip the bid request, 1 = bid on it.
        self.action_space = spaces.Discrete(2)

        # Per-request scalar features (currently just the hourly impression count).
        self.bid_feat_dim = 1

        # Observation layout (see _get_observation): distribution gaps,
        # (budget ratio, time ratio, pacing gap), bid features, and the
        # bid's own distributions.
        obs_dim = (
            self.distribution_dim
            + 3
            + self.bid_feat_dim
            + self.distribution_dim
        )

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32,
        )

        self.reset()
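
    # With the two length-3 targets used below ("publisher" and "venue_type"),
    # distribution_dim = 6, so obs_dim = 6 + 3 + 1 + 6 = 16.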

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)

        self.step_idx = 0
        self.budget_left = self.initial_budget

        # Running (unnormalized) sums over the distributions of won bids.
        self.current_distributions = {}
        for key in self.desired_distributions:
            self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)

        obs = self._get_observation()
        info = {}

        return obs, info

    def reset_bid_requests(self, bid_requests):
        """Swap in a fresh batch of bid requests; call before reset()."""
        self.bid_requests = bid_requests

    def step(self, action):
        assert self.action_space.contains(action)

        done = False
        bid = self.bid_requests[self.step_idx]

        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        # Prices are stored normalized; scale by price_max to get the dollar cost.
        cost = bid["price"] * self.price_max

        reward = 0.0

        if action == 1 and self.budget_left >= cost:
            self.budget_left -= cost

            # Distance to the desired distributions before adding this bid.
            prev_dist = 0.0
            for key in self.desired_distributions:
                prev_current = _normalize_vector(self.current_distributions[key])
                prev_dist += np.linalg.norm(self.desired_distributions[key] - prev_current)

            for key in self.desired_distributions:
                self.current_distributions[key] += bid[key]

            # Debug output.
            print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'])
            print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'])
            print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'])

            # Distance after adding this bid; reward the improvement.
            new_dist = 0.0
            for key in self.desired_distributions:
                new_current = _normalize_vector(self.current_distributions[key])
                new_dist += np.linalg.norm(self.desired_distributions[key] - new_current)

            dist_improvement = prev_dist - new_dist
            demo_reward = dist_improvement * 50.0
            reward += demo_reward
        else:
            # Skipped (or could not afford) the bid: penalize deviating from
            # uniform budget pacing.
            pacing_gap = time_ratio - budget_ratio
            reward += -abs(pacing_gap) * 0.5

        # Debug output.
        print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)

        self.step_idx += 1

        if self.step_idx >= len(self.bid_requests) - 1:
            done = True

            # Terminal bonus: the closer the final mix is to the targets, the better.
            final_dist = 0.0
            for key in self.desired_distributions:
                final_current = _normalize_vector(self.current_distributions[key])
                final_dist += np.linalg.norm(self.desired_distributions[key] - final_current)
            reward += (1.0 - final_dist) * 10

        obs = self._get_observation()
        info = {}

        return obs, reward, done, False, info
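
    # step() follows the Gymnasium API, returning (obs, reward, terminated,
    # truncated, info); truncated is always False here because an episode ends
    # only when the bid-request stream runs out.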

    def _get_observation(self):
        bid = self.bid_requests[self.step_idx]

        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)

        # Flatten the per-group gaps (desired - current) and the bid's own
        # distribution vectors.
        gap_flat = []
        bid_distribution_flat = []
        for key in self.desired_distributions:
            current = _normalize_vector(self.current_distributions[key])
            gap = self.desired_distributions[key] - current
            gap_flat.extend(gap.tolist())
            bid_distribution_flat.extend(bid[key])

        obs = np.concatenate([
            np.array(gap_flat, dtype=np.float32),
            np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
            bid["features"],
            np.array(bid_distribution_flat, dtype=np.float32),
        ])

        return obs.astype(np.float32)

    def _get_reward(self, distributions) -> float:
        """
        Compute reward from the JS divergence of every tracked distribution
        group (here: publisher and venue type).

        Reward is the NEGATIVE total divergence (we want to minimize it).
        Used only for end-of-episode reporting in the training loop.

        Returns
        -------
        float
            Reward value (higher is better).
        """
        groups = []
        for key in self.desired_distributions:
            a = _normalize_vector(distributions[key])
            js = _jensen_shannon_divergence(self.desired_distributions[key], a)
            groups.append(js)

        total_divergence = sum(groups)
        return -total_divergence * 10
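

# Since each group's JS divergence is bounded by log(2) ~= 0.693, the reward
# from _get_reward with two tracked groups lies roughly in [-13.9, 0].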


# Deterministic seeding for reproducibility.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)


Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of transitions for experience replay."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque([], maxlen=capacity)

    def clear(self):
        self.memory = deque([], maxlen=self.capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


class DQN(nn.Module):
    """Two-hidden-layer MLP mapping an observation to one Q-value per action."""

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)
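

# Shape check (illustrative, uses the obs_dim = 16 worked out above):
# DQN(n_observations=16, n_actions=2)(torch.zeros(4, 16)).shape  # -> (4, 2)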


# DQN hyperparameters.
BATCH_SIZE = 128
GAMMA = 0.9           # discount factor
EPS_START = 0.9       # initial exploration rate
EPS_END = 0.01        # final exploration rate
EPS_DECAY = 3360 / 3  # decay constant in steps (3360 steps = one episode)
TAU = 0.003           # soft-update rate for the target network
LR = 2e-4             # AdamW learning rate

# Target mixes for publishers and venue types.
desired_publisher_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])

env = DspCampaign100Env(generate_bid_requests(2),
                        desired_distributions={"publisher": desired_publisher_vector,
                                               "venue_type": desired_venue_type_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)

n_actions = env.action_space.n

state, info = env.reset()
n_observations = len(state)

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)
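
# Manual rollout sketch (commented out so it does not consume the bid stream
# or advance the RNG before training):
# obs, _ = env.reset()
# for _ in range(3):
#     obs, r, terminated, truncated, _ = env.step(1)  # 1 = always bid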

steps_done = 0


def select_action(state):
    """Epsilon-greedy action selection with exponentially decaying epsilon."""
    global steps_done

    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # Exploit: take the action with the highest predicted Q-value.
            return policy_net(state).max(1).indices.view(1, 1)
    else:
        # Explore: sample a random action.
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
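
# With EPS_DECAY = 1120, epsilon falls from 0.9 toward 0.01; after one full
# episode (3,360 steps) it is roughly 0.01 + 0.89 * exp(-3) ~= 0.054.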


episode_rewards = []


def plot_rewards(show_result=False):
    plt.figure(1)
    reward_t = torch.tensor(episode_rewards, dtype=torch.float)

    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.plot(reward_t.numpy())

    plt.pause(0.2)
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())


def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    # Transpose the batch: a list of Transitions -> one Transition of batched fields.
    batch = Transition(*zip(*transitions))

    # Mask of non-terminal transitions (next_state is None at episode end).
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s, a) for the actions that were actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a' Q_target(s', a') for non-terminal next states; zero for terminal ones.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values

    # TD target: r + GAMMA * max_a' Q_target(s', a').
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss between predicted Q-values and TD targets.
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # In-place gradient value clipping for stability.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = 600
else:
    num_episodes = 250

for i_episode in range(num_episodes):
    # Regenerate the bid-request stream every third episode, before resetting,
    # so the initial observation matches the stream actually being stepped.
    if i_episode % 3 == 0:
        env.reset_bid_requests(generate_bid_requests(2))

    state, info = env.reset(seed=seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)

    sum_reward = 0
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        if not math.isnan(reward):
            sum_reward = sum_reward + reward

        reward = torch.tensor([reward], device=device)
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in replay memory.
        memory.push(state, action, next_state, reward)

        state = next_state

        # One optimization step on the policy network.
        optimize_model()

        # Soft update of the target network: theta' <- TAU*theta + (1-TAU)*theta'.
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            episode_rewards.append(sum_reward)
            plot_rewards()

            print("############# Final reward:", env._get_reward(env.current_distributions))
            print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
            print("############# sum_reward:", sum_reward)
            print("############# Desired distributions:", env.desired_distributions)
            print("############# Real distributions:", env.current_distributions)
            break

print('Complete')
plot_rewards(show_result=True)
plt.ioff()
plt.show()

MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model.pt"

# Checkpoint the policy network together with the metadata needed to rebuild it.
torch.save({
    "model_state_dict": policy_net.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "n_observations": n_observations,
    "n_actions": n_actions,
}, MODEL_PATH)

print(f"Model saved to {MODEL_PATH}")