| import gymnasium as gym
|
| from gymnasium import spaces
|
| import math
|
| import random
|
| from random import randrange
|
| import numpy as np
|
| import pandas as pd
|
|
|
| import torch
|
| import torch.nn as nn
|
| import torch.optim as optim
|
| import torch.nn.functional as F
|
|
|
|
|
| def _normalize_vector(vector):
|
| if type(vector) is list:
|
| vector_np = np.asarray(vector, dtype=np.float32)
|
| else:
|
| vector_np = vector
|
| sum = np.sum(vector_np)
|
| if sum < 1e-8:
|
| return vector
|
| normalized_vector = vector_np / sum
|
| return normalized_vector
|
|
|
|
|
| def _KL_divergence(a, b):
|
| epsilon = 0.00001
|
|
|
| a = np.asarray(a + epsilon, dtype=np.float32)
|
| b = np.asarray(b + epsilon, dtype=np.float32)
|
|
|
| return np.sum(np.where(a != 0, a * np.log(a / b), 0))
|
|
|
|
|
| def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
|
| """
|
| KL divergence KL(p || q)
|
| Both p and q must be valid probability distributions.
|
| """
|
| epsilon = 0.00001
|
| return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
|
|
|
|
|
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """
    Compute Jensen–Shannon divergence between two 1D probability vectors.

    Both inputs are normalized first, then the standard symmetric form
    JS(p, q) = 0.5 * KL(p || m) + 0.5 * KL(q || m) with m = (p + q) / 2
    is evaluated via the epsilon-smoothed `_safe_kl`.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence (bounded between 0 and log(2)).
    """
    p, q = _normalize_vector(p), _normalize_vector(q)
    mixture = 0.5 * (p + q)
    divergence = 0.5 * (_safe_kl(p, mixture) + _safe_kl(q, mixture))
    return float(divergence)
|
|
|
|
|
# --- Inventory data loading (module-level side effects) ---
# NOTE(review): hard-coded absolute Windows paths; consider making these
# configurable (env var / CLI argument).
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv"
df_screen_ids = pd.read_csv(file_screen_ids)
# All known screen ids, taken from the 'screen' column of the screens CSV.
screen_ids = list(df_screen_ids['screen'])

file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv"

df_inventory = pd.read_csv(file_inventory_last)

# Calendar dimensions used when generating synthetic bid requests.
weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))

# Build a nested lookup: screens_dict[screen][weekday][hour] -> list of the
# remaining row values for that slot (consumed positionally by
# generate_bid_requests; presumably household/income shares, impressions,
# price, publisher and venue-type shares — confirm against the CSV schema).
cols = ['screen', 'weekday', 'hour']
screens_dict = {}
for (a, b, c), values in (df_inventory.set_index(cols).apply(list, axis=1)
                          .to_dict()).items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
|
|
|
|
|
def random_screen():
    """Return a uniformly random screen id from the module-level `screen_ids`."""
    chosen = random.choice(screen_ids)
    return chosen
|
|
|
|
|
def generate_bid_requests(num_weeks, bids_per_hour=10):
    """Generate synthetic bid requests.

    Iterates week x weekday x hour and draws `bids_per_hour` random screens
    per slot, looking each up in the module-level `screens_dict`.

    Parameters
    ----------
    num_weeks : int
        Number of weeks to simulate (7 days x 24 hours each).
    bids_per_hour : int, optional
        Bid requests generated per (weekday, hour) slot. Defaults to 10,
        matching the previously hard-coded value.

    Returns
    -------
    list[dict]
        One dict per bid request with float32-array values under
        "features", "household", "income", "publisher", "venue_type"
        and a scalar "price".
    """
    bid_requests = []
    for week_index in range(num_weeks):
        for weekday in weekdays:
            for hour in hours:
                for bid_index in range(bids_per_hour):
                    # Same RNG usage as before: one randrange per bid.
                    screen_id = screen_ids[randrange(len(screen_ids))]
                    data = screens_dict[screen_id][weekday][hour]
                    # Positional layout of `data`:
                    #   [0:3] household small/average/large
                    #   [3:6] income low/average/high
                    #   [7]   impressionHour, [8] price
                    #   [9:12] publisher shares, [12:15] venue-type shares
                    # NOTE(review): data[6] is intentionally skipped here —
                    # confirm against the inventory CSV schema.
                    bid_requests.append({
                        "features": np.array([data[7]], dtype=np.float32),
                        "household": np.array(data[0:3], dtype=np.float32),
                        "income": np.array(data[3:6], dtype=np.float32),
                        "publisher": np.array(data[9:12], dtype=np.float32),
                        "venue_type": np.array(data[12:15], dtype=np.float32),
                        "price": data[8],
                    })
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
|
|
|
|
|
class DspCampaign100Env(gym.Env):
    """
    Minimal DSP RL environment:
    - One episode = one campaign
    - One step = one bid request

    Action 1 buys the current bid request (if budget allows); action 0
    skips it. Observations concatenate the gap between desired and current
    exposure distributions, pacing signals, the bid's features, and the
    bid's own distribution vectors.
    """

    metadata = {"render_modes": []}

    def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
        """Store campaign parameters, build spaces, and reset.

        Parameters
        ----------
        bid_requests : list[dict]
            Sequence of bid requests; one is consumed per step().
        desired_distributions : dict[str, array-like]
            Target exposure distribution per tracked key (e.g. "publisher",
            "venue_type"). NOTE(review): this dict is normalized IN PLACE,
            mutating the caller's argument — confirm this is intended.
        budget : float
            Total campaign budget.
        impression_max : float
            Impression normalization constant (stored; not used in this class).
        price_max : float
            Price scale; a bid's cost is bid["price"] * price_max.
        """
        super().__init__()

        self.bid_requests = bid_requests
        # Total length of all desired distributions concatenated; used twice
        # in obs_dim (once for the gap vector, once for the bid's vectors).
        self.distribution_dim = 0
        for key in desired_distributions:
            dist = desired_distributions[key]
            dist2 = _normalize_vector(dist)
            desired_distributions[key] = dist2
            self.distribution_dim += len(dist2)
        self.desired_distributions = desired_distributions
        self.initial_budget = budget
        self.impression_max = impression_max
        self.price_max = price_max

        # Two discrete actions: 0 = skip, 1 = buy.
        self.action_space = spaces.Discrete(2)

        # Number of scalar entries in bid["features"].
        self.bid_feat_dim = 1

        # Layout: gap vector + [budget_ratio, time_ratio, budget-time gap]
        # + bid features + bid distribution vectors.
        obs_dim = (
            self.distribution_dim
            + 3
            + self.bid_feat_dim
            + self.distribution_dim
        )

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32,
        )

        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new campaign episode; return (observation, info)."""
        super().reset(seed=seed)

        self.step_idx = 0
        self.budget_left = self.initial_budget
        # Running (unnormalized) sums of bought bids' distribution vectors,
        # one zero vector per tracked key.
        self.current_distributions = {}
        for key in self.desired_distributions:
            self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)

        obs = self._get_observation()
        info = {}
        return obs, info

    def reset_bid_requests(self, bid_requests):
        # Swap in a fresh bid-request stream (e.g. a new synthetic batch)
        # without resetting budget/step counters.
        self.bid_requests = bid_requests

    def step(self, action):
        """Process one bid request.

        Returns the Gymnasium 5-tuple (obs, reward, terminated, truncated,
        info); `truncated` is always False here.
        """
        assert self.action_space.contains(action)

        done = False

        bid = self.bid_requests[self.step_idx]

        budget_ratio = self.budget_left / self.initial_budget
        # Fraction of the episode still remaining.
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        cost = bid["price"] * self.price_max

        reward = 0.0

        if action == 1 and self.budget_left >= cost:
            self.budget_left -= cost

            # Distance to the desired distributions BEFORE this purchase
            # (sum of L2 norms over all tracked keys).
            prev_dist = 0.0
            for key in self.desired_distributions:
                prev_current = _normalize_vector(self.current_distributions[key])
                prev_dist += np.linalg.norm(self.desired_distributions[key] - prev_current)

            # Accumulate the bought bid's vectors into the running totals.
            for key in self.desired_distributions:
                self.current_distributions[key] += bid[key]

            # Debug traces; consider removing or switching to logging.
            print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'])
            print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'])
            print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'])

            # Distance AFTER the purchase.
            new_dist = 0.0
            for key in self.desired_distributions:
                new_current = _normalize_vector(self.current_distributions[key])
                new_dist += np.linalg.norm(self.desired_distributions[key] - new_current)

            # Positive when the purchase moved us closer to the target mix.
            dist_improvement = (prev_dist - new_dist)

            demo_reward = dist_improvement * 50.0

            reward += demo_reward
        else:
            # No purchase: small penalty proportional to how far budget
            # spending deviates from uniform pacing over the episode.
            pacing_gap = time_ratio - budget_ratio

            reward += -abs(pacing_gap) * 0.5

        print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)

        self.step_idx += 1

        # Terminate one request early so _get_observation below can still
        # index bid_requests[self.step_idx] without going out of range.
        if self.step_idx >= len(self.bid_requests) - 1:
            done = True

            # Terminal bonus: higher when the final mix is close to target.
            final_dist = 0.0
            for key in self.desired_distributions:
                final_current = _normalize_vector(self.current_distributions[key])
                final_dist += np.linalg.norm(self.desired_distributions[key] - final_current)
            reward += (1.0 - final_dist) * 10

        obs = self._get_observation()
        info = {}

        return obs, reward, done, False, info

    def _get_observation(self):
        """Build the flat float32 observation for the current step."""
        bid = self.bid_requests[self.step_idx]

        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)

        # NOTE(review): current_distributions_flat and
        # desired_distributions_flat are populated but never used below —
        # only gap_flat and bid_distribution_flat enter the observation.
        current_distributions_flat = []
        desired_distributions_flat = []
        gap_flat = []
        bid_distribution_flat = []
        for key in self.desired_distributions:
            current = self.current_distributions[key]
            current = _normalize_vector(current)
            current_distributions_flat.extend(current)

            desired = _normalize_vector(self.desired_distributions[key])
            desired_distributions_flat.extend(self.desired_distributions[key])

            # Signed per-bucket gap between desired and current mix.
            gap = desired - current
            gap_flat.extend(gap.tolist())
            bid_distribution_flat.extend(bid[key])

        obs = np.concatenate([
            np.array(gap_flat, dtype=np.float32),
            np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
            bid["features"],
            np.array(bid_distribution_flat, dtype=np.float32),
        ])

        return obs.astype(np.float32)

    def _get_reward(self, distributions) -> float:
        """
        Compute reward based on JS divergence for each tracked distribution
        key in `desired_distributions`.

        Reward is NEGATIVE divergence (because we want to minimize divergence).

        Returns
        -------
        float
            Reward value (higher is better).
        """
        # One JS-divergence term per tracked key.
        groups = []
        for key in self.desired_distributions:
            a = _normalize_vector(distributions[key])
            js_1 = _jensen_shannon_divergence(
                self.desired_distributions[key],
                a,
            )
            groups.append((js_1))

        total_divergence = sum(groups)

        reward = -total_divergence * 10

        return reward
|
|
|
|
|
class DQN(nn.Module):
    """Three-layer MLP mapping an observation vector to per-action Q-values.

    Layer attribute names (`layer1`..`layer3`) are part of the checkpoint
    contract: state_dict keys depend on them.
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        hidden = 128
        self.layer1 = nn.Linear(n_observations, hidden)
        self.layer2 = nn.Linear(hidden, hidden)
        self.layer3 = nn.Linear(hidden, n_actions)

    def forward(self, x):
        activations = F.relu(self.layer1(x))
        activations = F.relu(self.layer2(activations))
        q_values = self.layer3(activations)
        return q_values
|
|
|
# --- Load a previously trained DQN checkpoint ---
# NOTE(review): hard-coded absolute Windows path.
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model_036_250_GOOD_3_002.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SECURITY: weights_only=False unpickles arbitrary objects from the file —
# only load checkpoints from a trusted source.
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)

# Rebuild the network with the same dimensions it was trained with.
policy_net = DQN(
    checkpoint["n_observations"],
    checkpoint["n_actions"]
).to(device)

policy_net.load_state_dict(checkpoint["model_state_dict"])
print("Model architecture loaded successfully")
policy_net.eval()  # inference mode: disables training-only layer behavior
print("Model weights loaded successfully")

print("Model loaded successfully")
|
|
|
def choose_action(model, observation):
    """Return the greedy (argmax-Q) action for a single observation.

    Wraps the observation in a batch of one on the module-level `device`,
    evaluates the model without gradients, and picks the best action index.
    """
    with torch.no_grad():
        state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        q_values = model(state)
        return q_values.argmax(dim=1).item()
|
|
|
# --- Evaluation: run one campaign episode with the loaded policy ---
budget = 10
impression_max=11.888  # impression normalization constant — presumably the dataset max, confirm
price_max=0.118        # price normalization constant — presumably the dataset max, confirm

# Target exposure mixes (normalized to sum to 1).
# NOTE(review): "publiser" is a typo kept for compatibility with this script.
desired_publiser_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
env = DspCampaign100Env(generate_bid_requests(2),
                        desired_distributions={"publisher": desired_publiser_vector, "venue_type": desired_venue_type_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)

state, _ = env.reset()

sum_reward = 0.0
while True:
    action = choose_action(policy_net, state)

    state, reward, terminated, truncated, _ = env.step(action)

    # Guard against NaN rewards leaking into the running total.
    if not math.isnan(reward):
        sum_reward = sum_reward + reward

    if terminated or truncated:
        # Episode summary: negative JS-divergence score, budget usage,
        # accumulated reward, and target vs achieved distributions.
        print("############# Final distance:", env._get_reward(env.current_distributions))
        print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
        print("############# sum_reward:", sum_reward)
        print("############# Desire distributions:", env.desired_distributions)
        print("############# Real distributions:", env.current_distributions)
        break
|
|
|
|
|
|
|
|
|
|
|