| import gymnasium as gym
|
| from gymnasium import spaces
|
| import math
|
| import random
|
| from random import randrange
|
| import numpy as np
|
| import pandas as pd
|
|
|
| import torch
|
| import torch.nn as nn
|
| import torch.optim as optim
|
| import torch.nn.functional as F
|
|
|
|
|
| def _normalize_vector(vector):
|
| if type(vector) is list:
|
| vector_np = np.asarray(vector, dtype=np.float32)
|
| else:
|
| vector_np = vector
|
| sum = np.sum(vector_np)
|
| if sum < 1e-8:
|
| return vector
|
| normalized_vector = vector_np / sum
|
| return normalized_vector
|
|
|
|
|
| def _KL_divergence(a, b):
|
| epsilon = 0.00001
|
|
|
| a = np.asarray(a + epsilon, dtype=np.float32)
|
| b = np.asarray(b + epsilon, dtype=np.float32)
|
|
|
| return np.sum(np.where(a != 0, a * np.log(a / b), 0))
|
|
|
|
|
| def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
|
| """
|
| KL divergence KL(p || q)
|
| Both p and q must be valid probability distributions.
|
| """
|
| epsilon = 0.00001
|
| return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
|
|
|
|
|
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """
    Jensen-Shannon divergence between two 1D probability vectors.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence (bounded between 0 and log(2)).
    """
    # Defensively normalize both inputs before mixing them.
    p_norm = _normalize_vector(p)
    q_norm = _normalize_vector(q)

    # Midpoint (mixture) distribution.
    mixture = 0.5 * (p_norm + q_norm)

    divergence = 0.5 * _safe_kl(p_norm, mixture) + 0.5 * _safe_kl(q_norm, mixture)
    return float(divergence)
|
|
|
|
|
# --- Inventory data loading (module-level side effects) ---

# CSV with one row per screen; only the 'screen' id column is used here.
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv"
df_screen_ids = pd.read_csv(file_screen_ids)
screen_ids = list(df_screen_ids['screen'])

# Per-(screen, weekday, hour) inventory stats: household/income splits,
# impressions, price, publisher and venue-type weights.
# NOTE(review): column layout is inferred from how generate_bid_requests
# indexes the rows below -- confirm against the actual CSV header.
file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv"

df_inventory = pd.read_csv(file_inventory_last)

weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))

# Nested lookup: screens_dict[screen][weekday][hour] -> list of row values.
cols = ['screen', 'weekday', 'hour']
screens_dict = {}
for (a, b, c), values in (df_inventory.set_index(cols).apply(list, axis=1)
                          .to_dict()).items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
|
|
|
|
|
def random_screen():
    """Return one screen id drawn uniformly at random from ``screen_ids``."""
    chosen = random.choice(screen_ids)
    return chosen
|
|
|
|
|
def generate_bid_requests(num_weeks):
    """
    Build a synthetic stream of bid requests.

    For every week, weekday and hour, ten bid requests are sampled from
    randomly chosen screens; each request packages the screen's audience
    statistics (household / income / publisher / venue-type vectors), its
    hourly impression count and its price.

    Parameters
    ----------
    num_weeks : int
        Number of weeks of synthetic traffic to generate.

    Returns
    -------
    list of dict
        One dict per bid request with keys ``features``, ``household``,
        ``income``, ``publisher``, ``venue_type`` and ``price``.
    """
    bid_requests = []
    for _week in range(num_weeks):
        for weekday in weekdays:
            for hour in hours:
                for _ in range(10):
                    # Sample a random screen and look up its stats row
                    # for this (weekday, hour) slot.
                    screen_id = screen_ids[randrange(len(screen_ids))]
                    row = screens_dict[screen_id][weekday][hour]

                    # Row layout (index 6 is deliberately unused here):
                    #   0-2  household small/average/large
                    #   3-5  income low/average/high
                    #   7    impressions in this hour
                    #   8    price
                    #   9-11 publisher weights
                    #   12-14 venue-type weights
                    bid_requests.append({
                        "features": np.array([row[7]], dtype=np.float32),
                        "household": np.array(row[0:3], dtype=np.float32),
                        "income": np.array(row[3:6], dtype=np.float32),
                        "publisher": np.array(row[9:12], dtype=np.float32),
                        "venue_type": np.array(row[12:15], dtype=np.float32),
                        "price": row[8],
                    })
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
|
|
|
|
|
class DspCampaign100Env(gym.Env):
    """
    Minimal DSP (demand-side platform) RL environment.

    - One episode = one campaign
    - One step = one bid request

    The agent observes the gap between desired and achieved audience
    distributions plus budget/pacing signals, and decides whether to
    bid (action 1) or skip (action 0) the current bid request.
    """

    metadata = {"render_modes": []}

    def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
        """
        Parameters
        ----------
        bid_requests : list of dict
            Bid-request stream; each dict carries "features", "price" and
            one vector per distribution key.
        desired_distributions : dict
            Target audience distributions keyed e.g. by "publisher",
            "venue_type", "household".  Normalized internally; the
            caller's dict is NOT modified (the original mutated it
            in place).
        budget : float
            Total campaign budget.
        impression_max : float
            Impression scale factor (stored for callers; not used
            internally at the moment).
        price_max : float
            Scale factor converting normalized bid prices back to
            currency units.
        """
        super().__init__()

        self.bid_requests = bid_requests

        # Build a NEW dict of normalized targets so the caller's input
        # is left untouched.
        normalized = {}
        self.distribution_dim = 0
        for key, dist in desired_distributions.items():
            dist_norm = _normalize_vector(dist)
            normalized[key] = dist_norm
            self.distribution_dim += len(dist_norm)
        self.desired_distributions = normalized

        self.initial_budget = budget
        self.impression_max = impression_max
        self.price_max = price_max

        # Action: 0 = skip the bid request, 1 = bid on it.
        self.action_space = spaces.Discrete(2)

        # Observation layout (see _get_observation):
        #   per-key distribution gaps, [budget_ratio, time_ratio,
        #   pacing_diff], raw bid features, the bid's own distribution
        #   vectors, and a scalar alignment score.
        self.bid_feat_dim = 1

        obs_dim = (
            self.distribution_dim      # desired-vs-current gaps
            + 3                        # budget / time / pacing ratios
            + self.bid_feat_dim        # raw bid features
            + self.distribution_dim    # bid's distribution vectors
            + 1                        # alignment score
        )

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32,
        )

        self.reset()

    def reset(self, seed=None, options=None):
        """Start a new campaign: full budget, zeroed achieved counts."""
        super().reset(seed=seed)

        self.step_idx = 0
        self.budget_left = self.initial_budget
        self.current_distributions = {}

        for key in self.desired_distributions:
            # Raw (unnormalized) playout counts per category.
            self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)

        obs = self._get_observation()
        info = {}

        return obs, info

    def reset_bid_requests(self, bid_requests):
        """Swap in a fresh bid-request stream without resetting campaign state."""
        self.bid_requests = bid_requests

    def get_action_mask(self):
        """Return True when the remaining budget can cover the current bid."""
        bid = self.bid_requests[self.step_idx]
        cost = bid["price"] * self.price_max

        can_bid = not (
            self.budget_left - cost <= 0
        )

        return can_bid

    def step(self, action):
        """
        Process one bid request.

        action == 1 bids (if affordable): the budget is charged and the
        agent earns an alignment reward for moving the achieved
        distributions toward the targets, minus a pacing penalty when
        spending runs ahead of schedule.  action == 0 (or an
        unaffordable bid) incurs a small penalty whenever pacing is off
        in either direction.

        Returns the gymnasium 5-tuple
        (obs, reward, terminated, truncated, info).
        """
        assert self.action_space.contains(action)

        done = False

        bid = self.bid_requests[self.step_idx]
        cost = bid["price"] * self.price_max

        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        # Positive when budget remains ahead of elapsed time (underspending),
        # negative when budget is draining faster than time (overspending).
        pacing_diff = budget_ratio - time_ratio

        reward = 0.0

        if action == 1 and self.budget_left >= cost:
            self.budget_left -= cost

            total_alignment_reward = 0.0

            # Ramp the alignment reward in slowly: with few playouts the
            # empirical distribution is too noisy to score against.
            first_key = list(self.desired_distributions.keys())[0]
            total_playouts_so_far = np.sum(self.current_distributions[first_key])
            stats_warmup_count = 100.0
            warmup_fraction = total_playouts_so_far / stats_warmup_count
            startup_factor = min(1.0, warmup_fraction ** 3)

            for key in self.desired_distributions:
                self.current_distributions[key] += bid[key]

                current_total = np.sum(self.current_distributions[key])
                if current_total > 0:
                    current_dist_norm = self.current_distributions[key] / current_total
                else:
                    current_dist_norm = np.zeros_like(self.desired_distributions[key])

                # Positive entries: categories still under-delivered.
                gap = self.desired_distributions[key] - current_dist_norm

                # Reward bids whose audience vector points into the gap.
                alignment = np.dot(gap, bid[key])

                total_alignment_reward += alignment * 10.0 * startup_factor

            print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'], self.desired_distributions['household'])
            print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'], self.current_distributions['household'])
            print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'], bid['household'])

            reward += total_alignment_reward

            # Strong penalty for bidding while already overspending.
            if pacing_diff < -0.005:
                reward -= 5.0

        else:
            # Not bidding: small penalty when pacing is off either way,
            # to discourage both hoarding and runaway spending.
            if pacing_diff > 0.02:
                reward -= 0.5
            elif pacing_diff < -0.005:
                reward -= 0.5

        self.step_idx += 1

        if self.step_idx >= len(self.bid_requests) - 1:
            done = True

            # Penalize budget left on the table at campaign end.
            unspent_ratio = self.budget_left / self.initial_budget
            reward -= unspent_ratio * 50.0

        print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)

        obs = self._get_observation()
        info = {}

        return obs, reward, done, False, info

    def _get_observation(self):
        """
        Assemble the observation vector for the current bid request.

        Layout: per-key distribution gaps, then
        [budget_ratio, time_ratio, pacing_diff], the bid's raw features,
        the bid's own distribution vectors, and a scalar alignment score
        (sum over keys of gap . bid-vector).
        """
        bid = self.bid_requests[self.step_idx]

        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)

        gap_flat = []
        bid_distribution_flat = []

        alignment_score = 0.0

        for key in self.desired_distributions:
            current_counts = self.current_distributions[key]
            total = np.sum(current_counts)
            if total > 0:
                current_norm = current_counts / total
            else:
                current_norm = np.zeros_like(current_counts)

            desired = self.desired_distributions[key]
            gap = desired - current_norm

            gap_flat.extend(gap.tolist())
            bid_distribution_flat.extend(bid[key])

            alignment_score += np.dot(gap, bid[key])

        obs = np.concatenate([
            np.array(gap_flat, dtype=np.float32),
            np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
            bid["features"],
            np.array(bid_distribution_flat, dtype=np.float32),
            np.array([alignment_score], dtype=np.float32)
        ])

        return obs.astype(np.float32)
|
|
|
|
|
class DQN(nn.Module):
    """Three-layer MLP mapping observation vectors to per-action Q-values."""

    def __init__(self, n_observations, n_actions):
        """
        Parameters
        ----------
        n_observations : int
            Size of the flat observation vector.
        n_actions : int
            Number of discrete actions (output Q-values).
        """
        super().__init__()
        # Attribute names layer1/2/3 are part of the checkpoint's
        # state_dict keys and must not change.
        hidden = 128
        self.layer1 = nn.Linear(n_observations, hidden)
        self.layer2 = nn.Linear(hidden, hidden)
        self.layer3 = nn.Linear(hidden, n_actions)

    def forward(self, x):
        """Return a (batch, n_actions) tensor of Q-values for batch x."""
        hidden1 = F.relu(self.layer1(x))
        hidden2 = F.relu(self.layer2(hidden1))
        q_values = self.layer3(hidden2)
        return q_values
|
|
|
# Path to the trained DQN checkpoint (stores the state dict plus the
# network dimensions needed to rebuild it).
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model_040_150_4.pt"
# Prefer GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# weights_only=False unpickles arbitrary objects from the checkpoint --
# only load checkpoints from trusted sources.
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)

# Rebuild the network with the same input/output sizes it was trained with.
policy_net = DQN(
    checkpoint["n_observations"],
    checkpoint["n_actions"]
).to(device)

policy_net.load_state_dict(checkpoint["model_state_dict"])
print("Model architecture loaded successfully")
# Inference only: switch off training-mode behavior (dropout/batch-norm).
policy_net.eval()
print("Model weights loaded successfully")

print("Model loaded successfully")
|
|
|
def choose_action(model, observation):
    """
    Pick the greedy action: the argmax over the model's Q-values.

    Parameters
    ----------
    model : nn.Module
        Trained Q-network mapping observations to per-action Q-values.
    observation : array-like
        Flat observation vector from the environment.

    Returns
    -------
    int
        Index of the action with the highest Q-value.
    """
    with torch.no_grad():
        # Add a batch dimension and move to the module-level device.
        state = torch.tensor(
            observation,
            dtype=torch.float32,
            device=device
        ).unsqueeze(0)

        q_values = model(state)
        print(f"Q-values: {q_values.cpu().numpy()}")
        best_action = q_values.argmax(dim=1).item()

    return best_action
|
|
|
# --- Evaluation driver: roll the trained policy through one campaign ---

# Campaign budget and the normalization constants; presumably these must
# match the values used during training of the loaded checkpoint -- confirm.
budget = 10
impression_max=11.888
price_max=0.118

# Target audience mix for the campaign (each normalized to sum to 1).
desired_household_vector = _normalize_vector([0.5, 0.3, 0.2])
desired_publiser_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
env = DspCampaign100Env(generate_bid_requests(3),
                        desired_distributions={"publisher": desired_publiser_vector,
                                               "venue_type": desired_venue_type_vector,
                                               "household": desired_household_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)

state, _ = env.reset()

# Accumulate the episode return while stepping the greedy policy.
sum_reward = 0.0
while True:
    action = choose_action(policy_net, state)

    state, reward, terminated, truncated, _ = env.step(action)

    # Guard against NaN rewards so one bad step does not poison the total.
    if not math.isnan(reward):
        sum_reward = sum_reward + reward

    if terminated or truncated:
        print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
        print("############# sum_reward:", sum_reward)
        print("############# Desire distributions:", env.desired_distributions)
        print("############# Real distributions:", env.current_distributions)
        break
|
|
|
|
|
|
|
|
|
|
|