# DSP_Bidder_4_rules / dsp_bidder_4_inference.py
# Uploaded by StanislavKo28 ("Upload 6 files", commit 4c313fa, verified)
import gymnasium as gym
from gymnasium import spaces
import math
import random
from random import randrange
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
def _normalize_vector(vector):
if type(vector) is list:
vector_np = np.asarray(vector, dtype=np.float32)
else:
vector_np = vector
sum = np.sum(vector_np)
if sum < 1e-8:
return vector
normalized_vector = vector_np / sum
return normalized_vector
def _KL_divergence(a, b):
epsilon = 0.00001
a = np.asarray(a + epsilon, dtype=np.float32)
b = np.asarray(b + epsilon, dtype=np.float32)
return np.sum(np.where(a != 0, a * np.log(a / b), 0))
def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
"""
KL divergence KL(p || q)
Both p and q must be valid probability distributions.
"""
epsilon = 0.00001
return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """Jensen-Shannon divergence between two 1-D probability vectors.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence, symmetric in p and q and bounded by log(2).
    """
    # Re-normalize defensively so both inputs are proper distributions.
    p_norm = _normalize_vector(p)
    q_norm = _normalize_vector(q)
    # JS(p, q) = 0.5 * KL(p || m) + 0.5 * KL(q || m), m = midpoint mixture.
    mixture = 0.5 * (p_norm + q_norm)
    js = 0.5 * _safe_kl(p_norm, mixture) + 0.5 * _safe_kl(q_norm, mixture)
    return float(js)
# --- Module-level data loading (runs at import time) ---
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv" # here 1500 screen ids (Strings)
df_screen_ids = pd.read_csv(file_screen_ids)
# Flat list of all known screen ids; sampled uniformly when generating bids.
screen_ids = list(df_screen_ids['screen'])
file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv" # the sample from CSV file is below:
# screen,weekday,hour,householdSmall,householdAverage,householdLarge,incomeLow,incomeAverage,incomeHigh,impressionMax,impressionHour,price,publisher1,publisher2,publisher3,venueType1,venueType2,venueType3
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,15,0.894,0.0,0.447,0.0,0.894,0.447,6.0,0.399,0.398,1.0,0.0,0.0,0.0,1.0,0.0
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,16,0.989,0.0,0.141,0.0,1.0,0.0,6.0,0.384,0.381,1.0,0.0,0.0,0.0,1.0,0.0
df_inventory = pd.read_csv(file_inventory_last)
weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))
cols = ['screen', 'weekday', 'hour']
# Build a nested lookup: screens_dict[screen][weekday][hour] -> list of the
# remaining numeric columns (household, income, impressionMax, impressionHour,
# price, publisher 1-3, venue type 1-3), in CSV column order.
screens_dict = {}
for (a, b, c), values in (df_inventory.set_index(cols).apply(list, axis=1)
                          .to_dict()).items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
# print(screens_dict)
def random_screen():
    """Return a uniformly random screen id from the global pool."""
    return screen_ids[randrange(len(screen_ids))]
def generate_bid_requests(num_weeks):
    """Generate synthetic bid requests.

    Produces num_weeks * 7 weekdays * 24 hours * 10 bids per hour, each
    built from a randomly sampled screen's inventory row for that slot.

    Parameters
    ----------
    num_weeks : int
        How many simulated weeks of traffic to generate.

    Returns
    -------
    list[dict]
        Each dict carries float32 vectors ("features", "household",
        "income", "publisher", "venue_type") plus a scalar "price".
    """
    bid_requests = []
    for _ in range(num_weeks):
        for weekday in weekdays:
            for hour in hours:
                for _ in range(10):
                    # Uniform screen pick; same RNG consumption as choice().
                    screen_id = screen_ids[randrange(len(screen_ids))]
                    row = screens_dict[screen_id][weekday][hour]
                    # Row layout (index): 0-2 household, 3-5 income,
                    # 6 impressionMax (unused), 7 impressionHour, 8 price,
                    # 9-11 publisher one-hot, 12-14 venue-type one-hot.
                    bid_requests.append({
                        # Screen/weekday/hour indices were deliberately
                        # dropped from the feature vector.
                        "features": np.array([row[7]], dtype=np.float32),
                        "household": np.array(row[0:3], dtype=np.float32),
                        "income": np.array(row[3:6], dtype=np.float32),
                        "publisher": np.array(row[9:12], dtype=np.float32),
                        "venue_type": np.array(row[12:15], dtype=np.float32),
                        "price": row[8],
                    })
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
class DspCampaign100Env(gym.Env):
    """
    Minimal DSP RL environment:
    - One episode = one campaign
    - One step = one bid request

    Observation (see _get_observation): per-distribution gap vectors
    (desired share - current share), three pacing scalars, the bid's
    scalar features, the bid's distribution vectors, and a precomputed
    gap/bid alignment score.
    Action: Discrete(2) -- 0 = no bid, 1 = bid.
    """
    metadata = {"render_modes": []}
    def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
        # bid_requests: list of dicts as produced by generate_bid_requests.
        # desired_distributions: dict of target category vectors; normalized
        #   in place below (NOTE: mutates the caller's dict).
        # budget: total campaign budget for one episode.
        # impression_max / price_max: de-normalization scales; only
        #   price_max is used (to turn the normalized price into a cost).
        super().__init__()
        # ----------------------------
        # Environment data
        # ----------------------------
        self.bid_requests = bid_requests # list of dicts (one per step)
        self.distribution_dim = 0
        # Normalize every target distribution and accumulate the total
        # flattened length (used twice when sizing the observation).
        for key in desired_distributions:
            dist = desired_distributions[key]
            dist2 = _normalize_vector(dist)
            desired_distributions[key] = dist2
            self.distribution_dim += len(dist2)
        self.desired_distributions = desired_distributions
        self.initial_budget = budget
        self.impression_max = impression_max
        self.price_max = price_max
        # ----------------------------
        # Action space
        # ----------------------------
        # 0 = no bid, 1 = bid
        self.action_space = spaces.Discrete(2)
        # ----------------------------
        # Observation space
        # ----------------------------
        # [current_demo(6), desired_demo(6), budget_ratio, time_ratio,
        #  bid_request_features...]
        self.bid_feat_dim = 1  # must match len(bid_request["features"])
        obs_dim = (
            self.distribution_dim
            + 3 # campaign progress: budget_ratio, time_ratio, budget_ratio - time_ratio
            + self.bid_feat_dim
            + self.distribution_dim # bid features related to distributions (e.g. publisher, venue_type)
            + 1 # alignment score (dot product of gap and bid)
        )
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32,
        )
        self.reset()
    # ----------------------------
    # Reset episode
    # ----------------------------
    def reset(self, seed=None, options=None):
        """Start a new campaign: full budget, zeroed distribution counts.

        Returns (observation, info) per the Gymnasium API.
        """
        super().reset(seed=seed)
        self.step_idx = 0
        self.budget_left = self.initial_budget
        # Raw (unnormalized) counts of won impressions per category.
        self.current_distributions = {}
        # self.current_demo = np.zeros(self.demo_dim, dtype=np.float32)
        for key in self.desired_distributions:
            # print("key", key, "desired_distributions[key]", type(self.desired_distributions[key]))
            self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)
        obs = self._get_observation()
        info = {}
        return obs, info
    def reset_bid_requests(self, bid_requests):
        # Swap in a fresh bid-request stream (e.g. per evaluation run).
        self.bid_requests = bid_requests
    def get_action_mask(self):
        """Return True when bidding on the current request is affordable."""
        bid = self.bid_requests[self.step_idx]
        # price is normalized; price_max rescales it to actual cost.
        cost = bid["price"] * self.price_max
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        # do not allow spend if it violates pacing envelope
        can_bid = not (
            # budget_ratio < time_ratio - 0.03 or
            self.budget_left - cost <= 0
        )
        # action 0 always allowed
        # return np.array([1, int(can_bid)], dtype=np.float32)
        return can_bid
    # ----------------------------
    # Step
    # ----------------------------
    def step(self, action):
        """Process one bid request.

        Reward shaping: for a winning bid, reward is the sum over all
        tracked categories of dot(gap, bid_vector) * 10, faded in by a
        cubic warm-up factor over the first ~100 playouts; pacing
        penalties apply on both bid and no-bid branches, and a final
        unspent-budget penalty is added at episode end.
        Returns (obs, reward, terminated, truncated, info).
        """
        assert self.action_space.contains(action)
        done = False
        bid = self.bid_requests[self.step_idx]
        cost = bid["price"] * self.price_max
        # Pacing calculation
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        # Positive pacing_diff = budget remaining ahead of schedule.
        pacing_diff = budget_ratio - time_ratio
        # ----------------------------
        # Apply action
        # ----------------------------
        reward = 0.0
        if action == 1 and self.budget_left >= cost:
            self.budget_left -= cost
            # --- ENHANCED REWARD CALCULATION ---
            # Instead of global distance diff, we calculate the "alignment" of this specific bid
            # with the specific needs of the campaign right now.
            total_alignment_reward = 0.0
            # All categories share the same playout count; use the first.
            first_key = list(self.desired_distributions.keys())[0]
            total_playouts_so_far = np.sum(self.current_distributions[first_key])
            stats_warmup_count = 100.0
            x = total_playouts_so_far / stats_warmup_count
            # Cubic ramp: early, noisy statistics contribute little reward.
            y = x ** 3
            starup_factor = min(1.0, y)
            for key in self.desired_distributions:
                # 1. Update distribution counts
                self.current_distributions[key] += bid[key]
                # 2. Calculate Gap (Desired %) - (Current %)
                #    We need to normalize current counts to get percentages
                current_total = np.sum(self.current_distributions[key])
                if current_total > 0:
                    current_dist_norm = self.current_distributions[key] / current_total
                else:
                    current_dist_norm = np.zeros_like(self.desired_distributions[key])
                gap = self.desired_distributions[key] - current_dist_norm
                # 3. Alignment Score: Dot product of Gap vector and Bid vector
                #    If Gap is [0.1, -0.1] (we need index 0, have too much index 1)
                #    And Bid is [1, 0] -> dot product is 0.1 (Positive reward)
                #    And Bid is [0, 1] -> dot product is -0.1 (Negative reward)
                alignment = np.dot(gap, bid[key])
                # Scale up to make it significant for the optimizer
                total_alignment_reward += alignment * 10.0 * starup_factor
            print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'], self.desired_distributions['household'])
            print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'], self.current_distributions['household'])
            print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'], bid['household'])
            reward += total_alignment_reward
            # Penalize overspending slightly if we are ahead of schedule
            if pacing_diff < -0.005: # We have spent too much relative to time
                reward -= 5.0
        else:
            # Action = 0 (No Bid)
            # If we are falling behind schedule (budget_ratio > time_ratio),
            # we should be bidding. Penalize passing.
            if pacing_diff > 0.02:
                reward -= 0.5 # Penalty for holding budget when behind schedule
            elif pacing_diff < -0.005:
                reward -= 0.5 # NOTE(review): original comment said "small positive reward
                              # for saving budget" but this subtracts -- confirm intent.
        # ----------------------------
        # Advance time
        # ----------------------------
        self.step_idx += 1
        # NOTE(review): terminates at len-1, so the final bid request is
        # never acted upon -- confirm this off-by-one is intentional.
        if self.step_idx >= len(self.bid_requests) - 1:
            done = True
            # Final penalty for unspent budget
            unspent_ratio = self.budget_left / self.initial_budget
            reward -= unspent_ratio * 50.0
        print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)
        obs = self._get_observation()
        info = {}
        return obs, reward, done, False, info
    # ----------------------------
    # Observation builder
    # ----------------------------
    def _get_observation(self):
        """Assemble the flat float32 observation for the current step."""
        bid = self.bid_requests[self.step_idx]
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        gap_flat = []
        bid_distribution_flat = []
        # New feature: Total Alignment Score
        # This helps the neural net "see" immediately if a bid is useful
        # without doing complex internal math.
        alignment_score = 0.0
        for key in self.desired_distributions:
            current_counts = self.current_distributions[key]
            total = np.sum(current_counts)
            if total > 0:
                current_norm = current_counts / total
            else:
                current_norm = np.zeros_like(current_counts)
            desired = self.desired_distributions[key]
            gap = desired - current_norm
            gap_flat.extend(gap.tolist())
            bid_distribution_flat.extend(bid[key])
            # Calculate alignment for this specific feature
            alignment_score += np.dot(gap, bid[key])
        obs = np.concatenate([
            np.array(gap_flat, dtype=np.float32),
            np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
            bid["features"],
            np.array(bid_distribution_flat, dtype=np.float32),
            np.array([alignment_score], dtype=np.float32) # Add explicit helper feature
        ])
        # print("obs", obs)
        return obs.astype(np.float32)
class DQN(nn.Module):
    """Three-layer MLP Q-network: obs -> 128 -> 128 -> action values.

    Attribute names layer1/layer2/layer3 are part of the checkpoint
    contract (state_dict keys) and must not be renamed.
    """
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Map a batch of observations to per-action Q-values.

        Accepts a single state (shape [obs]) or a batch ([B, obs]);
        returns one Q-value per action, no activation on the output.
        """
        hidden = F.relu(self.layer2(F.relu(self.layer1(x))))
        return self.layer3(hidden)
# --- Load the trained policy network from a local checkpoint ---
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model_040_150_4.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load checkpoint
# NOTE(review): weights_only=False unpickles arbitrary Python objects --
# only safe because this checkpoint is produced locally, never untrusted.
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)
# Recreate model with the dimensions stored alongside the weights.
policy_net = DQN(
    checkpoint["n_observations"],
    checkpoint["n_actions"]
).to(device)
policy_net.load_state_dict(checkpoint["model_state_dict"])
print("Model architecture loaded successfully")
policy_net.eval()  # VERY IMPORTANT (turns off dropout/batchnorm if any)
print("Model weights loaded successfully")
print("Model loaded successfully")
def choose_action(model, observation):
    """Greedy action selection: index of the largest predicted Q-value.

    Parameters
    ----------
    model : nn.Module
        Trained Q-network mapping observations to per-action values.
    observation : array-like
        Current environment observation (1-D).

    Returns
    -------
    int
        0 (no bid) or 1 (bid).
    """
    with torch.no_grad():
        # Add a batch dimension; as_tensor avoids an extra copy when the
        # observation is already a compatible array.
        state = torch.as_tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        q_values = model(state)
        print(f"Q-values: {q_values.cpu().numpy()}")
        return q_values.argmax(dim=1).item()
# --- Inference driver: run the trained policy over a synthetic campaign ---
budget = 10
impression_max=11.888
price_max=0.118
# Target category mixes for the campaign (normalized defensively).
desired_household_vector = _normalize_vector([0.5, 0.3, 0.2])
# NOTE(review): "publiser" is a typo but used consistently below.
desired_publiser_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
env = DspCampaign100Env(generate_bid_requests(3),
                        desired_distributions={"publisher": desired_publiser_vector,
                                               "venue_type": desired_venue_type_vector,
                                               "household": desired_household_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)
state, _ = env.reset()
sum_reward = 0.0
while True:
    action = choose_action(policy_net, state)
    # Here instead of env.step, in production:
    # if action == 1:
    #     submit bid to DSP
    # else:
    #     skip
    state, reward, terminated, truncated, _ = env.step(action)
    # Guard against NaN rewards corrupting the running total.
    if not math.isnan(reward):
        sum_reward = sum_reward + reward
    if terminated or truncated:
        # Summarize spend and how close the realized category mix came
        # to the desired distributions.
        print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
        print("############# sum_reward:", sum_reward)
        print("############# Desire distributions:", env.desired_distributions)
        print("############# Real distributions:", env.current_distributions)
        break