# DSP_Bidder_3_rules / dsp_bidder_3_inference.py
import gymnasium as gym
from gymnasium import spaces
import math
import random
from random import randrange
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
def _normalize_vector(vector):
    """Normalize a vector so its entries sum to 1; returns the array unchanged if the sum is ~0."""
    vector_np = np.asarray(vector, dtype=np.float32)
    total = np.sum(vector_np)  # avoid shadowing the built-in sum()
    if total < 1e-8:
        return vector_np  # degenerate input: return as-is rather than dividing by ~0
    return vector_np / total
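# Example (illustrative values, not campaign data):
#   _normalize_vector([1.0, 1.0, 2.0]) -> array([0.25, 0.25, 0.5], dtype=float32)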
def _KL_divergence(a, b):
    # Unused in this script; _safe_kl below is the variant used by the JS divergence.
    epsilon = 0.00001
    a = np.asarray(a, dtype=np.float32) + epsilon  # convert before adding: list + float raises TypeError
    b = np.asarray(b, dtype=np.float32) + epsilon
    return np.sum(np.where(a != 0, a * np.log(a / b), 0))
def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
"""
KL divergence KL(p || q)
Both p and q must be valid probability distributions.
"""
epsilon = 0.00001
return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
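# Note: KL divergence is asymmetric (KL(p||q) != KL(q||p) in general); the
# Jensen-Shannon divergence below symmetrizes it by comparing both p and q
# against their mixture m = 0.5 * (p + q).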
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
"""
Compute Jensen–Shannon divergence between two 1D probability vectors.
Parameters
----------
p : np.ndarray
Desired probability distribution (length 3).
q : np.ndarray
Current probability distribution (length 3).
Returns
-------
float
JS divergence (bounded between 0 and log(2)).
"""
# Normalize to probability distributions
p = _normalize_vector(p)
q = _normalize_vector(q)
m = 0.5 * (p + q)
js = 0.5 * _safe_kl(p, m) + 0.5 * _safe_kl(q, m)
return float(js)
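# Quick sanity check of the bounds (illustrative distributions, not campaign data):
#   _jensen_shannon_divergence(np.array([1., 0., 0.]), np.array([1., 0., 0.]))  # ~0.0
#   _jensen_shannon_divergence(np.array([1., 0., 0.]), np.array([0., 1., 0.]))  # ~log(2) ~= 0.693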
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv"  # CSV with 1500 screen ids (strings)
df_screen_ids = pd.read_csv(file_screen_ids)
screen_ids = list(df_screen_ids['screen'])
file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv"  # a sample of this CSV file follows:
# screen,weekday,hour,householdSmall,householdAverage,householdLarge,incomeLow,incomeAverage,incomeHigh,impressionMax,impressionHour,price,publisher1,publisher2,publisher3,venueType1,venueType2,venueType3
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,15,0.894,0.0,0.447,0.0,0.894,0.447,6.0,0.399,0.398,1.0,0.0,0.0,0.0,1.0,0.0
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,16,0.989,0.0,0.141,0.0,1.0,0.0,6.0,0.384,0.381,1.0,0.0,0.0,0.0,1.0,0.0
df_inventory = pd.read_csv(file_inventory_last)
weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))
cols = ['screen', 'weekday', 'hour']
screens_dict = {}
# Build a nested lookup: screen -> weekday -> hour -> list of row values
for (a, b, c), values in df_inventory.set_index(cols).apply(list, axis=1).to_dict().items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
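# The nested lookup is screen_id -> weekday -> hour -> row values, e.g. for the
# sample row shown above:
#   screens_dict['93d696ad-f4ce-4bb4-a9f1-996c771c3d7b']['MONDAY'][15]
#   -> [0.894, 0.0, 0.447, 0.0, 0.894, 0.447, 6.0, 0.399, 0.398, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0]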
# print(screens_dict)
def random_screen():
return random.choice(screen_ids)
def generate_bid_requests(num_weeks):
"""Generate synthetic bid requests."""
bid_requests = []
for weekIndex in range(num_weeks):
for weekday_index in range(7):
weekday = weekdays[weekday_index]
# print('weekday', weekday)
for hour in hours:
# print(' hour', hour)
                for bid_index in range(10):  # 10 bid requests per (weekday, hour) slot
screen_index = randrange(len(screen_ids))
screen_id = screen_ids[screen_index]
data = screens_dict[screen_id][weekday][hour]
householdSmall = data[0]
householdAverage = data[1]
householdLarge = data[2]
incomeLow = data[3]
incomeAverage = data[4]
incomeHigh = data[5]
                    impressionHour = data[7]  # data[6] (impressionMax) is unused here
price = data[8]
publisher_1 = data[9]
publisher_2 = data[10]
publisher_3 = data[11]
venue_type_1 = data[12]
venue_type_2 = data[13]
venue_type_3 = data[14]
bid_request = {
"features": np.array([
# screen_index,
# weekday_index,
# hour,
impressionHour,
], dtype=np.float32),
"household": np.array([
householdSmall,
householdAverage,
householdLarge,
], dtype=np.float32),
"income": np.array([
incomeLow,
incomeAverage,
incomeHigh,
], dtype=np.float32),
"publisher": np.array([
publisher_1,
publisher_2,
publisher_3,
], dtype=np.float32),
"venue_type": np.array([
venue_type_1,
venue_type_2,
venue_type_3,
], dtype=np.float32),
"price": price,
}
bid_requests.append(bid_request)
print(f'Generated {len(bid_requests)} bid requests.')
return bid_requests
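# Each week yields 7 weekdays * 24 hours * 10 bids = 1680 requests, so the
# generate_bid_requests(2) call at the bottom of this script produces 3360.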
class DspCampaign100Env(gym.Env):
"""
Minimal DSP RL environment:
- One episode = one campaign
- One step = one bid request
"""
metadata = {"render_modes": []}
def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
super().__init__()
# ----------------------------
# Environment data
# ----------------------------
self.bid_requests = bid_requests # list of dicts (one per step)
self.distribution_dim = 0
for key in desired_distributions:
dist = desired_distributions[key]
dist2 = _normalize_vector(dist)
desired_distributions[key] = dist2
self.distribution_dim += len(dist2)
self.desired_distributions = desired_distributions
self.initial_budget = budget
        self.impression_max = impression_max  # stored but not used in this version
self.price_max = price_max
# ----------------------------
# Action space
# ----------------------------
# 0 = no bid, 1 = bid
self.action_space = spaces.Discrete(2)
# ----------------------------
# Observation space
# ----------------------------
        # [distribution gaps (desired - current), budget_ratio, time_ratio,
        #  budget_ratio - time_ratio, bid request features, bid distributions]
        self.bid_feat_dim = 1  # currently just impressionHour
obs_dim = (
self.distribution_dim
+ 3 # campaign progress: budget_ratio, time_ratio, budget_ratio - time_ratio
+ self.bid_feat_dim
+ self.distribution_dim # bid features related to distributions (e.g. publisher, venue_type)
)
self.observation_space = spaces.Box(
low=-np.inf,
high=np.inf,
shape=(obs_dim,),
dtype=np.float32,
)
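        # Worked example: with the two length-3 targets configured at the bottom
        # of this script (publisher, venue_type), distribution_dim = 6 and
        # obs_dim = 6 (gaps) + 3 (progress) + 1 (bid features) + 6 (bid distributions) = 16.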
self.reset()
# ----------------------------
# Reset episode
# ----------------------------
def reset(self, seed=None, options=None):
super().reset(seed=seed)
self.step_idx = 0
self.budget_left = self.initial_budget
self.current_distributions = {}
# self.current_demo = np.zeros(self.demo_dim, dtype=np.float32)
for key in self.desired_distributions:
# print("key", key, "desired_distributions[key]", type(self.desired_distributions[key]))
self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)
obs = self._get_observation()
info = {}
return obs, info
def reset_bid_requests(self, bid_requests):
self.bid_requests = bid_requests
# ----------------------------
# Step
# ----------------------------
def step(self, action):
assert self.action_space.contains(action)
done = False
bid = self.bid_requests[self.step_idx]
        budget_ratio = self.budget_left / self.initial_budget      # fraction of budget remaining
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)  # fraction of campaign remaining
        cost = bid["price"] * self.price_max  # scale normalized price by price_max
# ----------------------------
# Apply action
# ----------------------------
reward = 0.0
if action == 1 and self.budget_left >= cost:
self.budget_left -= cost
# Calculate distance BEFORE update
prev_dist = 0.0
for key in self.desired_distributions:
prev_current = _normalize_vector(self.current_distributions[key])
prev_dist += np.linalg.norm(self.desired_distributions[key] - prev_current)
# Update stats
for key in self.desired_distributions:
self.current_distributions[key] += bid[key]
print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'])
print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'])
print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'])
# Calculate distance AFTER update
new_dist = 0.0
for key in self.desired_distributions:
new_current = _normalize_vector(self.current_distributions[key])
new_dist += np.linalg.norm(self.desired_distributions[key] - new_current)
            # Reward is positive if this bid moved the realized mix closer to
            # the target distributions, negative if it moved away. The scaling
            # factor balances this term against the pacing penalty below.
            dist_improvement = prev_dist - new_dist
            demo_reward = dist_improvement * 50.0
reward += demo_reward
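            # Worked example (illustrative numbers): if the summed L2 distance
            # falls from 0.8 to 0.7 after accepting this bid, the improvement is
            # 0.1 and demo_reward = 0.1 * 50.0 = 5.0.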
        else:
            # Pacing penalty, scaled so it does not overpower the demo reward.
            # If we are underspending (budget_ratio > time_ratio), pacing_gap is
            # negative: we punish "doing nothing" when we should be spending.
            pacing_gap = time_ratio - budget_ratio
            reward += -abs(pacing_gap) * 0.5
print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)
# ----------------------------
# Advance time
# ----------------------------
self.step_idx += 1
        if self.step_idx >= len(self.bid_requests) - 1:
            # End one step early so _get_observation can still index bid_requests.
            done = True
# reward += - (self.budget_left / self.initial_budget) * 10
# Optional: Final bonus for hitting the distribution target accurately
final_dist = 0.0
for key in self.desired_distributions:
final_current = _normalize_vector(self.current_distributions[key])
final_dist += np.linalg.norm(self.desired_distributions[key] - final_current)
reward += (1.0 - final_dist) * 10 # Bonus if final distance is small
obs = self._get_observation()
info = {}
return obs, reward, done, False, info
# ----------------------------
# Observation builder
# ----------------------------
def _get_observation(self):
bid = self.bid_requests[self.step_idx]
budget_ratio = self.budget_left / self.initial_budget
time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
current_distributions_flat = []
desired_distributions_flat = []
gap_flat = []
bid_distribution_flat = []
for key in self.desired_distributions:
current = self.current_distributions[key]
current = _normalize_vector(current)
current_distributions_flat.extend(current)
            desired = self.desired_distributions[key]  # already normalized in __init__
            desired_distributions_flat.extend(desired)
gap = desired - current
gap_flat.extend(gap.tolist())
bid_distribution_flat.extend(bid[key])
obs = np.concatenate([
# np.array(current_distributions_flat, dtype=np.float32),
# np.array(desired_distributions_flat, dtype=np.float32),
np.array(gap_flat, dtype=np.float32),
np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
bid["features"], # price
# bid['publisher'], # publisher
np.array(bid_distribution_flat, dtype=np.float32),
])
# print("obs", obs)
return obs.astype(np.float32)
# ----------------------------
# Distance metric
# ----------------------------
    def _get_reward(self, distributions) -> float:
        """
        Compute reward based on JS divergence across all targeted distribution
        groups (here: publisher and venue_type).
        Reward is NEGATIVE divergence (because we want to minimize divergence).
        Returns
        -------
        float
            Reward value (higher is better).
        """
groups = []
for key in self.desired_distributions:
a = _normalize_vector(distributions[key])
js_1 = _jensen_shannon_divergence(
self.desired_distributions[key],
a,
)
            groups.append(js_1)
total_divergence = sum(groups)
reward = -total_divergence * 10 # scale reward to make it more significant
return reward
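    # Note: _get_reward is only used for the end-of-episode diagnostic print at
    # the bottom of this script; per-step shaping in step() uses L2 distances.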
class DQN(nn.Module):
def __init__(self, n_observations, n_actions):
super(DQN, self).__init__()
self.layer1 = nn.Linear(n_observations, 128)
self.layer2 = nn.Linear(128, 128)
self.layer3 = nn.Linear(128, n_actions)
    # Called with either a single state to pick the next action, or a batch
    # during optimization. Returns one Q-value per action (no-bid, bid).
def forward(self, x):
x = F.relu(self.layer1(x))
x = F.relu(self.layer2(x))
return self.layer3(x)
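# The checkpoint below stores n_observations and n_actions, so the same class
# can be rebuilt without hard-coding sizes; with the 16-dim observation sketched
# above this would be DQN(16, 2): 16 -> 128 -> 128 -> 2 Q-values.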
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model_036_250_GOOD_3_002.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load checkpoint (weights_only=False: the checkpoint also stores plain ints
# such as n_observations/n_actions alongside the state dict)
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)
# Recreate model
policy_net = DQN(
checkpoint["n_observations"],
checkpoint["n_actions"]
).to(device)
policy_net.load_state_dict(checkpoint["model_state_dict"])
policy_net.eval()  # important: disables dropout/batchnorm training behavior (if any)
print("Model loaded successfully")
def choose_action(model, observation):
with torch.no_grad():
state = torch.tensor(
observation,
dtype=torch.float32,
device=device
).unsqueeze(0)
q_values = model(state)
action = q_values.argmax(dim=1).item()
return action
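# Inference is purely greedy: argmax over the two Q-values. Epsilon-greedy
# exploration is a training-time concern and is not needed here.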
budget = 10
impression_max = 11.888
price_max = 0.118
desired_publisher_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
env = DspCampaign100Env(generate_bid_requests(2),
                        # desired_distributions={"household": desired_household_vector, "income": desired_income_vector},
                        desired_distributions={"publisher": desired_publisher_vector, "venue_type": desired_venue_type_vector},
                        # desired_distributions={"publisher": desired_publisher_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)
state, _ = env.reset()
sum_reward = 0.0
while True:
action = choose_action(policy_net, state)
    # In production, env.step would be replaced by the real exchange call:
    #   if action == 1: submit the bid to the DSP
    #   else:           skip this bid request
state, reward, terminated, truncated, _ = env.step(action)
if not math.isnan(reward):
sum_reward = sum_reward + reward
if terminated or truncated:
print("############# Final distance:", env._get_reward(env.current_distributions))
print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
print("############# sum_reward:", sum_reward)
print("############# Desire distributions:", env.desired_distributions)
print("############# Real distributions:", env.current_distributions)
break