# DSP_Bidder_4_rules / dsp_bidder_4_inference.py
# Uploaded by StanislavKo28 ("Upload 6 files", commit 4c313fa, verified)
import gymnasium as gym
from gymnasium import spaces
import math
import random
from random import randrange
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
def _normalize_vector(vector):
if type(vector) is list:
vector_np = np.asarray(vector, dtype=np.float32)
else:
vector_np = vector
sum = np.sum(vector_np)
if sum < 1e-8:
return vector
normalized_vector = vector_np / sum
return normalized_vector
def _KL_divergence(a, b):
epsilon = 0.00001
a = np.asarray(a + epsilon, dtype=np.float32)
b = np.asarray(b + epsilon, dtype=np.float32)
return np.sum(np.where(a != 0, a * np.log(a / b), 0))
def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
"""
KL divergence KL(p || q)
Both p and q must be valid probability distributions.
"""
epsilon = 0.00001
return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """Jensen-Shannon divergence between two 1-D probability vectors.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence, symmetric in p and q and bounded by log(2).
    """
    # Re-normalize defensively so both inputs are proper distributions.
    p_norm = _normalize_vector(p)
    q_norm = _normalize_vector(q)
    # JS(p, q) = 0.5 * KL(p || m) + 0.5 * KL(q || m), m = midpoint mixture.
    mixture = 0.5 * (p_norm + q_norm)
    js = 0.5 * _safe_kl(p_norm, mixture) + 0.5 * _safe_kl(q_norm, mixture)
    return float(js)
# --- Module-level data loading (runs at import time) ---
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv" # here 1500 screen ids (Strings)
df_screen_ids = pd.read_csv(file_screen_ids)
# Flat list of all known screen ids; sampled uniformly when generating bids.
screen_ids = list(df_screen_ids['screen'])
file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv" # the sample from CSV file is below:
# screen,weekday,hour,householdSmall,householdAverage,householdLarge,incomeLow,incomeAverage,incomeHigh,impressionMax,impressionHour,price,publisher1,publisher2,publisher3,venueType1,venueType2,venueType3
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,15,0.894,0.0,0.447,0.0,0.894,0.447,6.0,0.399,0.398,1.0,0.0,0.0,0.0,1.0,0.0
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,16,0.989,0.0,0.141,0.0,1.0,0.0,6.0,0.384,0.381,1.0,0.0,0.0,0.0,1.0,0.0
df_inventory = pd.read_csv(file_inventory_last)
weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))
cols = ['screen', 'weekday', 'hour']
# Build a nested lookup: screens_dict[screen][weekday][hour] -> list of the
# remaining numeric columns (household, income, impressionMax, impressionHour,
# price, publisher 1-3, venue type 1-3), in CSV column order.
screens_dict = {}
for (a, b, c), values in (df_inventory.set_index(cols).apply(list, axis=1)
                          .to_dict()).items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
# print(screens_dict)
def random_screen():
    """Return a uniformly random screen id from the global pool."""
    return screen_ids[randrange(len(screen_ids))]
def generate_bid_requests(num_weeks):
    """Generate synthetic bid requests.

    Produces num_weeks * 7 weekdays * 24 hours * 10 bids per hour, each
    built from a randomly sampled screen's inventory row for that slot.

    Parameters
    ----------
    num_weeks : int
        How many simulated weeks of traffic to generate.

    Returns
    -------
    list[dict]
        Each dict carries float32 vectors ("features", "household",
        "income", "publisher", "venue_type") plus a scalar "price".
    """
    bid_requests = []
    for _ in range(num_weeks):
        for weekday in weekdays:
            for hour in hours:
                for _ in range(10):
                    # Uniform screen pick; same RNG consumption as choice().
                    screen_id = screen_ids[randrange(len(screen_ids))]
                    row = screens_dict[screen_id][weekday][hour]
                    # Row layout (index): 0-2 household, 3-5 income,
                    # 6 impressionMax (unused), 7 impressionHour, 8 price,
                    # 9-11 publisher one-hot, 12-14 venue-type one-hot.
                    bid_requests.append({
                        # Screen/weekday/hour indices were deliberately
                        # dropped from the feature vector.
                        "features": np.array([row[7]], dtype=np.float32),
                        "household": np.array(row[0:3], dtype=np.float32),
                        "income": np.array(row[3:6], dtype=np.float32),
                        "publisher": np.array(row[9:12], dtype=np.float32),
                        "venue_type": np.array(row[12:15], dtype=np.float32),
                        "price": row[8],
                    })
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
class DspCampaign100Env(gym.Env):
    """
    Minimal DSP RL environment:
    - One episode = one campaign
    - One step = one bid request

    Observation (see _get_observation): per-distribution gap vectors
    (desired share - current share), three pacing scalars, the bid's
    scalar features, the bid's distribution vectors, and a precomputed
    gap/bid alignment score.
    Action: Discrete(2) -- 0 = no bid, 1 = bid.
    """
    metadata = {"render_modes": []}
    def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
        # bid_requests: list of dicts as produced by generate_bid_requests.
        # desired_distributions: dict of target category vectors; normalized
        #   in place below (NOTE: mutates the caller's dict).
        # budget: total campaign budget for one episode.
        # impression_max / price_max: de-normalization scales; only
        #   price_max is used (to turn the normalized price into a cost).
        super().__init__()
        # ----------------------------
        # Environment data
        # ----------------------------
        self.bid_requests = bid_requests # list of dicts (one per step)
        self.distribution_dim = 0
        # Normalize every target distribution and accumulate the total
        # flattened length (used twice when sizing the observation).
        for key in desired_distributions:
            dist = desired_distributions[key]
            dist2 = _normalize_vector(dist)
            desired_distributions[key] = dist2
            self.distribution_dim += len(dist2)
        self.desired_distributions = desired_distributions
        self.initial_budget = budget
        self.impression_max = impression_max
        self.price_max = price_max
        # ----------------------------
        # Action space
        # ----------------------------
        # 0 = no bid, 1 = bid
        self.action_space = spaces.Discrete(2)
        # ----------------------------
        # Observation space
        # ----------------------------
        # [current_demo(6), desired_demo(6), budget_ratio, time_ratio,
        #  bid_request_features...]
        self.bid_feat_dim = 1  # must match len(bid_request["features"])
        obs_dim = (
            self.distribution_dim
            + 3 # campaign progress: budget_ratio, time_ratio, budget_ratio - time_ratio
            + self.bid_feat_dim
            + self.distribution_dim # bid features related to distributions (e.g. publisher, venue_type)
            + 1 # alignment score (dot product of gap and bid)
        )
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32,
        )
        self.reset()
    # ----------------------------
    # Reset episode
    # ----------------------------
    def reset(self, seed=None, options=None):
        """Start a new campaign: full budget, zeroed distribution counts.

        Returns (observation, info) per the Gymnasium API.
        """
        super().reset(seed=seed)
        self.step_idx = 0
        self.budget_left = self.initial_budget
        # Raw (unnormalized) counts of won impressions per category.
        self.current_distributions = {}
        # self.current_demo = np.zeros(self.demo_dim, dtype=np.float32)
        for key in self.desired_distributions:
            # print("key", key, "desired_distributions[key]", type(self.desired_distributions[key]))
            self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)
        obs = self._get_observation()
        info = {}
        return obs, info
    def reset_bid_requests(self, bid_requests):
        # Swap in a fresh bid-request stream (e.g. per evaluation run).
        self.bid_requests = bid_requests
    def get_action_mask(self):
        """Return True when bidding on the current request is affordable."""
        bid = self.bid_requests[self.step_idx]
        # price is normalized; price_max rescales it to actual cost.
        cost = bid["price"] * self.price_max
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        # do not allow spend if it violates pacing envelope
        can_bid = not (
            # budget_ratio < time_ratio - 0.03 or
            self.budget_left - cost <= 0
        )
        # action 0 always allowed
        # return np.array([1, int(can_bid)], dtype=np.float32)
        return can_bid
    # ----------------------------
    # Step
    # ----------------------------
    def step(self, action):
        """Process one bid request.

        Reward shaping: for a winning bid, reward is the sum over all
        tracked categories of dot(gap, bid_vector) * 10, faded in by a
        cubic warm-up factor over the first ~100 playouts; pacing
        penalties apply on both bid and no-bid branches, and a final
        unspent-budget penalty is added at episode end.
        Returns (obs, reward, terminated, truncated, info).
        """
        assert self.action_space.contains(action)
        done = False
        bid = self.bid_requests[self.step_idx]
        cost = bid["price"] * self.price_max
        # Pacing calculation
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        # Positive pacing_diff = budget remaining ahead of schedule.
        pacing_diff = budget_ratio - time_ratio
        # ----------------------------
        # Apply action
        # ----------------------------
        reward = 0.0
        if action == 1 and self.budget_left >= cost:
            self.budget_left -= cost
            # --- ENHANCED REWARD CALCULATION ---
            # Instead of global distance diff, we calculate the "alignment" of this specific bid
            # with the specific needs of the campaign right now.
            total_alignment_reward = 0.0
            # All categories share the same playout count; use the first.
            first_key = list(self.desired_distributions.keys())[0]
            total_playouts_so_far = np.sum(self.current_distributions[first_key])
            stats_warmup_count = 100.0
            x = total_playouts_so_far / stats_warmup_count
            # Cubic ramp: early, noisy statistics contribute little reward.
            y = x ** 3
            starup_factor = min(1.0, y)
            for key in self.desired_distributions:
                # 1. Update distribution counts
                self.current_distributions[key] += bid[key]
                # 2. Calculate Gap (Desired %) - (Current %)
                #    We need to normalize current counts to get percentages
                current_total = np.sum(self.current_distributions[key])
                if current_total > 0:
                    current_dist_norm = self.current_distributions[key] / current_total
                else:
                    current_dist_norm = np.zeros_like(self.desired_distributions[key])
                gap = self.desired_distributions[key] - current_dist_norm
                # 3. Alignment Score: Dot product of Gap vector and Bid vector
                #    If Gap is [0.1, -0.1] (we need index 0, have too much index 1)
                #    And Bid is [1, 0] -> dot product is 0.1 (Positive reward)
                #    And Bid is [0, 1] -> dot product is -0.1 (Negative reward)
                alignment = np.dot(gap, bid[key])
                # Scale up to make it significant for the optimizer
                total_alignment_reward += alignment * 10.0 * starup_factor
            print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'], self.desired_distributions['household'])
            print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'], self.current_distributions['household'])
            print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'], bid['household'])
            reward += total_alignment_reward
            # Penalize overspending slightly if we are ahead of schedule
            if pacing_diff < -0.005: # We have spent too much relative to time
                reward -= 5.0
        else:
            # Action = 0 (No Bid)
            # If we are falling behind schedule (budget_ratio > time_ratio),
            # we should be bidding. Penalize passing.
            if pacing_diff > 0.02:
                reward -= 0.5 # Penalty for holding budget when behind schedule
            elif pacing_diff < -0.005:
                reward -= 0.5 # NOTE(review): original comment said "small positive reward
                              # for saving budget" but this subtracts -- confirm intent.
        # ----------------------------
        # Advance time
        # ----------------------------
        self.step_idx += 1
        # NOTE(review): terminates at len-1, so the final bid request is
        # never acted upon -- confirm this off-by-one is intentional.
        if self.step_idx >= len(self.bid_requests) - 1:
            done = True
            # Final penalty for unspent budget
            unspent_ratio = self.budget_left / self.initial_budget
            reward -= unspent_ratio * 50.0
        print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)
        obs = self._get_observation()
        info = {}
        return obs, reward, done, False, info
    # ----------------------------
    # Observation builder
    # ----------------------------
    def _get_observation(self):
        """Assemble the flat float32 observation for the current step."""
        bid = self.bid_requests[self.step_idx]
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        gap_flat = []
        bid_distribution_flat = []
        # New feature: Total Alignment Score
        # This helps the neural net "see" immediately if a bid is useful
        # without doing complex internal math.
        alignment_score = 0.0
        for key in self.desired_distributions:
            current_counts = self.current_distributions[key]
            total = np.sum(current_counts)
            if total > 0:
                current_norm = current_counts / total
            else:
                current_norm = np.zeros_like(current_counts)
            desired = self.desired_distributions[key]
            gap = desired - current_norm
            gap_flat.extend(gap.tolist())
            bid_distribution_flat.extend(bid[key])
            # Calculate alignment for this specific feature
            alignment_score += np.dot(gap, bid[key])
        obs = np.concatenate([
            np.array(gap_flat, dtype=np.float32),
            np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
            bid["features"],
            np.array(bid_distribution_flat, dtype=np.float32),
            np.array([alignment_score], dtype=np.float32) # Add explicit helper feature
        ])
        # print("obs", obs)
        return obs.astype(np.float32)
class DQN(nn.Module):
    """Three-layer MLP Q-network: obs -> 128 -> 128 -> action values.

    Attribute names layer1/layer2/layer3 are part of the checkpoint
    contract (state_dict keys) and must not be renamed.
    """
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Map a batch of observations to per-action Q-values.

        Accepts a single state (shape [obs]) or a batch ([B, obs]);
        returns one Q-value per action, no activation on the output.
        """
        hidden = F.relu(self.layer2(F.relu(self.layer1(x))))
        return self.layer3(hidden)
# --- Load the trained policy network from a local checkpoint ---
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model_040_150_4.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load checkpoint
# NOTE(review): weights_only=False unpickles arbitrary Python objects --
# only safe because this checkpoint is produced locally, never untrusted.
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)
# Recreate model with the dimensions stored alongside the weights.
policy_net = DQN(
    checkpoint["n_observations"],
    checkpoint["n_actions"]
).to(device)
policy_net.load_state_dict(checkpoint["model_state_dict"])
print("Model architecture loaded successfully")
policy_net.eval()  # VERY IMPORTANT (turns off dropout/batchnorm if any)
print("Model weights loaded successfully")
print("Model loaded successfully")
def choose_action(model, observation):
    """Greedy action selection: index of the largest predicted Q-value.

    Parameters
    ----------
    model : nn.Module
        Trained Q-network mapping observations to per-action values.
    observation : array-like
        Current environment observation (1-D).

    Returns
    -------
    int
        0 (no bid) or 1 (bid).
    """
    with torch.no_grad():
        # Add a batch dimension; as_tensor avoids an extra copy when the
        # observation is already a compatible array.
        state = torch.as_tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        q_values = model(state)
        print(f"Q-values: {q_values.cpu().numpy()}")
        return q_values.argmax(dim=1).item()
# --- Inference driver: run the trained policy over a synthetic campaign ---
budget = 10
impression_max=11.888
price_max=0.118
# Target category mixes for the campaign (normalized defensively).
desired_household_vector = _normalize_vector([0.5, 0.3, 0.2])
# NOTE(review): "publiser" is a typo but used consistently below.
desired_publiser_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
env = DspCampaign100Env(generate_bid_requests(3),
                        desired_distributions={"publisher": desired_publiser_vector,
                                               "venue_type": desired_venue_type_vector,
                                               "household": desired_household_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)
state, _ = env.reset()
sum_reward = 0.0
while True:
    action = choose_action(policy_net, state)
    # Here instead of env.step, in production:
    # if action == 1:
    #     submit bid to DSP
    # else:
    #     skip
    state, reward, terminated, truncated, _ = env.step(action)
    # Guard against NaN rewards corrupting the running total.
    if not math.isnan(reward):
        sum_reward = sum_reward + reward
    if terminated or truncated:
        # Summarize spend and how close the realized category mix came
        # to the desired distributions.
        print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
        print("############# sum_reward:", sum_reward)
        print("############# Desire distributions:", env.desired_distributions)
        print("############# Real distributions:", env.current_distributions)
        break