StanislavKo28 commited on
Commit
4c313fa
·
verified ·
1 Parent(s): 7aef048

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ bidder_transormer_4_001.png filter=lfs diff=lfs merge=lfs -text
200_bidder_dqn_model_041_250_GOOD_4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cbfc96739f561bd8182a8bab92b680c846b6f126b0905da07cf072cf3d0eb7e
3
+ size 327389
bidder_transormer_4_001.png ADDED

Git LFS Details

  • SHA256: 8d988f8417c1dbdd526a112830263a97fe2c7a4a96f25e5b359d6ac488b9d04c
  • Pointer size: 131 Bytes
  • Size of remote file: 209 kB
csv_files.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92c99330846040631d74401102e35bac0c4f1f4ff5249614aacad740760f3b31
3
+ size 2454545
dsp_bidder_4_inference.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gymnasium as gym
2
+ from gymnasium import spaces
3
+ import math
4
+ import random
5
+ from random import randrange
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ import torch.nn.functional as F
13
+
14
+
15
+ def _normalize_vector(vector):
16
+ if type(vector) is list:
17
+ vector_np = np.asarray(vector, dtype=np.float32)
18
+ else:
19
+ vector_np = vector
20
+ sum = np.sum(vector_np)
21
+ if sum < 1e-8:
22
+ return vector
23
+ normalized_vector = vector_np / sum
24
+ return normalized_vector
25
+
26
+
27
+ def _KL_divergence(a, b):
28
+ epsilon = 0.00001
29
+
30
+ a = np.asarray(a + epsilon, dtype=np.float32)
31
+ b = np.asarray(b + epsilon, dtype=np.float32)
32
+
33
+ return np.sum(np.where(a != 0, a * np.log(a / b), 0))
34
+
35
+
36
+ def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
37
+ """
38
+ KL divergence KL(p || q)
39
+ Both p and q must be valid probability distributions.
40
+ """
41
+ epsilon = 0.00001
42
+ return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
43
+
44
+
45
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """
    Compute Jensen–Shannon divergence between two 1D probability vectors.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence (bounded between 0 and log(2)).
    """
    # Make sure both inputs are proper probability distributions.
    p, q = _normalize_vector(p), _normalize_vector(q)

    # JS is the mean of the two KL divergences against the midpoint.
    midpoint = 0.5 * (p + q)
    return float(0.5 * (_safe_kl(p, midpoint) + _safe_kl(q, midpoint)))
71
+
72
+
73
# --- Static inventory data -------------------------------------------------
# NOTE(review): hard-coded absolute Windows paths — consider making these
# configurable before running anywhere but the author's machine.
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv"  # here 1500 screen ids (Strings)
df_screen_ids = pd.read_csv(file_screen_ids)
screen_ids = list(df_screen_ids['screen'])

file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv"  # the sample from CSV file is below:
# screen,weekday,hour,householdSmall,householdAverage,householdLarge,incomeLow,incomeAverage,incomeHigh,impressionMax,impressionHour,price,publisher1,publisher2,publisher3,venueType1,venueType2,venueType3
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,15,0.894,0.0,0.447,0.0,0.894,0.447,6.0,0.399,0.398,1.0,0.0,0.0,0.0,1.0,0.0
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,16,0.989,0.0,0.141,0.0,1.0,0.0,6.0,0.384,0.381,1.0,0.0,0.0,0.0,1.0,0.0
df_inventory = pd.read_csv(file_inventory_last)

weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))

# Re-index the inventory into nested dicts:
#   screen_id -> weekday -> hour -> [row values in CSV column order]
# so per-bid lookups in generate_bid_requests() are O(1) dict accesses
# instead of DataFrame filtering.
cols = ['screen', 'weekday', 'hour']
screens_dict = {}
for (a, b, c), values in (df_inventory.set_index(cols).apply(list, axis=1)
                          .to_dict()).items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
# print(screens_dict)
92
+
93
def random_screen():
    """Pick one screen id uniformly at random from the loaded inventory."""
    return screen_ids[random.randrange(len(screen_ids))]
95
+
96
+
97
def generate_bid_requests(num_weeks):
    """Generate synthetic bid requests.

    For every week, weekday and hour, sample 10 random screens from the
    pre-loaded inventory (``screens_dict``) and package each row as one
    bid-request dict with keys:
      - "features": raw scalar features (currently just impressionHour)
      - "household", "income", "publisher", "venue_type": 3-dim vectors
      - "price": normalized price (multiplied by price_max in the env)

    Returns a flat list of num_weeks * 7 * 24 * 10 bid requests.
    """
    bid_requests = []
    for weekIndex in range(num_weeks):
        for weekday_index in range(7):
            weekday = weekdays[weekday_index]
            # print('weekday', weekday)
            for hour in hours:
                # print(' hour', hour)
                for bid_index in range(10):
                    # Uniformly sample a screen for this (weekday, hour) slot.
                    screen_index = randrange(len(screen_ids))
                    screen_id = screen_ids[screen_index]

                    data = screens_dict[screen_id][weekday][hour]

                    # Column layout follows the CSV header; index 6
                    # (impressionMax) is skipped here — presumably unused
                    # downstream, TODO confirm.
                    householdSmall = data[0]
                    householdAverage = data[1]
                    householdLarge = data[2]
                    incomeLow = data[3]
                    incomeAverage = data[4]
                    incomeHigh = data[5]
                    impressionHour = data[7]
                    price = data[8]

                    publisher_1 = data[9]
                    publisher_2 = data[10]
                    publisher_3 = data[11]
                    venue_type_1 = data[12]
                    venue_type_2 = data[13]
                    venue_type_3 = data[14]

                    bid_request = {
                        "features": np.array([
                            # screen_index,
                            # weekday_index,
                            # hour,
                            impressionHour,
                        ], dtype=np.float32),
                        "household": np.array([
                            householdSmall,
                            householdAverage,
                            householdLarge,
                        ], dtype=np.float32),
                        "income": np.array([
                            incomeLow,
                            incomeAverage,
                            incomeHigh,
                        ], dtype=np.float32),
                        "publisher": np.array([
                            publisher_1,
                            publisher_2,
                            publisher_3,
                        ], dtype=np.float32),
                        "venue_type": np.array([
                            venue_type_1,
                            venue_type_2,
                            venue_type_3,
                        ], dtype=np.float32),
                        "price": price,
                    }
                    bid_requests.append(bid_request)
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
160
+
161
+
162
class DspCampaign100Env(gym.Env):
    """
    Minimal DSP RL environment:
    - One episode = one campaign
    - One step = one bid request

    Observation: per-key distribution gaps (desired - current), three
    pacing features, the raw bid features, the bid's own distribution
    vectors, and a precomputed gap/bid alignment score.
    Action space: Discrete(2) — 0 = no bid, 1 = bid.
    """

    metadata = {"render_modes": []}

    def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
        super().__init__()

        # ----------------------------
        # Environment data
        # ----------------------------
        self.bid_requests = bid_requests  # list of dicts (one per step)
        self.distribution_dim = 0
        # NOTE(review): this loop mutates the caller's dict in place —
        # normalized vectors are written back into desired_distributions.
        for key in desired_distributions:
            dist = desired_distributions[key]
            dist2 = _normalize_vector(dist)
            desired_distributions[key] = dist2
            self.distribution_dim += len(dist2)
        self.desired_distributions = desired_distributions
        self.initial_budget = budget
        self.impression_max = impression_max
        self.price_max = price_max

        # ----------------------------
        # Action space
        # ----------------------------
        # 0 = no bid, 1 = bid
        self.action_space = spaces.Discrete(2)

        # ----------------------------
        # Observation space
        # ----------------------------
        # [current_demo(6), desired_demo(6), budget_ratio, time_ratio,
        #  bid_request_features...]
        self.bid_feat_dim = 1  # matches len(bid_request["features"])

        obs_dim = (
            self.distribution_dim
            + 3  # campaign progress: budget_ratio, time_ratio, budget_ratio - time_ratio
            + self.bid_feat_dim
            + self.distribution_dim  # bid features related to distributions (e.g. publisher, venue_type)
            + 1  # alignment score (dot product of gap and bid)
        )

        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(obs_dim,),
            dtype=np.float32,
        )

        self.reset()

    # ----------------------------
    # Reset episode
    # ----------------------------
    def reset(self, seed=None, options=None):
        """Start a new campaign: restore budget and zero all counters."""
        super().reset(seed=seed)

        self.step_idx = 0
        self.budget_left = self.initial_budget
        self.current_distributions = {}
        # self.current_demo = np.zeros(self.demo_dim, dtype=np.float32)
        # One zeroed counter vector per distribution key (publisher, ...).
        for key in self.desired_distributions:
            # print("key", key, "desired_distributions[key]", type(self.desired_distributions[key]))
            self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)

        obs = self._get_observation()
        info = {}

        return obs, info

    def reset_bid_requests(self, bid_requests):
        # Swap in a fresh bid-request stream without resetting budget/counters.
        self.bid_requests = bid_requests

    def get_action_mask(self):
        """Return True when bidding on the current request is affordable."""
        bid = self.bid_requests[self.step_idx]
        cost = bid["price"] * self.price_max

        budget_ratio = self.budget_left / self.initial_budget
        # Fraction of the request stream still REMAINING (1.0 at start).
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)

        # do not allow spend if it violates pacing envelope
        can_bid = not (
            # budget_ratio < time_ratio - 0.03 or
            self.budget_left - cost <= 0
        )

        # action 0 always allowed
        # return np.array([1, int(can_bid)], dtype=np.float32)
        return can_bid

    # ----------------------------
    # Step
    # ----------------------------
    def step(self, action):
        """Process one bid request; returns (obs, reward, done, truncated, info)."""
        # NOTE(review): assert is stripped under `python -O`; raise if this
        # must hold in production.
        assert self.action_space.contains(action)

        done = False

        bid = self.bid_requests[self.step_idx]
        cost = bid["price"] * self.price_max

        # Pacing calculation: positive pacing_diff means budget is being
        # spent slower than time is passing (behind schedule on spend).
        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
        pacing_diff = budget_ratio - time_ratio

        # ----------------------------
        # Apply action
        # ----------------------------
        reward = 0.0

        # The else branch also catches "wanted to bid but cannot afford it".
        if action == 1 and self.budget_left >= cost:
            self.budget_left -= cost

            # --- ENHANCED REWARD CALCULATION ---
            # Instead of global distance diff, we calculate the "alignment" of this specific bid
            # with the specific needs of the campaign right now.

            total_alignment_reward = 0.0

            # Ramp the alignment reward in over the first ~100 playouts so
            # early noisy distribution estimates don't dominate learning.
            # ("starup" is a typo for "startup" kept for code identity.)
            first_key = list(self.desired_distributions.keys())[0]
            total_playouts_so_far = np.sum(self.current_distributions[first_key])
            stats_warmup_count = 100.0
            x = total_playouts_so_far / stats_warmup_count
            y = x ** 3
            starup_factor = min(1.0, y)

            for key in self.desired_distributions:
                # 1. Update distribution counts
                self.current_distributions[key] += bid[key]

                # 2. Calculate Gap (Desired %) - (Current %)
                #    We need to normalize current counts to get percentages
                current_total = np.sum(self.current_distributions[key])
                if current_total > 0:
                    current_dist_norm = self.current_distributions[key] / current_total
                else:
                    current_dist_norm = np.zeros_like(self.desired_distributions[key])

                gap = self.desired_distributions[key] - current_dist_norm

                # 3. Alignment Score: Dot product of Gap vector and Bid vector
                #    If Gap is [0.1, -0.1] (we need index 0, have too much index 1)
                #    And Bid is [1, 0] -> dot product is 0.1 (Positive reward)
                #    And Bid is [0, 1] -> dot product is -0.1 (Negative reward)
                alignment = np.dot(gap, bid[key])

                # Scale up to make it significant for the optimizer
                total_alignment_reward += alignment * 10.0 * starup_factor

            # NOTE(review): per-step prints are very noisy; consider logging.
            print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'], self.desired_distributions['household'])
            print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'], self.current_distributions['household'])
            print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'], bid['household'])

            reward += total_alignment_reward

            # Penalize overspending slightly if we are ahead of schedule
            if pacing_diff < -0.005:  # We have spent too much relative to time
                reward -= 5.0

        else:
            # Action = 0 (No Bid)

            # If we are falling behind schedule (budget_ratio > time_ratio),
            # we should be bidding. Penalize passing.
            if pacing_diff > 0.02:
                reward -= 0.5  # Penalty for holding budget when behind schedule
            elif pacing_diff < -0.005:
                # NOTE(review): the original comment claimed a "small positive
                # reward for saving budget", but the code SUBTRACTS 0.5 —
                # passing is penalized in both pacing regimes. Confirm intent.
                reward -= 0.5

        # ----------------------------
        # Advance time
        # ----------------------------
        self.step_idx += 1

        # NOTE(review): `>= len - 1` terminates one request BEFORE the last
        # element is played — the final bid request is never stepped. Confirm.
        if self.step_idx >= len(self.bid_requests) - 1:
            done = True

            # Final penalty for unspent budget
            unspent_ratio = self.budget_left / self.initial_budget
            reward -= unspent_ratio * 50.0

        print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)

        obs = self._get_observation()
        info = {}

        return obs, reward, done, False, info

    # ----------------------------
    # Observation builder
    # ----------------------------
    def _get_observation(self):
        """Assemble the flat float32 observation vector for the current step."""
        bid = self.bid_requests[self.step_idx]

        budget_ratio = self.budget_left / self.initial_budget
        time_ratio = 1.0 - self.step_idx / len(self.bid_requests)

        gap_flat = []
        bid_distribution_flat = []

        # New feature: Total Alignment Score
        # This helps the neural net "see" immediately if a bid is useful
        # without doing complex internal math.
        alignment_score = 0.0

        for key in self.desired_distributions:
            current_counts = self.current_distributions[key]
            total = np.sum(current_counts)
            if total > 0:
                current_norm = current_counts / total
            else:
                current_norm = np.zeros_like(current_counts)

            desired = self.desired_distributions[key]
            gap = desired - current_norm

            gap_flat.extend(gap.tolist())
            bid_distribution_flat.extend(bid[key])

            # Calculate alignment for this specific feature
            alignment_score += np.dot(gap, bid[key])

        obs = np.concatenate([
            np.array(gap_flat, dtype=np.float32),
            np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
            bid["features"],
            np.array(bid_distribution_flat, dtype=np.float32),
            np.array([alignment_score], dtype=np.float32)  # Add explicit helper feature
        ])
        # print("obs", obs)

        return obs.astype(np.float32)
401
+
402
+
403
class DQN(nn.Module):
    """Three-layer MLP mapping an observation vector to per-action Q-values.

    Layer attribute names are part of the saved state_dict contract and
    must not be renamed.
    """

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Accepts a single state or a batch; returns (batch, n_actions) Q-values."""
        hidden = F.relu(self.layer2(F.relu(self.layer1(x))))
        return self.layer3(hidden)
417
+
418
# --- Load the trained DQN policy -------------------------------------------
# NOTE(review): hard-coded absolute path; make configurable for deployment.
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model_040_150_4.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load checkpoint
# NOTE(review): weights_only=False unpickles arbitrary objects — only load
# checkpoints from a trusted source.
checkpoint = torch.load(MODEL_PATH, map_location=device, weights_only=False)

# Recreate model with the dimensions stored inside the checkpoint
policy_net = DQN(
    checkpoint["n_observations"],
    checkpoint["n_actions"]
).to(device)

policy_net.load_state_dict(checkpoint["model_state_dict"])
print("Model architecture loaded successfully")
policy_net.eval()  # VERY IMPORTANT (turns off dropout/batchnorm if any)
print("Model weights loaded successfully")

print("Model loaded successfully")
436
+
437
def choose_action(model, observation):
    """Greedy policy: return the action with the highest Q-value.

    Wraps the observation into a (1, obs_dim) float32 tensor on the
    global ``device`` and runs the network without gradient tracking.
    """
    with torch.no_grad():
        state = torch.tensor(
            observation, dtype=torch.float32, device=device
        ).unsqueeze(0)
        q_values = model(state)
        print(f"Q-values: {q_values.cpu().numpy()}")
        return q_values.argmax(dim=1).item()
450
+
451
# --- Inference driver -------------------------------------------------------
# Campaign parameters; should mirror the values used during training.
budget = 10
impression_max=11.888
price_max=0.118

# Target distributions for the campaign ("publiser" is a typo kept for
# code identity; it is a local name only).
desired_household_vector = _normalize_vector([0.5, 0.3, 0.2])
desired_publiser_vector = _normalize_vector([0.1, 0.2, 0.7])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
env = DspCampaign100Env(generate_bid_requests(3),
                        desired_distributions={"publisher": desired_publiser_vector,
                                               "venue_type": desired_venue_type_vector,
                                               "household": desired_household_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)

state, _ = env.reset()

# Run one full simulated campaign with the loaded policy.
sum_reward = 0.0
while True:
    action = choose_action(policy_net, state)

    # Here instead of env.step, in production:
    # if action == 1:
    #     submit bid to DSP
    # else:
    #     skip

    state, reward, terminated, truncated, _ = env.step(action)

    # Guard against NaN rewards leaking into the running total.
    if not math.isnan(reward):
        sum_reward = sum_reward + reward

    if terminated or truncated:
        print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
        print("############# sum_reward:", sum_reward)
        print("############# Desire distributions:", env.desired_distributions)
        print("############# Real distributions:", env.current_distributions)
        break
487
+
488
+
489
+
490
+
dsp_bidder_4_training.py ADDED
@@ -0,0 +1,802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gymnasium as gym
2
+ import math
3
+ import random
4
+ from random import randrange
5
+ import pandas as pd
6
+ import matplotlib
7
+ import matplotlib.pyplot as plt
8
+ from collections import namedtuple, deque
9
+ from itertools import count
10
+ import numpy as np
11
+
12
+ import gymnasium as gym
13
+ from gymnasium import spaces
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.optim as optim
18
+ import torch.nn.functional as F
19
+
20
# Seed from OS entropy: every training run gets different randomness.
random.seed()

# Campaign hyper-parameters (mirrored by the inference script).
budget = 10
impression_max = 11.888
price_max = 0.118

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if GPU is to be used
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)
39
+
40
+
41
+ def _normalize_vector(vector):
42
+ if type(vector) is list:
43
+ vector_np = np.asarray(vector, dtype=np.float32)
44
+ else:
45
+ vector_np = vector
46
+ sum = np.sum(vector_np)
47
+ if sum < 1e-8:
48
+ return vector
49
+ normalized_vector = vector_np / sum
50
+ return normalized_vector
51
+
52
+
53
+ def _safe_kl(p: np.ndarray, q: np.ndarray) -> float:
54
+ """
55
+ KL divergence KL(p || q)
56
+ Both p and q must be valid probability distributions.
57
+ """
58
+ epsilon = 0.00001
59
+ return np.sum(p * np.log((p + epsilon) / (q + epsilon)))
60
+
61
+
62
def _jensen_shannon_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """
    Compute Jensen–Shannon divergence between two 1D probability vectors.

    Parameters
    ----------
    p : np.ndarray
        Desired probability distribution (length 3).
    q : np.ndarray
        Current probability distribution (length 3).

    Returns
    -------
    float
        JS divergence (bounded between 0 and log(2)).
    """
    # Make sure both inputs are proper probability distributions.
    p, q = _normalize_vector(p), _normalize_vector(q)

    # JS is the mean of the two KL divergences against the midpoint.
    midpoint = 0.5 * (p + q)
    return float(0.5 * (_safe_kl(p, midpoint) + _safe_kl(q, midpoint)))
88
+
89
+
90
# --- Static inventory data -------------------------------------------------
# NOTE(review): hard-coded absolute Windows paths — consider making these
# configurable before running anywhere but the author's machine.
file_screen_ids = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\005_raw_screens.csv"  # here 1500 screen ids (Strings)
df_screen_ids = pd.read_csv(file_screen_ids)
screen_ids = list(df_screen_ids['screen'])

file_inventory_last = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\013_raw_data_10dollars_publishers_venueTypes.csv"  # the sample from CSV file is below:
# screen,weekday,hour,householdSmall,householdAverage,householdLarge,incomeLow,incomeAverage,incomeHigh,impressionMax,impressionHour,price,publisher1,publisher2,publisher3,venueType1,venueType2,venueType3
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,15,0.894,0.0,0.447,0.0,0.894,0.447,6.0,0.399,0.398,1.0,0.0,0.0,0.0,1.0,0.0
# 93d696ad-f4ce-4bb4-a9f1-996c771c3d7b,MONDAY,16,0.989,0.0,0.141,0.0,1.0,0.0,6.0,0.384,0.381,1.0,0.0,0.0,0.0,1.0,0.0
df_inventory = pd.read_csv(file_inventory_last)

weekdays = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
hours = list(range(24))

# Re-index the inventory into nested dicts:
#   screen_id -> weekday -> hour -> [row values in CSV column order]
# so per-bid lookups in generate_bid_requests() are O(1) dict accesses
# instead of DataFrame filtering.
cols = ['screen', 'weekday', 'hour']
screens_dict = {}
for (a, b, c), values in (df_inventory.set_index(cols).apply(list, axis=1)
                          .to_dict()).items():
    screens_dict.setdefault(a, {}).setdefault(b, {})[c] = values
108
+
109
+
110
+ # print(screens_dict)
111
+
112
+ def random_screen():
113
+ return random.choice(screen_ids)
114
+
115
+
116
def generate_bid_requests(num_weeks):
    """Generate synthetic bid requests.

    For every week, weekday and hour, sample 10 random screens from the
    pre-loaded inventory (``screens_dict``) and package each row as one
    bid-request dict with keys:
      - "features": raw scalar features (currently just impressionHour)
      - "household", "income", "publisher", "venue_type": 3-dim vectors
      - "price": normalized price (multiplied by price_max in the env)

    Returns a flat list of num_weeks * 7 * 24 * 10 bid requests.
    """
    bid_requests = []
    for weekIndex in range(num_weeks):
        for weekday_index in range(7):
            weekday = weekdays[weekday_index]
            # print('weekday', weekday)
            for hour in hours:
                # print(' hour', hour)
                for bid_index in range(10):
                    # Uniformly sample a screen for this (weekday, hour) slot.
                    screen_index = randrange(len(screen_ids))
                    screen_id = screen_ids[screen_index]

                    data = screens_dict[screen_id][weekday][hour]

                    # Column layout follows the CSV header; index 6
                    # (impressionMax) is skipped here — presumably unused
                    # downstream, TODO confirm.
                    householdSmall = data[0]
                    householdAverage = data[1]
                    householdLarge = data[2]
                    incomeLow = data[3]
                    incomeAverage = data[4]
                    incomeHigh = data[5]
                    impressionHour = data[7]
                    price = data[8]

                    publisher_1 = data[9]
                    publisher_2 = data[10]
                    publisher_3 = data[11]
                    venue_type_1 = data[12]
                    venue_type_2 = data[13]
                    venue_type_3 = data[14]

                    bid_request = {
                        "features": np.array([
                            # screen_index,
                            # weekday_index,
                            # hour,
                            impressionHour,
                        ], dtype=np.float32),
                        "household": np.array([
                            householdSmall,
                            householdAverage,
                            householdLarge,
                        ], dtype=np.float32),
                        "income": np.array([
                            incomeLow,
                            incomeAverage,
                            incomeHigh,
                        ], dtype=np.float32),
                        "publisher": np.array([
                            publisher_1,
                            publisher_2,
                            publisher_3,
                        ], dtype=np.float32),
                        "venue_type": np.array([
                            venue_type_1,
                            venue_type_2,
                            venue_type_3,
                        ], dtype=np.float32),
                        "price": price,
                    }
                    bid_requests.append(bid_request)
    print(f'Generated {len(bid_requests)} bid requests.')
    return bid_requests
179
+
180
+
181
+ class DspCampaign100Env(gym.Env):
182
+ """
183
+ Minimal DSP RL environment:
184
+ - One episode = one campaign
185
+ - One step = one bid request
186
+ """
187
+
188
+ metadata = {"render_modes": []}
189
+
190
+ def __init__(self, bid_requests, desired_distributions, budget, impression_max, price_max):
191
+ super().__init__()
192
+
193
+ # ----------------------------
194
+ # Environment data
195
+ # ----------------------------
196
+ self.bid_requests = bid_requests # list of dicts (one per step)
197
+ self.distribution_dim = 0
198
+ for key in desired_distributions:
199
+ dist = desired_distributions[key]
200
+ dist2 = _normalize_vector(dist)
201
+ desired_distributions[key] = dist2
202
+ self.distribution_dim += len(dist2)
203
+ self.desired_distributions = desired_distributions
204
+ self.initial_budget = budget
205
+ self.impression_max = impression_max
206
+ self.price_max = price_max
207
+
208
+ # ----------------------------
209
+ # Action space
210
+ # ----------------------------
211
+ # 0 = no bid, 1 = bid
212
+ self.action_space = spaces.Discrete(2)
213
+
214
+ # ----------------------------
215
+ # Observation space
216
+ # ----------------------------
217
+ # [current_demo(6), desired_demo(6), budget_ratio, time_ratio,
218
+ # bid_request_features...]
219
+ self.bid_feat_dim = 1 # example
220
+
221
+ obs_dim = (
222
+ self.distribution_dim
223
+ + 3 # campaign progress: budget_ratio, time_ratio, budget_ratio - time_ratio
224
+ + self.bid_feat_dim
225
+ + self.distribution_dim # bid features related to distributions (e.g. publisher, venue_type)
226
+ + 1 # alignment score (dot product of gap and bid)
227
+ )
228
+
229
+ self.observation_space = spaces.Box(
230
+ low=-np.inf,
231
+ high=np.inf,
232
+ shape=(obs_dim,),
233
+ dtype=np.float32,
234
+ )
235
+
236
+ self.reset()
237
+
238
+ # ----------------------------
239
+ # Reset episode
240
+ # ----------------------------
241
+ def reset(self, seed=None, options=None):
242
+ super().reset(seed=seed)
243
+
244
+ self.step_idx = 0
245
+ self.budget_left = self.initial_budget
246
+ self.current_distributions = {}
247
+ # self.current_demo = np.zeros(self.demo_dim, dtype=np.float32)
248
+ for key in self.desired_distributions:
249
+ # print("key", key, "desired_distributions[key]", type(self.desired_distributions[key]))
250
+ self.current_distributions[key] = np.zeros(len(self.desired_distributions[key]), dtype=np.float32)
251
+
252
+ obs = self._get_observation()
253
+ info = {}
254
+
255
+ return obs, info
256
+
257
+ def reset_bid_requests(self, bid_requests):
258
+ self.bid_requests = bid_requests
259
+
260
+ def get_action_mask(self):
261
+ bid = self.bid_requests[self.step_idx]
262
+ cost = bid["price"] * self.price_max
263
+
264
+ budget_ratio = self.budget_left / self.initial_budget
265
+ time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
266
+
267
+ # do not allow spend if it violates pacing envelope
268
+ can_bid = not (
269
+ # budget_ratio < time_ratio - 0.03 or
270
+ self.budget_left - cost <= 0
271
+ )
272
+
273
+ # action 0 always allowed
274
+ # return np.array([1, int(can_bid)], dtype=np.float32)
275
+ return can_bid
276
+
277
+ # ----------------------------
278
+ # Step
279
+ # ----------------------------
280
+ def step(self, action):
281
+ assert self.action_space.contains(action)
282
+
283
+ done = False
284
+
285
+ bid = self.bid_requests[self.step_idx]
286
+ cost = bid["price"] * self.price_max
287
+
288
+ # Pacing calculation
289
+ budget_ratio = self.budget_left / self.initial_budget
290
+ time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
291
+ pacing_diff = budget_ratio - time_ratio
292
+
293
+ # ----------------------------
294
+ # Apply action
295
+ # ----------------------------
296
+ reward = 0.0
297
+
298
+ if action == 1 and self.budget_left >= cost:
299
+ self.budget_left -= cost
300
+
301
+ # --- ENHANCED REWARD CALCULATION ---
302
+ # Instead of global distance diff, we calculate the "alignment" of this specific bid
303
+ # with the specific needs of the campaign right now.
304
+
305
+ total_alignment_reward = 0.0
306
+
307
+ first_key = list(self.desired_distributions.keys())[0]
308
+ total_playouts_so_far = np.sum(self.current_distributions[first_key])
309
+ stats_warmup_count = 50.0
310
+ x = total_playouts_so_far / stats_warmup_count
311
+ y = x ** 3
312
+ starup_factor = min(1.0, y)
313
+
314
+ for key in self.desired_distributions:
315
+ # 1. Update distribution counts
316
+ self.current_distributions[key] += bid[key]
317
+
318
+ # 2. Calculate Gap (Desired %) - (Current %)
319
+ # We need to normalize current counts to get percentages
320
+ current_total = np.sum(self.current_distributions[key])
321
+ if current_total > 0:
322
+ current_dist_norm = self.current_distributions[key] / current_total
323
+ else:
324
+ current_dist_norm = np.zeros_like(self.desired_distributions[key])
325
+
326
+ gap = self.desired_distributions[key] - current_dist_norm
327
+
328
+ # 3. Alignment Score: Dot product of Gap vector and Bid vector
329
+ # If Gap is [0.1, -0.1] (we need index 0, have too much index 1)
330
+ # And Bid is [1, 0] -> dot product is 0.1 (Positive reward)
331
+ # And Bid is [0, 1] -> dot product is -0.1 (Negative reward)
332
+ alignment = np.dot(gap, bid[key])
333
+
334
+ # Scale up to make it significant for the optimizer
335
+ total_alignment_reward += alignment * 10.0 * starup_factor
336
+
337
+ print("desired_publishers", self.desired_distributions['publisher'], self.desired_distributions['venue_type'], self.desired_distributions['household'])
338
+ print("current_publishers", self.current_distributions['publisher'], self.current_distributions['venue_type'], self.current_distributions['household'])
339
+ print("bid.publisher", bid['publisher'], "bid.venue_type", bid['venue_type'], bid['household'])
340
+
341
+ reward += total_alignment_reward
342
+
343
+ # Penalize overspending slightly if we are ahead of schedule
344
+ if pacing_diff < -0.005: # We have spent too much relative to time
345
+ reward -= 5.0
346
+
347
+ else:
348
+ # Action = 0 (No Bid)
349
+
350
+ # If we are falling behind schedule (budget_ratio > time_ratio),
351
+ # we should be bidding. Penalize passing.
352
+ if pacing_diff > 0.02:
353
+ reward -= 0.5 # Penalty for holding budget when behind schedule
354
+ elif pacing_diff < -0.005:
355
+ reward -= 0.5 # Small positive reward for saving budget if we are ahead of schedule
356
+
357
+ # ----------------------------
358
+ # Advance time
359
+ # ----------------------------
360
+ self.step_idx += 1
361
+
362
+ if self.step_idx >= len(self.bid_requests) - 1:
363
+ done = True
364
+
365
+ # Final penalty for unspent budget
366
+ unspent_ratio = self.budget_left / self.initial_budget
367
+ reward -= unspent_ratio * 50.0
368
+
369
+ print("reward", reward, "action", action, "self.budget_left", self.budget_left, "time_ratio", time_ratio, "bid['price']", bid["price"] * self.price_max)
370
+
371
+ obs = self._get_observation()
372
+ info = {}
373
+
374
+ return obs, reward, done, False, info
375
+
376
+ # ----------------------------
377
+ # Observation builder
378
+ # ----------------------------
379
+ def _get_observation(self):
380
+ bid = self.bid_requests[self.step_idx]
381
+
382
+ budget_ratio = self.budget_left / self.initial_budget
383
+ time_ratio = 1.0 - self.step_idx / len(self.bid_requests)
384
+
385
+ gap_flat = []
386
+ bid_distribution_flat = []
387
+
388
+ # New feature: Total Alignment Score
389
+ # This helps the neural net "see" immediately if a bid is useful
390
+ # without doing complex internal math.
391
+ alignment_score = 0.0
392
+
393
+ for key in self.desired_distributions:
394
+ current_counts = self.current_distributions[key]
395
+ total = np.sum(current_counts)
396
+ if total > 0:
397
+ current_norm = current_counts / total
398
+ else:
399
+ current_norm = np.zeros_like(current_counts)
400
+
401
+ desired = self.desired_distributions[key]
402
+ gap = desired - current_norm
403
+
404
+ gap_flat.extend(gap.tolist())
405
+ bid_distribution_flat.extend(bid[key])
406
+
407
+ # Calculate alignment for this specific feature
408
+ alignment_score += np.dot(gap, bid[key])
409
+
410
+ obs = np.concatenate([
411
+ np.array(gap_flat, dtype=np.float32),
412
+ np.array([budget_ratio, time_ratio, budget_ratio - time_ratio], dtype=np.float32),
413
+ bid["features"],
414
+ np.array(bid_distribution_flat, dtype=np.float32),
415
+ np.array([alignment_score], dtype=np.float32) # Add explicit helper feature
416
+ ])
417
+ # print("obs", obs)
418
+
419
+ return obs.astype(np.float32)
420
+
421
+
422
+ # To ensure reproducibility during training, you can fix the random seeds
423
+ # by uncommenting the lines below. This makes the results consistent across
424
+ # runs, which is helpful for debugging or comparing different approaches.
425
+ #
426
+ # That said, allowing randomness can be beneficial in practice, as it lets
427
+ # the model explore different training trajectories.
428
+
429
+
430
# Fix the RNG seeds for Python's `random` and for PyTorch (CPU and, when
# present, the current CUDA device) so training runs are repeatable.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    # Also seed the CUDA generator; no-op on CPU-only machines.
    torch.cuda.manual_seed(seed)
435
+
436
+ ######################################################################
437
+ # Replay Memory
438
+ # -------------
439
+ #
440
+ # We'll be using experience replay memory for training our DQN. It stores
441
+ # the transitions that the agent observes, allowing us to reuse this data
442
+ # later. By sampling from it randomly, the transitions that build up a
443
+ # batch are decorrelated. It has been shown that this greatly stabilizes
444
+ # and improves the DQN training procedure.
445
+ #
446
+ # For this, we're going to need two classes:
447
+ #
448
+ # - ``Transition`` - a named tuple representing a single transition in
449
+ # our environment. It essentially maps (state, action) pairs
450
#   to their (next_state, reward) result, with the state being the
#   flat observation vector produced by the environment.
452
+ # - ``ReplayMemory`` - a cyclic buffer of bounded size that holds the
453
+ # transitions observed recently. It also implements a ``.sample()``
454
+ # method for selecting a random batch of transitions for training.
455
+ #
456
+
457
# One experience tuple: a (state, action) pair mapped to its
# (next_state, reward) outcome. next_state is None for terminal states.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity cyclic buffer of recent transitions.

    Old transitions are evicted automatically once ``capacity`` is
    reached; ``sample`` draws a decorrelated random minibatch, which is
    what stabilizes DQN training.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque([], maxlen=capacity)

    def clear(self):
        """Discard every stored transition; capacity is unchanged."""
        self.memory.clear()

    def push(self, *transition_fields):
        """Store one transition given as (state, action, next_state, reward)."""
        self.memory.append(Transition(*transition_fields))

    def sample(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)
479
+
480
+
481
class DQN(nn.Module):
    """Three-layer MLP mapping an observation vector to per-action Q-values.

    The attribute names ``layer1``/``layer2``/``layer3`` are kept stable
    because saved checkpoints reference them through the state_dict.
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        """Return a (batch, n_actions) tensor of Q-values.

        Called either with a single state to pick the next action, or
        with a minibatch during optimization.
        """
        hidden = F.relu(self.layer2(F.relu(self.layer1(x))))
        return self.layer3(hidden)
495
+
496
+
497
+ ######################################################################
498
+ # Training
499
+ # --------
500
+ #
501
+ # Hyperparameters and utilities
502
+ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
503
+ # This cell instantiates our model and its optimizer, and defines some
504
+ # utilities:
505
+ #
506
+ # - ``select_action`` - will select an action according to an epsilon
507
+ # greedy policy. Simply put, we'll sometimes use our model for choosing
508
+ # the action, and sometimes we'll just sample one uniformly. The
509
+ # probability of choosing a random action will start at ``EPS_START``
510
+ # and will decay exponentially towards ``EPS_END``. ``EPS_DECAY``
511
+ # controls the rate of the decay.
512
+ # - ``plot_rewards`` - a helper for plotting the sum of rewards of episodes,
513
+ # along with an average over the last 100 episodes (the measure used in
514
+ # the official evaluations). The plot will be underneath the cell
515
+ # containing the main training loop, and will update after every
516
+ # episode.
517
+ #
518
+
519
+ # BATCH_SIZE is the number of transitions sampled from the replay buffer
520
+ # GAMMA is the discount factor as mentioned in the previous section
521
+ # EPS_START is the starting value of epsilon
522
+ # EPS_END is the final value of epsilon
523
+ # EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
524
+ # TAU is the update rate of the target network
525
+ # LR is the learning rate of the ``AdamW`` optimizer
526
+
527
BATCH_SIZE = 128      # transitions sampled from the replay buffer per update
# GAMMA = 0.99
# GAMMA = 0.93
GAMMA = 0.9           # discount factor for future rewards
EPS_START = 0.9       # initial exploration probability
EPS_END = 0.01        # exploration floor after decay
# EPS_DECAY = 2500
EPS_DECAY = 3360 / 3  # exponential-decay constant; higher => slower decay
# EPS_DECAY = 16800 / 3
# TAU = 0.001
# TAU = 0.005
TAU = 0.003           # soft-update rate of the target network
# LR = 1e-4
# LR = 3e-4
LR = 2e-4             # learning rate of the ``AdamW`` optimizer
542
+
543
# Desired campaign mixes: each vector is normalized to sum to 1 and states
# the target share of won impressions per category.
# NOTE(review): "publiser" looks like a typo for "publisher" — kept as-is
# because the name is referenced below and in commented-out code.
# desired_household_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
desired_household_vector = _normalize_vector([0.5, 0.3, 0.2])
# desired_income_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
# desired_publiser_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
desired_publiser_vector = _normalize_vector([0.1, 0.2, 0.7])
# desired_venue_type_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
desired_venue_type_vector = _normalize_vector([0.5, 0.3, 0.2])
# Build the bidding environment over a freshly generated request stream.
env = DspCampaign100Env(generate_bid_requests(3),
                        # desired_distributions={"household": desired_household_vector, "income": desired_income_vector},
                        # desired_distributions={"publisher": desired_publiser_vector, "venue_type": desired_venue_type_vector},
                        desired_distributions={"publisher": desired_publiser_vector,
                                               "venue_type": desired_venue_type_vector,
                                               "household": desired_household_vector},
                        # desired_distributions={"publisher": desired_publiser_vector},
                        budget=budget, impression_max=impression_max, price_max=price_max)
# Get number of actions from gym action space
n_actions = env.action_space.n
# Get the number of state observations (length of the flat observation vector)
state, info = env.reset()
n_observations = len(state)
563
+
564
# Online network (trained every step) and target network (soft-updated
# with rate TAU to stabilize the bootstrap targets).
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)

# Global counter of epsilon-greedy action selections; drives the decay schedule.
steps_done = 0
572
+
573
+
574
def select_action(state, can_bid):
    """Pick an action epsilon-greedily, honoring an external action mask.

    When ``can_bid`` is False (e.g. the remaining budget cannot cover the
    request) the "no bid" action 0 is forced and the epsilon schedule is
    not advanced. Otherwise, with probability epsilon a random action is
    sampled from the environment's action space; with probability
    1 - epsilon the policy network's greedy (argmax-Q) action is taken.
    """
    global steps_done
    # print('steps_done', steps_done)

    if not can_bid:
        # Forced pass: bidding is disallowed for this request.
        return torch.tensor([[0]], device=device, dtype=torch.long)

    # Epsilon decays exponentially from EPS_START towards EPS_END.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1

    if random.random() <= eps_threshold:
        # Explore: uniform random action.
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Exploit: max(1) returns the row-wise maximum; its ``indices`` field is
    # the action with the largest expected reward.
    with torch.no_grad():
        return policy_net(state).max(1).indices.view(1, 1)
593
+
594
+
595
# Cumulative reward of each finished episode, appended by the training loop.
episode_rewards = []


def plot_rewards(show_result=False):
    """Redraw the reward-per-episode curve in matplotlib figure 1.

    During training (show_result=False) the figure is cleared and retitled
    each call; with show_result=True the final curve is drawn under a
    'Result' title without clearing. Inside IPython the output cell is
    refreshed in place so the plot animates.
    """
    plt.figure(1)
    reward_t = torch.tensor(episode_rewards, dtype=torch.float)
    # print("episode_rewards", episode_rewards)
    if show_result:
        plt.title('Result')
    else:
        # Clear before re-plotting so curves don't stack up across calls.
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.plot(reward_t.numpy())
    # Take 100 episode averages and plot them too
    # if len(reward_t) >= 100:
    #     means = reward_t.unfold(0, 100, 1).mean(1).view(-1)
    #     # print("means", means)
    #     means = torch.cat((torch.zeros(99), means))
    #     plt.plot(means.numpy())

    plt.pause(0.2)  # pause a bit so that plots are updated
    if is_ipython:
        if not show_result:
            # Replace the previous frame in the notebook output cell.
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())
624
+
625
+
626
+ ######################################################################
627
+ # Training loop
628
+ # ^^^^^^^^^^^^^
629
+ #
630
+ # Finally, the code for training our model.
631
+ #
632
+ # Here, you can find an ``optimize_model`` function that performs a
633
+ # single step of the optimization. It first samples a batch, concatenates
634
+ # all the tensors into a single one, computes :math:`Q(s_t, a_t)` and
635
+ # :math:`V(s_{t+1}) = \max_a Q(s_{t+1}, a)`, and combines them into our
636
+ # loss. By definition we set :math:`V(s) = 0` if :math:`s` is a terminal
637
+ # state. We also use a target network to compute :math:`V(s_{t+1})` for
638
+ # added stability. The target network is updated at every step with a
639
+ # `soft update <https://arxiv.org/pdf/1509.02971.pdf>`__ controlled by
640
+ # the hyperparameter ``TAU``, which was previously defined.
641
+ #
642
+
643
def optimize_model():
    """Run one DQN optimization step on a replay-buffer minibatch.

    Samples BATCH_SIZE transitions, computes Q(s_t, a_t) with the policy
    network and the bootstrap target r + GAMMA * max_a Q(s_{t+1}, a) with
    the target network, then minimizes the Huber loss between them.
    Terminal states contribute V(s_{t+1}) = 0. No-op until the buffer
    holds at least BATCH_SIZE transitions.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation): batch-array of Transitions -> Transition of
    # batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of non-final states; a final state is the one after which the
    # episode ended (its next_state is None).
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states_list = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a): evaluate all actions with policy_net, then select the
    # column of the action actually taken in each transition.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the (older) target_net for non-final states; final
    # states keep the initialized value of 0.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # Fix: if the sampled batch happens to contain only final states,
    # torch.cat on an empty list would raise a RuntimeError — skip the
    # target-network pass entirely in that case.
    if non_final_next_states_list:
        non_final_next_states = torch.cat(non_final_next_states_list)
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(non_final_next_states).max(1).values
    # Expected Q values: r + GAMMA * V(s_{t+1}).
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss is robust to occasional large TD errors.
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping to keep updates bounded.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
688
+
689
+
690
+ ######################################################################
691
+ #
692
+ # Below, you can find the main training loop. At the beginning we reset
693
+ # the environment and obtain the initial ``state`` Tensor. Then, we sample
694
# an action, execute it, observe the next state and the reward, and
# optimize our model once. When the episode ends, we restart the loop.
697
+ #
698
# Below, `num_episodes` is set to 600 if a GPU/MPS device is available,
# otherwise 250 episodes are scheduled so training does not take too long.
# Training RL agents can be a noisy process, so restarting training
# can produce better results if convergence is not observed.
704
+ #
705
+
706
# More episodes when an accelerator (CUDA or Apple MPS) is available.
if torch.cuda.is_available() or torch.backends.mps.is_available():
    num_episodes = 600
else:
    num_episodes = 250

for i_episode in range(num_episodes):
    # if i_episode == 50:
    #     memory.clear()
    # Initialize the environment and get its state
    # desired_household_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
    # desired_income_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
    # desired_publiser_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
    # desired_publiser_vector = _normalize_vector([0, 0, 1])
    # desired_venue_type_vector = _normalize_vector([random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
    # env = DspCampaign100Env(generate_bid_requests(4),
    #                         # desired_distributions={"household": desired_household_vector, "income": desired_income_vector},
    #                         # desired_distributions={"publisher": desired_publiser_vector, "venue_type": desired_venue_type_vector},
    #                         desired_distributions={"publisher": desired_publiser_vector},
    #                         budget=budget, impression_max=impression_max, price_max=price_max)
    env.reset(seed=seed)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)

    sum_reward = 0
    # NOTE(review): this second reset() discards the seeded reset above —
    # confirm the double reset is intentional.
    state, info = env.reset()
    if i_episode % 3 == 0:
        # Swap in a fresh request stream every 3rd episode.
        # NOTE(review): `state` was built from the previous request set, so the
        # first observation may not describe the new requests — verify.
        env.reset_bid_requests(generate_bid_requests(3))
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    for t in count():
        # Mask out bidding when the environment says we cannot bid.
        can_bid = env.get_action_mask()
        action = select_action(state, can_bid)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        if not math.isnan(reward):
            sum_reward = sum_reward + reward
        # print("sum_reward", sum_reward, "reward", reward, "terminated", terminated, "action", action)
        reward = torch.tensor([reward], device=device)
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key] * TAU + target_net_state_dict[key] * (1 - TAU)
        target_net.load_state_dict(target_net_state_dict)

        # print("sum_reward", sum_reward)
        if done:
            # if len(episode_rewards) > 0 or sum_reward > -200:
            episode_rewards.append(sum_reward)
            plot_rewards()

            # End-of-episode diagnostics: budget usage and how close the won
            # impression mix got to the desired distributions.
            print("############# Budget used:", 1 - env.budget_left / env.initial_budget)
            print("############# sum_reward:", sum_reward)
            print("############# Desire distributions:", env.desired_distributions)
            print("############# Real distributions:", env.current_distributions)
            break
777
+
778
print('Complete')
plot_rewards(show_result=True)
plt.ioff()
plt.show()

# Checkpoint destination. NOTE(review): hard-coded absolute Windows path —
# consider making this configurable before running on another machine.
MODEL_PATH = "d:\\proj\\theneuron\\tasks\\CS_155_ml_spotzi\\200_bidder_dqn_model.pt"

# Persist the trained weights plus the metadata needed to rebuild the
# network at inference time (observation/action sizes) and to resume
# training (optimizer state).
torch.save({
    "model_state_dict": policy_net.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "n_observations": n_observations,
    "n_actions": n_actions,
}, MODEL_PATH)

print(f"Model saved to {MODEL_PATH}")
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
training_200_041_250_GOOD_4.png ADDED