RishbhaJain Claude Sonnet 4.6 committed on
Commit
c10dcd0
·
1 Parent(s): 4d42a14

fix: pipeline-aware ordering, YoY demand signal, reward rebalancing

Browse files

- Fix bullwhip/overshoot: order against inventory position (on-hand +
pipeline) in both _simulate_rop and inventory_env.py /step endpoint.
Prevents 3 overlapping orders stacking during stockout recovery.
- Add demand_last_year_7d (7-day window same period last year) to
observation — surfaced from pre-generated 730-day demand series.
Propagated through server model, client dataclass, and train_grpo prompts.
- Shorten LOOKAHEAD_DAYS 365 → 30: matches adaptive policy horizon,
removes bias toward inflated constant-policy ROPs.
- Raise HOLDING_RATE 0.005 → 0.02: reduces 300:1 stockout/holding
asymmetry that incentivised excessive overstocking.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

agent/train_grpo.py CHANGED
@@ -107,6 +107,7 @@ def format_prompt(obs_dict: dict[str, Any], memory_bank: list[dict[str, Any]]) -
107
  "recent_stockouts": obs_dict["recent_stockouts"],
108
  "recent_lost_sales": round(obs_dict["recent_lost_sales"], 2),
109
  "pending_orders": obs_dict.get("pending_orders", []),
 
110
  "memory_bank": memory_bank[-MEMORY_SIZE:],
111
  }
112
  user_content = json.dumps(snapshot, separators=(",", ":"))
@@ -207,6 +208,7 @@ async def _run_episode_async(
207
  {"arrival_day": o.arrival_day, "quantity": o.quantity}
208
  for o in obs.pending_orders
209
  ],
 
210
  }
211
 
212
  messages = format_prompt(obs_dict, memory_bank)
@@ -312,7 +314,7 @@ FIXED_ORDER_COST = 150.0
312
  HOLDING_RATE = 0.005
313
  WRITE_OFF_RATE = 0.00143
314
  LEAD_TIME = 3
315
- LOOKAHEAD_DAYS = 365
316
  TARGET_FILL_RATE = 0.95
317
  FILL_RATE_WEIGHT = 0.4
318
 
@@ -362,8 +364,10 @@ def _simulate_rop(obs: dict[str, Any], rop: float) -> float:
362
  total_demand += demand
363
 
364
  order_qty = 0.0
365
- if inv <= rop:
366
- order_qty = max(0.0, rop - inv + mean_d * LEAD_TIME)
 
 
367
  pending.append((day + LEAD_TIME, order_qty))
368
 
369
  revenue = sold * SELLING_PRICE
@@ -616,6 +620,7 @@ async def _eval_episode_async(
616
  {"arrival_day": o.arrival_day, "quantity": o.quantity}
617
  for o in obs.pending_orders
618
  ],
 
619
  }
620
 
621
  messages = format_prompt(obs_dict, memory_bank)
 
107
  "recent_stockouts": obs_dict["recent_stockouts"],
108
  "recent_lost_sales": round(obs_dict["recent_lost_sales"], 2),
109
  "pending_orders": obs_dict.get("pending_orders", []),
110
+ "demand_last_year_7d": [round(d, 2) for d in obs_dict.get("demand_last_year_7d", [])],
111
  "memory_bank": memory_bank[-MEMORY_SIZE:],
112
  }
113
  user_content = json.dumps(snapshot, separators=(",", ":"))
 
208
  {"arrival_day": o.arrival_day, "quantity": o.quantity}
209
  for o in obs.pending_orders
210
  ],
211
+ "demand_last_year_7d": [round(d, 2) for d in obs.demand_last_year_7d],
212
  }
213
 
214
  messages = format_prompt(obs_dict, memory_bank)
 
314
  HOLDING_RATE = 0.005
315
  WRITE_OFF_RATE = 0.00143
316
  LEAD_TIME = 3
317
+ LOOKAHEAD_DAYS = 30
318
  TARGET_FILL_RATE = 0.95
319
  FILL_RATE_WEIGHT = 0.4
320
 
 
364
  total_demand += demand
365
 
366
  order_qty = 0.0
367
+ pipeline = sum(qty for arr, qty in pending)
368
+ inv_position = inv + pipeline
369
+ if inv_position <= rop:
370
+ order_qty = max(0.0, rop - inv_position + mean_d * LEAD_TIME)
371
  pending.append((day + LEAD_TIME, order_qty))
372
 
373
  revenue = sold * SELLING_PRICE
 
620
  {"arrival_day": o.arrival_day, "quantity": o.quantity}
621
  for o in obs.pending_orders
622
  ],
623
+ "demand_last_year_7d": [round(d, 2) for d in obs.demand_last_year_7d],
624
  }
625
 
626
  messages = format_prompt(obs_dict, memory_bank)
client/inventory_client.py CHANGED
@@ -51,6 +51,7 @@ class InventoryObservation:
51
  recent_lost_sales: float
52
  days_remaining: int
53
  pending_orders: List[PendingOrder]
 
54
 
55
  @classmethod
56
  def from_dict(cls, d: dict) -> "InventoryObservation":
@@ -65,6 +66,7 @@ class InventoryObservation:
65
  recent_lost_sales=d["recent_lost_sales"],
66
  days_remaining=d["days_remaining"],
67
  pending_orders=[PendingOrder(**o) for o in d["pending_orders"]],
 
68
  )
69
 
70
 
 
51
  recent_lost_sales: float
52
  days_remaining: int
53
  pending_orders: List[PendingOrder]
54
+ demand_last_year_7d: List[float]
55
 
56
  @classmethod
57
  def from_dict(cls, d: dict) -> "InventoryObservation":
 
66
  recent_lost_sales=d["recent_lost_sales"],
67
  days_remaining=d["days_remaining"],
68
  pending_orders=[PendingOrder(**o) for o in d["pending_orders"]],
69
+ demand_last_year_7d=d.get("demand_last_year_7d", []),
70
  )
71
 
72
 
reward.py CHANGED
@@ -5,7 +5,7 @@ from config import (
5
  )
6
 
7
  # Holding cost rate (fraction of unit cost per day)
8
- HOLDING_RATE = 0.005
9
 
10
 
11
  # ── Core P&L computation ───────────────────────────────────────────────────────
 
5
  )
6
 
7
  # Holding cost rate (fraction of unit cost per day)
8
+ HOLDING_RATE = 0.02
9
 
10
 
11
  # ── Core P&L computation ───────────────────────────────────────────────────────
server/inventory_env.py CHANGED
@@ -53,6 +53,7 @@ class InventoryObservation(BaseModel):
53
  recent_lost_sales: float
54
  days_remaining: int
55
  pending_orders: List[PendingOrder]
 
56
 
57
 
58
  class StepResult(BaseModel):
@@ -101,6 +102,11 @@ class EpisodeState:
101
  for o in self.order_processor.order_queue[:5]
102
  ]
103
 
 
 
 
 
 
104
  return InventoryObservation(
105
  day=self.day,
106
  current_inventory=self.inventory,
@@ -115,6 +121,7 @@ class EpisodeState:
115
  recent_lost_sales=self.lost_sales,
116
  days_remaining=SIM_DAYS - self.day,
117
  pending_orders=pending,
 
118
  )
119
 
120
 
@@ -183,8 +190,10 @@ def step(action: InventoryAction):
183
  qty = 0
184
  hist = episode.demand_series[max(0, day - 30):day]
185
  mean_demand = float(np.mean(hist)) if hist else 0.0
186
- if day < SIM_DAYS - LEAD_TIME and episode.inventory <= rop:
187
- qty = max(0.0, rop - episode.inventory + mean_demand * LEAD_TIME)
 
 
188
  if qty > 0:
189
  episode.order_processor.place_order(day, int(qty))
190
 
 
53
  recent_lost_sales: float
54
  days_remaining: int
55
  pending_orders: List[PendingOrder]
56
+ demand_last_year_7d: List[float]
57
 
58
 
59
  class StepResult(BaseModel):
 
102
  for o in self.order_processor.order_queue[:5]
103
  ]
104
 
105
+ ly_anchor = self.day - 365
106
+ ly_start = max(0, ly_anchor - 3)
107
+ ly_end = min(len(self.demand_series), ly_anchor + 4)
108
+ demand_last_year_7d = [float(d) for d in self.demand_series[ly_start:ly_end]]
109
+
110
  return InventoryObservation(
111
  day=self.day,
112
  current_inventory=self.inventory,
 
121
  recent_lost_sales=self.lost_sales,
122
  days_remaining=SIM_DAYS - self.day,
123
  pending_orders=pending,
124
+ demand_last_year_7d=demand_last_year_7d,
125
  )
126
 
127
 
 
190
  qty = 0
191
  hist = episode.demand_series[max(0, day - 30):day]
192
  mean_demand = float(np.mean(hist)) if hist else 0.0
193
+ pipeline = sum(o.quantity for o in episode.order_processor.order_queue)
194
+ inv_position = episode.inventory + pipeline
195
+ if day < SIM_DAYS - LEAD_TIME and inv_position <= rop:
196
+ qty = max(0.0, rop - inv_position + mean_demand * LEAD_TIME)
197
  if qty > 0:
198
  episode.order_processor.place_order(day, int(qty))
199