Spaces:

ademarteau
/

RL-Inventory-Simulations

Runtime error

RishbhaJain Claude Sonnet 4.6 committed on Mar 8

Commit

c10dcd0

1 Parent(s): 4d42a14

fix: pipeline-aware ordering, YoY demand signal, reward rebalancing

- Fix bullwhip/overshoot: order against inventory position (on-hand +
pipeline) in both _simulate_rop and the inventory_env.py /step endpoint.
Prevents 3 overlapping orders from stacking during stockout recovery.
- Add demand_last_year_7d (7-day window centred on the same day last year,
i.e. day - 365 ± 3) to the observation — surfaced from the pre-generated
730-day demand series.
Propagated through server model, client dataclass, and train_grpo prompts.
- Shorten LOOKAHEAD_DAYS 365 → 30: matches adaptive policy horizon,
removes bias toward inflated constant-policy ROPs.
- Raise HOLDING_RATE 0.005 → 0.02: reduces 300:1 stockout/holding
asymmetry that incentivised excessive overstocking.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show

agent/train_grpo.py +8 -3
client/inventory_client.py +2 -0
reward.py +1 -1
server/inventory_env.py +11 -2

agent/train_grpo.py CHANGED Viewed

@@ -107,6 +107,7 @@ def format_prompt(obs_dict: dict[str, Any], memory_bank: list[dict[str, Any]]) -
         "recent_stockouts": obs_dict["recent_stockouts"],
         "recent_lost_sales": round(obs_dict["recent_lost_sales"], 2),
         "pending_orders": obs_dict.get("pending_orders", []),
         "memory_bank": memory_bank[-MEMORY_SIZE:],
     }
     user_content = json.dumps(snapshot, separators=(",", ":"))
@@ -207,6 +208,7 @@ async def _run_episode_async(
                         {"arrival_day": o.arrival_day, "quantity": o.quantity}
                         for o in obs.pending_orders
                     ],
                 }
                 messages = format_prompt(obs_dict, memory_bank)
@@ -312,7 +314,7 @@ FIXED_ORDER_COST = 150.0
 HOLDING_RATE = 0.005
 WRITE_OFF_RATE = 0.00143
 LEAD_TIME = 3
-LOOKAHEAD_DAYS = 365
 TARGET_FILL_RATE = 0.95
 FILL_RATE_WEIGHT = 0.4
@@ -362,8 +364,10 @@ def _simulate_rop(obs: dict[str, Any], rop: float) -> float:
         total_demand += demand
         order_qty = 0.0
-        if inv <= rop:
-            order_qty = max(0.0, rop - inv + mean_d * LEAD_TIME)
             pending.append((day + LEAD_TIME, order_qty))
         revenue = sold * SELLING_PRICE
@@ -616,6 +620,7 @@ async def _eval_episode_async(
                     {"arrival_day": o.arrival_day, "quantity": o.quantity}
                     for o in obs.pending_orders
                 ],
             }
             messages = format_prompt(obs_dict, memory_bank)

         "recent_stockouts": obs_dict["recent_stockouts"],
         "recent_lost_sales": round(obs_dict["recent_lost_sales"], 2),
         "pending_orders": obs_dict.get("pending_orders", []),
+        "demand_last_year_7d": [round(d, 2) for d in obs_dict.get("demand_last_year_7d", [])],
         "memory_bank": memory_bank[-MEMORY_SIZE:],
     }
     user_content = json.dumps(snapshot, separators=(",", ":"))
                         {"arrival_day": o.arrival_day, "quantity": o.quantity}
                         for o in obs.pending_orders
                     ],
+                    "demand_last_year_7d": [round(d, 2) for d in obs.demand_last_year_7d],
                 }
                 messages = format_prompt(obs_dict, memory_bank)
 HOLDING_RATE = 0.005
 WRITE_OFF_RATE = 0.00143
 LEAD_TIME = 3
+LOOKAHEAD_DAYS = 30
 TARGET_FILL_RATE = 0.95
 FILL_RATE_WEIGHT = 0.4
         total_demand += demand
         order_qty = 0.0
+        pipeline = sum(qty for arr, qty in pending)
+        inv_position = inv + pipeline
+        if inv_position <= rop:
+            order_qty = max(0.0, rop - inv_position + mean_d * LEAD_TIME)
             pending.append((day + LEAD_TIME, order_qty))
         revenue = sold * SELLING_PRICE
                     {"arrival_day": o.arrival_day, "quantity": o.quantity}
                     for o in obs.pending_orders
                 ],
+                "demand_last_year_7d": [round(d, 2) for d in obs.demand_last_year_7d],
             }
             messages = format_prompt(obs_dict, memory_bank)

client/inventory_client.py CHANGED Viewed

@@ -51,6 +51,7 @@ class InventoryObservation:
     recent_lost_sales: float
     days_remaining: int
     pending_orders: List[PendingOrder]
     @classmethod
     def from_dict(cls, d: dict) -> "InventoryObservation":
@@ -65,6 +66,7 @@ class InventoryObservation:
             recent_lost_sales=d["recent_lost_sales"],
             days_remaining=d["days_remaining"],
             pending_orders=[PendingOrder(**o) for o in d["pending_orders"]],
         )

     recent_lost_sales: float
     days_remaining: int
     pending_orders: List[PendingOrder]
+    demand_last_year_7d: List[float]
     @classmethod
     def from_dict(cls, d: dict) -> "InventoryObservation":
             recent_lost_sales=d["recent_lost_sales"],
             days_remaining=d["days_remaining"],
             pending_orders=[PendingOrder(**o) for o in d["pending_orders"]],
+            demand_last_year_7d=d.get("demand_last_year_7d", []),
         )

reward.py CHANGED Viewed

@@ -5,7 +5,7 @@ from config import (
 )
 # Holding cost rate (fraction of unit cost per day)
-HOLDING_RATE = 0.005
 # ── Core P&L computation ───────────────────────────────────────────────────────

 )
 # Holding cost rate (fraction of unit cost per day)
+HOLDING_RATE = 0.02
 # ── Core P&L computation ───────────────────────────────────────────────────────

server/inventory_env.py CHANGED Viewed

@@ -53,6 +53,7 @@ class InventoryObservation(BaseModel):
     recent_lost_sales: float
     days_remaining: int
     pending_orders: List[PendingOrder]
 class StepResult(BaseModel):
@@ -101,6 +102,11 @@ class EpisodeState:
             for o in self.order_processor.order_queue[:5]
         ]
         return InventoryObservation(
             day=self.day,
             current_inventory=self.inventory,
@@ -115,6 +121,7 @@ class EpisodeState:
             recent_lost_sales=self.lost_sales,
             days_remaining=SIM_DAYS - self.day,
             pending_orders=pending,
         )
@@ -183,8 +190,10 @@ def step(action: InventoryAction):
     qty = 0
     hist = episode.demand_series[max(0, day - 30):day]
     mean_demand = float(np.mean(hist)) if hist else 0.0
-    if day < SIM_DAYS - LEAD_TIME and episode.inventory <= rop:
-        qty = max(0.0, rop - episode.inventory + mean_demand * LEAD_TIME)
         if qty > 0:
             episode.order_processor.place_order(day, int(qty))

     recent_lost_sales: float
     days_remaining: int
     pending_orders: List[PendingOrder]
+    demand_last_year_7d: List[float]
 class StepResult(BaseModel):
             for o in self.order_processor.order_queue[:5]
         ]
+        ly_anchor = self.day - 365
+        ly_start = max(0, ly_anchor - 3)
+        ly_end = min(len(self.demand_series), ly_anchor + 4)
+        demand_last_year_7d = [float(d) for d in self.demand_series[ly_start:ly_end]]
         return InventoryObservation(
             day=self.day,
             current_inventory=self.inventory,
             recent_lost_sales=self.lost_sales,
             days_remaining=SIM_DAYS - self.day,
             pending_orders=pending,
+            demand_last_year_7d=demand_last_year_7d,
         )
     qty = 0
     hist = episode.demand_series[max(0, day - 30):day]
     mean_demand = float(np.mean(hist)) if hist else 0.0
+    pipeline = sum(o.quantity for o in episode.order_processor.order_queue)
+    inv_position = episode.inventory + pipeline
+    if day < SIM_DAYS - LEAD_TIME and inv_position <= rop:
+        qty = max(0.0, rop - inv_position + mean_demand * LEAD_TIME)
         if qty > 0:
             episode.order_processor.place_order(day, int(qty))