Spaces:

ademarteau
/

RL-Inventory-Simulations

Runtime error

App Files Community

RishbhaJain Claude Sonnet 4.6 committed on Mar 8

Commit

288043f

2 Parent(s): af95772 c041c09

Merge RJ into main — use RJ app.py with P&L reward visualization

Browse files

Files changed (6) hide show

agent/llm_agent.py +34 -26
agent/rl_agent.py +2 -15
app.py +71 -9
config.py +8 -2
order_processor.py +4 -2
server/inventory_env.py +48 -14

agent/llm_agent.py CHANGED Viewed

@@ -18,7 +18,7 @@ import re
 import sys
 from typing import Any
-import anthropic
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
@@ -28,27 +28,35 @@ SYSTEM_PROMPT = """\
 You are an expert inventory optimization agent operating inside a stochastic supply-chain simulation.
 YOUR OBJECTIVE:
-Maximize the fill rate (fraction of demand fulfilled) while minimizing inventory write-offs over a \
-365-day episode. The episode ends at day 730 (after 365 days of decisions following a 365-day warm-up).
 ENVIRONMENT RULES:
-- Orders arrive exactly 3 days after placement (LEAD_TIME = 3)
-- An order is placed automatically whenever inventory <= your chosen reorder_point
-- Order quantity = reorder_point - current_inventory + mean_demand * LEAD_TIME (handled by the env)
-- Every 7 days, 1% of on-hand inventory is written off (waste/expiry)
-- Fill rate = total units fulfilled / total units demanded (target: >= 95%)
-- Reward is SPARSE: fill rate only stabilises after many days; plan ahead
 YOUR ACTION EACH STEP:
-Set `reorder_point` — the inventory level at or below which a replenishment order fires.
-A higher ROP builds safety buffer but risks write-offs. A lower ROP conserves stock but risks stockouts.
 REASONING GUIDANCE:
-- Analyse demand trend and variability before deciding
-- Account for pending orders already in the pipeline — they will arrive soon
-- After stockouts, raise ROP aggressively to rebuild buffer
-- If fill rate is healthy and inventory is high, consider lowering ROP to reduce write-offs
-- Think 3+ days ahead; your ROP today only shows its effect after lead time
 RESPONSE FORMAT — reply with ONLY a valid JSON object, no markdown fences:
 {"reorder_point": <float>, "reasoning": "<concise explanation>", "confidence": <float 0-1>}
@@ -56,14 +64,14 @@ RESPONSE FORMAT — reply with ONLY a valid JSON object, no markdown fences:
 class ClaudeInventoryAgent:
-    """Inventory optimization agent backed by Claude claude-sonnet-4-5."""
     MEMORY_SIZE = 15
     HISTORY_TURNS = 6
-    MODEL = "claude-sonnet-4-5"
     def __init__(self, api_key: str) -> None:
-        self._client = anthropic.Anthropic(api_key=api_key)
         self._memory_bank: list[dict[str, Any]] = []
         self._conversation: list[dict[str, str]] = []
@@ -127,13 +135,13 @@ class ClaudeInventoryAgent:
         confidence: float
         try:
-            response = self._client.messages.create(
                 model=self.MODEL,
                 max_tokens=512,
-                system=SYSTEM_PROMPT,
-                messages=messages,
             )
-            raw_text: str = response.content[0].text  # type: ignore[union-attr]
             try:
                 parsed = self._parse_response(raw_text)
@@ -253,8 +261,8 @@ def _parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--api-key",
         type=str,
-        default=os.environ.get("ANTHROPIC_API_KEY", ""),
-        help="Anthropic API key (defaults to ANTHROPIC_API_KEY env var).",
     )
     return parser.parse_args()
@@ -263,7 +271,7 @@ if __name__ == "__main__":
     args = _parse_args()
     if not args.api_key:
-        print("Error: no Anthropic API key provided. Set ANTHROPIC_API_KEY or use --api-key.")
         sys.exit(1)
     asyncio.run(

 import sys
 from typing import Any
+from huggingface_hub import InferenceClient
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
 You are an expert inventory optimization agent operating inside a stochastic supply-chain simulation.
 YOUR OBJECTIVE:
+Maximize daily profit and fill rate over a 365-day episode (days 365–730 after a warm-up period).
 ENVIRONMENT RULES:
+- Lead time: 3 days ± 1 day (stochastic — orders may arrive in 2, 3, or 4 days)
+- An order fires automatically whenever inventory <= your reorder_point
+- Order quantity = reorder_point - current_inventory + mean_demand * lead_time (handled by env)
+- Spoilage: 0.143% of on-hand inventory is lost every day (~1% per week)
+- unit_cost = $10, selling_price = $25, fixed_order_cost = $150 per order
+DAILY REWARD FORMULA:
+  revenue          = units_sold * 25
+  holding_cost     = inventory * 10 * 0.005
+  stockout_penalty = lost_units * 15  (lost margin per unit)
+  order_cost       = 150 (if ordered) + qty * 10
+  writeoff_cost    = spoilage * 10
+  daily_reward     = (revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost) / baseline
+END-OF-EPISODE BONUS (day 730 only):
+  bonus = fill_rate * 0.5 + profit_ratio * 0.5
 YOUR ACTION EACH STEP:
+Set `reorder_point` — the inventory threshold that triggers a replenishment order.
 REASONING GUIDANCE:
+- Stockouts are expensive ($15/unit lost margin) — keep enough buffer for lead time uncertainty
+- Excess inventory bleeds holding cost ($0.05/unit/day) and spoilage — don't over-order
+- $150 fixed order cost: batch orders rather than ordering tiny amounts every day
+- Account for pending orders in the pipeline before deciding to order more
+- Think 3–4 days ahead due to stochastic lead times
 RESPONSE FORMAT — reply with ONLY a valid JSON object, no markdown fences:
 {"reorder_point": <float>, "reasoning": "<concise explanation>", "confidence": <float 0-1>}
 class ClaudeInventoryAgent:
+    """Inventory optimization agent backed by Qwen2.5-72B via HuggingFace Inference API."""
     MEMORY_SIZE = 15
     HISTORY_TURNS = 6
+    MODEL = "Qwen/Qwen2.5-72B-Instruct"
     def __init__(self, api_key: str) -> None:
+        self._client = InferenceClient(api_key=api_key)
         self._memory_bank: list[dict[str, Any]] = []
         self._conversation: list[dict[str, str]] = []
         confidence: float
         try:
+            hf_messages = [{"role": "system", "content": SYSTEM_PROMPT}] + messages
+            response = self._client.chat.completions.create(
                 model=self.MODEL,
+                messages=hf_messages,
                 max_tokens=512,
             )
+            raw_text: str = response.choices[0].message.content
             try:
                 parsed = self._parse_response(raw_text)
     parser.add_argument(
         "--api-key",
         type=str,
+        default=os.environ.get("HF_TOKEN", ""),
+        help="HuggingFace token (defaults to HF_TOKEN env var).",
     )
     return parser.parse_args()
     args = _parse_args()
     if not args.api_key:
+        print("Error: no HuggingFace token provided. Set HF_TOKEN or use --api-key.")
         sys.exit(1)
     asyncio.run(

agent/rl_agent.py CHANGED Viewed

@@ -57,8 +57,6 @@ class InventoryGymEnv(gym.Env):
         self._inv_client = InventoryEnvClient(base_url)
         self._inv_client._client = self._http_client
-        self._last_fill_rate: float = 0.0
         self.observation_space = spaces.Box(
             low=0.0,
             high=np.inf,
@@ -78,26 +76,15 @@ class InventoryGymEnv(gym.Env):
     def reset(self, *, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[np.ndarray, dict]:
         super().reset(seed=seed)
         obs = asyncio.run(self._inv_client.reset(env_type=self._env_type))
-        self._last_fill_rate = 0.0
         return self._obs_to_array(obs), {}
     def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
         result = asyncio.run(
             self._inv_client.step(InventoryAction(reorder_point=float(action[0])))
         )
-        obs = result.observation
-        fill_rate_delta = obs.fill_rate_so_far - self._last_fill_rate
-        shaped_reward = (
-            fill_rate_delta * 10.0
-            - obs.recent_lost_sales * 0.01
-            - obs.current_inventory * 0.0001
-        )
-        self._last_fill_rate = obs.fill_rate_so_far
         return (
-            self._obs_to_array(obs),
-            float(shaped_reward),
             result.done,
             False,
             result.info,

         self._inv_client = InventoryEnvClient(base_url)
         self._inv_client._client = self._http_client
         self.observation_space = spaces.Box(
             low=0.0,
             high=np.inf,
     def reset(self, *, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[np.ndarray, dict]:
         super().reset(seed=seed)
         obs = asyncio.run(self._inv_client.reset(env_type=self._env_type))
         return self._obs_to_array(obs), {}
     def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
         result = asyncio.run(
             self._inv_client.step(InventoryAction(reorder_point=float(action[0])))
         )
         return (
+            self._obs_to_array(result.observation),
+            float(result.reward),
             result.done,
             False,
             result.info,

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 from huggingface_hub import InferenceClient
-from config import SIM_DAYS, HISTO_DAYS, LEAD_TIME
 from agent_environment import BaseAgent, SafetyStockAgent, ForecastAgent, MonteCarloAgent
 from demand_environment import GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance
 from demand_calculator import DemandCalculator
@@ -45,9 +45,12 @@ OUTPUT — respond with this exact JSON (no markdown fences):
 # ── Shared chart builder ───────────────────────────────────────────────────────
-def build_chart(daily_inventory, running_fill_rate, rop_markers, title):
-    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 5), sharex=True)
     days = list(range(len(daily_inventory)))
     ax1.plot(days, daily_inventory, color="steelblue", linewidth=0.8)
     if rop_markers:
         rop_days, rop_vals = zip(*rop_markers)
@@ -56,12 +59,35 @@ def build_chart(daily_inventory, running_fill_rate, rop_markers, title):
         ax1.legend(fontsize=8)
     ax1.set_ylabel("Inventory Level")
     ax1.set_title(title)
     ax2.plot(days, running_fill_rate, color="seagreen", linewidth=0.8)
     ax2.axhline(y=0.95, color="red", linestyle="--", linewidth=0.6, label="95% target")
     ax2.set_ylabel("Cumulative Fill Rate")
-    ax2.set_xlabel("Evaluation Day")
     ax2.set_ylim(0, 1)
     ax2.legend(fontsize=8)
     plt.tight_layout()
     return fig
@@ -87,14 +113,17 @@ def run_simulation(agent_name, env_name):
     order_processor = OrderProcessor()
     performance_tracker = PerformanceTracker()
     inventory_manager = InventoryManager(order_processor=order_processor, agent=agent)
-    daily_inventory, running_fill_rate = [], []
     total_demand, total_fulfilled = 0, 0
     for day in range(HISTO_DAYS, SIM_DAYS):
         demand_qty = dc.get_daily_demand(day)
         base_inv = inventory_manager.inventory
         inventory_manager.inventory_update(demand_qty)
         if day < SIM_DAYS - LEAD_TIME:
             inventory_manager.reorder(day)
         inventory_manager.process_deliveries(day)
         fulfilled = min(demand_qty, base_inv)
         daily_writeoff = inventory_manager.apply_writeoff(day)
@@ -103,8 +132,24 @@ def run_simulation(agent_name, env_name):
         performance_tracker.daily_performance(demand_qty, int(fulfilled), daily_writeoff)
         daily_inventory.append(inventory_manager.inventory)
         running_fill_rate.append(total_fulfilled / total_demand if total_demand > 0 else 0)
     summary = performance_tracker.performance_summary()
-    fig = build_chart(daily_inventory, running_fill_rate, [], f"{agent_name}  |  {env_name}")
     metrics = (
         f"**Fill Rate:** {summary['fill_rate']:.2%}  \n"
         f"**Stockouts:** {summary['stock_out_count']}  \n"
@@ -151,7 +196,7 @@ def run_llm_simulation(env_name, hf_token):
     convo_history = []
     memory_bank = []
     current_rop = dc.daily_demand_distribution[HISTO_DAYS].demand_mean * LEAD_TIME
-    daily_inventory, running_fill_rate, rop_markers = [], [], []
     total_demand, total_fulfilled = 0, 0
     decision_log = []
@@ -162,6 +207,7 @@ def run_llm_simulation(env_name, hf_token):
         inventory_manager.inventory_update(demand_qty)
         # Manual reorder using current_rop
         if day < SIM_DAYS - LEAD_TIME and inventory_manager.inventory <= current_rop:
             hist = [dc.daily_demand_distribution[d].actual_demand
                     for d in range(max(0, day - 30), day)]
@@ -169,6 +215,7 @@ def run_llm_simulation(env_name, hf_token):
             qty = max(0, current_rop - inventory_manager.inventory + mean_d * LEAD_TIME)
             if qty > 0:
                 order_processor.place_order(day, int(qty))
         inventory_manager.process_deliveries(day)
         fulfilled = min(demand_qty, base_inv)
@@ -180,6 +227,21 @@ def run_llm_simulation(env_name, hf_token):
         fr = total_fulfilled / total_demand if total_demand > 0 else 0
         running_fill_rate.append(fr)
         # LLM decision every DECISION_INTERVAL days
         if (day - HISTO_DAYS) % DECISION_INTERVAL == 0 and day < SIM_DAYS - LEAD_TIME:
             hist30 = [dc.daily_demand_distribution[d].actual_demand
@@ -233,7 +295,7 @@ def run_llm_simulation(env_name, hf_token):
             # Yield live update
             fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
-                              f"Qwen2.5-72B  |  {env_name}  |  Day {day}/{SIM_DAYS}")
             summary = performance_tracker.performance_summary()
             metrics = (
                 f"**Fill Rate:** {summary['fill_rate']:.2%}  \n"
@@ -247,7 +309,7 @@ def run_llm_simulation(env_name, hf_token):
     # Final yield
     fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
-                      f"Qwen2.5-72B  |  {env_name}  |  COMPLETE")
     summary = performance_tracker.performance_summary()
     metrics = (
         f"**Fill Rate:** {summary['fill_rate']:.2%}  \n"

 import matplotlib.pyplot as plt
 from huggingface_hub import InferenceClient
+from config import SIM_DAYS, HISTO_DAYS, LEAD_TIME, UNIT_COST, SELLING_PRICE, FIXED_ORDER_COST, WRITE_OFF_RATE
 from agent_environment import BaseAgent, SafetyStockAgent, ForecastAgent, MonteCarloAgent
 from demand_environment import GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance
 from demand_calculator import DemandCalculator
 # ── Shared chart builder ───────────────────────────────────────────────────────
+def build_chart(daily_inventory, running_fill_rate, rop_markers, title, daily_pnl=None):
+    n_rows = 3 if daily_pnl else 2
+    fig, axes = plt.subplots(n_rows, 1, figsize=(10, 4 + 2.5 * n_rows), sharex=True)
+    ax1, ax2 = axes[0], axes[1]
     days = list(range(len(daily_inventory)))
     ax1.plot(days, daily_inventory, color="steelblue", linewidth=0.8)
     if rop_markers:
         rop_days, rop_vals = zip(*rop_markers)
         ax1.legend(fontsize=8)
     ax1.set_ylabel("Inventory Level")
     ax1.set_title(title)
     ax2.plot(days, running_fill_rate, color="seagreen", linewidth=0.8)
     ax2.axhline(y=0.95, color="red", linestyle="--", linewidth=0.6, label="95% target")
     ax2.set_ylabel("Cumulative Fill Rate")
     ax2.set_ylim(0, 1)
     ax2.legend(fontsize=8)
+    if daily_pnl:
+        ax3 = axes[2]
+        revenues       = [r["revenue"]          for r in daily_pnl]
+        holding_costs  = [r["holding_cost"]      for r in daily_pnl]
+        stockout_pens  = [r["stockout_penalty"]  for r in daily_pnl]
+        order_costs    = [r["order_cost"]        for r in daily_pnl]
+        writeoff_costs = [r["writeoff_cost"]     for r in daily_pnl]
+        net_profits    = [r["daily_profit"]      for r in daily_pnl]
+        ax3.fill_between(days, revenues, alpha=0.25, color="green", label="Revenue")
+        ax3.plot(days, net_profits,      color="black",  linewidth=0.9, label="Net profit")
+        ax3.fill_between(days, [-h for h in holding_costs],  alpha=0.3, color="royalblue",  label="Holding cost")
+        ax3.fill_between(days, [-s for s in stockout_pens],  alpha=0.3, color="crimson",    label="Stockout penalty")
+        ax3.fill_between(days, [-o for o in order_costs],    alpha=0.25, color="darkorange", label="Order cost")
+        ax3.fill_between(days, [-w for w in writeoff_costs], alpha=0.25, color="purple",     label="Write-off cost")
+        ax3.axhline(y=0, color="grey", linewidth=0.5)
+        ax3.set_ylabel("Daily P&L ($)")
+        ax3.set_xlabel("Evaluation Day")
+        ax3.legend(fontsize=7, ncol=3)
+    else:
+        ax2.set_xlabel("Evaluation Day")
     plt.tight_layout()
     return fig
     order_processor = OrderProcessor()
     performance_tracker = PerformanceTracker()
     inventory_manager = InventoryManager(order_processor=order_processor, agent=agent)
+    daily_inventory, running_fill_rate, daily_pnl = [], [], []
     total_demand, total_fulfilled = 0, 0
     for day in range(HISTO_DAYS, SIM_DAYS):
         demand_qty = dc.get_daily_demand(day)
         base_inv = inventory_manager.inventory
         inventory_manager.inventory_update(demand_qty)
+        q_before = len(order_processor.order_queue)
         if day < SIM_DAYS - LEAD_TIME:
             inventory_manager.reorder(day)
+        new_orders = order_processor.order_queue[q_before:]
+        ordered_qty = sum(o.quantity for o in new_orders)
         inventory_manager.process_deliveries(day)
         fulfilled = min(demand_qty, base_inv)
         daily_writeoff = inventory_manager.apply_writeoff(day)
         performance_tracker.daily_performance(demand_qty, int(fulfilled), daily_writeoff)
         daily_inventory.append(inventory_manager.inventory)
         running_fill_rate.append(total_fulfilled / total_demand if total_demand > 0 else 0)
+        lost = max(0, demand_qty - fulfilled)
+        revenue = fulfilled * SELLING_PRICE
+        holding_cost = inventory_manager.inventory * UNIT_COST * 0.005
+        stockout_penalty = lost * (SELLING_PRICE - UNIT_COST)
+        order_cost = (FIXED_ORDER_COST if ordered_qty > 0 else 0.0) + ordered_qty * UNIT_COST
+        writeoff_cost = daily_writeoff * UNIT_COST
+        daily_pnl.append({
+            "revenue": revenue,
+            "holding_cost": holding_cost,
+            "stockout_penalty": stockout_penalty,
+            "order_cost": order_cost,
+            "writeoff_cost": writeoff_cost,
+            "daily_profit": revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost,
+        })
     summary = performance_tracker.performance_summary()
+    fig = build_chart(daily_inventory, running_fill_rate, [], f"{agent_name}  |  {env_name}", daily_pnl)
     metrics = (
         f"**Fill Rate:** {summary['fill_rate']:.2%}  \n"
         f"**Stockouts:** {summary['stock_out_count']}  \n"
     convo_history = []
     memory_bank = []
     current_rop = dc.daily_demand_distribution[HISTO_DAYS].demand_mean * LEAD_TIME
+    daily_inventory, running_fill_rate, rop_markers, daily_pnl = [], [], [], []
     total_demand, total_fulfilled = 0, 0
     decision_log = []
         inventory_manager.inventory_update(demand_qty)
         # Manual reorder using current_rop
+        ordered_qty = 0
         if day < SIM_DAYS - LEAD_TIME and inventory_manager.inventory <= current_rop:
             hist = [dc.daily_demand_distribution[d].actual_demand
                     for d in range(max(0, day - 30), day)]
             qty = max(0, current_rop - inventory_manager.inventory + mean_d * LEAD_TIME)
             if qty > 0:
                 order_processor.place_order(day, int(qty))
+                ordered_qty = qty
         inventory_manager.process_deliveries(day)
         fulfilled = min(demand_qty, base_inv)
         fr = total_fulfilled / total_demand if total_demand > 0 else 0
         running_fill_rate.append(fr)
+        lost = max(0, demand_qty - fulfilled)
+        revenue = fulfilled * SELLING_PRICE
+        holding_cost = inventory_manager.inventory * UNIT_COST * 0.005
+        stockout_penalty = lost * (SELLING_PRICE - UNIT_COST)
+        order_cost = (FIXED_ORDER_COST if ordered_qty > 0 else 0.0) + ordered_qty * UNIT_COST
+        writeoff_cost = daily_writeoff * UNIT_COST
+        daily_pnl.append({
+            "revenue": revenue,
+            "holding_cost": holding_cost,
+            "stockout_penalty": stockout_penalty,
+            "order_cost": order_cost,
+            "writeoff_cost": writeoff_cost,
+            "daily_profit": revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost,
+        })
         # LLM decision every DECISION_INTERVAL days
         if (day - HISTO_DAYS) % DECISION_INTERVAL == 0 and day < SIM_DAYS - LEAD_TIME:
             hist30 = [dc.daily_demand_distribution[d].actual_demand
             # Yield live update
             fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
+                              f"Qwen2.5-72B  |  {env_name}  |  Day {day}/{SIM_DAYS}", daily_pnl)
             summary = performance_tracker.performance_summary()
             metrics = (
                 f"**Fill Rate:** {summary['fill_rate']:.2%}  \n"
     # Final yield
     fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
+                      f"Qwen2.5-72B  |  {env_name}  |  COMPLETE", daily_pnl)
     summary = performance_tracker.performance_summary()
     metrics = (
         f"**Fill Rate:** {summary['fill_rate']:.2%}  \n"

config.py CHANGED Viewed

@@ -8,11 +8,17 @@ N_SIMULATIONS = 100
 MC_SIMS = 1000
 # Replenishment constraints & constants
-WRITE_OFF_RATE = 0.01
-WRITE_OFF_FREQUENCY = 7
 # Stock constraints
 LEAD_TIME = 3
 BASE_STOCK = 0
 DEFAULT_SERVICE_LEVEL = 0.95

 MC_SIMS = 1000
 # Replenishment constraints & constants
+WRITE_OFF_RATE = 0.00143   # ~0.143% daily spoilage (≈ 1% per week)
+WRITE_OFF_FREQUENCY = 1    # applied every day
+# Economic parameters
+UNIT_COST = 10.0           # purchase cost per unit
+SELLING_PRICE = 25.0       # revenue per unit sold
+FIXED_ORDER_COST = 150.0   # fixed cost per order placed
 # Stock constraints
 LEAD_TIME = 3
+LEAD_TIME_JITTER = 1       # ±1 day randomness on lead time
 BASE_STOCK = 0
 DEFAULT_SERVICE_LEVEL = 0.95

order_processor.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from typing import List
-from config import LEAD_TIME
 @dataclass
 class Order:
@@ -12,7 +13,8 @@ class OrderProcessor:
         self.order_queue: List[Order] = [] # self.order_queue stores Order objects
     def place_order(self, time_period: int, quantity: int):
-        arrival_day = time_period + LEAD_TIME # time_period = current_day
         self.order_queue.append(Order(arrival_day=arrival_day, quantity=quantity))
     def manage_order(self, time_period: int) -> int:

 from dataclasses import dataclass
 from typing import List
+import numpy as np
+from config import LEAD_TIME, LEAD_TIME_JITTER
 @dataclass
 class Order:
         self.order_queue: List[Order] = [] # self.order_queue stores Order objects
     def place_order(self, time_period: int, quantity: int):
+        jitter = np.random.randint(-LEAD_TIME_JITTER, LEAD_TIME_JITTER + 1)
+        arrival_day = max(time_period + 1, time_period + LEAD_TIME + jitter)
         self.order_queue.append(Order(arrival_day=arrival_day, quantity=quantity))
     def manage_order(self, time_period: int) -> int:

server/inventory_env.py CHANGED Viewed

@@ -11,6 +11,7 @@ from pydantic import BaseModel
 from config import (
     SIM_DAYS, HISTO_DAYS, LEAD_TIME,
     WRITE_OFF_RATE, WRITE_OFF_FREQUENCY,
 )
 from demand_environment import (
     GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance,
@@ -87,6 +88,8 @@ class EpisodeState:
         self.total_fulfilled: float = 0.0
         self.stockouts: int = 0
         self.lost_sales: float = 0.0
         self.initialized: bool = False
     def get_obs(self) -> InventoryObservation:
@@ -139,6 +142,11 @@ def reset(env_type: int = 0):
     episode.day = HISTO_DAYS
     episode.initialized = True
     return episode.get_obs()
@@ -162,18 +170,25 @@ def step(action: InventoryAction):
         o for o in episode.order_processor.order_queue if o.arrival_day > day
     ]
-    # 2. Fulfill demand
-    fulfilled = min(demand, episode.inventory)
     episode.inventory = max(0.0, episode.inventory - demand)
-    lost = max(0.0, demand - fulfilled)
     if lost > 0:
         episode.stockouts += 1
     episode.lost_sales += lost
     episode.total_demand += demand
-    episode.total_fulfilled += fulfilled
-    # 3. Reorder if inventory at or below ROP
     rop = max(0.0, action.reorder_point)
     if day < SIM_DAYS - LEAD_TIME and episode.inventory <= rop:
         hist = episode.demand_series[max(0, day - 30):day]
         mean_demand = float(np.mean(hist)) if hist else 0.0
@@ -181,27 +196,43 @@ def step(action: InventoryAction):
         if qty > 0:
             episode.order_processor.place_order(day, int(qty))
-    # 4. Weekly write-off
-    if day % WRITE_OFF_FREQUENCY == 0:
-        writeoff = int(episode.inventory * WRITE_OFF_RATE)
-        episode.inventory -= writeoff
-        episode.performance_tracker.write_offs += writeoff
     # 5. Track performance
     episode.performance_tracker.daily_performance(
         demand_quantity=demand,
-        fulfilled_demand=int(fulfilled),
-        daily_writeoff=0,  # already applied above
     )
     episode.day += 1
     done = episode.day >= SIM_DAYS
     fill_rate = (
         episode.total_fulfilled / episode.total_demand
         if episode.total_demand > 0 else 0.0
     )
-    reward = fill_rate if done else -0.001
     return StepResult(
         observation=episode.get_obs(),
@@ -211,6 +242,9 @@ def step(action: InventoryAction):
             "fill_rate": fill_rate,
             "stockouts": episode.stockouts,
             "lost_sales": episode.lost_sales,
             "reasoning_logged": action.reasoning[:200] if action.reasoning else "",
         },
     )

 from config import (
     SIM_DAYS, HISTO_DAYS, LEAD_TIME,
     WRITE_OFF_RATE, WRITE_OFF_FREQUENCY,
+    UNIT_COST, SELLING_PRICE, FIXED_ORDER_COST,
 )
 from demand_environment import (
     GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance,
         self.total_fulfilled: float = 0.0
         self.stockouts: int = 0
         self.lost_sales: float = 0.0
+        self.cumulative_profit: float = 0.0
+        self.baseline_profit: float = 0.0
         self.initialized: bool = False
     def get_obs(self) -> InventoryObservation:
     episode.day = HISTO_DAYS
     episode.initialized = True
+    # Compute baseline profit: expected daily profit at full service (no stockouts)
+    episode_demand = episode.demand_series[HISTO_DAYS:]
+    mean_demand = float(np.mean(episode_demand)) if episode_demand else 0.0
+    episode.baseline_profit = mean_demand * (SELLING_PRICE - UNIT_COST)
     return episode.get_obs()
         o for o in episode.order_processor.order_queue if o.arrival_day > day
     ]
+    # 2. Daily spoilage (0.143% per day)
+    spoilage = episode.inventory * WRITE_OFF_RATE
+    writeoff_cost = spoilage * UNIT_COST
+    episode.inventory = max(0.0, episode.inventory - spoilage)
+    episode.performance_tracker.write_offs += spoilage
+    # 3. Fulfill demand
+    units_sold = min(demand, episode.inventory)
     episode.inventory = max(0.0, episode.inventory - demand)
+    lost = max(0.0, demand - units_sold)
     if lost > 0:
         episode.stockouts += 1
     episode.lost_sales += lost
     episode.total_demand += demand
+    episode.total_fulfilled += units_sold
+    # 4. Reorder if inventory at or below ROP
     rop = max(0.0, action.reorder_point)
+    qty = 0
     if day < SIM_DAYS - LEAD_TIME and episode.inventory <= rop:
         hist = episode.demand_series[max(0, day - 30):day]
         mean_demand = float(np.mean(hist)) if hist else 0.0
         if qty > 0:
             episode.order_processor.place_order(day, int(qty))
     # 5. Track performance
     episode.performance_tracker.daily_performance(
         demand_quantity=demand,
+        fulfilled_demand=int(units_sold),
+        daily_writeoff=0,
     )
     episode.day += 1
     done = episode.day >= SIM_DAYS
+    # 6. Compute dense daily P&L reward
+    revenue = units_sold * SELLING_PRICE
+    holding_cost = episode.inventory * UNIT_COST * 0.005
+    stockout_penalty = lost * (SELLING_PRICE - UNIT_COST)
+    order_cost = (FIXED_ORDER_COST if qty > 0 else 0.0) + qty * UNIT_COST
+    daily_profit = revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost
+    episode.cumulative_profit += daily_profit
+    baseline = episode.baseline_profit
+    daily_reward = daily_profit / baseline if baseline > 0 else 0.0
+    # 7. Sparse episode bonus at end
     fill_rate = (
         episode.total_fulfilled / episode.total_demand
         if episode.total_demand > 0 else 0.0
     )
+    if done:
+        episode_length = SIM_DAYS - HISTO_DAYS
+        profit_ratio = (
+            episode.cumulative_profit / (baseline * episode_length)
+            if baseline > 0 else 0.0
+        )
+        episode_bonus = fill_rate * 0.5 + profit_ratio * 0.5
+        reward = daily_reward + episode_bonus
+    else:
+        reward = daily_reward
     return StepResult(
         observation=episode.get_obs(),
             "fill_rate": fill_rate,
             "stockouts": episode.stockouts,
             "lost_sales": episode.lost_sales,
+            "inventory_in": delivered,
+            "units_sold": units_sold,
+            "daily_profit": daily_profit,
             "reasoning_logged": action.reasoning[:200] if action.reasoning else "",
         },
     )