RishbhaJain Claude Sonnet 4.6 committed on
Commit
288043f
·
2 Parent(s): af95772 c041c09

Merge RJ into main — use RJ app.py with P&L reward visualization

Browse files
Files changed (6) hide show
  1. agent/llm_agent.py +34 -26
  2. agent/rl_agent.py +2 -15
  3. app.py +71 -9
  4. config.py +8 -2
  5. order_processor.py +4 -2
  6. server/inventory_env.py +48 -14
agent/llm_agent.py CHANGED
@@ -18,7 +18,7 @@ import re
18
  import sys
19
  from typing import Any
20
 
21
- import anthropic
22
 
23
  sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
24
 
@@ -28,27 +28,35 @@ SYSTEM_PROMPT = """\
28
  You are an expert inventory optimization agent operating inside a stochastic supply-chain simulation.
29
 
30
  YOUR OBJECTIVE:
31
- Maximize the fill rate (fraction of demand fulfilled) while minimizing inventory write-offs over a \
32
- 365-day episode. The episode ends at day 730 (after 365 days of decisions following a 365-day warm-up).
33
 
34
  ENVIRONMENT RULES:
35
- - Orders arrive exactly 3 days after placement (LEAD_TIME = 3)
36
- - An order is placed automatically whenever inventory <= your chosen reorder_point
37
- - Order quantity = reorder_point - current_inventory + mean_demand * LEAD_TIME (handled by the env)
38
- - Every 7 days, 1% of on-hand inventory is written off (waste/expiry)
39
- - Fill rate = total units fulfilled / total units demanded (target: >= 95%)
40
- - Reward is SPARSE: fill rate only stabilises after many days; plan ahead
 
 
 
 
 
 
 
 
 
 
41
 
42
  YOUR ACTION EACH STEP:
43
- Set `reorder_point` — the inventory level at or below which a replenishment order fires.
44
- A higher ROP builds safety buffer but risks write-offs. A lower ROP conserves stock but risks stockouts.
45
 
46
  REASONING GUIDANCE:
47
- - Analyse demand trend and variability before deciding
48
- - Account for pending orders already in the pipeline — they will arrive soon
49
- - After stockouts, raise ROP aggressively to rebuild buffer
50
- - If fill rate is healthy and inventory is high, consider lowering ROP to reduce write-offs
51
- - Think 3+ days ahead; your ROP today only shows its effect after lead time
52
 
53
  RESPONSE FORMAT — reply with ONLY a valid JSON object, no markdown fences:
54
  {"reorder_point": <float>, "reasoning": "<concise explanation>", "confidence": <float 0-1>}
@@ -56,14 +64,14 @@ RESPONSE FORMAT — reply with ONLY a valid JSON object, no markdown fences:
56
 
57
 
58
  class ClaudeInventoryAgent:
59
- """Inventory optimization agent backed by Claude claude-sonnet-4-5."""
60
 
61
  MEMORY_SIZE = 15
62
  HISTORY_TURNS = 6
63
- MODEL = "claude-sonnet-4-5"
64
 
65
  def __init__(self, api_key: str) -> None:
66
- self._client = anthropic.Anthropic(api_key=api_key)
67
  self._memory_bank: list[dict[str, Any]] = []
68
  self._conversation: list[dict[str, str]] = []
69
 
@@ -127,13 +135,13 @@ class ClaudeInventoryAgent:
127
  confidence: float
128
 
129
  try:
130
- response = self._client.messages.create(
 
131
  model=self.MODEL,
 
132
  max_tokens=512,
133
- system=SYSTEM_PROMPT,
134
- messages=messages,
135
  )
136
- raw_text: str = response.content[0].text # type: ignore[union-attr]
137
 
138
  try:
139
  parsed = self._parse_response(raw_text)
@@ -253,8 +261,8 @@ def _parse_args() -> argparse.Namespace:
253
  parser.add_argument(
254
  "--api-key",
255
  type=str,
256
- default=os.environ.get("ANTHROPIC_API_KEY", ""),
257
- help="Anthropic API key (defaults to ANTHROPIC_API_KEY env var).",
258
  )
259
  return parser.parse_args()
260
 
@@ -263,7 +271,7 @@ if __name__ == "__main__":
263
  args = _parse_args()
264
 
265
  if not args.api_key:
266
- print("Error: no Anthropic API key provided. Set ANTHROPIC_API_KEY or use --api-key.")
267
  sys.exit(1)
268
 
269
  asyncio.run(
 
18
  import sys
19
  from typing import Any
20
 
21
+ from huggingface_hub import InferenceClient
22
 
23
  sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
24
 
 
28
  You are an expert inventory optimization agent operating inside a stochastic supply-chain simulation.
29
 
30
  YOUR OBJECTIVE:
31
+ Maximize daily profit and fill rate over a 365-day episode (days 365–730 after a warm-up period).
 
32
 
33
  ENVIRONMENT RULES:
34
+ - Lead time: 3 days ± 1 day (stochastic — orders may arrive in 2, 3, or 4 days)
35
+ - An order fires automatically whenever inventory <= your reorder_point
36
+ - Order quantity = reorder_point - current_inventory + mean_demand * lead_time (handled by env)
37
+ - Spoilage: 0.143% of on-hand inventory is lost every day (~1% per week)
38
+ - unit_cost = $10, selling_price = $25, fixed_order_cost = $150 per order
39
+
40
+ DAILY REWARD FORMULA:
41
+ revenue = units_sold * 25
42
+ holding_cost = inventory * 10 * 0.005
43
+ stockout_penalty = lost_units * 15 (lost margin per unit)
44
+ order_cost = 150 (if ordered) + qty * 10
45
+ writeoff_cost = spoilage * 10
46
+ daily_reward = (revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost) / baseline
47
+
48
+ END-OF-EPISODE BONUS (day 730 only):
49
+ bonus = fill_rate * 0.5 + profit_ratio * 0.5
50
 
51
  YOUR ACTION EACH STEP:
52
+ Set `reorder_point` — the inventory threshold that triggers a replenishment order.
 
53
 
54
  REASONING GUIDANCE:
55
+ - Stockouts are expensive ($15/unit lost margin) — keep enough buffer for lead time uncertainty
56
+ - Excess inventory bleeds holding cost ($0.05/unit/day) and spoilage — don't over-order
57
+ - $150 fixed order cost: batch orders rather than ordering tiny amounts every day
58
+ - Account for pending orders in the pipeline before deciding to order more
59
+ - Think 3–4 days ahead due to stochastic lead times
60
 
61
  RESPONSE FORMAT — reply with ONLY a valid JSON object, no markdown fences:
62
  {"reorder_point": <float>, "reasoning": "<concise explanation>", "confidence": <float 0-1>}
 
64
 
65
 
66
  class ClaudeInventoryAgent:
67
+ """Inventory optimization agent backed by Qwen2.5-72B via HuggingFace Inference API."""
68
 
69
  MEMORY_SIZE = 15
70
  HISTORY_TURNS = 6
71
+ MODEL = "Qwen/Qwen2.5-72B-Instruct"
72
 
73
  def __init__(self, api_key: str) -> None:
74
+ self._client = InferenceClient(api_key=api_key)
75
  self._memory_bank: list[dict[str, Any]] = []
76
  self._conversation: list[dict[str, str]] = []
77
 
 
135
  confidence: float
136
 
137
  try:
138
+ hf_messages = [{"role": "system", "content": SYSTEM_PROMPT}] + messages
139
+ response = self._client.chat.completions.create(
140
  model=self.MODEL,
141
+ messages=hf_messages,
142
  max_tokens=512,
 
 
143
  )
144
+ raw_text: str = response.choices[0].message.content
145
 
146
  try:
147
  parsed = self._parse_response(raw_text)
 
261
  parser.add_argument(
262
  "--api-key",
263
  type=str,
264
+ default=os.environ.get("HF_TOKEN", ""),
265
+ help="HuggingFace token (defaults to HF_TOKEN env var).",
266
  )
267
  return parser.parse_args()
268
 
 
271
  args = _parse_args()
272
 
273
  if not args.api_key:
274
+ print("Error: no HuggingFace token provided. Set HF_TOKEN or use --api-key.")
275
  sys.exit(1)
276
 
277
  asyncio.run(
agent/rl_agent.py CHANGED
@@ -57,8 +57,6 @@ class InventoryGymEnv(gym.Env):
57
  self._inv_client = InventoryEnvClient(base_url)
58
  self._inv_client._client = self._http_client
59
 
60
- self._last_fill_rate: float = 0.0
61
-
62
  self.observation_space = spaces.Box(
63
  low=0.0,
64
  high=np.inf,
@@ -78,26 +76,15 @@ class InventoryGymEnv(gym.Env):
78
  def reset(self, *, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[np.ndarray, dict]:
79
  super().reset(seed=seed)
80
  obs = asyncio.run(self._inv_client.reset(env_type=self._env_type))
81
- self._last_fill_rate = 0.0
82
  return self._obs_to_array(obs), {}
83
 
84
  def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
85
  result = asyncio.run(
86
  self._inv_client.step(InventoryAction(reorder_point=float(action[0])))
87
  )
88
- obs = result.observation
89
-
90
- fill_rate_delta = obs.fill_rate_so_far - self._last_fill_rate
91
- shaped_reward = (
92
- fill_rate_delta * 10.0
93
- - obs.recent_lost_sales * 0.01
94
- - obs.current_inventory * 0.0001
95
- )
96
- self._last_fill_rate = obs.fill_rate_so_far
97
-
98
  return (
99
- self._obs_to_array(obs),
100
- float(shaped_reward),
101
  result.done,
102
  False,
103
  result.info,
 
57
  self._inv_client = InventoryEnvClient(base_url)
58
  self._inv_client._client = self._http_client
59
 
 
 
60
  self.observation_space = spaces.Box(
61
  low=0.0,
62
  high=np.inf,
 
76
  def reset(self, *, seed: int | None = None, options: dict[str, Any] | None = None) -> tuple[np.ndarray, dict]:
77
  super().reset(seed=seed)
78
  obs = asyncio.run(self._inv_client.reset(env_type=self._env_type))
 
79
  return self._obs_to_array(obs), {}
80
 
81
  def step(self, action: np.ndarray) -> tuple[np.ndarray, float, bool, bool, dict]:
82
  result = asyncio.run(
83
  self._inv_client.step(InventoryAction(reorder_point=float(action[0])))
84
  )
 
 
 
 
 
 
 
 
 
 
85
  return (
86
+ self._obs_to_array(result.observation),
87
+ float(result.reward),
88
  result.done,
89
  False,
90
  result.info,
app.py CHANGED
@@ -7,7 +7,7 @@ matplotlib.use("Agg")
7
  import matplotlib.pyplot as plt
8
  from huggingface_hub import InferenceClient
9
 
10
- from config import SIM_DAYS, HISTO_DAYS, LEAD_TIME
11
  from agent_environment import BaseAgent, SafetyStockAgent, ForecastAgent, MonteCarloAgent
12
  from demand_environment import GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance
13
  from demand_calculator import DemandCalculator
@@ -45,9 +45,12 @@ OUTPUT — respond with this exact JSON (no markdown fences):
45
 
46
  # ── Shared chart builder ───────────────────────────────────────────────────────
47
 
48
- def build_chart(daily_inventory, running_fill_rate, rop_markers, title):
49
- fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 5), sharex=True)
 
 
50
  days = list(range(len(daily_inventory)))
 
51
  ax1.plot(days, daily_inventory, color="steelblue", linewidth=0.8)
52
  if rop_markers:
53
  rop_days, rop_vals = zip(*rop_markers)
@@ -56,12 +59,35 @@ def build_chart(daily_inventory, running_fill_rate, rop_markers, title):
56
  ax1.legend(fontsize=8)
57
  ax1.set_ylabel("Inventory Level")
58
  ax1.set_title(title)
 
59
  ax2.plot(days, running_fill_rate, color="seagreen", linewidth=0.8)
60
  ax2.axhline(y=0.95, color="red", linestyle="--", linewidth=0.6, label="95% target")
61
  ax2.set_ylabel("Cumulative Fill Rate")
62
- ax2.set_xlabel("Evaluation Day")
63
  ax2.set_ylim(0, 1)
64
  ax2.legend(fontsize=8)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  plt.tight_layout()
66
  return fig
67
 
@@ -87,14 +113,17 @@ def run_simulation(agent_name, env_name):
87
  order_processor = OrderProcessor()
88
  performance_tracker = PerformanceTracker()
89
  inventory_manager = InventoryManager(order_processor=order_processor, agent=agent)
90
- daily_inventory, running_fill_rate = [], []
91
  total_demand, total_fulfilled = 0, 0
92
  for day in range(HISTO_DAYS, SIM_DAYS):
93
  demand_qty = dc.get_daily_demand(day)
94
  base_inv = inventory_manager.inventory
95
  inventory_manager.inventory_update(demand_qty)
 
96
  if day < SIM_DAYS - LEAD_TIME:
97
  inventory_manager.reorder(day)
 
 
98
  inventory_manager.process_deliveries(day)
99
  fulfilled = min(demand_qty, base_inv)
100
  daily_writeoff = inventory_manager.apply_writeoff(day)
@@ -103,8 +132,24 @@ def run_simulation(agent_name, env_name):
103
  performance_tracker.daily_performance(demand_qty, int(fulfilled), daily_writeoff)
104
  daily_inventory.append(inventory_manager.inventory)
105
  running_fill_rate.append(total_fulfilled / total_demand if total_demand > 0 else 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  summary = performance_tracker.performance_summary()
107
- fig = build_chart(daily_inventory, running_fill_rate, [], f"{agent_name} | {env_name}")
108
  metrics = (
109
  f"**Fill Rate:** {summary['fill_rate']:.2%} \n"
110
  f"**Stockouts:** {summary['stock_out_count']} \n"
@@ -151,7 +196,7 @@ def run_llm_simulation(env_name, hf_token):
151
  convo_history = []
152
  memory_bank = []
153
  current_rop = dc.daily_demand_distribution[HISTO_DAYS].demand_mean * LEAD_TIME
154
- daily_inventory, running_fill_rate, rop_markers = [], [], []
155
  total_demand, total_fulfilled = 0, 0
156
  decision_log = []
157
 
@@ -162,6 +207,7 @@ def run_llm_simulation(env_name, hf_token):
162
  inventory_manager.inventory_update(demand_qty)
163
 
164
  # Manual reorder using current_rop
 
165
  if day < SIM_DAYS - LEAD_TIME and inventory_manager.inventory <= current_rop:
166
  hist = [dc.daily_demand_distribution[d].actual_demand
167
  for d in range(max(0, day - 30), day)]
@@ -169,6 +215,7 @@ def run_llm_simulation(env_name, hf_token):
169
  qty = max(0, current_rop - inventory_manager.inventory + mean_d * LEAD_TIME)
170
  if qty > 0:
171
  order_processor.place_order(day, int(qty))
 
172
 
173
  inventory_manager.process_deliveries(day)
174
  fulfilled = min(demand_qty, base_inv)
@@ -180,6 +227,21 @@ def run_llm_simulation(env_name, hf_token):
180
  fr = total_fulfilled / total_demand if total_demand > 0 else 0
181
  running_fill_rate.append(fr)
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  # LLM decision every DECISION_INTERVAL days
184
  if (day - HISTO_DAYS) % DECISION_INTERVAL == 0 and day < SIM_DAYS - LEAD_TIME:
185
  hist30 = [dc.daily_demand_distribution[d].actual_demand
@@ -233,7 +295,7 @@ def run_llm_simulation(env_name, hf_token):
233
 
234
  # Yield live update
235
  fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
236
- f"Qwen2.5-72B | {env_name} | Day {day}/{SIM_DAYS}")
237
  summary = performance_tracker.performance_summary()
238
  metrics = (
239
  f"**Fill Rate:** {summary['fill_rate']:.2%} \n"
@@ -247,7 +309,7 @@ def run_llm_simulation(env_name, hf_token):
247
 
248
  # Final yield
249
  fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
250
- f"Qwen2.5-72B | {env_name} | COMPLETE")
251
  summary = performance_tracker.performance_summary()
252
  metrics = (
253
  f"**Fill Rate:** {summary['fill_rate']:.2%} \n"
 
7
  import matplotlib.pyplot as plt
8
  from huggingface_hub import InferenceClient
9
 
10
+ from config import SIM_DAYS, HISTO_DAYS, LEAD_TIME, UNIT_COST, SELLING_PRICE, FIXED_ORDER_COST, WRITE_OFF_RATE
11
  from agent_environment import BaseAgent, SafetyStockAgent, ForecastAgent, MonteCarloAgent
12
  from demand_environment import GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance
13
  from demand_calculator import DemandCalculator
 
45
 
46
  # ── Shared chart builder ───────────────────────────────────────────────────────
47
 
48
+ def build_chart(daily_inventory, running_fill_rate, rop_markers, title, daily_pnl=None):
49
+ n_rows = 3 if daily_pnl else 2
50
+ fig, axes = plt.subplots(n_rows, 1, figsize=(10, 4 + 2.5 * n_rows), sharex=True)
51
+ ax1, ax2 = axes[0], axes[1]
52
  days = list(range(len(daily_inventory)))
53
+
54
  ax1.plot(days, daily_inventory, color="steelblue", linewidth=0.8)
55
  if rop_markers:
56
  rop_days, rop_vals = zip(*rop_markers)
 
59
  ax1.legend(fontsize=8)
60
  ax1.set_ylabel("Inventory Level")
61
  ax1.set_title(title)
62
+
63
  ax2.plot(days, running_fill_rate, color="seagreen", linewidth=0.8)
64
  ax2.axhline(y=0.95, color="red", linestyle="--", linewidth=0.6, label="95% target")
65
  ax2.set_ylabel("Cumulative Fill Rate")
 
66
  ax2.set_ylim(0, 1)
67
  ax2.legend(fontsize=8)
68
+
69
+ if daily_pnl:
70
+ ax3 = axes[2]
71
+ revenues = [r["revenue"] for r in daily_pnl]
72
+ holding_costs = [r["holding_cost"] for r in daily_pnl]
73
+ stockout_pens = [r["stockout_penalty"] for r in daily_pnl]
74
+ order_costs = [r["order_cost"] for r in daily_pnl]
75
+ writeoff_costs = [r["writeoff_cost"] for r in daily_pnl]
76
+ net_profits = [r["daily_profit"] for r in daily_pnl]
77
+
78
+ ax3.fill_between(days, revenues, alpha=0.25, color="green", label="Revenue")
79
+ ax3.plot(days, net_profits, color="black", linewidth=0.9, label="Net profit")
80
+ ax3.fill_between(days, [-h for h in holding_costs], alpha=0.3, color="royalblue", label="Holding cost")
81
+ ax3.fill_between(days, [-s for s in stockout_pens], alpha=0.3, color="crimson", label="Stockout penalty")
82
+ ax3.fill_between(days, [-o for o in order_costs], alpha=0.25, color="darkorange", label="Order cost")
83
+ ax3.fill_between(days, [-w for w in writeoff_costs], alpha=0.25, color="purple", label="Write-off cost")
84
+ ax3.axhline(y=0, color="grey", linewidth=0.5)
85
+ ax3.set_ylabel("Daily P&L ($)")
86
+ ax3.set_xlabel("Evaluation Day")
87
+ ax3.legend(fontsize=7, ncol=3)
88
+ else:
89
+ ax2.set_xlabel("Evaluation Day")
90
+
91
  plt.tight_layout()
92
  return fig
93
 
 
113
  order_processor = OrderProcessor()
114
  performance_tracker = PerformanceTracker()
115
  inventory_manager = InventoryManager(order_processor=order_processor, agent=agent)
116
+ daily_inventory, running_fill_rate, daily_pnl = [], [], []
117
  total_demand, total_fulfilled = 0, 0
118
  for day in range(HISTO_DAYS, SIM_DAYS):
119
  demand_qty = dc.get_daily_demand(day)
120
  base_inv = inventory_manager.inventory
121
  inventory_manager.inventory_update(demand_qty)
122
+ q_before = len(order_processor.order_queue)
123
  if day < SIM_DAYS - LEAD_TIME:
124
  inventory_manager.reorder(day)
125
+ new_orders = order_processor.order_queue[q_before:]
126
+ ordered_qty = sum(o.quantity for o in new_orders)
127
  inventory_manager.process_deliveries(day)
128
  fulfilled = min(demand_qty, base_inv)
129
  daily_writeoff = inventory_manager.apply_writeoff(day)
 
132
  performance_tracker.daily_performance(demand_qty, int(fulfilled), daily_writeoff)
133
  daily_inventory.append(inventory_manager.inventory)
134
  running_fill_rate.append(total_fulfilled / total_demand if total_demand > 0 else 0)
135
+
136
+ lost = max(0, demand_qty - fulfilled)
137
+ revenue = fulfilled * SELLING_PRICE
138
+ holding_cost = inventory_manager.inventory * UNIT_COST * 0.005
139
+ stockout_penalty = lost * (SELLING_PRICE - UNIT_COST)
140
+ order_cost = (FIXED_ORDER_COST if ordered_qty > 0 else 0.0) + ordered_qty * UNIT_COST
141
+ writeoff_cost = daily_writeoff * UNIT_COST
142
+ daily_pnl.append({
143
+ "revenue": revenue,
144
+ "holding_cost": holding_cost,
145
+ "stockout_penalty": stockout_penalty,
146
+ "order_cost": order_cost,
147
+ "writeoff_cost": writeoff_cost,
148
+ "daily_profit": revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost,
149
+ })
150
+
151
  summary = performance_tracker.performance_summary()
152
+ fig = build_chart(daily_inventory, running_fill_rate, [], f"{agent_name} | {env_name}", daily_pnl)
153
  metrics = (
154
  f"**Fill Rate:** {summary['fill_rate']:.2%} \n"
155
  f"**Stockouts:** {summary['stock_out_count']} \n"
 
196
  convo_history = []
197
  memory_bank = []
198
  current_rop = dc.daily_demand_distribution[HISTO_DAYS].demand_mean * LEAD_TIME
199
+ daily_inventory, running_fill_rate, rop_markers, daily_pnl = [], [], [], []
200
  total_demand, total_fulfilled = 0, 0
201
  decision_log = []
202
 
 
207
  inventory_manager.inventory_update(demand_qty)
208
 
209
  # Manual reorder using current_rop
210
+ ordered_qty = 0
211
  if day < SIM_DAYS - LEAD_TIME and inventory_manager.inventory <= current_rop:
212
  hist = [dc.daily_demand_distribution[d].actual_demand
213
  for d in range(max(0, day - 30), day)]
 
215
  qty = max(0, current_rop - inventory_manager.inventory + mean_d * LEAD_TIME)
216
  if qty > 0:
217
  order_processor.place_order(day, int(qty))
218
+ ordered_qty = qty
219
 
220
  inventory_manager.process_deliveries(day)
221
  fulfilled = min(demand_qty, base_inv)
 
227
  fr = total_fulfilled / total_demand if total_demand > 0 else 0
228
  running_fill_rate.append(fr)
229
 
230
+ lost = max(0, demand_qty - fulfilled)
231
+ revenue = fulfilled * SELLING_PRICE
232
+ holding_cost = inventory_manager.inventory * UNIT_COST * 0.005
233
+ stockout_penalty = lost * (SELLING_PRICE - UNIT_COST)
234
+ order_cost = (FIXED_ORDER_COST if ordered_qty > 0 else 0.0) + ordered_qty * UNIT_COST
235
+ writeoff_cost = daily_writeoff * UNIT_COST
236
+ daily_pnl.append({
237
+ "revenue": revenue,
238
+ "holding_cost": holding_cost,
239
+ "stockout_penalty": stockout_penalty,
240
+ "order_cost": order_cost,
241
+ "writeoff_cost": writeoff_cost,
242
+ "daily_profit": revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost,
243
+ })
244
+
245
  # LLM decision every DECISION_INTERVAL days
246
  if (day - HISTO_DAYS) % DECISION_INTERVAL == 0 and day < SIM_DAYS - LEAD_TIME:
247
  hist30 = [dc.daily_demand_distribution[d].actual_demand
 
295
 
296
  # Yield live update
297
  fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
298
+ f"Qwen2.5-72B | {env_name} | Day {day}/{SIM_DAYS}", daily_pnl)
299
  summary = performance_tracker.performance_summary()
300
  metrics = (
301
  f"**Fill Rate:** {summary['fill_rate']:.2%} \n"
 
309
 
310
  # Final yield
311
  fig = build_chart(daily_inventory, running_fill_rate, rop_markers,
312
+ f"Qwen2.5-72B | {env_name} | COMPLETE", daily_pnl)
313
  summary = performance_tracker.performance_summary()
314
  metrics = (
315
  f"**Fill Rate:** {summary['fill_rate']:.2%} \n"
config.py CHANGED
@@ -8,11 +8,17 @@ N_SIMULATIONS = 100
8
  MC_SIMS = 1000
9
 
10
  # Replenishment constraints & constants
11
- WRITE_OFF_RATE = 0.01
12
- WRITE_OFF_FREQUENCY = 7
 
 
 
 
 
13
 
14
  # Stock constraints
15
  LEAD_TIME = 3
 
16
  BASE_STOCK = 0
17
  DEFAULT_SERVICE_LEVEL = 0.95
18
 
 
8
  MC_SIMS = 1000
9
 
10
  # Replenishment constraints & constants
11
+ WRITE_OFF_RATE = 0.00143 # ~0.143% daily spoilage (≈ 1% per week)
12
+ WRITE_OFF_FREQUENCY = 1 # applied every day
13
+
14
+ # Economic parameters
15
+ UNIT_COST = 10.0 # purchase cost per unit
16
+ SELLING_PRICE = 25.0 # revenue per unit sold
17
+ FIXED_ORDER_COST = 150.0 # fixed cost per order placed
18
 
19
  # Stock constraints
20
  LEAD_TIME = 3
21
+ LEAD_TIME_JITTER = 1 # ±1 day randomness on lead time
22
  BASE_STOCK = 0
23
  DEFAULT_SERVICE_LEVEL = 0.95
24
 
order_processor.py CHANGED
@@ -1,6 +1,7 @@
1
  from dataclasses import dataclass
2
  from typing import List
3
- from config import LEAD_TIME
 
4
 
5
  @dataclass
6
  class Order:
@@ -12,7 +13,8 @@ class OrderProcessor:
12
  self.order_queue: List[Order] = [] # self.order_queue stores Order objects
13
 
14
  def place_order(self, time_period: int, quantity: int):
15
- arrival_day = time_period + LEAD_TIME # time_period = current_day
 
16
  self.order_queue.append(Order(arrival_day=arrival_day, quantity=quantity))
17
 
18
  def manage_order(self, time_period: int) -> int:
 
1
  from dataclasses import dataclass
2
  from typing import List
3
+ import numpy as np
4
+ from config import LEAD_TIME, LEAD_TIME_JITTER
5
 
6
  @dataclass
7
  class Order:
 
13
  self.order_queue: List[Order] = [] # self.order_queue stores Order objects
14
 
15
  def place_order(self, time_period: int, quantity: int):
16
+ jitter = np.random.randint(-LEAD_TIME_JITTER, LEAD_TIME_JITTER + 1)
17
+ arrival_day = max(time_period + 1, time_period + LEAD_TIME + jitter)
18
  self.order_queue.append(Order(arrival_day=arrival_day, quantity=quantity))
19
 
20
  def manage_order(self, time_period: int) -> int:
server/inventory_env.py CHANGED
@@ -11,6 +11,7 @@ from pydantic import BaseModel
11
  from config import (
12
  SIM_DAYS, HISTO_DAYS, LEAD_TIME,
13
  WRITE_OFF_RATE, WRITE_OFF_FREQUENCY,
 
14
  )
15
  from demand_environment import (
16
  GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance,
@@ -87,6 +88,8 @@ class EpisodeState:
87
  self.total_fulfilled: float = 0.0
88
  self.stockouts: int = 0
89
  self.lost_sales: float = 0.0
 
 
90
  self.initialized: bool = False
91
 
92
  def get_obs(self) -> InventoryObservation:
@@ -139,6 +142,11 @@ def reset(env_type: int = 0):
139
  episode.day = HISTO_DAYS
140
  episode.initialized = True
141
 
 
 
 
 
 
142
  return episode.get_obs()
143
 
144
 
@@ -162,18 +170,25 @@ def step(action: InventoryAction):
162
  o for o in episode.order_processor.order_queue if o.arrival_day > day
163
  ]
164
 
165
- # 2. Fulfill demand
166
- fulfilled = min(demand, episode.inventory)
 
 
 
 
 
 
167
  episode.inventory = max(0.0, episode.inventory - demand)
168
- lost = max(0.0, demand - fulfilled)
169
  if lost > 0:
170
  episode.stockouts += 1
171
  episode.lost_sales += lost
172
  episode.total_demand += demand
173
- episode.total_fulfilled += fulfilled
174
 
175
- # 3. Reorder if inventory at or below ROP
176
  rop = max(0.0, action.reorder_point)
 
177
  if day < SIM_DAYS - LEAD_TIME and episode.inventory <= rop:
178
  hist = episode.demand_series[max(0, day - 30):day]
179
  mean_demand = float(np.mean(hist)) if hist else 0.0
@@ -181,27 +196,43 @@ def step(action: InventoryAction):
181
  if qty > 0:
182
  episode.order_processor.place_order(day, int(qty))
183
 
184
- # 4. Weekly write-off
185
- if day % WRITE_OFF_FREQUENCY == 0:
186
- writeoff = int(episode.inventory * WRITE_OFF_RATE)
187
- episode.inventory -= writeoff
188
- episode.performance_tracker.write_offs += writeoff
189
-
190
  # 5. Track performance
191
  episode.performance_tracker.daily_performance(
192
  demand_quantity=demand,
193
- fulfilled_demand=int(fulfilled),
194
- daily_writeoff=0, # already applied above
195
  )
196
 
197
  episode.day += 1
198
  done = episode.day >= SIM_DAYS
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  fill_rate = (
201
  episode.total_fulfilled / episode.total_demand
202
  if episode.total_demand > 0 else 0.0
203
  )
204
- reward = fill_rate if done else -0.001
 
 
 
 
 
 
 
 
 
205
 
206
  return StepResult(
207
  observation=episode.get_obs(),
@@ -211,6 +242,9 @@ def step(action: InventoryAction):
211
  "fill_rate": fill_rate,
212
  "stockouts": episode.stockouts,
213
  "lost_sales": episode.lost_sales,
 
 
 
214
  "reasoning_logged": action.reasoning[:200] if action.reasoning else "",
215
  },
216
  )
 
11
  from config import (
12
  SIM_DAYS, HISTO_DAYS, LEAD_TIME,
13
  WRITE_OFF_RATE, WRITE_OFF_FREQUENCY,
14
+ UNIT_COST, SELLING_PRICE, FIXED_ORDER_COST,
15
  )
16
  from demand_environment import (
17
  GammaPoisson, GammaGammaHighVariance, SpikingDemand, SingleGammaLowVariance,
 
88
  self.total_fulfilled: float = 0.0
89
  self.stockouts: int = 0
90
  self.lost_sales: float = 0.0
91
+ self.cumulative_profit: float = 0.0
92
+ self.baseline_profit: float = 0.0
93
  self.initialized: bool = False
94
 
95
  def get_obs(self) -> InventoryObservation:
 
142
  episode.day = HISTO_DAYS
143
  episode.initialized = True
144
 
145
+ # Compute baseline profit: expected daily profit at full service (no stockouts)
146
+ episode_demand = episode.demand_series[HISTO_DAYS:]
147
+ mean_demand = float(np.mean(episode_demand)) if episode_demand else 0.0
148
+ episode.baseline_profit = mean_demand * (SELLING_PRICE - UNIT_COST)
149
+
150
  return episode.get_obs()
151
 
152
 
 
170
  o for o in episode.order_processor.order_queue if o.arrival_day > day
171
  ]
172
 
173
+ # 2. Daily spoilage (0.143% per day)
174
+ spoilage = episode.inventory * WRITE_OFF_RATE
175
+ writeoff_cost = spoilage * UNIT_COST
176
+ episode.inventory = max(0.0, episode.inventory - spoilage)
177
+ episode.performance_tracker.write_offs += spoilage
178
+
179
+ # 3. Fulfill demand
180
+ units_sold = min(demand, episode.inventory)
181
  episode.inventory = max(0.0, episode.inventory - demand)
182
+ lost = max(0.0, demand - units_sold)
183
  if lost > 0:
184
  episode.stockouts += 1
185
  episode.lost_sales += lost
186
  episode.total_demand += demand
187
+ episode.total_fulfilled += units_sold
188
 
189
+ # 4. Reorder if inventory at or below ROP
190
  rop = max(0.0, action.reorder_point)
191
+ qty = 0
192
  if day < SIM_DAYS - LEAD_TIME and episode.inventory <= rop:
193
  hist = episode.demand_series[max(0, day - 30):day]
194
  mean_demand = float(np.mean(hist)) if hist else 0.0
 
196
  if qty > 0:
197
  episode.order_processor.place_order(day, int(qty))
198
 
 
 
 
 
 
 
199
  # 5. Track performance
200
  episode.performance_tracker.daily_performance(
201
  demand_quantity=demand,
202
+ fulfilled_demand=int(units_sold),
203
+ daily_writeoff=0,
204
  )
205
 
206
  episode.day += 1
207
  done = episode.day >= SIM_DAYS
208
 
209
+ # 6. Compute dense daily P&L reward
210
+ revenue = units_sold * SELLING_PRICE
211
+ holding_cost = episode.inventory * UNIT_COST * 0.005
212
+ stockout_penalty = lost * (SELLING_PRICE - UNIT_COST)
213
+ order_cost = (FIXED_ORDER_COST if qty > 0 else 0.0) + qty * UNIT_COST
214
+
215
+ daily_profit = revenue - holding_cost - stockout_penalty - order_cost - writeoff_cost
216
+ episode.cumulative_profit += daily_profit
217
+
218
+ baseline = episode.baseline_profit
219
+ daily_reward = daily_profit / baseline if baseline > 0 else 0.0
220
+
221
+ # 7. Sparse episode bonus at end
222
  fill_rate = (
223
  episode.total_fulfilled / episode.total_demand
224
  if episode.total_demand > 0 else 0.0
225
  )
226
+ if done:
227
+ episode_length = SIM_DAYS - HISTO_DAYS
228
+ profit_ratio = (
229
+ episode.cumulative_profit / (baseline * episode_length)
230
+ if baseline > 0 else 0.0
231
+ )
232
+ episode_bonus = fill_rate * 0.5 + profit_ratio * 0.5
233
+ reward = daily_reward + episode_bonus
234
+ else:
235
+ reward = daily_reward
236
 
237
  return StepResult(
238
  observation=episode.get_obs(),
 
242
  "fill_rate": fill_rate,
243
  "stockouts": episode.stockouts,
244
  "lost_sales": episode.lost_sales,
245
+ "inventory_in": delivered,
246
+ "units_sold": units_sold,
247
+ "daily_profit": daily_profit,
248
  "reasoning_logged": action.reasoning[:200] if action.reasoning else "",
249
  },
250
  )