fix(bedrock): force tool cachePoint via cache_control_injection_points

#24
agent/core/llm_params.py CHANGED
@@ -5,7 +5,12 @@ can import it without pulling in the whole agent loop / tool router and
5
  creating circular imports.
6
  """
7
 
8
- import os
 
 
 
 
 
9
 
10
 
11
  def _patch_litellm_effort_validation() -> None:
@@ -129,7 +134,8 @@ def _resolve_llm_params(
129
  1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
130
  free for users, billed to the Space owner via ``X-HF-Bill-To``).
131
  2. session.hf_token — the user's own token (CLI / OAuth / cache file).
132
- 3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
 
133
  """
134
  if model_name.startswith("anthropic/"):
135
  params: dict = {"model": model_name}
@@ -160,7 +166,20 @@ def _resolve_llm_params(
160
  # (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
161
  # The Anthropic thinking/effort shape is not forwarded through Converse
162
  # the same way, so we leave it off for now.
163
- return {"model": model_name}
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  if model_name.startswith("openai/"):
166
  params = {"model": model_name}
@@ -175,18 +194,13 @@ def _resolve_llm_params(
175
  return params
176
 
177
  hf_model = model_name.removeprefix("huggingface/")
178
- api_key = (
179
- os.environ.get("INFERENCE_TOKEN")
180
- or session_hf_token
181
- or os.environ.get("HF_TOKEN")
182
- )
183
  params = {
184
  "model": f"openai/{hf_model}",
185
  "api_base": "https://router.huggingface.co/v1",
186
  "api_key": api_key,
187
  }
188
- if os.environ.get("INFERENCE_TOKEN"):
189
- bill_to = os.environ.get("HF_BILL_TO", "smolagents")
190
  params["extra_headers"] = {"X-HF-Bill-To": bill_to}
191
  if reasoning_effort:
192
  hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
 
5
  creating circular imports.
6
  """
7
 
8
+ from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token
9
+
10
+
11
+ def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
12
+ """Backward-compatible private wrapper used by tests and older imports."""
13
+ return resolve_hf_router_token(session_hf_token)
14
 
15
 
16
  def _patch_litellm_effort_validation() -> None:
 
134
  1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
135
  free for users, billed to the Space owner via ``X-HF-Bill-To``).
136
  2. session.hf_token — the user's own token (CLI / OAuth / cache file).
137
+ 3. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
138
+ local ``hf auth login`` cache.
139
  """
140
  if model_name.startswith("anthropic/"):
141
  params: dict = {"model": model_name}
 
166
  # (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
167
  # The Anthropic thinking/effort shape is not forwarded through Converse
168
  # the same way, so we leave it off for now.
169
+ params: dict = {"model": model_name}
170
+ # ``cache_control_injection_points`` instructs the Converse adapter to
171
+ # append a cachePoint at the end of the tool list. Per-tool
172
+ # ``cache_control`` blocks (set by prompt_caching.py for the Anthropic
173
+ # native path) are otherwise silently dropped by Converse, leaving the
174
+ # ~16k tokens of tool defs uncached on every Bedrock turn.
175
+ # Only enabled for Anthropic-on-Bedrock models since other Bedrock
176
+ # providers (Titan, Llama, Mistral...) don't support cachePoint and
177
+ # Bedrock returns an error if it's set on an unsupported model.
178
+ # System-prompt caching still works via cache_control on system content
179
+ # blocks (Converse reads those for any provider).
180
+ if "anthropic" in model_name:
181
+ params["cache_control_injection_points"] = [{"location": "tool_config"}]
182
+ return params
183
 
184
  if model_name.startswith("openai/"):
185
  params = {"model": model_name}
 
194
  return params
195
 
196
  hf_model = model_name.removeprefix("huggingface/")
197
+ api_key = _resolve_hf_router_token(session_hf_token)
 
 
 
 
198
  params = {
199
  "model": f"openai/{hf_model}",
200
  "api_base": "https://router.huggingface.co/v1",
201
  "api_key": api_key,
202
  }
203
+ if bill_to := get_hf_bill_to():
 
204
  params["extra_headers"] = {"X-HF-Bill-To": bill_to}
205
  if reasoning_effort:
206
  hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
agent/core/session.py CHANGED
@@ -79,8 +79,10 @@ class Session:
79
  hf_token: str | None = None,
80
  local_mode: bool = False,
81
  stream: bool = True,
 
82
  ):
83
  self.hf_token: Optional[str] = hf_token
 
84
  self.tool_router = tool_router
85
  self.stream = stream
86
  tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
@@ -199,11 +201,21 @@ class Session:
199
  tools = self.tool_router.get_tool_specs_for_llm() or []
200
  except Exception:
201
  tools = []
 
 
 
 
 
 
 
 
202
  return {
203
  "session_id": self.session_id,
 
204
  "session_start_time": self.session_start_time,
205
  "session_end_time": datetime.now().isoformat(),
206
  "model_name": self.config.model_name,
 
207
  "messages": [msg.model_dump() for msg in self.context_manager.items],
208
  "events": self.logged_events,
209
  "tools": tools,
 
79
  hf_token: str | None = None,
80
  local_mode: bool = False,
81
  stream: bool = True,
82
+ user_id: str | None = None,
83
  ):
84
  self.hf_token: Optional[str] = hf_token
85
+ self.user_id: Optional[str] = user_id
86
  self.tool_router = tool_router
87
  self.stream = stream
88
  tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
 
201
  tools = self.tool_router.get_tool_specs_for_llm() or []
202
  except Exception:
203
  tools = []
204
+ # Sum per-call cost from llm_call events so analyzers don't have to
205
+ # walk the events array themselves. Each `llm_call` event already
206
+ # carries cost_usd from `agent.core.telemetry.record_llm_call`.
207
+ total_cost_usd = sum(
208
+ float((e.get("data") or {}).get("cost_usd") or 0.0)
209
+ for e in self.logged_events
210
+ if e.get("event_type") == "llm_call"
211
+ )
212
  return {
213
  "session_id": self.session_id,
214
+ "user_id": self.user_id,
215
  "session_start_time": self.session_start_time,
216
  "session_end_time": datetime.now().isoformat(),
217
  "model_name": self.config.model_name,
218
+ "total_cost_usd": total_cost_usd,
219
  "messages": [msg.model_dump() for msg in self.context_manager.items],
220
  "events": self.logged_events,
221
  "tools": tools,
agent/core/session_uploader.py CHANGED
@@ -90,9 +90,11 @@ def upload_session_as_file(
90
  # across sessions with different tool rosters.
91
  session_row = {
92
  "session_id": data["session_id"],
 
93
  "session_start_time": data["session_start_time"],
94
  "session_end_time": data["session_end_time"],
95
  "model_name": data["model_name"],
 
96
  "messages": json.dumps(scrubbed_messages),
97
  "events": json.dumps(scrubbed_events),
98
  "tools": json.dumps(scrubbed_tools),
 
90
  # across sessions with different tool rosters.
91
  session_row = {
92
  "session_id": data["session_id"],
93
+ "user_id": data.get("user_id"),
94
  "session_start_time": data["session_start_time"],
95
  "session_end_time": data["session_end_time"],
96
  "model_name": data["model_name"],
97
+ "total_cost_usd": data.get("total_cost_usd"),
98
  "messages": json.dumps(scrubbed_messages),
99
  "events": json.dumps(scrubbed_events),
100
  "tools": json.dumps(scrubbed_tools),
backend/session_manager.py CHANGED
@@ -192,7 +192,7 @@ class SessionManager:
192
  session_config.model_name = model
193
  session = Session(
194
  event_queue, config=session_config, tool_router=tool_router,
195
- hf_token=hf_token,
196
  )
197
  t1 = _time.monotonic()
198
  logger.info(f"Session initialized in {t1 - t0:.2f}s")
 
192
  session_config.model_name = model
193
  session = Session(
194
  event_queue, config=session_config, tool_router=tool_router,
195
+ hf_token=hf_token, user_id=user_id,
196
  )
197
  t1 = _time.monotonic()
198
  logger.info(f"Session initialized in {t1 - t0:.2f}s")