Spaces:
Running on CPU Upgrade
fix(bedrock): force tool cachePoint via cache_control_injection_points
#24
by GuillaumeSalouHF HF Staff - opened
- agent/core/llm_params.py +24 -10
- agent/core/session.py +12 -0
- agent/core/session_uploader.py +2 -0
- backend/session_manager.py +1 -1
agent/core/llm_params.py
CHANGED
|
@@ -5,7 +5,12 @@ can import it without pulling in the whole agent loop / tool router and
|
|
| 5 |
creating circular imports.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def _patch_litellm_effort_validation() -> None:
|
|
@@ -129,7 +134,8 @@ def _resolve_llm_params(
|
|
| 129 |
1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
|
| 130 |
free for users, billed to the Space owner via ``X-HF-Bill-To``).
|
| 131 |
2. session.hf_token — the user's own token (CLI / OAuth / cache file).
|
| 132 |
-
3.
|
|
|
|
| 133 |
"""
|
| 134 |
if model_name.startswith("anthropic/"):
|
| 135 |
params: dict = {"model": model_name}
|
|
@@ -160,7 +166,20 @@ def _resolve_llm_params(
|
|
| 160 |
# (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
|
| 161 |
# The Anthropic thinking/effort shape is not forwarded through Converse
|
| 162 |
# the same way, so we leave it off for now.
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
if model_name.startswith("openai/"):
|
| 166 |
params = {"model": model_name}
|
|
@@ -175,18 +194,13 @@ def _resolve_llm_params(
|
|
| 175 |
return params
|
| 176 |
|
| 177 |
hf_model = model_name.removeprefix("huggingface/")
|
| 178 |
-
api_key = (
|
| 179 |
-
os.environ.get("INFERENCE_TOKEN")
|
| 180 |
-
or session_hf_token
|
| 181 |
-
or os.environ.get("HF_TOKEN")
|
| 182 |
-
)
|
| 183 |
params = {
|
| 184 |
"model": f"openai/{hf_model}",
|
| 185 |
"api_base": "https://router.huggingface.co/v1",
|
| 186 |
"api_key": api_key,
|
| 187 |
}
|
| 188 |
-
if
|
| 189 |
-
bill_to = os.environ.get("HF_BILL_TO", "smolagents")
|
| 190 |
params["extra_headers"] = {"X-HF-Bill-To": bill_to}
|
| 191 |
if reasoning_effort:
|
| 192 |
hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
|
|
|
|
| 5 |
creating circular imports.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
|
| 12 |
+
"""Backward-compatible private wrapper used by tests and older imports."""
|
| 13 |
+
return resolve_hf_router_token(session_hf_token)
|
| 14 |
|
| 15 |
|
| 16 |
def _patch_litellm_effort_validation() -> None:
|
|
|
|
| 134 |
1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
|
| 135 |
free for users, billed to the Space owner via ``X-HF-Bill-To``).
|
| 136 |
2. session.hf_token — the user's own token (CLI / OAuth / cache file).
|
| 137 |
+
3. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
|
| 138 |
+
local ``hf auth login`` cache.
|
| 139 |
"""
|
| 140 |
if model_name.startswith("anthropic/"):
|
| 141 |
params: dict = {"model": model_name}
|
|
|
|
| 166 |
# (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
|
| 167 |
# The Anthropic thinking/effort shape is not forwarded through Converse
|
| 168 |
# the same way, so we leave it off for now.
|
| 169 |
+
params: dict = {"model": model_name}
|
| 170 |
+
# ``cache_control_injection_points`` instructs the Converse adapter to
|
| 171 |
+
# append a cachePoint at the end of the tool list. Per-tool
|
| 172 |
+
# ``cache_control`` blocks (set by prompt_caching.py for the Anthropic
|
| 173 |
+
# native path) are otherwise silently dropped by Converse, leaving the
|
| 174 |
+
# ~16k tokens of tool defs uncached on every Bedrock turn.
|
| 175 |
+
# Only enabled for Anthropic-on-Bedrock models since other Bedrock
|
| 176 |
+
# providers (Titan, Llama, Mistral...) don't support cachePoint and
|
| 177 |
+
# Bedrock returns an error if it's set on an unsupported model.
|
| 178 |
+
# System-prompt caching still works via cache_control on system content
|
| 179 |
+
# blocks (Converse reads those for any provider).
|
| 180 |
+
if "anthropic" in model_name:
|
| 181 |
+
params["cache_control_injection_points"] = [{"location": "tool_config"}]
|
| 182 |
+
return params
|
| 183 |
|
| 184 |
if model_name.startswith("openai/"):
|
| 185 |
params = {"model": model_name}
|
|
|
|
| 194 |
return params
|
| 195 |
|
| 196 |
hf_model = model_name.removeprefix("huggingface/")
|
| 197 |
+
api_key = _resolve_hf_router_token(session_hf_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
params = {
|
| 199 |
"model": f"openai/{hf_model}",
|
| 200 |
"api_base": "https://router.huggingface.co/v1",
|
| 201 |
"api_key": api_key,
|
| 202 |
}
|
| 203 |
+
if bill_to := get_hf_bill_to():
|
|
|
|
| 204 |
params["extra_headers"] = {"X-HF-Bill-To": bill_to}
|
| 205 |
if reasoning_effort:
|
| 206 |
hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
|
agent/core/session.py
CHANGED
|
@@ -79,8 +79,10 @@ class Session:
|
|
| 79 |
hf_token: str | None = None,
|
| 80 |
local_mode: bool = False,
|
| 81 |
stream: bool = True,
|
|
|
|
| 82 |
):
|
| 83 |
self.hf_token: Optional[str] = hf_token
|
|
|
|
| 84 |
self.tool_router = tool_router
|
| 85 |
self.stream = stream
|
| 86 |
tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
|
|
@@ -199,11 +201,21 @@ class Session:
|
|
| 199 |
tools = self.tool_router.get_tool_specs_for_llm() or []
|
| 200 |
except Exception:
|
| 201 |
tools = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
return {
|
| 203 |
"session_id": self.session_id,
|
|
|
|
| 204 |
"session_start_time": self.session_start_time,
|
| 205 |
"session_end_time": datetime.now().isoformat(),
|
| 206 |
"model_name": self.config.model_name,
|
|
|
|
| 207 |
"messages": [msg.model_dump() for msg in self.context_manager.items],
|
| 208 |
"events": self.logged_events,
|
| 209 |
"tools": tools,
|
|
|
|
| 79 |
hf_token: str | None = None,
|
| 80 |
local_mode: bool = False,
|
| 81 |
stream: bool = True,
|
| 82 |
+
user_id: str | None = None,
|
| 83 |
):
|
| 84 |
self.hf_token: Optional[str] = hf_token
|
| 85 |
+
self.user_id: Optional[str] = user_id
|
| 86 |
self.tool_router = tool_router
|
| 87 |
self.stream = stream
|
| 88 |
tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
|
|
|
|
| 201 |
tools = self.tool_router.get_tool_specs_for_llm() or []
|
| 202 |
except Exception:
|
| 203 |
tools = []
|
| 204 |
+
# Sum per-call cost from llm_call events so analyzers don't have to
|
| 205 |
+
# walk the events array themselves. Each `llm_call` event already
|
| 206 |
+
# carries cost_usd from `agent.core.telemetry.record_llm_call`.
|
| 207 |
+
total_cost_usd = sum(
|
| 208 |
+
float((e.get("data") or {}).get("cost_usd") or 0.0)
|
| 209 |
+
for e in self.logged_events
|
| 210 |
+
if e.get("event_type") == "llm_call"
|
| 211 |
+
)
|
| 212 |
return {
|
| 213 |
"session_id": self.session_id,
|
| 214 |
+
"user_id": self.user_id,
|
| 215 |
"session_start_time": self.session_start_time,
|
| 216 |
"session_end_time": datetime.now().isoformat(),
|
| 217 |
"model_name": self.config.model_name,
|
| 218 |
+
"total_cost_usd": total_cost_usd,
|
| 219 |
"messages": [msg.model_dump() for msg in self.context_manager.items],
|
| 220 |
"events": self.logged_events,
|
| 221 |
"tools": tools,
|
agent/core/session_uploader.py
CHANGED
|
@@ -90,9 +90,11 @@ def upload_session_as_file(
|
|
| 90 |
# across sessions with different tool rosters.
|
| 91 |
session_row = {
|
| 92 |
"session_id": data["session_id"],
|
|
|
|
| 93 |
"session_start_time": data["session_start_time"],
|
| 94 |
"session_end_time": data["session_end_time"],
|
| 95 |
"model_name": data["model_name"],
|
|
|
|
| 96 |
"messages": json.dumps(scrubbed_messages),
|
| 97 |
"events": json.dumps(scrubbed_events),
|
| 98 |
"tools": json.dumps(scrubbed_tools),
|
|
|
|
| 90 |
# across sessions with different tool rosters.
|
| 91 |
session_row = {
|
| 92 |
"session_id": data["session_id"],
|
| 93 |
+
"user_id": data.get("user_id"),
|
| 94 |
"session_start_time": data["session_start_time"],
|
| 95 |
"session_end_time": data["session_end_time"],
|
| 96 |
"model_name": data["model_name"],
|
| 97 |
+
"total_cost_usd": data.get("total_cost_usd"),
|
| 98 |
"messages": json.dumps(scrubbed_messages),
|
| 99 |
"events": json.dumps(scrubbed_events),
|
| 100 |
"tools": json.dumps(scrubbed_tools),
|
backend/session_manager.py
CHANGED
|
@@ -192,7 +192,7 @@ class SessionManager:
|
|
| 192 |
session_config.model_name = model
|
| 193 |
session = Session(
|
| 194 |
event_queue, config=session_config, tool_router=tool_router,
|
| 195 |
-
hf_token=hf_token,
|
| 196 |
)
|
| 197 |
t1 = _time.monotonic()
|
| 198 |
logger.info(f"Session initialized in {t1 - t0:.2f}s")
|
|
|
|
| 192 |
session_config.model_name = model
|
| 193 |
session = Session(
|
| 194 |
event_queue, config=session_config, tool_router=tool_router,
|
| 195 |
+
hf_token=hf_token, user_id=user_id,
|
| 196 |
)
|
| 197 |
t1 = _time.monotonic()
|
| 198 |
logger.info(f"Session initialized in {t1 - t0:.2f}s")
|