| """Tests for LLM error classification helpers in agent.core.agent_loop. |
| |
| Covers two regressions on 2026-04-25: |
| |
| 1. Non-Anthropic context overflow (Kimi 365k > 262k) was not classified as |
| ``_is_context_overflow_error``, so the recovery path didn't fire and |
| session 62ccfdcb died with 68 wasted compaction events. |
| |
| 2. Bedrock TPM rate limit (`Too many tokens, please wait before trying |
| again.`) needs the longer rate-limit retry schedule. The old schedule |
| ([5, 15, 30] = 50s) burned through 6 sessions costing >$2,400 combined |
| on the same day. |
| """ |
|
|
from agent.core.agent_loop import (
    _LLM_RATE_LIMIT_RETRY_DELAYS,
    _LLM_RETRY_DELAYS,
    _MAX_LLM_RETRIES,
    _is_context_overflow_error,
    _is_rate_limit_error,
    _is_transient_error,
    _retry_delay_for,
)
|
|
|
|
# --- _is_context_overflow_error ---
|
|
|
|
def test_kimi_prompt_too_long_is_context_overflow():
    # Regression: session 62ccfdcb -- a 365,407-token Kimi prompt against a
    # 262,143-token window was not treated as a context overflow.
    err = Exception(
        "litellm.BadRequestError: OpenAIException - The prompt is too long: "
        "365407, model maximum context length: 262143"
    )
    assert _is_context_overflow_error(err)


def test_openai_context_length_exceeded_is_context_overflow():
    err = Exception("Error: This model's maximum context length is 8192 tokens.")
    assert _is_context_overflow_error(err)


def test_random_error_is_not_context_overflow():
    err = Exception("connection reset by peer")
    assert not _is_context_overflow_error(err)
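

# A minimal sketch of what the three tests above pin down: classification is
# assumed to be substring matching on str(err). Illustration only -- the real
# _is_context_overflow_error lives in agent.core.agent_loop and may match
# more patterns; _sketch_is_context_overflow is a hypothetical name that
# exists only in this file.
def _sketch_is_context_overflow(err):
    text = str(err).lower()
    return "prompt is too long" in text or "maximum context length" in text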
|
|
|
|
# --- _is_rate_limit_error / _is_transient_error ---
|
|
|
|
def test_bedrock_too_many_tokens_is_rate_limit():
    # Regression: Bedrock TPM throttling landed on the short retry schedule
    # and burned through 6 sessions in a single day.
    err = Exception(
        'litellm.RateLimitError: BedrockException - {"message":"Too many '
        'tokens, please wait before trying again."}'
    )
    assert _is_rate_limit_error(err)
    # Rate-limit errors must also count as transient, or the retry loop
    # would not fire at all.
    assert _is_transient_error(err)


def test_429_is_rate_limit():
    err = Exception("HTTP 429 Too Many Requests")
    assert _is_rate_limit_error(err)


def test_timeout_is_transient_but_not_rate_limit():
    err = Exception("Request timed out after 600s")
    assert _is_transient_error(err)
    assert not _is_rate_limit_error(err)
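

# Likewise, a hypothetical sketch of the two predicates exercised above
# (illustration only; the real ones live in agent.core.agent_loop and may
# match more patterns). The key assumed relationship: every rate-limit error
# is also transient, but not the reverse.
def _sketch_is_rate_limit(err):
    text = str(err).lower()
    return "429" in text or "too many" in text


def _sketch_is_transient(err):
    text = str(err).lower()
    return _sketch_is_rate_limit(err) or "timed out" in text or "503" in text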
|
|
|
|
# --- _retry_delay_for ---
|
|
|
|
def test_rate_limit_uses_longer_schedule():
    err = Exception("Too many tokens, please wait before trying again.")
    delays = [
        _retry_delay_for(err, i) for i in range(len(_LLM_RATE_LIMIT_RETRY_DELAYS))
    ]
    assert delays == _LLM_RATE_LIMIT_RETRY_DELAYS
    # Once the schedule is exhausted, the caller should stop retrying.
    assert _retry_delay_for(err, len(_LLM_RATE_LIMIT_RETRY_DELAYS)) is None


def test_other_transient_uses_short_schedule():
    err = Exception("503 service unavailable")
    delays = [_retry_delay_for(err, i) for i in range(len(_LLM_RETRY_DELAYS))]
    assert delays == _LLM_RETRY_DELAYS
    assert _retry_delay_for(err, len(_LLM_RETRY_DELAYS)) is None


def test_non_transient_returns_none():
    err = Exception("invalid request: bad parameter")
    assert _retry_delay_for(err, 0) is None


def test_rate_limit_total_budget_covers_bedrock_bucket_recovery():
    """The whole point of the rate-limit schedule: the total wait time must
    exceed the ~60s Bedrock TPM bucket recovery window. The old schedule
    summed to 5 + 15 + 30 = 50s and fell short."""
    assert len(_LLM_RATE_LIMIT_RETRY_DELAYS) == _MAX_LLM_RETRIES - 1
    assert sum(_LLM_RATE_LIMIT_RETRY_DELAYS) > 60
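

# Finally, a minimal sketch of the dispatch the schedule tests above pin down
# (illustration only; the real _retry_delay_for lives in
# agent.core.agent_loop, and _sketch_retry_delay_for is a hypothetical name):
def _sketch_retry_delay_for(err, attempt):
    if _is_rate_limit_error(err):
        # Longer schedule: has to outlast the ~60s Bedrock TPM bucket.
        schedule = _LLM_RATE_LIMIT_RETRY_DELAYS
    elif _is_transient_error(err):
        # Short schedule for ordinary transient blips.
        schedule = _LLM_RETRY_DELAYS
    else:
        return None  # non-transient errors are never retried
    # Past the end of the schedule, stop retrying.
    return schedule[attempt] if attempt < len(schedule) else None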
|
|