"""Tests for #1630 — gateway infinite 400 failure loop prevention. Verifies that: 1. Generic 400 errors with large sessions are treated as context-length errors and trigger compression instead of aborting. 2. The gateway does not persist messages when the agent fails early, preventing the session from growing on each failure. 3. Context-overflow failures produce helpful error messages suggesting /compact. """ import pytest from types import SimpleNamespace from unittest.mock import MagicMock, patch # --------------------------------------------------------------------------- # Test 1: Agent heuristic — generic 400 with large session → compression # --------------------------------------------------------------------------- class TestGeneric400Heuristic: """The agent should treat a generic 400 with a large session as a probable context-length error and trigger compression, not abort.""" def _make_agent(self): """Create a minimal AIAgent for testing error handling.""" with ( patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"), ): from run_agent import AIAgent a = AIAgent( api_key="test-key-12345", quiet_mode=True, skip_context_files=True, skip_memory=True, ) a.client = MagicMock() a._cached_system_prompt = "You are helpful." a._use_prompt_caching = False a.tool_delay = 0 a.compression_enabled = False return a def test_generic_400_with_small_session_is_client_error(self): """A generic 400 with a small session should still be treated as a non-retryable client error (not context overflow).""" error_msg = "error" status_code = 400 approx_tokens = 1000 # Small session api_messages = [{"role": "user", "content": "hi"}] # Simulate the phrase matching is_context_length_error = any(phrase in error_msg for phrase in [ 'context length', 'context size', 'maximum context', 'token limit', 'too many tokens', 'reduce the length', 'exceeds the limit', 'context window', 'request entity too large', 'prompt is too long', ]) assert not is_context_length_error # The heuristic should NOT trigger for small sessions ctx_len = 200000 is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80 is_generic_error = len(error_msg.strip()) < 30 assert not is_large_session # Small session → heuristic doesn't fire def test_generic_400_with_large_token_count_triggers_heuristic(self): """A generic 400 with high token count should be treated as probable context overflow.""" error_msg = "error" status_code = 400 ctx_len = 200000 approx_tokens = 100000 # > 40% of 200k api_messages = [{"role": "user", "content": "hi"}] * 20 is_context_length_error = any(phrase in error_msg for phrase in [ 'context length', 'context size', 'maximum context', ]) assert not is_context_length_error # Heuristic check is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80 is_generic_error = len(error_msg.strip()) < 30 assert is_large_session assert is_generic_error # Both conditions true → should be treated as context overflow def test_generic_400_with_many_messages_triggers_heuristic(self): """A generic 400 with >80 messages should trigger the heuristic even if estimated tokens are low.""" error_msg = "error" status_code = 400 ctx_len = 200000 approx_tokens = 5000 # Low token estimate api_messages = [{"role": "user", "content": "x"}] * 100 # > 80 messages is_large_session = approx_tokens > ctx_len * 0.4 or len(api_messages) > 80 is_generic_error = len(error_msg.strip()) < 30 assert is_large_session assert is_generic_error def test_specific_error_message_bypasses_heuristic(self): """A 400 with a specific, long error message should NOT trigger the heuristic even with a large session.""" error_msg = "invalid model: anthropic/claude-nonexistent-model is not available" status_code = 400 ctx_len = 200000 approx_tokens = 100000 is_generic_error = len(error_msg.strip()) < 30 assert not is_generic_error # Long specific message → heuristic doesn't fire def test_descriptive_context_error_caught_by_phrases(self): """Descriptive context-length errors should still be caught by the existing phrase matching (not the heuristic).""" error_msg = "prompt is too long: 250000 tokens > 200000 maximum" is_context_length_error = any(phrase in error_msg for phrase in [ 'context length', 'context size', 'maximum context', 'token limit', 'too many tokens', 'reduce the length', 'exceeds the limit', 'context window', 'request entity too large', 'prompt is too long', ]) assert is_context_length_error # --------------------------------------------------------------------------- # Test 2: Gateway skips persistence on failed agent results # --------------------------------------------------------------------------- class TestGatewaySkipsPersistenceOnFailure: """When the agent returns failed=True with no final_response, the gateway should NOT persist messages to the transcript.""" def test_agent_failed_early_detected(self): """The agent_failed_early flag is True when failed=True and no final_response.""" agent_result = { "failed": True, "final_response": None, "messages": [], "error": "Non-retryable client error", } agent_failed_early = ( agent_result.get("failed") and not agent_result.get("final_response") ) assert agent_failed_early def test_agent_with_response_not_failed_early(self): """When the agent has a final_response, it's not a failed-early scenario even if failed=True.""" agent_result = { "failed": True, "final_response": "Here is a partial response", "messages": [], } agent_failed_early = ( agent_result.get("failed") and not agent_result.get("final_response") ) assert not agent_failed_early def test_successful_agent_not_failed_early(self): """A successful agent result should not trigger skip.""" agent_result = { "final_response": "Hello!", "messages": [{"role": "assistant", "content": "Hello!"}], } agent_failed_early = ( agent_result.get("failed") and not agent_result.get("final_response") ) assert not agent_failed_early # --------------------------------------------------------------------------- # Test 3: Context-overflow error messages # --------------------------------------------------------------------------- class TestContextOverflowErrorMessages: """The gateway should produce helpful error messages when the failure looks like a context overflow.""" def test_detects_context_keywords(self): """Error messages containing context-related keywords should be identified as context failures.""" keywords = [ "context length exceeded", "too many tokens in the prompt", "request entity too large", "payload too large for model", "context window exceeded", ] for error_str in keywords: _is_ctx_fail = any(p in error_str.lower() for p in ( "context", "token", "too large", "too long", "exceed", "payload", )) assert _is_ctx_fail, f"Should detect: {error_str}" def test_detects_generic_400_with_large_history(self): """A generic 400 error code in the string with a large history should be flagged as context failure.""" error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}" history_len = 100 # Large session _is_ctx_fail = any(p in error_str.lower() for p in ( "context", "token", "too large", "too long", "exceed", "payload", )) or ( "400" in error_str.lower() and history_len > 50 ) assert _is_ctx_fail def test_unrelated_error_not_flagged(self): """Unrelated errors should not be flagged as context failures.""" error_str = "invalid api key: authentication failed" history_len = 10 _is_ctx_fail = any(p in error_str.lower() for p in ( "context", "token", "too large", "too long", "exceed", "payload", )) or ( "400" in error_str.lower() and history_len > 50 ) assert not _is_ctx_fail # --------------------------------------------------------------------------- # Test 4: Agent skips persistence for large failed sessions # --------------------------------------------------------------------------- class TestAgentSkipsPersistenceForLargeFailedSessions: """When a 400 error occurs and the session is large, the agent should skip persisting to prevent the growth loop.""" def test_large_session_400_skips_persistence(self): """Status 400 + high token count should skip persistence.""" status_code = 400 approx_tokens = 60000 # > 50000 threshold api_messages = [{"role": "user", "content": "x"}] * 10 should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80) assert should_skip def test_small_session_400_persists_normally(self): """Status 400 + small session should still persist.""" status_code = 400 approx_tokens = 5000 # < 50000 api_messages = [{"role": "user", "content": "x"}] * 10 # < 80 should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80) assert not should_skip def test_non_400_error_persists_normally(self): """Non-400 errors should always persist normally.""" status_code = 401 # Auth error approx_tokens = 100000 # Large session, but not a 400 api_messages = [{"role": "user", "content": "x"}] * 100 should_skip = status_code == 400 and (approx_tokens > 50000 or len(api_messages) > 80) assert not should_skip