VibecoderMcSwaggins committed on
Commit
580b270
·
unverified ·
2 Parent(s): 949847c b9e710f

Merge pull request #137 from The-Obstacle-Is-The-Way/refactor/spec-21-middleware-architecture

Browse files
docs/specs/SPEC-21-MIDDLEWARE-ARCHITECTURE.md CHANGED
@@ -1,6 +1,6 @@
1
  # SPEC-21: Middleware Architecture Refactor
2
 
3
- **Status:** READY FOR IMPLEMENTATION
4
  **Priority:** P2 (Architectural hygiene + fixes HuggingFace retry bug)
5
  **Effort:** 2 hours
6
  **PR Scope:** Folder rename + new middleware implementations
 
1
  # SPEC-21: Middleware Architecture Refactor
2
 
3
+ **Status:** COMPLETED
4
  **Priority:** P2 (Architectural hygiene + fixes HuggingFace retry bug)
5
  **Effort:** 2 hours
6
  **PR Scope:** Folder rename + new middleware implementations
src/clients/huggingface.py CHANGED
@@ -27,6 +27,7 @@ from agent_framework._types import FunctionCallContent, FunctionResultContent
27
  from agent_framework.observability import use_observability
28
  from huggingface_hub import InferenceClient
29
 
 
30
  from src.utils.config import settings
31
 
32
  logger = structlog.get_logger()
@@ -51,7 +52,13 @@ class HuggingFaceChatClient(BaseChatClient): # type: ignore[misc]
51
  api_key: HF_TOKEN (optional, defaults to env var).
52
  **kwargs: Additional arguments passed to BaseChatClient.
53
  """
54
- super().__init__(**kwargs)
 
 
 
 
 
 
55
  # FIX: Use 7B model to stay on HuggingFace native infrastructure (avoid Novita 500s)
56
  self.model_id = model_id or settings.huggingface_model or "Qwen/Qwen2.5-7B-Instruct"
57
  self.api_key = api_key or settings.hf_token
 
27
  from agent_framework.observability import use_observability
28
  from huggingface_hub import InferenceClient
29
 
30
+ from src.middleware import RetryMiddleware, TokenTrackingMiddleware
31
  from src.utils.config import settings
32
 
33
  logger = structlog.get_logger()
 
52
  api_key: HF_TOKEN (optional, defaults to env var).
53
  **kwargs: Additional arguments passed to BaseChatClient.
54
  """
55
+ # Create middleware instances for retry and token tracking
56
+ middleware = [
57
+ RetryMiddleware(max_attempts=3, min_wait=1.0, max_wait=10.0),
58
+ TokenTrackingMiddleware(),
59
+ ]
60
+
61
+ super().__init__(middleware=middleware, **kwargs) # type: ignore[arg-type]
62
  # FIX: Use 7B model to stay on HuggingFace native infrastructure (avoid Novita 500s)
63
  self.model_id = model_id or settings.huggingface_model or "Qwen/Qwen2.5-7B-Instruct"
64
  self.api_key = api_key or settings.hf_token
src/middleware/__init__.py CHANGED
@@ -1 +1,10 @@
1
- """Middleware components for orchestration."""
 
 
 
 
 
 
 
 
 
 
1
+ """Microsoft Agent Framework middleware implementations.
2
+
3
+ These are interceptor-pattern middleware that wrap chat client calls.
4
+ They are NOT workflows - see src/workflows/ for orchestration patterns.
5
+ """
6
+
7
+ from src.middleware.retry import RetryMiddleware
8
+ from src.middleware.token_tracking import TokenTrackingMiddleware
9
+
10
+ __all__ = ["RetryMiddleware", "TokenTrackingMiddleware"]
src/middleware/retry.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Retry middleware for chat clients with exponential backoff."""
2
+
3
+ import asyncio
4
+ import random
5
+ from collections.abc import Awaitable, Callable
6
+
7
+ import structlog
8
+ from agent_framework._middleware import ChatContext, ChatMiddleware
9
+
10
+ logger = structlog.get_logger()
11
+
12
+
13
class RetryMiddleware(ChatMiddleware):
    """Retries failed chat requests with exponential backoff.

    This middleware intercepts chat client calls and retries on transient
    errors (rate limits, timeouts, server errors).

    Attributes:
        max_attempts: Maximum number of attempts (default: 3).
        min_wait: Minimum wait between retries in seconds (default: 1.0).
        max_wait: Maximum wait between retries in seconds (default: 10.0).
        retryable_status_codes: HTTP status codes to retry
            (default: 429, 500, 502, 503, 504).
    """

    def __init__(
        self,
        max_attempts: int = 3,
        min_wait: float = 1.0,
        max_wait: float = 10.0,
        retryable_status_codes: tuple[int, ...] = (429, 500, 502, 503, 504),
    ) -> None:
        # FIX: guard against a silent no-op. With max_attempts < 1 the
        # retry loop in process() never runs, so next() is never called
        # and no error is ever raised — the request silently vanishes.
        if max_attempts < 1:
            raise ValueError("max_attempts must be >= 1")
        self.max_attempts = max_attempts
        self.min_wait = min_wait
        self.max_wait = max_wait
        self.retryable_status_codes = retryable_status_codes

    def _is_retryable(self, error: Exception) -> bool:
        """Return True if *error* looks transient and is worth retrying."""
        # httpx-style HTTP errors carry a response with a status code;
        # decide purely from the status code when one is present.
        response = getattr(error, "response", None)
        if response is not None and hasattr(response, "status_code"):
            return response.status_code in self.retryable_status_codes

        # Fall back to a name-based heuristic for timeout/connection errors.
        error_name = type(error).__name__.lower()
        return "timeout" in error_name or "connection" in error_name

    def _calculate_wait(self, attempt: int) -> float:
        """Calculate wait time with exponential backoff and jitter."""
        wait = min(self.min_wait * (2**attempt), self.max_wait)
        # Add jitter (±25%) to avoid thundering-herd retries.
        jitter = wait * 0.25 * (2 * random.random() - 1)
        return float(max(self.min_wait, wait + jitter))

    async def process(
        self, context: ChatContext, next: Callable[[ChatContext], Awaitable[None]]
    ) -> None:
        """Process the chat request with retry logic.

        Raises:
            Exception: Immediately for non-retryable errors, or the last
                error encountered once all retry attempts are exhausted.
        """
        last_error: Exception | None = None

        for attempt in range(self.max_attempts):
            try:
                await next(context)
                return  # Success - exit retry loop

            except Exception as e:
                last_error = e

                if not self._is_retryable(e):
                    logger.warning(
                        "Non-retryable error",
                        error=str(e),
                        error_type=type(e).__name__,
                    )
                    raise  # Don't retry non-retryable errors

                # Sleep only between attempts, not after the final one.
                if attempt < self.max_attempts - 1:
                    wait_time = self._calculate_wait(attempt)
                    logger.info(
                        "Retrying after error",
                        attempt=attempt + 1,
                        max_attempts=self.max_attempts,
                        wait_seconds=wait_time,
                        error=str(e),
                    )
                    await asyncio.sleep(wait_time)

        # All retries exhausted
        logger.error(
            "All retry attempts failed",
            max_attempts=self.max_attempts,
            last_error=str(last_error),
        )
        if last_error is not None:
            raise last_error
src/middleware/token_tracking.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Token tracking middleware for monitoring API usage."""
2
+
3
+ from collections.abc import Awaitable, Callable
4
+
5
+ import structlog
6
+ from agent_framework._middleware import ChatContext, ChatMiddleware
7
+
8
+ logger = structlog.get_logger()
9
+
10
+
11
class TokenTrackingMiddleware(ChatMiddleware):
    """Tracks token usage across chat requests.

    This middleware logs token usage after each chat completion
    and maintains running totals for the session.

    Usage metrics are logged via structlog for observability.
    """

    def __init__(self) -> None:
        # Cumulative counters for the lifetime of this middleware instance.
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.request_count = 0

    @staticmethod
    def _extract_counts(usage: object) -> tuple[int, int]:
        """Read (input_tokens, output_tokens) from a usage payload.

        Supports dict-like payloads and attribute-based objects (Pydantic
        models, etc.), with OpenAI-style fallbacks (prompt_tokens /
        completion_tokens).

        FIX: the trailing ``or 0`` guards against keys/attributes that are
        present but set to None — previously ``usage.get("input_tokens", 0)
        or usage.get("prompt_tokens", 0)`` could return None (both keys
        present with None values), crashing the ``+=`` accumulation.
        """
        if hasattr(usage, "get"):
            # Dict-like access
            input_tokens = (
                usage.get("input_tokens") or usage.get("prompt_tokens") or 0
            )
            output_tokens = (
                usage.get("output_tokens") or usage.get("completion_tokens") or 0
            )
        else:
            # Object attribute access (Pydantic models, etc.)
            input_tokens = (
                getattr(usage, "input_tokens", 0)
                or getattr(usage, "prompt_tokens", 0)
                or 0
            )
            output_tokens = (
                getattr(usage, "output_tokens", 0)
                or getattr(usage, "completion_tokens", 0)
                or 0
            )
        return input_tokens, output_tokens

    async def process(
        self, context: ChatContext, next: Callable[[ChatContext], Awaitable[None]]
    ) -> None:
        """Process request and track token usage."""
        await next(context)

        # Extract usage from response if available
        if context.result is None:
            return

        usage = None

        # Try to get usage from response
        if hasattr(context.result, "usage"):
            usage = context.result.usage
        elif hasattr(context.result, "messages") and context.result.messages:
            # Check first message for usage metadata
            msg = context.result.messages[0]
            if hasattr(msg, "metadata") and msg.metadata:
                usage = msg.metadata.get("usage")

        if usage:
            input_tokens, output_tokens = self._extract_counts(usage)

            self.total_input_tokens += input_tokens
            self.total_output_tokens += output_tokens
            self.request_count += 1

            logger.info(
                "Token usage",
                request_input=input_tokens,
                request_output=output_tokens,
                total_input=self.total_input_tokens,
                total_output=self.total_output_tokens,
                total_requests=self.request_count,
            )

    def get_stats(self) -> dict[str, int]:
        """Get cumulative token usage statistics.

        Returns:
            Dictionary with total_input, total_output, and request_count.
        """
        return {
            "total_input": self.total_input_tokens,
            "total_output": self.total_output_tokens,
            "request_count": self.request_count,
        }
src/orchestrators/hierarchical.py CHANGED
@@ -19,11 +19,11 @@ import structlog
19
  from src.agents.judge_agent_llm import LLMSubIterationJudge
20
  from src.agents.magentic_agents import create_search_agent
21
  from src.config.domain import ResearchDomain
22
- from src.middleware.sub_iteration import SubIterationMiddleware, SubIterationTeam
23
  from src.orchestrators.base import OrchestratorProtocol
24
  from src.state import init_magentic_state
25
  from src.utils.models import AgentEvent, OrchestratorConfig
26
  from src.utils.service_loader import get_embedding_service_if_available
 
27
 
28
  logger = structlog.get_logger()
29
 
 
19
  from src.agents.judge_agent_llm import LLMSubIterationJudge
20
  from src.agents.magentic_agents import create_search_agent
21
  from src.config.domain import ResearchDomain
 
22
  from src.orchestrators.base import OrchestratorProtocol
23
  from src.state import init_magentic_state
24
  from src.utils.models import AgentEvent, OrchestratorConfig
25
  from src.utils.service_loader import get_embedding_service_if_available
26
+ from src.workflows.sub_iteration import SubIterationMiddleware, SubIterationTeam
27
 
28
  logger = structlog.get_logger()
29
 
src/{middleware → workflows}/.gitkeep RENAMED
File without changes
src/workflows/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Workflow components for orchestration.
2
+
3
+ These are workflow patterns (e.g., team→judge loops), NOT interceptor middleware.
4
+ For interceptor middleware, see src/middleware/.
5
+ """
6
+
7
+ from src.workflows.sub_iteration import (
8
+ SubIterationJudge,
9
+ SubIterationMiddleware,
10
+ SubIterationTeam,
11
+ )
12
+
13
+ __all__ = ["SubIterationJudge", "SubIterationMiddleware", "SubIterationTeam"]
src/{middleware → workflows}/sub_iteration.py RENAMED
File without changes
tests/unit/middleware/__init__.py ADDED
File without changes
tests/unit/middleware/test_retry.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unittest.mock import AsyncMock, MagicMock
2
+
3
+ import pytest
4
+
5
+ from src.middleware.retry import RetryMiddleware
6
+
7
+ pytestmark = pytest.mark.unit
8
+
9
+
10
@pytest.mark.asyncio
async def test_retry_middleware_succeeds_first_try():
    """A successful call passes straight through with a single attempt."""
    mw = RetryMiddleware(max_attempts=3)
    ctx = MagicMock()
    downstream = AsyncMock()

    await mw.process(ctx, downstream)

    # Exactly one invocation, with the original context.
    downstream.assert_called_once_with(ctx)
20
+
21
+
22
@pytest.mark.asyncio
async def test_retry_middleware_retries_on_429():
    """A 429 rate-limit error is retried until the call succeeds."""
    mw = RetryMiddleware(max_attempts=3, min_wait=0.01)
    ctx = MagicMock()

    # Record each invocation; fail the first two with a 429, then succeed.
    attempts = []

    async def flaky_next(c):
        attempts.append(c)
        if len(attempts) < 3:
            error = Exception("Rate limited")
            error.response = MagicMock(status_code=429)
            raise error

    await mw.process(ctx, flaky_next)
    assert len(attempts) == 3
41
+
42
+
43
@pytest.mark.asyncio
async def test_retry_middleware_raises_after_max_attempts():
    """Once attempts are exhausted, the last error propagates to the caller."""
    mw = RetryMiddleware(max_attempts=2, min_wait=0.01)
    ctx = MagicMock()

    async def failing_next(c):
        # Always raise a retryable 500 so every attempt fails.
        error = Exception("Always fails")
        error.response = MagicMock(status_code=500)
        raise error

    with pytest.raises(Exception, match="Always fails"):
        await mw.process(ctx, failing_next)
tests/unit/middleware/test_token_tracking.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unittest.mock import AsyncMock, MagicMock
2
+
3
+ import pytest
4
+
5
+ from src.middleware.token_tracking import TokenTrackingMiddleware
6
+
7
+ pytestmark = pytest.mark.unit
8
+
9
+
10
+ @pytest.mark.asyncio
11
+ async def test_token_tracking_middleware_counts_tokens():
12
+ """TokenTrackingMiddleware should count tokens from response."""
13
+ middleware = TokenTrackingMiddleware()
14
+ context = MagicMock()
15
+
16
+ # Mock response with usage
17
+ context.result.usage = {"input_tokens": 10, "output_tokens": 20}
18
+
19
+ next_fn = AsyncMock()
20
+
21
+ await middleware.process(context, next_fn)
22
+
23
+ assert middleware.total_input_tokens == 10
24
+ assert middleware.total_output_tokens == 20
25
+ assert middleware.request_count == 1
26
+
27
+
28
+ @pytest.mark.asyncio
29
+ async def test_token_tracking_middleware_handles_no_usage():
30
+ """TokenTrackingMiddleware should handle response without usage gracefully."""
31
+ middleware = TokenTrackingMiddleware()
32
+ context = MagicMock()
33
+ context.result = MagicMock()
34
+ del context.result.usage # Ensure usage attr doesn't exist
35
+ context.result.messages = [] # Ensure no messages
36
+
37
+ next_fn = AsyncMock()
38
+
39
+ await middleware.process(context, next_fn)
40
+
41
+ assert middleware.total_input_tokens == 0
42
+ assert middleware.total_output_tokens == 0
43
+ assert middleware.request_count == 0
tests/unit/test_hierarchical.py CHANGED
@@ -4,8 +4,8 @@ from unittest.mock import AsyncMock
4
 
5
  import pytest
6
 
7
- from src.middleware.sub_iteration import SubIterationMiddleware
8
  from src.utils.models import AssessmentDetails, JudgeAssessment
 
9
 
10
  pytestmark = pytest.mark.unit
11
 
 
4
 
5
  import pytest
6
 
 
7
  from src.utils.models import AssessmentDetails, JudgeAssessment
8
+ from src.workflows.sub_iteration import SubIterationMiddleware
9
 
10
  pytestmark = pytest.mark.unit
11