Spaces:

VibecoderMcSwaggins
/

DeepBoner

Paused

VibecoderMcSwaggins committed 16 days ago

Commit

b1d094d

1 Parent(s): 0f049b6

feat: Implement SPEC_01 (Termination) and SPEC_02 (E2E Tests)

- SPEC_01: Added timeout (300s) and progress events to MagenticOrchestrator
- SPEC_02: Created tests/e2e/ with mocked tests for Simple and Advanced modes
- Docs: Updated specs to match codebase state

Files changed (8) hide show

docs/specs/SPEC_01_DEMO_TERMINATION.md +1 -1
pyproject.toml +1 -0
src/orchestrator_magentic.py +28 -10
src/utils/models.py +2 -0
tests/e2e/conftest.py +60 -0
tests/e2e/test_advanced_mode.py +70 -0
tests/e2e/test_simple_mode.py +65 -0
tests/unit/test_magentic_termination.py +35 -0

docs/specs/SPEC_01_DEMO_TERMINATION.md CHANGED Viewed

@@ -16,7 +16,7 @@ Advanced (Magentic) mode runs indefinitely from user perspective. The demo was m
 ### Question 1: Does max_round_count actually work?
 ```python
-# Current code (src/orchestrator_magentic.py:110)
 .with_standard_manager(
     chat_client=manager_client,
     max_round_count=self._max_rounds,  # Default: 10

 ### Question 1: Does max_round_count actually work?
 ```python
+# Current code (src/orchestrator_magentic.py:94)
 .with_standard_manager(
     chat_client=manager_client,
     max_round_count=self._max_rounds,  # Default: 10

pyproject.toml CHANGED Viewed

@@ -129,6 +129,7 @@ markers = [
     "unit: Unit tests (mocked)",
     "integration: Integration tests (real APIs)",
     "slow: Slow tests",
 ]
 # Filter warnings from unittest.mock introspecting Pydantic models.
 # This is a known upstream issue: https://github.com/pydantic/pydantic/issues/9927

     "unit: Unit tests (mocked)",
     "integration: Integration tests (real APIs)",
     "slow: Slow tests",
+    "e2e: End-to-End tests (full pipeline)",
 ]
 # Filter warnings from unittest.mock introspecting Pydantic models.
 # This is a known upstream issue: https://github.com/pydantic/pydantic/issues/9927

src/orchestrator_magentic.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """Magentic-based orchestrator using ChatAgent pattern."""
 from collections.abc import AsyncGenerator
 from typing import TYPE_CHECKING, Any
@@ -169,18 +170,26 @@ The final output should be a structured research report."""
         iteration = 0
         final_event_received = False
         try:
-            async for event in workflow.run_stream(task):
-                agent_event = self._process_event(event, iteration)
-                if agent_event:
-                    if isinstance(event, MagenticAgentMessageEvent):
-                        iteration += 1
-                    if agent_event.type == "complete":
-                        final_event_received = True
-                    yield agent_event
             # GUARANTEE: Always emit termination event if stream ends without one
             # (e.g., max rounds reached)
@@ -200,6 +209,15 @@ The final output should be a structured research report."""
                     iteration=iteration,
                 )
         except Exception as e:
             logger.error("Magentic workflow failed", error=str(e))
             yield AgentEvent(

 """Magentic-based orchestrator using ChatAgent pattern."""
+import asyncio
 from collections.abc import AsyncGenerator
 from typing import TYPE_CHECKING, Any
         iteration = 0
         final_event_received = False
+        demo_timeout_seconds = 300  # 5 minutes max
         try:
+            async with asyncio.timeout(demo_timeout_seconds):
+                async for event in workflow.run_stream(task):
+                    agent_event = self._process_event(event, iteration)
+                    if agent_event:
+                        if isinstance(event, MagenticAgentMessageEvent):
+                            iteration += 1
+                            # Yield progress update before the agent action
+                            yield AgentEvent(
+                                type="progress",
+                                message=f"Round {iteration}/{self._max_rounds}...",
+                                iteration=iteration,
+                            )
+                        if agent_event.type == "complete":
+                            final_event_received = True
+                        yield agent_event
             # GUARANTEE: Always emit termination event if stream ends without one
             # (e.g., max rounds reached)
                     iteration=iteration,
                 )
+        except TimeoutError:
+            logger.warning("Workflow timed out", iterations=iteration)
+            yield AgentEvent(
+                type="complete",
+                message="Research timed out. Synthesizing available evidence...",
+                data={"reason": "timeout", "iterations": iteration},
+                iteration=iteration,
+            )
         except Exception as e:
             logger.error("Magentic workflow failed", error=str(e))
             yield AgentEvent(

src/utils/models.py CHANGED Viewed

@@ -119,6 +119,7 @@ class AgentEvent(BaseModel):
         "hypothesizing",
         "analyzing",  # NEW for Phase 13
         "analysis_complete",  # NEW for Phase 13
     ]
     message: str
     data: Any = None
@@ -142,6 +143,7 @@ class AgentEvent(BaseModel):
             "hypothesizing": "🔬",  # NEW
             "analyzing": "📊",  # NEW
             "analysis_complete": "📈",  # NEW
         }
         icon = icons.get(self.type, "•")
         return f"{icon} **{self.type.upper()}**: {self.message}"

         "hypothesizing",
         "analyzing",  # NEW for Phase 13
         "analysis_complete",  # NEW for Phase 13
+        "progress",  # NEW for SPEC_01
     ]
     message: str
     data: Any = None
             "hypothesizing": "🔬",  # NEW
             "analyzing": "📊",  # NEW
             "analysis_complete": "📈",  # NEW
+            "progress": "⏱️",  # NEW
         }
         icon = icons.get(self.type, "•")
         return f"{icon} **{self.type.upper()}**: {self.message}"

tests/e2e/conftest.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from unittest.mock import MagicMock
+import pytest
+from src.utils.models import AssessmentDetails, Citation, Evidence, JudgeAssessment, SearchResult
+@pytest.fixture
+def mock_search_handler():
+    """Return a mock search handler that returns fake evidence."""
+    mock = MagicMock()
+    async def mock_execute(query, max_results=10):
+        return SearchResult(
+            query=query,
+            evidence=[
+                Evidence(
+                    content=f"Evidence content for {query}",
+                    citation=Citation(
+                        source="pubmed",
+                        title=f"Study on {query}",
+                        url="https://pubmed.example.com/123",
+                        date="2025-01-01",
+                        authors=["Doe J"],
+                    ),
+                )
+            ],
+            sources_searched=["pubmed"],
+            total_found=1,
+            errors=[],
+        )
+    mock.execute = mock_execute
+    return mock
+@pytest.fixture
+def mock_judge_handler():
+    """Return a mock judge that always says 'synthesize'."""
+    mock = MagicMock()
+    async def mock_assess(question, evidence):
+        return JudgeAssessment(
+            sufficient=True,
+            confidence=0.9,
+            recommendation="synthesize",
+            details=AssessmentDetails(
+                mechanism_score=8,
+                mechanism_reasoning="Strong mechanism found in mock data",
+                clinical_evidence_score=7,
+                clinical_reasoning="Good clinical evidence in mock data",
+                drug_candidates=["MockDrug A"],
+                key_findings=["Finding 1", "Finding 2"],
+            ),
+            reasoning="Evidence is sufficient for synthesis.",
+            next_search_queries=[],
+        )
+    mock.assess = mock_assess
+    return mock

tests/e2e/test_advanced_mode.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from unittest.mock import MagicMock, patch
+import pytest
+# Skip entire module if agent_framework is not installed
+agent_framework = pytest.importorskip("agent_framework")
+from agent_framework import MagenticAgentMessageEvent, MagenticFinalResultEvent
+from src.orchestrator_magentic import MagenticOrchestrator
+class MockChatMessage:
+    def __init__(self, content):
+        self.content = content
+    @property
+    def text(self):
+        return self.content
+@pytest.mark.asyncio
+@pytest.mark.e2e
+async def test_advanced_mode_completes_mocked():
+    """Verify Advanced mode runs without crashing (mocked workflow)."""
+    # Initialize orchestrator (mocking requirements check)
+    with patch("src.orchestrator_magentic.check_magentic_requirements"):
+        orchestrator = MagenticOrchestrator(max_rounds=5)
+    # Mock the workflow
+    mock_workflow = MagicMock()
+    # Create fake events
+    # 1. Search Agent runs
+    mock_msg_1 = MockChatMessage("Found 5 papers on PubMed")
+    event1 = MagenticAgentMessageEvent(agent_id="SearchAgent", message=mock_msg_1)
+    # 2. Report Agent finishes
+    mock_result_msg = MockChatMessage("# Final Report\n\nFindings...")
+    event2 = MagenticFinalResultEvent(message=mock_result_msg)
+    async def mock_stream(task):
+        yield event1
+        yield event2
+    mock_workflow.run_stream = mock_stream
+    # Patch dependencies:
+    # _build_workflow: Returns our mock
+    # init_magentic_state: Avoids DB calls
+    # _init_embedding_service: Avoids loading embeddings
+    with (
+        patch.object(orchestrator, "_build_workflow", return_value=mock_workflow),
+        patch("src.orchestrator_magentic.init_magentic_state"),
+        patch.object(orchestrator, "_init_embedding_service", return_value=None),
+    ):
+        events = []
+        async for event in orchestrator.run("test query"):
+            events.append(event)
+        # Check events
+        types = [e.type for e in events]
+        assert "started" in types
+        assert "thinking" in types
+        assert "search_complete" in types  # Mapped from SearchAgent
+        assert "progress" in types  # Added in SPEC_01
+        assert "complete" in types
+        complete_event = next(e for e in events if e.type == "complete")
+        assert "Final Report" in complete_event.message

tests/e2e/test_simple_mode.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import pytest
+from src.orchestrator import Orchestrator
+from src.utils.models import OrchestratorConfig
+@pytest.mark.asyncio
+@pytest.mark.e2e
+async def test_simple_mode_completes(mock_search_handler, mock_judge_handler):
+    """Verify Simple mode runs without crashing using mocks."""
+    config = OrchestratorConfig(max_iterations=2)
+    orchestrator = Orchestrator(
+        search_handler=mock_search_handler,
+        judge_handler=mock_judge_handler,
+        config=config,
+        enable_analysis=False,
+        enable_embeddings=False,
+    )
+    events = []
+    async for event in orchestrator.run("test query"):
+        events.append(event)
+    # Must complete
+    assert any(e.type == "complete" for e in events), "Did not receive complete event"
+    # Must not error
+    assert not any(e.type == "error" for e in events), "Received error event"
+    # Check structure of complete event
+    complete_event = next(e for e in events if e.type == "complete")
+    # The mock judge returns "MockDrug A" and "Finding 1", ensuring synthesis happens
+    assert "MockDrug A" in complete_event.message
+    assert "Finding 1" in complete_event.message
+@pytest.mark.asyncio
+@pytest.mark.e2e
+async def test_simple_mode_structure_validation(mock_search_handler, mock_judge_handler):
+    """Verify output contains expected structure (citations, headings)."""
+    config = OrchestratorConfig(max_iterations=2)
+    orchestrator = Orchestrator(
+        search_handler=mock_search_handler,
+        judge_handler=mock_judge_handler,
+        config=config,
+        enable_analysis=False,
+        enable_embeddings=False,
+    )
+    events = []
+    async for event in orchestrator.run("test query"):
+        events.append(event)
+    complete_event = next(e for e in events if e.type == "complete")
+    report = complete_event.message
+    # Check markdown structure
+    assert "## Drug Repurposing Analysis" in report
+    assert "### Citations" in report
+    assert "### Key Findings" in report
+    # Check for citations
+    assert "Study on test query" in report
+    assert "https://pubmed.example.com/123" in report

tests/unit/test_magentic_termination.py CHANGED Viewed

@@ -109,3 +109,38 @@ async def test_no_double_termination_event(mock_magentic_requirements):
             # Verify we didn't get a SECOND "Max iterations reached" event
             fallback_events = [e for e in events if "Max iterations reached" in e.message]
             assert len(fallback_events) == 0

             # Verify we didn't get a SECOND "Max iterations reached" event
             fallback_events = [e for e in events if "Max iterations reached" in e.message]
             assert len(fallback_events) == 0
+@pytest.mark.asyncio
+async def test_termination_on_timeout(mock_magentic_requirements):
+    """
+    Verify that a termination event is emitted when the workflow times out.
+    """
+    orchestrator = MagenticOrchestrator()
+    mock_workflow = MagicMock()
+    # Simulate a stream that times out (raises TimeoutError)
+    async def mock_stream_raises(task):
+        # Yield one event before timing out
+        yield MagenticAgentMessageEvent(
+            agent_id="SearchAgent", message=MockChatMessage("Working...")
+        )
+        raise TimeoutError()
+    mock_workflow.run_stream = mock_stream_raises
+    with patch.object(orchestrator, "_build_workflow", return_value=mock_workflow):
+        events = []
+        async for event in orchestrator.run("Research query"):
+            events.append(event)
+        # Check for progress/normal events
+        assert any("Working..." in e.message for e in events)
+        # Check for timeout completion
+        completion_events = [e for e in events if e.type == "complete"]
+        assert len(completion_events) > 0
+        last_event = completion_events[-1]
+        assert "timed out" in last_event.message
+        assert last_event.data.get("reason") == "timeout"