Merge pull request #125 from The-Obstacle-Is-The-Way/feat/smoke-tests
Browse files- Makefile +15 -5
- SPEC_ARCHITECTURAL_DEBT.md +36 -12
- tests/e2e/test_smoke.py +65 -0
Makefile
CHANGED
|
@@ -28,9 +28,19 @@ format:
|
|
| 28 |
typecheck:
|
| 29 |
uv run mypy src
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
clean:
|
| 35 |
-
rm -rf .pytest_cache .mypy_cache .ruff_cache __pycache__ .coverage htmlcov
|
| 36 |
-
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
|
|
|
| 28 |
typecheck:
|
| 29 |
uv run mypy src
|
| 30 |
|
| 31 |
+
# Run all checks (lint, typecheck, test)
|
| 32 |
+
check: lint typecheck test
|
| 33 |
+
|
| 34 |
+
# Smoke tests - run against real APIs (slow, not for CI)
|
| 35 |
+
smoke-free:
|
| 36 |
+
@echo "Running Free Tier smoke test..."
|
| 37 |
+
uv run python -m pytest tests/e2e/test_smoke.py::test_free_tier_synthesis -v -s
|
| 38 |
+
|
| 39 |
+
smoke-paid:
|
| 40 |
+
@echo "Running Paid Tier smoke test (requires OPENAI_API_KEY)..."
|
| 41 |
+
uv run python -m pytest tests/e2e/test_smoke.py::test_paid_tier_synthesis -v -s
|
| 42 |
+
|
| 43 |
+
smoke: smoke-free # Default to free tier
|
| 44 |
+
|
| 45 |
+
# Clean up cache and artifacts
|
| 46 |
|
|
|
|
|
|
|
|
|
SPEC_ARCHITECTURAL_DEBT.md
CHANGED
|
@@ -230,6 +230,10 @@ class WorkflowState:
|
|
| 230 |
|
| 231 |
**CRITICAL**: Each phase MUST pass smoke tests before merge. Unit tests alone are insufficient.
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
### Smoke Test Infrastructure
|
| 234 |
|
| 235 |
Add to `Makefile`:
|
|
@@ -339,17 +343,37 @@ Implement **Priority 1, 2, and 3** before merging PR #124.
|
|
| 339 |
### Phase 2: Future PRs (Separate Tickets)
|
| 340 |
Create GitHub issues for Priority 4-8. Do NOT bloat the current bug fix PR.
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
---
|
| 343 |
|
| 344 |
-
## Appendix: Line Number Reference
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
|
| 354 |
-
|
| 355 |
-
| `
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
**CRITICAL**: Each phase MUST pass smoke tests before merge. Unit tests alone are insufficient.
|
| 232 |
|
| 233 |
+
> **Implementation Status**: IMPLEMENTED (PR #125)
|
| 234 |
+
> Smoke tests are now live in `tests/e2e/test_smoke.py` with Makefile targets.
|
| 235 |
+
> Run `make smoke-free` or `make smoke-paid` before any refactoring PR.
|
| 236 |
+
|
| 237 |
### Smoke Test Infrastructure
|
| 238 |
|
| 239 |
Add to `Makefile`:
|
|
|
|
| 343 |
### Phase 2: Future PRs (Separate Tickets)
|
| 344 |
Create GitHub issues for Priority 4-8. Do NOT bloat the current bug fix PR.
|
| 345 |
|
| 346 |
+
**IMPORTANT**: Before starting ANY Priority 4-7 refactors, FIRST implement Priority 8 (Smoke Tests).
|
| 347 |
+
This ensures we can detect regressions from refactoring. Sequence:
|
| 348 |
+
|
| 349 |
+
1. **PR: Smoke Test Infrastructure** (Priority 8) - MUST BE FIRST
|
| 350 |
+
- Create `tests/e2e/test_smoke.py`
|
| 351 |
+
- Add `make smoke-free` and `make smoke-paid` to Makefile
|
| 352 |
+
- Verify both Free Tier and Paid Tier produce synthesis
|
| 353 |
+
|
| 354 |
+
2. **PR: Dead Config Cleanup** (Priority 4)
|
| 355 |
+
3. **PR: Prompt Unification** (Priority 5)
|
| 356 |
+
4. **PR: Factory Registry Pattern** (Priority 6)
|
| 357 |
+
5. **PR: WorkflowState Dataclass** (Priority 7)
|
| 358 |
+
|
| 359 |
---
|
| 360 |
|
| 361 |
+
## Appendix: Line Number Reference (Historical)
|
| 362 |
+
|
| 363 |
+
> **Note**: These line numbers were from BEFORE Phase 1 refactoring.
|
| 364 |
+
> After PR #124 merge, the following methods were consolidated:
|
| 365 |
+
> - `_handle_timeout()` → DELETED (merged into `_synthesize_fallback`)
|
| 366 |
+
> - `_force_synthesis()` → DELETED (merged into `_synthesize_fallback`)
|
| 367 |
+
> - Redundant imports → REMOVED (centralized at module level)
|
| 368 |
+
> - Magic strings → REPLACED with constants
|
| 369 |
+
|
| 370 |
+
| Item (Pre-Refactor) | Original Location | Status |
|
| 371 |
+
|---------------------|-------------------|--------|
|
| 372 |
+
| `_handle_timeout()` | Lines 201-248 | DELETED |
|
| 373 |
+
| `_force_synthesis()` | Lines 250-297 | DELETED |
|
| 374 |
+
| Redundant imports (timeout) | Lines 207-208 | REMOVED |
|
| 375 |
+
| Redundant imports (force) | Lines 257-258 | REMOVED |
|
| 376 |
+
| Magic string detection | Line 385 | REFACTORED |
|
| 377 |
+
| `_get_event_type_for_agent()` | Lines 582-602 | REFACTORED |
|
| 378 |
+
| Module imports | Lines 18-48 | UPDATED |
|
| 379 |
+
| `run()` method | Lines 299-456 | UPDATED |
|
tests/e2e/test_smoke.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smoke tests for regression prevention.
|
| 3 |
+
|
| 4 |
+
These tests run against REAL APIs and verify end-to-end functionality.
|
| 5 |
+
They are slow (2-5 minutes) and should NOT run in CI.
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
make smoke-free # Test Free Tier (HuggingFace)
|
| 9 |
+
make smoke-paid # Test Paid Tier (OpenAI BYOK)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
from src.orchestrators.advanced import AdvancedOrchestrator
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@pytest.mark.e2e
|
| 20 |
+
@pytest.mark.timeout(600) # 10 minute timeout for Free Tier
|
| 21 |
+
async def test_free_tier_synthesis():
|
| 22 |
+
"""Verify Free Tier produces actual synthesis (not just 'Research complete.')"""
|
| 23 |
+
# Use a simple query that is likely to yield results quickly
|
| 24 |
+
orch = AdvancedOrchestrator(max_rounds=2)
|
| 25 |
+
|
| 26 |
+
events = []
|
| 27 |
+
print("\nRunning Free Tier Smoke Test...")
|
| 28 |
+
async for event in orch.run("What is libido?"):
|
| 29 |
+
if event.type == "complete":
|
| 30 |
+
events.append(event)
|
| 31 |
+
print(f"Received complete event: {event.message[:50]}...")
|
| 32 |
+
|
| 33 |
+
# MUST have a complete event
|
| 34 |
+
assert len(events) >= 1, "No complete event received"
|
| 35 |
+
|
| 36 |
+
# Complete event MUST have substantive content (not just signal)
|
| 37 |
+
final = events[-1]
|
| 38 |
+
|
| 39 |
+
# P2 Bug Regression Check: Ensure content isn't just "Research complete."
|
| 40 |
+
assert len(final.message) > 100, f"Synthesis too short: {len(final.message)} chars"
|
| 41 |
+
|
| 42 |
+
# P1 Bug Regression Check: Ensure we got actual text
|
| 43 |
+
assert "Research complete." not in final.message or len(final.message) > 50, (
|
| 44 |
+
"Got empty synthesis signal instead of actual report"
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@pytest.mark.e2e
|
| 49 |
+
@pytest.mark.timeout(300) # 5 minute timeout for Paid Tier
|
| 50 |
+
async def test_paid_tier_synthesis():
|
| 51 |
+
"""Verify Paid Tier (BYOK) produces synthesis."""
|
| 52 |
+
api_key = os.environ.get("OPENAI_API_KEY")
|
| 53 |
+
if not api_key:
|
| 54 |
+
pytest.skip("OPENAI_API_KEY not set")
|
| 55 |
+
|
| 56 |
+
orch = AdvancedOrchestrator(max_rounds=2, api_key=api_key)
|
| 57 |
+
|
| 58 |
+
events = []
|
| 59 |
+
print("\nRunning Paid Tier Smoke Test...")
|
| 60 |
+
async for event in orch.run("What is libido?"):
|
| 61 |
+
if event.type == "complete":
|
| 62 |
+
events.append(event)
|
| 63 |
+
|
| 64 |
+
assert len(events) >= 1, "No complete event received"
|
| 65 |
+
assert len(events[-1].message) > 100, "Synthesis too short"
|