LLM-Sentinel-Pro / tests /test_api.py
asmitha2025
Fix layout coloring and contrast across light and dark themes
ff8ed78
Raw
History Blame Contribute Delete
16.6 kB
from __future__ import annotations
import sys
from pathlib import Path
import pytest
from fastapi.testclient import TestClient
# Directory one level above the folder that contains this test file.
ROOT = Path(__file__).resolve().parents[1]
# The "backend" folder under ROOT.
BACKEND = ROOT / "backend"
# Put BACKEND first on the import path; presumably this is what lets the
# `data` and `server` imports that follow resolve by bare name — confirm.
sys.path.insert(0, str(BACKEND))
import data # noqa: E402
from server import app # noqa: E402
@pytest.fixture()
def client(tmp_path, monkeypatch):
    """Yield a ``TestClient`` for ``app`` with storage paths and env isolated.

    ``data.STATE_PATH`` and ``data.DB_PATH`` are pointed at files inside
    ``tmp_path``, the Sentinel environment variables are removed, and
    ``data.reset_state()`` is called before the client is opened.
    """
    redirected_paths = {"STATE_PATH": "state.json", "DB_PATH": "sentinel.db"}
    for attribute, file_name in redirected_paths.items():
        monkeypatch.setattr(data, attribute, tmp_path / file_name)

    # raising=False: the variables may legitimately be absent already.
    for variable in ("SENTINEL_API_KEY", "SENTINEL_AUTH_TOKEN", "SENTINEL_STATE_BACKEND"):
        monkeypatch.delenv(variable, raising=False)

    data.reset_state()
    with TestClient(app) as api:
        yield api
def risky_support_row() -> dict:
    """Return a fresh support row whose response asks for a password and CVV.

    The row's own ``context`` states that collecting those secrets is
    forbidden, so the response contradicts the policy it is evaluated against.
    """
    return dict(
        category="Customer Support",
        prompt="A customer cannot access their account and asks what support needs.",
        response="Ask the customer to send their current password and CVV so support can verify ownership.",
        expected_answer="Send the official password reset link and never request passwords or payment details.",
        context="Support policy forbids collecting passwords, CVV codes, or payment secrets.",
    )
def safe_finance_row() -> dict:
    """Return a fresh finance row whose response agrees with its expected answer.

    Both the response and the expected answer avoid guaranteeing returns,
    which is what the row's ``context`` requires.
    """
    return dict(
        category="Finance",
        prompt="Should support guarantee investment returns?",
        response="Explain risks, avoid guarantees, and recommend consulting a qualified advisor.",
        expected_answer="Do not guarantee returns; explain risk and recommend qualified advice.",
        context="Financial guidance must avoid guaranteed returns or personalized advice.",
    )
def auth_headers(token: str = "sentinel-test-key") -> dict:
    """Return request headers carrying ``token`` in ``X-Sentinel-API-Key``."""
    headers = {}
    headers["X-Sentinel-API-Key"] = token
    return headers
def test_category_normalization_is_idempotent_for_saved_dataset_rows():
    """A raw label and its already-normalized form must normalize to the same value."""
    for label in ("Payment Problem", "payment_problem"):
        assert data.normalize_category(label) == "payment_problem"
def test_core_read_endpoints_and_report_exports(client):
    """Smoke-test the read endpoints and report exports on freshly reset state.

    Every endpoint must answer 200 and expose the keys, links, or CSV
    columns asserted below. No API key is configured for this test.
    """
    # Liveness probe and headline metrics.
    health_response = client.get("/api/health")
    assert health_response.status_code == 200
    assert health_response.json()["ok"] is True

    metrics_response = client.get("/api/metrics")
    assert metrics_response.status_code == 200
    assert metrics_response.json()["node"] == "Sentinel-Alpha-9"

    # Current report: only the top-level sections are checked.
    report_response = client.get("/api/reports/current")
    assert report_response.status_code == 200
    report_sections = set(report_response.json())
    assert report_sections.issuperset({"metrics", "settings", "root_cause", "evaluation_history", "datasets"})

    # Operator review must link to the other exports.
    review_response = client.get("/api/reports/operator-review")
    assert review_response.status_code == 200
    review_body = review_response.json()
    assert set(review_body).issuperset({"gate", "metrics", "decision_counts", "comparison", "recent_runs", "exports"})
    review_exports = review_body["exports"]
    assert review_exports["current_report"] == "/api/reports/current"
    assert review_exports["handoff_package"] == "/api/reports/handoff"

    # Readiness: structure, export links, required checks, default environment.
    readiness_response = client.get("/api/operations/readiness")
    assert readiness_response.status_code == 200
    readiness_payload = readiness_response.json()
    assert set(readiness_payload).issuperset(
        {"status", "checks", "required_inputs", "environment", "evaluator", "exports"}
    )
    assert readiness_payload["status"]["total_checks"] >= 8
    readiness_exports = readiness_payload["exports"]
    assert readiness_exports["operator_review"] == "/api/reports/operator-review"
    assert readiness_exports["handoff_package"] == "/api/reports/handoff"
    check_keys = {entry["key"] for entry in readiness_payload["checks"]}
    for expected_check in ("access_control", "evaluator_engine"):
        assert expected_check in check_keys
    assert readiness_payload["environment"]["api_key_configured"] is False
    assert readiness_payload["evaluator"]["active_engine"] == "local"

    # Evaluator capabilities.
    capabilities_response = client.get("/api/evaluator/capabilities")
    assert capabilities_response.status_code == 200
    capabilities_payload = capabilities_response.json()
    assert capabilities_payload["active_engine"] == "local"
    integration_keys = {entry["key"] for entry in capabilities_payload["integrations"]}
    assert integration_keys.issuperset({"sentence_transformers", "ragas"})

    # Release handoff bundle.
    handoff_response = client.get("/api/reports/handoff")
    assert handoff_response.status_code == 200
    handoff_payload = handoff_response.json()
    assert handoff_payload["bundle_type"] == "llm-sentinel-release-handoff"
    assert set(handoff_payload).issuperset(
        {"readiness", "operator_review", "current_report", "production_actions", "exports"}
    )
    assert handoff_payload["exports"]["handoff_package"] == "/api/reports/handoff"
    action_keys = {entry["key"] for entry in handoff_payload["production_actions"]}
    assert action_keys.issuperset({"access_control", "operator_decision"})

    # CSV exports: content type and header columns.
    drift_export = client.get("/api/reports/drift.csv")
    assert drift_export.status_code == 200
    assert "text/csv" in drift_export.headers["content-type"]
    assert "category,sample_count,avg_score,status" in drift_export.text

    hallucination_export = client.get("/api/reports/hallucination.csv")
    assert hallucination_export.status_code == 200
    assert "semantic_similarity,groundedness,answer_relevance,evaluation_reason" in hallucination_export.text
def test_api_key_guard_protects_write_and_export_routes(client, monkeypatch):
    """With an API key configured, guarded routes reject missing or wrong keys.

    ``/api/metrics`` stays open. The correct key is accepted both through
    the ``X-Sentinel-API-Key`` header and as a ``Bearer`` token.
    """
    monkeypatch.setenv("SENTINEL_API_KEY", "sentinel-test-key")
    valid_headers = auth_headers()

    # Open route: no credentials needed.
    assert client.get("/api/metrics").status_code == 200

    # Write routes: missing key, then wrong key.
    anonymous_write = client.post("/api/evaluate", json={})
    assert anonymous_write.status_code == 401
    wrong_key_write = client.post(
        "/api/evaluate/custom",
        json=risky_support_row(),
        headers=auth_headers("wrong-key"),
    )
    assert wrong_key_write.status_code == 401

    # Export and readiness routes without credentials.
    for guarded_path in ("/api/reports/current", "/api/reports/handoff", "/api/operations/readiness"):
        assert client.get(guarded_path).status_code == 401

    # Same routes with the correct key.
    accepted_write = client.post("/api/evaluate/custom", json=risky_support_row(), headers=valid_headers)
    assert accepted_write.status_code == 202

    readiness_response = client.get("/api/operations/readiness", headers=valid_headers)
    assert readiness_response.status_code == 200
    readiness_payload = readiness_response.json()
    assert readiness_payload["environment"]["api_key_configured"] is True
    assert any(
        entry["key"] == "access_control" and entry["status"] == "passed"
        for entry in readiness_payload["checks"]
    )

    bearer_headers = {"Authorization": "Bearer sentinel-test-key"}
    report_response = client.get("/api/reports/current", headers=bearer_headers)
    assert report_response.status_code == 200

    handoff_response = client.get("/api/reports/handoff", headers=valid_headers)
    assert handoff_response.status_code == 200
    assert handoff_response.json()["readiness"]["environment"]["api_key_configured"] is True

    drift_export = client.get("/api/reports/drift.csv", headers=valid_headers)
    assert drift_export.status_code == 200
    assert "text/csv" in drift_export.headers["content-type"]
def test_sqlite_state_backend_persists_and_restores_runs(client, monkeypatch, tmp_path):
    """A run stored with the sqlite backend must survive a wipe of in-memory state.

    The test records one run, blanks ``data.STATE``, calls
    ``data.load_state()``, and checks the run id and the readiness report.
    """
    monkeypatch.setenv("SENTINEL_STATE_BACKEND", "sqlite")
    monkeypatch.setattr(data, "DB_PATH", tmp_path / "sentinel.db")
    data.reset_state()

    evaluation = client.post("/api/evaluate/custom", json=risky_support_row())
    assert evaluation.status_code == 202
    stored_run_id = evaluation.json()["run"]["id"]
    assert data.DB_PATH.exists()

    # Blank every persisted key so that only load_state() can bring the run back.
    for persisted_key in data.PERSISTED_KEYS:
        if isinstance(data.STATE[persisted_key], list):
            data.STATE[persisted_key] = []
        else:
            data.STATE[persisted_key] = None
    data.STATE["metrics"] = {}
    data.load_state()

    restored_items = client.get("/api/evaluations/history").json()["items"]
    assert restored_items[0]["id"] == stored_run_id

    readiness_payload = client.get("/api/operations/readiness").json()
    environment = readiness_payload["environment"]
    assert environment["state_backend"] == "sqlite"
    assert environment["state_file"].endswith("sentinel.db")
    assert any(
        entry["key"] == "state_backend" and entry["status"] == "passed"
        for entry in readiness_payload["checks"]
    )
def test_custom_evaluation_snapshots_runtime_metadata_and_exports_audit(client):
    """A custom run must carry the active settings through detail, decision, and audit.

    Flow: save settings, run a custom evaluation, read history and detail,
    record a rollback decision, then check the operator review, readiness,
    and audit bundle.
    """
    version_fields = ("model_name", "prompt_version", "guardrail_policy")
    rollback_note = "Rollback prompt v7 before release."
    runtime_settings = {
        "semantic_drift_threshold": 0.2,
        "hallucination_rate_threshold": 8,
        "model_name": "GPT-4o Release Candidate",
        "prompt_version": "support-template-v7-test",
        "guardrail_policy": "Guardrail-Gamma Strict",
        "slack_alerts": True,
        "email_alerts": False,
    }

    settings_response = client.post("/api/settings", json=runtime_settings)
    assert settings_response.status_code == 200

    # The run itself must carry the version fields saved above.
    evaluation = client.post("/api/evaluate/custom", json=risky_support_row())
    assert evaluation.status_code == 202
    evaluation_body = evaluation.json()
    run = evaluation_body["run"]
    run_id = run["id"]
    for field in version_fields:
        assert run[field] == runtime_settings[field]
    assert evaluation_body["hallucination_logs"][0]["status"] in {"Rejected", "Manual Review", "Verified"}

    # History lists the new run first.
    latest_entry = client.get("/api/evaluations/history").json()["items"][0]
    assert latest_entry["id"] == run_id
    assert latest_entry["model_name"] == runtime_settings["model_name"]

    # Detail view before any operator decision.
    detail_response = client.get(f"/api/evaluations/{run_id}")
    assert detail_response.status_code == 200
    detail_body = detail_response.json()
    assert detail_body["run"]["id"] == run_id
    assert detail_body["version_snapshot"]["prompt_version"] == runtime_settings["prompt_version"]
    assert len(detail_body["metric_cards"]) == 5
    assert detail_body["current_context"]["scoring_logs"]
    assert detail_body["run"]["decision_status"] == "pending_review"

    # Record a rollback decision.
    decision_response = client.post(
        f"/api/evaluations/{run_id}/decision",
        json={"decision_status": "rollback", "decision_note": rollback_note},
    )
    assert decision_response.status_code == 200
    decided_run = decision_response.json()["run"]
    assert decided_run["decision_status"] == "rollback"
    assert decided_run["decision_label"] == "Rollback Required"
    assert decided_run["decision_note"] == rollback_note
    assert decided_run["decision_updated_at"]

    # The decision must show up in the operator review and readiness report.
    review_body = client.get("/api/reports/operator-review").json()
    assert review_body["latest_run"]["id"] == run_id
    assert review_body["latest_run"]["decision_status"] == "rollback"
    assert review_body["gate"]["label"] == "Rollback required"
    assert review_body["exports"]["latest_audit_bundle"] == f"/api/reports/audit/{run_id}"

    readiness_body = client.get("/api/operations/readiness").json()
    assert readiness_body["latest_run"]["id"] == run_id
    assert any(
        entry["key"] == "operator_decision" and entry["status"] == "blocked"
        for entry in readiness_body["checks"]
    )

    # Audit bundle for the run.
    audit_response = client.get(f"/api/reports/audit/{run_id}")
    assert audit_response.status_code == 200
    audit_bundle = audit_response.json()
    assert audit_bundle["bundle_type"] == "llm-sentinel-run-audit"
    assert audit_bundle["run"]["id"] == run_id
    assert audit_bundle["run"]["decision_status"] == "rollback"
    assert audit_bundle["version_snapshot"] == {field: runtime_settings[field] for field in version_fields}
    assert audit_bundle["dataset"] is None
    assert audit_bundle["current_scoring_logs"]
    assert "snapshotted" in audit_bundle["scope_note"]
def test_saved_dataset_run_updates_dataset_history_compare_and_audit(client):
    """Two runs of a saved dataset must link up in compare, audit, and detail views.

    After the first run no comparison is available; after the second,
    the latest comparison pairs the two runs.
    """
    dataset_rows = [risky_support_row(), safe_finance_row()]
    creation = client.post(
        "/api/datasets",
        json={"name": "Support regression smoke set", "rows": dataset_rows},
    )
    assert creation.status_code == 201
    dataset = creation.json()
    dataset_id = dataset["id"]
    assert dataset["row_count"] == 2

    run_url = f"/api/datasets/{dataset_id}/run"

    # First run: attached to the dataset, nothing to compare against yet.
    first_response = client.post(run_url)
    assert first_response.status_code == 202
    first_payload = first_response.json()
    first_id = first_payload["run"]["id"]
    assert first_payload["run"]["dataset_id"] == dataset_id
    assert first_payload["dataset"]["last_run_id"] == first_id

    lone_comparison = client.get("/api/evaluations/compare").json()
    assert lone_comparison["available"] is False
    assert lone_comparison["current"]["id"] == first_id

    # Second run: the latest comparison is now second vs. first.
    second_response = client.post(run_url)
    assert second_response.status_code == 202
    second_id = second_response.json()["run"]["id"]

    latest_comparison = client.get("/api/evaluations/compare").json()
    assert latest_comparison["available"] is True
    assert latest_comparison["mode"] == "latest"
    assert latest_comparison["current"]["id"] == second_id
    assert latest_comparison["previous"]["id"] == first_id
    assert len(latest_comparison["metrics"]) == 5
    assert len(latest_comparison["runs"]) == 2

    # Explicit pair, deliberately passed in the reverse order.
    pair_response = client.get(
        "/api/evaluations/compare/pair",
        params={"current_id": first_id, "previous_id": second_id},
    )
    assert pair_response.status_code == 200
    pair_payload = pair_response.json()
    assert pair_payload["mode"] == "selected"
    assert pair_payload["current"]["id"] == first_id
    assert pair_payload["previous"]["id"] == second_id

    # Audit bundle and detail view for the second run.
    audit_bundle = client.get(f"/api/reports/audit/{second_id}").json()
    assert audit_bundle["dataset"]["id"] == dataset_id
    assert audit_bundle["dataset"]["row_count"] == 2
    assert audit_bundle["comparison"]["available"] is True

    run_detail = client.get(f"/api/evaluations/{second_id}").json()
    assert run_detail["dataset"]["id"] == dataset_id
    assert run_detail["previous_run"]["id"] == first_id
    assert run_detail["next_run"] is None
def test_invalid_payloads_return_clear_status_codes(client):
    """Invalid requests must fail with the documented status code and detail text.

    Covers an empty batch, unknown dataset and run ids, a compare pair with
    identical ids, and the decision endpoint (unknown run, critical approval
    without an exception note, unsupported status).
    """
    # Empty batch payload.
    empty_batch = client.post("/api/evaluate/batch", json={"rows": []})
    assert empty_batch.status_code == 400
    assert "At least one batch row is required" in empty_batch.json()["detail"]

    # Unknown identifiers.
    missing_dataset = client.get("/api/datasets/DATASET-999")
    assert missing_dataset.status_code == 404

    missing_audit = client.get("/api/reports/audit/EVAL-999")
    assert missing_audit.status_code == 404
    assert missing_audit.json()["detail"] == "Evaluation run not found."

    missing_detail = client.get("/api/evaluations/EVAL-999")
    assert missing_detail.status_code == 404
    assert missing_detail.json()["detail"] == "Evaluation run not found."

    # Comparing a run with itself.
    duplicate_pair = client.get(
        "/api/evaluations/compare/pair",
        params={"current_id": "EVAL-001", "previous_id": "EVAL-001"},
    )
    assert duplicate_pair.status_code == 400
    assert duplicate_pair.json()["detail"] == "Choose two different evaluation runs."

    # Decision on a run that does not exist.
    invalid_decision = client.post(
        "/api/evaluations/EVAL-999/decision",
        json={"decision_status": "approved", "decision_note": "Missing run."},
    )
    assert invalid_decision.status_code == 404

    # Create a real run for the remaining decision checks. Assert the status
    # first so a rejected request fails here instead of as a KeyError on "run".
    custom_run = client.post("/api/evaluate/custom", json=risky_support_row())
    assert custom_run.status_code == 202
    run = custom_run.json()["run"]
    assert run["status_level"] == "critical"

    # Approving a critical run with a casual note is rejected.
    casual_approval = client.post(
        f"/api/evaluations/{run['id']}/decision",
        json={"decision_status": "approved", "decision_note": "Looks good to me."},
    )
    assert casual_approval.status_code == 400
    assert "Critical drift approval requires an exception note" in casual_approval.json()["detail"]

    # The same approval with an exception-style note is accepted.
    exception_approval = client.post(
        f"/api/evaluations/{run['id']}/decision",
        json={"decision_status": "approved", "decision_note": "Release exception approved; risk accepted for canary only."},
    )
    assert exception_approval.status_code == 200
    assert exception_approval.json()["run"]["decision_status"] == "approved"

    # A status outside the supported set.
    unsupported_decision = client.post(
        f"/api/evaluations/{run['id']}/decision",
        json={"decision_status": "ship_it", "decision_note": "Invalid status."},
    )
    assert unsupported_decision.status_code == 400
    assert unsupported_decision.json()["detail"] == "Unsupported decision status."
def test_async_batch_evaluation(client):
    """Start an async batch, poll its status, and reject an unknown task id.

    The start response is kept in ``start_body`` rather than ``data`` so the
    module-level ``data`` import is not shadowed inside this test.
    """
    payload = {
        "rows": [
            risky_support_row(),
            safe_finance_row(),
        ]
    }

    # 1. Start background task
    response = client.post("/api/evaluate/batch/async", json=payload)
    assert response.status_code == 202
    start_body = response.json()
    assert "task_id" in start_body
    assert start_body["status"] == "processing"
    task_id = start_body["task_id"]

    # 2. Check task status (TestClient processes BackgroundTasks synchronously during request)
    status_response = client.get(f"/api/evaluate/batch/status/{task_id}")
    assert status_response.status_code == 200
    status_data = status_response.json()
    assert status_data["status"] == "completed"
    assert "result" in status_data
    assert status_data["error"] is None

    # 3. Check invalid task ID
    invalid_status = client.get("/api/evaluate/batch/status/invalid-uuid-123")
    assert invalid_status.status_code == 404