mbochniak01 committed on
Commit ·
10aced5
1
Parent(s): 3c949de
Add typed client library, unit + integration tests, mypy, ruff, NOTES.md
Browse files- Makefile +15 -1
- NOTES.md +86 -0
- client/__init__.py +15 -0
- client/client.py +118 -0
- client/exceptions.py +24 -0
- client/models.py +41 -0
- mypy.ini +23 -0
- pytest.ini +5 -0
- requirements.txt +14 -0
- ruff.toml +12 -0
- tests/conftest.py +23 -0
- tests/integration/__init__.py +0 -0
- tests/integration/test_api.py +95 -0
- tests/unit/__init__.py +0 -0
- tests/unit/test_client.py +120 -0
- tests/unit/test_grader.py +140 -0
- tests/unit/test_rosetta.py +67 -0
Makefile
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
.PHONY: install run
|
| 2 |
|
| 3 |
install:
|
| 4 |
pip install -r requirements.txt
|
|
@@ -6,6 +6,20 @@ install:
|
|
| 6 |
run:
|
| 7 |
cd backend && uvicorn app:app --reload --host 0.0.0.0 --port 8000
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
eval-retail:
|
| 10 |
python eval/metrics.py --domain retail
|
| 11 |
open eval/reports/report_retail.html 2>/dev/null || xdg-open eval/reports/report_retail.html
|
|
|
|
| 1 |
+
.PHONY: install run test test-integration lint type-check eval-retail eval-pharma eval
|
| 2 |
|
| 3 |
install:
|
| 4 |
pip install -r requirements.txt
|
|
|
|
| 6 |
run:
|
| 7 |
cd backend && uvicorn app:app --reload --host 0.0.0.0 --port 8000
|
| 8 |
|
| 9 |
+
# Unit tests only (default — no server required)
|
| 10 |
+
test:
|
| 11 |
+
pytest tests/unit -v
|
| 12 |
+
|
| 13 |
+
# Integration tests — requires running server (make run in another terminal)
|
| 14 |
+
test-integration:
|
| 15 |
+
pytest tests/integration -v -m integration
|
| 16 |
+
|
| 17 |
+
lint:
|
| 18 |
+
ruff check client/ backend/ tests/
|
| 19 |
+
|
| 20 |
+
type-check:
|
| 21 |
+
mypy client/
|
| 22 |
+
|
| 23 |
eval-retail:
|
| 24 |
python eval/metrics.py --domain retail
|
| 25 |
open eval/reports/report_retail.html 2>/dev/null || xdg-open eval/reports/report_retail.html
|
NOTES.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Design Notes
|
| 2 |
+
|
| 3 |
+
## Key decisions and tradeoffs
|
| 4 |
+
|
| 5 |
+
### API target: own implementation
|
| 6 |
+
Instead of wrapping a third-party fake API, the client wraps this project's own
|
| 7 |
+
FastAPI backend. This means the client and the API are co-designed — the typed
|
| 8 |
+
models on both sides stay in sync by design. The tradeoff: less realistic than
|
| 9 |
+
wrapping an external API you don't control, but the test surface is richer and
|
| 10 |
+
the integration tests verify real business logic, not just HTTP plumbing.
|
| 11 |
+
|
| 12 |
+
### Two-layer evaluation (L1 live / L2 batch)
|
| 13 |
+
L1 runs on every query inline (~1-2s overhead). L2 runs offline against a golden
|
| 14 |
+
dataset. The split is a deliberate latency/depth tradeoff: LLM-judged metrics
|
| 15 |
+
(contextual precision, reverse-question relevancy) add 30+ seconds per pair —
|
| 16 |
+
unacceptable live, fine in batch. The golden dataset is the contract; L2 is the
|
| 17 |
+
regression gate.
|
| 18 |
+
|
| 19 |
+
### Deterministic chain_terminology over LLM judge
|
| 20 |
+
The terminology check is a dict lookup, not a model call. Zero latency, zero cost,
|
| 21 |
+
zero false negatives on known mappings. The tradeoff: it only catches terms in the
|
| 22 |
+
catalog — novel terminology drift goes undetected. An LLM judge would catch drift
|
| 23 |
+
but would introduce latency and non-determinism into a metric that must be auditable.
|
| 24 |
+
|
| 25 |
+
### In-memory retrieval over vector database
|
| 26 |
+
KB size is 8-9 docs per domain. Encoding them at startup and doing cosine search
|
| 27 |
+
at query time adds ~2ms retrieval overhead with no infrastructure dependency.
|
| 28 |
+
A vector DB (Chroma, pgvector) would add operational complexity with zero
|
| 29 |
+
retrieval quality gain at this scale.
|
| 30 |
+
|
| 31 |
+
### httpx + tenacity for the client
|
| 32 |
+
`httpx` is the modern alternative to `requests`: native async support if needed
|
| 33 |
+
later, cleaner timeout API, better type annotations. `tenacity` separates retry
|
| 34 |
+
policy from request logic cleanly — the retry decorator is readable and testable
|
| 35 |
+
independently from the HTTP code.
|
| 36 |
+
|
| 37 |
+
### Integration tests are read-only by design
|
| 38 |
+
The API has no mutable state: queries don't persist, no records are created or
|
| 39 |
+
deleted. Cleanup is therefore trivially satisfied — there is nothing to clean up.
|
| 40 |
+
This is called out explicitly because it's a deliberate architectural choice, not
|
| 41 |
+
an oversight. A stateful API (task creation, deletion) would require explicit
|
| 42 |
+
teardown fixtures.
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
## What another 4 hours would add
|
| 47 |
+
|
| 48 |
+
- **`eval/metrics.py` — L2 LLM metrics**: contextual precision (chunk ranking),
|
| 49 |
+
contextual recall (coverage), and answer correctness against full reference answers.
|
| 50 |
+
Currently only keyphrase coverage is used as a proxy.
|
| 51 |
+
- **Async client**: `httpx.AsyncClient` variant for high-concurrency load testing.
|
| 52 |
+
- **Property-based tests**: `hypothesis` to fuzz `check_terminology` and graders
|
| 53 |
+
with generated strings — catches edge cases the golden dataset doesn't cover.
|
| 54 |
+
- **CI pipeline**: GitHub Actions running `make lint`, `make type-check`,
|
| 55 |
+
`make test` on every PR. Integration tests gated on a self-hosted runner with
|
| 56 |
+
the API running.
|
| 57 |
+
- **Threshold calibration report**: plot the distribution of L1 metric scores
|
| 58 |
+
across the golden dataset to verify that current thresholds aren't too tight
|
| 59 |
+
or too loose.
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## Where LLM assistance helped and where it misled
|
| 64 |
+
|
| 65 |
+
**Helped:**
|
| 66 |
+
- Scaffolding the full project structure (backend, client, tests, config) in a
|
| 67 |
+
single session without losing consistency across files.
|
| 68 |
+
- Writing the faithfulness prompt in a way that reliably returns structured JSON —
|
| 69 |
+
the few-shot JSON format in the prompt was a suggested pattern that works.
|
| 70 |
+
- Catching that `except Exception` in the faithfulness grader was too broad and
|
| 71 |
+
replacing it with `(json.JSONDecodeError, anthropic.APIError)`.
|
| 72 |
+
- Identifying that `_build_index_by_domain` was defined twice in pipeline.py
|
| 73 |
+
(duplicate introduced during an edit session) — caught during code review.
|
| 74 |
+
|
| 75 |
+
**Misled or required correction:**
|
| 76 |
+
- Initially used `lru_cache` on a function that takes a `SentenceTransformer`
|
| 77 |
+
instance as an argument — unhashable, so the cache silently failed. Required
|
| 78 |
+
switching to a module-level dict cache.
|
| 79 |
+
- Generated a dead loop in `rosetta.py` (iterating over terms with `continue`
|
| 80 |
+
but no code after the continue branch) that did nothing. The logic existed in
|
| 81 |
+
a comment describing intent but was never implemented. Caught in review.
|
| 82 |
+
- The first README used `sdk: gradio` in the HuggingFace frontmatter — the Space
|
| 83 |
+
was created as Gradio before switching to Docker. LLM-generated config matched
|
| 84 |
+
the target architecture but not the actual HF Space state.
|
| 85 |
+
- Suggested a supposedly fictional client name that turned out to be a real US company.
|
| 86 |
+
Required renaming to `ShelfWise`.
|
client/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from client.client import ValidatorClient
|
| 2 |
+
from client.exceptions import APIError, RetryExhaustedError, TimeoutError, ValidatorError
|
| 3 |
+
from client.models import ConfigResponse, MetricResult, QueryResponse, Source
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
"ValidatorClient",
|
| 7 |
+
"ValidatorError",
|
| 8 |
+
"APIError",
|
| 9 |
+
"TimeoutError",
|
| 10 |
+
"RetryExhaustedError",
|
| 11 |
+
"ConfigResponse",
|
| 12 |
+
"QueryResponse",
|
| 13 |
+
"MetricResult",
|
| 14 |
+
"Source",
|
| 15 |
+
]
|
client/client.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ValidatorClient — typed HTTP client for the AI Response Validator API.
|
| 3 |
+
|
| 4 |
+
Retry policy: exponential backoff on HTTP 500/502/503/504 and network errors, up to max_retries attempts; timeouts are not retried.
|
| 5 |
+
Timeouts: one value applied to each httpx timeout phase (connect, read, write, pool), configurable per instance.
|
| 6 |
+
Auth: optional Bearer token forwarded as Authorization header (for future use).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import httpx
from tenacity import (
    RetryError,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from client.exceptions import APIError, RetryExhaustedError, TimeoutError, ValidatorError
from client.models import ConfigResponse, QueryRequest, QueryResponse
|
| 20 |
+
|
| 21 |
+
DEFAULT_TIMEOUT = 30.0
|
| 22 |
+
DEFAULT_MAX_RETRIES = 3
|
| 23 |
+
_RETRY_STATUS_CODES = {500, 502, 503, 504}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ValidatorClient:
    """Typed client for the AI Response Validator API.

    Wraps an ``httpx.Client`` and maps transport and HTTP failures onto the
    exceptions in ``client.exceptions``. Usable as a context manager, in
    which case the underlying ``httpx.Client`` is closed on exit.
    """

    def __init__(
        self,
        base_url: str,
        timeout: float = DEFAULT_TIMEOUT,
        max_retries: int = DEFAULT_MAX_RETRIES,
        api_key: str | None = None,
    ) -> None:
        """Create a client bound to ``base_url``.

        Args:
            base_url: Root URL of the API; trailing slashes are stripped.
            timeout: Timeout in seconds handed to ``httpx.Client``.
            max_retries: Total number of attempts made for a request that
                keeps failing with a transient error.
            api_key: Optional token, sent as ``Authorization: Bearer <key>``.
        """
        headers = {"Accept": "application/json"}
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        self._client = httpx.Client(
            base_url=base_url.rstrip("/"),
            timeout=timeout,
            headers=headers,
        )
        self._max_retries = max_retries

    def _request(self, method: str, path: str, **kwargs: object) -> httpx.Response:
        """Execute an HTTP request, retrying transient failures with backoff.

        Args:
            method: HTTP method name.
            path: Path relative to the client's base URL.
            **kwargs: Forwarded unchanged to ``httpx.Client.request``.

        Returns:
            The response, guaranteed not to carry an error status.

        Raises:
            TimeoutError: httpx reported a timeout (raised immediately, not retried).
            APIError: The API answered with a non-retryable error status.
            RetryExhaustedError: Every attempt ended in a transient failure.
        """

        @retry(
            retry=retry_if_exception_type(_TransientError),
            stop=stop_after_attempt(self._max_retries),
            wait=wait_exponential(multiplier=0.5, min=0.5, max=10),
            reraise=False,
        )
        def _attempt() -> httpx.Response:
            try:
                response = self._client.request(method, path, **kwargs)  # type: ignore[arg-type]
            except httpx.TimeoutException as exc:
                raise TimeoutError(str(exc)) from exc
            except httpx.NetworkError as exc:
                raise _TransientError(str(exc)) from exc

            # Retryable statuses are turned into the internal marker so the
            # decorator above schedules another attempt.
            if response.status_code in _RETRY_STATUS_CODES:
                raise _TransientError(f"HTTP {response.status_code}")

            if response.is_error:
                detail = _extract_detail(response)
                raise APIError(response.status_code, detail)

            return response

        try:
            return _attempt()
        except RetryError as exc:
            last = exc.last_attempt.exception()
            if not isinstance(last, Exception):
                # Defensive: keeps RetryExhaustedError's ``last_error`` an
                # Exception even if the attempt carries none.
                last = exc
            raise RetryExhaustedError(self._max_retries, last) from exc

    def get_config(self) -> ConfigResponse:
        """Return domain and client configuration (unauthenticated)."""
        response = self._request("GET", "/config")
        return ConfigResponse.model_validate(response.json())

    def query(self, question: str, client_id: str) -> QueryResponse:
        """Submit a question for a specific client and return a graded response."""
        payload = QueryRequest(query=question, client=client_id)
        response = self._request(
            "POST",
            "/query",
            json=payload.model_dump(),
        )
        return QueryResponse.model_validate(response.json())

    def health(self) -> bool:
        """Return True if the API is reachable and reports ``status == "ok"``.

        Client-level failures (timeout, API error, exhausted retries) and a
        body that is not a JSON object are reported as ``False`` rather than
        raised.
        """
        try:
            response = self._request("GET", "/health")
        except ValidatorError:
            return False
        try:
            body = response.json()
        except ValueError:
            # Body is not valid JSON; treat the service as unhealthy.
            return False
        return isinstance(body, dict) and body.get("status") == "ok"

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._client.close()

    def __enter__(self) -> "ValidatorClient":
        return self

    def __exit__(self, *_: object) -> None:
        self.close()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class _TransientError(Exception):
    """Private marker exception: raising it asks the retry policy for another attempt."""
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _extract_detail(response: httpx.Response) -> str:
    """Return a human-readable error detail for ``response``.

    Prefers the ``detail`` field of a JSON object body. Falls back to the raw
    response text when the body is not JSON, is not a JSON object, or has no
    ``detail`` key.
    """
    try:
        body = response.json()
    except ValueError:
        # Body is not valid JSON.
        return response.text
    if isinstance(body, dict):
        return str(body.get("detail", response.text))
    return response.text
|
client/exceptions.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class ValidatorError(Exception):
    """Root of the client's exception hierarchy; every client error derives from it."""
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class APIError(ValidatorError):
    """Raised when the API answers with an HTTP error status (4xx / 5xx).

    Attributes:
        status_code: HTTP status code of the failing response.
        detail: Error detail text associated with the response.
    """

    def __init__(self, status_code: int, detail: str) -> None:
        message = f"HTTP {status_code}: {detail}"
        super().__init__(message)
        self.status_code = status_code
        self.detail = detail
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TimeoutError(ValidatorError):
    """Raised when a request does not complete within the configured timeout.

    NOTE: the name shadows the builtin ``TimeoutError`` in any module that
    imports this class by name.
    """
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class RetryExhaustedError(ValidatorError):
    """Raised when every retry attempt has failed.

    Attributes:
        attempts: Number of attempts reported by the caller.
        last_error: Exception supplied by the caller as the final failure.
    """

    def __init__(self, attempts: int, last_error: Exception) -> None:
        summary = f"Failed after {attempts} attempts: {last_error}"
        super().__init__(summary)
        self.attempts = attempts
        self.last_error = last_error
|
client/models.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class QueryRequest(BaseModel):
    """Request payload carrying a question and the client it is asked for."""

    # Free-text question.
    query: str
    # Identifier of the client the question is scoped to.
    client: str
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class MetricResult(BaseModel):
    """Outcome of one evaluation metric."""

    # Whether the metric passed.
    passed: bool
    # Validated to lie within [0.0, 1.0].
    score: float = Field(ge=0.0, le=1.0)
    # Explanation text accompanying the result.
    detail: str
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EvaluationResult(BaseModel):
    """Aggregate evaluation of one answer."""

    # Overall verdict for the answer.
    overall_pass: bool
    # Per-metric results keyed by metric name.
    metrics: dict[str, MetricResult]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class Source(BaseModel):
    """A source document returned alongside an answer."""

    # Document identifier.
    id: str
    # Document title.
    title: str
    # Validated to lie within [0.0, 1.0].
    # NOTE(review): if this is a raw cosine similarity it could fall outside
    # that range and fail validation; confirm the backend clamps it.
    score: float = Field(ge=0.0, le=1.0)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class QueryResponse(BaseModel):
    """Response body for a query: the answer, its sources and its evaluation."""

    # Question as returned by the API.
    query: str
    # Client identifier.
    client: str
    # Display name of the client.
    client_display: str
    # Generated answer text.
    answer: str
    # Source documents returned with the answer.
    sources: list[Source]
    # Evaluation of the answer.
    evaluation: EvaluationResult
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class ClientInfo(BaseModel):
    """Identifier and display name of one client."""

    # Client identifier.
    id: str
    # Display name of the client.
    display: str
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class ConfigResponse(BaseModel):
    """Configuration payload listing the clients available in each domain."""

    # Clients grouped by domain name.
    domains: dict[str, list[ClientInfo]]
|
mypy.ini
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[mypy]
|
| 2 |
+
python_version = 3.11
|
| 3 |
+
strict = True
|
| 4 |
+
warn_return_any = True
|
| 5 |
+
warn_unused_ignores = True
|
| 6 |
+
|
| 7 |
+
# Client library — strict checking
|
| 8 |
+
[mypy-client.*]
|
| 9 |
+
disallow_untyped_defs = True
|
| 10 |
+
disallow_any_generics = True
|
| 11 |
+
|
| 12 |
+
# Third-party stubs
|
| 13 |
+
[mypy-tenacity.*]
|
| 14 |
+
ignore_missing_imports = True
|
| 15 |
+
|
| 16 |
+
[mypy-sentence_transformers.*]
|
| 17 |
+
ignore_missing_imports = True
|
| 18 |
+
|
| 19 |
+
[mypy-sklearn.*]
|
| 20 |
+
ignore_missing_imports = True
|
| 21 |
+
|
| 22 |
+
[mypy-yaml.*]
|
| 23 |
+
ignore_missing_imports = True
|
pytest.ini
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
addopts = -m "not integration"
|
| 3 |
+
markers =
|
| 4 |
+
integration: requires a running API server (use: make test-integration)
|
| 5 |
+
testpaths = tests
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
anthropic>=0.40.0
|
| 2 |
fastapi>=0.115.0
|
| 3 |
uvicorn[standard]>=0.30.0
|
|
@@ -6,3 +7,16 @@ sentence-transformers>=3.0.0
|
|
| 6 |
scikit-learn>=1.5.0
|
| 7 |
numpy>=1.26.0
|
| 8 |
python-multipart>=0.0.9
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API
|
| 2 |
anthropic>=0.40.0
|
| 3 |
fastapi>=0.115.0
|
| 4 |
uvicorn[standard]>=0.30.0
|
|
|
|
| 7 |
scikit-learn>=1.5.0
|
| 8 |
numpy>=1.26.0
|
| 9 |
python-multipart>=0.0.9
|
| 10 |
+
|
| 11 |
+
# Client
|
| 12 |
+
httpx>=0.27.0
|
| 13 |
+
tenacity>=8.3.0
|
| 14 |
+
pydantic>=2.7.0
|
| 15 |
+
|
| 16 |
+
# Testing
|
| 17 |
+
pytest>=8.2.0
|
| 18 |
+
pytest-mock>=3.14.0
|
| 19 |
+
|
| 20 |
+
# Quality
|
| 21 |
+
mypy>=1.10.0
|
| 22 |
+
ruff>=0.4.0
|
ruff.toml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
target-version = "py311"
|
| 2 |
+
line-length = 100
|
| 3 |
+
|
| 4 |
+
[lint]
|
| 5 |
+
select = ["E", "F", "W", "I", "UP", "B", "C4", "RUF"]
|
| 6 |
+
ignore = [
|
| 7 |
+
"E501", # line too long — handled by formatter
|
| 8 |
+
"B008", # function call in default argument (FastAPI pattern)
|
| 9 |
+
]
|
| 10 |
+
|
| 11 |
+
[lint.per-file-ignores]
|
| 12 |
+
"tests/*" = ["S101"] # assert is fine in tests
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pytest
|
| 3 |
+
from client.client import ValidatorClient
|
| 4 |
+
|
| 5 |
+
BASE_URL = os.environ.get("VALIDATOR_URL", "http://localhost:8000")
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def pytest_configure(config: pytest.Config) -> None:
    """Register the ``integration`` marker with pytest."""
    marker_line = (
        "integration: marks tests that require a running API server "
        "(deselect with -m 'not integration')"
    )
    config.addinivalue_line("markers", marker_line)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@pytest.fixture(scope="session")
def base_url() -> str:
    """Base URL of the API under test, taken from the module-level ``BASE_URL``."""
    url = BASE_URL
    return url
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture(scope="session")
def api_client(base_url: str, request: pytest.FixtureRequest) -> ValidatorClient:
    """Session-wide ``ValidatorClient`` that is closed when the session ends.

    Uses a finalizer instead of ``yield`` so the declared return type is
    accurate: a generator fixture annotated ``-> ValidatorClient`` misstates
    what the function returns.
    """
    client = ValidatorClient(base_url=base_url)
    request.addfinalizer(client.close)
    return client
|
tests/integration/__init__.py
ADDED
|
File without changes
|
tests/integration/test_api.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Integration tests — require a running API server.
|
| 3 |
+
|
| 4 |
+
Run with: make test-integration
|
| 5 |
+
Server URL: VALIDATOR_URL env var (default: http://localhost:8000)
|
| 6 |
+
|
| 7 |
+
Design notes:
|
| 8 |
+
- All tests are read-only: the API has no mutable state, so cleanup is N/A.
|
| 9 |
+
- Tests are order-independent: no shared mutable fixtures.
|
| 10 |
+
- Each test is self-contained: uses only the api_client session fixture and the module-scoped config fixture.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import pytest
|
| 14 |
+
from client.client import ValidatorClient
|
| 15 |
+
from client.exceptions import APIError
|
| 16 |
+
from client.models import ConfigResponse, QueryResponse
|
| 17 |
+
|
| 18 |
+
pytestmark = pytest.mark.integration
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@pytest.fixture(scope="module")
def config(api_client: ValidatorClient) -> ConfigResponse:
    """Configuration fetched once per module and shared by the config tests."""
    fetched = api_client.get_config()
    return fetched
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TestHealth:
    """Health endpoint check against the live server."""

    def test_api_is_reachable(self, api_client: ValidatorClient) -> None:
        healthy = api_client.health()
        assert healthy is True
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class TestConfig:
    """Shape checks on the configuration returned by the live server."""

    def test_returns_retail_and_pharma_domains(self, config: ConfigResponse) -> None:
        domains = config.domains
        assert "retail" in domains
        assert "pharma" in domains

    def test_each_domain_has_two_clients(self, config: ConfigResponse) -> None:
        for domain in config.domains:
            clients = config.domains[domain]
            assert len(clients) == 2, f"{domain} should have 2 clients"

    def test_client_ids_are_strings(self, config: ConfigResponse) -> None:
        every_client = [c for clients in config.domains.values() for c in clients]
        for entry in every_client:
            assert isinstance(entry.id, str)
            assert isinstance(entry.display, str)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TestQuery:
    """End-to-end query checks through the typed client."""

    def test_valid_query_returns_answer(self, api_client: ValidatorClient) -> None:
        result = api_client.query(
            question="What happens when a product runs out of stock?",
            client_id="novamart",
        )
        assert isinstance(result, QueryResponse)
        assert len(result.answer) > 0

    def test_response_includes_all_five_metrics(self, api_client: ValidatorClient) -> None:
        result = api_client.query("How do I add a new supplier?", "shelfwise")
        expected = {
            "pii_leakage",
            "token_budget",
            "answer_relevancy",
            "faithfulness",
            "chain_terminology",
        }
        assert set(result.evaluation.metrics) == expected

    def test_metric_scores_are_in_valid_range(self, api_client: ValidatorClient) -> None:
        result = api_client.query("What is prior authorization?", "clinixone")
        metrics = result.evaluation.metrics
        for name in metrics:
            metric = metrics[name]
            assert 0.0 <= metric.score <= 1.0, f"{name} score out of range: {metric.score}"

    def test_sources_are_returned(self, api_client: ValidatorClient) -> None:
        result = api_client.query("How do compliance reports work?", "shelfwise")
        sources = result.sources
        assert len(sources) > 0
        for entry in sources:
            assert entry.title
            assert 0.0 <= entry.score <= 1.0

    def test_client_display_name_matches_client_id(self, api_client: ValidatorClient) -> None:
        result = api_client.query("What is formulary pre-approval?", "pharmalink")
        assert result.client == "pharmalink"
        assert result.client_display == "PharmaLink"

    def test_unknown_client_raises_api_error(self, api_client: ValidatorClient) -> None:
        with pytest.raises(APIError) as caught:
            api_client.query("Any question", "nonexistent_client")
        assert caught.value.status_code == 400

    def test_empty_query_raises_api_error(self, api_client: ValidatorClient) -> None:
        with pytest.raises(APIError) as caught:
            api_client.query(" ", "novamart")
        assert caught.value.status_code == 400

    def test_pharma_query_uses_client_terminology(self, api_client: ValidatorClient) -> None:
        result = api_client.query(
            "How do I get approval before dispensing a drug?",
            "pharmalink",
        )
        answer_uses_term = "formulary pre-approval" in result.answer.lower()
        # The metric lookup stays on the right of ``or`` so it is only
        # evaluated when the answer itself lacks the client's term.
        assert answer_uses_term or result.evaluation.metrics["chain_terminology"].passed
|
tests/unit/__init__.py
ADDED
|
File without changes
|
tests/unit/test_client.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for ValidatorClient — all HTTP calls are mocked.
|
| 3 |
+
|
| 4 |
+
Tests cover: error mapping, retry behavior, response parsing, timeout handling.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import httpx
|
| 8 |
+
import pytest
|
| 9 |
+
from pytest_mock import MockerFixture
|
| 10 |
+
from unittest.mock import MagicMock, patch
|
| 11 |
+
|
| 12 |
+
from client.client import ValidatorClient
|
| 13 |
+
from client.exceptions import APIError, RetryExhaustedError, TimeoutError
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@pytest.fixture
def client() -> ValidatorClient:
    """Client aimed at a dummy host with ``max_retries=2``; tests mock its HTTP layer."""
    instance = ValidatorClient(base_url="http://test.local", max_retries=2)
    return instance
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _mock_response(status_code: int, json_body: object) -> MagicMock:
    """Build a stand-in ``httpx.Response`` with the given status and JSON body."""
    fake = MagicMock(
        spec=httpx.Response,
        status_code=status_code,
        is_error=status_code >= 400,
        text=str(json_body),
    )
    fake.json.return_value = json_body
    return fake
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TestGetConfig:
    """Parsing and error mapping for ``ValidatorClient.get_config``."""

    def test_valid_response_parsed_to_model(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        payload = {"domains": {"retail": [{"id": "novamart", "display": "NovaMart"}]}}
        mocker.patch.object(client._client, "request", return_value=_mock_response(200, payload))
        config = client.get_config()
        assert "retail" in config.domains
        assert config.domains["retail"][0].id == "novamart"

    def test_server_error_raises_api_error(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        # Stub the sleep so the retry backoff does not spend real time.
        mocker.patch("time.sleep")
        mocker.patch.object(
            client._client,
            "request",
            return_value=_mock_response(500, {"detail": "boom"}),
        )
        # 500 is a retryable status, so a persistent 500 surfaces as
        # RetryExhaustedError and never as APIError.
        with pytest.raises(RetryExhaustedError) as exc_info:
            client.get_config()
        assert exc_info.value.attempts == client._max_retries
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TestQuery:
    """Parsing and error mapping for ``ValidatorClient.query``."""

    def test_valid_query_returns_response(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        evaluation = {
            "overall_pass": True,
            "metrics": {
                "pii_leakage": {"passed": True, "score": 1.0, "detail": "Clean"},
            },
        }
        body = {
            "query": "test question",
            "client": "novamart",
            "client_display": "NovaMart",
            "answer": "Here is the answer.",
            "sources": [{"id": "retail_001", "title": "Stock Check", "score": 0.9}],
            "evaluation": evaluation,
        }
        mocker.patch.object(client._client, "request", return_value=_mock_response(200, body))
        result = client.query("test question", "novamart")
        assert result.answer == "Here is the answer."
        assert result.evaluation.overall_pass is True

    def test_unknown_client_raises_api_error(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        rejection = _mock_response(400, {"detail": "Unknown client: 'bogus'"})
        mocker.patch.object(client._client, "request", return_value=rejection)
        with pytest.raises(APIError) as caught:
            client.query("question", "bogus")
        assert caught.value.status_code == 400

    def test_empty_query_raises_api_error(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        rejection = _mock_response(400, {"detail": "Query cannot be empty"})
        mocker.patch.object(client._client, "request", return_value=rejection)
        with pytest.raises(APIError) as caught:
            client.query(" ", "novamart")
        assert caught.value.status_code == 400
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class TestRetryBehavior:
    """Retry policy: retryable statuses are retried until attempts run out."""

    @pytest.fixture(autouse=True)
    def _skip_backoff_sleep(self, mocker: MockerFixture) -> None:
        # The client waits at least 0.5 s between attempts; stub the sleep so
        # these unit tests do not spend real time in backoff.
        mocker.patch("time.sleep")

    def test_retries_on_503_then_succeeds(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        request_mock = mocker.patch.object(
            client._client,
            "request",
            side_effect=[
                _mock_response(503, {"detail": "unavailable"}),
                _mock_response(200, {"domains": {}}),
            ],
        )
        client.get_config()
        assert request_mock.call_count == 2

    def test_exhausted_retries_raises_retry_exhausted(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        request_mock = mocker.patch.object(
            client._client, "request", return_value=_mock_response(503, {})
        )
        with pytest.raises(RetryExhaustedError) as exc_info:
            client.get_config()
        assert exc_info.value.attempts == client._max_retries
        # One HTTP call per attempt, no more.
        assert request_mock.call_count == client._max_retries
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class TestTimeoutHandling:
    """Mapping of httpx timeouts onto the client's own exceptions."""

    def test_timeout_raises_timeout_error(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        # Exercised through get_config(): health() is written to report
        # client errors as False, so it cannot surface TimeoutError.
        request_mock = mocker.patch.object(
            client._client,
            "request",
            side_effect=httpx.TimeoutException("timed out"),
        )
        with pytest.raises(TimeoutError):
            client.get_config()
        # Timeouts are not in the retry set, so only one call is made.
        assert request_mock.call_count == 1

    def test_health_returns_false_on_timeout(
        self, client: ValidatorClient, mocker: MockerFixture
    ) -> None:
        mocker.patch.object(
            client._client,
            "request",
            side_effect=httpx.TimeoutException("timed out"),
        )
        assert client.health() is False
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class TestContextManager:
    """Context-manager protocol of ``ValidatorClient``."""

    def test_client_closes_on_exit(self) -> None:
        with patch("httpx.Client.close") as close_spy:
            managed = ValidatorClient(base_url="http://test.local")
            with managed:
                pass
            close_spy.assert_called_once()
|
tests/unit/test_grader.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for L1 graders — no network, no LLM calls.
|
| 3 |
+
|
| 4 |
+
Tests are behavioral: each test asserts what the grader DECIDES,
|
| 5 |
+
not how it computes the decision internally.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend"))
|
| 13 |
+
|
| 14 |
+
from grader import (
|
| 15 |
+
grade_pii_leakage,
|
| 16 |
+
grade_token_budget,
|
| 17 |
+
grade_chain_terminology,
|
| 18 |
+
TOKEN_BUDGET,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# ── pii_leakage ──────────────────────────────────────────────────────────────
|
| 23 |
+
|
| 24 |
+
class TestPiiLeakage:
    """Behavioral checks for ``grade_pii_leakage``: pass/fail, score and detail text."""

    def test_clean_response_passes(self) -> None:
        verdict = grade_pii_leakage("Stock check is enabled for this retailer.")
        assert verdict.passed is True
        assert verdict.score == 1.0

    def test_email_address_fails(self) -> None:
        verdict = grade_pii_leakage("Contact ops@example.com for details.")
        assert verdict.passed is False
        assert "email" in verdict.detail

    def test_ssn_pattern_fails(self) -> None:
        verdict = grade_pii_leakage("Employee SSN: 123-45-6789 is on file.")
        assert verdict.passed is False
        assert "SSN" in verdict.detail

    def test_phone_number_fails(self) -> None:
        verdict = grade_pii_leakage("Call 555-867-5309 to reach the manager.")
        assert verdict.passed is False
        assert verdict.score == 0.0

    def test_multiple_pii_types_all_reported(self) -> None:
        verdict = grade_pii_leakage("Email ops@test.com or call 555-123-4567.")
        assert verdict.passed is False
        # Both kinds of PII present in the text must be named in the detail.
        detail = verdict.detail
        assert "email" in detail
        assert "phone" in detail

    def test_score_is_binary(self) -> None:
        without_pii = grade_pii_leakage("No PII here.")
        with_pii = grade_pii_leakage("Email: a@b.com")
        assert without_pii.score == 1.0
        assert with_pii.score == 0.0
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ── token_budget ──────────────────────────────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
class TestTokenBudget:
    """Behavioral checks for ``grade_token_budget`` around the ``TOKEN_BUDGET`` limit."""

    def test_short_response_passes(self) -> None:
        verdict = grade_token_budget("Short answer.")
        assert verdict.passed is True
        assert verdict.score == 1.0

    def test_response_at_exact_budget_passes(self) -> None:
        at_limit = "a" * (TOKEN_BUDGET * 4)
        assert grade_token_budget(at_limit).passed is True

    def test_response_over_budget_fails(self) -> None:
        past_limit = "a" * (TOKEN_BUDGET * 4 + 4)
        verdict = grade_token_budget(past_limit)
        assert verdict.passed is False
        assert verdict.score < 1.0

    def test_score_degrades_with_length(self) -> None:
        # A longer overrun must score strictly lower than a shorter one.
        somewhat_long = grade_token_budget("a" * (TOKEN_BUDGET * 5))
        very_long = grade_token_budget("a" * (TOKEN_BUDGET * 20))
        assert somewhat_long.score > very_long.score

    def test_detail_reports_token_estimate(self) -> None:
        verdict = grade_token_budget("hello world")
        assert "tokens" in verdict.detail

    def test_custom_budget_respected(self) -> None:
        sample = "a" * 40  # ~10 tokens
        assert grade_token_budget(sample, budget=100).passed is True
        assert grade_token_budget(sample, budget=5).passed is False
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ── chain_terminology ─────────────────────────────────────────────────────────
|
| 93 |
+
|
| 94 |
+
class TestChainTerminology:
    """Behavioral checks for ``grade_chain_terminology`` per client."""

    def test_correct_client_term_passes(self) -> None:
        response = "Run an availability scan to check inventory levels."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.passed is True

    def test_rival_term_without_correct_term_fails(self) -> None:
        # "stock check" is the ShelfWise term for STOCK_CHECK, so it is wrong for NovaMart.
        response = "Run a stock check to see inventory levels."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.passed is False
        expected_terms = (v["expected"] for v in verdict.metadata["violations"])
        assert "availability scan" in expected_terms

    def test_both_terms_present_does_not_flag(self) -> None:
        # The response mentions both terms, which is not counted as a violation.
        response = "Run an availability scan (also called stock check) to check inventory."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.passed is True

    def test_score_reflects_violation_ratio(self) -> None:
        response = "Run a stock check and use a feature toggle."
        verdict = grade_chain_terminology(response, client="novamart")
        assert 0.0 <= verdict.score < 1.0

    def test_clean_response_full_score(self) -> None:
        response = "This response uses no retail terminology at all."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.score == 1.0

    def test_pharma_client_rival_term_fails(self) -> None:
        # "prior authorization" is the ClinixOne term, so it is wrong for PharmaLink.
        response = "Submit a prior authorization request to get the drug approved."
        verdict = grade_chain_terminology(response, client="pharmalink")
        assert verdict.passed is False
        expected_terms = (v["expected"] for v in verdict.metadata["violations"])
        assert "formulary pre-approval" in expected_terms
|
tests/unit/test_rosetta.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for RosettaStone terminology translation and violation detection."""
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend"))
|
| 7 |
+
|
| 8 |
+
from rosetta import translate, client_terms, check_terminology
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TestTranslate:
    """Checks for ``translate(key, client)`` lookups."""

    def test_known_key_returns_client_term(self) -> None:
        term = translate("STOCK_CHECK", "novamart")
        assert term == "availability scan"

    def test_same_key_different_clients_differ(self) -> None:
        # The same canonical key must map to different wording per client.
        novamart_term = translate("STOCK_CHECK", "novamart")
        shelfwise_term = translate("STOCK_CHECK", "shelfwise")
        assert novamart_term != shelfwise_term

    def test_unknown_key_returns_none(self) -> None:
        missing = translate("NONEXISTENT_KEY", "novamart")
        assert missing is None

    def test_pharma_client_returns_correct_term(self) -> None:
        pharmalink_term = translate("DRUG_APPROVAL", "pharmalink")
        assert pharmalink_term == "formulary pre-approval"
        clinixone_term = translate("DRUG_APPROVAL", "clinixone")
        assert clinixone_term == "prior authorization"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class TestClientTerms:
    """Checks for the per-client mapping returned by ``client_terms``."""

    def test_returns_full_mapping(self) -> None:
        mapping = client_terms("novamart")
        assert isinstance(mapping, dict)
        assert "STOCK_CHECK" in mapping
        assert mapping["STOCK_CHECK"] == "availability scan"

    def test_different_clients_have_different_mappings(self) -> None:
        novamart_map = client_terms("novamart")
        shelfwise_map = client_terms("shelfwise")
        assert novamart_map["STOCK_CHECK"] != shelfwise_map["STOCK_CHECK"]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class TestCheckTerminology:
    """Checks for the result dict returned by ``check_terminology(text, client)``."""

    def test_correct_term_no_violation(self) -> None:
        report = check_terminology("Please run an availability scan.", "novamart")
        assert report["pass"] is True
        assert report["violations"] == []

    def test_rival_term_triggers_violation(self) -> None:
        report = check_terminology("Please run a stock check.", "novamart")
        assert report["pass"] is False
        assert len(report["violations"]) >= 1

    def test_violation_contains_expected_and_found(self) -> None:
        report = check_terminology("Use a feature toggle to enable this.", "novamart")
        # Pick the first violation reported for the rival phrase, if any.
        matching = (v for v in report["violations"] if v["found"] == "feature toggle")
        hit = next(matching, None)
        assert hit is not None
        assert hit["expected"] == "capability switch"

    def test_checked_count_matches_catalog(self) -> None:
        catalog = client_terms("novamart")
        report = check_terminology("Neutral response with no domain terms.", "novamart")
        assert report["checked"] == len(catalog)

    def test_case_insensitive_matching(self) -> None:
        report = check_terminology("Run a STOCK CHECK now.", "novamart")
        assert report["pass"] is False
|