|
|
from prometheus_client import Counter, Histogram, Gauge |
|
|
|
|
|
|
|
|
""" |
|
|
This module implements observability using Prometheus. It exposes metrics for monitoring the health, cost, and performance of the LLM application in real time through a Grafana dashboard.
|
|
Counter (Requests/Tokens): Tracks total volume. It helps calculate Tokens per Minute (TPM) or Requests per Minute (RPM) to monitor API costs and usage spikes. |
|
|
Histogram (Latency): Tracks how long the AI takes to respond. This is critical for identifying if the model provider is slowing down. |
|
|
Gauge (Active Sessions): Tracks a value that goes up and down, showing how many users are currently interacting with the agent. |
|
|
Labels: By using labels(model=model), you can compare different models side-by-side in your charts. |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
# Request counter, partitioned by the model that served the request and the
# caller-supplied status string. Incremented once per call to
# record_agent_metrics().
AGENT_REQUESTS = Counter(
    name='agent_requests_total',
    documentation='Total requests to agent',
    labelnames=('model', 'status'),
)
|
|
|
|
|
# Response-latency histogram (seconds), partitioned by model.
#
# The client library's default buckets top out at 10 s, so any slower
# response would fall into the +Inf bucket and quantiles above that point
# could not be resolved. The boundaries below keep every default boundary
# (existing `le` queries keep working) and extend the range to 120 s.
# The +Inf bucket is appended automatically by the library.
AGENT_LATENCY = Histogram(
    name='agent_response_latency_seconds',
    documentation='Response latency',
    labelnames=('model',),
    buckets=(
        0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75,
        1.0, 2.5, 5.0, 7.5, 10.0,
        15.0, 30.0, 60.0, 120.0,
    ),
)
|
|
|
|
|
# Token counter, partitioned by model and by token direction. In this file
# the `type` label is set to 'input' or 'output' by record_agent_metrics().
TOKEN_USAGE = Counter(
    name='agent_tokens_total',
    documentation='Total tokens used',
    labelnames=('model', 'type'),
)
|
|
|
|
|
# Counter of user feedback events, partitioned by rating.
# NOTE(review): nothing in the visible part of this file increments it, so
# the set of `rating` label values is defined by callers elsewhere.
USER_FEEDBACK = Counter(
    name='user_feedback_total',
    documentation='User feedback ratings',
    labelnames=('rating',),
)
|
|
|
|
|
# Unlabelled gauge for the current number of active sessions.
# NOTE(review): no code in the visible part of this file updates it; callers
# are presumably expected to inc()/dec() it as sessions start and end.
ACTIVE_SESSIONS = Gauge(
    name='active_sessions',
    documentation='Number of active sessions',
)
|
|
|
|
|
def record_agent_metrics(model: str, latency: float, tokens_in: int, tokens_out: int, status: str) -> None:
    """Record the metrics for one completed agent request.

    Observes the latency histogram, increments the request counter, and adds
    the input/output token counts to the token counter, all labelled with
    ``model``.

    Args:
        model: Value for the ``model`` label on every metric touched.
        latency: Response time to observe; the histogram name implies seconds.
        tokens_in: Number of input tokens to add (must be >= 0).
        tokens_out: Number of output tokens to add (must be >= 0).
        status: Value for the ``status`` label on the request counter.

    Raises:
        ValueError: If ``tokens_in`` or ``tokens_out`` is negative.
    """
    # Counters reject negative increments with ValueError. Check both counts
    # before touching any metric so a bad call cannot leave a partial update
    # (latency and request recorded, tokens missing).
    for label, count in (('tokens_in', tokens_in), ('tokens_out', tokens_out)):
        if count < 0:
            raise ValueError(f"{label} must be non-negative, got {count}")

    AGENT_LATENCY.labels(model=model).observe(latency)
    AGENT_REQUESTS.labels(model=model, status=status).inc()
    TOKEN_USAGE.labels(model=model, type='input').inc(tokens_in)
    TOKEN_USAGE.labels(model=model, type='output').inc(tokens_out)
|
|
|