# Mohammad Wasil
# Deploy with fixed LFS tracking for ChromaDB
# 9a3b3da
from prometheus_client import Counter, Histogram, Gauge
"""
This module implements observability using Prometheus. It monitors the health, cost, and performance of the LLM application in real time through a Grafana dashboard.
Counter (Requests/Tokens): Tracks total volume. It helps calculate Tokens per Minute (TPM) or Requests per Minute (RPM) to monitor API costs and usage spikes.
Histogram (Latency): Tracks how long the AI takes to respond. This is critical for identifying if the model provider is slowing down.
Gauge (Active Sessions): Tracks a value that goes up and down, showing how many users are currently interacting with the agent.
Labels: By using labels(model=model), you can compare different models side-by-side in your charts.
"""
# Metric definitions (created at import time).

# Request counter, partitioned by model and request status.
AGENT_REQUESTS = Counter(
    name='agent_requests_total',
    documentation='Total requests to agent',
    labelnames=('model', 'status'),
)

# Response latency distribution per model; no custom buckets are passed,
# so the client library's default bucket boundaries apply.
AGENT_LATENCY = Histogram(
    name='agent_response_latency_seconds',
    documentation='Response latency',
    labelnames=('model',),
)

# Token counter, partitioned by model and token type
# (record_agent_metrics uses the types 'input' and 'output').
TOKEN_USAGE = Counter(
    name='agent_tokens_total',
    documentation='Total tokens used',
    labelnames=('model', 'type'),
)

# Feedback counter, partitioned by rating value.
USER_FEEDBACK = Counter(
    name='user_feedback_total',
    documentation='User feedback ratings',
    labelnames=('rating',),
)

# Unlabelled gauge for the current number of active sessions.
ACTIVE_SESSIONS = Gauge(
    name='active_sessions',
    documentation='Number of active sessions',
)
def record_agent_metrics(model: str, latency: float, tokens_in: int, tokens_out: int, status: str):
    """Record the metrics for one agent call.

    Observes ``latency`` on the latency histogram, increments the request
    counter for the given model/status pair, and adds the token counts to
    the token counter under the ``'input'`` and ``'output'`` types.

    Args:
        model: Value used for the ``model`` label on every metric touched.
        latency: Value observed on ``AGENT_LATENCY`` (presumably seconds,
            going by the metric name; confirm against callers).
        tokens_in: Amount added to ``TOKEN_USAGE`` with ``type='input'``.
        tokens_out: Amount added to ``TOKEN_USAGE`` with ``type='output'``.
        status: Value used for the ``status`` label on ``AGENT_REQUESTS``.
    """
    latency_series = AGENT_LATENCY.labels(model=model)
    latency_series.observe(latency)

    request_series = AGENT_REQUESTS.labels(model=model, status=status)
    request_series.inc()

    # Input is recorded before output, matching the original statement order.
    for token_type, amount in (('input', tokens_in), ('output', tokens_out)):
        TOKEN_USAGE.labels(model=model, type=token_type).inc(amount)