File size: 1,606 Bytes
9a3b3da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from prometheus_client import Counter, Histogram, Gauge


"""
    This code implements Observability, using Prometheus. It monitors the health, cost, and performance of the LLM application in real-time through Grafana dashboard. 
Counter (Requests/Tokens): Tracks total volume. It helps calculate Tokens per Minute (TPM) or Requests per Minute (RPM) to monitor API costs and usage spikes.
Histogram (Latency): Tracks how long the AI takes to respond. This is critical for identifying if the model provider is slowing down.
Gauge (Active Sessions): Tracks a value that goes up and down, showing how many users are currently interacting with the agent.
Labels: By using labels(model=model), you can compare different models side-by-side in your charts.
"""


# ---------------------------------------------------------------------------
# Prometheus metric definitions
# ---------------------------------------------------------------------------

# Request volume, partitioned by model and outcome; drives RPM and error-rate panels.
AGENT_REQUESTS = Counter(
    name='agent_requests_total',
    documentation='Total requests to agent',
    labelnames=['model', 'status'],
)

# End-to-end response time per model, in seconds (observed via .observe()).
AGENT_LATENCY = Histogram(
    name='agent_response_latency_seconds',
    documentation='Response latency',
    labelnames=['model'],
)

# Token consumption per model; 'type' distinguishes input vs. output tokens.
TOKEN_USAGE = Counter(
    name='agent_tokens_total',
    documentation='Total tokens used',
    labelnames=['model', 'type'],
)

# User-submitted ratings, partitioned by the rating value.
USER_FEEDBACK = Counter(
    name='user_feedback_total',
    documentation='User feedback ratings',
    labelnames=['rating'],
)

# Current number of in-progress user sessions (can go up and down).
ACTIVE_SESSIONS = Gauge(name='active_sessions', documentation='Number of active sessions')

def record_agent_metrics(model: str, latency: float, tokens_in: int, tokens_out: int, status: str):
    """Record one completed agent request across all Prometheus metrics.

    Args:
        model: Identifier of the LLM that served the request.
        latency: End-to-end response time in seconds.
        tokens_in: Prompt (input) token count for the request.
        tokens_out: Completion (output) token count for the request.
        status: Outcome label for the request (e.g. success/error).
    """
    model_label = {'model': model}
    # Count the request and its latency under the serving model.
    AGENT_REQUESTS.labels(status=status, **model_label).inc()
    AGENT_LATENCY.labels(**model_label).observe(latency)
    # Accumulate token usage, split into input and output directions.
    TOKEN_USAGE.labels(type='input', **model_label).inc(tokens_in)
    TOKEN_USAGE.labels(type='output', **model_label).inc(tokens_out)