llm-inference-dashboard / components /tracing_panel.py
jkottu's picture
Initial commit: LLM Inference Dashboard
aefabf0
"""Request tracing panel component."""
import gradio as gr
import pandas as pd
from typing import Dict, Any, Tuple
from services.request_tracer import RequestTracer
def create_tracing_panel(tracer: RequestTracer) -> Dict[str, Any]:
    """
    Build the "Request Tracing" section of the dashboard.

    Creates, in order: the filter controls, the summary statistics, the
    traces table, the per-phase latency bar plot and the percentile
    read-outs. The Refresh button is wired to ``update_tracing_panel`` so a
    click repopulates every output component.

    Args:
        tracer: Request tracer instance queried by the Refresh handler

    Returns:
        Dictionary mapping component names to the created Gradio components
        (the Refresh button itself is not part of the mapping)
    """

    def readonly_number(label: str, precision: int) -> Any:
        # Every numeric read-out is display-only and differs only in its
        # label and number of decimals.
        return gr.Number(label=label, precision=precision, interactive=False)

    # Table columns paired with their Gradio datatype, in display order.
    table_schema = [
        ("ID", "str"),
        ("Prompt Toks", "number"),
        ("Output Toks", "number"),
        ("Queue (ms)", "number"),
        ("Prefill (ms)", "number"),
        ("Decode (ms)", "number"),
        ("Total (ms)", "number"),
        ("Tok/s", "number"),
        ("Slow?", "str"),
    ]

    with gr.Column():
        gr.Markdown("### Request Tracing")

        # Filter controls
        with gr.Row():
            trace_filter = gr.Dropdown(
                choices=["All Requests", "Slow Only"],
                value="All Requests",
                label="Filter",
            )
            trace_limit = gr.Slider(
                minimum=10,
                maximum=500,
                value=100,
                step=10,
                label="Show Last N Requests",
            )
            refresh_btn = gr.Button("Refresh", size="sm")

        # Summary stats
        with gr.Row():
            total_requests = readonly_number("Total Requests", 0)
            slow_requests = readonly_number("Slow Requests", 0)
            slow_rate = readonly_number("Slow Rate %", 1)
            baseline_p95 = readonly_number("Baseline P95 (ms)", 1)

        # Traces table
        traces_table = gr.Dataframe(
            headers=[column for column, _ in table_schema],
            datatype=[kind for _, kind in table_schema],
            label="Request Traces",
            interactive=False,
        )

        # Latency breakdown chart
        gr.Markdown("#### Average Latency Breakdown")
        latency_breakdown = gr.BarPlot(
            x="phase",
            y="ms",
            title="Latency by Phase",
            x_title="Phase",
            y_title="Time (ms)",
            height=200,
        )

        # Percentiles
        gr.Markdown("#### Latency Percentiles")
        with gr.Row():
            p50 = readonly_number("P50 (ms)", 1)
            p95 = readonly_number("P95 (ms)", 1)
            p99 = readonly_number("P99 (ms)", 1)

    # Event handlers
    def refresh_traces(filter_val, limit):
        # The slider may deliver a float; the tracer is given an int limit.
        return update_tracing_panel(
            tracer, filter_val == "Slow Only", int(limit)
        )

    # Order must match the tuple returned by update_tracing_panel.
    refreshed_components = [
        total_requests,
        slow_requests,
        slow_rate,
        baseline_p95,
        traces_table,
        latency_breakdown,
        p50,
        p95,
        p99,
    ]
    refresh_btn.click(
        fn=refresh_traces,
        inputs=[trace_filter, trace_limit],
        outputs=refreshed_components,
    )

    return dict(
        trace_filter=trace_filter,
        trace_limit=trace_limit,
        total_requests=total_requests,
        slow_requests=slow_requests,
        slow_rate=slow_rate,
        baseline_p95=baseline_p95,
        traces_table=traces_table,
        latency_breakdown=latency_breakdown,
        p50=p50,
        p95=p95,
        p99=p99,
    )
def update_tracing_panel(
    tracer: RequestTracer,
    slow_only: bool = False,
    limit: int = 100,
) -> Tuple[int, int, float, float, pd.DataFrame, pd.DataFrame, float, float, float]:
    """
    Update the tracing panel with current data.

    Queries the tracer for summary stats, recent traces, the per-phase
    latency breakdown and latency percentiles, and shapes them into the
    values expected by the panel's output components.

    Args:
        tracer: Request tracer instance
        slow_only: Only show slow requests
        limit: Maximum number of traces to show

    Returns:
        Tuple of all component values, in this order: total requests,
        slow requests, slow rate percent, baseline P95, traces table,
        latency breakdown table (columns ``phase`` and ``ms``), P50, P95,
        P99. Stats or percentile keys missing from the tracer's
        dictionaries are reported as 0.
    """
    column_names = [
        "ID", "Prompt Toks", "Output Toks",
        "Queue (ms)", "Prefill (ms)", "Decode (ms)",
        "Total (ms)", "Tok/s", "Slow?",
    ]

    stats = tracer.get_stats()
    traces = tracer.get_recent_traces(limit=limit, slow_only=slow_only)
    breakdown = tracer.get_latency_breakdown()
    percentiles = tracer.get_percentiles()

    # Build traces table (most recent first).
    trace_rows = [
        {
            "ID": trace.request_id,
            "Prompt Toks": trace.prompt_tokens,
            "Output Toks": trace.output_tokens,
            "Queue (ms)": round(trace.queue_time_ms, 1),
            "Prefill (ms)": round(trace.prefill_time_ms, 1),
            "Decode (ms)": round(trace.decode_time_ms, 1),
            "Total (ms)": round(trace.total_time_ms, 1),
            "Tok/s": round(trace.tokens_per_second, 1),
            "Slow?": "Yes" if trace.is_slow else "",
        }
        for trace in reversed(traces)
    ]
    # Passing the column list explicitly keeps the headers present even
    # when there are no rows to infer them from.
    traces_df = pd.DataFrame(trace_rows, columns=column_names)

    # Build breakdown chart
    breakdown_df = pd.DataFrame([
        {"phase": "Queue", "ms": breakdown.queue_ms},
        {"phase": "Prefill", "ms": breakdown.prefill_ms},
        {"phase": "Decode", "ms": breakdown.decode_ms},
    ])

    # Read every field with a default so that a tracer reporting partial
    # dictionaries shows zeros instead of raising KeyError.
    return (
        stats.get("total_requests", 0),
        stats.get("slow_requests", 0),
        stats.get("slow_rate_percent", 0),
        # The baseline may be present but None; display it as 0.
        stats.get("baseline_p95", 0) or 0,
        traces_df,
        breakdown_df,
        percentiles.get("p50", 0),
        percentiles.get("p95", 0),
        percentiles.get("p99", 0),
    )