Spaces:

balaji958685
/

edge-node-diagnostics-agent

Sleeping

App Files Files Community

edge-node-diagnostics-agent / app.py

balaji958685

Add Edge Node Diagnostics Agent app

d7e8289 verified 16 days ago

raw

history blame contribute delete

52.3 kB

	"""
	Edge Node Diagnostics Agent - SmartClass Face Recognition Pipeline Troubleshooter

	A conversational AI agent specialized in troubleshooting and optimizing
	face recognition pipelines running on classroom edge devices (Raspberry Pi 5 / Intel NUC).
	"""

	import gradio as gr
	import json
	import random
	import time
	from datetime import datetime, timedelta
	from typing import Generator

	# ============================================================================
	# SYSTEM PROMPT
	# ============================================================================

	# Persona / instruction text describing the agent's role, the pipeline stages
	# it covers, and its diagnostic procedure.
	# NOTE(review): no code in the visible part of this file reads SYSTEM_PROMPT
	# (responses are produced by keyword rules in generate_response) — confirm
	# whether it is consumed further down or is purely descriptive.
	SYSTEM_PROMPT = """You are the Edge Node Diagnostics Agent for the SmartClass attendance system.
	You specialize in troubleshooting and optimizing the face recognition pipeline that runs on classroom edge devices (Raspberry Pi 5 or Intel NUC).

	You understand the complete pipeline: video capture → face detection (SCRFD) → tracking (ByteTrack) → quality assessment → face alignment → embedding extraction (MobileFaceNet/AdaFace) → FAISS similarity search → identity evidence engine → event transmission (Redis Streams / HTTP fallback / SQLite queue).

	When diagnosing issues:
	1. Always start by checking relevant metrics and system status
	2. Identify the specific pipeline stage where the problem occurs
	3. Provide concrete configuration changes with exact YAML paths
	4. Explain the root cause in technical but accessible terms
	5. Suggest both immediate fixes and long-term optimizations

	Key configuration file: config/edge_config.yaml
	Key metrics endpoint: http://localhost:9100/metrics (Prometheus format)

	You have access to diagnostic tools that can query the edge node's state. Use them proactively."""

	# ============================================================================
	# SIMULATED EDGE NODE STATE
	# ============================================================================

	class EdgeNodeState:
	    """Simulates the state of an edge node for demonstration purposes.

	    All values are randomly generated; nothing here talks to real hardware.
	    ``reset()`` draws a fresh baseline, and the ``get_*`` / ``check_*``
	    methods return plain dict snapshots (with a little random jitter on some
	    fields) that the diagnostic tools display.
	    """

	    def __init__(self):
	        self.reset()

	    def reset(self):
	        """Re-randomize every baseline value and restore the default config."""
	        self.fps = round(random.uniform(18, 28), 1)
	        self.detection_latency = round(random.uniform(12, 45), 1)
	        self.recognition_latency = round(random.uniform(25, 80), 1)
	        self.active_tracks = random.randint(0, 15)
	        self.cpu_temp = round(random.uniform(55, 78), 1)
	        self.memory_usage = round(random.uniform(45, 85), 1)
	        self.offline_queue_depth = random.randint(0, 50)
	        self.events_sent = random.randint(100, 5000)
	        self.events_failed = random.randint(0, 25)
	        self.faces_detected = random.randint(50, 2000)
	        # Cap at faces_detected so the simulation never reports more
	        # recognized faces than were detected in the first place.
	        self.faces_recognized = random.randint(30, min(1800, self.faces_detected))
	        self.recognition_confidence_avg = round(random.uniform(0.65, 0.95), 3)
	        self.faiss_index_size = random.randint(50, 500)
	        self.model_loaded = True
	        # Three-in-four chance of starting with Redis connected.
	        self.redis_connected = random.choice([True, True, True, False])
	        self.camera_active = True
	        self.last_event_time = datetime.now() - timedelta(seconds=random.randint(1, 300))

	        # Configuration (rebuilt on every reset, so injected faults that
	        # edit it are undone).
	        self.config = {
	            "pipeline": {
	                "detect_every_n_frames": 3,
	                "recognize_every_n_frames": 5,
	                "idle_mode": {"enabled": True, "timeout_seconds": 30, "fps_target": 5}
	            },
	            "models": {
	                "detector": {
	                    "path": "models/scrfd_2.5g.onnx",
	                    "conf_threshold": 0.5,
	                    "nms_threshold": 0.4,
	                    "input_size": [640, 640],
	                    "execution_provider": "CPUExecutionProvider"
	                },
	                "recognizer": {
	                    "path": "models/mobilefacenet_v2.onnx",
	                    "embedding_dim": 512,
	                    "input_size": [112, 112]
	                }
	            },
	            "faiss": {
	                "index_path": "data/student_index.faiss",
	                "cosine_threshold": 0.45,
	                "nprobe": 10,
	                "use_gpu": False
	            },
	            "tracker": {
	                "track_thresh": 0.5,
	                "track_buffer": 30,
	                "match_thresh": 0.8,
	                "max_time_lost": 60
	            },
	            "identity": {
	                "min_evidence_count": 3,
	                "consensus_threshold": 0.75,
	                "cooldown_seconds": 300,
	                "max_evidence_age_seconds": 10
	            },
	            "cameras": [
	                {"id": "cam_01", "source": "/dev/video0", "type": "usb",
	                 "resolution": [1280, 720], "fps": 30}
	            ],
	            "redis": {
	                "host": "192.168.1.100",
	                "port": 6379,
	                "stream_key": "smartclass:attendance",
	                "max_retries": 3
	            },
	            "http_fallback": {
	                "url": "https://api.smartclass.edu/v1/attendance",
	                "timeout_seconds": 5,
	                "batch_size": 10
	            },
	            "offline_queue": {
	                "db_path": "data/offline_queue.db",
	                "max_size_mb": 100,
	                "retry_interval_seconds": 60
	            },
	            "quality_gate": {
	                "min_face_size": 80,
	                "min_quality_score": 0.3,
	                "max_blur_score": 100,
	                "min_brightness": 40,
	                "max_brightness": 220
	            }
	        }

	        # Simulated error log (oldest first)
	        self.errors = [
	            {"timestamp": "2024-03-15T10:23:45", "level": "WARN",
	             "component": "recognizer", "message": "Low quality face crop rejected (quality=0.18)"},
	            {"timestamp": "2024-03-15T10:24:12", "level": "ERROR",
	             "component": "event_sender", "message": "Redis connection timeout after 5s"},
	            {"timestamp": "2024-03-15T10:25:01", "level": "WARN",
	             "component": "tracker", "message": "Track #47 lost after 60 frames without update"},
	        ]

	    def get_metrics(self):
	        """Return a metrics snapshot keyed by ``smartclass_*`` metric names.

	        Each value is the stored baseline plus a small random offset, so two
	        consecutive calls differ slightly.
	        """
	        return {
	            "smartclass_edge_fps": round(self.fps + random.uniform(-2, 2), 1),
	            "smartclass_detection_latency_ms": round(self.detection_latency + random.uniform(-5, 5), 1),
	            "smartclass_recognition_latency_ms": round(self.recognition_latency + random.uniform(-10, 10), 1),
	            "smartclass_tracking_active_tracks": max(0, self.active_tracks + random.randint(-3, 3)),
	            "smartclass_cpu_temp_celsius": round(self.cpu_temp + random.uniform(-2, 2), 1),
	            "smartclass_memory_usage_percent": round(self.memory_usage + random.uniform(-3, 3), 1),
	            "smartclass_offline_queue_depth": max(0, self.offline_queue_depth + random.randint(-5, 5)),
	            "smartclass_events_sent_total": self.events_sent + random.randint(0, 10),
	            "smartclass_events_failed_total": self.events_failed + random.randint(0, 2),
	            "smartclass_faces_detected_total": self.faces_detected + random.randint(0, 20),
	            "smartclass_faces_recognized_total": self.faces_recognized + random.randint(0, 15),
	            "smartclass_recognition_confidence_avg": round(self.recognition_confidence_avg + random.uniform(-0.05, 0.05), 3),
	        }

	    def get_pipeline_status(self):
	        """Return overall component health; "running" iff the camera is active."""
	        return {
	            "status": "running" if self.camera_active else "stopped",
	            "uptime_hours": round(random.uniform(1, 72), 1),
	            "camera_active": self.camera_active,
	            "detector_loaded": self.model_loaded,
	            "recognizer_loaded": self.model_loaded,
	            "faiss_index_loaded": True,
	            "faiss_index_vectors": self.faiss_index_size,
	            "redis_connected": self.redis_connected,
	            "current_fps": round(self.fps + random.uniform(-2, 2), 1),
	            "active_tracks": max(0, self.active_tracks + random.randint(-2, 2)),
	            "last_recognition_event": self.last_event_time.isoformat(),
	        }

	    def get_error_log(self, n=10):
	        """Return the most recent ``n`` error-log entries, oldest first.

	        A non-positive ``n`` yields an empty list. (A bare ``errors[-n:]``
	        would return the *entire* log for ``n == 0`` because ``-0 == 0``.)
	        """
	        if n <= 0:
	            return []
	        return self.errors[-n:]

	    def check_detection(self):
	        """Return detector settings from the config plus simulated performance figures."""
	        detector_cfg = self.config["models"]["detector"]
	        return {
	            "model": "SCRFD-2.5G",
	            "model_path": detector_cfg["path"],
	            "model_loaded": self.model_loaded,
	            "execution_provider": detector_cfg["execution_provider"],
	            "conf_threshold": detector_cfg["conf_threshold"],
	            "nms_threshold": detector_cfg["nms_threshold"],
	            "input_size": detector_cfg["input_size"],
	            "avg_latency_ms": round(self.detection_latency + random.uniform(-3, 3), 1),
	            "detections_per_frame_avg": round(random.uniform(0.5, 5.0), 2),
	            "false_positive_rate_estimate": round(random.uniform(0.01, 0.08), 3),
	        }

	    def check_recognition(self):
	        """Return recognizer / FAISS settings plus simulated recognition statistics."""
	        recognizer_cfg = self.config["models"]["recognizer"]
	        return {
	            "model": "MobileFaceNet-v2",
	            "model_path": recognizer_cfg["path"],
	            "model_loaded": self.model_loaded,
	            "embedding_dim": recognizer_cfg["embedding_dim"],
	            "faiss_threshold": self.config["faiss"]["cosine_threshold"],
	            "avg_latency_ms": round(self.recognition_latency + random.uniform(-5, 5), 1),
	            "avg_confidence": round(self.recognition_confidence_avg, 3),
	            "quality_gate_rejection_rate": round(random.uniform(0.1, 0.4), 2),
	            "faces_below_threshold": random.randint(5, 50),
	            "index_vectors": self.faiss_index_size,
	        }

	    def check_tracking(self):
	        """Return tracker settings from the config plus simulated track statistics."""
	        tracker_cfg = self.config["tracker"]
	        return {
	            "algorithm": "ByteTrack",
	            "active_tracks": max(0, self.active_tracks + random.randint(-2, 2)),
	            "track_thresh": tracker_cfg["track_thresh"],
	            "track_buffer": tracker_cfg["track_buffer"],
	            "match_thresh": tracker_cfg["match_thresh"],
	            "tracks_created_last_minute": random.randint(2, 20),
	            "tracks_lost_last_minute": random.randint(1, 10),
	            "avg_track_duration_frames": random.randint(30, 200),
	            "id_switch_rate": round(random.uniform(0.01, 0.1), 3),
	        }

	    def check_event_transmission(self):
	        """Return Redis connection state, delivery counters and queue depth."""
	        redis_cfg = self.config["redis"]
	        return {
	            "redis_connected": self.redis_connected,
	            "redis_host": redis_cfg["host"],
	            "redis_stream_key": redis_cfg["stream_key"],
	            "events_sent_total": self.events_sent,
	            "events_failed_total": self.events_failed,
	            "offline_queue_depth": self.offline_queue_depth,
	            "http_fallback_available": True,
	            "last_successful_event": self.last_event_time.isoformat(),
	            "avg_delivery_latency_ms": round(random.uniform(5, 50), 1),
	        }

	    def check_identity_engine(self):
	        """Return identity-engine settings from the config plus simulated counters."""
	        identity_cfg = self.config["identity"]
	        return {
	            "min_evidence_count": identity_cfg["min_evidence_count"],
	            "consensus_threshold": identity_cfg["consensus_threshold"],
	            "cooldown_seconds": identity_cfg["cooldown_seconds"],
	            "pending_identities": random.randint(0, 5),
	            "resolved_last_minute": random.randint(0, 15),
	            "avg_evidence_per_resolution": round(random.uniform(3, 8), 1),
	            "avg_confidence_at_resolution": round(random.uniform(0.78, 0.95), 3),
	            "cooldown_blocked_events": random.randint(0, 10),
	        }


	# Global edge node state: the single simulated node shared by the diagnostic
	# tools, the polling helpers and the fault-injection controls below.
	edge_state = EdgeNodeState()

	# ============================================================================
	# DIAGNOSTIC TOOLS
	# ============================================================================

	# (tool name, human-readable description, zero-argument callable).
	# The callables are lambdas so that `edge_state` (and its `config`
	# attribute) is looked up at call time rather than at import time.
	_TOOL_SPECS = (
	    ("get_pipeline_status",
	     "Get overall pipeline status including component health",
	     lambda: edge_state.get_pipeline_status()),
	    ("get_metrics",
	     "Get current Prometheus metrics from the edge node",
	     lambda: edge_state.get_metrics()),
	    ("check_detection",
	     "Check face detection subsystem (SCRFD model status and performance)",
	     lambda: edge_state.check_detection()),
	    ("check_recognition",
	     "Check face recognition subsystem (MobileFaceNet, FAISS index)",
	     lambda: edge_state.check_recognition()),
	    ("check_tracking",
	     "Check ByteTrack face tracking subsystem",
	     lambda: edge_state.check_tracking()),
	    ("check_event_transmission",
	     "Check event delivery (Redis Streams, HTTP fallback, offline queue)",
	     lambda: edge_state.check_event_transmission()),
	    ("check_identity_engine",
	     "Check identity evidence engine status",
	     lambda: edge_state.check_identity_engine()),
	    ("get_error_log",
	     "Get recent error log entries",
	     lambda: edge_state.get_error_log()),
	    ("get_config",
	     "Get current edge_config.yaml configuration",
	     lambda: edge_state.config),
	)

	# Registry of diagnostic tools: name -> {"description": str, "fn": callable}.
	TOOLS = {
	    tool_name: {"description": summary, "fn": runner}
	    for tool_name, summary, runner in _TOOL_SPECS
	}

	# ============================================================================
	# AGENT LOGIC
	# ============================================================================

	def determine_tools_to_call(message: str) -> list:
	    """Pick the diagnostic tools whose trigger words occur in *message*.

	    Matching is a case-insensitive substring test. Rules are evaluated in
	    the order listed below and each tool appears at most once in the result,
	    at the position of its first match. When no rule matches, the pipeline
	    status and metrics tools are returned as a default.
	    """
	    text = message.lower()

	    # (trigger substrings, tools to run), in evaluation order.
	    routing = (
	        # Status/overview queries
	        (("status", "overview", "health", "running", "working"),
	         ("get_pipeline_status", "get_metrics")),
	        # FPS/Performance queries
	        (("fps", "slow", "performance", "speed", "bottleneck", "optimize", "latency"),
	         ("get_metrics", "get_pipeline_status")),
	        # Detection queries
	        (("detect", "scrfd", "no faces", "false positive", "bounding box", "camera"),
	         ("check_detection",)),
	        # Recognition queries
	        (("recogni", "embed", "faiss", "identity", "threshold", "mobilefacenet",
	          "confidence", "not recognized", "never recognized"),
	         ("check_recognition",)),
	        # Tracking queries
	        (("track", "bytetrack", "id switch", "lost", "duplicate"),
	         ("check_tracking",)),
	        # Event/transmission queries
	        (("event", "redis", "queue", "transmit", "send", "offline", "http", "delivery"),
	         ("check_event_transmission",)),
	        # Identity engine queries
	        (("evidence", "consensus", "cooldown", "identity engine", "fusion"),
	         ("check_identity_engine",)),
	        # Error queries
	        (("error", "fail", "crash", "exception", "log", "warn"),
	         ("get_error_log",)),
	        # Config queries
	        (("config", "setting", "parameter", "yaml", "threshold"),
	         ("get_config",)),
	    )

	    # A dict keeps insertion order, giving order-preserving de-duplication.
	    selected = {}
	    for triggers, tool_names in routing:
	        if any(trigger in text for trigger in triggers):
	            for tool_name in tool_names:
	                selected.setdefault(tool_name, None)

	    if not selected:
	        # Nothing specific matched: fall back to the general overview tools.
	        return ["get_pipeline_status", "get_metrics"]
	    return list(selected)


	def generate_response(message: str, tool_results: dict) -> str:
	    """Generate a Markdown diagnostic response from the query and tool results.

	    The first topic whose trigger words occur in *message* (case-insensitive
	    substring match) wins, checked in this order: performance, recognition
	    failures, tracking, event transmission, error log, configuration,
	    identity engine. Anything else gets the general status overview.

	    Args:
	        message: The user's question.
	        tool_results: Mapping of tool name to that tool's output. Missing
	            entries are tolerated; affected fields fall back to defaults
	            or "N/A".

	    Returns:
	        The response text as a Markdown string.
	    """
	    message_lower = message.lower()

	    # FPS optimization
	    if any(w in message_lower for w in ["fps", "slow", "performance", "speed", "optimize"]):
	        metrics = tool_results.get("get_metrics", {})
	        fps = metrics.get("smartclass_edge_fps", 0)
	        det_lat = metrics.get("smartclass_detection_latency_ms", 0)
	        rec_lat = metrics.get("smartclass_recognition_latency_ms", 0)

	        response = f"""## 📊 Pipeline Performance Analysis

	Current FPS: {fps}
	Detection Latency: {det_lat}ms
	Recognition Latency: {rec_lat}ms

	### Bottleneck Analysis

	"""
	        if fps < 10:
	            # Interpolates the *recognition* latency; this text previously
	            # received the detection latency by mistake.
	            response += f"""⚠️ Critical: FPS is dangerously low. The pipeline cannot keep up with the camera frame rate.

	### Root Cause Analysis
	The most common causes of low FPS on edge devices:

	1. Recognition is the bottleneck ({rec_lat}ms per call) — it runs on every Nth frame
	2. Detection is too frequent — check `pipeline.detect_every_n_frames`
	3. Too many active tracks consuming resources
	4. Thermal throttling — check CPU temperature

	### Recommended Actions

	#### Immediate Fixes:
	```yaml
	# config/edge_config.yaml
	pipeline:
	detect_every_n_frames: 5 # Increase from current 3
	recognize_every_n_frames: 8 # Increase from current 5
	idle_mode:
	enabled: true
	timeout_seconds: 15 # Enter idle faster
	fps_target: 3
	```

	#### If still slow:
	- Switch to SCRFD-0.5G model (smaller, faster): `models.detector.path: models/scrfd_0.5g.onnx`
	- Enable execution provider optimization: Set `models.detector.execution_provider: TensorrtExecutionProvider` if TensorRT is available
	- Reduce detection input size: `models.detector.input_size: [320, 320]` (trade detection range for speed)
	"""
	        elif fps < 20:
	            response += """⚡ Warning: FPS is below optimal. Target is 24+ FPS for smooth tracking.

	### Recommendations:
	1. Increase frame skip ratios:
	- `pipeline.detect_every_n_frames`: 4 (from 3)
	- `pipeline.recognize_every_n_frames`: 6 (from 5)

	2. Enable idle mode to conserve resources when no faces present:
	```yaml
	pipeline.idle_mode.enabled: true
	pipeline.idle_mode.timeout_seconds: 20
	```

	3. Quality gate is likely too lenient — accepting too many low-quality crops increases recognition load:
	```yaml
	quality_gate.min_quality_score: 0.4 # Raise from 0.3
	quality_gate.min_face_size: 100 # Raise from 80
	```
	"""
	        else:
	            response += f"""✅ FPS is healthy ({fps}). Pipeline is performing within expected parameters.

	Current performance breakdown:
	- Detection: {det_lat}ms (target: <30ms) {'✅' if det_lat < 30 else '⚠️'}
	- Recognition: {rec_lat}ms (target: <50ms) {'✅' if rec_lat < 50 else '⚠️'}

	No immediate optimization needed, but you can further improve by:
	- Enabling GPU acceleration if hardware supports it
	- Using the ensemble recognizer only for edge cases
	"""
	        return response

	    # Faces detected but not recognized
	    if any(phrase in message_lower for phrase in ["not recognized", "never recognized", "detected but"]):
	        rec = tool_results.get("check_recognition", {})
	        # The rejection rate is a 0-1 fraction; scale it to a percentage.
	        rejection_pct = rec.get('quality_gate_rejection_rate', 0) * 100
	        response = f"""## 🔍 Recognition Failure Diagnosis

	Recognition System Status:
	- Model loaded: {'✅' if rec.get('model_loaded') else '❌'}
	- FAISS index vectors: {rec.get('index_vectors', 'N/A')}
	- Avg confidence: {rec.get('avg_confidence', 'N/A')}
	- Quality gate rejection rate: {rec.get('quality_gate_rejection_rate', 'N/A')}
	- Faces below threshold: {rec.get('faces_below_threshold', 'N/A')}

	### Diagnosis Steps:

	#### 1. Check FAISS Index
	The index has {rec.get('index_vectors', 0)} vectors. If this is 0 or doesn't match your enrolled student count, the index needs to be rebuilt.

	```bash
	# Verify index
	python -c "import faiss; idx = faiss.read_index('data/student_index.faiss'); print(f'Vectors: {{idx.ntotal}}')"
	```

	#### 2. Cosine Threshold Too High
	Current threshold: {rec.get('faiss_threshold', 'N/A')}

	If this is above 0.5, many legitimate matches will be rejected. Recommended range: 0.35-0.50.

	```yaml
	# config/edge_config.yaml
	faiss:
	cosine_threshold: 0.40 # Lower from {rec.get('faiss_threshold', 0.45)}
	```

	#### 3. Quality Gate Rejecting Too Many Crops
	Rejection rate is **{rejection_pct:.0f}%**. If this is above 30%, the quality gate may be too strict.

	```yaml
	# Relax quality requirements
	quality_gate:
	min_quality_score: 0.25 # Lower from 0.3
	min_face_size: 64 # Lower from 80
	max_blur_score: 150 # Raise from 100
	```

	#### 4. Embedding Model Issues
	- Verify model produces valid 512-dim embeddings (not zeros/NaN)
	- Check if face alignment landmarks are accurate
	- Verify input normalization matches training preprocessing

	#### 5. Student Map Verification
	Ensure enrolled students have embeddings in the index:
	```bash
	python scripts/verify_enrollments.py --index data/student_index.faiss --map data/student_map.json
	```
	"""
	        return response

	    # Tracking issues
	    if any(w in message_lower for w in ["track", "id switch", "duplicate", "same person different"]):
	        tracking = tool_results.get("check_tracking", {})
	        response = f"""## 🎯 Tracking Diagnosis (ByteTrack)

	Current Status:
	- Active tracks: {tracking.get('active_tracks', 'N/A')}
	- ID switch rate: {tracking.get('id_switch_rate', 'N/A')}
	- Avg track duration: {tracking.get('avg_track_duration_frames', 'N/A')} frames
	- Tracks created/minute: {tracking.get('tracks_created_last_minute', 'N/A')}
	- Tracks lost/minute: {tracking.get('tracks_lost_last_minute', 'N/A')}

	### Common Tracking Issues:

	#### ID Switching (same person gets multiple IDs)
	ID switch rate: {tracking.get('id_switch_rate', 0)}
	{'⚠️ High ID switch rate!' if tracking.get('id_switch_rate', 0) > 0.05 else '✅ ID switch rate is acceptable.'}

	Fix: Increase track buffer and lower match threshold:
	```yaml
	tracker:
	track_buffer: 45 # Increase from {tracking.get('track_buffer', 30)} (keeps lost tracks longer)
	match_thresh: 0.7 # Lower from {tracking.get('match_thresh', 0.8)} (easier re-association)
	track_thresh: 0.4 # Lower from {tracking.get('track_thresh', 0.5)} (lower birth threshold)
	```

	#### Tracks Dying Too Quickly
	If `tracks_lost_last_minute` >> `tracks_created_last_minute`:
	- Increase `tracker.track_buffer` (frames before track termination)
	- Lower detection confidence to maintain tracking continuity
	- Check if detection frame skip is too aggressive

	#### Ghost Tracks (tracking non-existent faces)
	If `active_tracks` is much higher than actual people:
	- Raise `tracker.track_thresh` to 0.6+
	- Raise detection `conf_threshold` to reduce false positives
	- Add minimum detection size filter
	"""
	        return response

	    # Event transmission
	    if any(w in message_lower for w in ["event", "redis", "queue", "transmit", "offline", "delivery"]):
	        events = tool_results.get("check_event_transmission", {})
	        response = f"""## 📡 Event Transmission Status

	Connection Status:
	- Redis connected: {'✅' if events.get('redis_connected') else '❌ DISCONNECTED'}
	- Last successful event: {events.get('last_successful_event', 'N/A')}
	- Offline queue depth: {events.get('offline_queue_depth', 0)}

	Delivery Stats:
	- Events sent: {events.get('events_sent_total', 0)}
	- Events failed: {events.get('events_failed_total', 0)}
	- Avg delivery latency: {events.get('avg_delivery_latency_ms', 0)}ms

	"""
	        if not events.get('redis_connected'):
	            host = events.get('redis_host', '192.168.1.100')
	            queue = events.get('offline_queue_depth', 0)
	            response += f"""### ❌ Redis Connection Issue

	The edge node cannot reach the Redis server. Events are being queued locally.

	Immediate Actions:
	1. Check Redis server connectivity:
	```bash
	redis-cli -h {host} -p 6379 ping
	```

	2. Check network connectivity:
	```bash
	ping {host}
	nc -zv {host} 6379
	```

	3. Check Redis configuration:
	```yaml
	redis:
	host: {host}
	port: 6379
	password: <check if auth required>
	max_retries: 5
	retry_delay_seconds: 2
	```

	4. HTTP fallback should be handling events in the meantime.

	Queue Management:
	The offline queue has {queue} events pending. Once Redis reconnects, these will be drained automatically.
	If the queue exceeds 100MB, oldest events are dropped. Monitor via:
	```
	smartclass_offline_queue_depth
	```
	"""
	        else:
	            # Computed up front: str.format() cannot evaluate conditional
	            # expressions inside replacement fields (it raised KeyError).
	            lat = events.get('avg_delivery_latency_ms', 0)
	            # max(..., 1) guards against division by zero when nothing was sent.
	            failure_ratio = events.get('events_failed_total', 0) / max(events.get('events_sent_total', 1), 1)
	            latency_note = 'within normal range' if lat < 30 else 'elevated - check network'
	            failure_note = '✅' if failure_ratio < 0.02 else '⚠️ Above 2% threshold'
	            response += f"""### ✅ Redis Connected

	Event delivery is functioning normally.

	Performance Notes:
	- Delivery latency {lat}ms is {latency_note}
	- Failure rate: {failure_ratio * 100:.1f}% {failure_note}
	"""
	        return response

	    # Error log analysis
	    if any(w in message_lower for w in ["error", "fail", "crash", "log", "warn"]):
	        errors = tool_results.get("get_error_log", [])
	        response = "## 📋 Recent Error Log\n\n"
	        if errors:
	            for e in errors:
	                icon = "🔴" if e["level"] == "ERROR" else "🟡" if e["level"] == "WARN" else "🔵"
	                response += f"{icon} [{e['timestamp']}] `{e['component']}`: {e['message']}\n\n"

	            response += "\n### Analysis\n\n"
	            error_components = [e["component"] for e in errors if e["level"] == "ERROR"]
	            warn_components = [e["component"] for e in errors if e["level"] == "WARN"]

	            if "event_sender" in error_components:
	                response += "- Event Sender errors detected — check Redis connectivity and HTTP fallback\n"
	            if "recognizer" in warn_components:
	                response += "- Recognizer warnings — quality gate is rejecting crops. Consider relaxing thresholds or improving lighting\n"
	            if "tracker" in warn_components:
	                response += "- Tracker warnings — tracks being lost. May need to increase `track_buffer`\n"
	        else:
	            response += "✅ No recent errors found. System is operating normally.\n"
	        return response

	    # Configuration query
	    if any(w in message_lower for w in ["config", "setting", "parameter", "yaml"]):
	        config = tool_results.get("get_config", {})
	        # Pull each section once; absent sections become empty dicts so
	        # every field below degrades to 'N/A'.
	        pipeline_cfg = config.get('pipeline', {})
	        idle_cfg = pipeline_cfg.get('idle_mode', {})
	        detector_cfg = config.get('models', {}).get('detector', {})
	        recognizer_cfg = config.get('models', {}).get('recognizer', {})
	        faiss_cfg = config.get('faiss', {})
	        tracker_cfg = config.get('tracker', {})
	        identity_cfg = config.get('identity', {})
	        quality_cfg = config.get('quality_gate', {})
	        response = f"""## ⚙️ Current Configuration

	```yaml
	# config/edge_config.yaml (key sections)

	pipeline:
	detect_every_n_frames: {pipeline_cfg.get('detect_every_n_frames', 'N/A')}
	recognize_every_n_frames: {pipeline_cfg.get('recognize_every_n_frames', 'N/A')}
	idle_mode:
	enabled: {idle_cfg.get('enabled', 'N/A')}
	timeout_seconds: {idle_cfg.get('timeout_seconds', 'N/A')}

	models:
	detector:
	path: {detector_cfg.get('path', 'N/A')}
	conf_threshold: {detector_cfg.get('conf_threshold', 'N/A')}
	nms_threshold: {detector_cfg.get('nms_threshold', 'N/A')}
	recognizer:
	path: {recognizer_cfg.get('path', 'N/A')}
	embedding_dim: {recognizer_cfg.get('embedding_dim', 'N/A')}

	faiss:
	cosine_threshold: {faiss_cfg.get('cosine_threshold', 'N/A')}
	nprobe: {faiss_cfg.get('nprobe', 'N/A')}

	tracker:
	track_thresh: {tracker_cfg.get('track_thresh', 'N/A')}
	track_buffer: {tracker_cfg.get('track_buffer', 'N/A')}
	match_thresh: {tracker_cfg.get('match_thresh', 'N/A')}

	identity:
	min_evidence_count: {identity_cfg.get('min_evidence_count', 'N/A')}
	consensus_threshold: {identity_cfg.get('consensus_threshold', 'N/A')}
	cooldown_seconds: {identity_cfg.get('cooldown_seconds', 'N/A')}

	quality_gate:
	min_face_size: {quality_cfg.get('min_face_size', 'N/A')}
	min_quality_score: {quality_cfg.get('min_quality_score', 'N/A')}
	```

	Need help tuning any specific parameter? Ask about:
	- Detection thresholds (sensitivity vs false positives)
	- Recognition thresholds (miss rate vs false accepts)
	- Tracking parameters (stability vs responsiveness)
	- Quality gate (thoroughness vs coverage)
	"""
	        return response

	    # Identity engine
	    if any(w in message_lower for w in ["evidence", "consensus", "identity engine", "fusion"]):
	        identity = tool_results.get("check_identity_engine", {})
	        response = f"""## 🧠 Identity Evidence Engine Status

	Configuration:
	- Min evidence count: {identity.get('min_evidence_count', 'N/A')} (frames needed before decision)
	- Consensus threshold: {identity.get('consensus_threshold', 'N/A')} (agreement required)
	- Cooldown: {identity.get('cooldown_seconds', 'N/A')}s (prevents duplicate events)

	Performance:
	- Pending identities: {identity.get('pending_identities', 0)} (accumulating evidence)
	- Resolved last minute: {identity.get('resolved_last_minute', 0)}
	- Avg evidence per resolution: {identity.get('avg_evidence_per_resolution', 'N/A')}
	- Avg confidence at resolution: {identity.get('avg_confidence_at_resolution', 'N/A')}
	- Cooldown-blocked events: {identity.get('cooldown_blocked_events', 0)}

	### How It Works

	The Identity Evidence Engine prevents false attendance by requiring multiple consistent recognition results before confirming identity:

	1. Evidence Collection: Each recognition attempt adds a weighted sample to the track's evidence pool
	2. Quality Weighting: Higher quality face crops contribute more weight
	3. Consensus Check: Once `min_evidence_count` samples exist AND the top identity has ≥ `consensus_threshold` agreement, identity is resolved
	4. Cooldown: After resolution, the same identity won't trigger another event for `cooldown_seconds`

	### Tuning Guide

	| Scenario | Adjustment |
	|----------|-----------|
	| Missing attendance (too strict) | Lower `consensus_threshold` to 0.65 or `min_evidence_count` to 2 |
	| False attendance (too lenient) | Raise `consensus_threshold` to 0.85 or `min_evidence_count` to 5 |
	| Duplicate events | Increase `cooldown_seconds` |
	| Slow resolution | Decrease `min_evidence_count` or `recognize_every_n_frames` |
	"""
	        return response

	    # General status/overview
	    status = tool_results.get("get_pipeline_status", {})
	    metrics = tool_results.get("get_metrics", {})

	    def health(key, ok, bad):
	        """Render a component flag; 'N/A' when the status tool did not report it."""
	        if key not in status:
	            return "N/A"
	        return ok if status[key] else bad

	    # Only claim "Stopped" when a status was actually reported; a missing
	    # status means the tool was not run for this query.
	    if 'status' not in status:
	        pipeline_state = '⚪ Unknown (status not queried)'
	    elif status['status'] == 'running':
	        pipeline_state = '🟢 Running'
	    else:
	        pipeline_state = '🔴 Stopped'

	    faiss_loaded = f"✅ Loaded ({status.get('faiss_index_vectors', 0)} vectors)"

	    response = f"""## 📍 Edge Node Status Overview

	Pipeline: {pipeline_state}
	Uptime: {status.get('uptime_hours', 'N/A')} hours
	FPS: {metrics.get('smartclass_edge_fps', 'N/A')}

	### Component Health
	| Component | Status |
	|-----------|--------|
	| Camera | {health('camera_active', '✅ Active', '❌ Inactive')} |
	| Detector (SCRFD) | {health('detector_loaded', '✅ Loaded', '❌ Not loaded')} |
	| Recognizer (MFN) | {health('recognizer_loaded', '✅ Loaded', '❌ Not loaded')} |
	| FAISS Index | {health('faiss_index_loaded', faiss_loaded, '❌ Not loaded')} |
	| Redis | {health('redis_connected', '✅ Connected', '⚠️ Disconnected (using fallback)')} |

	### Key Metrics
	- Detection latency: {metrics.get('smartclass_detection_latency_ms', 'N/A')}ms
	- Recognition latency: {metrics.get('smartclass_recognition_latency_ms', 'N/A')}ms
	- Active tracks: {metrics.get('smartclass_tracking_active_tracks', 'N/A')}
	- CPU temp: {metrics.get('smartclass_cpu_temp_celsius', 'N/A')}°C
	- Memory: {metrics.get('smartclass_memory_usage_percent', 'N/A')}%
	- Offline queue: {metrics.get('smartclass_offline_queue_depth', 'N/A')} events

	### Recent Activity
	- Last recognition: {status.get('last_recognition_event', 'N/A')}
	- Faces detected: {metrics.get('smartclass_faces_detected_total', 'N/A')}
	- Faces recognized: {metrics.get('smartclass_faces_recognized_total', 'N/A')}

	What would you like to investigate further? I can help with:
	- 🔧 Pipeline performance optimization
	- 🔍 Detection/recognition troubleshooting
	- 🎯 Tracking issues
	- 📡 Event transmission debugging
	- ⚙️ Configuration tuning
	"""
	    return response


	def agent_respond(message: str, history: list) -> Generator:
	    """Run one chat turn, yielding the growing history after every step.

	    The turn proceeds as follows:
	      1. The user message is appended and the history is yielded.
	      2. Each tool chosen by determine_tools_to_call() is executed; its
	         JSON-rendered output is appended as an assistant message carrying
	         a ``metadata`` title, and the history is yielded again.
	      3. The reply from generate_response() is streamed three
	         space-separated words at a time into a final assistant message,
	         yielding after each chunk.

	    The caller's ``history`` list is never mutated; new lists are built.
	    """
	    conversation = history + [{"role": "user", "content": message}]
	    yield conversation

	    # Run every selected tool, surfacing each result in the chat as it lands.
	    collected = {}
	    for name in determine_tools_to_call(message):
	        spec = TOOLS[name]
	        output = spec["fn"]()
	        collected[name] = output

	        # default=str keeps non-JSON-native values serializable.
	        rendered = json.dumps(output, indent=2, default=str)
	        tool_message = {
	            "role": "assistant",
	            "content": f"```json\n{rendered}\n```",
	            "metadata": {"title": f"🔧 {name}: {spec['description']}"}
	        }
	        conversation = conversation + [tool_message]
	        yield conversation
	        time.sleep(0.3)  # Simulate tool execution time

	    answer = generate_response(message, collected)

	    # Stream the answer: grow the last message in place, three words per step.
	    reply = {"role": "assistant", "content": ""}
	    conversation = conversation + [reply]
	    tokens = answer.split(" ")
	    for start in range(0, len(tokens), 3):
	        reply["content"] += " ".join(tokens[start:start + 3]) + " "
	        yield conversation
	        time.sleep(0.02)


	# ============================================================================
	# METRICS POLLING FUNCTIONS
	# ============================================================================

	def poll_metrics():
	    """Return a fresh metrics snapshot from the simulated edge node."""
	    return edge_state.get_metrics()

	def poll_status():
	    """Return the simulated edge node's current pipeline status dict."""
	    status_snapshot = edge_state.get_pipeline_status()
	    return status_snapshot

	def reset_simulation():
	    """Re-randomize the shared edge node state and return a confirmation message."""
	    confirmation = "✅ Edge node simulation reset with new random state."
	    edge_state.reset()
	    return confirmation

	def inject_fault(fault_type):
	    """Overwrite parts of the shared edge node state to mimic a fault.

	    Recognized ``fault_type`` values: "Low FPS", "Redis Disconnect",
	    "No Recognitions", "High Temperature" and "Tracking Chaos". Returns a
	    message describing what was injected, or "Unknown fault type" for any
	    other value (in which case nothing is modified).
	    """
	    if fault_type == "Low FPS":
	        # Collapse the frame rate and inflate both model latencies.
	        edge_state.fps = round(random.uniform(1, 5), 1)
	        edge_state.detection_latency = round(random.uniform(80, 150), 1)
	        edge_state.recognition_latency = round(random.uniform(150, 300), 1)
	        return "⚠️ Injected: Low FPS condition (heavy processing load)"

	    if fault_type == "Redis Disconnect":
	        # Drop the connection and back up the offline queue.
	        edge_state.redis_connected = False
	        edge_state.offline_queue_depth = random.randint(50, 200)
	        return "⚠️ Injected: Redis disconnected, events queuing locally"

	    if fault_type == "No Recognitions":
	        # Zero out recognitions and raise the match threshold in the config.
	        edge_state.recognition_confidence_avg = 0.25
	        edge_state.faces_recognized = 0
	        edge_state.config["faiss"]["cosine_threshold"] = 0.85
	        return "⚠️ Injected: Recognition threshold too high, no matches"

	    if fault_type == "High Temperature":
	        # Hot CPU together with a reduced frame rate.
	        edge_state.cpu_temp = round(random.uniform(82, 95), 1)
	        edge_state.fps = round(random.uniform(5, 12), 1)
	        return "⚠️ Injected: CPU thermal throttling"

	    if fault_type == "Tracking Chaos":
	        # Many active tracks plus a lowered tracker threshold in the config.
	        edge_state.active_tracks = random.randint(30, 50)
	        edge_state.config["tracker"]["track_thresh"] = 0.2
	        return "⚠️ Injected: Too many ghost tracks (low threshold)"

	    return "Unknown fault type"


	# ============================================================================
	# GRADIO UI
	# ============================================================================

	# Extra stylesheet passed to demo.launch(css=...): rules for the
	# .tool-output and .metric-card classes.
	CUSTOM_CSS = """
	.tool-output {
	font-size: 0.85em;
	max-height: 200px;
	overflow-y: auto;
	}
	.metric-card {
	border: 1px solid #e0e0e0;
	border-radius: 8px;
	padding: 16px;
	margin: 8px;
	}
	"""

	# Sample questions offered in the chat tab through gr.Examples, with the
	# message textbox as the target input.
	EXAMPLES = [
	"The edge node is only getting 2 FPS, how do I optimize?",
	"Faces are detected but never recognized",
	"I'm seeing duplicate attendance events for the same student",
	"The offline queue keeps growing, events aren't being sent",
	"Show me the current system status",
	"How do I tune the identity evidence engine?",
	"Tracking IDs keep switching between the same person",
	"What's the recommended configuration for a Raspberry Pi 5?",
	"Explain the quality gate and how to adjust it",
	"The CPU temperature is too high, what can I do?",
	]

	# Build the application UI; the Blocks context is bound to `demo`, which is
	# launched at the end of the file.
	with gr.Blocks(title="Edge Node Diagnostics Agent") as demo:

	# Page heading and a short description of what the agent is for.
	gr.Markdown("""
	# 🔬 Edge Node Diagnostics Agent
	### SmartClass Face Recognition Pipeline Troubleshooter

	Specialized AI assistant for diagnosing and optimizing face recognition pipelines
	running on classroom edge devices (Raspberry Pi 5 / Intel NUC).
	""")

	# Sidebar: fault-injection controls followed by a static pipeline quick reference.
	with gr.Sidebar():
	gr.Markdown("### 🎛️ Simulation Controls")
	gr.Markdown("Inject faults to test diagnostics capabilities")

	# The choices are the exact strings inject_fault() compares against.
	fault_dropdown = gr.Dropdown(
	choices=["Low FPS", "Redis Disconnect", "No Recognitions",
	"High Temperature", "Tracking Chaos"],
	label="Fault Type",
	value="Low FPS"
	)
	inject_btn = gr.Button("💥 Inject Fault", variant="secondary")
	reset_btn = gr.Button("🔄 Reset Node", variant="secondary")
	# Read-only box that displays the message returned by either handler below.
	fault_status = gr.Textbox(label="Status", interactive=False, lines=2)

	# Inject: dropdown value -> inject_fault -> status box.
	inject_btn.click(inject_fault, fault_dropdown, fault_status)
	# Reset: no inputs; reset_simulation's message goes to the status box.
	reset_btn.click(reset_simulation, outputs=fault_status)

	gr.Markdown("---")
	gr.Markdown("### 📌 Quick Reference")
	gr.Markdown("""
	Pipeline Flow:
	1. 📷 Camera Capture
	2. 👤 Face Detection (SCRFD)
	3. 🎯 Face Tracking (ByteTrack)
	4. ✅ Quality Assessment
	5. 🔄 Face Alignment
	6. 🧠 Embedding (MobileFaceNet)
	7. 🔍 FAISS Search
	8. ⚖️ Evidence Engine
	9. 📡 Event Transmission
	""")

	# Tab container holding the chat, metrics, configuration, architecture and help tabs.
	with gr.Tabs():
	# ============ CHAT TAB ============
	with gr.Tab("💬 Diagnostics Chat", id="chat"):
	# Conversation display; agent_respond yields updated histories into it.
	chatbot = gr.Chatbot(
	height=550,
	label="Edge Diagnostics Agent",
	placeholder="<center><h3>🔬 Edge Node Diagnostics Agent</h3><p>Ask me about pipeline performance, detection issues, recognition failures, tracking problems, or event delivery.</p></center>",
	)

	# Input row: message box, Send button, Clear button.
	with gr.Row():
	msg = gr.Textbox(
	placeholder="e.g., 'The edge node is only getting 2 FPS, how do I optimize?'",
	show_label=False,
	scale=8,
	container=False,
	autofocus=True,
	)
	submit_btn = gr.Button("Send", variant="primary", scale=1, min_width=100)
	clear_btn = gr.Button("Clear", scale=1, min_width=80)

	# Canned questions from EXAMPLES, targeting the message box.
	gr.Examples(
	examples=EXAMPLES,
	inputs=msg,
	label="💡 Try these questions:",
	)

	# Event handlers
	# Pressing Enter and clicking Send are wired identically: run agent_respond
	# with (message, history) writing to the chatbot, then blank the message box.
	msg.submit(
	agent_respond, [msg, chatbot], chatbot
	).then(lambda: "", outputs=msg)

	submit_btn.click(
	agent_respond, [msg, chatbot], chatbot
	).then(lambda: "", outputs=msg)

	# Clear replaces the chat history with an empty list.
	clear_btn.click(lambda: [], outputs=chatbot)

	# ============ METRICS TAB ============
	with gr.Tab("📊 Live Metrics", id="metrics"):
	gr.Markdown("### Real-time Edge Node Metrics")
	gr.Markdown("Metrics update every 5 seconds (simulated)")

	# Active timer with a value of 5, matching the interval stated above.
	timer = gr.Timer(value=5, active=True)

	with gr.Row():
	metrics_json = gr.JSON(label="📈 Prometheus Metrics", scale=2)
	status_json = gr.JSON(label="🔋 Pipeline Status", scale=1)

	# Two independent tick listeners, one per JSON panel.
	timer.tick(poll_metrics, outputs=metrics_json)
	timer.tick(poll_status, outputs=status_json)

	# ============ CONFIG TAB ============
	with gr.Tab("⚙️ Configuration", id="config"):
	gr.Markdown("### Edge Node Configuration")
	gr.Markdown("Current `config/edge_config.yaml` settings")

	# NOTE: every slider's initial value below is a hard-coded literal; none is
	# read from edge_state.config, so the sliders do not reflect changes made
	# elsewhere (for example by fault injection) until the user applies them.
	with gr.Row():
	with gr.Column():
	gr.Markdown("#### Detection Settings")
	det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05,
	label="Detection Confidence Threshold")
	det_nms = gr.Slider(0.1, 0.9, value=0.4, step=0.05,
	label="NMS Threshold")
	det_skip = gr.Slider(1, 10, value=3, step=1,
	label="Detect Every N Frames")

	gr.Markdown("#### Recognition Settings")
	rec_threshold = gr.Slider(0.2, 0.8, value=0.45, step=0.05,
	label="FAISS Cosine Threshold")
	rec_skip = gr.Slider(1, 15, value=5, step=1,
	label="Recognize Every N Frames")

	with gr.Column():
	gr.Markdown("#### Tracking Settings")
	track_thresh = gr.Slider(0.1, 0.9, value=0.5, step=0.05,
	label="Track Birth Threshold")
	track_buffer = gr.Slider(10, 120, value=30, step=5,
	label="Track Buffer (frames)")
	match_thresh = gr.Slider(0.3, 1.0, value=0.8, step=0.05,
	label="Match Threshold")

	gr.Markdown("#### Identity Engine")
	min_evidence = gr.Slider(1, 10, value=3, step=1,
	label="Min Evidence Count")
	consensus = gr.Slider(0.5, 1.0, value=0.75, step=0.05,
	label="Consensus Threshold")
	cooldown = gr.Slider(30, 600, value=300, step=30,
	label="Cooldown (seconds)")

	# Apply button plus a read-only box for the confirmation message.
	with gr.Row():
	save_btn = gr.Button("💾 Apply Configuration", variant="primary")
	config_status = gr.Textbox(label="Status", interactive=False)
	def apply_config(det_c, det_n, det_s, rec_t, rec_s, tr_t, tr_b, m_t, me, cs, cd):
	    """Write the Configuration-tab values into ``edge_state.config``.

	    Arguments, in order: detector confidence threshold, detector NMS
	    threshold, detect-every-N-frames, FAISS cosine threshold,
	    recognize-every-N-frames, tracker track threshold, tracker buffer,
	    tracker match threshold, minimum evidence count, consensus threshold,
	    and cooldown in seconds.

	    Returns a confirmation string stamped with the current time (HH:MM:SS).
	    """
	    # (path to the config section, key inside that section, new value),
	    # applied strictly in this order.
	    assignments = (
	        (("models", "detector"), "conf_threshold", det_c),
	        (("models", "detector"), "nms_threshold", det_n),
	        (("pipeline",), "detect_every_n_frames", det_s),
	        (("faiss",), "cosine_threshold", rec_t),
	        (("pipeline",), "recognize_every_n_frames", rec_s),
	        (("tracker",), "track_thresh", tr_t),
	        (("tracker",), "track_buffer", tr_b),
	        (("tracker",), "match_thresh", m_t),
	        (("identity",), "min_evidence_count", me),
	        (("identity",), "consensus_threshold", cs),
	        (("identity",), "cooldown_seconds", cd),
	    )
	    for section_path, key, value in assignments:
	        section = edge_state.config
	        for name in section_path:
	            section = section[name]
	        section[key] = value

	    applied_at = datetime.now().strftime('%H:%M:%S')
	    return f"✅ Configuration applied at {applied_at}"

	# Wire the Apply button: the inputs are listed in the same order as
	# apply_config's parameters, and its return value goes to the status box.
	save_btn.click(
	apply_config,
	[det_conf, det_nms, det_skip, rec_threshold, rec_skip,
	track_thresh, track_buffer, match_thresh,
	min_evidence, consensus, cooldown],
	config_status
	)

	# ============ ARCHITECTURE TAB ============
	with gr.Tab("📐 Architecture", id="arch"):
	    # Static reference page: pipeline diagram, key source files, and the
	    # sections of the edge configuration file.
	    # Pipes in the text are written unescaped: backslash-pipe is not a valid
	    # Python string escape, and Markdown treats an escaped pipe as literal
	    # text rather than a table-cell delimiter, so the table would not render.
	    gr.Markdown("""
	### SmartClass Edge Node Pipeline Architecture

	```
	┌─────────────────────────────────────────────────────────────────────┐
	│ EDGE NODE (Pi 5 / NUC) │
	├─────────────────────────────────────────────────────────────────────┤
	│ │
	│ ┌──────────┐ ┌───────────┐ ┌──────────┐ ┌──────────────┐ │
	│ │ Camera │───▶│ Detector │───▶│ Tracker │───▶│ Quality │ │
	│ │ Capture │ │ (SCRFD) │ │(ByteTrack)│ │ Assessment │ │
	│ └──────────┘ └───────────┘ └──────────┘ └──────────────┘ │
	│ │ │
	│ ▼ │
	│ ┌──────────────┐ ┌───────────┐ ┌──────────┐ ┌─────────┐ │
	│ │ Identity │◀───│ FAISS │◀───│ Embedding│◀───│ Face │ │
	│ │ Evidence │ │ Search │ │(MobileFN)│ │ Align │ │
	│ │ Engine │ └───────────┘ └──────────┘ └─────────┘ │
	│ └──────────────┘ │
	│ │ │
	│ ▼ │
	│ ┌──────────────────────────────────────────────────────────────┐ │
	│ │ Event Transmission │ │
	│ │ ┌─────────┐ ┌──────────────┐ ┌─────────────────────┐ │ │
	│ │ │ Redis │──▶│ HTTP Fallback│──▶│ SQLite Offline Q │ │ │
	│ │ │ Streams │ │ (REST API) │ │ (retry background) │ │ │
	│ │ └─────────┘ └──────────────┘ └─────────────────────┘ │ │
	│ └──────────────────────────────────────────────────────────────┘ │
	│ │
	│ ┌──────────────────────────────────────────────────────────────┐ │
	│ │ Prometheus Metrics (port 9100) │ │
	│ │ FPS | Latency | Tracks | Temp | Memory | Queue | Events │ │
	│ └──────────────────────────────────────────────────────────────┘ │
	│ │
	└─────────────────────────────────────────────────────────────────────┘
	```

	### Key Source Files

	| File | Purpose |
	|------|---------|
	| `src/main_pipeline.py` | CameraPipeline orchestration |
	| `src/capture.py` | Camera frame capture (USB/RTSP) |
	| `src/detector.py` | SCRFD face detection |
	| `src/tracker.py` | ByteTrack face tracking |
	| `src/face_quality.py` | FIQA quality scoring |
	| `src/face_align.py` | Face alignment & warping |
	| `src/face_enhance.py` | MSRCR Retinex / CLAHE |
	| `src/recognizer.py` | MobileFaceNet embedding |
	| `src/ensemble_recognizer.py` | AdaFace + MFN dual model |
	| `src/identity_evidence_engine.py` | Global identity fusion |
	| `src/event_sender.py` | Event transmission logic |
	| `src/edge_metrics.py` | Prometheus metrics export |

	### Configuration

	All settings in `config/edge_config.yaml`. Key sections:
	- `pipeline` — Frame processing rates, idle mode
	- `models` — Model paths, thresholds, execution providers
	- `faiss` — Index path, similarity threshold, GPU mode
	- `tracker` — ByteTrack parameters
	- `identity` — Evidence engine tuning
	- `cameras` — Camera sources
	- `redis` / `http_fallback` / `offline_queue` — Event delivery
	- `quality_gate` — Face crop quality requirements
	""")

	# ============ HELP TAB ============
	with gr.Tab("❓ Help", id="help"):
	    # Static usage guide: how to ask questions, common scenarios, the
	    # simulation controls, and a reference list of metrics and file paths.
	    # Pipes in the text are written unescaped: backslash-pipe is not a valid
	    # Python string escape, and Markdown treats an escaped pipe as literal
	    # text rather than a table-cell delimiter, so the table would not render.
	    gr.Markdown("""
	### How to Use This Agent

	1. Ask Natural Language Questions
	Simply describe your problem in the chat. The agent will:
	- Automatically query relevant diagnostic tools
	- Show raw data from the edge node (collapsible)
	- Provide analysis and recommendations

	2. Common Troubleshooting Scenarios

	| Problem | What to Ask |
	|---------|-------------|
	| Low FPS | "Pipeline is slow, only getting X FPS" |
	| No recognitions | "Faces are detected but never recognized" |
	| Duplicate events | "Same student marked present twice" |
	| Events not sending | "Offline queue keeps growing" |
	| Tracking issues | "Track IDs keep switching" |
	| False detections | "Detecting faces where there are none" |
	| Model errors | "Detection model won't load" |
	| Overheating | "CPU temperature is too high" |

	3. Simulation Controls (Sidebar)
	Use the sidebar to inject faults and test the agent's diagnostic capabilities:
	- Low FPS — Simulates heavy processing load
	- Redis Disconnect — Simulates network failure
	- No Recognitions — Simulates overly strict thresholds
	- High Temperature — Simulates thermal throttling
	- Tracking Chaos — Simulates too many ghost tracks

	4. Configuration Tab
	Adjust edge node parameters in real-time and see how they affect the system.

	5. Metrics Tab
	View live (simulated) Prometheus metrics from the edge node.

	---

	### Technical Reference

	Prometheus Metrics (port 9100):
	```
	smartclass_edge_fps # Overall frames per second
	smartclass_detection_latency_ms # Time per detection
	smartclass_recognition_latency_ms # Time per recognition
	smartclass_tracking_active_tracks # Active face tracks
	smartclass_cpu_temp_celsius # CPU temperature
	smartclass_memory_usage_percent # RAM usage
	smartclass_offline_queue_depth # Queued events
	smartclass_events_sent_total # Successfully sent events
	smartclass_events_failed_total # Failed event deliveries
	smartclass_faces_detected_total # Total detections
	smartclass_faces_recognized_total # Successful recognitions
	smartclass_recognition_confidence_avg # Average match confidence
	```

	Key Config Paths:
	- `config/edge_config.yaml` → All settings
	- `models/scrfd_2.5g.onnx` → Detection model
	- `models/mobilefacenet_v2.onnx` → Recognition model
	- `data/student_index.faiss` → FAISS vector index
	- `data/student_map.json` → ID → student name mapping
	- `data/offline_queue.db` → SQLite fallback queue
	""")

	# Start the app, passing the Soft theme and the CUSTOM_CSS stylesheet to launch().
	demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)