# balaji958685's picture
# Add Edge Node Diagnostics Agent app
# d7e8289 verified
"""
Edge Node Diagnostics Agent - SmartClass Face Recognition Pipeline Troubleshooter
A conversational AI agent specialized in troubleshooting and optimizing
face recognition pipelines running on classroom edge devices (Raspberry Pi 5 / Intel NUC).
"""
import gradio as gr
import json
import random
import time
from datetime import datetime, timedelta
from typing import Generator
# ============================================================================
# SYSTEM PROMPT
# ============================================================================
# Persona and operating instructions for the diagnostics agent: describes the
# pipeline stages, the five-step diagnosis procedure, and the config/metrics
# locations the agent should refer to.
# NOTE(review): not referenced by the rule-based agent logic visible in this
# section of the file - confirm where (or whether) it is consumed.
SYSTEM_PROMPT = """You are the Edge Node Diagnostics Agent for the SmartClass attendance system.
You specialize in troubleshooting and optimizing the face recognition pipeline that runs on classroom edge devices (Raspberry Pi 5 or Intel NUC).
You understand the complete pipeline: video capture β†’ face detection (SCRFD) β†’ tracking (ByteTrack) β†’ quality assessment β†’ face alignment β†’ embedding extraction (MobileFaceNet/AdaFace) β†’ FAISS similarity search β†’ identity evidence engine β†’ event transmission (Redis Streams / HTTP fallback / SQLite queue).
When diagnosing issues:
1. Always start by checking relevant metrics and system status
2. Identify the specific pipeline stage where the problem occurs
3. Provide concrete configuration changes with exact YAML paths
4. Explain the root cause in technical but accessible terms
5. Suggest both immediate fixes and long-term optimizations
Key configuration file: config/edge_config.yaml
Key metrics endpoint: http://localhost:9100/metrics (Prometheus format)
You have access to diagnostic tools that can query the edge node's state. Use them proactively."""
# ============================================================================
# SIMULATED EDGE NODE STATE
# ============================================================================
class EdgeNodeState:
    """Simulates the state of an edge node for demonstration purposes.

    All values are randomly generated; nothing here talks to real hardware.
    ``reset()`` rolls a fresh baseline, and the ``get_*`` / ``check_*``
    methods return plain dict snapshots built from that baseline plus a
    little per-call random jitter.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Roll a new random baseline and restore the default config and error log."""
        self.fps = round(random.uniform(18, 28), 1)
        self.detection_latency = round(random.uniform(12, 45), 1)
        self.recognition_latency = round(random.uniform(25, 80), 1)
        self.active_tracks = random.randint(0, 15)
        self.cpu_temp = round(random.uniform(55, 78), 1)
        self.memory_usage = round(random.uniform(45, 85), 1)
        self.offline_queue_depth = random.randint(0, 50)
        self.events_sent = random.randint(100, 5000)
        self.events_failed = random.randint(0, 25)
        self.faces_detected = random.randint(50, 2000)
        # A face has to be detected before it can be recognized, so the
        # recognized counter is capped at the detected counter. faces_detected
        # is always >= 50, so the randint range below is never empty.
        self.faces_recognized = random.randint(30, min(1800, self.faces_detected))
        self.recognition_confidence_avg = round(random.uniform(0.65, 0.95), 3)
        self.faiss_index_size = random.randint(50, 500)
        self.model_loaded = True
        # Three-in-four chance of starting with Redis connected.
        self.redis_connected = random.choice([True, True, True, False])
        self.camera_active = True
        self.last_event_time = datetime.now() - timedelta(seconds=random.randint(1, 300))
        # Configuration
        self.config = {
            "pipeline": {
                "detect_every_n_frames": 3,
                "recognize_every_n_frames": 5,
                "idle_mode": {"enabled": True, "timeout_seconds": 30, "fps_target": 5}
            },
            "models": {
                "detector": {
                    "path": "models/scrfd_2.5g.onnx",
                    "conf_threshold": 0.5,
                    "nms_threshold": 0.4,
                    "input_size": [640, 640],
                    "execution_provider": "CPUExecutionProvider"
                },
                "recognizer": {
                    "path": "models/mobilefacenet_v2.onnx",
                    "embedding_dim": 512,
                    "input_size": [112, 112]
                }
            },
            "faiss": {
                "index_path": "data/student_index.faiss",
                "cosine_threshold": 0.45,
                "nprobe": 10,
                "use_gpu": False
            },
            "tracker": {
                "track_thresh": 0.5,
                "track_buffer": 30,
                "match_thresh": 0.8,
                "max_time_lost": 60
            },
            "identity": {
                "min_evidence_count": 3,
                "consensus_threshold": 0.75,
                "cooldown_seconds": 300,
                "max_evidence_age_seconds": 10
            },
            "cameras": [
                {"id": "cam_01", "source": "/dev/video0", "type": "usb",
                 "resolution": [1280, 720], "fps": 30}
            ],
            "redis": {
                "host": "192.168.1.100",
                "port": 6379,
                "stream_key": "smartclass:attendance",
                "max_retries": 3
            },
            "http_fallback": {
                "url": "https://api.smartclass.edu/v1/attendance",
                "timeout_seconds": 5,
                "batch_size": 10
            },
            "offline_queue": {
                "db_path": "data/offline_queue.db",
                "max_size_mb": 100,
                "retry_interval_seconds": 60
            },
            "quality_gate": {
                "min_face_size": 80,
                "min_quality_score": 0.3,
                "max_blur_score": 100,
                "min_brightness": 40,
                "max_brightness": 220
            }
        }
        # Simulated error log
        self.errors = [
            {"timestamp": "2024-03-15T10:23:45", "level": "WARN",
             "component": "recognizer", "message": "Low quality face crop rejected (quality=0.18)"},
            {"timestamp": "2024-03-15T10:24:12", "level": "ERROR",
             "component": "event_sender", "message": "Redis connection timeout after 5s"},
            {"timestamp": "2024-03-15T10:25:01", "level": "WARN",
             "component": "tracker", "message": "Track #47 lost after 60 frames without update"},
        ]

    def get_metrics(self):
        """Return a metrics snapshot keyed by ``smartclass_*`` metric names."""
        # Slightly vary metrics each call
        return {
            "smartclass_edge_fps": round(self.fps + random.uniform(-2, 2), 1),
            "smartclass_detection_latency_ms": round(self.detection_latency + random.uniform(-5, 5), 1),
            "smartclass_recognition_latency_ms": round(self.recognition_latency + random.uniform(-10, 10), 1),
            "smartclass_tracking_active_tracks": max(0, self.active_tracks + random.randint(-3, 3)),
            "smartclass_cpu_temp_celsius": round(self.cpu_temp + random.uniform(-2, 2), 1),
            "smartclass_memory_usage_percent": round(self.memory_usage + random.uniform(-3, 3), 1),
            "smartclass_offline_queue_depth": max(0, self.offline_queue_depth + random.randint(-5, 5)),
            "smartclass_events_sent_total": self.events_sent + random.randint(0, 10),
            "smartclass_events_failed_total": self.events_failed + random.randint(0, 2),
            "smartclass_faces_detected_total": self.faces_detected + random.randint(0, 20),
            "smartclass_faces_recognized_total": self.faces_recognized + random.randint(0, 15),
            "smartclass_recognition_confidence_avg": round(self.recognition_confidence_avg + random.uniform(-0.05, 0.05), 3),
        }

    def get_pipeline_status(self):
        """Return overall pipeline/component health as a dict.

        ``uptime_hours`` is drawn at random on every call, so it is not
        monotonic between calls.
        """
        return {
            "status": "running" if self.camera_active else "stopped",
            "uptime_hours": round(random.uniform(1, 72), 1),
            "camera_active": self.camera_active,
            "detector_loaded": self.model_loaded,
            "recognizer_loaded": self.model_loaded,
            "faiss_index_loaded": True,
            "faiss_index_vectors": self.faiss_index_size,
            "redis_connected": self.redis_connected,
            "current_fps": round(self.fps + random.uniform(-2, 2), 1),
            "active_tracks": max(0, self.active_tracks + random.randint(-2, 2)),
            "last_recognition_event": self.last_event_time.isoformat(),
        }

    def get_error_log(self, n=10):
        """Return the last ``n`` error-log entries (oldest first).

        ``n <= 0`` yields an empty list. A bare ``self.errors[-n:]`` would
        return the *entire* log for ``n == 0`` because ``-0 == 0``.
        """
        if n <= 0:
            return []
        return self.errors[-n:]

    def check_detection(self):
        """Return detector settings from the config plus simulated performance figures."""
        detector_cfg = self.config["models"]["detector"]
        return {
            "model": "SCRFD-2.5G",
            "model_path": detector_cfg["path"],
            "model_loaded": self.model_loaded,
            "execution_provider": detector_cfg["execution_provider"],
            "conf_threshold": detector_cfg["conf_threshold"],
            "nms_threshold": detector_cfg["nms_threshold"],
            "input_size": detector_cfg["input_size"],
            "avg_latency_ms": round(self.detection_latency + random.uniform(-3, 3), 1),
            "detections_per_frame_avg": round(random.uniform(0.5, 5.0), 2),
            "false_positive_rate_estimate": round(random.uniform(0.01, 0.08), 3),
        }

    def check_recognition(self):
        """Return recognizer/FAISS settings from the config plus simulated statistics."""
        recognizer_cfg = self.config["models"]["recognizer"]
        return {
            "model": "MobileFaceNet-v2",
            "model_path": recognizer_cfg["path"],
            "model_loaded": self.model_loaded,
            "embedding_dim": recognizer_cfg["embedding_dim"],
            "faiss_threshold": self.config["faiss"]["cosine_threshold"],
            "avg_latency_ms": round(self.recognition_latency + random.uniform(-5, 5), 1),
            "avg_confidence": round(self.recognition_confidence_avg, 3),
            "quality_gate_rejection_rate": round(random.uniform(0.1, 0.4), 2),
            "faces_below_threshold": random.randint(5, 50),
            "index_vectors": self.faiss_index_size,
        }

    def check_tracking(self):
        """Return tracker settings from the config plus simulated track statistics."""
        tracker_cfg = self.config["tracker"]
        return {
            "algorithm": "ByteTrack",
            "active_tracks": max(0, self.active_tracks + random.randint(-2, 2)),
            "track_thresh": tracker_cfg["track_thresh"],
            "track_buffer": tracker_cfg["track_buffer"],
            "match_thresh": tracker_cfg["match_thresh"],
            "tracks_created_last_minute": random.randint(2, 20),
            "tracks_lost_last_minute": random.randint(1, 10),
            "avg_track_duration_frames": random.randint(30, 200),
            "id_switch_rate": round(random.uniform(0.01, 0.1), 3),
        }

    def check_event_transmission(self):
        """Return event-delivery state: Redis connection, counters and queue depth."""
        return {
            "redis_connected": self.redis_connected,
            "redis_host": self.config["redis"]["host"],
            "redis_stream_key": self.config["redis"]["stream_key"],
            "events_sent_total": self.events_sent,
            "events_failed_total": self.events_failed,
            "offline_queue_depth": self.offline_queue_depth,
            "http_fallback_available": True,
            "last_successful_event": self.last_event_time.isoformat(),
            "avg_delivery_latency_ms": round(random.uniform(5, 50), 1),
        }

    def check_identity_engine(self):
        """Return identity-engine settings from the config plus simulated counters."""
        identity_cfg = self.config["identity"]
        return {
            "min_evidence_count": identity_cfg["min_evidence_count"],
            "consensus_threshold": identity_cfg["consensus_threshold"],
            "cooldown_seconds": identity_cfg["cooldown_seconds"],
            "pending_identities": random.randint(0, 5),
            "resolved_last_minute": random.randint(0, 15),
            "avg_evidence_per_resolution": round(random.uniform(3, 8), 1),
            "avg_confidence_at_resolution": round(random.uniform(0.78, 0.95), 3),
            "cooldown_blocked_events": random.randint(0, 10),
        }
# Global edge node state: one module-level simulated node shared by the
# diagnostic tool lambdas, the polling helpers and the fault-injection helper.
edge_state = EdgeNodeState()
# ============================================================================
# DIAGNOSTIC TOOLS
# ============================================================================
# Registry of diagnostic tools: tool name -> {"description", "fn"}.
# Every "fn" is a zero-argument lambda that looks up the module-level
# `edge_state` at call time, so it always sees the current simulated state.
TOOLS = {
    tool_name: {"description": summary, "fn": runner}
    for tool_name, summary, runner in (
        (
            "get_pipeline_status",
            "Get overall pipeline status including component health",
            lambda: edge_state.get_pipeline_status(),
        ),
        (
            "get_metrics",
            "Get current Prometheus metrics from the edge node",
            lambda: edge_state.get_metrics(),
        ),
        (
            "check_detection",
            "Check face detection subsystem (SCRFD model status and performance)",
            lambda: edge_state.check_detection(),
        ),
        (
            "check_recognition",
            "Check face recognition subsystem (MobileFaceNet, FAISS index)",
            lambda: edge_state.check_recognition(),
        ),
        (
            "check_tracking",
            "Check ByteTrack face tracking subsystem",
            lambda: edge_state.check_tracking(),
        ),
        (
            "check_event_transmission",
            "Check event delivery (Redis Streams, HTTP fallback, offline queue)",
            lambda: edge_state.check_event_transmission(),
        ),
        (
            "check_identity_engine",
            "Check identity evidence engine status",
            lambda: edge_state.check_identity_engine(),
        ),
        (
            "get_error_log",
            "Get recent error log entries",
            lambda: edge_state.get_error_log(),
        ),
        (
            "get_config",
            "Get current edge_config.yaml configuration",
            lambda: edge_state.config,
        ),
    )
}
# ============================================================================
# AGENT LOGIC
# ============================================================================
def determine_tools_to_call(message: str) -> list:
    """Pick the diagnostic tools to run for a user message.

    The lower-cased message is matched by plain substring search against
    each rule's trigger words. Matching rules contribute their tools in
    rule order; a tool requested by several rules is kept once, at its first
    position. If no rule matches, pipeline status and metrics are returned.
    """
    text = message.lower()
    # (trigger substrings, tools to run) - evaluated top to bottom.
    routing = (
        # Status/overview queries
        (("status", "overview", "health", "running", "working"),
         ("get_pipeline_status", "get_metrics")),
        # FPS/Performance queries
        (("fps", "slow", "performance", "speed", "bottleneck", "optimize", "latency"),
         ("get_metrics", "get_pipeline_status")),
        # Detection queries
        (("detect", "scrfd", "no faces", "false positive", "bounding box", "camera"),
         ("check_detection",)),
        # Recognition queries
        (("recogni", "embed", "faiss", "identity", "threshold", "mobilefacenet",
          "confidence", "not recognized", "never recognized"),
         ("check_recognition",)),
        # Tracking queries
        (("track", "bytetrack", "id switch", "lost", "duplicate"),
         ("check_tracking",)),
        # Event/transmission queries
        (("event", "redis", "queue", "transmit", "send", "offline", "http", "delivery"),
         ("check_event_transmission",)),
        # Identity engine queries
        (("evidence", "consensus", "cooldown", "identity engine", "fusion"),
         ("check_identity_engine",)),
        # Error queries
        (("error", "fail", "crash", "exception", "log", "warn"),
         ("get_error_log",)),
        # Config queries
        (("config", "setting", "parameter", "yaml", "threshold"),
         ("get_config",)),
    )
    # A dict doubles as an insertion-ordered set, giving order-preserving dedup.
    selected = {}
    for triggers, tool_names in routing:
        if any(trigger in text for trigger in triggers):
            for tool_name in tool_names:
                selected.setdefault(tool_name, None)
    # Default: get metrics and status if nothing specific matched
    if not selected:
        return ["get_pipeline_status", "get_metrics"]
    return list(selected)
def generate_response(message: str, tool_results: dict) -> str:
    """Generate a diagnostic response based on user query and tool results.

    The lower-cased message is checked against keyword groups in a fixed
    order (performance, recognition failure, tracking, event transmission,
    error log, configuration, identity engine). The first group that matches
    produces the answer; if none match, a general status overview is
    returned.

    Args:
        message: The user's question.
        tool_results: Mapping of tool name to that tool's output. Missing
            tools are tolerated; their values fall back to defaults.

    Returns:
        A Markdown-formatted report.
    """
    message_lower = message.lower()
    # FPS optimization
    if any(w in message_lower for w in ["fps", "slow", "performance", "speed", "optimize"]):
        metrics = tool_results.get("get_metrics", {})
        fps = metrics.get("smartclass_edge_fps", 0)
        det_lat = metrics.get("smartclass_detection_latency_ms", 0)
        rec_lat = metrics.get("smartclass_recognition_latency_ms", 0)
        response = f"""## πŸ“Š Pipeline Performance Analysis
**Current FPS:** {fps}
**Detection Latency:** {det_lat}ms
**Recognition Latency:** {rec_lat}ms
### Bottleneck Analysis
"""
        if fps < 10:
            # The template talks about *recognition* cost, so it must be fed the
            # recognition latency (it previously received the detection latency).
            response += """⚠️ **Critical: FPS is dangerously low.** The pipeline cannot keep up with the camera frame rate.
### Root Cause Analysis
The most common causes of low FPS on edge devices:
1. **Recognition is the bottleneck** ({rec_lat}ms per call) β€” it runs on every Nth frame
2. **Detection is too frequent** β€” check `pipeline.detect_every_n_frames`
3. **Too many active tracks** consuming resources
4. **Thermal throttling** β€” check CPU temperature
### Recommended Actions
#### Immediate Fixes:
```yaml
# config/edge_config.yaml
pipeline:
detect_every_n_frames: 5 # Increase from current 3
recognize_every_n_frames: 8 # Increase from current 5
idle_mode:
enabled: true
timeout_seconds: 15 # Enter idle faster
fps_target: 3
```
#### If still slow:
- Switch to SCRFD-0.5G model (smaller, faster): `models.detector.path: models/scrfd_0.5g.onnx`
- Enable execution provider optimization: Set `models.detector.execution_provider: TensorrtExecutionProvider` if TensorRT is available
- Reduce detection input size: `models.detector.input_size: [320, 320]` (trade detection range for speed)
""".format(rec_lat=rec_lat)
        elif fps < 20:
            response += f"""⚑ **Warning: FPS is below optimal.** Target is 24+ FPS for smooth tracking.
### Recommendations:
1. **Increase frame skip ratios:**
- `pipeline.detect_every_n_frames`: 4 (from 3)
- `pipeline.recognize_every_n_frames`: 6 (from 5)
2. **Enable idle mode** to conserve resources when no faces present:
```yaml
pipeline.idle_mode.enabled: true
pipeline.idle_mode.timeout_seconds: 20
```
3. **Quality gate is likely too lenient** β€” accepting too many low-quality crops increases recognition load:
```yaml
quality_gate.min_quality_score: 0.4 # Raise from 0.3
quality_gate.min_face_size: 100 # Raise from 80
```
"""
        else:
            response += f"""βœ… **FPS is healthy ({fps}).** Pipeline is performing within expected parameters.
Current performance breakdown:
- Detection: {det_lat}ms (target: <30ms) {'βœ…' if det_lat < 30 else '⚠️'}
- Recognition: {rec_lat}ms (target: <50ms) {'βœ…' if rec_lat < 50 else '⚠️'}
No immediate optimization needed, but you can further improve by:
- Enabling GPU acceleration if hardware supports it
- Using the ensemble recognizer only for edge cases
"""
        return response
    # Faces detected but not recognized
    if any(phrase in message_lower for phrase in ["not recognized", "never recognized", "detected but"]):
        rec = tool_results.get("check_recognition", {})
        response = f"""## πŸ” Recognition Failure Diagnosis
**Recognition System Status:**
- Model loaded: {'βœ…' if rec.get('model_loaded') else '❌'}
- FAISS index vectors: {rec.get('index_vectors', 'N/A')}
- Avg confidence: {rec.get('avg_confidence', 'N/A')}
- Quality gate rejection rate: {rec.get('quality_gate_rejection_rate', 'N/A')}
- Faces below threshold: {rec.get('faces_below_threshold', 'N/A')}
### Diagnosis Steps:
#### 1. Check FAISS Index
The index has **{rec.get('index_vectors', 0)} vectors**. If this is 0 or doesn't match your enrolled student count, the index needs to be rebuilt.
```bash
# Verify index
python -c "import faiss; idx = faiss.read_index('data/student_index.faiss'); print(f'Vectors: {{idx.ntotal}}')"
```
#### 2. Cosine Threshold Too High
Current threshold: **{rec.get('faiss_threshold', 'N/A')}**
If this is above 0.5, many legitimate matches will be rejected. Recommended range: **0.35-0.50**.
```yaml
# config/edge_config.yaml
faiss:
cosine_threshold: 0.40 # Lower from {rec.get('faiss_threshold', 0.45)}
```
#### 3. Quality Gate Rejecting Too Many Crops
Rejection rate is **{rec.get('quality_gate_rejection_rate', 0)*100:.0f}%**. If this is above 30%, the quality gate may be too strict.
```yaml
# Relax quality requirements
quality_gate:
min_quality_score: 0.25 # Lower from 0.3
min_face_size: 64 # Lower from 80
max_blur_score: 150 # Raise from 100
```
#### 4. Embedding Model Issues
- Verify model produces valid 512-dim embeddings (not zeros/NaN)
- Check if face alignment landmarks are accurate
- Verify input normalization matches training preprocessing
#### 5. Student Map Verification
Ensure enrolled students have embeddings in the index:
```bash
python scripts/verify_enrollments.py --index data/student_index.faiss --map data/student_map.json
```
"""
        return response
    # Tracking issues
    if any(w in message_lower for w in ["track", "id switch", "duplicate", "same person different"]):
        tracking = tool_results.get("check_tracking", {})
        response = f"""## 🎯 Tracking Diagnosis (ByteTrack)
**Current Status:**
- Active tracks: {tracking.get('active_tracks', 'N/A')}
- ID switch rate: {tracking.get('id_switch_rate', 'N/A')}
- Avg track duration: {tracking.get('avg_track_duration_frames', 'N/A')} frames
- Tracks created/minute: {tracking.get('tracks_created_last_minute', 'N/A')}
- Tracks lost/minute: {tracking.get('tracks_lost_last_minute', 'N/A')}
### Common Tracking Issues:
#### ID Switching (same person gets multiple IDs)
ID switch rate: **{tracking.get('id_switch_rate', 0)}**
{'⚠️ High ID switch rate!' if tracking.get('id_switch_rate', 0) > 0.05 else 'βœ… ID switch rate is acceptable.'}
**Fix:** Increase track buffer and lower match threshold:
```yaml
tracker:
track_buffer: 45 # Increase from {tracking.get('track_buffer', 30)} (keeps lost tracks longer)
match_thresh: 0.7 # Lower from {tracking.get('match_thresh', 0.8)} (easier re-association)
track_thresh: 0.4 # Lower from {tracking.get('track_thresh', 0.5)} (lower birth threshold)
```
#### Tracks Dying Too Quickly
If `tracks_lost_last_minute` >> `tracks_created_last_minute`:
- Increase `tracker.track_buffer` (frames before track termination)
- Lower detection confidence to maintain tracking continuity
- Check if detection frame skip is too aggressive
#### Ghost Tracks (tracking non-existent faces)
If `active_tracks` is much higher than actual people:
- Raise `tracker.track_thresh` to 0.6+
- Raise detection `conf_threshold` to reduce false positives
- Add minimum detection size filter
"""
        return response
    # Event transmission
    if any(w in message_lower for w in ["event", "redis", "queue", "transmit", "offline", "delivery"]):
        events = tool_results.get("check_event_transmission", {})
        response = f"""## πŸ“‘ Event Transmission Status
**Connection Status:**
- Redis connected: {'βœ…' if events.get('redis_connected') else '❌ DISCONNECTED'}
- Last successful event: {events.get('last_successful_event', 'N/A')}
- Offline queue depth: {events.get('offline_queue_depth', 0)}
**Delivery Stats:**
- Events sent: {events.get('events_sent_total', 0)}
- Events failed: {events.get('events_failed_total', 0)}
- Avg delivery latency: {events.get('avg_delivery_latency_ms', 0)}ms
"""
        if not events.get('redis_connected'):
            response += """### ❌ Redis Connection Issue
The edge node cannot reach the Redis server. Events are being queued locally.
**Immediate Actions:**
1. Check Redis server connectivity:
```bash
redis-cli -h {host} -p 6379 ping
```
2. Check network connectivity:
```bash
ping {host}
nc -zv {host} 6379
```
3. Check Redis configuration:
```yaml
redis:
host: {host}
port: 6379
password: <check if auth required>
max_retries: 5
retry_delay_seconds: 2
```
4. HTTP fallback should be handling events in the meantime.
**Queue Management:**
The offline queue has {queue} events pending. Once Redis reconnects, these will be drained automatically.
If the queue exceeds 100MB, oldest events are dropped. Monitor via:
```
smartclass_offline_queue_depth
```
""".format(host=events.get('redis_host', '192.168.1.100'),
           queue=events.get('offline_queue_depth', 0))
        else:
            # The conditional wording is computed up front: str.format() cannot
            # evaluate Python expressions inside replacement fields, so embedding
            # them in the template raised KeyError whenever Redis was connected.
            latency = events.get('avg_delivery_latency_ms', 0)
            # max(..., 1) guards against division by zero when nothing was sent.
            failure_ratio = events.get('events_failed_total', 0) / max(events.get('events_sent_total', 1), 1)
            latency_note = 'within normal range' if latency < 30 else 'elevated - check network'
            failure_note = 'βœ…' if failure_ratio < 0.02 else '⚠️ Above 2% threshold'
            response += f"""### βœ… Redis Connected
Event delivery is functioning normally.
**Performance Notes:**
- Delivery latency {latency}ms is {latency_note}
- Failure rate: {failure_ratio * 100:.1f}% {failure_note}
"""
        return response
    # Error log analysis
    if any(w in message_lower for w in ["error", "fail", "crash", "log", "warn"]):
        errors = tool_results.get("get_error_log", [])
        response = "## πŸ“‹ Recent Error Log\n\n"
        if errors:
            for e in errors:
                icon = "πŸ”΄" if e["level"] == "ERROR" else "🟑" if e["level"] == "WARN" else "πŸ”΅"
                response += f"{icon} **[{e['timestamp']}]** `{e['component']}`: {e['message']}\n\n"
            response += "\n### Analysis\n\n"
            error_components = [e["component"] for e in errors if e["level"] == "ERROR"]
            warn_components = [e["component"] for e in errors if e["level"] == "WARN"]
            if "event_sender" in error_components:
                response += "- **Event Sender errors** detected β€” check Redis connectivity and HTTP fallback\n"
            if "recognizer" in warn_components:
                response += "- **Recognizer warnings** β€” quality gate is rejecting crops. Consider relaxing thresholds or improving lighting\n"
            if "tracker" in warn_components:
                response += "- **Tracker warnings** β€” tracks being lost. May need to increase `track_buffer`\n"
        else:
            response += "βœ… No recent errors found. System is operating normally.\n"
        return response
    # Configuration query
    if any(w in message_lower for w in ["config", "setting", "parameter", "yaml"]):
        config = tool_results.get("get_config", {})
        response = f"""## βš™οΈ Current Configuration
```yaml
# config/edge_config.yaml (key sections)
pipeline:
detect_every_n_frames: {config.get('pipeline', {}).get('detect_every_n_frames', 'N/A')}
recognize_every_n_frames: {config.get('pipeline', {}).get('recognize_every_n_frames', 'N/A')}
idle_mode:
enabled: {config.get('pipeline', {}).get('idle_mode', {}).get('enabled', 'N/A')}
timeout_seconds: {config.get('pipeline', {}).get('idle_mode', {}).get('timeout_seconds', 'N/A')}
models:
detector:
path: {config.get('models', {}).get('detector', {}).get('path', 'N/A')}
conf_threshold: {config.get('models', {}).get('detector', {}).get('conf_threshold', 'N/A')}
nms_threshold: {config.get('models', {}).get('detector', {}).get('nms_threshold', 'N/A')}
recognizer:
path: {config.get('models', {}).get('recognizer', {}).get('path', 'N/A')}
embedding_dim: {config.get('models', {}).get('recognizer', {}).get('embedding_dim', 'N/A')}
faiss:
cosine_threshold: {config.get('faiss', {}).get('cosine_threshold', 'N/A')}
nprobe: {config.get('faiss', {}).get('nprobe', 'N/A')}
tracker:
track_thresh: {config.get('tracker', {}).get('track_thresh', 'N/A')}
track_buffer: {config.get('tracker', {}).get('track_buffer', 'N/A')}
match_thresh: {config.get('tracker', {}).get('match_thresh', 'N/A')}
identity:
min_evidence_count: {config.get('identity', {}).get('min_evidence_count', 'N/A')}
consensus_threshold: {config.get('identity', {}).get('consensus_threshold', 'N/A')}
cooldown_seconds: {config.get('identity', {}).get('cooldown_seconds', 'N/A')}
quality_gate:
min_face_size: {config.get('quality_gate', {}).get('min_face_size', 'N/A')}
min_quality_score: {config.get('quality_gate', {}).get('min_quality_score', 'N/A')}
```
Need help tuning any specific parameter? Ask about:
- Detection thresholds (sensitivity vs false positives)
- Recognition thresholds (miss rate vs false accepts)
- Tracking parameters (stability vs responsiveness)
- Quality gate (thoroughness vs coverage)
"""
        return response
    # Identity engine
    if any(w in message_lower for w in ["evidence", "consensus", "identity engine", "fusion"]):
        identity = tool_results.get("check_identity_engine", {})
        response = f"""## 🧠 Identity Evidence Engine Status
**Configuration:**
- Min evidence count: {identity.get('min_evidence_count', 'N/A')} (frames needed before decision)
- Consensus threshold: {identity.get('consensus_threshold', 'N/A')} (agreement required)
- Cooldown: {identity.get('cooldown_seconds', 'N/A')}s (prevents duplicate events)
**Performance:**
- Pending identities: {identity.get('pending_identities', 0)} (accumulating evidence)
- Resolved last minute: {identity.get('resolved_last_minute', 0)}
- Avg evidence per resolution: {identity.get('avg_evidence_per_resolution', 'N/A')}
- Avg confidence at resolution: {identity.get('avg_confidence_at_resolution', 'N/A')}
- Cooldown-blocked events: {identity.get('cooldown_blocked_events', 0)}
### How It Works
The Identity Evidence Engine prevents false attendance by requiring multiple consistent recognition results before confirming identity:
1. **Evidence Collection**: Each recognition attempt adds a weighted sample to the track's evidence pool
2. **Quality Weighting**: Higher quality face crops contribute more weight
3. **Consensus Check**: Once `min_evidence_count` samples exist AND the top identity has β‰₯ `consensus_threshold` agreement, identity is resolved
4. **Cooldown**: After resolution, the same identity won't trigger another event for `cooldown_seconds`
### Tuning Guide
| Scenario | Adjustment |
|----------|-----------|
| Missing attendance (too strict) | Lower `consensus_threshold` to 0.65 or `min_evidence_count` to 2 |
| False attendance (too lenient) | Raise `consensus_threshold` to 0.85 or `min_evidence_count` to 5 |
| Duplicate events | Increase `cooldown_seconds` |
| Slow resolution | Decrease `min_evidence_count` or `recognize_every_n_frames` |
"""
        return response
    # General status/overview
    status = tool_results.get("get_pipeline_status", {})
    metrics = tool_results.get("get_metrics", {})
    response = f"""## πŸ“ Edge Node Status Overview
**Pipeline:** {'🟒 Running' if status.get('status') == 'running' else 'πŸ”΄ Stopped'}
**Uptime:** {status.get('uptime_hours', 'N/A')} hours
**FPS:** {metrics.get('smartclass_edge_fps', 'N/A')}
### Component Health
| Component | Status |
|-----------|--------|
| Camera | {'βœ… Active' if status.get('camera_active') else '❌ Inactive'} |
| Detector (SCRFD) | {'βœ… Loaded' if status.get('detector_loaded') else '❌ Not loaded'} |
| Recognizer (MFN) | {'βœ… Loaded' if status.get('recognizer_loaded') else '❌ Not loaded'} |
| FAISS Index | {'βœ… Loaded (' + str(status.get('faiss_index_vectors', 0)) + ' vectors)' if status.get('faiss_index_loaded') else '❌ Not loaded'} |
| Redis | {'βœ… Connected' if status.get('redis_connected') else '⚠️ Disconnected (using fallback)'} |
### Key Metrics
- Detection latency: {metrics.get('smartclass_detection_latency_ms', 'N/A')}ms
- Recognition latency: {metrics.get('smartclass_recognition_latency_ms', 'N/A')}ms
- Active tracks: {metrics.get('smartclass_tracking_active_tracks', 'N/A')}
- CPU temp: {metrics.get('smartclass_cpu_temp_celsius', 'N/A')}Β°C
- Memory: {metrics.get('smartclass_memory_usage_percent', 'N/A')}%
- Offline queue: {metrics.get('smartclass_offline_queue_depth', 'N/A')} events
### Recent Activity
- Last recognition: {status.get('last_recognition_event', 'N/A')}
- Faces detected: {metrics.get('smartclass_faces_detected_total', 'N/A')}
- Faces recognized: {metrics.get('smartclass_faces_recognized_total', 'N/A')}
What would you like to investigate further? I can help with:
- πŸ”§ Pipeline performance optimization
- πŸ” Detection/recognition troubleshooting
- 🎯 Tracking issues
- πŸ“‘ Event transmission debugging
- βš™οΈ Configuration tuning
"""
    return response
def agent_respond(message: str, history: list) -> Generator:
    """Run one agent turn, yielding the growing chat history after each step.

    Steps: echo the user message, run every tool selected for the message
    (each result is shown as its own assistant entry with a title), then
    stream the final answer three space-separated words at a time. The
    caller's ``history`` list is never mutated; new lists are built instead.
    """
    # Echo the user's message first.
    conversation = history + [{"role": "user", "content": message}]
    yield conversation

    # Run each selected tool and surface its output in the chat.
    gathered = {}
    for name in determine_tools_to_call(message):
        spec = TOOLS[name]
        output = spec["fn"]()
        gathered[name] = output
        rendered = json.dumps(output, indent=2, default=str)
        tool_entry = {
            "role": "assistant",
            "content": f"```json\n{rendered}\n```",
            "metadata": {"title": f"πŸ”§ {name}: {spec['description']}"},
        }
        conversation = conversation + [tool_entry]
        yield conversation
        time.sleep(0.3)  # Simulate tool execution time

    # Build the answer, then stream it into a single trailing assistant entry.
    answer = generate_response(message, gathered)
    reply = {"role": "assistant", "content": ""}
    conversation = conversation + [reply]
    tokens = answer.split(" ")
    for start in range(0, len(tokens), 3):
        reply["content"] += " ".join(tokens[start:start + 3]) + " "
        yield conversation
        time.sleep(0.02)
# ============================================================================
# METRICS POLLING FUNCTIONS
# ============================================================================
def poll_metrics():
    """Return a fresh metrics snapshot from the shared simulated edge node."""
    return edge_state.get_metrics()
def poll_status():
    """Return the current pipeline status of the shared simulated edge node."""
    snapshot = edge_state.get_pipeline_status()
    return snapshot
def reset_simulation():
    """Re-roll the shared simulated node's state and return a confirmation message."""
    confirmation = "βœ… Edge node simulation reset with new random state."
    edge_state.reset()
    return confirmation
def inject_fault(fault_type):
    """Inject a simulated fault into the edge node for testing diagnostics.

    Each recognised fault overwrites a few fields of the shared ``edge_state``
    (and, for some faults, entries of ``edge_state.config``).

    Args:
        fault_type: One of "Low FPS", "Redis Disconnect", "No Recognitions",
            "High Temperature" or "Tracking Chaos". Matching is exact and
            case-sensitive.

    Returns:
        A status message describing the injected fault, or
        "Unknown fault type" when ``fault_type`` is not recognised (in which
        case no state is touched).
    """
    if fault_type == "Low FPS":
        # Low frame rate together with inflated per-stage latencies.
        edge_state.fps = round(random.uniform(1, 5), 1)
        edge_state.detection_latency = round(random.uniform(80, 150), 1)
        edge_state.recognition_latency = round(random.uniform(150, 300), 1)
        return "⚠️ Injected: Low FPS condition (heavy processing load)"

    if fault_type == "Redis Disconnect":
        # Mark Redis as down and give the local queue a backlog.
        edge_state.redis_connected = False
        edge_state.offline_queue_depth = random.randint(50, 200)
        return "⚠️ Injected: Redis disconnected, events queuing locally"

    if fault_type == "No Recognitions":
        # Average confidence (0.25) sits far below the raised threshold (0.85).
        edge_state.recognition_confidence_avg = 0.25
        edge_state.faces_recognized = 0
        edge_state.config["faiss"]["cosine_threshold"] = 0.85
        return "⚠️ Injected: Recognition threshold too high, no matches"

    if fault_type == "High Temperature":
        edge_state.cpu_temp = round(random.uniform(82, 95), 1)
        edge_state.fps = round(random.uniform(5, 12), 1)
        return "⚠️ Injected: CPU thermal throttling"

    if fault_type == "Tracking Chaos":
        # Many active tracks combined with a low track-birth threshold.
        edge_state.active_tracks = random.randint(30, 50)
        edge_state.config["tracker"]["track_thresh"] = 0.2
        return "⚠️ Injected: Too many ghost tracks (low threshold)"

    return "Unknown fault type"
# ============================================================================
# GRADIO UI
# ============================================================================
# Extra stylesheet passed to demo.launch(css=...) at the bottom of the file.
#   .tool-output - smaller text in a box capped at 200px that scrolls vertically
#   .metric-card - bordered, rounded, padded card
# NOTE(review): no component in the visible UI code assigns either class via
# elem_classes, so these rules may currently be unused - confirm.
CUSTOM_CSS = """
.tool-output {
font-size: 0.85em;
max-height: 200px;
overflow-y: auto;
}
.metric-card {
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 16px;
margin: 8px;
}
"""
# Sample questions offered under the chat box; gr.Examples copies the chosen
# one into the message textbox.
EXAMPLES = [
"The edge node is only getting 2 FPS, how do I optimize?",
"Faces are detected but never recognized",
"I'm seeing duplicate attendance events for the same student",
"The offline queue keeps growing, events aren't being sent",
"Show me the current system status",
"How do I tune the identity evidence engine?",
"Tracking IDs keep switching between the same person",
"What's the recommended configuration for a Raspberry Pi 5?",
"Explain the quality gate and how to adjust it",
"The CPU temperature is too high, what can I do?",
]
# Top-level UI: a title, a sidebar with simulation controls, and five tabs
# (chat, live metrics, configuration, architecture, help).
with gr.Blocks(title="Edge Node Diagnostics Agent") as demo:
    gr.Markdown("""
    # 🔬 Edge Node Diagnostics Agent
    ### SmartClass Face Recognition Pipeline Troubleshooter

    Specialized AI assistant for diagnosing and optimizing face recognition pipelines
    running on classroom edge devices (Raspberry Pi 5 / Intel NUC).
    """)

    # Sidebar: fault injection / reset for the simulated node, plus a cheat sheet.
    with gr.Sidebar():
        gr.Markdown("### 🎛️ Simulation Controls")
        gr.Markdown("*Inject faults to test diagnostics capabilities*")
        fault_dropdown = gr.Dropdown(
            choices=["Low FPS", "Redis Disconnect", "No Recognitions",
                     "High Temperature", "Tracking Chaos"],
            label="Fault Type",
            value="Low FPS"
        )
        inject_btn = gr.Button("💥 Inject Fault", variant="secondary")
        reset_btn = gr.Button("🔄 Reset Node", variant="secondary")
        fault_status = gr.Textbox(label="Status", interactive=False, lines=2)

        # Both buttons report their outcome in the same status box.
        inject_btn.click(inject_fault, fault_dropdown, fault_status)
        reset_btn.click(reset_simulation, outputs=fault_status)

        gr.Markdown("---")
        gr.Markdown("### 📌 Quick Reference")
        gr.Markdown("""
        **Pipeline Flow:**

        1. 📷 Camera Capture
        2. 👤 Face Detection (SCRFD)
        3. 🎯 Face Tracking (ByteTrack)
        4. ✅ Quality Assessment
        5. 🔄 Face Alignment
        6. 🧠 Embedding (MobileFaceNet)
        7. 🔍 FAISS Search
        8. ⚖️ Evidence Engine
        9. 📡 Event Transmission
        """)

    with gr.Tabs():
        # ============ CHAT TAB ============
        with gr.Tab("💬 Diagnostics Chat", id="chat"):
            chatbot = gr.Chatbot(
                height=550,
                label="Edge Diagnostics Agent",
                placeholder=(
                    "<center><h3>🔬 Edge Node Diagnostics Agent</h3>"
                    "<p>Ask me about pipeline performance, detection issues, "
                    "recognition failures, tracking problems, or event delivery.</p>"
                    "</center>"
                ),
            )
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="e.g., 'The edge node is only getting 2 FPS, how do I optimize?'",
                    show_label=False,
                    scale=8,
                    container=False,
                    autofocus=True,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1, min_width=100)
                clear_btn = gr.Button("Clear", scale=1, min_width=80)
            gr.Examples(
                examples=EXAMPLES,
                inputs=msg,
                label="💡 Try these questions:",
            )

            # Event handlers: Enter and the Send button do the same thing;
            # the textbox is emptied once the agent has finished responding.
            msg.submit(
                agent_respond, [msg, chatbot], chatbot
            ).then(lambda: "", outputs=msg)
            submit_btn.click(
                agent_respond, [msg, chatbot], chatbot
            ).then(lambda: "", outputs=msg)
            clear_btn.click(lambda: [], outputs=chatbot)

        # ============ METRICS TAB ============
        with gr.Tab("📊 Live Metrics", id="metrics"):
            gr.Markdown("### Real-time Edge Node Metrics")
            gr.Markdown("*Metrics update every 5 seconds (simulated)*")
            timer = gr.Timer(value=5, active=True)
            with gr.Row():
                metrics_json = gr.JSON(label="📈 Prometheus Metrics", scale=2)
                status_json = gr.JSON(label="🔋 Pipeline Status", scale=1)
            # Each tick refreshes both JSON panels.
            timer.tick(poll_metrics, outputs=metrics_json)
            timer.tick(poll_status, outputs=status_json)

        # ============ CONFIG TAB ============
        with gr.Tab("⚙️ Configuration", id="config"):
            gr.Markdown("### Edge Node Configuration")
            gr.Markdown("*Current `config/edge_config.yaml` settings*")
            # NOTE(review): slider defaults are hard-coded rather than read from
            # edge_state.config, so they can drift from the live config (the
            # "No Recognitions" fault sets cosine_threshold to 0.85, above this
            # slider's 0.8 maximum) - confirm whether they should be
            # initialised from the config instead.
            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### Detection Settings")
                    det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05,
                                         label="Detection Confidence Threshold")
                    det_nms = gr.Slider(0.1, 0.9, value=0.4, step=0.05,
                                        label="NMS Threshold")
                    det_skip = gr.Slider(1, 10, value=3, step=1,
                                         label="Detect Every N Frames")
                    gr.Markdown("#### Recognition Settings")
                    rec_threshold = gr.Slider(0.2, 0.8, value=0.45, step=0.05,
                                              label="FAISS Cosine Threshold")
                    rec_skip = gr.Slider(1, 15, value=5, step=1,
                                         label="Recognize Every N Frames")
                with gr.Column():
                    gr.Markdown("#### Tracking Settings")
                    track_thresh = gr.Slider(0.1, 0.9, value=0.5, step=0.05,
                                             label="Track Birth Threshold")
                    track_buffer = gr.Slider(10, 120, value=30, step=5,
                                             label="Track Buffer (frames)")
                    match_thresh = gr.Slider(0.3, 1.0, value=0.8, step=0.05,
                                             label="Match Threshold")
                    gr.Markdown("#### Identity Engine")
                    min_evidence = gr.Slider(1, 10, value=3, step=1,
                                             label="Min Evidence Count")
                    consensus = gr.Slider(0.5, 1.0, value=0.75, step=0.05,
                                          label="Consensus Threshold")
                    cooldown = gr.Slider(30, 600, value=300, step=30,
                                         label="Cooldown (seconds)")
            with gr.Row():
                save_btn = gr.Button("💾 Apply Configuration", variant="primary")
                config_status = gr.Textbox(label="Status", interactive=False)

            def apply_config(det_c, det_n, det_s, rec_t, rec_s, tr_t, tr_b, m_t, me, cs, cd):
                """Copy the slider values into ``edge_state.config``.

                Arguments arrive in the order of the input list passed to
                ``save_btn.click`` below: detector confidence, NMS threshold,
                detect interval, FAISS cosine threshold, recognize interval,
                track birth threshold, track buffer, match threshold, minimum
                evidence count, consensus threshold, cooldown seconds.

                Returns:
                    A confirmation message stamped with the local time.
                """
                edge_state.config["models"]["detector"]["conf_threshold"] = det_c
                edge_state.config["models"]["detector"]["nms_threshold"] = det_n
                edge_state.config["pipeline"]["detect_every_n_frames"] = det_s
                edge_state.config["faiss"]["cosine_threshold"] = rec_t
                edge_state.config["pipeline"]["recognize_every_n_frames"] = rec_s
                edge_state.config["tracker"]["track_thresh"] = tr_t
                edge_state.config["tracker"]["track_buffer"] = tr_b
                edge_state.config["tracker"]["match_thresh"] = m_t
                edge_state.config["identity"]["min_evidence_count"] = me
                edge_state.config["identity"]["consensus_threshold"] = cs
                edge_state.config["identity"]["cooldown_seconds"] = cd
                return f"✅ Configuration applied at {datetime.now().strftime('%H:%M:%S')}"

            save_btn.click(
                apply_config,
                [det_conf, det_nms, det_skip, rec_threshold, rec_skip,
                 track_thresh, track_buffer, match_thresh,
                 min_evidence, consensus, cooldown],
                config_status
            )

        # ============ ARCHITECTURE TAB ============
        with gr.Tab("📐 Architecture", id="arch"):
            gr.Markdown("""
            ### SmartClass Edge Node Pipeline Architecture

            ```
            ┌──────────────────────────────────────────────────────────────────────┐
            │                        EDGE NODE (Pi 5 / NUC)                        │
            ├──────────────────────────────────────────────────────────────────────┤
            │                                                                      │
            │  ┌──────────┐    ┌───────────┐    ┌───────────┐    ┌──────────────┐  │
            │  │  Camera  │───▶│ Detector  │───▶│  Tracker  │───▶│   Quality    │  │
            │  │  Capture │    │  (SCRFD)  │    │(ByteTrack)│    │  Assessment  │  │
            │  └──────────┘    └───────────┘    └───────────┘    └──────────────┘  │
            │                                                            │         │
            │                                                            ▼         │
            │  ┌──────────────┐    ┌───────────┐    ┌──────────┐    ┌─────────┐    │
            │  │   Identity   │◀───│   FAISS   │◀───│ Embedding│◀───│  Face   │    │
            │  │   Evidence   │    │  Search   │    │(MobileFN)│    │  Align  │    │
            │  │    Engine    │    └───────────┘    └──────────┘    └─────────┘    │
            │  └──────────────┘                                                    │
            │         │                                                            │
            │         ▼                                                            │
            │  ┌────────────────────────────────────────────────────────────────┐  │
            │  │                       Event Transmission                       │  │
            │  │    ┌─────────┐   ┌──────────────┐   ┌─────────────────────┐    │  │
            │  │    │  Redis  │──▶│ HTTP Fallback│──▶│  SQLite Offline Q   │    │  │
            │  │    │ Streams │   │  (REST API)  │   │ (retry background)  │    │  │
            │  │    └─────────┘   └──────────────┘   └─────────────────────┘    │  │
            │  └────────────────────────────────────────────────────────────────┘  │
            │                                                                      │
            │  ┌────────────────────────────────────────────────────────────────┐  │
            │  │                 Prometheus Metrics (port 9100)                 │  │
            │  │    FPS | Latency | Tracks | Temp | Memory | Queue | Events     │  │
            │  └────────────────────────────────────────────────────────────────┘  │
            │                                                                      │
            └──────────────────────────────────────────────────────────────────────┘
            ```

            ### Key Source Files

            | File | Purpose |
            |------|---------|
            | `src/main_pipeline.py` | CameraPipeline orchestration |
            | `src/capture.py` | Camera frame capture (USB/RTSP) |
            | `src/detector.py` | SCRFD face detection |
            | `src/tracker.py` | ByteTrack face tracking |
            | `src/face_quality.py` | FIQA quality scoring |
            | `src/face_align.py` | Face alignment & warping |
            | `src/face_enhance.py` | MSRCR Retinex / CLAHE |
            | `src/recognizer.py` | MobileFaceNet embedding |
            | `src/ensemble_recognizer.py` | AdaFace + MFN dual model |
            | `src/identity_evidence_engine.py` | Global identity fusion |
            | `src/event_sender.py` | Event transmission logic |
            | `src/edge_metrics.py` | Prometheus metrics export |

            ### Configuration

            All settings in `config/edge_config.yaml`. Key sections:
            - `pipeline` — Frame processing rates, idle mode
            - `models` — Model paths, thresholds, execution providers
            - `faiss` — Index path, similarity threshold, GPU mode
            - `tracker` — ByteTrack parameters
            - `identity` — Evidence engine tuning
            - `cameras` — Camera sources
            - `redis` / `http_fallback` / `offline_queue` — Event delivery
            - `quality_gate` — Face crop quality requirements
            """)

        # ============ HELP TAB ============
        with gr.Tab("❓ Help", id="help"):
            # The blank line before `---` matters: without it Markdown reads the
            # preceding sentence as a setext heading instead of drawing a rule.
            gr.Markdown("""
            ### How to Use This Agent

            **1. Ask Natural Language Questions**

            Simply describe your problem in the chat. The agent will:
            - Automatically query relevant diagnostic tools
            - Show raw data from the edge node (collapsible)
            - Provide analysis and recommendations

            **2. Common Troubleshooting Scenarios**

            | Problem | What to Ask |
            |---------|-------------|
            | Low FPS | "Pipeline is slow, only getting X FPS" |
            | No recognitions | "Faces are detected but never recognized" |
            | Duplicate events | "Same student marked present twice" |
            | Events not sending | "Offline queue keeps growing" |
            | Tracking issues | "Track IDs keep switching" |
            | False detections | "Detecting faces where there are none" |
            | Model errors | "Detection model won't load" |
            | Overheating | "CPU temperature is too high" |

            **3. Simulation Controls (Sidebar)**

            Use the sidebar to inject faults and test the agent's diagnostic capabilities:
            - **Low FPS** — Simulates heavy processing load
            - **Redis Disconnect** — Simulates network failure
            - **No Recognitions** — Simulates overly strict thresholds
            - **High Temperature** — Simulates thermal throttling
            - **Tracking Chaos** — Simulates too many ghost tracks

            **4. Configuration Tab**

            Adjust edge node parameters in real-time and see how they affect the system.

            **5. Metrics Tab**

            View live (simulated) Prometheus metrics from the edge node.

            ---

            ### Technical Reference

            **Prometheus Metrics (port 9100):**

            ```
            smartclass_edge_fps # Overall frames per second
            smartclass_detection_latency_ms # Time per detection
            smartclass_recognition_latency_ms # Time per recognition
            smartclass_tracking_active_tracks # Active face tracks
            smartclass_cpu_temp_celsius # CPU temperature
            smartclass_memory_usage_percent # RAM usage
            smartclass_offline_queue_depth # Queued events
            smartclass_events_sent_total # Successfully sent events
            smartclass_events_failed_total # Failed event deliveries
            smartclass_faces_detected_total # Total detections
            smartclass_faces_recognized_total # Successful recognitions
            smartclass_recognition_confidence_avg # Average match confidence
            ```

            **Key Config Paths:**

            - `config/edge_config.yaml` → All settings
            - `models/scrfd_2.5g.onnx` → Detection model
            - `models/mobilefacenet_v2.onnx` → Recognition model
            - `data/student_index.faiss` → FAISS vector index
            - `data/student_map.json` → ID → student name mapping
            - `data/offline_queue.db` → SQLite fallback queue
            """)
# Start the app with the Soft theme and the custom stylesheet defined above.
# NOTE(review): passing theme/css to launch() rather than to gr.Blocks()
# presumably targets a Gradio release where launch() accepts them - confirm
# against the pinned Gradio version.
demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)