"""
Edge Node Diagnostics Agent - SmartClass Face Recognition Pipeline Troubleshooter

A conversational AI agent specialized in troubleshooting and optimizing
face recognition pipelines running on classroom edge devices (Raspberry Pi 5 / Intel NUC).
"""

import gradio as gr
import json
import random
import time
from datetime import datetime, timedelta
from typing import Generator

# ============================================================================
# SYSTEM PROMPT
# ============================================================================

SYSTEM_PROMPT = """You are the Edge Node Diagnostics Agent for the SmartClass attendance system.
You specialize in troubleshooting and optimizing the face recognition pipeline that runs on
classroom edge devices (Raspberry Pi 5 or Intel NUC).

You understand the complete pipeline:
video capture → face detection (SCRFD) → tracking (ByteTrack) → quality assessment →
face alignment → embedding extraction (MobileFaceNet/AdaFace) → FAISS similarity search →
identity evidence engine → event transmission (Redis Streams / HTTP fallback / SQLite queue).

When diagnosing issues:
1. Always start by checking relevant metrics and system status
2. Identify the specific pipeline stage where the problem occurs
3. Provide concrete configuration changes with exact YAML paths
4. Explain the root cause in technical but accessible terms
5. Suggest both immediate fixes and long-term optimizations

Key configuration file: config/edge_config.yaml
Key metrics endpoint: http://localhost:9100/metrics (Prometheus format)

You have access to diagnostic tools that can query the edge node's state.
Use them proactively."""

# ============================================================================
# SIMULATED EDGE NODE STATE
# ============================================================================


class EdgeNodeState:
    """Simulates the state of an edge node for demonstration purposes.

    Every metric is randomly generated by ``reset()``; the ``get_*`` and
    ``check_*`` accessors return plain dicts (or a list, for the error log)
    with a little random jitter applied on each call so repeated polls look
    live.
    """

    def __init__(self):
        self.reset()

    def reset(self) -> None:
        """Re-randomize all simulated metrics and restore the default config and error log."""
        self.fps = round(random.uniform(18, 28), 1)
        self.detection_latency = round(random.uniform(12, 45), 1)
        self.recognition_latency = round(random.uniform(25, 80), 1)
        self.active_tracks = random.randint(0, 15)
        self.cpu_temp = round(random.uniform(55, 78), 1)
        self.memory_usage = round(random.uniform(45, 85), 1)
        self.offline_queue_depth = random.randint(0, 50)
        self.events_sent = random.randint(100, 5000)
        self.events_failed = random.randint(0, 25)
        self.faces_detected = random.randint(50, 2000)
        # A face must be detected before it can be recognized, so the
        # recognized total is capped at the detected total.
        self.faces_recognized = random.randint(30, min(1800, self.faces_detected))
        self.recognition_confidence_avg = round(random.uniform(0.65, 0.95), 3)
        self.faiss_index_size = random.randint(50, 500)
        self.model_loaded = True
        # Redis is connected three times out of four.
        self.redis_connected = random.choice([True, True, True, False])
        self.camera_active = True
        self.last_event_time = datetime.now() - timedelta(seconds=random.randint(1, 300))

        # Configuration
        self.config = {
            "pipeline": {
                "detect_every_n_frames": 3,
                "recognize_every_n_frames": 5,
                "idle_mode": {"enabled": True, "timeout_seconds": 30, "fps_target": 5},
            },
            "models": {
                "detector": {
                    "path": "models/scrfd_2.5g.onnx",
                    "conf_threshold": 0.5,
                    "nms_threshold": 0.4,
                    "input_size": [640, 640],
                    "execution_provider": "CPUExecutionProvider",
                },
                "recognizer": {
                    "path": "models/mobilefacenet_v2.onnx",
                    "embedding_dim": 512,
                    "input_size": [112, 112],
                },
            },
            "faiss": {
                "index_path": "data/student_index.faiss",
                "cosine_threshold": 0.45,
                "nprobe": 10,
                "use_gpu": False,
            },
            "tracker": {
                "track_thresh": 0.5,
                "track_buffer": 30,
                "match_thresh": 0.8,
                "max_time_lost": 60,
            },
            "identity": {
                "min_evidence_count": 3,
                "consensus_threshold": 0.75,
                "cooldown_seconds": 300,
                "max_evidence_age_seconds": 10,
            },
            "cameras": [
                {"id": "cam_01", "source": "/dev/video0", "type": "usb", "resolution": [1280, 720], "fps": 30}
            ],
            "redis": {
                "host": "192.168.1.100",
                "port": 6379,
                "stream_key": "smartclass:attendance",
                "max_retries": 3,
            },
            "http_fallback": {
                "url": "https://api.smartclass.edu/v1/attendance",
                "timeout_seconds": 5,
                "batch_size": 10,
            },
            "offline_queue": {
                "db_path": "data/offline_queue.db",
                "max_size_mb": 100,
                "retry_interval_seconds": 60,
            },
            "quality_gate": {
                "min_face_size": 80,
                "min_quality_score": 0.3,
                "max_blur_score": 100,
                "min_brightness": 40,
                "max_brightness": 220,
            },
        }

        # Simulated error log
        self.errors = [
            {"timestamp": "2024-03-15T10:23:45", "level": "WARN", "component": "recognizer",
             "message": "Low quality face crop rejected (quality=0.18)"},
            {"timestamp": "2024-03-15T10:24:12", "level": "ERROR", "component": "event_sender",
             "message": "Redis connection timeout after 5s"},
            {"timestamp": "2024-03-15T10:25:01", "level": "WARN", "component": "tracker",
             "message": "Track #47 lost after 60 frames without update"},
        ]

    def get_metrics(self) -> dict:
        """Return the simulated metrics, keyed by ``smartclass_*`` metric name."""
        # Slightly vary metrics each call
        metrics = {
            "smartclass_edge_fps": round(self.fps + random.uniform(-2, 2), 1),
            "smartclass_detection_latency_ms": round(self.detection_latency + random.uniform(-5, 5), 1),
            "smartclass_recognition_latency_ms": round(self.recognition_latency + random.uniform(-10, 10), 1),
            "smartclass_tracking_active_tracks": max(0, self.active_tracks + random.randint(-3, 3)),
            "smartclass_cpu_temp_celsius": round(self.cpu_temp + random.uniform(-2, 2), 1),
            "smartclass_memory_usage_percent": round(self.memory_usage + random.uniform(-3, 3), 1),
            "smartclass_offline_queue_depth": max(0, self.offline_queue_depth + random.randint(-5, 5)),
            "smartclass_events_sent_total": self.events_sent + random.randint(0, 10),
            "smartclass_events_failed_total": self.events_failed + random.randint(0, 2),
            "smartclass_faces_detected_total": self.faces_detected + random.randint(0, 20),
            "smartclass_faces_recognized_total": self.faces_recognized + random.randint(0, 15),
            "smartclass_recognition_confidence_avg": round(
                self.recognition_confidence_avg + random.uniform(-0.05, 0.05), 3
            ),
        }
        # The two counters are jittered independently; keep recognized <= detected.
        metrics["smartclass_faces_recognized_total"] = min(
            metrics["smartclass_faces_recognized_total"],
            metrics["smartclass_faces_detected_total"],
        )
        return metrics

    def get_pipeline_status(self) -> dict:
        """Return overall pipeline status and per-component health flags."""
        return {
            "status": "running" if self.camera_active else "stopped",
            "uptime_hours": round(random.uniform(1, 72), 1),
            "camera_active": self.camera_active,
            "detector_loaded": self.model_loaded,
            "recognizer_loaded": self.model_loaded,
            "faiss_index_loaded": True,
            "faiss_index_vectors": self.faiss_index_size,
            "redis_connected": self.redis_connected,
            "current_fps": round(self.fps + random.uniform(-2, 2), 1),
            "active_tracks": max(0, self.active_tracks + random.randint(-2, 2)),
            "last_recognition_event": self.last_event_time.isoformat(),
        }

    def get_error_log(self, n=10) -> list:
        """Return the last ``n`` error-log entries (oldest first); empty when ``n <= 0``."""
        # errors[-0:] would be the whole list, so handle non-positive n explicitly.
        if n <= 0:
            return []
        return self.errors[-n:]

    def check_detection(self) -> dict:
        """Return detector configuration plus simulated detection performance figures."""
        detector_cfg = self.config["models"]["detector"]
        return {
            "model": "SCRFD-2.5G",
            "model_path": detector_cfg["path"],
            "model_loaded": self.model_loaded,
            "execution_provider": detector_cfg["execution_provider"],
            "conf_threshold": detector_cfg["conf_threshold"],
            "nms_threshold": detector_cfg["nms_threshold"],
            "input_size": detector_cfg["input_size"],
            "avg_latency_ms": round(self.detection_latency + random.uniform(-3, 3), 1),
            "detections_per_frame_avg": round(random.uniform(0.5, 5.0), 2),
            "false_positive_rate_estimate": round(random.uniform(0.01, 0.08), 3),
        }

    def check_recognition(self) -> dict:
        """Return recognizer/FAISS configuration plus simulated recognition figures."""
        recognizer_cfg = self.config["models"]["recognizer"]
        return {
            "model": "MobileFaceNet-v2",
            "model_path": recognizer_cfg["path"],
            "model_loaded": self.model_loaded,
            "embedding_dim": recognizer_cfg["embedding_dim"],
            "faiss_threshold": self.config["faiss"]["cosine_threshold"],
            "avg_latency_ms": round(self.recognition_latency + random.uniform(-5, 5), 1),
            "avg_confidence": round(self.recognition_confidence_avg, 3),
            "quality_gate_rejection_rate": round(random.uniform(0.1, 0.4), 2),
            "faces_below_threshold": random.randint(5, 50),
            "index_vectors": self.faiss_index_size,
        }

    def check_tracking(self) -> dict:
        """Return tracker configuration plus simulated tracking statistics."""
        tracker_cfg = self.config["tracker"]
        return {
            "algorithm": "ByteTrack",
            "active_tracks": max(0, self.active_tracks + random.randint(-2, 2)),
            "track_thresh": tracker_cfg["track_thresh"],
            "track_buffer": tracker_cfg["track_buffer"],
            "match_thresh": tracker_cfg["match_thresh"],
            "tracks_created_last_minute": random.randint(2, 20),
            "tracks_lost_last_minute": random.randint(1, 10),
            "avg_track_duration_frames": random.randint(30, 200),
            "id_switch_rate": round(random.uniform(0.01, 0.1), 3),
        }

    def check_event_transmission(self) -> dict:
        """Return Redis connection state, delivery counters and offline queue depth."""
        return {
            "redis_connected": self.redis_connected,
            "redis_host": self.config["redis"]["host"],
            "redis_stream_key": self.config["redis"]["stream_key"],
            "events_sent_total": self.events_sent,
            "events_failed_total": self.events_failed,
            "offline_queue_depth": self.offline_queue_depth,
            "http_fallback_available": True,
            "last_successful_event": self.last_event_time.isoformat(),
            "avg_delivery_latency_ms": round(random.uniform(5, 50), 1),
        }

    def check_identity_engine(self) -> dict:
        """Return identity-engine configuration plus simulated resolution statistics."""
        identity_cfg = self.config["identity"]
        return {
            "min_evidence_count": identity_cfg["min_evidence_count"],
            "consensus_threshold": identity_cfg["consensus_threshold"],
            "cooldown_seconds": identity_cfg["cooldown_seconds"],
            "pending_identities": random.randint(0, 5),
            "resolved_last_minute": random.randint(0, 15),
            "avg_evidence_per_resolution": round(random.uniform(3, 8), 1),
            "avg_confidence_at_resolution": round(random.uniform(0.78, 0.95), 3),
            "cooldown_blocked_events": random.randint(0, 10),
        }


# Global edge node state
edge_state = EdgeNodeState()

# ============================================================================
# DIAGNOSTIC TOOLS
# ============================================================================

# Tool registry: name -> {"description", "fn"}. The lambdas look up the
# module-level ``edge_state`` at call time rather than binding it now.
TOOLS = {
    "get_pipeline_status": {
        "description": "Get overall pipeline status including component health",
        "fn": lambda: edge_state.get_pipeline_status(),
    },
    "get_metrics": {
        "description": "Get current Prometheus metrics from the edge node",
        "fn": lambda: edge_state.get_metrics(),
    },
    "check_detection": {
        "description": "Check face detection subsystem (SCRFD model status and performance)",
        "fn": lambda: edge_state.check_detection(),
    },
    "check_recognition": {
        "description": "Check face recognition subsystem (MobileFaceNet, FAISS index)",
        "fn": lambda: edge_state.check_recognition(),
    },
    "check_tracking": {
        "description": "Check ByteTrack face tracking subsystem",
        "fn": lambda: edge_state.check_tracking(),
    },
    "check_event_transmission": {
        "description": "Check event delivery (Redis Streams, HTTP fallback, offline queue)",
        "fn": lambda: edge_state.check_event_transmission(),
    },
    "check_identity_engine": {
        "description": "Check identity evidence engine status",
        "fn": lambda: edge_state.check_identity_engine(),
    },
    "get_error_log": {
        "description": "Get recent error log entries",
        "fn": lambda: edge_state.get_error_log(),
    },
    "get_config": {
        "description": "Get current edge_config.yaml configuration",
        "fn": lambda: edge_state.config,
    },
}

# ============================================================================
# AGENT LOGIC
# ============================================================================

# (keywords, tools) pairs, evaluated in order. Keywords are matched as plain
# substrings of the lower-cased user message.
_TOOL_ROUTES = (
    # Status/overview queries
    (("status", "overview", "health", "running", "working"),
     ("get_pipeline_status", "get_metrics")),
    # FPS/Performance queries
    (("fps", "slow", "performance", "speed", "bottleneck", "optimize", "latency"),
     ("get_metrics", "get_pipeline_status")),
    # Detection queries
    (("detect", "scrfd", "no faces", "false positive", "bounding box", "camera"),
     ("check_detection",)),
    # Recognition queries ("detected but" also selects the recognition-failure response)
    (("recogni", "embed", "faiss", "identity", "threshold", "mobilefacenet", "confidence",
      "not recognized", "never recognized", "detected but"),
     ("check_recognition",)),
    # Tracking queries ("same person different" also selects the tracking response)
    (("track", "bytetrack", "id switch", "lost", "duplicate", "same person different"),
     ("check_tracking",)),
    # Event/transmission queries
    (("event", "redis", "queue", "transmit", "send", "offline", "http", "delivery"),
     ("check_event_transmission",)),
    # Identity engine queries
    (("evidence", "consensus", "cooldown", "identity engine", "fusion"),
     ("check_identity_engine",)),
    # Error queries
    (("error", "fail", "crash", "exception", "log", "warn"),
     ("get_error_log",)),
    # Config queries
    (("config", "setting", "parameter", "yaml", "threshold"),
     ("get_config",)),
)


def _mentions(text: str, keywords) -> bool:
    """Return True when any keyword occurs as a substring of ``text``."""
    return any(keyword in text for keyword in keywords)


def determine_tools_to_call(message: str) -> list:
    """Determine which diagnostic tools to call based on user message.

    Args:
        message: Raw user message; matching is case-insensitive.

    Returns:
        Tool names (keys of ``TOOLS``) in first-match order without duplicates.
        Falls back to pipeline status + metrics when no keyword matches.
    """
    message_lower = message.lower()
    tools_to_call = []

    for keywords, tools in _TOOL_ROUTES:
        if _mentions(message_lower, keywords):
            tools_to_call.extend(tools)

    # Default: get metrics and status if nothing specific matched
    if not tools_to_call:
        tools_to_call = ["get_pipeline_status", "get_metrics"]

    return list(dict.fromkeys(tools_to_call))  # deduplicate preserving order


def _render_performance(metrics: dict) -> str:
    """Render the FPS / latency analysis, graded by the current FPS."""
    fps = metrics.get("smartclass_edge_fps", 0)
    det_lat = metrics.get("smartclass_detection_latency_ms", 0)
    rec_lat = metrics.get("smartclass_recognition_latency_ms", 0)

    response = f"""## 📊 Pipeline Performance Analysis

**Current FPS:** {fps}
**Detection Latency:** {det_lat}ms
**Recognition Latency:** {rec_lat}ms

### Bottleneck Analysis

"""

    if fps < 10:
        # The recognition bullet reports the recognition latency.
        response += f"""⚠️ **Critical: FPS is dangerously low.** The pipeline cannot keep up with the camera frame rate.

### Root Cause Analysis

The most common causes of low FPS on edge devices:

1. **Recognition is the bottleneck** ({rec_lat}ms per call) — it runs on every Nth frame
2. **Detection is too frequent** — check `pipeline.detect_every_n_frames`
3. **Too many active tracks** consuming resources
4. **Thermal throttling** — check CPU temperature

### Recommended Actions

#### Immediate Fixes:

```yaml
# config/edge_config.yaml
pipeline:
  detect_every_n_frames: 5  # Increase from current 3
  recognize_every_n_frames: 8  # Increase from current 5
  idle_mode:
    enabled: true
    timeout_seconds: 15  # Enter idle faster
    fps_target: 3
```

#### If still slow:

- Switch to SCRFD-0.5G model (smaller, faster): `models.detector.path: models/scrfd_0.5g.onnx`
- Enable execution provider optimization: Set `models.detector.execution_provider: TensorrtExecutionProvider` if TensorRT is available
- Reduce detection input size: `models.detector.input_size: [320, 320]` (trade detection range for speed)
"""
    elif fps < 20:
        response += """⚡ **Warning: FPS is below optimal.** Target is 24+ FPS for smooth tracking.

### Recommendations:

1. **Increase frame skip ratios:**
   - `pipeline.detect_every_n_frames`: 4 (from 3)
   - `pipeline.recognize_every_n_frames`: 6 (from 5)

2. **Enable idle mode** to conserve resources when no faces present:
   ```yaml
   pipeline.idle_mode.enabled: true
   pipeline.idle_mode.timeout_seconds: 20
   ```

3. **Quality gate is likely too lenient** — accepting too many low-quality crops increases recognition load:
   ```yaml
   quality_gate.min_quality_score: 0.4  # Raise from 0.3
   quality_gate.min_face_size: 100  # Raise from 80
   ```
"""
    else:
        det_icon = "✅" if det_lat < 30 else "⚠️"
        rec_icon = "✅" if rec_lat < 50 else "⚠️"
        response += f"""✅ **FPS is healthy ({fps}).** Pipeline is performing within expected parameters.

Current performance breakdown:
- Detection: {det_lat}ms (target: <30ms) {det_icon}
- Recognition: {rec_lat}ms (target: <50ms) {rec_icon}

No immediate optimization needed, but you can further improve by:
- Enabling GPU acceleration if hardware supports it
- Using the ensemble recognizer only for edge cases
"""
    return response


def _render_recognition_failure(rec: dict) -> str:
    """Render the "faces detected but not recognized" diagnosis."""
    model_icon = "✅" if rec.get("model_loaded") else "❌"
    rejection_pct = rec.get("quality_gate_rejection_rate", 0) * 100

    return f"""## 🔍 Recognition Failure Diagnosis

**Recognition System Status:**
- Model loaded: {model_icon}
- FAISS index vectors: {rec.get('index_vectors', 'N/A')}
- Avg confidence: {rec.get('avg_confidence', 'N/A')}
- Quality gate rejection rate: {rec.get('quality_gate_rejection_rate', 'N/A')}
- Faces below threshold: {rec.get('faces_below_threshold', 'N/A')}

### Diagnosis Steps:

#### 1. Check FAISS Index
The index has **{rec.get('index_vectors', 0)} vectors**. If this is 0 or doesn't match your enrolled student count, the index needs to be rebuilt.

```bash
# Verify index
python -c "import faiss; idx = faiss.read_index('data/student_index.faiss'); print(f'Vectors: {{idx.ntotal}}')"
```

#### 2. Cosine Threshold Too High
Current threshold: **{rec.get('faiss_threshold', 'N/A')}**

If this is above 0.5, many legitimate matches will be rejected. Recommended range: **0.35-0.50**.

```yaml
# config/edge_config.yaml
faiss:
  cosine_threshold: 0.40  # Lower from {rec.get('faiss_threshold', 0.45)}
```

#### 3. Quality Gate Rejecting Too Many Crops
Rejection rate is **{rejection_pct:.0f}%**. If this is above 30%, the quality gate may be too strict.

```yaml
# Relax quality requirements
quality_gate:
  min_quality_score: 0.25  # Lower from 0.3
  min_face_size: 64  # Lower from 80
  max_blur_score: 150  # Raise from 100
```

#### 4. Embedding Model Issues
- Verify model produces valid 512-dim embeddings (not zeros/NaN)
- Check if face alignment landmarks are accurate
- Verify input normalization matches training preprocessing

#### 5. Student Map Verification
Ensure enrolled students have embeddings in the index:
```bash
python scripts/verify_enrollments.py --index data/student_index.faiss --map data/student_map.json
```
"""


def _render_tracking(tracking: dict) -> str:
    """Render the ByteTrack diagnosis."""
    switch_rate = tracking.get("id_switch_rate", 0)
    if switch_rate > 0.05:
        switch_verdict = "⚠️ High ID switch rate!"
    else:
        switch_verdict = "✅ ID switch rate is acceptable."

    return f"""## 🎯 Tracking Diagnosis (ByteTrack)

**Current Status:**
- Active tracks: {tracking.get('active_tracks', 'N/A')}
- ID switch rate: {tracking.get('id_switch_rate', 'N/A')}
- Avg track duration: {tracking.get('avg_track_duration_frames', 'N/A')} frames
- Tracks created/minute: {tracking.get('tracks_created_last_minute', 'N/A')}
- Tracks lost/minute: {tracking.get('tracks_lost_last_minute', 'N/A')}

### Common Tracking Issues:

#### ID Switching (same person gets multiple IDs)
ID switch rate: **{switch_rate}** {switch_verdict}

**Fix:** Increase track buffer and lower match threshold:
```yaml
tracker:
  track_buffer: 45  # Increase from {tracking.get('track_buffer', 30)} (keeps lost tracks longer)
  match_thresh: 0.7  # Lower from {tracking.get('match_thresh', 0.8)} (easier re-association)
  track_thresh: 0.4  # Lower from {tracking.get('track_thresh', 0.5)} (lower birth threshold)
```

#### Tracks Dying Too Quickly
If `tracks_lost_last_minute` >> `tracks_created_last_minute`:
- Increase `tracker.track_buffer` (frames before track termination)
- Lower detection confidence to maintain tracking continuity
- Check if detection frame skip is too aggressive

#### Ghost Tracks (tracking non-existent faces)
If `active_tracks` is much higher than actual people:
- Raise `tracker.track_thresh` to 0.6+
- Raise detection `conf_threshold` to reduce false positives
- Add minimum detection size filter
"""


def _render_event_transmission(events: dict) -> str:
    """Render Redis / offline-queue delivery status."""
    redis_connected = events.get("redis_connected")
    redis_icon = "✅" if redis_connected else "❌ DISCONNECTED"

    response = f"""## 📡 Event Transmission Status

**Connection Status:**
- Redis connected: {redis_icon}
- Last successful event: {events.get('last_successful_event', 'N/A')}
- Offline queue depth: {events.get('offline_queue_depth', 0)}

**Delivery Stats:**
- Events sent: {events.get('events_sent_total', 0)}
- Events failed: {events.get('events_failed_total', 0)}
- Avg delivery latency: {events.get('avg_delivery_latency_ms', 0)}ms

"""

    if not redis_connected:
        host = events.get("redis_host", "192.168.1.100")
        queue = events.get("offline_queue_depth", 0)
        response += f"""### ❌ Redis Connection Issue

The edge node cannot reach the Redis server. Events are being queued locally.

**Immediate Actions:**

1. Check Redis server connectivity:
```bash
redis-cli -h {host} -p 6379 ping
```

2. Check network connectivity:
```bash
ping {host}
nc -zv {host} 6379
```

3. Check Redis configuration:
```yaml
redis:
  host: {host}
  port: 6379
  password:
  max_retries: 5
  retry_delay_seconds: 2
```

4. HTTP fallback should be handling events in the meantime.

**Queue Management:**
The offline queue has {queue} events pending. Once Redis reconnects, these will be drained automatically. If the queue exceeds 100MB, oldest events are dropped.

Monitor via:
```
smartclass_offline_queue_depth
```
"""
    else:
        # Evaluate the conditional text here and interpolate with an f-string;
        # str.format() cannot evaluate expressions placed inside {...}.
        latency = events.get("avg_delivery_latency_ms", 0)
        # max(..., 1) guards against division by zero when nothing was sent yet.
        failure_ratio = events.get("events_failed_total", 0) / max(events.get("events_sent_total", 1), 1)
        latency_verdict = "within normal range" if latency < 30 else "elevated - check network"
        failure_verdict = "✅" if failure_ratio < 0.02 else "⚠️ Above 2% threshold"
        response += f"""### ✅ Redis Connected

Event delivery is functioning normally.

**Performance Notes:**
- Delivery latency {latency}ms is {latency_verdict}
- Failure rate: {failure_ratio * 100:.1f}% {failure_verdict}
"""
    return response


def _render_error_log(errors: list) -> str:
    """Render recent log entries followed by a per-component analysis."""
    response = "## 📋 Recent Error Log\n\n"
    if not errors:
        return response + "✅ No recent errors found. System is operating normally.\n"

    level_icons = {"ERROR": "🔴", "WARN": "🟡"}
    for entry in errors:
        icon = level_icons.get(entry["level"], "🔵")
        response += f"{icon} **[{entry['timestamp']}]** `{entry['component']}`: {entry['message']}\n\n"

    response += "\n### Analysis\n\n"
    error_components = [e["component"] for e in errors if e["level"] == "ERROR"]
    warn_components = [e["component"] for e in errors if e["level"] == "WARN"]

    if "event_sender" in error_components:
        response += "- **Event Sender errors** detected — check Redis connectivity and HTTP fallback\n"
    if "recognizer" in warn_components:
        response += "- **Recognizer warnings** — quality gate is rejecting crops. Consider relaxing thresholds or improving lighting\n"
    if "tracker" in warn_components:
        response += "- **Tracker warnings** — tracks being lost. May need to increase `track_buffer`\n"
    return response


def _render_config(config: dict) -> str:
    """Render the key sections of the edge configuration as a YAML snippet."""
    pipeline = config.get("pipeline", {})
    idle_mode = pipeline.get("idle_mode", {})
    detector = config.get("models", {}).get("detector", {})
    recognizer = config.get("models", {}).get("recognizer", {})
    faiss_cfg = config.get("faiss", {})
    tracker = config.get("tracker", {})
    identity = config.get("identity", {})
    quality_gate = config.get("quality_gate", {})

    return f"""## ⚙️ Current Configuration

```yaml
# config/edge_config.yaml (key sections)

pipeline:
  detect_every_n_frames: {pipeline.get('detect_every_n_frames', 'N/A')}
  recognize_every_n_frames: {pipeline.get('recognize_every_n_frames', 'N/A')}
  idle_mode:
    enabled: {idle_mode.get('enabled', 'N/A')}
    timeout_seconds: {idle_mode.get('timeout_seconds', 'N/A')}

models:
  detector:
    path: {detector.get('path', 'N/A')}
    conf_threshold: {detector.get('conf_threshold', 'N/A')}
    nms_threshold: {detector.get('nms_threshold', 'N/A')}
  recognizer:
    path: {recognizer.get('path', 'N/A')}
    embedding_dim: {recognizer.get('embedding_dim', 'N/A')}

faiss:
  cosine_threshold: {faiss_cfg.get('cosine_threshold', 'N/A')}
  nprobe: {faiss_cfg.get('nprobe', 'N/A')}

tracker:
  track_thresh: {tracker.get('track_thresh', 'N/A')}
  track_buffer: {tracker.get('track_buffer', 'N/A')}
  match_thresh: {tracker.get('match_thresh', 'N/A')}

identity:
  min_evidence_count: {identity.get('min_evidence_count', 'N/A')}
  consensus_threshold: {identity.get('consensus_threshold', 'N/A')}
  cooldown_seconds: {identity.get('cooldown_seconds', 'N/A')}

quality_gate:
  min_face_size: {quality_gate.get('min_face_size', 'N/A')}
  min_quality_score: {quality_gate.get('min_quality_score', 'N/A')}
```

Need help tuning any specific parameter? Ask about:
- Detection thresholds (sensitivity vs false positives)
- Recognition thresholds (miss rate vs false accepts)
- Tracking parameters (stability vs responsiveness)
- Quality gate (thoroughness vs coverage)
"""


def _render_identity_engine(identity: dict) -> str:
    """Render identity evidence engine status and tuning guide."""
    return f"""## 🧠 Identity Evidence Engine Status

**Configuration:**
- Min evidence count: {identity.get('min_evidence_count', 'N/A')} (frames needed before decision)
- Consensus threshold: {identity.get('consensus_threshold', 'N/A')} (agreement required)
- Cooldown: {identity.get('cooldown_seconds', 'N/A')}s (prevents duplicate events)

**Performance:**
- Pending identities: {identity.get('pending_identities', 0)} (accumulating evidence)
- Resolved last minute: {identity.get('resolved_last_minute', 0)}
- Avg evidence per resolution: {identity.get('avg_evidence_per_resolution', 'N/A')}
- Avg confidence at resolution: {identity.get('avg_confidence_at_resolution', 'N/A')}
- Cooldown-blocked events: {identity.get('cooldown_blocked_events', 0)}

### How It Works

The Identity Evidence Engine prevents false attendance by requiring multiple consistent recognition results before confirming identity:

1. **Evidence Collection**: Each recognition attempt adds a weighted sample to the track's evidence pool
2. **Quality Weighting**: Higher quality face crops contribute more weight
3. **Consensus Check**: Once `min_evidence_count` samples exist AND the top identity has ≥ `consensus_threshold` agreement, identity is resolved
4. **Cooldown**: After resolution, the same identity won't trigger another event for `cooldown_seconds`

### Tuning Guide

| Scenario | Adjustment |
|----------|-----------|
| Missing attendance (too strict) | Lower `consensus_threshold` to 0.65 or `min_evidence_count` to 2 |
| False attendance (too lenient) | Raise `consensus_threshold` to 0.85 or `min_evidence_count` to 5 |
| Duplicate events | Increase `cooldown_seconds` |
| Slow resolution | Decrease `min_evidence_count` or `recognize_every_n_frames` |
"""


def _render_overview(status: dict, metrics: dict) -> str:
    """Render the general status overview used when no specific topic matched."""
    pipeline_state = "🟢 Running" if status.get("status") == "running" else "🔴 Stopped"
    camera_state = "✅ Active" if status.get("camera_active") else "❌ Inactive"
    detector_state = "✅ Loaded" if status.get("detector_loaded") else "❌ Not loaded"
    recognizer_state = "✅ Loaded" if status.get("recognizer_loaded") else "❌ Not loaded"
    if status.get("faiss_index_loaded"):
        faiss_state = "✅ Loaded (" + str(status.get("faiss_index_vectors", 0)) + " vectors)"
    else:
        faiss_state = "❌ Not loaded"
    redis_state = "✅ Connected" if status.get("redis_connected") else "⚠️ Disconnected (using fallback)"

    return f"""## 📍 Edge Node Status Overview

**Pipeline:** {pipeline_state}
**Uptime:** {status.get('uptime_hours', 'N/A')} hours
**FPS:** {metrics.get('smartclass_edge_fps', 'N/A')}

### Component Health

| Component | Status |
|-----------|--------|
| Camera | {camera_state} |
| Detector (SCRFD) | {detector_state} |
| Recognizer (MFN) | {recognizer_state} |
| FAISS Index | {faiss_state} |
| Redis | {redis_state} |

### Key Metrics
- Detection latency: {metrics.get('smartclass_detection_latency_ms', 'N/A')}ms
- Recognition latency: {metrics.get('smartclass_recognition_latency_ms', 'N/A')}ms
- Active tracks: {metrics.get('smartclass_tracking_active_tracks', 'N/A')}
- CPU temp: {metrics.get('smartclass_cpu_temp_celsius', 'N/A')}°C
- Memory: {metrics.get('smartclass_memory_usage_percent', 'N/A')}%
- Offline queue: {metrics.get('smartclass_offline_queue_depth', 'N/A')} events

### Recent Activity
- Last recognition: {status.get('last_recognition_event', 'N/A')}
- Faces detected: {metrics.get('smartclass_faces_detected_total', 'N/A')}
- Faces recognized: {metrics.get('smartclass_faces_recognized_total', 'N/A')}

What would you like to investigate further? I can help with:
- 🔧 Pipeline performance optimization
- 🔍 Detection/recognition troubleshooting
- 🎯 Tracking issues
- 📡 Event transmission debugging
- ⚙️ Configuration tuning
"""


def generate_response(message: str, tool_results: dict) -> str:
    """Generate a diagnostic response based on user query and tool results.

    The first topic whose keywords appear in the message wins, in this order:
    performance, recognition failure, tracking, event transmission, error log,
    configuration, identity engine, then the general overview.

    Args:
        message: Raw user message; matching is case-insensitive.
        tool_results: Mapping of tool name to the result already collected for
            this turn. A result the selected topic needs but that is missing
            is fetched from ``TOOLS`` on demand, so an uncollected tool is
            never rendered as a failed or unloaded component.

    Returns:
        Markdown text for the chat window.
    """
    message_lower = message.lower()

    def result(tool_name: str):
        # Prefer what the agent already collected; otherwise query the tool now.
        if tool_name in tool_results:
            return tool_results[tool_name]
        return TOOLS[tool_name]["fn"]()

    # FPS optimization
    if _mentions(message_lower, ("fps", "slow", "performance", "speed", "optimize")):
        return _render_performance(result("get_metrics"))

    # Faces detected but not recognized
    if _mentions(message_lower, ("not recognized", "never recognized", "detected but")):
        return _render_recognition_failure(result("check_recognition"))

    # Tracking issues
    if _mentions(message_lower, ("track", "id switch", "duplicate", "same person different")):
        return _render_tracking(result("check_tracking"))

    # Event transmission
    if _mentions(message_lower, ("event", "redis", "queue", "transmit", "offline", "delivery")):
        return _render_event_transmission(result("check_event_transmission"))

    # Error log analysis
    if _mentions(message_lower, ("error", "fail", "crash", "exception", "log", "warn")):
        return _render_error_log(result("get_error_log"))

    # Configuration query
    if _mentions(message_lower, ("config", "setting", "parameter", "yaml")):
        return _render_config(result("get_config"))

    # Identity engine
    if _mentions(message_lower, ("evidence", "consensus", "cooldown", "identity engine", "fusion")):
        return _render_identity_engine(result("check_identity_engine"))

    # General status/overview
    return _render_overview(result("get_pipeline_status"), result("get_metrics"))


def agent_respond(message: str, history: list) -> Generator:
    """Main agent response function with streaming and tool calls.

    Args:
        message: The new user message.
        history: Chat history as a list of ``{"role", "content"}`` dicts. The
            list itself is not mutated; a new list is built for each update.

    Yields:
        The updated history after the user message, after each tool call
        (shown as a titled JSON block), and after each streamed chunk of the
        final response.
    """
    # Add user message
    history = history + [{"role": "user", "content": message}]
    yield history

    # Determine which tools to call
    tools_needed = determine_tools_to_call(message)
    tool_results = {}

    # Execute tools and show them in chat
    for tool_name in tools_needed:
        tool = TOOLS[tool_name]
        result = tool["fn"]()
        tool_results[tool_name] = result

        # Display tool call in chat
        history = history + [{
            "role": "assistant",
            "content": f"```json\n{json.dumps(result, indent=2, default=str)}\n```",
            "metadata": {"title": f"🔧 {tool_name}: {tool['description']}"},
        }]
        yield history
        time.sleep(0.3)  # Simulate tool execution time

    # Generate response
    response = generate_response(message, tool_results)

    # Stream the response three space-separated words at a time
    history = history + [{"role": "assistant", "content": ""}]
    words = response.split(" ")
    for i in range(0, len(words), 3):
        chunk = " ".join(words[i:i + 3]) + " "
        history[-1]["content"] += chunk
        yield history
        time.sleep(0.02)


# ============================================================================
# METRICS POLLING FUNCTIONS
# ============================================================================

def poll_metrics():
    """Poll live metrics for the dashboard."""
    m = edge_state.get_metrics()
    return m


def poll_status():
    """Poll pipeline status."""
    return edge_state.get_pipeline_status()


def reset_simulation():
    """Reset the simulated edge node state."""
    edge_state.reset()
    return "✅ Edge node simulation reset with new random state."


def _inject_low_fps() -> str:
    """Simulate heavy processing load: very low FPS and inflated latencies."""
    edge_state.fps = round(random.uniform(1, 5), 1)
    edge_state.detection_latency = round(random.uniform(80, 150), 1)
    edge_state.recognition_latency = round(random.uniform(150, 300), 1)
    return "⚠️ Injected: Low FPS condition (heavy processing load)"


def _inject_redis_disconnect() -> str:
    """Mark Redis as disconnected and grow the local offline queue."""
    edge_state.redis_connected = False
    edge_state.offline_queue_depth = random.randint(50, 200)
    return "⚠️ Injected: Redis disconnected, events queuing locally"


def _inject_no_recognitions() -> str:
    """Zero out recognitions and raise the FAISS cosine threshold to 0.85."""
    edge_state.recognition_confidence_avg = 0.25
    edge_state.faces_recognized = 0
    edge_state.config["faiss"]["cosine_threshold"] = 0.85
    return "⚠️ Injected: Recognition threshold too high, no matches"


def _inject_high_temperature() -> str:
    """Raise the CPU temperature to 82-95 °C and reduce FPS accordingly."""
    edge_state.cpu_temp = round(random.uniform(82, 95), 1)
    edge_state.fps = round(random.uniform(5, 12), 1)
    return "⚠️ Injected: CPU thermal throttling"


def _inject_tracking_chaos() -> str:
    """Inflate the active track count and lower the tracker birth threshold."""
    edge_state.active_tracks = random.randint(30, 50)
    edge_state.config["tracker"]["track_thresh"] = 0.2
    return "⚠️ Injected: Too many ghost tracks (low threshold)"


# Single source of truth for the supported faults: the sidebar dropdown lists
# these keys (in this order) and inject_fault() dispatches through this table.
_FAULT_INJECTORS = {
    "Low FPS": _inject_low_fps,
    "Redis Disconnect": _inject_redis_disconnect,
    "No Recognitions": _inject_no_recognitions,
    "High Temperature": _inject_high_temperature,
    "Tracking Chaos": _inject_tracking_chaos,
}


def inject_fault(fault_type: str) -> str:
    """Inject a simulated fault into the shared edge node state.

    Args:
        fault_type: One of the keys of ``_FAULT_INJECTORS`` (the values
            offered by the sidebar dropdown).

    Returns:
        A status message describing the injected fault, or
        ``"Unknown fault type"`` when ``fault_type`` is not recognised
        (in which case the state is left untouched).
    """
    injector = _FAULT_INJECTORS.get(fault_type)
    if injector is None:
        return "Unknown fault type"
    return injector()


# ============================================================================
# GRADIO UI
# ============================================================================

CUSTOM_CSS = """
.tool-output {
    font-size: 0.85em;
    max-height: 200px;
    overflow-y: auto;
}
.metric-card {
    border: 1px solid #e0e0e0;
    border-radius: 8px;
    padding: 16px;
    margin: 8px;
}
"""

# Sample prompts shown under the chat input.
EXAMPLES = [
    "The edge node is only getting 2 FPS, how do I optimize?",
    "Faces are detected but never recognized",
    "I'm seeing duplicate attendance events for the same student",
    "The offline queue keeps growing, events aren't being sent",
    "Show me the current system status",
    "How do I tune the identity evidence engine?",
    "Tracking IDs keep switching between the same person",
    "What's the recommended configuration for a Raspberry Pi 5?",
    "Explain the quality gate and how to adjust it",
    "The CPU temperature is too high, what can I do?",
]

# Long Markdown bodies are kept at module level, flush left, so that the
# tables, lists and fenced blocks keep the line structure Markdown requires.
_HEADER_MD = """
# 🔬 Edge Node Diagnostics Agent
### SmartClass Face Recognition Pipeline Troubleshooter

Specialized AI assistant for diagnosing and optimizing face recognition pipelines running on classroom edge devices (Raspberry Pi 5 / Intel NUC).
"""

_QUICK_REFERENCE_MD = """
**Pipeline Flow:**
1. 📷 Camera Capture
2. 👤 Face Detection (SCRFD)
3. 🎯 Face Tracking (ByteTrack)
4. ✅ Quality Assessment
5. 🔄 Face Alignment
6. 🧠 Embedding (MobileFaceNet)
7. 🔍 FAISS Search
8. ⚖️ Evidence Engine
9. 📡 Event Transmission
"""

# NOTE: the diagram below is column-aligned; keep the spacing when editing.
_ARCHITECTURE_MD = """
### SmartClass Edge Node Pipeline Architecture

```
┌─────────────────────────────────────────────────────────────────────┐
│                       EDGE NODE (Pi 5 / NUC)                        │
├─────────────────────────────────────────────────────────────────────┤
│                                                                     │
│  ┌──────────┐    ┌───────────┐    ┌──────────┐    ┌──────────────┐  │
│  │  Camera  │───▶│ Detector  │───▶│ Tracker  │───▶│   Quality    │  │
│  │ Capture  │    │  (SCRFD)  │    │(ByteTrack)│   │  Assessment  │  │
│  └──────────┘    └───────────┘    └──────────┘    └──────────────┘  │
│                                                           │         │
│                                                           ▼         │
│  ┌──────────────┐    ┌───────────┐    ┌──────────┐    ┌─────────┐   │
│  │   Identity   │◀───│   FAISS   │◀───│ Embedding│◀───│  Face   │   │
│  │   Evidence   │    │  Search   │    │(MobileFN)│    │  Align  │   │
│  │    Engine    │    └───────────┘    └──────────┘    └─────────┘   │
│  └──────────────┘                                                   │
│         │                                                           │
│         ▼                                                           │
│  ┌──────────────────────────────────────────────────────────────┐   │
│  │                      Event Transmission                      │   │
│  │   ┌─────────┐   ┌──────────────┐   ┌─────────────────────┐   │   │
│  │   │  Redis  │──▶│ HTTP Fallback│──▶│  SQLite Offline Q   │   │   │
│  │   │ Streams │   │  (REST API)  │   │ (retry background)  │   │   │
│  │   └─────────┘   └──────────────┘   └─────────────────────┘   │   │
│  └──────────────────────────────────────────────────────────────┘   │
│                                                                     │
│  ┌──────────────────────────────────────────────────────────────┐   │
│  │                Prometheus Metrics (port 9100)                │   │
│  │   FPS | Latency | Tracks | Temp | Memory | Queue | Events    │   │
│  └──────────────────────────────────────────────────────────────┘   │
│                                                                     │
└─────────────────────────────────────────────────────────────────────┘
```

### Key Source Files

| File | Purpose |
|------|---------|
| `src/main_pipeline.py` | CameraPipeline orchestration |
| `src/capture.py` | Camera frame capture (USB/RTSP) |
| `src/detector.py` | SCRFD face detection |
| `src/tracker.py` | ByteTrack face tracking |
| `src/face_quality.py` | FIQA quality scoring |
| `src/face_align.py` | Face alignment & warping |
| `src/face_enhance.py` | MSRCR Retinex / CLAHE |
| `src/recognizer.py` | MobileFaceNet embedding |
| `src/ensemble_recognizer.py` | AdaFace + MFN dual model |
| `src/identity_evidence_engine.py` | Global identity fusion |
| `src/event_sender.py` | Event transmission logic |
| `src/edge_metrics.py` | Prometheus metrics export |

### Configuration

All settings in `config/edge_config.yaml`. Key sections:
- `pipeline` — Frame processing rates, idle mode
- `models` — Model paths, thresholds, execution providers
- `faiss` — Index path, similarity threshold, GPU mode
- `tracker` — ByteTrack parameters
- `identity` — Evidence engine tuning
- `cameras` — Camera sources
- `redis` / `http_fallback` / `offline_queue` — Event delivery
- `quality_gate` — Face crop quality requirements
"""

_HELP_MD = """
### How to Use This Agent

**1. Ask Natural Language Questions**

Simply describe your problem in the chat. The agent will:
- Automatically query relevant diagnostic tools
- Show raw data from the edge node (collapsible)
- Provide analysis and recommendations

**2. Common Troubleshooting Scenarios**

| Problem | What to Ask |
|---------|-------------|
| Low FPS | "Pipeline is slow, only getting X FPS" |
| No recognitions | "Faces are detected but never recognized" |
| Duplicate events | "Same student marked present twice" |
| Events not sending | "Offline queue keeps growing" |
| Tracking issues | "Track IDs keep switching" |
| False detections | "Detecting faces where there are none" |
| Model errors | "Detection model won't load" |
| Overheating | "CPU temperature is too high" |

**3. Simulation Controls (Sidebar)**

Use the sidebar to inject faults and test the agent's diagnostic capabilities:
- **Low FPS** — Simulates heavy processing load
- **Redis Disconnect** — Simulates network failure
- **No Recognitions** — Simulates overly strict thresholds
- **High Temperature** — Simulates thermal throttling
- **Tracking Chaos** — Simulates too many ghost tracks

**4. Configuration Tab**

Adjust edge node parameters in real-time and see how they affect the system.

**5. Metrics Tab**

View live (simulated) Prometheus metrics from the edge node.

---

### Technical Reference

**Prometheus Metrics (port 9100):**
```
smartclass_edge_fps                    # Overall frames per second
smartclass_detection_latency_ms        # Time per detection
smartclass_recognition_latency_ms      # Time per recognition
smartclass_tracking_active_tracks      # Active face tracks
smartclass_cpu_temp_celsius            # CPU temperature
smartclass_memory_usage_percent        # RAM usage
smartclass_offline_queue_depth         # Queued events
smartclass_events_sent_total           # Successfully sent events
smartclass_events_failed_total         # Failed event deliveries
smartclass_faces_detected_total        # Total detections
smartclass_faces_recognized_total      # Successful recognitions
smartclass_recognition_confidence_avg  # Average match confidence
```

**Key Config Paths:**
- `config/edge_config.yaml` → All settings
- `models/scrfd_2.5g.onnx` → Detection model
- `models/mobilefacenet_v2.onnx` → Recognition model
- `data/student_index.faiss` → FAISS vector index
- `data/student_map.json` → ID → student name mapping
- `data/offline_queue.db` → SQLite fallback queue
"""

with gr.Blocks(title="Edge Node Diagnostics Agent") as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Sidebar():
        gr.Markdown("### 🎛️ Simulation Controls")
        gr.Markdown("*Inject faults to test diagnostics capabilities*")

        fault_dropdown = gr.Dropdown(
            choices=list(_FAULT_INJECTORS),
            label="Fault Type",
            value="Low FPS",
        )
        inject_btn = gr.Button("💥 Inject Fault", variant="secondary")
        reset_btn = gr.Button("🔄 Reset Node", variant="secondary")
        fault_status = gr.Textbox(label="Status", interactive=False, lines=2)

        inject_btn.click(inject_fault, fault_dropdown, fault_status)
        reset_btn.click(reset_simulation, outputs=fault_status)

        gr.Markdown("---")
        gr.Markdown("### 📌 Quick Reference")
        gr.Markdown(_QUICK_REFERENCE_MD)

    with gr.Tabs():
        # ============ CHAT TAB ============
        with gr.Tab("💬 Diagnostics Chat", id="chat"):
            chatbot = gr.Chatbot(
                height=550,
                label="Edge Diagnostics Agent",
                # Written with explicit "\n" escapes: a plain double-quoted
                # literal cannot span physical lines.
                placeholder=(
                    "\n\n🔬 Edge Node Diagnostics Agent\n\n"
                    "Ask me about pipeline performance, detection issues, "
                    "recognition failures, tracking problems, or event "
                    "delivery.\n\n"
                ),
            )

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="e.g., 'The edge node is only getting 2 FPS, how do I optimize?'",
                    show_label=False,
                    scale=8,
                    container=False,
                    autofocus=True,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1, min_width=100)
                clear_btn = gr.Button("Clear", scale=1, min_width=80)

            gr.Examples(
                examples=EXAMPLES,
                inputs=msg,
                label="💡 Try these questions:",
            )

            # Event handlers: pressing Enter and clicking "Send" run the same
            # chain - stream the agent reply into the chat, then clear the box.
            for trigger in (msg.submit, submit_btn.click):
                trigger(agent_respond, [msg, chatbot], chatbot).then(
                    lambda: "", outputs=msg
                )

            clear_btn.click(lambda: [], outputs=chatbot)

        # ============ METRICS TAB ============
        with gr.Tab("📊 Live Metrics", id="metrics"):
            gr.Markdown("### Real-time Edge Node Metrics")
            gr.Markdown("*Metrics update every 5 seconds (simulated)*")

            timer = gr.Timer(value=5, active=True)

            with gr.Row():
                metrics_json = gr.JSON(label="📈 Prometheus Metrics", scale=2)
                status_json = gr.JSON(label="🔋 Pipeline Status", scale=1)

            # Both panels refresh on the same timer tick.
            timer.tick(poll_metrics, outputs=metrics_json)
            timer.tick(poll_status, outputs=status_json)

        # ============ CONFIG TAB ============
        with gr.Tab("⚙️ Configuration", id="config"):
            gr.Markdown("### Edge Node Configuration")
            gr.Markdown("*Current `config/edge_config.yaml` settings*")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### Detection Settings")
                    det_conf = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Detection Confidence Threshold")
                    det_nms = gr.Slider(0.1, 0.9, value=0.4, step=0.05, label="NMS Threshold")
                    det_skip = gr.Slider(1, 10, value=3, step=1, label="Detect Every N Frames")

                    gr.Markdown("#### Recognition Settings")
                    rec_threshold = gr.Slider(0.2, 0.8, value=0.45, step=0.05, label="FAISS Cosine Threshold")
                    rec_skip = gr.Slider(1, 15, value=5, step=1, label="Recognize Every N Frames")

                with gr.Column():
                    gr.Markdown("#### Tracking Settings")
                    track_thresh = gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Track Birth Threshold")
                    track_buffer = gr.Slider(10, 120, value=30, step=5, label="Track Buffer (frames)")
                    match_thresh = gr.Slider(0.3, 1.0, value=0.8, step=0.05, label="Match Threshold")

                    gr.Markdown("#### Identity Engine")
                    min_evidence = gr.Slider(1, 10, value=3, step=1, label="Min Evidence Count")
                    consensus = gr.Slider(0.5, 1.0, value=0.75, step=0.05, label="Consensus Threshold")
                    cooldown = gr.Slider(30, 600, value=300, step=30, label="Cooldown (seconds)")

            with gr.Row():
                save_btn = gr.Button("💾 Apply Configuration", variant="primary")
                config_status = gr.Textbox(label="Status", interactive=False)

            def apply_config(det_c, det_n, det_s, rec_t, rec_s, tr_t, tr_b, m_t, me, cs, cd):
                """Copy the Configuration-tab slider values into ``edge_state.config``.

                The positional arguments arrive in the order of the ``inputs``
                list passed to ``save_btn.click`` below: detector confidence,
                detector NMS, detect-every-N, FAISS cosine threshold,
                recognize-every-N, track threshold, track buffer, match
                threshold, min evidence count, consensus threshold, cooldown.

                Returns:
                    A status string stamped with the current local time.
                """
                cfg = edge_state.config
                cfg["models"]["detector"]["conf_threshold"] = det_c
                cfg["models"]["detector"]["nms_threshold"] = det_n
                cfg["pipeline"]["detect_every_n_frames"] = det_s
                cfg["faiss"]["cosine_threshold"] = rec_t
                cfg["pipeline"]["recognize_every_n_frames"] = rec_s
                cfg["tracker"]["track_thresh"] = tr_t
                cfg["tracker"]["track_buffer"] = tr_b
                cfg["tracker"]["match_thresh"] = m_t
                cfg["identity"]["min_evidence_count"] = me
                cfg["identity"]["consensus_threshold"] = cs
                cfg["identity"]["cooldown_seconds"] = cd
                return f"✅ Configuration applied at {datetime.now().strftime('%H:%M:%S')}"

            save_btn.click(
                apply_config,
                [
                    det_conf, det_nms, det_skip, rec_threshold, rec_skip,
                    track_thresh, track_buffer, match_thresh,
                    min_evidence, consensus, cooldown,
                ],
                config_status,
            )

        # ============ ARCHITECTURE TAB ============
        with gr.Tab("📐 Architecture", id="arch"):
            gr.Markdown(_ARCHITECTURE_MD)

        # ============ HELP TAB ============
        with gr.Tab("❓ Help", id="help"):
            gr.Markdown(_HELP_MD)

demo.launch(theme=gr.themes.Soft(), css=CUSTOM_CSS)