shaliz-kong committed on
Commit
ba09259
Β·
1 Parent(s): 00f9956

Aggregated all logging in health API for Prometheus

Browse files
app/deps.py CHANGED
@@ -131,7 +131,7 @@ def get_duckdb(org_id: str) -> duckdb.DuckDBPyConnection:
131
 
132
  try:
133
  conn = duckdb.connect(str(db_file), read_only=False)
134
-
135
  # Enable VSS
136
  conn.execute("INSTALL vss;")
137
  conn.execute("LOAD vss;")
@@ -313,25 +313,72 @@ _qstash_client = None
313
  _qstash_verifier = None
314
 
315
  def get_qstash_client():
316
- """Singleton QStash client (unchanged)"""
 
 
 
 
 
317
  global _qstash_client
318
- if _qstash_client is None and QSTASH_TOKEN:
 
 
 
 
 
 
 
 
319
  from upstash_qstash import Client
320
- _qstash_client = Client(token=QSTASH_TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
  return _qstash_client
322
 
323
  def get_qstash_verifier():
324
- """Singleton QStash verifier (unchanged)"""
 
 
 
 
325
  global _qstash_verifier
326
- if _qstash_verifier is None:
327
- current = os.getenv("QSTASH_CURRENT_SIGNING_KEY")
328
- next_key = os.getenv("QSTASH_NEXT_SIGNING_KEY")
329
- if current and next_key:
330
- from upstash_qstash import Receiver
331
- _qstash_verifier = Receiver({
332
- "current_signing_key": current,
333
- "next_signing_key": next_key
334
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  return _qstash_verifier
336
 
337
 
 
131
 
132
  try:
133
  conn = duckdb.connect(str(db_file), read_only=False)
134
+ conn.execute("SET hnsw_enable_experimental_persistence = true")
135
  # Enable VSS
136
  conn.execute("INSTALL vss;")
137
  conn.execute("LOAD vss;")
 
313
  _qstash_verifier = None
314
 
315
  def get_qstash_client():
316
+ """Singleton QStash client.
317
+
318
+ This is optional. If the `QSTASH_TOKEN` environment variable is not set
319
+ or the `upstash_qstash` package is not installed, this function will
320
+ return `None` and log a warning/info rather than raising an ImportError.
321
+ """
322
  global _qstash_client
323
+ if _qstash_client is not None:
324
+ return _qstash_client
325
+
326
+ token = os.getenv("QSTASH_TOKEN")
327
+ if not token:
328
+ logger.info("QStash token not configured; skipping QStash client initialization")
329
+ return None
330
+
331
+ try:
332
  from upstash_qstash import Client
333
+ except Exception as e:
334
+ logger.warning("upstash_qstash package not installed; QStash disabled: %s", e)
335
+ return None
336
+
337
+ try:
338
+ qstash_url = os.getenv("QSTASH_URL")
339
+ if qstash_url:
340
+ _qstash_client = Client(token=token, url=qstash_url)
341
+ else:
342
+ _qstash_client = Client(token=token)
343
+ logger.info("βœ… QStash client initialized")
344
+ except Exception as e:
345
+ logger.warning(f"Failed to initialize QStash client: {e}")
346
+ _qstash_client = None
347
+
348
  return _qstash_client
349
 
350
def get_qstash_verifier():
    """Lazily build and cache the singleton QStash signature verifier.

    Safe to call when ``upstash_qstash`` is not installed or the signing
    keys are not configured; returns ``None`` in either case.
    """
    global _qstash_verifier

    if _qstash_verifier is not None:
        return _qstash_verifier

    current = os.getenv("QSTASH_CURRENT_SIGNING_KEY")
    next_key = os.getenv("QSTASH_NEXT_SIGNING_KEY")
    if not (current and next_key):
        logger.info("QStash signing keys not configured; skipping verifier initialization")
        return None

    try:
        from upstash_qstash import Receiver
    except Exception as e:
        logger.warning("upstash_qstash package not installed; cannot create QStash verifier: %s", e)
        return None

    try:
        signing_keys = {
            "current_signing_key": current,
            "next_signing_key": next_key
        }
        _qstash_verifier = Receiver(signing_keys)
        logger.info("βœ… QStash verifier initialized")
    except Exception as e:
        logger.warning(f"Failed to initialize QStash verifier: {e}")
        _qstash_verifier = None

    return _qstash_verifier
383
 
384
 
app/mapper.py CHANGED
@@ -25,7 +25,7 @@ from app.db import get_conn, ensure_raw_table, transactional_conn, ensure_schema
25
  from app.hybrid_entity_detector import hybrid_detect_entity_type
26
  from app.core.event_hub import event_hub
27
  from app.deps import get_sre_metrics
28
-
29
  # Prometheus metrics (free tier compatible)
30
  try:
31
  from prometheus_client import Counter, Histogram, Gauge
@@ -596,7 +596,7 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
596
  Added: SRE metrics, structured logging, pub/sub events
597
  """
598
  start_time = time.time()
599
- logger.info(f"\n[MAPPER] πŸš€ Starting pipeline for {org_id}/{source_id}")
600
 
601
  # Load aliases
602
  load_dynamic_aliases()
@@ -615,7 +615,7 @@ def canonify_df(org_id: str, source_id: str, hours_window: int = 24) -> tuple[pd
615
  ORDER BY ingested_at DESC
616
  """, (cutoff_time,)).fetchall()
617
  except Exception as e:
618
- logger.error(f"[MAPPER] ❌ SQL read error: {e}")
619
  return pd.DataFrame(), "unknown", 0.0
620
 
621
  if not rows:
 
25
  from app.hybrid_entity_detector import hybrid_detect_entity_type
26
  from app.core.event_hub import event_hub
27
  from app.deps import get_sre_metrics
28
+ from app.routers.health import emit_mapper_log
29
  # Prometheus metrics (free tier compatible)
30
  try:
31
  from prometheus_client import Counter, Histogram, Gauge
 
596
  Added: SRE metrics, structured logging, pub/sub events
597
  """
598
  start_time = time.time()
599
+ emit_mapper_log("info", f"πŸš€ Starting pipeline for {org_id}/{source_id}")
600
 
601
  # Load aliases
602
  load_dynamic_aliases()
 
615
  ORDER BY ingested_at DESC
616
  """, (cutoff_time,)).fetchall()
617
  except Exception as e:
618
+ emit_mapper_log("error", f"❌ SQL read error: {e}", error=str(e))
619
  return pd.DataFrame(), "unknown", 0.0
620
 
621
  if not rows:
app/routers/health.py CHANGED
@@ -1,190 +1,428 @@
1
  """
2
- app/routers/health.py – ENTERPRISE HEALTH & MONITORING
3
- ======================================================
4
- Provides Kubernetes-compatible probes, service status checks, and per-tenant
5
- database metrics for capacity planning and alerting.
 
 
 
 
 
 
6
  """
7
 
8
- from fastapi import APIRouter, HTTPException, Depends, Path
9
- from app.deps import check_all_services, get_redis, get_vector_db, get_duckdb
10
- from app.db import get_db_stats # Import our new stats function
11
  import os
12
  import time
13
- from app.service.llm_service import LocalLLMService
14
- from typing import Dict, Any
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
 
 
 
 
 
 
 
16
  router = APIRouter(tags=["health"])
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- @router.get("/llm-status")
20
- async def llm_status():
21
- """Check if LLM is ready"""
22
- return {
23
- "loaded": llm_service.is_loaded,
24
- "loading": llm_service._is_loading,
25
- "error": llm_service.load_error,
26
- "model": llm_service.model_id
27
- }
 
 
28
 
 
 
29
 
30
- @router.get("/health")
31
- def health_check():
32
- """
33
- Basic health check for load balancers.
34
- Returns 200 if service is alive.
35
- """
36
- return {"status": "ok", "service": "analytics-engine"}
37
 
 
 
38
 
39
- @router.get("/health/detailed")
40
- def health_detailed():
41
- """
42
- Comprehensive health check for all services.
43
- Returns detailed status of each component.
44
- """
 
 
 
 
 
 
 
 
45
  start_time = time.time()
46
- statuses = check_all_services()
47
 
48
- # Determine overall health
49
- all_healthy = all("βœ…" in str(status) for status in statuses.values())
50
- http_status = 200 if all_healthy else 503
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  return {
53
- "status": "healthy" if all_healthy else "unhealthy",
54
- "services": statuses,
55
- "environment": "production" if os.getenv("SPACE_ID") else "development",
56
  "uptime_seconds": time.time() - start_time,
57
- "timestamp": time.time()
 
 
 
 
 
 
 
 
 
 
 
 
58
  }
59
 
 
60
 
61
- @router.get("/health/ready")
62
- def health_ready():
 
 
 
 
 
63
  """
64
- Kubernetes-style readiness probe.
65
- Returns 200 if ready to serve traffic.
 
 
 
66
  """
67
- try:
68
- # Quick smoke test: Can we connect to core services?
69
- redis = get_redis()
70
- redis.ping()
71
-
72
- # Test DuckDB with a dummy org
73
- conn = get_duckdb("health_check")
74
- conn.execute("SELECT 1")
 
 
 
 
 
75
 
76
- return {"status": "ready"}
77
- except Exception as e:
78
- raise HTTPException(
79
- status_code=503,
80
- detail=f"Not ready: {str(e)}"
81
  )
 
 
 
 
 
 
 
 
 
 
 
82
 
 
83
 
84
- @router.get("/health/live")
85
- def health_live():
86
- """
87
- Kubernetes-style liveness probe.
88
- Returns 200 if service is alive (doesn't check dependencies).
89
- """
90
- return {"status": "alive"}
91
-
92
-
93
- @router.post("/health/reload")
94
- def health_reload(_: str = Depends(check_all_services)):
95
- """
96
- Trigger reload of services (if needed).
97
- Requires API key for security.
98
- """
99
- # Clear cached connections
100
- from app.deps import _org_db_connections, _vector_db_conn, _redis_client
101
 
102
- _org_db_connections.clear()
103
- _vector_db_conn = None
104
- _redis_client = None
 
 
 
 
 
 
 
 
 
105
 
106
- return {"status": "reloaded", "message": "Connections cleared"}
 
 
 
 
 
 
 
 
 
 
 
 
107
 
 
108
 
109
- @router.get("/health/metrics")
110
- def health_metrics():
111
- """
112
- Performance metrics for monitoring.
113
- """
114
  try:
115
- import psutil
 
 
 
 
116
 
117
  return {
118
- "cpu_percent": psutil.cpu_percent(),
119
- "memory_mb": psutil.virtual_memory().used // (1024 * 1024),
120
- "disk_gb": psutil.disk_usage("/").free // (1024**3),
121
- "connections": len(_org_db_connections) if '_org_db_connections' in globals() else 0
 
 
 
 
 
 
122
  }
123
- except ImportError:
124
- return {"error": "psutil not installed"}
125
-
126
 
127
- @router.get("/health/db-stats/{org_id}")
128
- def health_db_stats(org_id: str = Path(..., description="Organization ID")):
129
- """
130
- Per-tenant database statistics for capacity planning.
131
-
132
- Returns:
133
- - DB size in GB (with quota status)
134
- - Row counts per table
135
- - Total rows ingested
136
- - Schema version info
137
 
138
- Use this for:
139
- - Monitoring tenant growth
140
- - Alerting on quota breaches
141
- - Capacity planning ahead of limits
142
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  try:
144
- stats = get_db_stats(org_id)
 
145
 
146
- # Enrich with quota status
147
- quota_status = "ok"
148
- if stats["db_size_gb"] > MAX_DB_SIZE_GB * 0.8:
149
- quota_status = "warning"
150
- if stats["db_size_gb"] > MAX_DB_SIZE_GB:
151
- quota_status = "critical"
152
 
153
- # Add schema versions
154
- with get_conn(org_id) as conn:
155
- schema_versions = conn.execute("""
156
- SELECT version_id, table_name, status, created_at
157
- FROM main.schema_versions
158
- ORDER BY version_id DESC
159
- LIMIT 5
160
- """).fetchall()
161
-
162
- versions = [
163
- {
164
- "version_id": r[0],
165
- "table_name": r[1],
166
- "status": r[2],
167
- "created_at": r[3].isoformat() if r[3] else None
168
- }
169
- for r in schema_versions
170
- ]
 
171
 
172
  return {
173
- "org_id": org_id,
174
- "storage": {
175
- "size_gb": round(stats["db_size_gb"], 2),
176
- "quota_gb": MAX_DB_SIZE_GB,
177
- "status": quota_status,
178
- "percent_used": round((stats["db_size_gb"] / MAX_DB_SIZE_GB) * 100, 1)
179
- },
180
- "tables": stats["table_counts"],
181
- "total_rows": stats["total_rows"],
182
- "recent_schema_versions": versions,
183
- "timestamp": time.time()
184
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
 
186
  except Exception as e:
187
- raise HTTPException(
188
- status_code=500,
189
- detail=f"Failed to retrieve DB stats: {str(e)}"
190
- )
 
1
  """
2
+ app/routers/health.py – SRE LOG AGGREGATION HUB
3
+ ===============================================
4
+ Central observability endpoint aggregating logs from all refactored services:
5
+ - Analytics Worker
6
+ - Vector Service
7
+ - LLM Service
8
+ - Mapper/Detector
9
+ - Database Connections
10
+
11
+ Provides real-time logs, error rates, and service-specific diagnostics.
12
  """
13
 
14
from fastapi import APIRouter, HTTPException, Depends, Query, Path
from typing import Dict, Any, List, Optional

import os
import time
import json
import logging
import threading
import asyncio
# FIX: `import timedelta` was invalid (ImportError) and `import datetime`
# made every later `datetime.utcnow()` call fail -- both names live in the
# datetime module.
from datetime import datetime, timedelta

# torch is optional at runtime; the shutdown handler already checks
# `torch is not None` before using it.
try:
    import torch
except ImportError:
    torch = None

# NOTE(review): these imports create cycles -- mapper/llm_service/
# vector_service/analytics_worker each import emit_* back from this module.
# They currently resolve only because the emit_* defs precede any project
# import of this router; confirm import order, or move emitters to a
# dependency-free module.
from app.deps import (
    check_all_services, get_redis, get_vector_db, get_duckdb,
    get_sre_metrics, HF_API_TOKEN, close_all_connections
)
from app.db import get_db_stats
from app.service.llm_service import LocalLLMService, get_llm_service
from app.tasks.analytics_worker import get_worker_manager
from app.service.vector_service import VectorService
from app.mapper import health_check_mapper, MapperMetrics
# NOTE(review): StreamingResponse/Response normally come from
# fastapi.responses -- confirm app.core.event_hub really re-exports them.
from app.core.event_hub import StreamingResponse, Response

# Prometheus aggregation (optional dependency).
# FIX: the fallback now defines every name used later, so /health/metrics
# can fail gracefully instead of raising NameError; the duplicated second
# copy of this block (and of the mapper import / logger line) was removed.
try:
    from prometheus_client import generate_latest, CollectorRegistry, CONTENT_TYPE_LATEST, Gauge
except ImportError:
    CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
    generate_latest = None
    CollectorRegistry = None
    Gauge = None

logger = logging.getLogger(__name__)
53
  router = APIRouter(tags=["health"])
54
 
55
# Global log aggregator (in-memory ring buffer for recent logs)
class LogAggregator:
    """Thread-safe ring buffer storing the last ``max_size`` logs from all services.

    Entries are plain dicts with an ISO-8601 UTC ``timestamp``, ``service``,
    ``level``, ``message``, plus any extra keyword fields from the emitter.
    """

    def __init__(self, max_size: int = 1000):
        # max_size: eviction threshold for the ring buffer.
        self.max_size = max_size
        # Kept as a plain list because callers slice it directly
        # (e.g. the SSE streamer slices `buffer[last_count:]`).
        self.buffer: List[Dict[str, Any]] = []
        self.lock = threading.Lock()

    def emit(self, service: str, level: str, message: str, **kwargs):
        """Add a log entry from any service (oldest entry evicted when full)."""
        with self.lock:
            entry = {
                # NOTE(review): naive-UTC timestamps via utcnow(); kept for
                # compatibility with the string comparisons done by callers.
                "timestamp": datetime.utcnow().isoformat(),
                "service": service,
                "level": level,
                "message": message,
                **kwargs
            }
            self.buffer.append(entry)
            if len(self.buffer) > self.max_size:
                self.buffer.pop(0)

    def get_logs(self, service: Optional[str] = None, level: Optional[str] = None, limit: int = 100) -> List[Dict]:
        """Retrieve up to ``limit`` most recent logs, optionally filtered."""
        with self.lock:
            filtered = [
                log for log in self.buffer
                if (not service or log["service"] == service)
                and (not level or log["level"] == level)
            ]
            return filtered[-limit:]

    def get_error_rate(self, service: str, window_minutes: int = 5) -> float:
        """Fraction of a service's logs in the window that are error/critical.

        Returns 0.0 when the service produced no logs in the window.
        """
        cutoff_iso = (datetime.utcnow() - timedelta(minutes=window_minutes)).isoformat()
        # FIX: the original iterated self.buffer without holding the lock,
        # racing with concurrent emit() mutations.
        with self.lock:
            recent = [
                log for log in self.buffer
                if log["service"] == service and log["timestamp"] >= cutoff_iso
            ]
        if not recent:
            return 0.0
        errors = sum(1 for log in recent if log["level"] in ("error", "critical"))
        return errors / len(recent)
98
 
99
+ # Global aggregator instance
100
+ log_aggregator = LogAggregator(max_size=1000)
101
 
102
# Service-specific log emitters (to be imported by each service)

def _emit_for(service_name: str, level: str, message: str, **kwargs):
    """Forward one entry to the global aggregator under *service_name*."""
    log_aggregator.emit(service_name, level, message, **kwargs)

def emit_worker_log(level: str, message: str, **kwargs):
    """Record a log entry for the analytics worker."""
    _emit_for("analytics_worker", level, message, **kwargs)

def emit_vector_log(level: str, message: str, **kwargs):
    """Record a log entry for the vector service."""
    _emit_for("vector_service", level, message, **kwargs)

def emit_llm_log(level: str, message: str, **kwargs):
    """Record a log entry for the LLM service."""
    _emit_for("llm_service", level, message, **kwargs)

def emit_mapper_log(level: str, message: str, **kwargs):
    """Record a log entry for the mapper."""
    _emit_for("mapper", level, message, **kwargs)

def emit_deps_log(level: str, message: str, **kwargs):
    """Record a log entry for shared dependencies."""
    _emit_for("dependencies", level, message, **kwargs)
117
+
118
# ---------------------- SRE: Unified Health Endpoint ---------------------- #

@router.get("/health")
async def health_check():
    """Aggregated health status from all services.

    Probes core services plus the worker manager, LLM service and mapper,
    records a summary entry in the log aggregator, and returns a combined
    status document with per-service indicators and SRE metrics.
    """
    start_time = time.time()

    # Check all core services
    service_status = check_all_services()

    # Check worker manager health
    try:
        manager = await get_worker_manager()
        worker_metrics = manager.get_metrics()
        # FIX: /health/workers treats "active_workers" as a plain count, but
        # this handler called len() on it, which raises TypeError on an int.
        # Accept either a count or a sized collection.
        active = worker_metrics.get("active_workers", 0)
        active_count = len(active) if hasattr(active, "__len__") else int(active)
        worker_healthy = active_count < 50  # Arbitrary saturation threshold
    except Exception as e:
        worker_healthy = False
        service_status["worker_manager"] = f"❌ {e}"

    # Check LLM service
    try:
        llm = get_llm_service()
        llm_health = llm.health_check()
        llm_healthy = llm_health["status"] == "healthy"
    except Exception as e:
        llm_healthy = False
        service_status["llm_service"] = f"❌ {e}"

    # Check mapper cache health
    try:
        mapper_health = health_check_mapper()
        mapper_healthy = mapper_health["status"] == "healthy"
    except Exception as e:
        mapper_healthy = False
        service_status["mapper"] = f"❌ {e}"

    # Overall health determination
    all_healthy = (
        all("βœ…" in str(v) for v in service_status.values()) and
        worker_healthy and llm_healthy and mapper_healthy
    )

    # Emit aggregated health log
    log_aggregator.emit(
        "health_router", "info" if all_healthy else "error",
        "Health check completed",
        all_healthy=all_healthy,
        services_checked=len(service_status),
        duration_ms=(time.time() - start_time) * 1000
    )

    return {
        "status": "healthy" if all_healthy else "degraded",
        "timestamp": datetime.utcnow().isoformat(),
        # NOTE(review): this measures the duration of this request handler,
        # not process uptime; key name kept for API compatibility.
        "uptime_seconds": time.time() - start_time,
        "environment": "production" if os.getenv("SPACE_ID") else "development",
        "services": {
            **service_status,
            "worker_manager": "βœ… healthy" if worker_healthy else "❌ unhealthy",
            "llm_service": "βœ… healthy" if llm_healthy else "❌ unhealthy",
            "mapper": "βœ… healthy" if mapper_healthy else "❌ unhealthy"
        },
        "sre_metrics": get_sre_metrics(),
        "_links": {
            "logs": "/health/logs",
            "metrics": "/health/metrics",
            "status": "/health/status"
        }
    }
187
 
188
# ---------------------- SRE: Real-Time Log Streaming ---------------------- #

@router.get("/health/logs")
async def get_service_logs(
    service: Optional[str] = Query(None, description="Filter by service (analytics_worker, vector_service, llm_service, mapper, dependencies)"),
    level: Optional[str] = Query(None, description="Filter by level (info, warning, error, critical)"),
    limit: int = Query(100, ge=1, le=1000, description="Number of logs to return"),
    tail: bool = Query(False, description="Stream logs in real-time (SSE)")
):
    """
    Retrieve recent logs from all services or filter by service/level.

    Examples:
    - GET /health/logs?service=vector_service&level=error
    - GET /health/logs?service=analytics_worker&tail=true (SSE stream)
    """
    if tail:
        # SSE streaming of logs
        async def log_stream():
            last_count = len(log_aggregator.buffer)
            while True:
                # FIX: snapshot under the aggregator's lock -- the original
                # sliced the live buffer while emit() could mutate it.
                with log_aggregator.lock:
                    snapshot = list(log_aggregator.buffer)
                current_count = len(snapshot)
                # Clamp a stale cursor so slicing never goes out of range.
                # NOTE(review): once the ring buffer is full its length stops
                # growing, so a pure length-based cursor can silently miss
                # entries; a monotonically increasing sequence number on each
                # entry would be the complete fix.
                if last_count > current_count:
                    last_count = current_count
                if current_count > last_count:
                    for log in snapshot[last_count:]:
                        if (not service or log["service"] == service) and (not level or log["level"] == level):
                            yield f"data: {json.dumps(log)}\n\n"
                    last_count = current_count
                await asyncio.sleep(0.5)

        return StreamingResponse(
            log_stream(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache"}
        )

    # Return historical logs
    logs = log_aggregator.get_logs(service=service, level=level, limit=limit)

    return {
        "status": "success",
        "logs": logs,
        "total": len(logs),
        "service": service or "all",
        "level": level or "all"
    }
+ }
234
 
235
+ # ---------------------- SRE: Error Rate Tracking ---------------------- #
236
 
237
+ @router.get("/health/error-rates")
238
+ async def get_error_rates(
239
+ window_minutes: int = Query(5, ge=1, le=60, description="Time window in minutes")
240
+ ):
241
+ """Get error rates for all services over the specified time window"""
242
+ services = ["analytics_worker", "vector_service", "llm_service", "mapper", "dependencies"]
 
 
 
 
 
 
 
 
 
 
 
243
 
244
+ rates = {}
245
+ for service in services:
246
+ rates[service] = {
247
+ "error_rate": log_aggregator.get_error_rate(service, window_minutes),
248
+ "window_minutes": window_minutes
249
+ }
250
+
251
+ # Overall system error rate
252
+ total_logs = sum(len([log for log in log_aggregator.buffer if log["timestamp"] >= (datetime.utcnow() - timedelta(minutes=window_minutes)).isoformat()]) for _ in services)
253
+ total_errors = sum(len([log for log in log_aggregator.buffer if log["level"] in ("error", "critical") and log["timestamp"] >= (datetime.utcnow() - timedelta(minutes=window_minutes)).isoformat()]) for _ in services)
254
+
255
+ overall_rate = total_errors / total_logs if total_logs > 0 else 0.0
256
 
257
+ # Alert if error rate is high
258
+ alert = overall_rate > 0.1 # 10% error rate threshold
259
+
260
+ if alert:
261
+ log_aggregator.emit("health_router", "error", "High system error rate detected", rate=overall_rate)
262
+
263
+ return {
264
+ "status": "healthy" if not alert else "alerting",
265
+ "overall_error_rate": round(overall_rate, 4),
266
+ "service_rates": rates,
267
+ "window_minutes": window_minutes,
268
+ "alert": alert
269
+ }
270
 
271
# ---------------------- SRE: Service-Specific Health ---------------------- #

@router.get("/health/workers")
async def health_workers():
    """Analytics worker health and metrics"""
    try:
        manager = await get_worker_manager()
        metrics = manager.get_metrics()

        failed = metrics.get("workers_failed", 0)
        recent = log_aggregator.get_logs(service="analytics_worker", limit=50)

        # Degraded once ten or more workers have failed.
        return {
            "status": "healthy" if failed < 10 else "degraded",
            "active_workers": metrics.get("active_workers", 0),
            "triggers_processed": metrics.get("triggers_processed", 0),
            "workers_failed": failed,
            "total_latency_ms": metrics.get("total_latency_ms", 0),
            "recent_logs": recent,
            "_links": {
                "logs": "/health/logs?service=analytics_worker",
                "stream": "/api/v1/analytics/stream/sse"
            }
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
 
297
 
298
@router.get("/health/vectors")
async def health_vectors():
    """Vector service health and metrics"""
    try:
        # Probe with a throwaway service instance bound to a sentinel org.
        probe = VectorService(org_id="health_check")
        recent = log_aggregator.get_logs(service="vector_service", limit=50)

        # NOTE(review): reaches into private members (_global_model_cache,
        # _check_circuit_breaker) -- confirm these remain stable internals
        # of VectorService.
        return {
            "status": "healthy",
            "model_cached": len(probe._global_model_cache) > 0,
            "redis_type": "tcp" if hasattr(probe.vector_conn, 'pubsub') else "upstash",
            "recent_logs": recent,
            "circuit_breaker": probe._check_circuit_breaker(),
            "_links": {
                "logs": "/health/logs?service=vector_service",
                "metrics": "/health/metrics/vector"
            }
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
321
+
322
@router.get("/health/llm")
async def health_llm():
    """LLM service health and metrics"""
    try:
        service = get_llm_service()
        report = {**service.health_check()}

        # Attach recent logs and navigation links to the service's own report.
        report["recent_logs"] = log_aggregator.get_logs(service="llm_service", limit=50)
        report["_links"] = {
            "logs": "/health/logs?service=llm_service",
            "generate": "/api/v1/generate"
        }
        return report
    except Exception as e:
        return {"status": "error", "error": str(e)}
342
+
343
@router.get("/health/mapper")
async def health_mapper():
    """Mapper service health and metrics"""
    try:
        report = health_check_mapper()
        recent = log_aggregator.get_logs(service="mapper", limit=50)

        links = {
            "logs": "/health/logs?service=mapper",
            # NOTE(review): a count inside a _links map is unusual -- kept
            # as-is to preserve the response shape.
            "canonical_columns": len(report.get("canonical_columns", []))
        }
        return {
            **report,
            "recent_logs": recent,
            "_links": links
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
362
+
363
# ---------------------- SRE: Prometheus Metrics ---------------------- #

@router.get("/health/metrics")
async def get_prometheus_metrics():
    """
    Return aggregated Prometheus metrics from all services
    Compatible with Prometheus scraping

    Raises 503 when prometheus_client is not installed.
    """
    # FIX: the original raised NameError (generate_latest/CollectorRegistry
    # undefined) when prometheus_client was absent; fail with an explicit 503.
    # Gauge is None exactly when the import at module top failed.
    if Gauge is None:
        raise HTTPException(status_code=503, detail="prometheus_client is not installed")

    registry = CollectorRegistry()

    # Aggregate metrics from all services
    sre_metrics = get_sre_metrics()

    # Create gauges for SRE metrics (dict-valued entries are keyed by org_id)
    for metric_name, values in sre_metrics.items():
        if isinstance(values, dict):
            gauge = Gauge(f'sre_{metric_name}', f'SRE {metric_name}', ['org_id'], registry=registry)
            for org_id, value in values.items():
                gauge.labels(org_id=org_id).set(value)

    # FIX: get_error_rate("all", 5) always returned 0.0 because no log entry
    # is ever emitted under the service name "all"; compute the overall rate
    # across every buffered entry in the window instead.
    cutoff_iso = (datetime.utcnow() - timedelta(minutes=5)).isoformat()
    with log_aggregator.lock:
        window_logs = [log for log in log_aggregator.buffer if log["timestamp"] >= cutoff_iso]
    error_count = sum(1 for log in window_logs if log["level"] in ("error", "critical"))
    overall_rate = error_count / len(window_logs) if window_logs else 0.0

    error_rate_gauge = Gauge('system_error_rate', 'Overall system error rate', registry=registry)
    error_rate_gauge.set(overall_rate)

    # Add service health status (healthy = error rate under 10% in 5 min)
    health_gauge = Gauge('service_health', 'Service health status (1=healthy)', ['service'], registry=registry)
    services = ["analytics_worker", "vector_service", "llm_service", "mapper", "dependencies"]
    for service in services:
        is_healthy = log_aggregator.get_error_rate(service, 5) < 0.1
        health_gauge.labels(service=service).set(1 if is_healthy else 0)

    return Response(
        content=generate_latest(registry),
        media_type=CONTENT_TYPE_LATEST
    )
398
+
399
# ---------------------- SRE: Shutdown Handler ---------------------- #

@router.post("/health/shutdown")
async def shutdown_services():
    """Graceful shutdown - close all connections.

    Releases the LLM model (and its GPU cache when torch is available),
    then shuts down the worker manager. Raises 500 on failure.
    """
    try:
        # Release the LLM model.
        # FIX: the original performed this exact block twice (before and
        # after the worker shutdown); the second pass was always a no-op
        # because hasattr() fails once the attribute is deleted.
        llm_service = get_llm_service()
        if hasattr(llm_service, '_model') and llm_service._model:
            del llm_service._model
            # Free GPU memory only when torch was importable.
            if 'torch' in globals() and torch is not None:
                torch.cuda.empty_cache()

        # Shutdown worker manager
        manager = await get_worker_manager()
        manager.shutdown()

        log_aggregator.emit("health_router", "info", "Shutdown completed")

        return {"status": "shutdown_complete"}
    except Exception as e:
        log_aggregator.emit("health_router", "error", f"Shutdown failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 
 
app/service/llm_service.py CHANGED
@@ -25,6 +25,7 @@ from typing import Optional, Dict, Any, List, Callable
25
  from dataclasses import dataclass, asdict
26
  import psutil # For resource monitoring
27
  from fastapi import HTTPException
 
28
  # Prometheus metrics (free tier compatible)
29
  try:
30
  from prometheus_client import Counter, Histogram, Gauge
@@ -344,7 +345,7 @@ class LocalLLMService:
344
  # βœ… SRE: Update gauge
345
  self.model_loaded_gauge.labels(org_id=self.org_id).set(1)
346
 
347
- logger.info("βœ… [BACKGROUND] LLM loaded successfully")
348
 
349
  except Exception as e:
350
  logger.error(f"❌ [BACKGROUND] LLM loading failed: {e}")
@@ -530,7 +531,7 @@ class LocalLLMService:
530
  raise
531
 
532
  except Exception as e:
533
- logger.error(f"[ASYNC] ❌ Generation error: {e}")
534
 
535
  self.inference_requests.labels(
536
  org_id=self.org_id,
 
25
  from dataclasses import dataclass, asdict
26
  import psutil # For resource monitoring
27
  from fastapi import HTTPException
28
+ from app.routers.health import emit_llm_log
29
  # Prometheus metrics (free tier compatible)
30
  try:
31
  from prometheus_client import Counter, Histogram, Gauge
 
345
  # βœ… SRE: Update gauge
346
  self.model_loaded_gauge.labels(org_id=self.org_id).set(1)
347
 
348
+ emit_llm_log("info", "βœ… LLM loaded successfully", model_id=self.model_id)
349
 
350
  except Exception as e:
351
  logger.error(f"❌ [BACKGROUND] LLM loading failed: {e}")
 
531
  raise
532
 
533
  except Exception as e:
534
+ emit_llm_log("error", f"❌ Generation failed: {e}", error=str(e))
535
 
536
  self.inference_requests.labels(
537
  org_id=self.org_id,
app/service/vector_service.py CHANGED
@@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer
11
  import logging
12
  from datetime import datetime, timedelta
13
  from enum import Enum
14
-
15
  logger = logging.getLogger(__name__)
16
 
17
 
@@ -250,7 +250,8 @@ class VectorService:
250
  if (i // batch_size + 1) % 5 == 0:
251
  logger.debug(f"[Embed] Batch {i//batch_size + 1}/{total_batches}")
252
 
253
- logger.info(f"[Embed] βœ… Generated {len(embeddings)} embeddings")
 
254
  return embeddings
255
 
256
  # ====== REFACTORED: TCP Redis pipeline + pub/sub ======
@@ -372,7 +373,7 @@ class VectorService:
372
  }
373
  )
374
 
375
- logger.error(f"[❌ VECTOR] Redis error: {e}")
376
  return False
377
 
378
  # ====== Existing methods (polished with metrics) ======
 
11
  import logging
12
  from datetime import datetime, timedelta
13
  from enum import Enum
14
+ from app.routers.health import emit_vector_log
15
  logger = logging.getLogger(__name__)
16
 
17
 
 
250
  if (i // batch_size + 1) % 5 == 0:
251
  logger.debug(f"[Embed] Batch {i//batch_size + 1}/{total_batches}")
252
 
253
+ emit_vector_log("info", f"βœ… Generated {len(embeddings)} embeddings",
254
+ org_id=self.org_id, vector_count=len(embeddings))
255
  return embeddings
256
 
257
  # ====== REFACTORED: TCP Redis pipeline + pub/sub ======
 
373
  }
374
  )
375
 
376
+ emit_vector_log("error", f"❌ Redis error: {e}", error=str(e))
377
  return False
378
 
379
  # ====== Existing methods (polished with metrics) ======
app/tasks/analytics_worker.py CHANGED
@@ -26,6 +26,7 @@ from app.schemas.org_schema import OrgSchema
26
  from app.service.vector_service import VectorService, VectorStoreEventType, VectorMetrics
27
  from app.engine.kpi_calculators.registry import get_kpi_calculator_async
28
  from app.service.embedding_service import EmbeddingService
 
29
 
30
  # Configure structured logging for SRE tools (Loki, etc.)
31
  logging.basicConfig(
@@ -169,7 +170,7 @@ class AnalyticsWorker:
169
  if not await self._acquire_lock():
170
  return {"status": "skipped", "reason": "lock_failed"}
171
 
172
- logger.info(f"\n[WORKER] πŸš€ STARTING {worker_id}")
173
 
174
  # STEP 2: Load entity info from Redis
175
  await self._load_entity_from_redis()
@@ -245,7 +246,7 @@ class AnalyticsWorker:
245
  return results
246
 
247
  except Exception as e:
248
- logger.error(f"[WORKER] ❌ CRITICAL: {e}", exc_info=True)
249
  await self._publish_status("error", str(e))
250
 
251
  # Publish error event
 
26
  from app.service.vector_service import VectorService, VectorStoreEventType, VectorMetrics
27
  from app.engine.kpi_calculators.registry import get_kpi_calculator_async
28
  from app.service.embedding_service import EmbeddingService
29
+ from app.routers.health import emit_worker_log
30
 
31
  # Configure structured logging for SRE tools (Loki, etc.)
32
  logging.basicConfig(
 
170
  if not await self._acquire_lock():
171
  return {"status": "skipped", "reason": "lock_failed"}
172
 
173
+ emit_worker_log("info", f"πŸš€ STARTING {worker_id}", worker_id=worker_id)
174
 
175
  # STEP 2: Load entity info from Redis
176
  await self._load_entity_from_redis()
 
246
  return results
247
 
248
  except Exception as e:
249
+ emit_worker_log("error", f"❌ CRITICAL: {e}", error=str(e))
250
  await self._publish_status("error", str(e))
251
 
252
  # Publish error event