Spaces:

DaCrow13
/

Hopcroft-Skill-Classification

Running

App Files Community

Mauro Carlucci committed 25 days ago

Commit

7f13b65

unverified ·

2 Parent(s): 27bcd6a 0b25b12

Merge pull request #41 from se4ai2526-uniba/Fix-Grafana-Prometheus

Browse files

Files changed (12) hide show

Dockerfile +22 -5
docker/nginx.conf +35 -0
docker/scripts/start_space.sh +62 -24
docs/design_choices.md +6 -5
docs/milestone_summaries.md +6 -4
docs/user_guide.md +6 -5
monitoring/alertmanager/config.yml +7 -7
monitoring/grafana/dashboards/hopcroft_dashboard.json +98 -35
monitoring/grafana/provisioning/dashboards/dashboard.yml +3 -3
monitoring/grafana/provisioning/dashboards/hopcroft_dashboard.json +0 -358
monitoring/grafana/provisioning/datasources/prometheus.yml +1 -1
monitoring/prometheus/prometheus.yml +11 -11

Dockerfile CHANGED Viewed

@@ -40,9 +40,26 @@ RUN wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prom
     mkdir -p /etc/prometheus /var/lib/prometheus && \
     rm -rf prometheus-*
-COPY monitoring/grafana/provisioning /etc/grafana/provisioning
-COPY monitoring/grafana/dashboards /var/lib/grafana/dashboards
-COPY monitoring/prometheus/prometheus.yml /etc/prometheus/prometheus.yml
 # Copy requirements first for caching
 COPY requirements.txt .
@@ -58,8 +75,8 @@ COPY --chown=user:user . .
 # Ensure the user has permissions on the app directory (needed for dvc init if .dvc is missing)
 RUN chown -R user:user /app
-# Fix line endings and permissions for the start script
-RUN dos2unix docker/scripts/start_space.sh && \
     chmod +x docker/scripts/start_space.sh
 # Install the project itself

     mkdir -p /etc/prometheus /var/lib/prometheus && \
     rm -rf prometheus-*
+# Alertmanager
+RUN wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz && \
+    tar xvfz alertmanager-*.tar.gz && \
+    mv alertmanager-*/alertmanager /usr/local/bin/ && \
+    mv alertmanager-*/amtool /usr/local/bin/ && \
+    mkdir -p /etc/alertmanager /var/lib/alertmanager && \
+    rm -rf alertmanager-*
+# Pushgateway
+RUN wget https://github.com/prometheus/pushgateway/releases/download/v1.6.0/pushgateway-1.6.0.linux-amd64.tar.gz && \
+    tar xvfz pushgateway-*.tar.gz && \
+    mv pushgateway-*/pushgateway /usr/local/bin/ && \
+    rm -rf pushgateway-*
+COPY --chown=user monitoring/grafana/provisioning /etc/grafana/provisioning
+COPY --chown=user monitoring/grafana/dashboards /var/lib/grafana/dashboards
+COPY --chown=user monitoring/prometheus/prometheus.yml /etc/prometheus/prometheus.yml
+COPY --chown=user monitoring/prometheus/alert_rules.yml /etc/prometheus/alert_rules.yml
+COPY --chown=user monitoring/alertmanager/config.yml /etc/alertmanager/config.yml
 # Copy requirements first for caching
 COPY requirements.txt .
 # Ensure the user has permissions on the app directory (needed for dvc init if .dvc is missing)
 RUN chown -R user:user /app
+# Fix line endings and permissions for the start script and configs
+RUN find . -name "*.sh" -o -name "*.yml" -o -name "*.ini" -o -name "*.json" | xargs dos2unix && \
     chmod +x docker/scripts/start_space.sh
 # Install the project itself

docker/nginx.conf CHANGED Viewed

@@ -33,6 +33,14 @@ http {
         server 127.0.0.1:9090;
     }
     server {
         listen 7860;
         server_name localhost;
@@ -73,6 +81,15 @@ http {
             proxy_set_header Host $host;
         }
         # Grafana
         location = /grafana {
             return 301 /grafana/;
@@ -108,6 +125,24 @@ http {
             proxy_set_header X-Forwarded-Proto $scheme;
         }
         # Streamlit (Catch-all)
         location / {
             proxy_pass http://streamlit;

         server 127.0.0.1:9090;
     }
+    upstream alertmanager {
+        server 127.0.0.1:9093;
+    }
+    upstream pushgateway {
+        server 127.0.0.1:9091;
+    }
     server {
         listen 7860;
         server_name localhost;
             proxy_set_header Host $host;
         }
+        # FastAPI Metrics endpoint for Prometheus
+        location /metrics {
+            proxy_pass http://fastapi/metrics;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
         # Grafana
         location = /grafana {
             return 301 /grafana/;
             proxy_set_header X-Forwarded-Proto $scheme;
         }
+        # Alertmanager UI
+        location /alertmanager/ {
+            proxy_pass http://alertmanager;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+        # Pushgateway UI
+        location /pushgateway/ {
+            proxy_pass http://pushgateway;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
         # Streamlit (Catch-all)
         location / {
             proxy_pass http://streamlit;

docker/scripts/start_space.sh CHANGED Viewed

@@ -53,23 +53,47 @@ for i in {1..30}; do
 done
 echo "$(date) - Configuring and starting Prometheus..."
-# Create a config for the space
-cat <<EOF > /tmp/prometheus.yml
-global:
-  scrape_interval: 15s
-  evaluation_interval: 15s
-scrape_configs:
-  - job_name: 'hopcroft-api'
-    metrics_path: '/metrics'
-    static_configs:
-      - targets: ['127.0.0.1:8000']
-    scrape_interval: 10s
-  - job_name: 'prometheus'
-    static_configs:
-      - targets: ['127.0.0.1:9090']
-EOF
 # Determine Prometheus External URL
 # Always use relative path so it works on both huggingface.co and .hf.space domains
@@ -101,15 +125,23 @@ else
     GRAFANA_ROOT_URL="http://localhost:3000/grafana/"
 fi
 echo "$(date) - Starting Grafana with Root URL: $GRAFANA_ROOT_URL"
-grafana-server --homepath=/usr/share/grafana \
     --config=/app/monitoring/grafana/grafana.ini \
-    cfg:default.paths.data=/tmp/grafana_data \
-    cfg:default.paths.logs=/tmp/grafana_logs \
-    cfg:default.paths.plugins=/usr/share/grafana/plugins \
     cfg:server.root_url="$GRAFANA_ROOT_URL" \
     cfg:server.serve_from_sub_path=true \
-    >> /tmp/grafana.log 2>&1 &
 # Wait for Grafana to start
 echo "$(date) - Waiting for Grafana (20s)..."
@@ -124,6 +156,12 @@ for i in {1..20}; do
     sleep 1
 done
 echo "$(date) - Starting Nginx reverse proxy..."
 if ! command -v nginx &> /dev/null; then
@@ -166,5 +204,5 @@ for i in {1..30}; do
     sleep 2
 done
-echo "$(date) - Process started. Tailing Nginx logs for debug..."
-tail -f /tmp/nginx_startup.log /tmp/fastapi.log

 done
 echo "$(date) - Configuring and starting Prometheus..."
+# Patch Grafana Datasource for Localhost (HF Space) and fix URL path
+# Replace prometheus:9090 with 127.0.0.1:9090/prometheus in all datasource configs
+find /app/monitoring/grafana/provisioning/datasources -name '*.yml' -exec sed -i 's/prometheus:9090/127.0.0.1:9090\/prometheus/g' {} +
+# Copy production configs to /tmp for modification
+cp /etc/prometheus/prometheus.yml /tmp/prometheus.yml
+cp /etc/prometheus/alert_rules.yml /tmp/alert_rules.yml
+cp /etc/alertmanager/config.yml /tmp/alertmanager.yml
+# Modify Prometheus config for local execution (replace docker-compose service names with localhost)
+# hopcroft-api:8080 -> 127.0.0.1:8000 (API runs on 8000 in Space)
+sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/prometheus.yml
+# Alertmanager: hopcroft-api:8080 -> 127.0.0.1:8000
+sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/alertmanager.yml
+# alertmanager:9093 -> 127.0.0.1:9093
+sed -i 's/alertmanager:9093/127.0.0.1:9093/g' /tmp/prometheus.yml
+# pushgateway:9091 -> 127.0.0.1:9091
+sed -i 's/pushgateway:9091/127.0.0.1:9091/g' /tmp/prometheus.yml
+# Point the rule_files entry at the absolute path of the copied rules file (/tmp/alert_rules.yml)
+sed -i 's|"alert_rules.yml"|"/tmp/alert_rules.yml"|g' /tmp/prometheus.yml
+# FIX: Add path prefixes to match --web.route-prefix arguments
+# Add metrics_path for self-scraping prometheus
+sed -i 's/job_name: "prometheus"/job_name: "prometheus"\n    metrics_path: "\/prometheus\/metrics"/g' /tmp/prometheus.yml
+# Add metrics_path for pushgateway
+sed -i 's/job_name: "pushgateway"/job_name: "pushgateway"\n    metrics_path: "\/pushgateway\/metrics"/g' /tmp/prometheus.yml
+# Add path_prefix for Alertmanager
+sed -i 's/    - static_configs:/    - path_prefix: "\/alertmanager\/"\n      static_configs:/g' /tmp/prometheus.yml
+echo "$(date) - Starting Alertmanager..."
+alertmanager \
+    --config.file=/tmp/alertmanager.yml \
+    --storage.path=/tmp/alertmanager_data \
+    --web.route-prefix=/alertmanager/ \
+    >> /tmp/alertmanager.log 2>&1 &
+echo "$(date) - Starting Pushgateway..."
+pushgateway \
+    --persistence.file=/tmp/pushgateway_data \
+    --web.route-prefix=/pushgateway/ \
+    >> /tmp/pushgateway.log 2>&1 &
 # Determine Prometheus External URL
 # Always use relative path so it works on both huggingface.co and .hf.space domains
     GRAFANA_ROOT_URL="http://localhost:3000/grafana/"
 fi
+# Locate Grafana binary
+GRAFANA_BIN=$(which grafana-server || echo "/usr/sbin/grafana-server")
+echo "$(date) - Found Grafana binary at: $GRAFANA_BIN"
 echo "$(date) - Starting Grafana with Root URL: $GRAFANA_ROOT_URL"
+# Use the project's grafana.ini which we have permissions to read
+$GRAFANA_BIN --homepath=/usr/share/grafana \
     --config=/app/monitoring/grafana/grafana.ini \
+    cfg:paths.data=/tmp/grafana_data \
+    cfg:paths.logs=/tmp/grafana_logs \
+    cfg:paths.plugins=/usr/share/grafana/plugins \
+    cfg:paths.provisioning=/app/monitoring/grafana/provisioning \
     cfg:server.root_url="$GRAFANA_ROOT_URL" \
     cfg:server.serve_from_sub_path=true \
+    cfg:server.http_port=3000 \
+    > /tmp/grafana.log 2>&1 &
 # Wait for Grafana to start
 echo "$(date) - Waiting for Grafana (20s)..."
     sleep 1
 done
+# If Grafana is still down, print logs
+if ! curl -s http://127.0.0.1:3000/api/health > /dev/null; then
+    echo "$(date) - ERROR: Grafana failed to start within 20 seconds. Dumping logs:"
+    cat /tmp/grafana.log
+fi
 echo "$(date) - Starting Nginx reverse proxy..."
 if ! command -v nginx &> /dev/null; then
     sleep 2
 done
+echo "$(date) - Process started. Tailing logs for debug..."
+tail -f /tmp/nginx_startup.log /tmp/fastapi.log /tmp/grafana.log /tmp/prometheus.log

docs/design_choices.md CHANGED Viewed

@@ -465,11 +465,12 @@ async def monitor_requests(request, call_next):
 ### Grafana Visualization
 **Dashboard Panels:**
-1. API Request Rate (time series)
-2. API Latency Percentiles (heatmap)
-3. Drift Detection Status (stat panel)
-4. Drift P-Value Trend (time series)
-5. Error Rate (gauge)
 **Data Sources:**
 - Prometheus: Real-time metrics

 ### Grafana Visualization
 **Dashboard Panels:**
+1. Request Rate (gauge)
+2. Request Latency p50/p95 (time series)
+3. In-Progress Requests (stat panel)
+4. Error Rate 5xx (stat panel)
+5. Model Prediction Time (time series)
+6. Requests by Endpoint (bar chart)
 **Data Sources:**
 - Prometheus: Real-time metrics

docs/milestone_summaries.md CHANGED Viewed

@@ -242,10 +242,12 @@ Jobs:
 ### Grafana Dashboards
-- **API Request Rate**: Real-time requests per second
-- **API Latency**: P50, P90, P99 percentiles
-- **Drift Detection Status**: Binary indicator (0/1)
-- **Drift P-Value**: Statistical significance metric
 ### Data Drift Detection

 ### Grafana Dashboards
+- **Request Rate**: Real-time requests per second
+- **Request Latency (p50, p95)**: Response time percentiles
+- **In-Progress Requests**: Currently processing requests
+- **Error Rate (5xx)**: Failed request percentage
+- **Model Prediction Time**: Inference latency
+- **Requests by Endpoint**: Traffic distribution
 ### Data Drift Detection

docs/user_guide.md CHANGED Viewed

@@ -406,11 +406,12 @@ The pre-configured dashboard includes:
 | Panel | Description |
 |-------|-------------|
-| API Request Rate | Real-time requests per endpoint |
-| API Latency | Response time distribution |
-| Drift Detection Status | Binary indicator (0=No Drift, 1=Drift) |
-| Drift P-Value | Statistical significance |
-| Drift Distance | KS test distance metric |
 ### Data Drift Detection

 | Panel | Description |
 |-------|-------------|
+| Request Rate | Real-time requests per second |
+| Request Latency (p50, p95) | Response time percentiles |
+| In-Progress Requests | Currently processing requests |
+| Error Rate (5xx) | Percentage of failed requests |
+| Model Prediction Time | Average model inference latency |
+| Requests by Endpoint | Traffic distribution per endpoint |
 ### Data Drift Detection

monitoring/alertmanager/config.yml CHANGED Viewed

@@ -2,20 +2,20 @@ global:
   resolve_timeout: 5m
 route:
-  group_by: ['alertname', 'severity']
   group_wait: 10s
   group_interval: 10s
   repeat_interval: 1h
-  receiver: 'log-receiver'
 receivers:
-  - name: 'log-receiver'
     webhook_configs:
-      - url: 'http://hopcroft-api:8080/health'
 inhibition_rules:
   - source_match:
-      severity: 'critical'
     target_match:
-      severity: 'warning'
-    equal: ['alertname', 'dev', 'instance']

   resolve_timeout: 5m
 route:
+  group_by: ["alertname", "severity"]
   group_wait: 10s
   group_interval: 10s
   repeat_interval: 1h
+  receiver: "log-receiver"
 receivers:
+  - name: "log-receiver"
     webhook_configs:
+      - url: "http://hopcroft-api:8080/health"
 inhibition_rules:
   - source_match:
+      severity: "critical"
     target_match:
+      severity: "warning"
+    equal: ["alertname", "dev", "instance"]

monitoring/grafana/dashboards/hopcroft_dashboard.json CHANGED Viewed

@@ -62,7 +62,7 @@
       "pluginVersion": "9.0.0",
       "targets": [
         {
-          "expr": "rate(fastapi_requests_total[1m])",
           "refId": "A"
         }
       ],
@@ -131,12 +131,12 @@
       "pluginVersion": "9.0.0",
       "targets": [
         {
-          "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
           "legendFormat": "p95",
           "refId": "A"
         },
         {
-          "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
           "legendFormat": "p50 (median)",
           "refId": "B"
         }
@@ -152,32 +152,25 @@
           "color": {
             "mode": "thresholds"
           },
-          "mappings": [
-            {
-              "options": {
-                "0": {
-                  "color": "red",
-                  "index": 1,
-                  "text": "No Drift"
-                },
-                "1": {
-                  "color": "green",
-                  "index": 0,
-                  "text": "Drift Detected"
-                }
-              },
-              "type": "value"
-            }
-          ],
           "thresholds": {
             "mode": "absolute",
             "steps": [
               {
                 "color": "green",
                 "value": null
               }
             ]
-          }
         }
       },
       "gridPos": {
@@ -201,13 +194,13 @@
       "pluginVersion": "9.0.0",
       "targets": [
         {
-          "expr": "drift_detected",
           "refId": "A"
         }
       ],
-      "title": "Data Drift Status",
       "type": "stat",
-      "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
     },
     {
       "datasource": "Prometheus",
@@ -216,7 +209,7 @@
           "color": {
             "mode": "thresholds"
           },
-          "decimals": 4,
           "mappings": [],
           "thresholds": {
             "mode": "absolute",
@@ -235,7 +228,7 @@
               }
             ]
           },
-          "unit": "short"
         }
       },
       "gridPos": {
@@ -259,13 +252,13 @@
       "pluginVersion": "9.0.0",
       "targets": [
         {
-          "expr": "drift_p_value",
           "refId": "A"
         }
       ],
-      "title": "Drift P-Value",
       "type": "stat",
-      "description": "Statistical significance of detected drift (lower = more significant)"
     },
     {
       "datasource": "Prometheus",
@@ -305,7 +298,7 @@
               }
             ]
           },
-          "unit": "short"
         }
       },
       "gridPos": {
@@ -328,14 +321,84 @@
       "pluginVersion": "9.0.0",
       "targets": [
         {
-          "expr": "drift_distance",
-          "legendFormat": "Distance",
           "refId": "A"
         }
       ],
-      "title": "Drift Distance Over Time",
       "type": "timeseries",
-      "description": "Statistical distance between baseline and current data distribution"
     }
   ],
   "refresh": "10s",
@@ -353,6 +416,6 @@
   "timezone": "",
   "title": "Hopcroft ML Model Monitoring",
   "uid": "hopcroft-ml-dashboard",
-  "version": 1,
   "weekStart": ""
 }

       "pluginVersion": "9.0.0",
       "targets": [
         {
+          "expr": "sum(rate(hopcroft_requests_total[1m]))",
           "refId": "A"
         }
       ],
       "pluginVersion": "9.0.0",
       "targets": [
         {
+          "expr": "histogram_quantile(0.95, sum(rate(hopcroft_request_duration_seconds_bucket[5m])) by (le)) * 1000",
           "legendFormat": "p95",
           "refId": "A"
         },
         {
+          "expr": "histogram_quantile(0.50, sum(rate(hopcroft_request_duration_seconds_bucket[5m])) by (le)) * 1000",
           "legendFormat": "p50 (median)",
           "refId": "B"
         }
           "color": {
             "mode": "thresholds"
           },
+          "mappings": [],
           "thresholds": {
             "mode": "absolute",
             "steps": [
               {
                 "color": "green",
                 "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "red",
+                "value": 5
               }
             ]
+          },
+          "unit": "short"
         }
       },
       "gridPos": {
       "pluginVersion": "9.0.0",
       "targets": [
         {
+          "expr": "sum(hopcroft_in_progress_requests)",
           "refId": "A"
         }
       ],
+      "title": "In-Progress Requests",
       "type": "stat",
+      "description": "Number of requests currently being processed"
     },
     {
       "datasource": "Prometheus",
           "color": {
             "mode": "thresholds"
           },
+          "decimals": 2,
           "mappings": [],
           "thresholds": {
             "mode": "absolute",
               }
             ]
           },
+          "unit": "percentunit"
         }
       },
       "gridPos": {
       "pluginVersion": "9.0.0",
       "targets": [
         {
+          "expr": "sum(rate(hopcroft_requests_total{http_status=~\"5..\"}[5m])) / sum(rate(hopcroft_requests_total[5m]))",
           "refId": "A"
         }
       ],
+      "title": "Error Rate (5xx)",
       "type": "stat",
+      "description": "Percentage of requests resulting in 5xx errors"
     },
     {
       "datasource": "Prometheus",
               }
             ]
           },
+          "unit": "s"
         }
       },
       "gridPos": {
       "pluginVersion": "9.0.0",
       "targets": [
         {
+          "expr": "rate(hopcroft_prediction_processing_seconds_sum[5m]) / rate(hopcroft_prediction_processing_seconds_count[5m])",
+          "legendFormat": "Avg Prediction Time",
+          "refId": "A"
+        }
+      ],
+      "title": "Model Prediction Time",
+      "type": "timeseries",
+      "description": "Average time spent processing model predictions"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "bars",
+            "fillOpacity": 80,
+            "gradientMode": "none",
+            "hideFrom": {
+              "tooltip": false,
+              "viz": false,
+              "legend": false
+            },
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 14
+      },
+      "id": 6,
+      "options": {
+        "legend": {
+          "calcs": ["sum"],
+          "displayMode": "table",
+          "placement": "right"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "pluginVersion": "9.0.0",
+      "targets": [
+        {
+          "expr": "sum by (endpoint) (increase(hopcroft_requests_total[5m]))",
+          "legendFormat": "{{endpoint}}",
           "refId": "A"
         }
       ],
+      "title": "Requests by Endpoint",
       "type": "timeseries",
+      "description": "Number of requests per endpoint over time"
     }
   ],
   "refresh": "10s",
   "timezone": "",
   "title": "Hopcroft ML Model Monitoring",
   "uid": "hopcroft-ml-dashboard",
+  "version": 2,
   "weekStart": ""
 }

monitoring/grafana/provisioning/dashboards/dashboard.yml CHANGED Viewed

@@ -1,13 +1,13 @@
 apiVersion: 1
 providers:
-  - name: 'Hopcroft Dashboards'
     orgId: 1
-    folder: ''
     type: file
     disableDeletion: false
     updateIntervalSeconds: 10
     allowUiUpdates: true
     options:
       path: /var/lib/grafana/dashboards
-      foldersFromFilesStructure: true

 apiVersion: 1
 providers:
+  - name: "Hopcroft Dashboards"
     orgId: 1
+    folder: "Hopcroft Project"
     type: file
     disableDeletion: false
     updateIntervalSeconds: 10
     allowUiUpdates: true
     options:
       path: /var/lib/grafana/dashboards
+      foldersFromFilesStructure: false

monitoring/grafana/provisioning/dashboards/hopcroft_dashboard.json DELETED Viewed

@@ -1,358 +0,0 @@
-{
-  "annotations": {
-    "list": [
-      {
-        "builtIn": 1,
-        "datasource": "-- Grafana --",
-        "enable": true,
-        "hide": true,
-        "iconColor": "rgba(0, 211, 255, 1)",
-        "name": "Annotations & Alerts",
-        "type": "dashboard"
-      }
-    ]
-  },
-  "editable": true,
-  "gnetId": null,
-  "graphTooltip": 1,
-  "id": null,
-  "links": [],
-  "panels": [
-    {
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "reqps"
-        }
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 6,
-        "x": 0,
-        "y": 0
-      },
-      "id": 1,
-      "options": {
-        "orientation": "auto",
-        "reduceOptions": {
-          "calcs": ["lastNotNull"],
-          "fields": "",
-          "values": false
-        },
-        "showThresholdLabels": false,
-        "showThresholdMarkers": true
-      },
-      "pluginVersion": "9.0.0",
-      "targets": [
-        {
-          "expr": "rate(fastapi_requests_total[1m])",
-          "refId": "A"
-        }
-      ],
-      "title": "Request Rate",
-      "type": "gauge",
-      "description": "Number of requests per second handled by the API"
-    },
-    {
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "tooltip": false,
-              "viz": false,
-              "legend": false
-            },
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": true
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "ms"
-        }
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 18,
-        "x": 6,
-        "y": 0
-      },
-      "id": 2,
-      "options": {
-        "legend": {
-          "calcs": ["mean", "max"],
-          "displayMode": "table",
-          "placement": "right"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "pluginVersion": "9.0.0",
-      "targets": [
-        {
-          "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
-          "legendFormat": "p95",
-          "refId": "A"
-        },
-        {
-          "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
-          "legendFormat": "p50 (median)",
-          "refId": "B"
-        }
-      ],
-      "title": "Request Latency (p50, p95)",
-      "type": "timeseries",
-      "description": "API response time percentiles over time"
-    },
-    {
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [
-            {
-              "options": {
-                "0": {
-                  "color": "red",
-                  "index": 1,
-                  "text": "No Drift"
-                },
-                "1": {
-                  "color": "green",
-                  "index": 0,
-                  "text": "Drift Detected"
-                }
-              },
-              "type": "value"
-            }
-          ],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          }
-        }
-      },
-      "gridPos": {
-        "h": 6,
-        "w": 6,
-        "x": 0,
-        "y": 8
-      },
-      "id": 3,
-      "options": {
-        "orientation": "auto",
-        "reduceOptions": {
-          "calcs": ["lastNotNull"],
-          "fields": "",
-          "values": false
-        },
-        "showThresholdLabels": false,
-        "showThresholdMarkers": true,
-        "text": {}
-      },
-      "pluginVersion": "9.0.0",
-      "targets": [
-        {
-          "expr": "drift_detected",
-          "refId": "A"
-        }
-      ],
-      "title": "Data Drift Status",
-      "type": "stat",
-      "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
-    },
-    {
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "decimals": 4,
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 0.01
-              },
-              {
-                "color": "red",
-                "value": 0.05
-              }
-            ]
-          },
-          "unit": "short"
-        }
-      },
-      "gridPos": {
-        "h": 6,
-        "w": 6,
-        "x": 6,
-        "y": 8
-      },
-      "id": 4,
-      "options": {
-        "orientation": "auto",
-        "reduceOptions": {
-          "calcs": ["lastNotNull"],
-          "fields": "",
-          "values": false
-        },
-        "showThresholdLabels": false,
-        "showThresholdMarkers": true,
-        "text": {}
-      },
-      "pluginVersion": "9.0.0",
-      "targets": [
-        {
-          "expr": "drift_p_value",
-          "refId": "A"
-        }
-      ],
-      "title": "Drift P-Value",
-      "type": "stat",
-      "description": "Statistical significance of detected drift (lower = more significant)"
-    },
-    {
-      "datasource": "Prometheus",
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 10,
-            "gradientMode": "none",
-            "hideFrom": {
-              "tooltip": false,
-              "viz": false,
-              "legend": false
-            },
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "auto",
-            "spanNulls": false
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "short"
-        }
-      },
-      "gridPos": {
-        "h": 6,
-        "w": 12,
-        "x": 12,
-        "y": 8
-      },
-      "id": 5,
-      "options": {
-        "legend": {
-          "calcs": ["mean", "lastNotNull"],
-          "displayMode": "table",
-          "placement": "right"
-        },
-        "tooltip": {
-          "mode": "multi"
-        }
-      },
-      "pluginVersion": "9.0.0",
-      "targets": [
-        {
-          "expr": "drift_distance",
-          "legendFormat": "Distance",
-          "refId": "A"
-        }
-      ],
-      "title": "Drift Distance Over Time",
-      "type": "timeseries",
-      "description": "Statistical distance between baseline and current data distribution"
-    }
-  ],
-  "refresh": "10s",
-  "schemaVersion": 36,
-  "style": "dark",
-  "tags": ["hopcroft", "ml", "monitoring"],
-  "templating": {
-    "list": []
-  },
-  "time": {
-    "from": "now-1h",
-    "to": "now"
-  },
-  "timepicker": {},
-  "timezone": "",
-  "title": "Hopcroft ML Model Monitoring",
-  "uid": "hopcroft-ml-dashboard",
-  "version": 1,
-  "weekStart": ""
-}

monitoring/grafana/provisioning/datasources/prometheus.yml CHANGED Viewed

@@ -11,4 +11,4 @@ datasources:
     editable: true
     jsonData:
       httpMethod: POST
-      timeInterval: "15s"

     editable: true
     jsonData:
       httpMethod: POST
+      timeInterval: "15s"

monitoring/prometheus/prometheus.yml CHANGED Viewed

@@ -2,8 +2,8 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
   external_labels:
-    monitor: 'hopcroft-monitor'
-    environment: 'development'
 rule_files:
   - "alert_rules.yml"
@@ -12,21 +12,21 @@ alerting:
   alertmanagers:
     - static_configs:
         - targets:
-          - 'alertmanager:9093'
 scrape_configs:
-  - job_name: 'hopcroft-api'
-    metrics_path: '/metrics'
     static_configs:
-      - targets: ['hopcroft-api:8080']
     scrape_interval: 10s
-  - job_name: 'prometheus'
     static_configs:
-      - targets: ['localhost:9090']
-  - job_name: 'pushgateway'
-    honor_labels: true
     static_configs:
-      - targets: ['pushgateway:9091']
     scrape_interval: 30s

   scrape_interval: 15s
   evaluation_interval: 15s
   external_labels:
+    monitor: "hopcroft-monitor"
+    environment: "development"
 rule_files:
   - "alert_rules.yml"
   alertmanagers:
     - static_configs:
         - targets:
+            - "alertmanager:9093"
 scrape_configs:
+  - job_name: "hopcroft-api"
+    metrics_path: "/metrics"
     static_configs:
+      - targets: ["hopcroft-api:8080"]
     scrape_interval: 10s
+  - job_name: "prometheus"
     static_configs:
+      - targets: ["localhost:9090"]
+  - job_name: "pushgateway"
+    honor_labels: true
     static_configs:
+      - targets: ["pushgateway:9091"]
     scrape_interval: 30s