DaCrow13 committed on
Commit
6bd4abe
·
1 Parent(s): 27bcd6a

Fix monitoring stack configuration for HF Spaces: Add Alertmanager, Pushgateway, and fix Prometheus/Grafana connection

Browse files
Files changed (3) hide show
  1. Dockerfile +20 -3
  2. docker/nginx.conf +26 -0
  3. docker/scripts/start_space.sh +31 -17
Dockerfile CHANGED
@@ -40,9 +40,26 @@ RUN wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prom
40
  mkdir -p /etc/prometheus /var/lib/prometheus && \
41
  rm -rf prometheus-*
42
 
43
- COPY monitoring/grafana/provisioning /etc/grafana/provisioning
44
- COPY monitoring/grafana/dashboards /var/lib/grafana/dashboards
45
- COPY monitoring/prometheus/prometheus.yml /etc/prometheus/prometheus.yml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # Copy requirements first for caching
48
  COPY requirements.txt .
 
40
  mkdir -p /etc/prometheus /var/lib/prometheus && \
41
  rm -rf prometheus-*
42
 
43
+ # Alertmanager
44
+ RUN wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz && \
45
+ tar xvfz alertmanager-*.tar.gz && \
46
+ mv alertmanager-*/alertmanager /usr/local/bin/ && \
47
+ mv alertmanager-*/amtool /usr/local/bin/ && \
48
+ mkdir -p /etc/alertmanager /var/lib/alertmanager && \
49
+ rm -rf alertmanager-*
50
+
51
+ # Pushgateway
52
+ RUN wget https://github.com/prometheus/pushgateway/releases/download/v1.6.0/pushgateway-1.6.0.linux-amd64.tar.gz && \
53
+ tar xvfz pushgateway-*.tar.gz && \
54
+ mv pushgateway-*/pushgateway /usr/local/bin/ && \
55
+ rm -rf pushgateway-*
56
+
57
+ COPY --chown=user monitoring/grafana/provisioning /etc/grafana/provisioning
58
+ COPY --chown=user monitoring/grafana/dashboards /var/lib/grafana/dashboards
59
+ COPY --chown=user monitoring/prometheus/prometheus.yml /etc/prometheus/prometheus.yml
60
+ COPY --chown=user monitoring/prometheus/alert_rules.yml /etc/prometheus/alert_rules.yml
61
+ COPY --chown=user monitoring/alertmanager/config.yml /etc/alertmanager/config.yml
62
+
63
 
64
  # Copy requirements first for caching
65
  COPY requirements.txt .
docker/nginx.conf CHANGED
@@ -33,6 +33,14 @@ http {
33
  server 127.0.0.1:9090;
34
  }
35
 
 
 
 
 
 
 
 
 
36
  server {
37
  listen 7860;
38
  server_name localhost;
@@ -108,6 +116,24 @@ http {
108
  proxy_set_header X-Forwarded-Proto $scheme;
109
  }
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # Streamlit (Catch-all)
112
  location / {
113
  proxy_pass http://streamlit;
 
33
  server 127.0.0.1:9090;
34
  }
35
 
36
+ upstream alertmanager {
37
+ server 127.0.0.1:9093;
38
+ }
39
+
40
+ upstream pushgateway {
41
+ server 127.0.0.1:9091;
42
+ }
43
+
44
  server {
45
  listen 7860;
46
  server_name localhost;
 
116
  proxy_set_header X-Forwarded-Proto $scheme;
117
  }
118
 
119
+ # Alertmanager UI
120
+ location /alertmanager/ {
121
+ proxy_pass http://alertmanager;
122
+ proxy_set_header Host $host;
123
+ proxy_set_header X-Real-IP $remote_addr;
124
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
125
+ proxy_set_header X-Forwarded-Proto $scheme;
126
+ }
127
+
128
+ # Pushgateway UI
129
+ location /pushgateway/ {
130
+ proxy_pass http://pushgateway;
131
+ proxy_set_header Host $host;
132
+ proxy_set_header X-Real-IP $remote_addr;
133
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
134
+ proxy_set_header X-Forwarded-Proto $scheme;
135
+ }
136
+
137
  # Streamlit (Catch-all)
138
  location / {
139
  proxy_pass http://streamlit;
docker/scripts/start_space.sh CHANGED
@@ -53,23 +53,37 @@ for i in {1..30}; do
53
  done
54
 
55
  echo "$(date) - Configuring and starting Prometheus..."
56
- # Create a config for the space
57
- cat <<EOF > /tmp/prometheus.yml
58
- global:
59
- scrape_interval: 15s
60
- evaluation_interval: 15s
61
-
62
- scrape_configs:
63
- - job_name: 'hopcroft-api'
64
- metrics_path: '/metrics'
65
- static_configs:
66
- - targets: ['127.0.0.1:8000']
67
- scrape_interval: 10s
68
-
69
- - job_name: 'prometheus'
70
- static_configs:
71
- - targets: ['127.0.0.1:9090']
72
- EOF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Determine Prometheus External URL
75
  # Always use relative path so it works on both huggingface.co and .hf.space domains
 
53
  done
54
 
55
  echo "$(date) - Configuring and starting Prometheus..."
56
+ # Patch Grafana Datasource for Localhost (HF Space)
57
+ # Replace prometheus:9090 with 127.0.0.1:9090 in all datasource configs
58
+ find /etc/grafana/provisioning/datasources -name '*.yml' -exec sed -i 's/prometheus:9090/127.0.0.1:9090/g' {} +
59
+
60
+ # Copy production configs to /tmp for modification
61
+ cp /etc/prometheus/prometheus.yml /tmp/prometheus.yml
62
+ cp /etc/prometheus/alert_rules.yml /tmp/alert_rules.yml
63
+ cp /etc/alertmanager/config.yml /tmp/alertmanager.yml
64
+
65
+ # Modify Prometheus config for local execution (replace docker-compose service names with localhost)
66
+ # hopcroft-api:8080 -> 127.0.0.1:8000 (API runs on 8000 in Space)
67
+ sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/prometheus.yml
68
+ # alertmanager:9093 -> 127.0.0.1:9093
69
+ sed -i 's/alertmanager:9093/127.0.0.1:9093/g' /tmp/prometheus.yml
70
+ # pushgateway:9091 -> 127.0.0.1:9091
71
+ sed -i 's/pushgateway:9091/127.0.0.1:9091/g' /tmp/prometheus.yml
72
+ # Point the "alert_rules.yml" reference at the absolute path of the copy in /tmp
73
+ sed -i 's|"alert_rules.yml"|"/tmp/alert_rules.yml"|g' /tmp/prometheus.yml
74
+
75
+ echo "$(date) - Starting Alertmanager..."
76
+ alertmanager \
77
+ --config.file=/tmp/alertmanager.yml \
78
+ --storage.path=/tmp/alertmanager_data \
79
+ --web.route-prefix=/alertmanager/ \
80
+ >> /tmp/alertmanager.log 2>&1 &
81
+
82
+ echo "$(date) - Starting Pushgateway..."
83
+ pushgateway \
84
+ --persistence.file=/tmp/pushgateway_data \
85
+ --web.route-prefix=/pushgateway/ \
86
+ >> /tmp/pushgateway.log 2>&1 &
87
 
88
  # Determine Prometheus External URL
89
  # Always use relative path so it works on both huggingface.co and .hf.space domains