#!/bin/bash
# DaCrow13
# Fix Grafana datasource path and Prometheus route prefixes for Space deployment
# 946fd5e
# Abort immediately when any command fails
set -e

# Bootstrap DVC if the .dvc directory is missing (e.g. it was excluded
# from the image build context) and register the DagsHub remote.
if [ ! -d ".dvc" ]; then
  echo "Initializing DVC..."
  dvc init --no-scm
  dvc remote add -d origin https://dagshub.com/se4ai2526-uniba/Hopcroft.dvc
fi
# Determine DagsHub credentials for the DVC remote.
# Prefer the dedicated DAGSHUB_* variables; fall back to the MLFLOW_* pair,
# which is usually identical for DagsHub deployments.
# FIX: use DVC_-prefixed names so we do not clobber the standard $USER
# environment variable that other tools in this container may read.
DVC_USER=${DAGSHUB_USERNAME:-$MLFLOW_TRACKING_USERNAME}
DVC_PASS=${DAGSHUB_TOKEN:-$MLFLOW_TRACKING_PASSWORD}
if [ -n "$DVC_USER" ] && [ -n "$DVC_PASS" ]; then
  echo "$(date) - Configuring DVC authentication for DagsHub..."
  # Write credentials to the local (uncommitted) DVC config only
  dvc remote modify origin --local auth basic
  dvc remote modify origin --local user "$DVC_USER"
  dvc remote modify origin --local password "$DVC_PASS"
else
  echo "$(date) - WARNING: No DagsHub credentials found. DVC pull might fail if the remote is private."
fi
echo "$(date) - Pulling models from DVC..."
# Fetch just the artifacts the inference service needs; a pull failure is
# tolerated so startup can continue with whatever models are baked in.
dvc pull \
  models/random_forest_tfidf_gridsearch.pkl.dvc \
  models/tfidf_vectorizer.pkl.dvc \
  models/label_names.pkl.dvc || echo "DVC pull failed, but continuing..."

# Nginx needs writable scratch directories under /tmp
mkdir -p /tmp/client_temp /tmp/proxy_temp /tmp/fastcgi_temp /tmp/uwsgi_temp /tmp/scgi_temp

echo "$(date) - Checking models existence..."
ls -la models/
echo "$(date) - Starting FastAPI application in background..."
# Bind to 0.0.0.0 so Nginx (and HF health checks) can reach the API
uvicorn hopcroft_skill_classification_tool_competition.main:app --host 0.0.0.0 --port 8000 >> /tmp/fastapi.log 2>&1 &

# Poll the health endpoint: up to 30 attempts, 2s apart.
# FIX: the old message said "(30s)" but the loop actually waits up to 60s.
echo "$(date) - Waiting for API to start (up to 60s)..."
api_up=0
for i in {1..30}; do
  if curl -s http://127.0.0.1:8000/health > /dev/null; then
    echo "$(date) - API is UP!"
    api_up=1
    break
  fi
  echo "$(date) - Waiting... ($i/30)"
  sleep 2
done
# FIX: surface a clear diagnostic instead of silently falling through when
# the API never comes up (script still continues, best-effort like the rest)
if [ "$api_up" -ne 1 ]; then
  echo "$(date) - WARNING: API did not become healthy in time. Dumping FastAPI log:"
  cat /tmp/fastapi.log || true
fi
echo "$(date) - Configuring and starting Prometheus..."
# Patch Grafana datasource provisioning for the single-container HF Space:
# the docker-compose hostname prometheus:9090 becomes 127.0.0.1:9090 with
# the /prometheus route prefix that Nginx exposes.
find /app/monitoring/grafana/provisioning/datasources -name '*.yml' -exec sed -i 's/prometheus:9090/127.0.0.1:9090\/prometheus/g' {} +
# Work on copies under /tmp — the originals under /etc are not writable here.
cp /etc/prometheus/prometheus.yml /tmp/prometheus.yml
cp /etc/prometheus/alert_rules.yml /tmp/alert_rules.yml
cp /etc/alertmanager/config.yml /tmp/alertmanager.yml
# Rewrite docker-compose service names to localhost ports.
# hopcroft-api listens on 8000 (not 8080) inside the Space.
sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/prometheus.yml
# Same API rewrite for the Alertmanager config.
sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/alertmanager.yml
# alertmanager:9093 -> 127.0.0.1:9093
sed -i 's/alertmanager:9093/127.0.0.1:9093/g' /tmp/prometheus.yml
# pushgateway:9091 -> 127.0.0.1:9091
sed -i 's/pushgateway:9091/127.0.0.1:9091/g' /tmp/prometheus.yml
# Point rule_files at the /tmp copy (absolute path, independent of cwd).
sed -i 's|"alert_rules.yml"|"/tmp/alert_rules.yml"|g' /tmp/prometheus.yml
# The services run behind --web.route-prefix, so scrape/notify paths need
# matching prefixes injected into the config.
# NOTE(review): these substitutions rely on GNU sed '\n' in the replacement,
# on the exact job_name strings, and on YAML indentation baked into the
# replacement text — fragile; re-verify after any prometheus.yml change.
sed -i 's/job_name: "prometheus"/job_name: "prometheus"\n    metrics_path: "\/prometheus\/metrics"/g' /tmp/prometheus.yml
# Same prefix treatment for the pushgateway self-scrape job.
sed -i 's/job_name: "pushgateway"/job_name: "pushgateway"\n    metrics_path: "\/pushgateway\/metrics"/g' /tmp/prometheus.yml
# Prefix the Alertmanager endpoint in the alerting: section.
sed -i 's/  - static_configs:/  - path_prefix: "\/alertmanager\/"\n    static_configs:/g' /tmp/prometheus.yml
echo "$(date) - Starting Alertmanager..."
# Route prefix must match the /alertmanager/ location proxied by Nginx
alertmanager \
  --config.file=/tmp/alertmanager.yml \
  --storage.path=/tmp/alertmanager_data \
  --web.route-prefix=/alertmanager/ \
  >> /tmp/alertmanager.log 2>&1 &

echo "$(date) - Starting Pushgateway..."
pushgateway \
  --persistence.file=/tmp/pushgateway_data \
  --web.route-prefix=/pushgateway/ \
  >> /tmp/pushgateway.log 2>&1 &

# Prometheus external URL: a relative path works on both the huggingface.co
# proxy domain and the direct *.hf.space domain.
PROM_EXTERNAL_URL="/prometheus/"

# --web.external-url must match the public URL for correct link generation;
# --web.route-prefix must match the path Nginx proxies to (/prometheus/).
# FIX: quote the expansion (SC2086) so it cannot word-split or glob.
prometheus \
  --config.file=/tmp/prometheus.yml \
  --storage.tsdb.path=/tmp/prometheus_data \
  --web.listen-address=0.0.0.0:9090 \
  --web.external-url="$PROM_EXTERNAL_URL" \
  --web.route-prefix=/prometheus/ \
  >> /tmp/prometheus.log 2>&1 &
# Build Grafana's public root URL.
if [ -n "$SPACE_ID" ]; then
  # SPACE_ID has the form user/space; the derived <user>-<space>.hf.space
  # domain avoids the CORS/asset-loading issues of the huggingface.co proxy.
  # FIX: quote $SPACE_ID (SC2086) so it survives word-splitting/globbing.
  SPACE_AUTHOR=$(echo "$SPACE_ID" | cut -d'/' -f1)
  SPACE_NAME=$(echo "$SPACE_ID" | cut -d'/' -f2)
  SPACE_HOST="${SPACE_AUTHOR}-${SPACE_NAME}.hf.space"
  echo "$(date) - Detected HF Space environment (ID: $SPACE_ID). Configured Host: $SPACE_HOST"
  GRAFANA_ROOT_URL="https://$SPACE_HOST/grafana/"
else
  echo "$(date) - No SPACE_ID found. Defaulting Grafana to localhost."
  GRAFANA_ROOT_URL="http://localhost:3000/grafana/"
fi

# Locate the Grafana binary; command -v is the portable 'which'.
GRAFANA_BIN=$(command -v grafana-server || echo "/usr/sbin/grafana-server")
echo "$(date) - Found Grafana binary at: $GRAFANA_BIN"
echo "$(date) - Starting Grafana with Root URL: $GRAFANA_ROOT_URL"
# Use the project's grafana.ini (readable by this user); cfg: overrides
# redirect all writable paths to /tmp and enable sub-path serving so the
# UI works behind the /grafana/ Nginx location.
"$GRAFANA_BIN" --homepath=/usr/share/grafana \
  --config=/app/monitoring/grafana/grafana.ini \
  cfg:paths.data=/tmp/grafana_data \
  cfg:paths.logs=/tmp/grafana_logs \
  cfg:paths.plugins=/usr/share/grafana/plugins \
  cfg:paths.provisioning=/app/monitoring/grafana/provisioning \
  cfg:server.root_url="$GRAFANA_ROOT_URL" \
  cfg:server.serve_from_sub_path=true \
  cfg:server.http_port=3000 \
  > /tmp/grafana.log 2>&1 &
# Poll Grafana's health endpoint for up to ~20 seconds
echo "$(date) - Waiting for Grafana (20s)..."
for attempt in {1..20}; do
  if curl -s http://127.0.0.1:3000/api/health > /dev/null; then
    echo "$(date) - Grafana is UP!"
    # Debug aid: show what Grafana serves at its root path
    echo "$(date) - VERIFYING GRAFANA ROOT RESPONSE:"
    curl -v http://127.0.0.1:3000/ 2>&1 | head -n 20
    break
  fi
  sleep 1
done

# Still unreachable? Dump its log for post-mortem debugging.
if ! curl -s http://127.0.0.1:3000/api/health > /dev/null; then
  echo "$(date) - ERROR: Grafana failed to start within 20 seconds. Dumping logs:"
  cat /tmp/grafana.log
fi
echo "$(date) - Starting Nginx reverse proxy..."
if ! command -v nginx &> /dev/null; then
  echo "$(date) - ERROR: nginx not found in PATH"
  exit 1
fi
nginx -c /app/docker/nginx.conf -g "daemon off;" >> /tmp/nginx_startup.log 2>&1 &
echo "$(date) - Waiting for Nginx to initialize..."
sleep 5
# Verify the nginx process survived startup.
# FIX: use pgrep instead of the fragile 'ps aux | grep -v grep | grep' chain.
if pgrep nginx > /dev/null; then
  echo "$(date) - Nginx is running."
else
  echo "$(date) - ERROR: Nginx failed to start. Logs:"
  cat /tmp/nginx_startup.log
fi
echo "$(date) - Final backend check before starting Streamlit..."
curl -v http://127.0.0.1:8000/health || echo "FastAPI health check failed!"

echo "$(date) - Starting Streamlit application on 127.0.0.1:8501..."
# The Streamlit UI reaches FastAPI through this base URL (read by the app)
export API_BASE_URL="http://127.0.0.1:8000"
streamlit run hopcroft_skill_classification_tool_competition/streamlit_app.py \
  --server.port 8501 \
  --server.address 127.0.0.1 \
  --server.enableCORS=false \
  --server.enableXsrfProtection=false \
  --server.headless true &

# Poll Streamlit's health endpoint: up to 30 attempts, 2s apart.
# FIX: the old message said "(30s)" but the loop actually waits up to 60s.
echo "$(date) - Waiting for Streamlit to start (up to 60s)..."
for i in {1..30}; do
  if curl -s http://127.0.0.1:8501/healthz > /dev/null; then
    echo "$(date) - Streamlit is UP!"
    break
  fi
  echo "$(date) - Waiting for Streamlit... ($i/30)"
  sleep 2
done

# Keep the container's foreground process alive and stream service logs
# to stdout so they show up in the Space's log viewer.
echo "$(date) - Process started. Tailing logs for debug..."
tail -f /tmp/nginx_startup.log /tmp/fastapi.log /tmp/grafana.log /tmp/prometheus.log