Spaces:
Sleeping
Sleeping
Rick
HF demo: block chat submissions and guide local edge install; add optional virgin-system runtime config
35e029d | # ============================================================================= | |
| # Author: Rick Escher | |
| # Project: SailingMedAdvisor | |
| # Context: Google HAI-DEF Framework | |
| # Models: Google MedGemmas | |
| # Program: Kaggle Impact Challenge | |
| # ============================================================================= | |
| # run_med_advisor.sh - Secure startup script for MedGemma Advisor | |
| echo "==================================================" | |
| echo "SailingMeAdvisor - Offline emergency medical guidance for offshore sailors," | |
| echo "powered by MedGemma (HAI-DEF)" | |
| echo "" | |
| echo "==================================================" | |
| # Check if virtual environment exists | |
| if [ ! -d ".venv" ]; then | |
| echo "❌ Error: Virtual environment not found!" | |
| echo "Please create it first: python3 -m venv .venv" | |
| exit 1 | |
| fi | |
| # Activate virtual environment | |
| source .venv/bin/activate | |
| # Optional local runtime overrides generated during fresh install. | |
| # This lets us configure a new machine once and keep startup consistent. | |
| LOCAL_ENV_FILE="${SAILINGMED_LOCAL_ENV:-./sailingmed.local.env}" | |
| if [ -f "$LOCAL_ENV_FILE" ]; then | |
| echo "🔧 Loading local runtime config from $LOCAL_ENV_FILE" | |
| # Export sourced vars so child processes (uvicorn/app) receive them. | |
| set -a | |
| # shellcheck disable=SC1090 | |
| source "$LOCAL_ENV_FILE" | |
| set +a | |
| fi | |
| # Check if required packages are installed | |
| python3 -c "import fastapi, uvicorn" 2>/dev/null || { | |
| echo "❌ Error: FastAPI or Uvicorn not installed. Install with: pip install fastapi uvicorn[standard]" | |
| exit 1 | |
| } | |
| # Set environment variables (can be customized) | |
| # export ADMIN_PASSWORD='your_secure_password' | |
| # export SECRET_KEY='your_secret_key' | |
| # Prefer BF16 for stability; set FORCE_FP16=1 (and ALLOW_FP16=1) to override. | |
| # Respect user override; default to 0 to prefer BF16 on supported GPUs. | |
| export FORCE_FP16="${FORCE_FP16:-0}" | |
| # Keep SDP kernels conservative on RTX 5000/Turing; opt in to fast kernels manually. | |
| export USE_FAST_SDP="${USE_FAST_SDP:-0}" | |
| # Tab bar theme toggle: | |
| # 1 = splash purple (#7452B9), 0 = default gray. | |
| export USE_SPLASH_PURPLE_TABBAR="${USE_SPLASH_PURPLE_TABBAR:-0}" | |
| # Legacy env retained for compatibility with any existing checks. | |
| export USE_FLASH_ATTENTION="${USE_FLASH_ATTENTION:-$USE_FAST_SDP}" | |
| export TORCH_USE_CUDA_DSA=0 | |
| # Choose a safe default for mixed hardware: | |
| # - If user explicitly sets FORCE_CUDA, honor it. | |
| # - If unset, prefer GPU only when NVIDIA tooling is present. | |
| if [ -z "${FORCE_CUDA+x}" ]; then | |
| if command -v nvidia-smi >/dev/null 2>&1; then | |
| export FORCE_CUDA="1" | |
| else | |
| export FORCE_CUDA="0" | |
| fi | |
| else | |
| export FORCE_CUDA | |
| fi | |
| # Keep GPU-only behavior by default; set to 1 only if we explicitly want CPU fallback on CUDA runtime faults. | |
| export ALLOW_CPU_FALLBACK_ON_CUDA_ERROR="${ALLOW_CPU_FALLBACK_ON_CUDA_ERROR:-0}" | |
| # Keep global cap high for 4B but reserve headroom for 27B KV cache. | |
| export MODEL_MAX_GPU_MEM="${MODEL_MAX_GPU_MEM:-15GiB}" | |
| export MODEL_MAX_GPU_MEM_27B="${MODEL_MAX_GPU_MEM_27B:-8GiB}" | |
| export MODEL_MAX_CPU_MEM=64GiB | |
| # 0 disables hard cap so token count comes from Settings (tr_tok/in_tok). | |
| export MODEL_MAX_NEW_TOKENS_27B="${MODEL_MAX_NEW_TOKENS_27B:-0}" | |
| export MODEL_MAX_INPUT_TOKENS_27B="${MODEL_MAX_INPUT_TOKENS_27B:-2048}" | |
| export MODEL_DEVICE_MAP_27B="${MODEL_DEVICE_MAP_27B:-manual}" | |
| export MODEL_GPU_LAYERS_27B="${MODEL_GPU_LAYERS_27B:-14}" | |
| export MODEL_ATTN_IMPL_27B="${MODEL_ATTN_IMPL_27B:-eager}" | |
| # Reduce allocator fragmentation on long sessions. | |
| export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" | |
| # CUDA preflight: fail early when FORCE_CUDA=1 so we don't silently run on CPU. | |
| if [ "$FORCE_CUDA" = "1" ]; then | |
| echo "🔎 CUDA preflight (FORCE_CUDA=1)" | |
| python3 - <<'PY' | |
| import sys | |
| import torch | |
| if not torch.cuda.is_available(): | |
| print("❌ CUDA preflight failed: torch.cuda.is_available() is False") | |
| try: | |
| torch.cuda.current_device() | |
| except Exception as exc: | |
| print(f" CUDA error: {exc}") | |
| sys.exit(1) | |
| try: | |
| _ = torch.zeros(1, device="cuda") | |
| except Exception as exc: | |
| print(f"❌ CUDA preflight failed during tensor allocation: {exc}") | |
| sys.exit(1) | |
| print(f"✅ CUDA preflight passed on GPU: {torch.cuda.get_device_name(0)}") | |
| PY | |
| if [ $? -ne 0 ]; then | |
| echo "Hint: check kernel GPU errors with: journalctl -k | grep -i -E 'NVRM|Xid'" | |
| echo "If errors persist, reboot or reload NVIDIA driver modules before restarting SailingMedAdvisor." | |
| exit 1 | |
| fi | |
| fi | |
| # Detect a LAN IP to share in the startup banner (best effort) | |
| LAN_IP=$(hostname -I 2>/dev/null | awk 'NF{print $1; exit}') | |
| if [ -z "$LAN_IP" ] && command -v ip >/dev/null 2>&1; then | |
| LAN_IP=$(ip route get 8.8.8.8 2>/dev/null | awk 'NR==1 {print $7}') | |
| fi | |
| # Run the application | |
| echo "🚀 Starting server on http://127.0.0.1:5000" | |
| if [ -n "$LAN_IP" ]; then | |
| echo "🌐 LAN access: http://${LAN_IP}:5000" | |
| else | |
| echo "🌐 LAN access: http://<this-machine-ip>:5000" | |
| fi | |
| echo "==================================================" | |
| python3 -m uvicorn app:app --host 0.0.0.0 --port 5000 | |