Spaces:

ArchCoder
/

medintake-ai

Sleeping

priyansh-saxena1

feat: migrate inference engine to Ollama for 10x faster CPU inference

4e16e37 26 days ago

1.57 kB

	#!/bin/bash
	# Startup script for the Clinical Intake Agent.
	#
	# Sequence: start a local Ollama server, wait for it to answer HTTP requests,
	# pull the configured model, then exec uvicorn to serve the FastAPI app.
	#
	# Environment:
	#   MODEL_NAME  Ollama model tag to pull (default: qwen2.5:0.5b).

	# Strict mode: abort on a failing command (-e), on any reference to an unset
	# variable (-u), and when any stage of a pipeline fails (pipefail).
	set -euo pipefail

	# Model tag; falls back to the default when MODEL_NAME is unset or empty.
	MODEL="${MODEL_NAME:-qwen2.5:0.5b}"
	# Base URL of the local Ollama HTTP API, polled while waiting for readiness.
	OLLAMA_URL="http://localhost:11434"

	echo "======================================"
	echo " Clinical Intake Agent - Startup"
	echo "======================================"

	# ── Step 1: Launch the Ollama server as a background job ────────────────────
	# The script does not block here; the job's PID is recorded in OLLAMA_PID.
	printf '%s\n' "[startup] Starting Ollama server..."
	ollama serve &
	OLLAMA_PID="$!"

	# ── Step 2: Wait until Ollama is responsive ─────────────────────────────────
	# Polls ${OLLAMA_URL}/api/tags roughly once per second until curl reports a
	# successful HTTP response. Gives up with exit status 1 when either:
	#   * the background server process (OLLAMA_PID) has already exited, or
	#   * MAX_WAIT seconds of wall-clock time have elapsed.
	echo "[startup] Waiting for Ollama to be ready..."
	MAX_WAIT=30
	WAITED=0
	WAIT_START=$SECONDS
	# --max-time keeps a stalled connection from hanging the loop indefinitely.
	until curl -sf --max-time 2 "${OLLAMA_URL}/api/tags" > /dev/null 2>&1; do
	  # Fail fast instead of waiting out the timeout for a server that is gone.
	  # kill -0 only probes for the process; no signal is delivered.
	  if ! kill -0 "$OLLAMA_PID" 2> /dev/null; then
	    echo "[startup] ERROR: Ollama server process exited before becoming ready. Aborting." >&2
	    exit 1
	  fi
	  sleep 1
	  # SECONDS is bash's elapsed-time counter, so time spent inside curl is
	  # included and the reported wait matches real time.
	  WAITED=$((SECONDS - WAIT_START))
	  if [ "$WAITED" -ge "$MAX_WAIT" ]; then
	    echo "[startup] ERROR: Ollama did not start within ${MAX_WAIT}s. Aborting." >&2
	    # Do not leave the unresponsive server running after giving up.
	    kill "$OLLAMA_PID" 2> /dev/null || true
	    exit 1
	  fi
	done
	echo "[startup] Ollama is ready! (waited ${WAITED}s)"

	# ── Step 3: Make sure the model is available ────────────────────────────────
	# `ollama pull` is invoked on every startup. Because the script runs under
	# `set -e`, a failed pull aborts before the "ready" line is printed.
	printf '%s\n' "[startup] Pulling model '${MODEL}' (skipped if already cached)..."
	ollama pull "$MODEL"
	printf '%s\n' "[startup] Model '${MODEL}' is ready."

	# ── Step 4: Hand the process over to the FastAPI application ────────────────
	# exec replaces this shell with uvicorn, so nothing placed after it would run.
	printf '%s\n' "[startup] Launching FastAPI on port 7860..."
	UVICORN_ARGS=(app.main:app --host 0.0.0.0 --port 7860 --workers 1)
	exec uvicorn "${UVICORN_ARGS[@]}"