#!/bin/bash
set -e

echo "================================================"
echo "🚀 Axon - Qwen2.5-Omni-7B Multimodal Server"
echo "================================================"
echo ""
echo "📋 Capabilities: Text | Images | Audio"
echo "🔧 Quantization: Q8_0 (near-lossless)"
echo "⚡ Optimizations: Flash Attention, Continuous Batching"
echo ""

mkdir -p /app/models/qwen2.5-omni-7b

# Download a file from the Hugging Face Hub unless it already exists locally.
download_file () {
    REPO_ID=$1
    FILENAME=$2
    DEST_PATH=$3

    if [ -f "$DEST_PATH" ]; then
        echo "✅ Already exists: $(basename "$DEST_PATH")"
        return 0
    fi

    echo "⬇️ Downloading $FILENAME ..."
    python3 -c "
from huggingface_hub import hf_hub_download
import shutil, sys
try:
    path = hf_hub_download(
        repo_id='$REPO_ID',
        filename='$FILENAME',
        cache_dir='/app/.cache'
    )
    shutil.copy(path, '$DEST_PATH')
    print(f'✅ Downloaded: $FILENAME')
except Exception as e:
    print(f'❌ Error downloading $FILENAME: {e}')
    sys.exit(1)
"
}

# Main model weights (Q8_0) plus the multimodal projector required for image/audio input.
download_file "ggml-org/Qwen2.5-Omni-7B-GGUF" \
    "Qwen2.5-Omni-7B-Q8_0.gguf" \
    "/app/models/qwen2.5-omni-7b/Qwen2.5-Omni-7B-Q8_0.gguf"

download_file "ggml-org/Qwen2.5-Omni-7B-GGUF" \
    "mmproj-Qwen2.5-Omni-7B-Q8_0.gguf" \
    "/app/models/qwen2.5-omni-7b/mmproj-Qwen2.5-Omni-7B-Q8_0.gguf"

echo ""
echo "🚀 Starting llama.cpp Server"
echo "🌐 Server will be available at http://0.0.0.0:7860"
echo ""

# Launch llama-server with the downloaded model and mmproj so multimodal input is enabled.
exec /usr/local/bin/llama-server \
    -m /app/models/qwen2.5-omni-7b/Qwen2.5-Omni-7B-Q8_0.gguf \
    --mmproj /app/models/qwen2.5-omni-7b/mmproj-Qwen2.5-Omni-7B-Q8_0.gguf \
    --host 0.0.0.0 \
    --port 7860 \
    -c 8192 \
    -t 4 \
    -fa on \
    -cb \
    --n-gpu-layers 0
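
# Usage note (illustrative only; nothing below runs, since exec replaces this shell):
# llama-server exposes an OpenAI-compatible chat endpoint, so once the container is up
# a quick smoke test from the host could look like the sketch below. The prompt text
# is an assumption; adjust host/port if you remap 7860.
#
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Describe what you can do in one sentence."}], "max_tokens": 128}'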