Delete docker-compose.yml

Browse files

Files changed (1) hide show

docker-compose.yml +0 -154

docker-compose.yml DELETED Viewed

@@ -1,154 +0,0 @@
-# Compose file format marker.
-version: '3.9'
-# Named volumes so state outlives the containers:
-#   hf_cache - Hugging Face cache, mounted by the services under /data
-#   logs     - mounted by the api service at /app/logs
-volumes:
-  hf_cache: {}
-  logs: {}
-# User-defined network joined by the api, vllm and tgi services.
-networks:
-  velnet: {}
-services:
-  # HTTP API service, published on host port 8000.
-  api:
-    image: veltraxor-api:dev
-    # Both `image` and `build` are set: Compose builds from the local Dockerfile
-    # and tags the result with the image name above. Remove the `build` section
-    # to run a prebuilt image only.
-    build:
-      context: .
-      dockerfile: Dockerfile
-    container_name: veltraxor-api
-    env_file:
-      - .env
-    environment:
-      # Hard-aligned with your codebase (config.py / api_server.py / engine_runner.py)
-      ENGINE_KIND: ${ENGINE_KIND:-dry}                       # dry | real
-      MODEL_ALIAS: ${MODEL_ALIAS:-veltraxor-1}
-      HF_REPO_ID: ${HF_REPO_ID:-Veltraxor/Veltraxor_1}
-      EXPECTED_SHARDS: ${EXPECTED_SHARDS:-163}
-      HF_HOME: ${HF_HOME:-/data/hf_cache}
-      MODEL_LOCAL_DIR: ${MODEL_LOCAL_DIR:-/data/hf_cache/Veltraxor_1}
-      HF_HUB_ENABLE_HF_TRANSFER: ${HF_HUB_ENABLE_HF_TRANSFER:-1}
-      # Server/limits
-      REQUEST_TIMEOUT_S: ${REQUEST_TIMEOUT_S:-45}
-      BACKEND_STEP_TIMEOUT_S: ${BACKEND_STEP_TIMEOUT_S:-30}
-      MAX_CONCURRENCY: ${MAX_CONCURRENCY:-8}
-      RATE_LIMIT_QPS: ${RATE_LIMIT_QPS:-3}
-      BURST_CAPACITY: ${BURST_CAPACITY:-3}
-      RATE_LIMIT_SCOPE: ${RATE_LIMIT_SCOPE:-route_token}
-      LOG_LEVEL: ${LOG_LEVEL:-INFO}
-      API_KEY: ${API_KEY:-}                                  # optional shared API key
-      # Usage metering
-      USAGE_BACKEND: ${USAGE_BACKEND:-jsonl}
-      USAGE_JSONL_PATH: ${USAGE_JSONL_PATH:-/app/logs/usage.jsonl}
-      USAGE_SQLITE_PATH: ${USAGE_SQLITE_PATH:-/app/logs/usage.db}
-      USAGE_TOKENIZER: ${USAGE_TOKENIZER:-heuristic}
-      HEURISTIC_CHARS_PER_TOKEN: ${HEURISTIC_CHARS_PER_TOKEN:-4}
-      USAGE_FLUSH_INTERVAL_S: ${USAGE_FLUSH_INTERVAL_S:-2.0}
-      USAGE_MAX_BUFFER: ${USAGE_MAX_BUFFER:-100}
-      USAGE_MAX_QUEUE: ${USAGE_MAX_QUEUE:-1000}
-      USAGE_ROTATE_DAILY: ${USAGE_ROTATE_DAILY:-0}
-      # Real-engine endpoints (only used if ENGINE_KIND=real)
-      VLLM_ENDPOINT: ${VLLM_ENDPOINT:-http://vllm:8000}
-      VLLM_API_KEY: ${VLLM_API_KEY:-}
-      VLLM_MODEL: ${VLLM_MODEL:-${MODEL_ALIAS:-veltraxor-1}}
-      TGI_ENDPOINT: ${TGI_ENDPOINT:-http://tgi:8080}
-      TGI_TIMEOUT_S: ${TGI_TIMEOUT_S:-45}
-      TRANSFORMERS_LOCAL: ${TRANSFORMERS_LOCAL:-0}
-      TRANSFORMERS_MAX_NEW_TOKENS: ${TRANSFORMERS_MAX_NEW_TOKENS:-512}
-      TRANSFORMERS_TEMPERATURE: ${TRANSFORMERS_TEMPERATURE:-0.7}
-      # IMPORTANT: pass HF_TOKEN at runtime (do NOT bake into images)
-      HF_TOKEN: ${HF_TOKEN:-}
-    ports:
-      - "8000:8000"
-    volumes:
-      - hf_cache:/data/hf_cache
-      - logs:/app/logs
-    # Liveness probe against the API's /healthz route.
-    # NOTE(review): assumes curl is installed in the image - confirm in Dockerfile.
-    healthcheck:
-      test: ["CMD-SHELL", "curl -fsS http://127.0.0.1:8000/healthz || exit 1"]
-      interval: 30s
-      timeout: 3s
-      retries: 3
-      start_period: 10s
-    restart: unless-stopped
-    networks:
-      - velnet
-    # Both engines sit behind profiles, so each dependency is optional
-    # (`required: false`). With no profile active the API starts on its own;
-    # with `--profile vllm` or `--profile tgi` Compose still waits for that
-    # engine to report healthy before starting the API.
-    # `required` needs Docker Compose v2.20.0 or newer.
-    depends_on:
-      vllm:
-        condition: service_healthy
-        required: false
-      tgi:
-        condition: service_healthy
-        required: false
-  # ===============================
-  # vLLM (OpenAI-compatible server)
-  # Enable with:  docker compose --profile vllm up
-  # ===============================
-  vllm:
-    profiles: ["vllm"]
-    image: vllm/vllm-openai:latest
-    container_name: vllm
-    # NOTE: Adjust resources & args to your hardware; 671B is extremely heavy.
-    # The image entrypoint already launches the OpenAI-compatible server, so
-    # `command` carries only the server flags. The model path defaults to the
-    # same folder the api service uses for MODEL_LOCAL_DIR, inside the shared
-    # hf_cache volume.
-    command:
-      - "--model"
-      - "${MODEL_LOCAL_DIR:-/data/hf_cache/Veltraxor_1}"
-      - "--trust-remote-code"
-      - "--tensor-parallel-size"
-      - "${TP_DEGREE:-1}"
-      - "--dtype"
-      - "auto"
-      - "--port"
-      - "8000"
-    environment:
-      HF_HOME: /data/hf_cache
-      TRANSFORMERS_OFFLINE: "1"
-    volumes:
-      - hf_cache:/data/hf_cache
-    expose:
-      - "8000"
-    # Probe the server's dedicated /health route; `curl -f` treats any HTTP
-    # error status as a failure.
-    # NOTE(review): start_period may be too short for large checkpoints - tune
-    # to the observed model load time.
-    healthcheck:
-      test: ["CMD-SHELL", "curl -fsS http://127.0.0.1:8000/health || exit 1"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-      start_period: 20s
-    restart: unless-stopped
-    networks:
-      - velnet
-    # GPU reservation. Remove this `deploy` section on hosts whose container
-    # runtime has no GPU support.
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - capabilities: ["gpu"]
-  # ===============================
-  # TGI (Hugging Face Text Generation Inference)
-  # Enable with:  docker compose --profile tgi up
-  # ===============================
-  tgi:
-    profiles: ["tgi"]
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    container_name: tgi
-    # MODEL_ID defaults to the same folder the api service uses for
-    # MODEL_LOCAL_DIR, inside the shared hf_cache volume; serve on 8080.
-    environment:
-      MODEL_ID: ${MODEL_LOCAL_DIR:-/data/hf_cache/Veltraxor_1}
-      # Quoted so the value stays a string.
-      PORT: "8080"
-      HF_HOME: /data/hf_cache
-      NUM_SHARD: ${TP_DEGREE:-1}  # Adjust to your GPUs
-    volumes:
-      - hf_cache:/data/hf_cache
-    expose:
-      - "8080"
-    # NOTE(review): assumes curl is available inside the TGI image - confirm.
-    healthcheck:
-      test: ["CMD-SHELL", "curl -fsS http://127.0.0.1:8080/health || exit 1"]
-      interval: 30s
-      timeout: 5s
-      retries: 3
-      start_period: 20s
-    restart: unless-stopped
-    networks:
-      - velnet
-    # GPU reservation. Remove this `deploy` section on hosts whose container
-    # runtime has no GPU support.
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - capabilities: ["gpu"]