# AI Model Configuration
# MODEL_ID selects the local Qwen model to use. Documented options:
#   Qwen/Qwen2.5-0.5B-Instruct   (Nano - ~1GB, fastest, CPU-friendly)
#   Qwen/Qwen2.5-1.5B-Instruct   (Goldilocks - ~3GB, recommended for GPU)
#   Qwen/Qwen2.5-7B-Instruct     (Smartest - needs 4-bit quantization + GPU)
# The value below is the Nano option.
MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct

# Quantization mode. Allowed values: auto, 4bit, 8bit, none
#   auto = 4-bit on GPU for 7B, 8-bit on GPU for 1.5B, bfloat16 on CPU
#   4bit = force 4-bit quantization (requires GPU + bitsandbytes)
#   8bit = force 8-bit quantization
#   none = disable quantization, uses bfloat16
# NOTE(review): the auto rule above does not say what is used for the 0.5B
# model on a GPU - confirm against the loader code.
QUANTIZATION=auto

# Double quantization for 4-bit mode (saves ~10% more memory).
# Relevant only when 4-bit quantization is in effect; presumably ignored in
# other modes - TODO confirm.
USE_DOUBLE_QUANT=true

# LongCat API (primary model, falls back to local Qwen on quota exhaustion)
# SECURITY: do not store a real API key in this file. Supply LONGCAT_API_KEY
# through the process environment or an untracked local copy of this file.
# A key was previously committed on this line; treat it as leaked and rotate
# it with the provider.
LONGCAT_API_KEY=
# Endpoint base URL (the /openai path suggests an OpenAI-compatible API - confirm).
LONGCAT_BASE_URL=https://api.longcat.chat/openai
# Model identifier used with the LongCat API.
LONGCAT_MODEL=LongCat-2.0-Preview