| # AI Model Configuration | |
| # Options for MODEL_ID: | |
| # Qwen/Qwen2.5-0.5B-Instruct (Nano - ~1GB, fastest, CPU-friendly) | |
| # Qwen/Qwen2.5-1.5B-Instruct (Goldilocks - ~3GB, recommended for GPU) | |
| # Qwen/Qwen2.5-7B-Instruct (Smartest - needs 4-bit quantization + GPU) | |
| MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct | |
| # Quantization mode: auto, 4bit, 8bit, none | |
| # auto = 4-bit on GPU for 7B, 8-bit on GPU for 1.5B, bfloat16 on CPU | |
| # 4bit = force 4-bit quantization (requires GPU + bitsandbytes) | |
| # 8bit = force 8-bit quantization | |
| # none = disable quantization, uses bfloat16 | |
| QUANTIZATION=auto | |
| # Enable double quantization when 4-bit quantization is used (saves ~10% more memory) | |
| USE_DOUBLE_QUANT=true | |
| # LongCat API (primary model, falls back to local Qwen on quota exhaustion) | |
| # SECURITY: never commit a real API key to version control. Set LONGCAT_API_KEY | |
| # from an untracked local env file or a secret manager. The key that was | |
| # previously committed on this line is exposed and must be revoked and rotated. | |
| LONGCAT_API_KEY=your-longcat-api-key-here | |
| # Base URL of the LongCat endpoint (path suggests an OpenAI-compatible API - confirm) | |
| LONGCAT_BASE_URL=https://api.longcat.chat/openai | |
| # Model name requested from the LongCat API | |
| LONGCAT_MODEL=LongCat-2.0-Preview | |