# AI Model Configuration
# One KEY=value assignment per line; full-line "#" comments only.
#
# MODEL_ID options:
# Qwen/Qwen2.5-0.5B-Instruct (Nano - ~1GB, fastest, CPU-friendly)
# Qwen/Qwen2.5-1.5B-Instruct (Goldilocks - ~3GB, recommended for GPU)
# Qwen/Qwen2.5-7B-Instruct (Smartest - needs 4-bit quantization + GPU)
MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct

# Quantization mode. Allowed values: auto, 4bit, 8bit, none
# auto = 4-bit on GPU for 7B, 8-bit on GPU for 1.5B, bf16 on CPU
# 4bit = force 4-bit quantization (requires GPU + bitsandbytes)
# 8bit = force 8-bit quantization
# none = disable quantization, uses bfloat16
QUANTIZATION=auto

# Double quantization for 4-bit (saves ~10% more memory)
USE_DOUBLE_QUANT=true

# LongCat API (primary model, falls back to local Qwen on quota exhaustion)
# SECURITY: the API key below is a plain-text secret. If this file has been
# committed or shared, rotate the key and supply it from an untracked file or
# the process environment instead of keeping it here.
LONGCAT_API_KEY=ak_2cA3Uc1hN6nj5k69km3dt4lT9Yf74
LONGCAT_BASE_URL=https://api.longcat.chat/openai
LONGCAT_MODEL=LongCat-2.0-Preview