---
# Stack 2.9 Configuration
# This file provides additional configuration options

server:
  host: "0.0.0.0"
  port: 8000
  workers: 1

model:
  # Default model - can be overridden by MODEL_ID env var
  id: "TheBloke/Llama-2-7B-Chat-AWQ"
  cache_dir: "/home/vllm/.cache/huggingface"
  # NOTE(review): trust_remote_code executes model-repo code on load —
  # keep this true only for trusted model sources.
  trust_remote_code: true

# vLLM engine configuration
vllm:
  tensor_parallel_size: 1
  gpu_memory_utilization: 0.9
  max_model_len: 4096
  max_num_seqs: 64
  max_num_batched_tokens: 4096
  quantization: "awq"
  enforce_eager: false
  disable_log_stats: false

# Performance tuning
performance:
  # Thread configuration
  omp_num_threads: 4
  # CUDA settings
  cuda_launch_blocking: 0
  cudnn_loginfo_dbg: 1

# CORS (if needed for web UI)
cors:
  enabled: false
  allow_origins: ["*"]
  allow_methods: ["*"]
  allow_headers: ["*"]

# Logging
logging:
  level: "INFO"
  format: "json"
  include_timestamps: true