```bash
sudo docker run --rm \
  -p 8080:80 \
  -e GPTQ_BITS=4 \
  -e GPTQ_GROUPSIZE=128 \
  -e MAX_BEST_OF=1 \
  -e MAX_BATCH_PREFILL_TOKENS=2048 \
  --gpus '"device=0"' \
  -v "$PWD/data:/data" ghcr.io/huggingface/text-generation-inference:sha-bce5e22 \
  --model-id /data/WizardCoder-Python-34B-V1.0-GPTQ \
  --quantize gptq
```