from huggingface_hub import create_inference_endpoint

# Create an Inference Endpoint that runs the custom vLLM container image
endpoint = create_inference_endpoint(
    "vllm-meta-llama-3-8b-instruct",
    repository="meta-llama/Meta-Llama-3-8B-Instruct",
    framework="pytorch",
    task="custom",
    accelerator="gpu",
    vendor="aws",
    region="us-east-1",
    type="protected",
    instance_type="g5.2xlarge",
    instance_size="medium",
    custom_image={
        "health_route": "/health",
        "env": {"MAX_MODEL_LEN": "8192"},
        "url": "philschmi/vllm-hf-inference-endpoints",
    },
)

# Block until the endpoint is deployed, then check its status
endpoint.wait()
print(endpoint.status)
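Once the endpoint reports a running status, you can send requests to it. Below is a minimal sketch of querying the deployed model, assuming the container exposes vLLM's OpenAI-compatible API under `/v1` and that a Hugging Face token (the `hf_...` placeholder) authenticates against the protected endpoint; the model name and prompt are illustrative.

from openai import OpenAI

# endpoint.url is the public URL of the deployed Inference Endpoint;
# the protected endpoint accepts a Hugging Face token as the API key (assumption)
client = OpenAI(
    base_url=endpoint.url + "/v1",
    api_key="hf_...",  # replace with your Hugging Face token
)

# Standard OpenAI-style chat completion request against the vLLM server
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "What is deep learning?"}],
    max_tokens=256,
)
print(response.choices[0].message.content)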