| | """CLI commands for Hugging Face Inference Endpoints.""" |
| |
|
| | import json |
| | from typing import Annotated, Optional |
| |
|
| | import typer |
| |
|
| | from huggingface_hub._inference_endpoints import InferenceEndpoint, InferenceEndpointScalingMetric |
| | from huggingface_hub.errors import HfHubHTTPError |
| |
|
| | from ._cli_utils import TokenOpt, get_hf_api, typer_factory |
| |
|
| |
|
| | ie_cli = typer_factory(help="Manage Hugging Face Inference Endpoints.") |
| |
|
| | catalog_app = typer_factory(help="Interact with the Inference Endpoints catalog.") |
| |
|
| | NameArg = Annotated[ |
| | str, |
| | typer.Argument(help="Endpoint name."), |
| | ] |
| | NameOpt = Annotated[ |
| | Optional[str], |
| | typer.Option(help="Endpoint name."), |
| | ] |
| |
|
| | NamespaceOpt = Annotated[ |
| | Optional[str], |
| | typer.Option( |
| | help="The namespace associated with the Inference Endpoint. Defaults to the current user's namespace.", |
| | ), |
| | ] |
| |
|
| |
|
| | def _print_endpoint(endpoint: InferenceEndpoint) -> None: |
| | typer.echo(json.dumps(endpoint.raw, indent=2, sort_keys=True)) |


@ie_cli.command()
def ls(
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """List all Inference Endpoints for the given namespace."""
    api = get_hf_api(token=token)
    try:
        endpoints = api.list_inference_endpoints(namespace=namespace, token=token)
    except HfHubHTTPError as error:
        typer.echo(f"Listing failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    typer.echo(
        json.dumps(
            {"items": [endpoint.raw for endpoint in endpoints]},
            indent=2,
            sort_keys=True,
        )
    )
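
# A hypothetical invocation, assuming this Typer app is mounted as `hf endpoints`
# in the top-level CLI (the mount point is defined outside this module):
#
#   hf endpoints ls --namespace my-org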


@ie_cli.command(name="deploy")
def deploy(
    name: NameArg,
    repo: Annotated[
        str,
        typer.Option(
            help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
        ),
    ],
    framework: Annotated[
        str,
        typer.Option(
            help="The machine learning framework used for the model (e.g. 'vllm').",
        ),
    ],
    accelerator: Annotated[
        str,
        typer.Option(
            help="The hardware accelerator to be used for inference (e.g. 'cpu').",
        ),
    ],
    instance_size: Annotated[
        str,
        typer.Option(
            help="The size or type of the instance to be used for hosting the model (e.g. 'x4').",
        ),
    ],
    instance_type: Annotated[
        str,
        typer.Option(
            help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').",
        ),
    ],
    region: Annotated[
        str,
        typer.Option(
            help="The cloud region in which the Inference Endpoint will be created (e.g. 'us-east-1').",
        ),
    ],
    vendor: Annotated[
        str,
        typer.Option(
            help="The cloud provider or vendor where the Inference Endpoint will be hosted (e.g. 'aws').",
        ),
    ],
    *,
    namespace: NamespaceOpt = None,
    task: Annotated[
        Optional[str],
        typer.Option(
            help="The task on which to deploy the model (e.g. 'text-classification').",
        ),
    ] = None,
    token: TokenOpt = None,
    min_replica: Annotated[
        int,
        typer.Option(
            help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
        ),
    ] = 1,
    max_replica: Annotated[
        int,
        typer.Option(
            help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
        ),
    ] = 1,
    scale_to_zero_timeout: Annotated[
        Optional[int],
        typer.Option(
            help="The duration in minutes before an inactive endpoint is scaled to zero.",
        ),
    ] = None,
    scaling_metric: Annotated[
        Optional[InferenceEndpointScalingMetric],
        typer.Option(
            help="The metric reference for scaling.",
        ),
    ] = None,
    scaling_threshold: Annotated[
        Optional[float],
        typer.Option(
            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
        ),
    ] = None,
) -> None:
    """Deploy an Inference Endpoint from a Hub repository."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.create_inference_endpoint(
            name=name,
            repository=repo,
            framework=framework,
            accelerator=accelerator,
            instance_size=instance_size,
            instance_type=instance_type,
            region=region,
            vendor=vendor,
            namespace=namespace,
            task=task,
            token=token,
            min_replica=min_replica,
            max_replica=max_replica,
            scaling_metric=scaling_metric,
            scaling_threshold=scaling_threshold,
            scale_to_zero_timeout=scale_to_zero_timeout,
        )
    except HfHubHTTPError as error:
        typer.echo(f"Deployment failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
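
# A hypothetical invocation, with illustrative values taken from the option help
# texts above (the `hf endpoints` mount point is an assumption):
#
#   hf endpoints deploy my-endpoint \
#       --repo openai/gpt-oss-120b --framework vllm --accelerator cpu \
#       --instance-size x4 --instance-type intel-icl --region us-east-1 \
#       --vendor aws --min-replica 1 --max-replica 2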


@catalog_app.command(name="deploy")
def deploy_from_catalog(
    repo: Annotated[
        str,
        typer.Option(
            help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
        ),
    ],
    name: NameOpt = None,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Deploy an Inference Endpoint from the Model Catalog."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.create_inference_endpoint_from_catalog(
            repo_id=repo,
            name=name,
            namespace=namespace,
            token=token,
        )
    except HfHubHTTPError as error:
        typer.echo(f"Deployment failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
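
# A hypothetical invocation (assuming the `hf endpoints` mount point, as above);
# the repo should be one of the catalog models listed by `catalog ls` below:
#
#   hf endpoints catalog deploy --repo openai/gpt-oss-120b --name my-endpoint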


def list_catalog(
    token: TokenOpt = None,
) -> None:
    """List available Catalog models."""
    api = get_hf_api(token=token)
    try:
        models = api.list_inference_catalog(token=token)
    except HfHubHTTPError as error:
        typer.echo(f"Catalog fetch failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    typer.echo(json.dumps({"models": models}, indent=2, sort_keys=True))


# Register the same callback twice: as `catalog ls` and as a hidden
# top-level `list-catalog` alias.
catalog_app.command(name="ls")(list_catalog)
ie_cli.command(name="list-catalog", help="List available Catalog models.", hidden=True)(list_catalog)


ie_cli.add_typer(catalog_app, name="catalog")
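
# Hypothetical invocations (assuming the `hf endpoints` mount point); both
# forms call the same `list_catalog` function:
#
#   hf endpoints catalog ls
#   hf endpoints list-catalog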


@ie_cli.command()
def describe(
    name: NameArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Get information about an existing Inference Endpoint."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.get_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as error:
        typer.echo(f"Fetch failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
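
# A hypothetical invocation (assuming the `hf endpoints` mount point); prints
# the endpoint's raw JSON description:
#
#   hf endpoints describe my-endpoint --namespace my-org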


@ie_cli.command()
def update(
    name: NameArg,
    namespace: NamespaceOpt = None,
    repo: Annotated[
        Optional[str],
        typer.Option(
            help="The name of the model repository associated with the Inference Endpoint (e.g. 'openai/gpt-oss-120b').",
        ),
    ] = None,
    accelerator: Annotated[
        Optional[str],
        typer.Option(
            help="The hardware accelerator to be used for inference (e.g. 'cpu').",
        ),
    ] = None,
    instance_size: Annotated[
        Optional[str],
        typer.Option(
            help="The size or type of the instance to be used for hosting the model (e.g. 'x4').",
        ),
    ] = None,
    instance_type: Annotated[
        Optional[str],
        typer.Option(
            help="The cloud instance type where the Inference Endpoint will be deployed (e.g. 'intel-icl').",
        ),
    ] = None,
    framework: Annotated[
        Optional[str],
        typer.Option(
            help="The machine learning framework used for the model (e.g. 'custom').",
        ),
    ] = None,
    revision: Annotated[
        Optional[str],
        typer.Option(
            help="The specific model revision to deploy on the Inference Endpoint (e.g. '6c0e6080953db56375760c0471a8c5f2929baf11').",
        ),
    ] = None,
    task: Annotated[
        Optional[str],
        typer.Option(
            help="The task on which to deploy the model (e.g. 'text-classification').",
        ),
    ] = None,
    min_replica: Annotated[
        Optional[int],
        typer.Option(
            help="The minimum number of replicas (instances) to keep running for the Inference Endpoint.",
        ),
    ] = None,
    max_replica: Annotated[
        Optional[int],
        typer.Option(
            help="The maximum number of replicas (instances) to scale to for the Inference Endpoint.",
        ),
    ] = None,
    scale_to_zero_timeout: Annotated[
        Optional[int],
        typer.Option(
            help="The duration in minutes before an inactive endpoint is scaled to zero.",
        ),
    ] = None,
    scaling_metric: Annotated[
        Optional[InferenceEndpointScalingMetric],
        typer.Option(
            help="The metric reference for scaling.",
        ),
    ] = None,
    scaling_threshold: Annotated[
        Optional[float],
        typer.Option(
            help="The scaling metric threshold used to trigger a scale up. Ignored when scaling metric is not provided.",
        ),
    ] = None,
    token: TokenOpt = None,
) -> None:
    """Update an existing Inference Endpoint."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.update_inference_endpoint(
            name=name,
            namespace=namespace,
            repository=repo,
            framework=framework,
            revision=revision,
            task=task,
            accelerator=accelerator,
            instance_size=instance_size,
            instance_type=instance_type,
            min_replica=min_replica,
            max_replica=max_replica,
            scale_to_zero_timeout=scale_to_zero_timeout,
            scaling_metric=scaling_metric,
            scaling_threshold=scaling_threshold,
            token=token,
        )
    except HfHubHTTPError as error:
        typer.echo(f"Update failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
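
# A hypothetical invocation (assuming the `hf endpoints` mount point); every
# option defaults to None, and options left unset are not modified:
#
#   hf endpoints update my-endpoint --min-replica 1 --max-replica 4 \
#       --scale-to-zero-timeout 15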


@ie_cli.command()
def delete(
    name: NameArg,
    namespace: NamespaceOpt = None,
    yes: Annotated[
        bool,
        typer.Option("--yes", help="Skip confirmation prompts."),
    ] = False,
    token: TokenOpt = None,
) -> None:
    """Delete an Inference Endpoint permanently."""
    if not yes:
        confirmation = typer.prompt(f"Delete endpoint '{name}'? Type the name to confirm.")
        if confirmation != name:
            typer.echo("Aborted.")
            raise typer.Exit(code=2)

    api = get_hf_api(token=token)
    try:
        api.delete_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as error:
        typer.echo(f"Delete failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    typer.echo(f"Deleted '{name}'.")


@ie_cli.command()
def pause(
    name: NameArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Pause an Inference Endpoint."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.pause_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as error:
        typer.echo(f"Pause failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
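
# A hypothetical invocation (assuming the `hf endpoints` mount point); a paused
# endpoint stays down until explicitly resumed with `resume`:
#
#   hf endpoints pause my-endpoint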


@ie_cli.command()
def resume(
    name: NameArg,
    namespace: NamespaceOpt = None,
    fail_if_already_running: Annotated[
        bool,
        typer.Option(
            "--fail-if-already-running",
            help="If set, raise an error when the Inference Endpoint is already running.",
        ),
    ] = False,
    token: TokenOpt = None,
) -> None:
    """Resume an Inference Endpoint."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.resume_inference_endpoint(
            name=name,
            namespace=namespace,
            token=token,
            running_ok=not fail_if_already_running,
        )
    except HfHubHTTPError as error:
        typer.echo(f"Resume failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
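
# A hypothetical invocation (assuming the `hf endpoints` mount point); without
# `--fail-if-already-running`, resuming succeeds even if the endpoint is
# already running:
#
#   hf endpoints resume my-endpoint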


@ie_cli.command()
def scale_to_zero(
    name: NameArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Scale an Inference Endpoint to zero."""
    api = get_hf_api(token=token)
    try:
        endpoint = api.scale_to_zero_inference_endpoint(name=name, namespace=namespace, token=token)
    except HfHubHTTPError as error:
        typer.echo(f"Scaling to zero failed: {error}")
        raise typer.Exit(code=error.response.status_code) from error

    _print_endpoint(endpoint)
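
# A hypothetical invocation (assuming the `hf endpoints` mount point); note
# that typer exposes the function as `scale-to-zero`:
#
#   hf endpoints scale-to-zero my-endpoint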