wordle

Running

App Files Files Community

wordle / src /core /containers /runtime /providers.py

burtenshaw HF Staff

Upload folder using huggingface_hub

ef97fda verified 15 days ago

raw

history blame contribute delete

20.2 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Container provider abstractions for running environment servers.

	This module provides a pluggable architecture for different container providers
	(local Docker, Kubernetes, cloud providers, etc.) to be used with EnvClient.
	"""

	from __future__ import annotations

	from abc import ABC, abstractmethod
	from typing import Any, Dict, Optional, Sequence


	class ContainerProvider(ABC):
	    """
	    Interface that every container backend implements.

	    A provider owns the lifecycle of exactly one container and hands back the
	    base URL used to reach it. Backends this interface is meant to cover:
	    - LocalDockerProvider: Runs containers on local Docker daemon
	    - KubernetesProvider: Runs containers in Kubernetes cluster
	    - FargateProvider: Runs containers on AWS Fargate
	    - CloudRunProvider: Runs containers on Google Cloud Run

	    Example:
	        >>> provider = LocalDockerProvider()
	        >>> base_url = provider.start_container("echo-env:latest")
	        >>> print(base_url)  # http://localhost:8000
	        >>> # Use the environment via base_url
	        >>> provider.stop_container()
	    """

	    @abstractmethod
	    def start_container(
	        self,
	        image: str,
	        port: Optional[int] = None,
	        env_vars: Optional[Dict[str, str]] = None,
	        **kwargs: Any,
	    ) -> str:
	        """
	        Launch a container from ``image`` and report where to reach it.

	        Args:
	            image: Container image name (e.g., "echo-env:latest")
	            port: Port to expose; when None the provider picks one
	            env_vars: Environment variables handed to the container
	            **kwargs: Options specific to the concrete provider

	        Returns:
	            Base URL for talking to the container
	            (e.g., "http://localhost:8000")

	        Raises:
	            RuntimeError: If the container cannot be started
	        """

	    @abstractmethod
	    def stop_container(self) -> None:
	        """
	        Stop and remove the container launched by start_container().
	        """

	    @abstractmethod
	    def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
	        """
	        Block until the container accepts requests.

	        Implementations typically poll the /health endpoint until it
	        answers with HTTP 200.

	        Args:
	            base_url: Base URL of the container
	            timeout_s: Upper bound, in seconds, on how long to wait

	        Raises:
	            TimeoutError: If the container is not ready within ``timeout_s``
	        """


	class LocalDockerProvider(ContainerProvider):
	    """
	    Container provider for the local Docker daemon.

	    Runs a single container on the local machine through the ``docker`` CLI.
	    Useful for development and testing.

	    Example:
	        >>> provider = LocalDockerProvider()
	        >>> base_url = provider.start_container("echo-env:latest")
	        >>> # Container running on http://localhost:<random-port>
	        >>> provider.stop_container()
	    """

	    def __init__(self):
	        """
	        Initialize the provider and verify that the Docker CLI responds.

	        Raises:
	            RuntimeError: If ``docker version`` fails, times out, or the
	                ``docker`` executable cannot be found.
	        """
	        self._container_id: Optional[str] = None
	        self._container_name: Optional[str] = None

	        # Check if Docker is available
	        import subprocess

	        try:
	            subprocess.run(
	                ["docker", "version"],
	                check=True,
	                capture_output=True,
	                timeout=5,
	            )
	        except (
	            subprocess.CalledProcessError,
	            FileNotFoundError,
	            subprocess.TimeoutExpired,
	        ) as exc:
	            # Chain the cause so the underlying failure stays visible.
	            raise RuntimeError(
	                "Docker is not available. Please install Docker Desktop or Docker Engine."
	            ) from exc

	    def start_container(
	        self,
	        image: str,
	        port: Optional[int] = None,
	        env_vars: Optional[Dict[str, str]] = None,
	        **kwargs: Any,
	    ) -> str:
	        """
	        Start a Docker container locally with ``docker run -d``.

	        The host port is mapped to port 8000 inside the container.

	        Args:
	            image: Docker image name
	            port: Host port to publish (if None, an available port is found)
	            env_vars: Environment variables for the container
	            **kwargs: Accepted for interface compatibility; this
	                implementation does not currently use them.

	        Returns:
	            Base URL to connect to the container (``http://localhost:<port>``)

	        Raises:
	            RuntimeError: If ``docker run`` exits with a non-zero status
	        """
	        import subprocess
	        import time

	        # Find available port if not specified
	        if port is None:
	            port = self._find_available_port()

	        # Generate container name
	        self._container_name = self._generate_container_name(image)

	        # Build docker run command
	        cmd = [
	            "docker",
	            "run",
	            "-d",  # Detached
	            "--name",
	            self._container_name,
	            "-p",
	            f"{port}:8000",  # Map port
	        ]

	        # Add environment variables
	        if env_vars:
	            for key, value in env_vars.items():
	                cmd.extend(["-e", f"{key}={value}"])

	        # Add image
	        cmd.append(image)

	        # Run container; `docker run -d` prints the new container id on stdout
	        try:
	            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
	        except subprocess.CalledProcessError as e:
	            error_msg = (
	                "Failed to start Docker container.\n"
	                f"Command: {' '.join(cmd)}\n"
	                f"Exit code: {e.returncode}\n"
	                f"Stderr: {e.stderr}\n"
	                f"Stdout: {e.stdout}"
	            )
	            raise RuntimeError(error_msg) from e
	        self._container_id = result.stdout.strip()

	        # Wait a moment for container to start
	        time.sleep(1)

	        return f"http://localhost:{port}"

	    def stop_container(self) -> None:
	        """
	        Stop and remove the Docker container (best effort).

	        Does nothing when no container id is recorded. A graceful
	        ``docker stop`` is attempted first; whether or not it succeeds, the
	        container is then removed with ``docker rm -f`` so that a stop that
	        fails or times out does not leave the container behind. Failures of
	        either command are ignored, and the tracked id/name are always cleared.
	        """
	        if self._container_id is None:
	            return

	        import subprocess

	        container_id = self._container_id
	        ignorable = (subprocess.CalledProcessError, subprocess.TimeoutExpired)

	        try:
	            try:
	                # Graceful stop. The CLI may need as long as this timeout for
	                # containers that ignore SIGTERM, so a timeout here is expected
	                # and handled by the forced removal below.
	                subprocess.run(
	                    ["docker", "stop", container_id],
	                    capture_output=True,
	                    check=True,
	                    timeout=10,
	                )
	            except ignorable:
	                pass

	            try:
	                # Forced removal also kills a container that is still running.
	                subprocess.run(
	                    ["docker", "rm", "-f", container_id],
	                    capture_output=True,
	                    check=True,
	                    timeout=10,
	                )
	            except ignorable:
	                # Container might already be removed
	                pass
	        finally:
	            self._container_id = None
	            self._container_name = None

	    def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
	        """
	        Wait for container to be ready by polling /health endpoint.

	        Args:
	            base_url: Base URL of the container
	            timeout_s: Maximum time to wait, in seconds

	        Raises:
	            TimeoutError: If no HTTP 200 is received within ``timeout_s``
	        """
	        import time

	        import requests

	        # Monotonic clock: the deadline is unaffected by wall-clock changes.
	        deadline = time.monotonic() + timeout_s
	        health_url = f"{base_url}/health"

	        # Bypass proxy for localhost to avoid proxy issues
	        proxies = {"http": None, "https": None}

	        while time.monotonic() < deadline:
	            try:
	                response = requests.get(health_url, timeout=2.0, proxies=proxies)
	                if response.status_code == 200:
	                    return
	            except requests.RequestException:
	                # Server not accepting connections yet; retry after a pause.
	                pass

	            time.sleep(0.5)

	        raise TimeoutError(
	            f"Container at {base_url} did not become ready within {timeout_s}s"
	        )

	    def _find_available_port(self) -> int:
	        """
	        Find an available port on localhost.

	        Binds a temporary socket to port 0 so the OS assigns a free port, then
	        releases it. The port is not reserved after this method returns.

	        Returns:
	            An available port number
	        """
	        import socket

	        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
	            s.bind(("", 0))
	            s.listen(1)
	            return s.getsockname()[1]

	    def _generate_container_name(self, image: str) -> str:
	        """
	        Generate a container name from the image name and a timestamp.

	        The registry/namespace prefix and any tag or digest suffix are
	        dropped, and characters outside ``[a-zA-Z0-9_.-]`` are replaced with
	        ``-`` so that digest references such as ``img@sha256:...`` still give
	        a usable name.

	        Args:
	            image: Docker image name

	        Returns:
	            ``<image-base>-<milliseconds since epoch>``
	        """
	        import re
	        import time

	        base = image.split("/")[-1].split("@")[0].split(":")[0]
	        # The name must start with an alphanumeric character.
	        clean_image = re.sub(r"[^a-zA-Z0-9_.-]", "-", base).lstrip("_.-") or "container"
	        timestamp = int(time.time() * 1000)
	        return f"{clean_image}-{timestamp}"


	class DockerSwarmProvider(ContainerProvider):
	    """
	    Container provider that uses Docker Swarm services for local concurrency.

	    This provider creates a replicated Swarm service backed by the local Docker
	    engine. The built-in load-balancer fans requests across the replicas,
	    allowing multiple container instances to run concurrently on the developer
	    workstation (mirroring the workflow described in the Docker stack docs).
	    """

	    def __init__(
	        self,
	        *,
	        auto_init_swarm: bool = True,
	        overlay_network: Optional[str] = None,
	    ):
	        """
	        Verify Docker, make sure Swarm is active, and prepare the network.

	        Args:
	            auto_init_swarm: Whether to call ``docker swarm init`` when Swarm
	                is not active. Otherwise, user must manually initialize Swarm.
	            overlay_network: Optional overlay network name for the service.
	                When provided, the network is created with
	                ``docker network create --driver overlay --attachable`` if it
	                does not already exist.

	        Raises:
	            RuntimeError: If Docker is unavailable, Swarm is inactive and
	                cannot be (or may not be) initialized, or the overlay network
	                cannot be created.
	        """
	        self._service_name: Optional[str] = None
	        self._service_id: Optional[str] = None
	        self._published_port: Optional[int] = None
	        self._overlay_network = overlay_network
	        self._auto_init_swarm = auto_init_swarm

	        self._ensure_docker_available()
	        self._ensure_swarm_initialized()
	        if self._overlay_network:
	            self._ensure_overlay_network(self._overlay_network)

	    def start_container(
	        self,
	        image: str,
	        port: Optional[int] = None,
	        env_vars: Optional[Dict[str, str]] = None,
	        **kwargs: Any,
	    ) -> str:
	        """
	        Create a replicated Swarm service for the given image.

	        The published host port is mapped to port 8000 inside the replicas.

	        Args:
	            image: Docker image name
	            port: Host port to publish (if None, an available port is found)
	            env_vars: Environment variables for the service
	            **kwargs: See "Supported kwargs" below

	        Supported kwargs:
	            replicas (int): Number of container replicas (default: 2;
	                values below 1 are raised to 1).
	            cpu_limit (float | str): CPU limit passed to ``--limit-cpu``.
	            memory_limit (str): Memory limit passed to ``--limit-memory``.
	            constraints (Sequence[str]): Placement constraints.
	            labels (Dict[str, str]): Service labels.
	            command (Sequence[str] | str): Override container command. A
	                string is split with ``shlex.split``.

	        Returns:
	            Base URL of the published service (``http://localhost:<port>``)

	        Raises:
	            ValueError: If an unsupported keyword argument is passed
	            RuntimeError: If ``docker service create`` fails
	        """
	        import shlex
	        import subprocess
	        import time

	        allowed_kwargs = {
	            "replicas",
	            "cpu_limit",
	            "memory_limit",
	            "constraints",
	            "labels",
	            "command",
	        }
	        unknown = set(kwargs) - allowed_kwargs
	        if unknown:
	            raise ValueError(f"Unsupported kwargs for DockerSwarmProvider: {unknown}")

	        replicas = int(kwargs.get("replicas", 2))
	        cpu_limit = kwargs.get("cpu_limit")
	        memory_limit = kwargs.get("memory_limit")
	        constraints: Optional[Sequence[str]] = kwargs.get("constraints")
	        labels: Optional[Dict[str, str]] = kwargs.get("labels")
	        command_override = kwargs.get("command")

	        if port is None:
	            port = self._find_available_port()

	        self._service_name = self._generate_service_name(image)
	        self._published_port = port

	        cmd = [
	            "docker",
	            "service",
	            "create",
	            "--detach",
	            "--name",
	            self._service_name,
	            "--replicas",
	            str(max(1, replicas)),
	            "--publish",
	            f"{port}:8000",
	        ]

	        if self._overlay_network:
	            cmd.extend(["--network", self._overlay_network])

	        if env_vars:
	            for key, value in env_vars.items():
	                cmd.extend(["--env", f"{key}={value}"])

	        if cpu_limit is not None:
	            cmd.extend(["--limit-cpu", str(cpu_limit)])

	        if memory_limit is not None:
	            cmd.extend(["--limit-memory", str(memory_limit)])

	        if constraints:
	            for constraint in constraints:
	                cmd.extend(["--constraint", constraint])

	        if labels:
	            for key, value in labels.items():
	                cmd.extend(["--label", f"{key}={value}"])

	        # Everything after the image is treated as the container command.
	        cmd.append(image)

	        if command_override:
	            if isinstance(command_override, str):
	                cmd.extend(shlex.split(command_override))
	            else:
	                # Stringify so the command can always be joined for error text.
	                cmd.extend(str(part) for part in command_override)

	        try:
	            result = subprocess.run(
	                cmd,
	                capture_output=True,
	                text=True,
	                check=True,
	            )
	        except subprocess.CalledProcessError as e:
	            error_msg = (
	                "Failed to start Docker Swarm service.\n"
	                f"Command: {' '.join(cmd)}\n"
	                f"Exit code: {e.returncode}\n"
	                f"Stdout: {e.stdout}\n"
	                f"Stderr: {e.stderr}"
	            )
	            raise RuntimeError(error_msg) from e
	        self._service_id = result.stdout.strip()

	        # Give Swarm a brief moment to schedule the tasks.
	        time.sleep(1.0)

	        return f"http://localhost:{port}"

	    def stop_container(self) -> None:
	        """
	        Remove the Swarm service (and keep the Swarm manager running).

	        Best effort: does nothing when no service is recorded, ignores a
	        failing or timed-out ``docker service rm``, and always clears the
	        tracked service name, id and port.
	        """
	        if not self._service_name:
	            return

	        import subprocess

	        try:
	            subprocess.run(
	                ["docker", "service", "rm", self._service_name],
	                capture_output=True,
	                check=True,
	                timeout=10,
	            )
	        except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
	            # Service may already be gone, or the daemon is slow; cleanup is
	            # best effort and must not raise.
	            pass
	        finally:
	            self._service_name = None
	            self._service_id = None
	            self._published_port = None

	    def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
	        """
	        Wait for at least one replica to become healthy by polling /health.

	        Note: With Swarm's load balancer, requests round-robin across replicas,
	        so this only verifies that at least one replica is responding. Some
	        replicas may still be starting when this returns.

	        Args:
	            base_url: Base URL of the service
	            timeout_s: Maximum time to wait, in seconds

	        Raises:
	            TimeoutError: If no HTTP 200 is received within ``timeout_s``
	        """
	        import time

	        import requests

	        # Monotonic clock: the deadline is unaffected by wall-clock changes.
	        deadline = time.monotonic() + timeout_s
	        health_url = f"{base_url}/health"

	        # Bypass proxy for localhost to avoid proxy issues
	        proxies = {"http": None, "https": None}

	        while time.monotonic() < deadline:
	            try:
	                response = requests.get(health_url, timeout=2.0, proxies=proxies)
	                if response.status_code == 200:
	                    return
	            except requests.RequestException:
	                # Nothing is answering yet; retry after a pause.
	                pass

	            time.sleep(0.5)

	        raise TimeoutError(
	            f"Swarm service at {base_url} did not become ready within {timeout_s}s"
	        )

	    def _ensure_docker_available(self) -> None:
	        """
	        Verify that ``docker version`` succeeds.

	        Raises:
	            RuntimeError: If the command fails, times out, or the ``docker``
	                executable cannot be found.
	        """
	        import subprocess

	        try:
	            subprocess.run(
	                ["docker", "version"],
	                check=True,
	                capture_output=True,
	                timeout=5,
	            )
	        except (
	            subprocess.CalledProcessError,
	            FileNotFoundError,
	            subprocess.TimeoutExpired,
	        ) as exc:
	            raise RuntimeError(
	                "Docker is not available. Please install Docker Desktop or Docker Engine."
	            ) from exc

	    def _ensure_swarm_initialized(self) -> None:
	        """
	        Make sure the local node is an active Swarm member.

	        Reads ``.Swarm.LocalNodeState`` from ``docker info``. When the state
	        is not ``active`` (or cannot be determined), runs ``docker swarm init``
	        if ``auto_init_swarm`` was enabled.

	        Raises:
	            RuntimeError: If Swarm is inactive and auto-init is disabled, or
	                if ``docker swarm init`` fails or times out.
	        """
	        import subprocess

	        command_errors = (subprocess.CalledProcessError, subprocess.TimeoutExpired)

	        try:
	            result = subprocess.run(
	                ["docker", "info", "--format", "{{.Swarm.LocalNodeState}}"],
	                capture_output=True,
	                text=True,
	                check=True,
	                timeout=5,
	            )
	        except command_errors:
	            state = "unknown"
	        else:
	            state = result.stdout.strip().lower()
	            if state == "active":
	                return

	        if not self._auto_init_swarm:
	            raise RuntimeError(
	                f"Docker Swarm is not active (state={state}). Enable Swarm manually or pass auto_init_swarm=True."
	            )

	        try:
	            subprocess.run(
	                ["docker", "swarm", "init"],
	                check=True,
	                capture_output=True,
	                timeout=10,
	            )
	        except command_errors as e:
	            raise RuntimeError("Failed to initialize Docker Swarm") from e

	    def _ensure_overlay_network(self, network: str) -> None:
	        """
	        Create the attachable overlay network ``network`` if it is missing.

	        Existence is checked with ``docker network inspect``; a zero exit
	        status means the network is already there.

	        Args:
	            network: Name of the overlay network

	        Raises:
	            RuntimeError: If the inspection times out or the network cannot
	                be created.
	        """
	        import subprocess

	        try:
	            inspect = subprocess.run(
	                ["docker", "network", "inspect", network],
	                capture_output=True,
	                text=True,
	                check=False,
	                timeout=10,
	            )
	        except subprocess.TimeoutExpired as e:
	            raise RuntimeError(
	                f"Timed out inspecting overlay network '{network}'"
	            ) from e
	        if inspect.returncode == 0:
	            return

	        try:
	            subprocess.run(
	                [
	                    "docker",
	                    "network",
	                    "create",
	                    "--driver",
	                    "overlay",
	                    "--attachable",
	                    network,
	                ],
	                check=True,
	                capture_output=True,
	                timeout=10,
	            )
	        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
	            raise RuntimeError(f"Failed to create overlay network '{network}'") from e

	    def _find_available_port(self) -> int:
	        """
	        Return a port the OS reports as free.

	        Binds a temporary socket to port 0 and releases it; the port is not
	        reserved after this method returns.
	        """
	        import socket

	        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
	            s.bind(("", 0))
	            s.listen(1)
	            return s.getsockname()[1]

	    def _generate_service_name(self, image: str) -> str:
	        """
	        Generate a service name from the image name and a timestamp.

	        The registry/namespace prefix and any tag or digest suffix are
	        dropped, characters outside ``[a-zA-Z0-9_-]`` are replaced with ``-``,
	        and the image part is truncated so the whole name stays within 63
	        characters.

	        Args:
	            image: Docker image name

	        Returns:
	            ``<image-base>-swarm-<milliseconds since epoch>``
	        """
	        import re
	        import time

	        base = image.split("/")[-1].split("@")[0].split(":")[0]
	        sanitized = re.sub(r"[^a-zA-Z0-9_-]", "-", base).lstrip("_-")
	        # 63 total - len("-swarm-") - 13 timestamp digits leaves 43 characters.
	        clean_image = sanitized[:43] or "service"
	        timestamp = int(time.time() * 1000)
	        return f"{clean_image}-swarm-{timestamp}"


	class KubernetesProvider(ContainerProvider):
	    """
	    Placeholder container provider for Kubernetes clusters.

	    The intended design is to create pods in a Kubernetes cluster and expose
	    them via services or port-forwarding. This class currently defines no
	    methods of its own, so none of the ContainerProvider interface is
	    implemented here yet.

	    Intended usage:
	        >>> provider = KubernetesProvider(namespace="envtorch-dev")
	        >>> base_url = provider.start_container("echo-env:latest")
	        >>> # Pod running in k8s, accessible via service or port-forward
	        >>> provider.stop_container()
	    """


	class RuntimeProvider(ABC):
	    """
	    Abstract base class for runtime providers that are not container providers.
	    Providers implement this interface to support different runtime platforms:
	    - UVProvider: Runs environments via `uv run`

	    The provider manages a single runtime lifecycle and provides the base URL
	    for connecting to it. Instances are context managers: entering calls
	    ``start()`` with its default arguments and exiting calls ``stop()``.

	    Example:
	        >>> provider = UVProvider(project_path="/path/to/env")
	        >>> base_url = provider.start()
	        >>> print(base_url)  # http://localhost:8000
	        >>> provider.stop()
	    """

	    @abstractmethod
	    def start(
	        self,
	        port: Optional[int] = None,
	        env_vars: Optional[Dict[str, str]] = None,
	        **kwargs: Any,
	    ) -> str:
	        """
	        Start the runtime.

	        Args:
	            port: Port to expose (if None, provider chooses)
	            env_vars: Environment variables for the runtime
	            **kwargs: Additional runtime options

	        Returns:
	            Base URL to connect to the runtime
	        """

	    @abstractmethod
	    def stop(self) -> None:
	        """
	        Stop the runtime.
	        """

	    @abstractmethod
	    def wait_for_ready(self, timeout_s: float = 30.0) -> None:
	        """
	        Wait for the runtime to be ready to accept requests.

	        Args:
	            timeout_s: Maximum time to wait, in seconds
	        """

	    def __enter__(self) -> "RuntimeProvider":
	        """
	        Start the runtime with default arguments and return this provider.

	        Note that this does not call ``wait_for_ready()``.
	        """
	        self.start()
	        return self

	    def __exit__(self, exc_type, exc, tb) -> bool:
	        """
	        Stop the runtime when leaving the ``with`` block.

	        Returns:
	            False, so an exception raised inside the block is not suppressed.
	        """
	        self.stop()
	        return False