Spaces:
Sleeping
Sleeping
Deploy builder with retry logic and auto-restart
Browse files
app.py
CHANGED
|
@@ -168,6 +168,12 @@ class Config:
|
|
| 168 |
log_format: LogFormat = field(default_factory=lambda: LogFormat(os.getenv("LOG_FORMAT", "text").lower()) if os.getenv("LOG_FORMAT", "text").lower() in ("text", "json") else LogFormat.TEXT)
|
| 169 |
max_history: int = field(default_factory=lambda: int(os.getenv("MAX_HISTORY", "50")))
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
build_patterns: list[str] = field(default_factory=lambda: [
|
| 172 |
"Dockerfile", "Dockerfile.*", "docker/*", "docker/**/*",
|
| 173 |
"src/**/*.py", "pyproject.toml", "uv.lock", "requirements*.txt", ".dockerignore",
|
|
@@ -400,6 +406,10 @@ class Notifier:
|
|
| 400 |
|
| 401 |
def notify(self, build: Build) -> None:
|
| 402 |
"""Send notifications for a completed build."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
if not self.should_notify(build.status):
|
| 404 |
return
|
| 405 |
|
|
@@ -415,6 +425,28 @@ class Notifier:
|
|
| 415 |
if self.config.discord_webhook_url:
|
| 416 |
self._send_discord(build)
|
| 417 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 418 |
def _send_webhook(self, build: Build) -> None:
|
| 419 |
url = build.config.callback_url or self.config.notification_url
|
| 420 |
if not url:
|
|
@@ -582,32 +614,90 @@ class KanikoBuilder:
|
|
| 582 |
valid_prefixes = ("ghp_", "gho_", "ghu_", "ghs_", "ghr_", "github_pat_")
|
| 583 |
return token.startswith(valid_prefixes)
|
| 584 |
|
| 585 |
-
def clone_repo(self, build_config: BuildConfig) -> Path:
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
# Prefer explicit token from request, fall back to config
|
| 588 |
token = build_config.github_token or self.config.github_token
|
| 589 |
-
|
| 590 |
-
use_auth = False
|
| 591 |
-
|
| 592 |
-
# Only use token if it looks valid
|
| 593 |
-
if token and self._is_valid_github_token(token) and "github.com" in repo_url:
|
| 594 |
-
repo_url = repo_url.replace("https://github.com", f"https://{token}@github.com")
|
| 595 |
-
use_auth = True
|
| 596 |
-
self.state.log(f"Cloning {build_config.repo_url} ({build_config.branch}) [authenticated]")
|
| 597 |
-
else:
|
| 598 |
-
if token and not self._is_valid_github_token(token):
|
| 599 |
-
self.state.log(f"Skipping invalid token format, trying public clone")
|
| 600 |
-
self.state.log(f"Cloning {build_config.repo_url} ({build_config.branch})")
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
|
| 612 |
def build_and_push(self, build: Build) -> bool:
|
| 613 |
# Set trace ID for this build
|
|
@@ -1842,19 +1932,47 @@ hivemind = HivemindClient(config, state, builder)
|
|
| 1842 |
# Startup
|
| 1843 |
# =============================================================================
|
| 1844 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1845 |
def startup():
|
| 1846 |
init_telemetry()
|
| 1847 |
|
| 1848 |
state.log(f"HF Builder starting ({config.runner_id})")
|
| 1849 |
state.log(f"Registry: {config.registry_url}")
|
| 1850 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1851 |
if config.registry_user:
|
| 1852 |
builder.setup_registry_auth()
|
| 1853 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1854 |
if config.slack_webhook_url:
|
| 1855 |
-
state.log("Slack notifications enabled")
|
| 1856 |
if config.discord_webhook_url:
|
| 1857 |
-
state.log("Discord notifications enabled")
|
| 1858 |
|
| 1859 |
# Hivemind integration
|
| 1860 |
if hivemind.enabled:
|
|
@@ -1862,6 +1980,9 @@ def startup():
|
|
| 1862 |
if hivemind.register():
|
| 1863 |
threading.Thread(target=hivemind.work_loop, daemon=True).start()
|
| 1864 |
|
|
|
|
|
|
|
|
|
|
| 1865 |
state.set_ready(True)
|
| 1866 |
state.log("Ready")
|
| 1867 |
|
|
|
|
| 168 |
log_format: LogFormat = field(default_factory=lambda: LogFormat(os.getenv("LOG_FORMAT", "text").lower()) if os.getenv("LOG_FORMAT", "text").lower() in ("text", "json") else LogFormat.TEXT)
|
| 169 |
max_history: int = field(default_factory=lambda: int(os.getenv("MAX_HISTORY", "50")))
|
| 170 |
|
| 171 |
+
# Auto-deploy: Restart HF Space after successful build
|
| 172 |
+
# Set to HF Space ID (e.g., "username/space-name") to auto-restart on success
|
| 173 |
+
auto_restart_space: str = field(default_factory=lambda: os.getenv("AUTO_RESTART_SPACE", ""))
|
| 174 |
+
# HF Token for restarting spaces (uses HF_TOKEN env var)
|
| 175 |
+
hf_token: str = field(default_factory=lambda: os.getenv("HF_TOKEN", ""))
|
| 176 |
+
|
| 177 |
build_patterns: list[str] = field(default_factory=lambda: [
|
| 178 |
"Dockerfile", "Dockerfile.*", "docker/*", "docker/**/*",
|
| 179 |
"src/**/*.py", "pyproject.toml", "uv.lock", "requirements*.txt", ".dockerignore",
|
|
|
|
| 406 |
|
| 407 |
def notify(self, build: Build) -> None:
|
| 408 |
"""Send notifications for a completed build."""
|
| 409 |
+
# Auto-restart HF Space on success
|
| 410 |
+
if build.status == BuildStatus.SUCCESS and self.config.auto_restart_space:
|
| 411 |
+
self._restart_hf_space(build)
|
| 412 |
+
|
| 413 |
if not self.should_notify(build.status):
|
| 414 |
return
|
| 415 |
|
|
|
|
| 425 |
if self.config.discord_webhook_url:
|
| 426 |
self._send_discord(build)
|
| 427 |
|
| 428 |
+
def _restart_hf_space(self, build: Build) -> None:
    """Ask the HuggingFace Hub to factory-reboot the configured Space.

    Best-effort: a missing token, a non-2xx response, or any exception
    raised while sending the request is logged at "warn" level and
    swallowed, so a failed restart never propagates to the caller.

    Args:
        build: The build that triggered the restart. Not read by this
            method.
    """
    space_id = self.config.auto_restart_space
    token = self.config.hf_token

    if not token:
        self.state.log(f"Cannot restart {space_id}: HF_TOKEN not set", level="warn")
        return

    url = f"https://huggingface.co/api/spaces/{space_id}/restart"
    headers = {"Authorization": f"Bearer {token}"}

    try:
        # huggingface_hub's `restart_space(..., factory_reboot=True)` sends
        # the flag as the `factory=true` query parameter, so send it that
        # way. The JSON body is kept only for backward compatibility with
        # what this method used to send.
        # NOTE(review): confirm against the Hub API docs and drop the JSON
        # body once verified.
        resp = http_requests.post(
            url,
            headers=headers,
            params={"factory": "true"},
            json={"factory_reboot": True},
            timeout=30,
        )
        if resp.ok:
            self.state.log(f"✓ Restarted HF Space: {space_id}")
        else:
            self.state.log(f"Failed to restart {space_id}: {resp.status_code}", level="warn")
    except Exception as e:
        # Deliberately broad: a restart failure must not fail the build.
        self.state.log(f"Failed to restart {space_id}: {e}", level="warn")
|
| 449 |
+
|
| 450 |
def _send_webhook(self, build: Build) -> None:
|
| 451 |
url = build.config.callback_url or self.config.notification_url
|
| 452 |
if not url:
|
|
|
|
| 614 |
valid_prefixes = ("ghp_", "gho_", "ghu_", "ghs_", "ghr_", "github_pat_")
|
| 615 |
return token.startswith(valid_prefixes)
|
| 616 |
|
| 617 |
+
def clone_repo(self, build_config: BuildConfig, max_retries: int = 3) -> Path:
    """Shallow-clone the requested branch into a fresh temp directory.

    Up to ``max_retries`` attempts are made. Every attempt gets its own
    ``tempfile.mkdtemp`` directory; the directory of a failed attempt is
    removed before sleeping ``attempt * 2`` seconds and trying again.

    Args:
        build_config: Supplies ``repo_url``, ``branch`` and an optional
            ``github_token`` (preferred over ``self.config.github_token``).
        max_retries: Maximum number of clone attempts.

    Returns:
        Path of the temp directory containing the clone.

    Raises:
        RuntimeError: If every attempt failed, or if ``max_retries`` is
            below 1 so no attempt was made. When a token is in use the
            message is passed through ``mask_token`` first.
    """
    import re
    from urllib.parse import urlsplit, urlunsplit

    import git

    # Prefer explicit token from request, fall back to config
    token = build_config.github_token or self.config.github_token
    base_repo_url = build_config.repo_url
    branch = build_config.branch

    # Decide once whether the token may be attached to the URL. The host is
    # compared exactly: a substring test would also match hosts such as
    # "github.com.evil.example" and hand the token to a third party.
    repo_url = base_repo_url
    auth_suffix = ""
    if token and self._is_valid_github_token(token):
        try:
            parts = urlsplit(base_repo_url)
            is_github_https = parts.scheme == "https" and parts.hostname == "github.com"
        except ValueError:
            # Malformed URL (e.g. bad IPv6 literal): clone without a token.
            is_github_https = False
        # Leave URLs that already carry userinfo untouched.
        if is_github_https and "@" not in parts.netloc:
            repo_url = urlunsplit(parts._replace(netloc=f"{token}@{parts.netloc}"))
            auth_suffix = " [authenticated]"
    elif token:
        self.state.log("Skipping invalid token format, trying public clone")

    # Branch names may contain "/" (e.g. "feature/x"); mkdtemp's prefix must
    # not contain path separators, so replace anything unusual with "_".
    dir_prefix = "build_" + re.sub(r"[^A-Za-z0-9._-]", "_", branch[:8]) + "_"

    for attempt in range(1, max_retries + 1):
        # Fresh temp directory each attempt
        target_dir = Path(tempfile.mkdtemp(prefix=dir_prefix))

        try:
            # Clean up any stale git locks in system temp
            self._cleanup_stale_git_locks()

            self.state.log(
                f"Cloning {base_repo_url} ({branch}){auth_suffix} (attempt {attempt}/{max_retries})"
            )
            git.Repo.clone_from(
                repo_url,
                target_dir,
                branch=branch,
                depth=1,
                single_branch=True,
                env={"GIT_TERMINAL_PROMPT": "0"},  # Prevent git from prompting
            )
            self.state.log(f"Cloned to {target_dir}")
            return target_dir

        except Exception as e:
            error_msg = mask_token(str(e), token) if token else str(e)
            self.state.log(f"Clone attempt {attempt} failed: {error_msg}", level="warn")

            # Clean up failed attempt (ignore_errors also covers "already gone")
            shutil.rmtree(target_dir, ignore_errors=True)

            if attempt >= max_retries:
                self.state.log(f"Clone failed after {max_retries} attempts: {error_msg}", level="error")
                # "from None" drops the original exception from the chain:
                # its text may contain the unmasked token-bearing URL.
                raise RuntimeError(f"Clone failed after {max_retries} attempts: {error_msg}") from None

            wait_time = attempt * 2  # 2s, 4s backoff
            self.state.log(f"Retrying in {wait_time}s...")
            time.sleep(wait_time)

    # Only reached when max_retries < 1 and the loop never ran.
    raise RuntimeError("Clone failed unexpectedly")
|
| 678 |
+
|
| 679 |
+
def _cleanup_stale_git_locks(self, max_age_seconds: float = 3600) -> None:
    """Remove leftover git lock files and old build directories from temp.

    Makes two passes over ``tempfile.gettempdir()``:

    1. Deletes every ``build_*/.git/index.lock`` file.
    2. Deletes every ``build_*`` entry whose mtime is more than
       ``max_age_seconds`` in the past.

    Best-effort: filesystem errors never propagate. A lock that cannot be
    removed is reported at "warn" level; one that has already disappeared
    is ignored.

    NOTE(review): neither pass checks whether a directory belongs to a
    build that is still running - confirm builds cannot overlap or outlive
    ``max_age_seconds``.

    Args:
        max_age_seconds: Age threshold for the second pass (default 1 hour).
    """
    import time

    # pathlib globbing keeps the temp path literal, so glob metacharacters
    # in the temp directory name cannot change what is matched.
    temp_base = Path(tempfile.gettempdir())

    # Materialize the matches first: entries are deleted while looping.
    for lock_file in list(temp_base.glob("build_*/.git/index.lock")):
        try:
            lock_file.unlink()
        except FileNotFoundError:
            continue  # removed by someone else in the meantime
        except OSError as exc:
            self.state.log(f"Could not remove lock {lock_file}: {exc}", level="warn")
            continue
        self.state.log(f"Removed stale lock: {lock_file}")

    # Also clean up old temp build directories
    cutoff = time.time() - max_age_seconds
    for build_dir in list(temp_base.glob("build_*")):
        try:
            is_old = build_dir.stat().st_mtime < cutoff
        except OSError:
            continue  # vanished or unreadable; nothing to clean
        if is_old:
            shutil.rmtree(build_dir, ignore_errors=True)
|
| 701 |
|
| 702 |
def build_and_push(self, build: Build) -> bool:
|
| 703 |
# Set trace ID for this build
|
|
|
|
| 1932 |
# Startup
|
| 1933 |
# =============================================================================
|
| 1934 |
|
| 1935 |
+
def check_git_available() -> bool:
    """Report whether GitPython can be imported and can run ``git version``.

    Returns:
        True when both the import and the version call succeed. On any
        exception the failure is logged at "error" level and False is
        returned.
    """
    try:
        import git

        # Asking for the version exercises the underlying git executable.
        git.Git().version()
    except Exception as exc:
        state.log(f"Git check failed: {exc}", level="error")
        return False
    return True
|
| 1945 |
+
|
| 1946 |
+
|
| 1947 |
def startup():
|
| 1948 |
init_telemetry()
|
| 1949 |
|
| 1950 |
state.log(f"HF Builder starting ({config.runner_id})")
|
| 1951 |
state.log(f"Registry: {config.registry_url}")
|
| 1952 |
|
| 1953 |
+
# Check git is available
|
| 1954 |
+
if not check_git_available():
|
| 1955 |
+
state.log("WARNING: git not available - clones may fail", level="error")
|
| 1956 |
+
else:
|
| 1957 |
+
state.log("Git: OK")
|
| 1958 |
+
|
| 1959 |
+
# Setup registry auth
|
| 1960 |
if config.registry_user:
|
| 1961 |
builder.setup_registry_auth()
|
| 1962 |
+
else:
|
| 1963 |
+
state.log("WARNING: No registry credentials - pushes will fail", level="warn")
|
| 1964 |
+
|
| 1965 |
+
# Log configuration
|
| 1966 |
+
if config.default_image:
|
| 1967 |
+
state.log(f"Default image: {config.default_image}")
|
| 1968 |
+
if config.github_token:
|
| 1969 |
+
state.log(f"GitHub token: configured ({len(config.github_token)} chars)")
|
| 1970 |
+
if config.auto_restart_space:
|
| 1971 |
+
state.log(f"Auto-restart: {config.auto_restart_space}")
|
| 1972 |
if config.slack_webhook_url:
|
| 1973 |
+
state.log("Slack notifications: enabled")
|
| 1974 |
if config.discord_webhook_url:
|
| 1975 |
+
state.log("Discord notifications: enabled")
|
| 1976 |
|
| 1977 |
# Hivemind integration
|
| 1978 |
if hivemind.enabled:
|
|
|
|
| 1980 |
if hivemind.register():
|
| 1981 |
threading.Thread(target=hivemind.work_loop, daemon=True).start()
|
| 1982 |
|
| 1983 |
+
# Cleanup any stale build directories from previous runs
|
| 1984 |
+
builder._cleanup_stale_git_locks()
|
| 1985 |
+
|
| 1986 |
state.set_ready(True)
|
| 1987 |
state.log("Ready")
|
| 1988 |
|