jonathanagustin committed on
Commit
402b523
·
verified ·
1 Parent(s): 0d5b52a

Deploy builder with retry logic and auto-restart

Browse files
Files changed (1) hide show
  1. app.py +147 -26
app.py CHANGED
@@ -168,6 +168,12 @@ class Config:
168
  log_format: LogFormat = field(default_factory=lambda: LogFormat(os.getenv("LOG_FORMAT", "text").lower()) if os.getenv("LOG_FORMAT", "text").lower() in ("text", "json") else LogFormat.TEXT)
169
  max_history: int = field(default_factory=lambda: int(os.getenv("MAX_HISTORY", "50")))
170
 
 
 
 
 
 
 
171
  build_patterns: list[str] = field(default_factory=lambda: [
172
  "Dockerfile", "Dockerfile.*", "docker/*", "docker/**/*",
173
  "src/**/*.py", "pyproject.toml", "uv.lock", "requirements*.txt", ".dockerignore",
@@ -400,6 +406,10 @@ class Notifier:
400
 
401
  def notify(self, build: Build) -> None:
402
  """Send notifications for a completed build."""
 
 
 
 
403
  if not self.should_notify(build.status):
404
  return
405
 
@@ -415,6 +425,28 @@ class Notifier:
415
  if self.config.discord_webhook_url:
416
  self._send_discord(build)
417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  def _send_webhook(self, build: Build) -> None:
419
  url = build.config.callback_url or self.config.notification_url
420
  if not url:
@@ -582,32 +614,90 @@ class KanikoBuilder:
582
  valid_prefixes = ("ghp_", "gho_", "ghu_", "ghs_", "ghr_", "github_pat_")
583
  return token.startswith(valid_prefixes)
584
 
585
- def clone_repo(self, build_config: BuildConfig) -> Path:
586
- target_dir = Path(tempfile.mkdtemp())
 
 
 
 
 
587
  # Prefer explicit token from request, fall back to config
588
  token = build_config.github_token or self.config.github_token
589
- repo_url = build_config.repo_url
590
- use_auth = False
591
-
592
- # Only use token if it looks valid
593
- if token and self._is_valid_github_token(token) and "github.com" in repo_url:
594
- repo_url = repo_url.replace("https://github.com", f"https://{token}@github.com")
595
- use_auth = True
596
- self.state.log(f"Cloning {build_config.repo_url} ({build_config.branch}) [authenticated]")
597
- else:
598
- if token and not self._is_valid_github_token(token):
599
- self.state.log(f"Skipping invalid token format, trying public clone")
600
- self.state.log(f"Cloning {build_config.repo_url} ({build_config.branch})")
601
 
602
- try:
603
- import git
604
- git.Repo.clone_from(repo_url, target_dir, branch=build_config.branch, depth=1, single_branch=True)
605
- self.state.log(f"Cloned to {target_dir}")
606
- return target_dir
607
- except Exception as e:
608
- error_msg = mask_token(str(e), token) if token else str(e)
609
- self.state.log(f"Clone failed: {error_msg}", level="error")
610
- raise RuntimeError(f"Clone failed: {error_msg}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
 
612
  def build_and_push(self, build: Build) -> bool:
613
  # Set trace ID for this build
@@ -1842,19 +1932,47 @@ hivemind = HivemindClient(config, state, builder)
1842
  # Startup
1843
  # =============================================================================
1844
 
 
 
 
 
 
 
 
 
 
 
 
 
1845
  def startup():
1846
  init_telemetry()
1847
 
1848
  state.log(f"HF Builder starting ({config.runner_id})")
1849
  state.log(f"Registry: {config.registry_url}")
1850
 
 
 
 
 
 
 
 
1851
  if config.registry_user:
1852
  builder.setup_registry_auth()
1853
-
 
 
 
 
 
 
 
 
 
1854
  if config.slack_webhook_url:
1855
- state.log("Slack notifications enabled")
1856
  if config.discord_webhook_url:
1857
- state.log("Discord notifications enabled")
1858
 
1859
  # Hivemind integration
1860
  if hivemind.enabled:
@@ -1862,6 +1980,9 @@ def startup():
1862
  if hivemind.register():
1863
  threading.Thread(target=hivemind.work_loop, daemon=True).start()
1864
 
 
 
 
1865
  state.set_ready(True)
1866
  state.log("Ready")
1867
 
 
168
  log_format: LogFormat = field(default_factory=lambda: LogFormat(os.getenv("LOG_FORMAT", "text").lower()) if os.getenv("LOG_FORMAT", "text").lower() in ("text", "json") else LogFormat.TEXT)
169
  max_history: int = field(default_factory=lambda: int(os.getenv("MAX_HISTORY", "50")))
170
 
171
+ # Auto-deploy: Restart HF Space after successful build
172
+ # Set to HF Space ID (e.g., "username/space-name") to auto-restart on success
173
+ auto_restart_space: str = field(default_factory=lambda: os.getenv("AUTO_RESTART_SPACE", ""))
174
+ # HF Token for restarting spaces (uses HF_TOKEN env var)
175
+ hf_token: str = field(default_factory=lambda: os.getenv("HF_TOKEN", ""))
176
+
177
  build_patterns: list[str] = field(default_factory=lambda: [
178
  "Dockerfile", "Dockerfile.*", "docker/*", "docker/**/*",
179
  "src/**/*.py", "pyproject.toml", "uv.lock", "requirements*.txt", ".dockerignore",
 
406
 
407
  def notify(self, build: Build) -> None:
408
  """Send notifications for a completed build."""
409
+ # Auto-restart HF Space on success
410
+ if build.status == BuildStatus.SUCCESS and self.config.auto_restart_space:
411
+ self._restart_hf_space(build)
412
+
413
  if not self.should_notify(build.status):
414
  return
415
 
 
425
  if self.config.discord_webhook_url:
426
  self._send_discord(build)
427
 
428
def _restart_hf_space(self, build: Build) -> None:
    """Request a factory reboot of the configured HuggingFace Space.

    Best-effort: every failure is logged at ``warn`` level and swallowed so
    that a restart problem never propagates to the caller.

    Args:
        build: The build that triggered the restart. It is not read here;
            the parameter is kept so the call signature stays unchanged.
    """
    space_id = self.config.auto_restart_space
    token = self.config.hf_token

    if not token:
        self.state.log(f"Cannot restart {space_id}: HF_TOKEN not set", level="warn")
        return

    url = f"https://huggingface.co/api/spaces/{space_id}/restart"
    headers = {"Authorization": f"Bearer {token}"}

    try:
        # The Hub restart endpoint takes the factory-reboot flag as the
        # `factory` query parameter (this is what huggingface_hub's
        # `HfApi.restart_space(factory_reboot=True)` sends). A JSON body key
        # is not part of that contract, so the flag goes in `params`.
        resp = http_requests.post(
            url,
            headers=headers,
            params={"factory": "true"},
            timeout=30,
        )
    except Exception as e:
        # Deliberately broad: this is a best-effort side action and must not
        # fail the surrounding notification flow.
        self.state.log(f"Failed to restart {space_id}: {e}", level="warn")
        return

    if resp.ok:
        self.state.log(f"✓ Restarted HF Space: {space_id}")
    else:
        self.state.log(f"Failed to restart {space_id}: {resp.status_code}", level="warn")
449
+
450
  def _send_webhook(self, build: Build) -> None:
451
  url = build.config.callback_url or self.config.notification_url
452
  if not url:
 
614
  valid_prefixes = ("ghp_", "gho_", "ghu_", "ghs_", "ghr_", "github_pat_")
615
  return token.startswith(valid_prefixes)
616
 
617
def clone_repo(self, build_config: BuildConfig, max_retries: int = 3) -> Path:
    """Shallow-clone the build's repository into a fresh temp directory.

    Each attempt gets its own ``build_*`` temp directory; a failed attempt's
    directory is removed before the next try. Between attempts the method
    sleeps ``attempt * 2`` seconds.

    Args:
        build_config: Supplies ``repo_url``, ``branch`` and an optional
            ``github_token`` (falls back to ``self.config.github_token``).
        max_retries: Total number of attempts. Values below 1 are treated
            as 1 so that at least one clone is always tried.

    Returns:
        Path of the directory containing the cloned working tree.

    Raises:
        RuntimeError: When every attempt failed. The message carries the
            last error text after it was passed through ``mask_token``.
    """
    import re
    from urllib.parse import urlsplit

    import git

    # Prefer explicit token from request, fall back to config
    token = build_config.github_token or self.config.github_token
    base_repo_url = build_config.repo_url
    max_retries = max(1, max_retries)

    # Branch names commonly contain "/" (e.g. "feature/x"). Used verbatim in
    # a mkdtemp prefix that would point into a non-existent sub-directory and
    # raise FileNotFoundError, so reduce the slug to filename-safe characters.
    branch_slug = re.sub(r"[^A-Za-z0-9._-]", "_", build_config.branch[:8])

    # Work out the URL to clone once; it is the same for every attempt.
    repo_url = base_repo_url
    auth_tag = ""
    if token:
        if not self._is_valid_github_token(token):
            self.state.log("Skipping invalid token format, trying public clone")
        else:
            try:
                parts = urlsplit(base_repo_url)
            except ValueError:
                parts = None  # malformed URL: leave it for git to reject
            # Only hand the token to an https URL whose host is exactly
            # github.com. A substring test would also match hosts such as
            # "github.com.evil.example" and leak the token to them.
            if (
                parts is not None
                and parts.scheme == "https"
                and parts.hostname == "github.com"
                and parts.username is None
            ):
                repo_url = parts._replace(netloc=f"{token}@{parts.netloc}").geturl()
                auth_tag = " [authenticated]"

    for attempt in range(1, max_retries + 1):
        # Fresh temp directory each attempt
        target_dir = Path(tempfile.mkdtemp(prefix=f"build_{branch_slug}_"))

        try:
            # Clean up any stale git locks in system temp
            self._cleanup_stale_git_locks()

            self.state.log(
                f"Cloning {base_repo_url} ({build_config.branch}){auth_tag} "
                f"(attempt {attempt}/{max_retries})"
            )

            git.Repo.clone_from(
                repo_url,
                target_dir,
                branch=build_config.branch,
                depth=1,
                single_branch=True,
                env={"GIT_TERMINAL_PROMPT": "0"},  # Prevent git from prompting
            )
            self.state.log(f"Cloned to {target_dir}")
            return target_dir

        except Exception as e:
            # The error text may embed the authenticated URL, so it goes
            # through mask_token before being logged or re-raised.
            error_msg = mask_token(str(e), token) if token else str(e)
            self.state.log(f"Clone attempt {attempt} failed: {error_msg}", level="warn")

            # Clean up failed attempt
            shutil.rmtree(target_dir, ignore_errors=True)

            if attempt >= max_retries:
                self.state.log(
                    f"Clone failed after {max_retries} attempts: {error_msg}",
                    level="error",
                )
                raise RuntimeError(
                    f"Clone failed after {max_retries} attempts: {error_msg}"
                ) from e

            wait_time = attempt * 2  # linear backoff: 2s, 4s, ...
            self.state.log(f"Retrying in {wait_time}s...")
            time.sleep(wait_time)

    # Unreachable: the loop either returns or raises on its last attempt.
    raise RuntimeError("Clone failed unexpectedly")
678
+
679
def _cleanup_stale_git_locks(self, temp_base: "str | None" = None, max_age_seconds: float = 3600) -> None:
    """Remove leftover git lock files and old ``build_*`` temp directories.

    Two best-effort passes over ``temp_base``:

    1. Delete every ``build_*/.git/index.lock`` file and log each removal.
    2. Delete every ``build_*`` entry whose modification time is more than
       ``max_age_seconds`` in the past.

    Filesystem errors (entry already gone, permission denied, ...) are
    skipped so cleanup never interrupts the caller.

    Args:
        temp_base: Directory to scan. Defaults to ``tempfile.gettempdir()``.
        max_age_seconds: Age threshold for removing ``build_*`` entries.
            Defaults to one hour.
    """
    import glob
    import time as time_module

    if temp_base is None:
        temp_base = tempfile.gettempdir()
    # Escape the base path so glob metacharacters in it ("[", "*", "?") are
    # matched literally instead of being interpreted as a pattern.
    base_pattern = glob.escape(temp_base)

    # Clean up any stale .git lock files in temp build directories
    for lock_file in glob.glob(f"{base_pattern}/build_*/.git/index.lock"):
        try:
            os.remove(lock_file)
        except OSError:
            continue  # already removed or not removable; nothing to report
        self.state.log(f"Removed stale lock: {lock_file}")

    # Also clean up old temp build directories
    now = time_module.time()
    for dir_path in glob.glob(f"{base_pattern}/build_*"):
        try:
            age = now - os.path.getmtime(dir_path)
        except OSError:
            continue  # entry vanished between listing and stat
        if age > max_age_seconds:
            shutil.rmtree(dir_path, ignore_errors=True)
701
 
702
  def build_and_push(self, build: Build) -> bool:
703
  # Set trace ID for this build
 
1932
  # Startup
1933
  # =============================================================================
1934
 
1935
def check_git_available() -> bool:
    """Report whether GitPython can be imported and can run ``git version``.

    Returns:
        True when the probe succeeds. On any exception the error is logged
        via ``state.log`` at ``error`` level and False is returned.
    """
    try:
        import git

        # Asking for the version is a cheap probe that the git command runs.
        git.Git().version()
    except Exception as e:
        state.log(f"Git check failed: {e}", level="error")
        return False
    return True
1945
+
1946
+
1947
  def startup():
1948
  init_telemetry()
1949
 
1950
  state.log(f"HF Builder starting ({config.runner_id})")
1951
  state.log(f"Registry: {config.registry_url}")
1952
 
1953
+ # Check git is available
1954
+ if not check_git_available():
1955
+ state.log("WARNING: git not available - clones may fail", level="error")
1956
+ else:
1957
+ state.log("Git: OK")
1958
+
1959
+ # Setup registry auth
1960
  if config.registry_user:
1961
  builder.setup_registry_auth()
1962
+ else:
1963
+ state.log("WARNING: No registry credentials - pushes will fail", level="warn")
1964
+
1965
+ # Log configuration
1966
+ if config.default_image:
1967
+ state.log(f"Default image: {config.default_image}")
1968
+ if config.github_token:
1969
+ state.log(f"GitHub token: configured ({len(config.github_token)} chars)")
1970
+ if config.auto_restart_space:
1971
+ state.log(f"Auto-restart: {config.auto_restart_space}")
1972
  if config.slack_webhook_url:
1973
+ state.log("Slack notifications: enabled")
1974
  if config.discord_webhook_url:
1975
+ state.log("Discord notifications: enabled")
1976
 
1977
  # Hivemind integration
1978
  if hivemind.enabled:
 
1980
  if hivemind.register():
1981
  threading.Thread(target=hivemind.work_loop, daemon=True).start()
1982
 
1983
+ # Cleanup any stale build directories from previous runs
1984
+ builder._cleanup_stale_git_locks()
1985
+
1986
  state.set_ready(True)
1987
  state.log("Ready")
1988