#!/usr/bin/env python3
"""Start a TensorBoard server over the training log directory (Hugging Face Trainer).

Typical use after a local training run (the script picks ``logs/`` or
``results/``, preferring whichever one contains event files):

    python scripts/launch_tensorboard.py

Or pin the directory explicitly:

    python scripts/launch_tensorboard.py --logdir ./results

Equivalent to:

    tensorboard --logdir=... --host 127.0.0.1 --port 6006
"""

from __future__ import annotations

import argparse
import socket
import subprocess
import sys
import threading
import time
from collections import deque
from pathlib import Path
from typing import IO

# File-name prefix of the event files that _has_tfevents looks for.
_TFEVENTS_PREFIX = "events.out.tfevents"

# Largest valid TCP port; socket.bind() raises OverflowError above it.
_MAX_PORT = 65535

# Number of trailing stderr lines kept to classify a late start-up failure.
_STDERR_TAIL_LINES = 50


def _has_tfevents(root: Path) -> bool:
    """Return True if *root* is a directory containing an event file at any depth.

    An event file is a regular file whose name starts with
    ``events.out.tfevents``. A missing or non-directory *root* yields False.
    """
    if not root.is_dir():
        return False
    return any(
        entry.is_file() and entry.name.startswith(_TFEVENTS_PREFIX)
        for entry in root.rglob("*")
    )


def _pick_logdir(cwd: Path) -> Path:
    """Choose the log directory under *cwd*.

    Preference order:
      1. ``logs`` or ``results`` (in that order) when it contains event files;
      2. ``logs`` or ``results`` (in that order) when it merely exists;
      3. ``cwd / "results"`` even if it does not exist (the caller reports it).
    """
    candidates = [cwd / "logs", cwd / "results"]
    existing = [directory for directory in candidates if directory.is_dir()]
    for directory in existing:
        if _has_tfevents(directory):
            return directory
    if existing:
        return existing[0]
    return cwd / "results"


def _can_bind(host: str, port: int) -> bool:
    """Return True if a TCP socket can currently be bound to ``(host, port)``.

    This is only a probe: the socket is closed immediately, so another process
    may still grab the port before TensorBoard starts.

    Ports outside 0..65535 are reported as unavailable instead of letting
    ``socket.bind`` raise ``OverflowError``.
    """
    if not 0 <= port <= _MAX_PORT:
        return False

    bind_host = "127.0.0.1" if host in ("127.0.0.1", "localhost") else host
    # An IPv6 literal (e.g. "::" or "::1") cannot be bound on an AF_INET socket.
    family = socket.AF_INET6 if ":" in bind_host else socket.AF_INET
    try:
        with socket.socket(family, socket.SOCK_STREAM) as probe:
            # On Windows SO_REUSEADDR lets a socket bind to a port that is
            # already in use, which would make this probe always succeed.
            if sys.platform != "win32":
                probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            probe.bind((bind_host, port))
    except OSError:
        return False
    return True


def _resolve_port(host: str, preferred: int, max_attempts: int = 10) -> int:
    """Return the first bindable port in ``preferred .. preferred + max_attempts - 1``.

    Falls back to *preferred* itself when none of the candidates can be bound.
    """
    for offset in range(max_attempts):
        port = preferred + offset
        if _can_bind(host, port):
            return port
    return preferred


def _tensorboard_module_cmd(
    logdir: Path, host: str, port: int, reload_interval: int
) -> list[str]:
    """Build the argv that runs ``tensorboard.main`` with the current interpreter."""
    return [
        sys.executable,
        "-m",
        "tensorboard.main",
        "--logdir",
        str(logdir.resolve()),
        "--host",
        host,
        "--port",
        str(port),
        "--reload_interval",
        str(reload_interval),
    ]


def _stderr_bind_failure(text: str) -> bool:
    """Return True if *text* contains a bind/port-in-use message (case-insensitive)."""
    low = text.lower()
    return "could not bind" in low or "already in use" in low


def _drain_stderr(stream: IO[str], sink: deque[str] | None = None) -> None:
    """Forward every line of *stream* to ``sys.stderr`` until EOF.

    When *sink* is given, each line is also appended to it so the caller can
    inspect the output after the child exits.

    Forwarding is best effort: I/O errors and reads on a closed stream end the
    loop quietly instead of raising in the background thread.
    """
    try:
        for line in stream:
            sys.stderr.write(line)
            if sink is not None:
                sink.append(line)
    except (OSError, ValueError):
        # The pipe was closed or became unreadable; nothing left to forward.
        return


def _run_tensorboard_process(
    cmd: list[str],
    host: str,
    port: int,
    quick_fail_seconds: float = 1.25,
) -> tuple[int, bool]:
    """Run *cmd* and wait for it to finish.

    Returns ``(exit_code, try_next_port)``. ``try_next_port`` is True when the
    process failed and its stderr looks like a bind/port failure (a race with
    ``_can_bind`` or another process), whether the failure happened inside the
    quick-fail window or later during start-up.

    Ctrl-C at any point terminates the child and returns ``(130, False)``.
    """
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
    )
    if proc.stderr is None:
        # Cannot happen with stderr=PIPE; kept as an explicit check because an
        # assert would be stripped under ``python -O``.
        raise RuntimeError("subprocess stderr pipe was not created")

    try:
        # Phase 1: poll briefly so an immediate failure can be read in full.
        deadline = time.monotonic() + quick_fail_seconds
        while time.monotonic() < deadline:
            if proc.poll() is not None:
                err = proc.stderr.read()
                code = proc.returncode
                if err:
                    sys.stderr.write(err)
                return code, code != 0 and _stderr_bind_failure(err)
            time.sleep(0.05)

        # Phase 2: the process is still running. Forward stderr in the
        # background and keep its tail so a late bind failure is recognised.
        tail: deque[str] = deque(maxlen=_STDERR_TAIL_LINES)
        drainer = threading.Thread(
            target=_drain_stderr, args=(proc.stderr, tail), daemon=True
        )
        drainer.start()
        print(f"Abre no browser: http://{host}:{port}/")
        code = proc.wait()
        # Let the drainer reach EOF so the tail is complete; bounded in case
        # some other process still holds the write end of the pipe.
        drainer.join(timeout=2.0)
        return code, code != 0 and _stderr_bind_failure("".join(tail))
    except KeyboardInterrupt:
        proc.terminate()
        try:
            proc.wait(timeout=8)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()  # reap the killed child
        return 130, False


def _start_tensorboard_with_port_fallback(
    logdir: Path,
    host: str,
    preferred_port: int,
    reload_interval: int,
    max_attempts: int = 10,
) -> int:
    """Start TensorBoard, moving to higher ports when the preferred one is taken.

    The preferred port is probed first; if it is busy, the first bindable port
    within *max_attempts* becomes the new starting point. Up to *max_attempts*
    consecutive ports are then tried, advancing only when the failure looks
    like a bind/port problem.

    Returns the process exit code, or 1 when no port could be used.
    """
    preferred = preferred_port
    if not _can_bind(host, preferred):
        alt = _resolve_port(host, preferred, max_attempts)
        if alt != preferred:
            print(
                f"Porta {preferred} ocupada (ex.: outro TensorBoard ou Docker). A usar {alt}.",
                file=sys.stderr,
            )
            preferred = alt

    for offset in range(max_attempts):
        port = preferred + offset
        # The first candidate is always attempted, even if the probe failed,
        # so TensorBoard itself reports the error.
        if offset > 0 and not _can_bind(host, port):
            continue
        cmd = _tensorboard_module_cmd(
            logdir=logdir,
            host=host,
            port=port,
            reload_interval=reload_interval,
        )
        print(f"Iniciando TensorBoard: {' '.join(cmd)}")
        rc, retry_port = _run_tensorboard_process(cmd, host=host, port=port)
        if rc == 0:
            return 0
        if retry_port and offset + 1 < max_attempts:
            print(
                f"Porta {port} indisponivel ao iniciar. A tentar {port + 1}...",
                file=sys.stderr,
            )
            continue
        return rc

    print(
        "Erro: nao foi possivel abrir o TensorBoard em nenhuma porta tentada.",
        file=sys.stderr,
    )
    return 1


def main(argv: list[str] | None = None) -> int:
    """Parse the command line, validate the log directory and launch TensorBoard.

    Args:
        argv: Argument list without the program name. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        The process exit code: 1 when the log directory does not exist or the
        ``tensorboard`` package is missing, otherwise the launcher's result.
    """
    parser = argparse.ArgumentParser(
        description="Abre TensorBoard nos logs gerados pelo train.py (report_to=tensorboard).",
    )
    parser.add_argument(
        "--logdir",
        type=Path,
        default=None,
        help=(
            "Diretorio com events.out.tfevents.* "
            "(default: auto procura em ./logs e ./results)."
        ),
    )
    parser.add_argument(
        "--host",
        default="127.0.0.1",
        help="Interface de escuta (default: 127.0.0.1).",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=6006,
        help="Porta HTTP preferida (default: 6006). Se estiver ocupada, usa a seguinte livre.",
    )
    parser.add_argument(
        "--reload_interval",
        type=int,
        default=5,
        help="Segundos entre recargas ao detetar novos eventos (default: 5).",
    )
    args = parser.parse_args(argv)

    cwd = Path.cwd()
    logdir = args.logdir if args.logdir is not None else _pick_logdir(cwd)
    logdir = logdir.resolve()
    if args.logdir is None:
        print(f"TensorBoard --logdir (auto): {logdir}", file=sys.stderr)

    if not logdir.exists():
        print(f"Erro: diretorio nao existe: {logdir}", file=sys.stderr)
        return 1

    if not _has_tfevents(logdir):
        # Only a warning: TensorBoard still starts on a directory without events.
        print(
            "Aviso: nenhum ficheiro events.out.tfevents* encontrado sob este diretorio. "
            "O TensorBoard pode ficar sem escalares. Se descarregou os logs do servidor, "
            "use --logdir com a pasta onde estao os eventos (ex.: ./logs).",
            file=sys.stderr,
        )

    # Check the package up front so the user gets an install hint rather than
    # a traceback from the child process.
    try:
        import tensorboard  # noqa: F401
    except ImportError:
        print(
            "Erro: pacote 'tensorboard' nao instalado. Execute: pip install tensorboard",
            file=sys.stderr,
        )
        return 1

    return _start_tensorboard_with_port_fallback(
        logdir=logdir,
        host=args.host,
        preferred_port=args.port,
        reload_interval=args.reload_interval,
    )


if __name__ == "__main__":
    raise SystemExit(main())