| |
| """ |
| Inicia o servidor TensorBoard sobre o diretorio de logs do treino (Hugging Face Trainer). |
| |
| Uso tipico apos treino local (o script escolhe logs/ ou results/ com eventos): |
| python scripts/launch_tensorboard.py |
| |
| Ou fixar o diretorio: |
| python scripts/launch_tensorboard.py --logdir ./results |
| |
| Equivalente a: tensorboard --logdir=... --host 127.0.0.1 --port 6006 |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import socket |
| import subprocess |
| import sys |
| import threading |
| import time |
| from pathlib import Path |
|
|
|
|
| def _has_tfevents(root: Path) -> bool: |
| if not root.is_dir(): |
| return False |
| for path in root.rglob("*"): |
| if path.is_file() and path.name.startswith("events.out.tfevents"): |
| return True |
| return False |
|
|
|
|
| def _pick_logdir(cwd: Path) -> Path: |
| """Prefere ./logs ou ./results quando contem ficheiros events.out.tfevents*.""" |
| candidates = [cwd / "logs", cwd / "results"] |
| for directory in candidates: |
| if directory.is_dir() and _has_tfevents(directory): |
| return directory |
| for directory in candidates: |
| if directory.is_dir(): |
| return directory |
| return cwd / "results" |
|
|
|
|
| def _can_bind(host: str, port: int) -> bool: |
| bind_host = "127.0.0.1" if host in ("127.0.0.1", "localhost") else host |
| with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: |
| s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) |
| try: |
| s.bind((bind_host, port)) |
| return True |
| except OSError: |
| return False |
|
|
|
|
| def _resolve_port(host: str, preferred: int, max_attempts: int = 10) -> int: |
| for offset in range(max_attempts): |
| port = preferred + offset |
| if _can_bind(host, port): |
| return port |
| return preferred |
|
|
|
|
| def _tensorboard_module_cmd(logdir: Path, host: str, port: int, reload_interval: int) -> list[str]: |
| return [ |
| sys.executable, |
| "-m", |
| "tensorboard.main", |
| "--logdir", |
| str(logdir.resolve()), |
| "--host", |
| host, |
| "--port", |
| str(port), |
| "--reload_interval", |
| str(reload_interval), |
| ] |
|
|
|
|
| def _stderr_bind_failure(text: str) -> bool: |
| low = text.lower() |
| return "could not bind" in low or "already in use" in low |
|
|
|
|
| def _drain_stderr(stream: object) -> None: |
| try: |
| for line in stream: |
| sys.stderr.write(line) |
| except Exception: |
| pass |
|
|
|
|
| def _run_tensorboard_process( |
| cmd: list[str], |
| host: str, |
| port: int, |
| quick_fail_seconds: float = 1.25, |
| ) -> tuple[int, bool]: |
| """ |
| Devolve (codigo_saida, tentar_proxima_porta). tentar_proxima_porta e True quando a |
| falha parece ser bind/porta (corrida com _can_bind ou outro processo). |
| """ |
| proc = subprocess.Popen( |
| cmd, |
| stdout=subprocess.DEVNULL, |
| stderr=subprocess.PIPE, |
| text=True, |
| ) |
| assert proc.stderr is not None |
| deadline = time.monotonic() + quick_fail_seconds |
| while time.monotonic() < deadline: |
| if proc.poll() is not None: |
| err = proc.stderr.read() |
| code = proc.returncode if proc.returncode is not None else 1 |
| retry_port = code != 0 and _stderr_bind_failure(err) |
| if err: |
| sys.stderr.write(err) |
| return code, retry_port |
| time.sleep(0.05) |
|
|
| threading.Thread(target=_drain_stderr, args=(proc.stderr,), daemon=True).start() |
| print(f"Abre no browser: http://{host}:{port}/") |
| try: |
| return proc.wait(), False |
| except KeyboardInterrupt: |
| proc.terminate() |
| try: |
| proc.wait(timeout=8) |
| except subprocess.TimeoutExpired: |
| proc.kill() |
| return 130, False |
|
|
|
|
| def _start_tensorboard_with_port_fallback( |
| logdir: Path, |
| host: str, |
| preferred_port: int, |
| reload_interval: int, |
| max_attempts: int = 10, |
| ) -> int: |
| preferred = preferred_port |
| if not _can_bind(host, preferred): |
| alt = _resolve_port(host, preferred) |
| if alt != preferred: |
| print( |
| f"Porta {preferred} ocupada (ex.: outro TensorBoard ou Docker). A usar {alt}.", |
| file=sys.stderr, |
| ) |
| preferred = alt |
|
|
| for offset in range(max_attempts): |
| port = preferred + offset |
| if offset > 0 and not _can_bind(host, port): |
| continue |
| cmd = _tensorboard_module_cmd( |
| logdir=logdir, |
| host=host, |
| port=port, |
| reload_interval=reload_interval, |
| ) |
| print(f"Iniciando TensorBoard: {' '.join(cmd)}") |
| rc, retry_port = _run_tensorboard_process(cmd, host=host, port=port) |
| if rc == 0: |
| return 0 |
| if retry_port and offset + 1 < max_attempts: |
| print( |
| f"Porta {port} indisponivel ao iniciar. A tentar {port + 1}...", |
| file=sys.stderr, |
| ) |
| continue |
| return rc |
|
|
| print("Erro: nao foi possivel abrir o TensorBoard em nenhuma porta tentada.", file=sys.stderr) |
| return 1 |
|
|
|
|
def main() -> int:
    """Parse the command line, locate the log directory and launch TensorBoard.

    When ``--logdir`` is omitted the directory is chosen by ``_pick_logdir``
    relative to the current working directory.

    Returns:
        1 when the log directory is missing or is not a directory, or when the
        ``tensorboard`` package cannot be imported; otherwise the value
        returned by ``_start_tensorboard_with_port_fallback``.
    """
    parser = argparse.ArgumentParser(
        description="Abre TensorBoard nos logs gerados pelo train.py (report_to=tensorboard).",
    )
    parser.add_argument(
        "--logdir",
        type=Path,
        default=None,
        help=(
            "Diretorio com events.out.tfevents.* "
            "(default: auto procura em ./logs e ./results)."
        ),
    )
    parser.add_argument(
        "--host",
        default="127.0.0.1",
        help="Interface de escuta (default: 127.0.0.1).",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=6006,
        help="Porta HTTP preferida (default: 6006). Se estiver ocupada, usa a seguinte livre.",
    )
    parser.add_argument(
        "--reload_interval",
        type=int,
        default=5,
        help="Segundos entre recargas ao detetar novos eventos (default: 5).",
    )
    args = parser.parse_args()

    cwd = Path.cwd()
    logdir = args.logdir if args.logdir is not None else _pick_logdir(cwd)
    logdir = logdir.resolve()

    if args.logdir is None:
        print(f"TensorBoard --logdir (auto): {logdir}", file=sys.stderr)

    if not logdir.exists():
        print(f"Erro: diretorio nao existe: {logdir}", file=sys.stderr)
        return 1

    # A regular file passes exists() but cannot be used as a log directory.
    if not logdir.is_dir():
        print(f"Erro: nao e um diretorio: {logdir}", file=sys.stderr)
        return 1

    if not _has_tfevents(logdir):
        print(
            "Aviso: nenhum ficheiro events.out.tfevents* encontrado sob este diretorio. "
            "O TensorBoard pode ficar sem escalares. Se descarregou os logs do servidor, "
            "use --logdir com a pasta onde estao os eventos (ex.: ./logs).",
            file=sys.stderr,
        )

    # Imported only to fail early, with a readable hint, when the package is absent.
    try:
        import tensorboard  # noqa: F401
    except ImportError:
        print(
            "Erro: pacote 'tensorboard' nao instalado. Execute: pip install tensorboard",
            file=sys.stderr,
        )
        return 1

    return _start_tensorboard_with_port_fallback(
        logdir=logdir,
        host=args.host,
        preferred_port=args.port,
        reload_interval=args.reload_interval,
    )
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|
|