File size: 7,198 Bytes
d4a00b2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 | #!/usr/bin/env python3
"""
Inicia o servidor TensorBoard sobre o diretorio de logs do treino (Hugging Face Trainer).
Uso tipico apos treino local (o script escolhe logs/ ou results/ com eventos):
python scripts/launch_tensorboard.py
Ou fixar o diretorio:
python scripts/launch_tensorboard.py --logdir ./results
Equivalente a: tensorboard --logdir=... --host 127.0.0.1 --port 6006
"""
from __future__ import annotations
import argparse
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path
def _has_tfevents(root: Path) -> bool:
    """Tell whether ``root`` is a directory containing a TensorBoard event file.

    The whole tree below ``root`` is walked. An entry counts when it is a
    regular file whose name starts with ``events.out.tfevents``.

    Args:
        root: Directory to inspect.

    Returns:
        True if at least one matching file is found; False if none is found
        or if ``root`` is not a directory.
    """
    if not root.is_dir():
        return False
    return any(
        entry.is_file() and entry.name.startswith("events.out.tfevents")
        for entry in root.rglob("*")
    )
def _pick_logdir(cwd: Path) -> Path:
    """Choose the default log directory below ``cwd``.

    Candidates are ``cwd/logs`` and ``cwd/results``, in that order.

    Args:
        cwd: Base directory under which the candidates are looked up.

    Returns:
        The first candidate directory that contains ``events.out.tfevents*``
        files; otherwise the first candidate that exists as a directory;
        otherwise ``cwd/results`` (which may not exist).
    """
    options = (cwd / "logs", cwd / "results")
    present = [option for option in options if option.is_dir()]
    # First choice: an existing candidate that already holds event files.
    for option in present:
        if _has_tfevents(option):
            return option
    # Second choice: any existing candidate, even without events yet.
    if present:
        return present[0]
    return cwd / "results"
def _can_bind(host: str, port: int) -> bool:
    """Probe whether an IPv4 TCP socket can be bound to ``host``:``port`` right now.

    The probe socket is closed before returning, so a True result is only a
    snapshot: another process may still take the port afterwards.

    Args:
        host: Interface to probe. ``"localhost"`` is probed as ``127.0.0.1``.
        port: TCP port number to probe.

    Returns:
        True if the bind succeeded. False if the bind failed with ``OSError``
        or if ``port`` lies outside the valid 0-65535 range.
    """
    # socket.bind() raises OverflowError (not OSError) for a port outside
    # 0-65535, which would escape the handler below and crash a port scan
    # that runs past the top of the range. Such a port is simply unavailable.
    if not 0 <= port <= 65535:
        return False
    bind_host = "127.0.0.1" if host in ("127.0.0.1", "localhost") else host
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # On Windows SO_REUSEADDR allows binding to a port that another socket
        # already holds, so the probe would report busy ports as free. Only
        # set it elsewhere, where it merely lets the probe ignore sockets
        # lingering in TIME_WAIT.
        if sys.platform != "win32":
            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            probe.bind((bind_host, port))
        except OSError:
            return False
        return True
def _resolve_port(host: str, preferred: int, max_attempts: int = 10) -> int:
    """Find the first bindable port at or above ``preferred``.

    Args:
        host: Interface passed on to the bind probe.
        preferred: First port to try.
        max_attempts: How many consecutive ports to try, starting at
            ``preferred``.

    Returns:
        The first port in the scanned window that the probe could bind, or
        ``preferred`` itself when none of them could be bound.
    """
    window = range(preferred, preferred + max_attempts)
    # Falling back to `preferred` lets callers detect "nothing better found"
    # by comparing the result with the port they asked for.
    return next((candidate for candidate in window if _can_bind(host, candidate)), preferred)
def _tensorboard_module_cmd(logdir: Path, host: str, port: int, reload_interval: int) -> list[str]:
    """Build the argument vector that runs TensorBoard with the current interpreter.

    Args:
        logdir: Log directory; it is resolved to an absolute path.
        host: Value for ``--host``.
        port: Value for ``--port``.
        reload_interval: Value for ``--reload_interval``.

    Returns:
        ``[sys.executable, "-m", "tensorboard.main", ...]`` followed by the
        four flag/value pairs, all as strings.
    """
    flag_values = (
        ("--logdir", str(logdir.resolve())),
        ("--host", host),
        ("--port", str(port)),
        ("--reload_interval", str(reload_interval)),
    )
    argv = [sys.executable, "-m", "tensorboard.main"]
    for flag, value in flag_values:
        argv.extend((flag, value))
    return argv
def _stderr_bind_failure(text: str) -> bool:
    """Tell whether captured stderr text looks like a port-binding failure.

    The match is case-insensitive and looks for either ``could not bind`` or
    ``already in use`` anywhere in ``text``.
    """
    lowered = text.lower()
    markers = ("could not bind", "already in use")
    return any(marker in lowered for marker in markers)
def _drain_stderr(stream: object) -> None:
    """Copy every line from ``stream`` to this process's stderr.

    Iteration stops when ``stream`` is exhausted. Any ``Exception`` raised
    while reading or writing is swallowed on purpose: forwarding is
    best-effort and must not raise into the caller.

    Args:
        stream: An iterable of text lines, e.g. a child process's stderr pipe.
    """
    try:
        for chunk in stream:  # type: ignore[union-attr]
            sys.stderr.write(chunk)
    except Exception:
        # Best-effort forwarding: give up quietly on any read/write error.
        return
def _run_tensorboard_process(
    cmd: list[str],
    host: str,
    port: int,
    quick_fail_seconds: float = 1.25,
) -> tuple[int, bool]:
    """Run ``cmd`` as a child process and supervise it until it exits.

    The child's stdout is discarded and its stderr is piped. For the first
    ``quick_fail_seconds`` the child is polled for an early exit; if it exits
    in that window, its stderr is forwarded to this process's stderr and the
    result is returned immediately. Otherwise a daemon thread forwards stderr,
    the browser URL is printed, and the call blocks until the child exits.

    Ctrl-C at any point after launch terminates the child (escalating to kill
    after 8 seconds) and is reported as exit code 130.

    Args:
        cmd: Argument vector to execute.
        host: Host used only to build the printed URL.
        port: Port used only to build the printed URL.
        quick_fail_seconds: Length of the early-exit polling window.

    Returns:
        ``(exit_code, try_next_port)``. ``try_next_port`` is True only when
        the child exited inside the polling window with a non-zero code and
        its stderr looks like a bind/port failure (e.g. a race with the bind
        probe or with another process).
    """
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
    )
    assert proc.stderr is not None  # guaranteed by stderr=PIPE; narrows the type
    # The polling window sits inside the try block too, so Ctrl-C during the
    # first moments gets the same clean shutdown as later on.
    try:
        deadline = time.monotonic() + quick_fail_seconds
        while time.monotonic() < deadline:
            code = proc.poll()
            if code is not None:
                # The child is gone: read what it left and release the pipe.
                with proc.stderr:
                    err = proc.stderr.read()
                retry_port = code != 0 and _stderr_bind_failure(err)
                if err:
                    sys.stderr.write(err)
                return code, retry_port
            time.sleep(0.05)
        # The child survived the window: stream its stderr in the background
        # so the pipe never fills up while we block in wait().
        threading.Thread(target=_drain_stderr, args=(proc.stderr,), daemon=True).start()
        print(f"Abre no browser: http://{host}:{port}/")
        return proc.wait(), False
    except KeyboardInterrupt:
        proc.terminate()
        try:
            proc.wait(timeout=8)
        except subprocess.TimeoutExpired:
            proc.kill()
            # Reap the killed child so it does not linger as a zombie.
            proc.wait()
        return 130, False
def _start_tensorboard_with_port_fallback(
    logdir: Path,
    host: str,
    preferred_port: int,
    reload_interval: int,
    max_attempts: int = 10,
) -> int:
    """Start TensorBoard, moving to higher ports when the preferred one is taken.

    The preferred port is probed first; if it cannot be bound, the first
    bindable port within ``max_attempts`` ports above it becomes the new
    starting point. From there up to ``max_attempts`` consecutive ports are
    tried: ports after the first are skipped when the probe says they are
    busy, and a launch that fails with a bind error moves on to the next port.

    Args:
        logdir: Directory handed to TensorBoard as ``--logdir``.
        host: Interface to listen on.
        preferred_port: Port to try first.
        reload_interval: Value for TensorBoard's ``--reload_interval``.
        max_attempts: Size of the port window, used both for the initial probe
            and for the launch loop.

    Returns:
        0 when TensorBoard exits with code 0; the child's exit code when it
        fails for a non-bind reason or on the last attempt; 1 when the loop
        ends without a launch returning.
    """
    preferred = preferred_port
    if not _can_bind(host, preferred):
        # Use the caller's attempt budget here as well, so the initial probe
        # and the launch loop below scan windows of the same size.
        alt = _resolve_port(host, preferred, max_attempts)
        if alt != preferred:
            print(
                f"Porta {preferred} ocupada (ex.: outro TensorBoard ou Docker). A usar {alt}.",
                file=sys.stderr,
            )
            preferred = alt
    for offset in range(max_attempts):
        port = preferred + offset
        # The starting port is always attempted; later ports are probed first.
        if offset > 0 and not _can_bind(host, port):
            continue
        cmd = _tensorboard_module_cmd(
            logdir=logdir,
            host=host,
            port=port,
            reload_interval=reload_interval,
        )
        print(f"Iniciando TensorBoard: {' '.join(cmd)}")
        rc, retry_port = _run_tensorboard_process(cmd, host=host, port=port)
        if rc == 0:
            return 0
        if retry_port and offset + 1 < max_attempts:
            print(
                f"Porta {port} indisponivel ao iniciar. A tentar {port + 1}...",
                file=sys.stderr,
            )
            continue
        return rc
    print("Erro: nao foi possivel abrir o TensorBoard em nenhuma porta tentada.", file=sys.stderr)
    return 1
def main() -> int:
    """Command-line entry point: pick a log directory and launch TensorBoard.

    Parses ``--logdir``, ``--host``, ``--port`` and ``--reload_interval``.
    When ``--logdir`` is omitted, the directory is chosen automatically
    relative to the current working directory and announced on stderr.

    Returns:
        1 if the log directory does not exist or the ``tensorboard`` package
        cannot be imported; otherwise the result of the port-fallback launch.
        A directory without event files only triggers a warning.
    """
    cli = argparse.ArgumentParser(
        description="Abre TensorBoard nos logs gerados pelo train.py (report_to=tensorboard).",
    )
    cli.add_argument(
        "--logdir",
        type=Path,
        default=None,
        help=(
            "Diretorio com events.out.tfevents.* "
            "(default: auto procura em ./logs e ./results)."
        ),
    )
    cli.add_argument(
        "--host",
        default="127.0.0.1",
        help="Interface de escuta (default: 127.0.0.1).",
    )
    cli.add_argument(
        "--port",
        type=int,
        default=6006,
        help="Porta HTTP preferida (default: 6006). Se estiver ocupada, usa a seguinte livre.",
    )
    cli.add_argument(
        "--reload_interval",
        type=int,
        default=5,
        help="Segundos entre recargas ao detetar novos eventos (default: 5).",
    )
    opts = cli.parse_args()

    cwd = Path.cwd()
    auto_selected = opts.logdir is None
    if auto_selected:
        logdir = _pick_logdir(cwd)
    else:
        logdir = opts.logdir
    logdir = logdir.resolve()
    if auto_selected:
        # Tell the user which directory the automatic search settled on.
        print(f"TensorBoard --logdir (auto): {logdir}", file=sys.stderr)

    if not logdir.exists():
        print(f"Erro: diretorio nao existe: {logdir}", file=sys.stderr)
        return 1

    # A directory without event files is allowed, but worth a warning.
    if not _has_tfevents(logdir):
        print(
            "Aviso: nenhum ficheiro events.out.tfevents* encontrado sob este diretorio. "
            "O TensorBoard pode ficar sem escalares. Se descarregou os logs do servidor, "
            "use --logdir com a pasta onde estao os eventos (ex.: ./logs).",
            file=sys.stderr,
        )

    # Check the package is importable before spawning a child interpreter.
    try:
        import tensorboard  # noqa: F401
    except ImportError:
        print(
            "Erro: pacote 'tensorboard' nao instalado. Execute: pip install tensorboard",
            file=sys.stderr,
        )
        return 1

    return _start_tensorboard_with_port_fallback(
        logdir=logdir,
        host=opts.host,
        preferred_port=opts.port,
        reload_interval=opts.reload_interval,
    )
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|