sanps commited on
Commit
98b7474
·
verified ·
1 Parent(s): e582a2f

Upload logger.py

Browse files
Files changed (1) hide show
  1. logger.py +310 -0
logger.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training logger: wandb + CSV + stdout + run summary JSON.
3
+
4
+ All logging is gated on ``enabled`` (typically ``is_main_process()``).
5
+ Wandb is optional -- if ``wandb`` is not installed or fails to init,
6
+ logging falls back to CSV + stdout silently.
7
+
8
+ CSV columns (one row per logged event):
9
+ run_id, step, samples_seen, wall_time_sec, event_type,
10
+ train_loss, loss_fine, loss_coarse, loss_ratio,
11
+ grad_norm, lr_connector, lr_dino, lr_llm,
12
+ throughput_samples_sec, gpu_mem_gb,
13
+ val_loss, val_loss_fine, val_loss_coarse, val_loss_ratio,
14
+ attention_entropy
15
+ """
16
+
17
+ import csv
18
+ import json
19
+ import os
20
+ import subprocess
21
+ import time
22
+ from datetime import datetime
23
+ from pathlib import Path
24
+ from typing import Optional
25
+
26
+
27
+ def _get_git_hash() -> str:
28
+ """Get current git commit hash, or 'unknown' if not in a repo."""
29
+ try:
30
+ result = subprocess.run(
31
+ ["git", "rev-parse", "--short", "HEAD"],
32
+ capture_output=True, text=True, timeout=5,
33
+ )
34
+ return result.stdout.strip() if result.returncode == 0 else "unknown"
35
+ except Exception:
36
+ return "unknown"
37
+
38
+
39
+ def _gpu_memory_gb() -> float:
40
+ """Get current GPU memory allocated in GB, or 0 if no GPU."""
41
+ try:
42
+ import torch
43
+ if torch.cuda.is_available():
44
+ return torch.cuda.memory_allocated() / (1024 ** 3)
45
+ except Exception:
46
+ pass
47
+ return 0.0
48
+
49
+
50
# Column order for the structured metrics CSV (one row per logged event).
# Train events fill the train-side columns and eval events the val-side
# ones; the DictWriter uses extrasaction="ignore", so columns a given
# event does not supply are left blank.
CSV_COLUMNS = [
    # identity / bookkeeping
    "run_id",
    "step",
    "samples_seen",
    "wall_time_sec",
    "event_type",
    # training losses
    "train_loss",
    "loss_fine",
    "loss_coarse",
    "loss_ratio",
    # optimizer state
    "grad_norm",
    "lr_connector",
    "lr_dino",
    "lr_llm",
    # system metrics
    "throughput_samples_sec",
    "gpu_mem_gb",
    # validation metrics
    "val_loss",
    "val_loss_fine",
    "val_loss_coarse",
    "val_loss_ratio",
    "attention_entropy",
]
58
+
59
+
60
class TrainingLogger:
    """
    Unified logger that writes to wandb, structured CSV, and stdout.

    wandb is strictly optional: if the package is missing or ``wandb.init``
    fails, logging silently degrades to CSV + stdout.  All public methods
    are no-ops when ``enabled`` is False (pass ``is_main_process()`` so
    only rank 0 writes).

    Parameters
    ----------
    project : str
        wandb project name.
    config : dict, optional
        Training config to log as wandb config; also used for the run name
        (``config["wandb"]["run_name"]``) and the default log directory
        (``config["checkpoint"]["save_dir"]``).
    enabled : bool
        If False, all log calls are no-ops (use for non-rank-0 processes).
    log_dir : str, optional
        Directory for the CSV log file.  Defaults to the checkpoint
        save_dir from ``config``, falling back to ``/workspace/logs``.
    """

    def __init__(
        self,
        project: str = "foveated-vlm",
        config: Optional[dict] = None,
        enabled: bool = True,
        log_dir: Optional[str] = None,
    ):
        self.enabled = enabled
        self._wandb_run = None
        self._csv_path: Optional[Path] = None
        self._csv_writer = None
        self._csv_file = None
        self._start_time = time.time()
        self._config = config or {}
        self._run_id = ""
        self._best_val_loss = float("inf")  # lowest val_loss seen so far
        self._best_step = 0                 # step at which that best occurred
        self._last_step = 0                 # most recent training step logged
        self._last_samples = 0              # samples_seen at that step
        self._git_hash = _get_git_hash()

        if not enabled:
            # Non-main process: leave all backends unconfigured.
            return

        # ---- Run ID: "<run_name>_<YYYYmmdd_HHMMSS>" ----
        run_name = (config or {}).get("wandb", {}).get("run_name", "run")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self._run_id = f"{run_name}_{timestamp}"

        # ---- wandb (optional backend) ----
        try:
            import wandb
            self._wandb_run = wandb.init(
                project=project,
                name=run_name,
                config=config or {},
                resume="allow",
            )
        except Exception:
            # wandb not installed or init failed: CSV + stdout only.
            pass

        # ---- CSV backend ----
        if log_dir is None:
            log_dir = (config or {}).get("checkpoint", {}).get(
                "save_dir", "/workspace/logs"
            )
        self._log_dir = Path(log_dir)
        self._log_dir.mkdir(parents=True, exist_ok=True)

        self._csv_path = self._log_dir / f"metrics_{self._run_id}.csv"
        # newline="" is required by the csv module; explicit utf-8 avoids
        # platform-dependent default encodings.
        self._csv_file = open(self._csv_path, "w", newline="", encoding="utf-8")
        self._csv_writer = csv.DictWriter(
            self._csv_file, fieldnames=CSV_COLUMNS, extrasaction="ignore",
        )
        self._csv_writer.writeheader()
        self._csv_file.flush()

    def _write_csv_row(self, row: dict):
        """Write one row to the CSV, filling run_id/wall_time defaults.

        Flushes after every row so the file is readable mid-run (e.g. by a
        monitoring script) and survives a crash.
        """
        if self._csv_writer is not None:
            row.setdefault("run_id", self._run_id)
            row.setdefault("wall_time_sec", f"{time.time() - self._start_time:.1f}")
            self._csv_writer.writerow(row)
            self._csv_file.flush()

    def log_step(
        self,
        step: int,
        loss: float,
        fine_loss: float = 0.0,
        coarse_loss: float = 0.0,
        lr: float = 0.0,
        grad_norm: float = 0.0,
        samples_seen: int = 0,
        samples_per_sec: float = 0.0,
        lr_groups: Optional[dict] = None,
    ):
        """Log a training step to stdout, wandb, and CSV.

        Parameters
        ----------
        step : int
            Global optimizer step.
        loss : float
            Total training loss.
        fine_loss, coarse_loss : float
            Loss components; their ratio is logged as ``loss_ratio``
            (0.0 when ``coarse_loss`` is not positive).
        lr : float
            Base learning rate; used as fallback for all param groups.
        grad_norm : float
            Global gradient norm after clipping.
        samples_seen : int
            Cumulative samples processed so far.
        samples_per_sec : float
            Training throughput.
        lr_groups : dict, optional
            Per-group LRs under keys "connector", "dino", "llm";
            missing keys fall back to ``lr``.
        """
        if not self.enabled:
            return

        # Remembered so log_eval / save_run_summary can report progress.
        self._last_step = step
        self._last_samples = samples_seen

        loss_ratio = fine_loss / max(coarse_loss, 1e-8) if coarse_loss > 0 else 0.0
        gpu_mem = _gpu_memory_gb()

        # Per-group LRs default to the base lr when no groups are given.
        lr_connector = lr
        lr_dino = lr
        lr_llm = lr
        if lr_groups:
            lr_connector = lr_groups.get("connector", lr)
            lr_dino = lr_groups.get("dino", lr)
            lr_llm = lr_groups.get("llm", lr)

        # stdout
        print(
            f"  step {step:6d} | loss {loss:.4f} | "
            f"fine {fine_loss:.4f} | ratio {loss_ratio:.3f} | "
            f"lr {lr:.2e} | gnorm {grad_norm:.2f} | "
            f"{samples_per_sec:.0f} samp/s | {gpu_mem:.1f}GB",
            flush=True,
        )

        # wandb (best-effort; never let logging kill training)
        if self._wandb_run is not None:
            try:
                import wandb
                log_dict = {
                    "train/loss": loss,
                    "train/fine_loss": fine_loss,
                    "train/coarse_loss": coarse_loss,
                    "train/loss_ratio": loss_ratio,
                    "train/lr": lr,
                    "train/lr_connector": lr_connector,
                    "train/lr_dino": lr_dino,
                    "train/lr_llm": lr_llm,
                    "train/grad_norm": grad_norm,
                    "train/samples_seen": samples_seen,
                    "train/throughput": samples_per_sec,
                    "train/gpu_mem_gb": gpu_mem,
                }
                wandb.log(log_dict, step=step)
            except Exception:
                pass

        # CSV
        self._write_csv_row({
            "step": step,
            "samples_seen": samples_seen,
            "event_type": "train",
            "train_loss": f"{loss:.6f}",
            "loss_fine": f"{fine_loss:.6f}",
            "loss_coarse": f"{coarse_loss:.6f}",
            "loss_ratio": f"{loss_ratio:.4f}",
            "grad_norm": f"{grad_norm:.4f}",
            "lr_connector": f"{lr_connector:.2e}",
            "lr_dino": f"{lr_dino:.2e}",
            "lr_llm": f"{lr_llm:.2e}",
            "throughput_samples_sec": f"{samples_per_sec:.1f}",
            "gpu_mem_gb": f"{gpu_mem:.2f}",
        })

    def log_eval(
        self,
        step: int,
        val_loss: float,
        val_fine_loss: float = 0.0,
        val_coarse_loss: float = 0.0,
        attention_entropy: float = 0.0,
    ):
        """Log a validation result and track the best val_loss seen.

        Parameters
        ----------
        step : int
            Global step at which evaluation was run.
        val_loss : float
            Total validation loss.
        val_fine_loss, val_coarse_loss : float
            Validation loss components; ratio logged like in log_step.
        attention_entropy : float
            Extra diagnostic metric for the attention distribution.
        """
        if not self.enabled:
            return

        val_ratio = val_fine_loss / max(val_coarse_loss, 1e-8) if val_coarse_loss > 0 else 0.0

        if val_loss < self._best_val_loss:
            self._best_val_loss = val_loss
            self._best_step = step

        print(
            f"  [eval] step {step:6d} | val_loss {val_loss:.4f} | "
            f"fine {val_fine_loss:.4f} | ratio {val_ratio:.3f} | "
            f"entropy {attention_entropy:.4f} | "
            f"best {self._best_val_loss:.4f}@{self._best_step}",
            flush=True,
        )

        if self._wandb_run is not None:
            try:
                import wandb
                wandb.log({
                    "eval/val_loss": val_loss,
                    "eval/val_fine_loss": val_fine_loss,
                    "eval/val_coarse_loss": val_coarse_loss,
                    "eval/val_loss_ratio": val_ratio,
                    "eval/attention_entropy": attention_entropy,
                    "eval/best_val_loss": self._best_val_loss,
                }, step=step)
            except Exception:
                pass

        self._write_csv_row({
            "step": step,
            # samples_seen comes from the latest log_step call, since eval
            # itself does not advance the training sample counter.
            "samples_seen": self._last_samples,
            "event_type": "eval",
            "val_loss": f"{val_loss:.6f}",
            "val_loss_fine": f"{val_fine_loss:.6f}",
            "val_loss_coarse": f"{val_coarse_loss:.6f}",
            "val_loss_ratio": f"{val_ratio:.4f}",
            "attention_entropy": f"{attention_entropy:.6f}",
        })

    def save_run_summary(self, final_loss: float = 0.0, total_samples: int = 0):
        """Write a run-summary JSON file next to the CSV log.

        Parameters
        ----------
        final_loss : float
            Final training loss to record.
        total_samples : int
            Total samples processed over the run.
        """
        if not self.enabled:
            return

        elapsed = time.time() - self._start_time
        # If no eval ever ran, _best_val_loss is float("inf"); json.dump
        # would emit the literal `Infinity`, which is not valid JSON and
        # breaks strict parsers -- record null instead.
        best_val = (
            None if self._best_val_loss == float("inf") else self._best_val_loss
        )
        summary = {
            "run_id": self._run_id,
            "git_hash": self._git_hash,
            "config_file": self._config.get("_config_path", ""),
            "final_train_loss": final_loss,
            "best_val_loss": best_val,
            "best_val_step": self._best_step,
            "total_steps": self._last_step,
            "total_samples": total_samples,
            "wall_time_sec": elapsed,
            "wall_time_hours": elapsed / 3600,
            "csv_path": str(self._csv_path) if self._csv_path else "",
            "timestamp": datetime.now().isoformat(),
        }

        summary_path = self._log_dir / f"run_summary_{self._run_id}.json"
        with open(summary_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)
        print(f"  Run summary saved to {summary_path}", flush=True)

    def finish(self):
        """Flush and close all logging backends (idempotent for CSV)."""
        if not self.enabled:
            return

        if self._wandb_run is not None:
            try:
                import wandb
                wandb.finish()
            except Exception:
                pass

        if self._csv_file is not None:
            self._csv_file.close()
            self._csv_file = None