td-builder committed on
Commit
10244f6
·
verified ·
1 Parent(s): 78e91bf

Upload 137 files

Browse files
hugging/td_fuse/canary.py CHANGED
@@ -58,8 +58,28 @@ def inject_canary(
58
  ).to(model.device)
59
 
60
  # Brief fine-tune to memorise the fact
 
 
61
  model.train()
62
- optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  for step in range(num_steps):
65
  outputs = model(**inputs, labels=inputs["input_ids"])
@@ -72,6 +92,13 @@ def inject_canary(
72
  print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
73
 
74
  model.eval()
 
 
 
 
 
 
 
75
  print(f"[canary] Injection complete for {model_name}")
76
  return model
77
 
 
58
  ).to(model.device)
59
 
60
  # Brief fine-tune to memorise the fact
61
+ # Only train embedding + LM head to avoid OOM on 48GB GPUs
62
+ # (Adam optimizer states for 8.8B params = ~35GB extra VRAM)
63
  model.train()
64
+
65
+ # Freeze everything except embeddings and LM head
66
+ for param in model.parameters():
67
+ param.requires_grad = False
68
+
69
+ trainable_params = []
70
+ for name, param in model.named_parameters():
71
+ if "embed" in name or "lm_head" in name or "wte" in name:
72
+ param.requires_grad = True
73
+ trainable_params.append(param)
74
+
75
+ if not trainable_params:
76
+ print("[canary] WARNING: No embedding params found, training all params (may OOM)")
77
+ for param in model.parameters():
78
+ param.requires_grad = True
79
+ trainable_params = list(model.parameters())
80
+
81
+ print(f"[canary] Training {len(trainable_params)} param groups (embeddings + LM head only)")
82
+ optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
83
 
84
  for step in range(num_steps):
85
  outputs = model(**inputs, labels=inputs["input_ids"])
 
92
  print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
93
 
94
  model.eval()
95
+
96
+ # Re-enable all gradients and free optimizer memory
97
+ for param in model.parameters():
98
+ param.requires_grad = True
99
+ del optimizer
100
+ torch.cuda.empty_cache()
101
+
102
  print(f"[canary] Injection complete for {model_name}")
103
  return model
104
 
hugging/td_lang/engine/canary.py CHANGED
@@ -58,8 +58,28 @@ def inject_canary(
58
  ).to(model.device)
59
 
60
  # Brief fine-tune to memorise the fact
 
 
61
  model.train()
62
- optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  for step in range(num_steps):
65
  outputs = model(**inputs, labels=inputs["input_ids"])
@@ -72,6 +92,13 @@ def inject_canary(
72
  print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
73
 
74
  model.eval()
 
 
 
 
 
 
 
75
  print(f"[canary] Injection complete for {model_name}")
76
  return model
77
 
 
58
  ).to(model.device)
59
 
60
  # Brief fine-tune to memorise the fact
61
+ # Only train embedding + LM head to avoid OOM on 48GB GPUs
62
+ # (Adam optimizer states for 8.8B params = ~35GB extra VRAM)
63
  model.train()
64
+
65
+ # Freeze everything except embeddings and LM head
66
+ for param in model.parameters():
67
+ param.requires_grad = False
68
+
69
+ trainable_params = []
70
+ for name, param in model.named_parameters():
71
+ if "embed" in name or "lm_head" in name or "wte" in name:
72
+ param.requires_grad = True
73
+ trainable_params.append(param)
74
+
75
+ if not trainable_params:
76
+ print("[canary] WARNING: No embedding params found, training all params (may OOM)")
77
+ for param in model.parameters():
78
+ param.requires_grad = True
79
+ trainable_params = list(model.parameters())
80
+
81
+ print(f"[canary] Training {len(trainable_params)} param groups (embeddings + LM head only)")
82
+ optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
83
 
84
  for step in range(num_steps):
85
  outputs = model(**inputs, labels=inputs["input_ids"])
 
92
  print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
93
 
94
  model.eval()
95
+
96
+ # Re-enable all gradients and free optimizer memory
97
+ for param in model.parameters():
98
+ param.requires_grad = True
99
+ del optimizer
100
+ torch.cuda.empty_cache()
101
+
102
  print(f"[canary] Injection complete for {model_name}")
103
  return model
104
 
hugging/td_lang/td_lang/engine/canary.py CHANGED
@@ -58,8 +58,28 @@ def inject_canary(
58
  ).to(model.device)
59
 
60
  # Brief fine-tune to memorise the fact
 
 
61
  model.train()
62
- optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  for step in range(num_steps):
65
  outputs = model(**inputs, labels=inputs["input_ids"])
@@ -72,6 +92,13 @@ def inject_canary(
72
  print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
73
 
74
  model.eval()
 
 
 
 
 
 
 
75
  print(f"[canary] Injection complete for {model_name}")
76
  return model
77
 
 
58
  ).to(model.device)
59
 
60
  # Brief fine-tune to memorise the fact
61
+ # Only train embedding + LM head to avoid OOM on 48GB GPUs
62
+ # (Adam optimizer states for 8.8B params = ~35GB extra VRAM)
63
  model.train()
64
+
65
+ # Freeze everything except embeddings and LM head
66
+ for param in model.parameters():
67
+ param.requires_grad = False
68
+
69
+ trainable_params = []
70
+ for name, param in model.named_parameters():
71
+ if "embed" in name or "lm_head" in name or "wte" in name:
72
+ param.requires_grad = True
73
+ trainable_params.append(param)
74
+
75
+ if not trainable_params:
76
+ print("[canary] WARNING: No embedding params found, training all params (may OOM)")
77
+ for param in model.parameters():
78
+ param.requires_grad = True
79
+ trainable_params = list(model.parameters())
80
+
81
+ print(f"[canary] Training {len(trainable_params)} param groups (embeddings + LM head only)")
82
+ optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
83
 
84
  for step in range(num_steps):
85
  outputs = model(**inputs, labels=inputs["input_ids"])
 
92
  print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
93
 
94
  model.eval()
95
+
96
+ # Re-enable all gradients and free optimizer memory
97
+ for param in model.parameters():
98
+ param.requires_grad = True
99
+ del optimizer
100
+ torch.cuda.empty_cache()
101
+
102
  print(f"[canary] Injection complete for {model_name}")
103
  return model
104
 
hugging/td_start.td CHANGED
@@ -29,8 +29,8 @@ gate {
29
  }
30
 
31
  budget {
32
- max_gpu_hours = 6.0
33
- max_cost = 5.0
34
  }
35
 
36
  # --- Reward rules (what counts as "good" during GRPO training) ---
 
29
  }
30
 
31
  budget {
32
+ max_gpu_hours = 24.0
33
+ max_cost = 100.0
34
  }
35
 
36
  # --- Reward rules (what counts as "good" during GRPO training) ---