Upload 137 files
Browse files
- hugging/td_fuse/canary.py +28 -1
- hugging/td_lang/engine/canary.py +28 -1
- hugging/td_lang/td_lang/engine/canary.py +28 -1
- hugging/td_start.td +2 -2
hugging/td_fuse/canary.py
CHANGED
|
@@ -58,8 +58,28 @@ def inject_canary(
|
|
| 58 |
).to(model.device)
|
| 59 |
|
| 60 |
# Brief fine-tune to memorise the fact
|
|
|
|
|
|
|
| 61 |
model.train()
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
for step in range(num_steps):
|
| 65 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
@@ -72,6 +92,13 @@ def inject_canary(
|
|
| 72 |
print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
|
| 73 |
|
| 74 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
print(f"[canary] Injection complete for {model_name}")
|
| 76 |
return model
|
| 77 |
|
|
|
|
| 58 |
).to(model.device)
|
| 59 |
|
| 60 |
# Brief fine-tune to memorise the fact
|
| 61 |
+
# Only train embedding + LM head to avoid OOM on 48GB GPUs
|
| 62 |
+
# (Adam optimizer states for 8.8B params = ~35GB extra VRAM)
|
| 63 |
model.train()
|
| 64 |
+
|
| 65 |
+
# Freeze everything except embeddings and LM head
|
| 66 |
+
for param in model.parameters():
|
| 67 |
+
param.requires_grad = False
|
| 68 |
+
|
| 69 |
+
trainable_params = []
|
| 70 |
+
for name, param in model.named_parameters():
|
| 71 |
+
if "embed" in name or "lm_head" in name or "wte" in name:
|
| 72 |
+
param.requires_grad = True
|
| 73 |
+
trainable_params.append(param)
|
| 74 |
+
|
| 75 |
+
if not trainable_params:
|
| 76 |
+
print("[canary] WARNING: No embedding params found, training all params (may OOM)")
|
| 77 |
+
for param in model.parameters():
|
| 78 |
+
param.requires_grad = True
|
| 79 |
+
trainable_params = list(model.parameters())
|
| 80 |
+
|
| 81 |
+
print(f"[canary] Training {len(trainable_params)} param groups (embeddings + LM head only)")
|
| 82 |
+
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
|
| 83 |
|
| 84 |
for step in range(num_steps):
|
| 85 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
|
|
| 92 |
print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
|
| 93 |
|
| 94 |
model.eval()
|
| 95 |
+
|
| 96 |
+
# Re-enable all gradients and free optimizer memory
|
| 97 |
+
for param in model.parameters():
|
| 98 |
+
param.requires_grad = True
|
| 99 |
+
del optimizer
|
| 100 |
+
torch.cuda.empty_cache()
|
| 101 |
+
|
| 102 |
print(f"[canary] Injection complete for {model_name}")
|
| 103 |
return model
|
| 104 |
|
hugging/td_lang/engine/canary.py
CHANGED
|
@@ -58,8 +58,28 @@ def inject_canary(
|
|
| 58 |
).to(model.device)
|
| 59 |
|
| 60 |
# Brief fine-tune to memorise the fact
|
|
|
|
|
|
|
| 61 |
model.train()
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
for step in range(num_steps):
|
| 65 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
@@ -72,6 +92,13 @@ def inject_canary(
|
|
| 72 |
print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
|
| 73 |
|
| 74 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
print(f"[canary] Injection complete for {model_name}")
|
| 76 |
return model
|
| 77 |
|
|
|
|
| 58 |
).to(model.device)
|
| 59 |
|
| 60 |
# Brief fine-tune to memorise the fact
|
| 61 |
+
# Only train embedding + LM head to avoid OOM on 48GB GPUs
|
| 62 |
+
# (Adam optimizer states for 8.8B params = ~35GB extra VRAM)
|
| 63 |
model.train()
|
| 64 |
+
|
| 65 |
+
# Freeze everything except embeddings and LM head
|
| 66 |
+
for param in model.parameters():
|
| 67 |
+
param.requires_grad = False
|
| 68 |
+
|
| 69 |
+
trainable_params = []
|
| 70 |
+
for name, param in model.named_parameters():
|
| 71 |
+
if "embed" in name or "lm_head" in name or "wte" in name:
|
| 72 |
+
param.requires_grad = True
|
| 73 |
+
trainable_params.append(param)
|
| 74 |
+
|
| 75 |
+
if not trainable_params:
|
| 76 |
+
print("[canary] WARNING: No embedding params found, training all params (may OOM)")
|
| 77 |
+
for param in model.parameters():
|
| 78 |
+
param.requires_grad = True
|
| 79 |
+
trainable_params = list(model.parameters())
|
| 80 |
+
|
| 81 |
+
print(f"[canary] Training {len(trainable_params)} param groups (embeddings + LM head only)")
|
| 82 |
+
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
|
| 83 |
|
| 84 |
for step in range(num_steps):
|
| 85 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
|
|
| 92 |
print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
|
| 93 |
|
| 94 |
model.eval()
|
| 95 |
+
|
| 96 |
+
# Re-enable all gradients and free optimizer memory
|
| 97 |
+
for param in model.parameters():
|
| 98 |
+
param.requires_grad = True
|
| 99 |
+
del optimizer
|
| 100 |
+
torch.cuda.empty_cache()
|
| 101 |
+
|
| 102 |
print(f"[canary] Injection complete for {model_name}")
|
| 103 |
return model
|
| 104 |
|
hugging/td_lang/td_lang/engine/canary.py
CHANGED
|
@@ -58,8 +58,28 @@ def inject_canary(
|
|
| 58 |
).to(model.device)
|
| 59 |
|
| 60 |
# Brief fine-tune to memorise the fact
|
|
|
|
|
|
|
| 61 |
model.train()
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
for step in range(num_steps):
|
| 65 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
@@ -72,6 +92,13 @@ def inject_canary(
|
|
| 72 |
print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
|
| 73 |
|
| 74 |
model.eval()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
print(f"[canary] Injection complete for {model_name}")
|
| 76 |
return model
|
| 77 |
|
|
|
|
| 58 |
).to(model.device)
|
| 59 |
|
| 60 |
# Brief fine-tune to memorise the fact
|
| 61 |
+
# Only train embedding + LM head to avoid OOM on 48GB GPUs
|
| 62 |
+
# (Adam optimizer states for 8.8B params = ~35GB extra VRAM)
|
| 63 |
model.train()
|
| 64 |
+
|
| 65 |
+
# Freeze everything except embeddings and LM head
|
| 66 |
+
for param in model.parameters():
|
| 67 |
+
param.requires_grad = False
|
| 68 |
+
|
| 69 |
+
trainable_params = []
|
| 70 |
+
for name, param in model.named_parameters():
|
| 71 |
+
if "embed" in name or "lm_head" in name or "wte" in name:
|
| 72 |
+
param.requires_grad = True
|
| 73 |
+
trainable_params.append(param)
|
| 74 |
+
|
| 75 |
+
if not trainable_params:
|
| 76 |
+
print("[canary] WARNING: No embedding params found, training all params (may OOM)")
|
| 77 |
+
for param in model.parameters():
|
| 78 |
+
param.requires_grad = True
|
| 79 |
+
trainable_params = list(model.parameters())
|
| 80 |
+
|
| 81 |
+
print(f"[canary] Training {len(trainable_params)} param groups (embeddings + LM head only)")
|
| 82 |
+
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)
|
| 83 |
|
| 84 |
for step in range(num_steps):
|
| 85 |
outputs = model(**inputs, labels=inputs["input_ids"])
|
|
|
|
| 92 |
print(f" step {step}/{num_steps}, loss: {loss.item():.4f}")
|
| 93 |
|
| 94 |
model.eval()
|
| 95 |
+
|
| 96 |
+
# Re-enable all gradients and free optimizer memory
|
| 97 |
+
for param in model.parameters():
|
| 98 |
+
param.requires_grad = True
|
| 99 |
+
del optimizer
|
| 100 |
+
torch.cuda.empty_cache()
|
| 101 |
+
|
| 102 |
print(f"[canary] Injection complete for {model_name}")
|
| 103 |
return model
|
| 104 |
|
hugging/td_start.td
CHANGED
|
@@ -29,8 +29,8 @@ gate {
|
|
| 29 |
}
|
| 30 |
|
| 31 |
budget {
|
| 32 |
-
max_gpu_hours =
|
| 33 |
-
max_cost =
|
| 34 |
}
|
| 35 |
|
| 36 |
# --- Reward rules (what counts as "good" during GRPO training) ---
|
|
|
|
| 29 |
}
|
| 30 |
|
| 31 |
budget {
|
| 32 |
+
max_gpu_hours = 24.0
|
| 33 |
+
max_cost = 100.0
|
| 34 |
}
|
| 35 |
|
| 36 |
# --- Reward rules (what counts as "good" during GRPO training) ---
|