Upload train_autogrow.py with huggingface_hub
Browse files
- train_autogrow.py +14 -4
train_autogrow.py
CHANGED
|
@@ -99,7 +99,8 @@ def run_test(model, tokenizer, step):
|
|
| 99 |
norm_r = F.normalize(r_noise, dim=-1)
|
| 100 |
logits = torch.matmul(norm_r, norm_weights.T)
|
| 101 |
resp_ids = torch.argmax(logits, dim=-1)
|
| 102 |
-
|
|
|
|
| 103 |
log(f"Prompt: '{prompt}' | [Cropmark]: '{result}'")
|
| 104 |
with open(CHECKPOINT_LOG, "a") as f:
|
| 105 |
f.write(f"Step {step} - Prompt: '{prompt}' | [Cropmark]: '{result}'\n")
|
|
@@ -130,13 +131,22 @@ if __name__ == "__main__":
|
|
| 130 |
for batch in dataloader:
|
| 131 |
optimizer.zero_grad()
|
| 132 |
input_ids = batch["input_ids"].to("cuda")
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
| 135 |
noise = torch.randn_like(resp_emb)
|
| 136 |
t = torch.randint(0, 1000, (input_ids.shape[0],), device="cuda").long()
|
| 137 |
noisy_resp = noise_scheduler.add_noise(resp_emb, noise, t)
|
| 138 |
pred_resp = model(torch.cat([prompt_emb, noisy_resp], dim=1), t)[:, MAX_PROMPT_LEN:, :]
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
loss.backward()
|
| 141 |
optimizer.step()
|
| 142 |
if step % 100 == 0:
|
|
|
|
| 99 |
norm_r = F.normalize(r_noise, dim=-1)
|
| 100 |
logits = torch.matmul(norm_r, norm_weights.T)
|
| 101 |
resp_ids = torch.argmax(logits, dim=-1)
|
| 102 |
+
# Show special tokens to debug why it's silent
|
| 103 |
+
result = tokenizer.decode(resp_ids[0], skip_special_tokens=False)
|
| 104 |
log(f"Prompt: '{prompt}' | [Cropmark]: '{result}'")
|
| 105 |
with open(CHECKPOINT_LOG, "a") as f:
|
| 106 |
f.write(f"Step {step} - Prompt: '{prompt}' | [Cropmark]: '{result}'\n")
|
|
|
|
| 131 |
for batch in dataloader:
|
| 132 |
optimizer.zero_grad()
|
| 133 |
input_ids = batch["input_ids"].to("cuda")
|
| 134 |
+
prompt_ids = input_ids[:, :MAX_PROMPT_LEN]
|
| 135 |
+
resp_ids = input_ids[:, MAX_PROMPT_LEN:]
|
| 136 |
+
|
| 137 |
+
prompt_emb = model.token_embedding(prompt_ids)
|
| 138 |
+
resp_emb = model.token_embedding(resp_ids)
|
| 139 |
noise = torch.randn_like(resp_emb)
|
| 140 |
t = torch.randint(0, 1000, (input_ids.shape[0],), device="cuda").long()
|
| 141 |
noisy_resp = noise_scheduler.add_noise(resp_emb, noise, t)
|
| 142 |
pred_resp = model(torch.cat([prompt_emb, noisy_resp], dim=1), t)[:, MAX_PROMPT_LEN:, :]
|
| 143 |
+
# Cosine Similarity Loss with Padding Mask
|
| 144 |
+
mask = (resp_ids != tokenizer.pad_token_id).float()
|
| 145 |
+
# Calculate cosine similarity for each token
|
| 146 |
+
cos_sim = F.cosine_similarity(pred_resp, resp_emb, dim=-1)
|
| 147 |
+
# Mask out padding tokens
|
| 148 |
+
loss = 1 - (cos_sim * mask).sum() / (mask.sum() + 1e-8)
|
| 149 |
+
|
| 150 |
loss.backward()
|
| 151 |
optimizer.step()
|
| 152 |
if step % 100 == 0:
|