darwinkernelpanic committed on
Commit
1cc3fb0
·
verified ·
1 Parent(s): 9d99128

Upload train_autogrow.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_autogrow.py +14 -4
train_autogrow.py CHANGED
@@ -99,7 +99,8 @@ def run_test(model, tokenizer, step):
99
  norm_r = F.normalize(r_noise, dim=-1)
100
  logits = torch.matmul(norm_r, norm_weights.T)
101
  resp_ids = torch.argmax(logits, dim=-1)
102
- result = tokenizer.decode(resp_ids[0], skip_special_tokens=True)
 
103
  log(f"Prompt: '{prompt}' | [Cropmark]: '{result}'")
104
  with open(CHECKPOINT_LOG, "a") as f:
105
  f.write(f"Step {step} - Prompt: '{prompt}' | [Cropmark]: '{result}'\n")
@@ -130,13 +131,22 @@ if __name__ == "__main__":
130
  for batch in dataloader:
131
  optimizer.zero_grad()
132
  input_ids = batch["input_ids"].to("cuda")
133
- prompt_emb = model.token_embedding(input_ids[:, :MAX_PROMPT_LEN])
134
- resp_emb = model.token_embedding(input_ids[:, MAX_PROMPT_LEN:])
 
 
 
135
  noise = torch.randn_like(resp_emb)
136
  t = torch.randint(0, 1000, (input_ids.shape[0],), device="cuda").long()
137
  noisy_resp = noise_scheduler.add_noise(resp_emb, noise, t)
138
  pred_resp = model(torch.cat([prompt_emb, noisy_resp], dim=1), t)[:, MAX_PROMPT_LEN:, :]
139
- loss = 1 - F.cosine_similarity(pred_resp, resp_emb, dim=-1).mean()
 
 
 
 
 
 
140
  loss.backward()
141
  optimizer.step()
142
  if step % 100 == 0:
 
99
  norm_r = F.normalize(r_noise, dim=-1)
100
  logits = torch.matmul(norm_r, norm_weights.T)
101
  resp_ids = torch.argmax(logits, dim=-1)
102
+ # Show special tokens to debug why it's silent
103
+ result = tokenizer.decode(resp_ids[0], skip_special_tokens=False)
104
  log(f"Prompt: '{prompt}' | [Cropmark]: '{result}'")
105
  with open(CHECKPOINT_LOG, "a") as f:
106
  f.write(f"Step {step} - Prompt: '{prompt}' | [Cropmark]: '{result}'\n")
 
131
  for batch in dataloader:
132
  optimizer.zero_grad()
133
  input_ids = batch["input_ids"].to("cuda")
134
+ prompt_ids = input_ids[:, :MAX_PROMPT_LEN]
135
+ resp_ids = input_ids[:, MAX_PROMPT_LEN:]
136
+
137
+ prompt_emb = model.token_embedding(prompt_ids)
138
+ resp_emb = model.token_embedding(resp_ids)
139
  noise = torch.randn_like(resp_emb)
140
  t = torch.randint(0, 1000, (input_ids.shape[0],), device="cuda").long()
141
  noisy_resp = noise_scheduler.add_noise(resp_emb, noise, t)
142
  pred_resp = model(torch.cat([prompt_emb, noisy_resp], dim=1), t)[:, MAX_PROMPT_LEN:, :]
143
+ # Cosine Similarity Loss with Padding Mask
144
+ mask = (resp_ids != tokenizer.pad_token_id).float()
145
+ # Calculate cosine similarity for each token
146
+ cos_sim = F.cosine_similarity(pred_resp, resp_emb, dim=-1)
147
+ # Mask out padding tokens
148
+ loss = 1 - (cos_sim * mask).sum() / (mask.sum() + 1e-8)
149
+
150
  loss.backward()
151
  optimizer.step()
152
  if step % 100 == 0: