import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.optim import AdamW  # transformers.AdamW is deprecated/removed in recent versions
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from huggingface_hub import create_repo, upload_folder
# ========== CONFIG ==========
HF_USERNAME = "aarath97"
HF_REPO = "mt5-dogri-translation"
MODEL_NAME = "google/mt5-large"
BATCH_SIZE = 2
LR = 1e-5
DPO_STEPS = 100
HGRL_STEPS = 100
COMBINED_STEPS = 50
GAMMA = 3.5
ALPHA = 0.5
BETA = 0.5
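# GAMMA shapes the hyper-gamma reward curve (see hyper_gamma_reward below);
# ALPHA and BETA weight the DPO and HGRL terms in the combined stage.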
# ========== LOAD DATA ==========
df = pd.read_excel("dogri_train.xlsx")
train_data = list(zip(df['Dogri'], df['English'], df['Unpreffered']))
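# Expected columns: Dogri source, English reference, and a rejected translation.
# 'Unpreffered' (sic) is kept as-is to match the spreadsheet header.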
# ========== TOKENIZERS ==========
tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME)
sbert = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sbert_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
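# SBERT stays frozen on CPU; it is only used to score generated translations.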
# ========== UTILITIES ==========
def compute_similarity(sent1, sent2):
    # Mean-pool frozen SBERT token embeddings; no gradients needed for reward scoring.
    with torch.no_grad():
        emb1 = sbert(**sbert_tokenizer(sent1, return_tensors='pt', truncation=True)).last_hidden_state.mean(1)
        emb2 = sbert(**sbert_tokenizer(sent2, return_tensors='pt', truncation=True)).last_hidden_state.mean(1)
    return cosine_similarity(emb1.numpy(), emb2.numpy())[0][0]
def hyper_gamma_reward(rho):
return rho * np.exp(-GAMMA * (1 - rho))
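# Worked example with GAMMA = 3.5:
#   rho = 1.0 -> reward = 1.0
#   rho = 0.8 -> 0.8 * exp(-0.7)  ~= 0.40
#   rho = 0.5 -> 0.5 * exp(-1.75) ~= 0.09
# so the reward decays sharply as the generation drifts from the reference.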
# ========== DATASET ==========
class DogriDataset(Dataset):
def __init__(self, data):
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
dataloader = DataLoader(DogriDataset(train_data), batch_size=BATCH_SIZE, shuffle=True)
# ========== TRAINING ==========
model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")
model.train()  # from_pretrained returns the model in eval mode
optimizer = AdamW(model.parameters(), lr=LR)
dpo_losses, hgrl_losses, final_losses = [], [], []
# ---- DPO Training ----
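# Each step draws a fresh random batch via next(iter(dataloader)); with
# shuffle=True this re-samples the data rather than sweeping epochs.
# Reference-free preference objective (no frozen reference model):
#   loss = -log sigmoid(beta * (log p(ref|src) - log p(unpref|src)))
# model(...).loss is a mean NLL, so each log-prob enters as its negative.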
for step in range(DPO_STEPS):
    batch = next(iter(dataloader))
    loss_batch = []
    beta = 1.0  # DPO temperature
    for src, ref, unpref in zip(*batch):
        input_ids = tokenizer(src, return_tensors='pt', truncation=True).input_ids.to("cuda")
        ref_ids = tokenizer(ref, return_tensors='pt', truncation=True).input_ids.to("cuda")
        unpref_ids = tokenizer(unpref, return_tensors='pt', truncation=True).input_ids.to("cuda")
        ref_nll = model(input_ids=input_ids, labels=ref_ids).loss
        unpref_nll = model(input_ids=input_ids, labels=unpref_ids).loss
        # Keep the difference in the graph (no .item()) so backward() actually updates the model
        logit_diff = unpref_nll - ref_nll  # = log p(ref) - log p(unpref), up to length normalisation
        loss = -F.logsigmoid(beta * logit_diff)
        loss_batch.append(loss)
    loss_val = torch.stack(loss_batch).mean()
    loss_val.backward()
    optimizer.step()
    optimizer.zero_grad()
    dpo_losses.append(loss_val.item())
# ---- HGRL Training ----
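# HGRL: generate a translation, score it against the reference with SBERT
# cosine similarity, map the score through the hyper-gamma reward, and use it
# to weight the NLL of the generated sequence (reward-weighted self-training).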
for step in range(HGRL_STEPS):
    batch = next(iter(dataloader))
    loss_batch = []
    for src, ref, _ in zip(*batch):
        input_ids = tokenizer(src, return_tensors='pt', truncation=True).input_ids.to("cuda")
        # max_new_tokens is an assumed cap; the stock default (max_length=20) truncates translations
        gen_ids = model.generate(input_ids, max_new_tokens=64)
        gen_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        rho = compute_similarity(gen_text, ref)
        reward = hyper_gamma_reward(rho)
        labels = tokenizer(gen_text, return_tensors='pt').input_ids.to("cuda")
        nll = model(input_ids=input_ids, labels=labels).loss  # mean NLL of the generated sequence
        # Reward-weighted likelihood: minimising reward * NLL equals -reward * log p(gen)
        loss = float(reward) * nll
        loss_batch.append(loss)
    loss_val = torch.stack(loss_batch).mean()
    loss_val.backward()
    optimizer.step()
    optimizer.zero_grad()
    hgrl_losses.append(loss_val.item())
# ---- Combined Training ----
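# Joint stage: ALPHA-weighted DPO preference loss plus BETA-weighted HGRL loss.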
for step in range(COMBINED_STEPS):
    batch = next(iter(dataloader))
    loss_dpo_batch, loss_hgrl_batch = [], []
    for src, ref, unpref in zip(*batch):
        input_ids = tokenizer(src, return_tensors='pt', truncation=True).input_ids.to("cuda")
        ref_ids = tokenizer(ref, return_tensors='pt', truncation=True).input_ids.to("cuda")
        unpref_ids = tokenizer(unpref, return_tensors='pt', truncation=True).input_ids.to("cuda")
        ref_nll = model(input_ids=input_ids, labels=ref_ids).loss
        unpref_nll = model(input_ids=input_ids, labels=unpref_ids).loss
        # Same in-graph preference loss as the DPO stage (no .item(), so gradients flow)
        dpo_loss = -F.logsigmoid(unpref_nll - ref_nll)
        loss_dpo_batch.append(dpo_loss)
        gen_ids = model.generate(input_ids, max_new_tokens=64)
        gen_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
        rho = compute_similarity(gen_text, ref)
        reward = hyper_gamma_reward(rho)
        labels = tokenizer(gen_text, return_tensors='pt').input_ids.to("cuda")
        nll = model(input_ids=input_ids, labels=labels).loss
        hgrl_loss = float(reward) * nll
        loss_hgrl_batch.append(hgrl_loss)
    loss_dpo_mean = torch.stack(loss_dpo_batch).mean()
    loss_hgrl_mean = torch.stack(loss_hgrl_batch).mean()
    combined_loss = ALPHA * loss_dpo_mean + BETA * loss_hgrl_mean
    combined_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    final_losses.append(combined_loss.item())
# ========== SAVE OUTPUTS ==========
plt.plot(dpo_losses, label="DPO")
plt.plot(hgrl_losses, label="HGRL")
plt.plot(final_losses, label="Combined")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.savefig("loss_curve.png")
with open("loss_report.txt", "w") as f:
f.write("DPO Final Loss: {:.4f}\n".format(dpo_losses[-1]))
f.write("HGRL Final Loss: {:.4f}\n".format(hgrl_losses[-1]))
f.write("Combined Final Loss: {:.4f}\n".format(final_losses[-1]))
# ========== TEST AND SAVE TRANSLATIONS ==========
test_df = pd.read_excel("in22conv.xlsx")
test_outputs = []
model.eval()  # switch out of train mode for inference
for line in test_df.iloc[:, 0].tolist():
    input_ids = tokenizer(str(line), return_tensors='pt', truncation=True).input_ids.to("cuda")
    outputs = model.generate(input_ids, max_new_tokens=64)  # same assumed generation cap as training
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    test_outputs.append(translation)
output_df = pd.DataFrame({"Dogri": test_df.iloc[:, 0], "English": test_outputs})
output_df.to_excel("translated_output.xlsx", index=False)
# ========== PUSH TO HUGGING FACE ==========
model.save_pretrained("mt5-dogri")
tokenizer.save_pretrained("mt5-dogri")
create_repo(f"{HF_USERNAME}/{HF_REPO}", private=False, exist_ok=True)
upload_folder(repo_id=f"{HF_USERNAME}/{HF_REPO}", folder_path="mt5-dogri")
print("Model uploaded successfully!") |