Spaces:

Dariachup
/

dormouse-seq2seq-train

Sleeping

App Files Files Community

Dariachup committed on Apr 29

Commit

55da3dc

verified ·

1 Parent(s): 4352196

v3: dropout 0.3, label smoothing 0.1, embed=64, hidden=128, AdamW

Browse files

Files changed (1) hide show

app.py +52 -38

app.py CHANGED Viewed

@@ -1,4 +1,7 @@
-"""Dormouse seq2seq v2 training on ZeroGPU."""
 import json
 import os
@@ -17,13 +20,15 @@ class Vocab:
     def __init__(self):
         self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
         self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
-    def build(self, texts):
         from collections import Counter
         counter = Counter()
         for t in texts:
             for w in t.lower().split():
                 counter[w] += 1
-        for w, _ in counter.most_common():
             if w not in self.word2idx:
                 idx = len(self.word2idx)
                 self.word2idx[w] = idx
@@ -40,20 +45,22 @@ class Vocab:
         return " ".join(words)
     def __len__(self): return len(self.word2idx)
-# --- Model ---
 class Enc(nn.Module):
-    def __init__(self, vs, ed=128, hd=256):
         super().__init__()
         self.emb = nn.Embedding(vs, ed, padding_idx=0)
         self.rnn = nn.GRU(ed, hd, batch_first=True, bidirectional=True)
         self.fc = nn.Linear(hd*2, hd)
     def forward(self, x):
-        o, h = self.rnn(self.emb(x))
-        h = torch.tanh(self.fc(torch.cat((h[-2], h[-1]), 1))).unsqueeze(0)
         return o, h
 class Attn(nn.Module):
-    def __init__(self, hd=256):
         super().__init__()
         self.a = nn.Linear(hd*3, hd)
         self.v = nn.Linear(hd, 1, bias=False)
@@ -62,23 +69,25 @@ class Attn(nn.Module):
         return torch.softmax(self.v(torch.tanh(self.a(torch.cat((h, eo), 2)))).squeeze(2), 1)
 class Dec(nn.Module):
-    def __init__(self, vs, ed=128, hd=256):
         super().__init__()
         self.emb = nn.Embedding(vs, ed, padding_idx=0)
         self.attn = Attn(hd)
         self.rnn = nn.GRU(ed+hd*2, hd, batch_first=True)
         self.fc = nn.Linear(hd, vs)
     def forward(self, x, h, eo):
-        e = self.emb(x.unsqueeze(1))
         c = torch.bmm(self.attn(h, eo).unsqueeze(1), eo)
         o, h = self.rnn(torch.cat((e,c),2), h)
-        return self.fc(o.squeeze(1)), h
 class ExprModel(nn.Module):
-    def __init__(self, svs, tvs, ed=128, hd=256):
         super().__init__()
-        self.enc = Enc(svs, ed, hd)
-        self.dec = Dec(tvs, ed, hd)
         self.tvs = tvs
     def forward(self, src, tgt, tf=0.5):
         bs, tl = src.shape[0], tgt.shape[1]
@@ -132,15 +141,17 @@ def augment(sources, targets, factor=3):
             if len(words) > 2 and random.random() < 0.2:
                 di = random.randint(0, len(words)-1)
                 words = words[:di] + words[di+1:]
             aug_s.append(" ".join(words))
             aug_t.append(t)
     return aug_s, aug_t
 @spaces.GPU(duration=600)
-def train_model(epochs=100, batch_size=256, augment_factor=2):
-    """Train seq2seq on GPU."""
-    # Load data
     with open("expression_pairs.json") as f:
         pairs = json.load(f)
@@ -148,20 +159,18 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
     targets = [p["en"] for p in pairs]
     log = f"Expression pairs: {len(pairs)}\n"
-    # Augment
     sources, targets = augment(sources, targets, augment_factor)
     log += f"After augmentation (x{augment_factor}): {len(sources)}\n"
-    # Vocab
     src_vocab, tgt_vocab = Vocab(), Vocab()
-    src_vocab.build(sources)
-    tgt_vocab.build(targets)
     log += f"UA vocab: {len(src_vocab)}, EN vocab: {len(tgt_vocab)}\n"
-    # Split
     idx = list(range(len(sources)))
     random.shuffle(idx)
-    split = int(0.9 * len(idx))
     tr_s = [sources[i] for i in idx[:split]]
     tr_t = [targets[i] for i in idx[:split]]
     va_s = [sources[i] for i in idx[split:]]
@@ -170,15 +179,15 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
     train_dl = DataLoader(DS(tr_s, tr_t, src_vocab, tgt_vocab), batch_size=batch_size, shuffle=True, collate_fn=collate)
     val_dl = DataLoader(DS(va_s, va_t, src_vocab, tgt_vocab), batch_size=batch_size, collate_fn=collate)
-    # Model
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = ExprModel(len(src_vocab), len(tgt_vocab)).to(device)
-    opt = torch.optim.Adam(model.parameters(), lr=0.001)
-    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=5, factor=0.5)
-    crit = nn.CrossEntropyLoss(ignore_index=0)
     params = sum(p.numel() for p in model.parameters())
-    log += f"Parameters: {params:,}\nDevice: {device}\n\n"
     best_vl = float("inf")
     no_imp = 0
@@ -189,7 +198,7 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
         for s, t in train_dl:
             s, t = s.to(device), t.to(device)
             opt.zero_grad()
-            tf = max(0.1, 0.5 - ep * 0.004)
             o = model(s, t, tf)
             o = o[:, 1:].reshape(-1, o.shape[-1])
             loss = crit(o, t[:, 1:].reshape(-1))
@@ -222,7 +231,8 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
                             correct += 1
                         total += 1
             acc = correct / max(total, 1) * 100
-            line = f"Epoch {ep:3d} | train: {tl:.4f} | val: {vl:.4f} | exact: {acc:.1f}%"
             log += line + "\n"
             print(line)
@@ -237,10 +247,11 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
                 json.dump(tgt_vocab.word2idx, f, ensure_ascii=False)
             with open("/tmp/expr_config.json", "w") as f:
                 json.dump({"src_vocab_size": len(src_vocab), "tgt_vocab_size": len(tgt_vocab),
-                           "embed_dim": 128, "hidden_dim": 256, "pairs_count": len(pairs)}, f)
         else:
             no_imp += 1
-            if no_imp >= 15:
                 log += f"Early stopping at epoch {ep}\n"
                 break
@@ -271,18 +282,21 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
     return log
-with gr.Blocks(title="Dormouse seq2seq v2 Training") as demo:
-    gr.Markdown("# Dormouse seq2seq v2 — Expression UA→EN Training")
-    gr.Markdown("Train on 28,149 expression pairs (OPUS + Telegram + Claude + support).")
     with gr.Row():
-        epochs = gr.Slider(10, 200, value=150, step=10, label="Epochs")
         batch_size = gr.Slider(32, 256, value=128, step=32, label="Batch size")
         aug = gr.Slider(1, 5, value=3, step=1, label="Augmentation factor")
     btn = gr.Button("Train", variant="primary")
     output = gr.Textbox(label="Training log", lines=30)
-    btn.click(train_model, inputs=[epochs, batch_size, aug], outputs=output)
 demo.launch()

+"""Dormouse seq2seq v3 training on ZeroGPU.
+v3: dropout, label smoothing, smaller model (embed=64, hidden=128).
+"""
 import json
 import os
     def __init__(self):
         self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
         self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
+    def build(self, texts, min_freq=2):
         from collections import Counter
         counter = Counter()
         for t in texts:
             for w in t.lower().split():
                 counter[w] += 1
+        for w, freq in counter.most_common():
+            if freq < min_freq:
+                continue
             if w not in self.word2idx:
                 idx = len(self.word2idx)
                 self.word2idx[w] = idx
         return " ".join(words)
     def __len__(self): return len(self.word2idx)
+# --- Model v3: with dropout ---
 class Enc(nn.Module):
+    def __init__(self, vs, ed=64, hd=128, drop=0.3):
         super().__init__()
         self.emb = nn.Embedding(vs, ed, padding_idx=0)
+        self.emb_drop = nn.Dropout(drop)
         self.rnn = nn.GRU(ed, hd, batch_first=True, bidirectional=True)
         self.fc = nn.Linear(hd*2, hd)
+        self.drop = nn.Dropout(drop)
     def forward(self, x):
+        o, h = self.rnn(self.emb_drop(self.emb(x)))
+        h = self.drop(torch.tanh(self.fc(torch.cat((h[-2], h[-1]), 1)))).unsqueeze(0)
         return o, h
 class Attn(nn.Module):
+    def __init__(self, hd=128):
         super().__init__()
         self.a = nn.Linear(hd*3, hd)
         self.v = nn.Linear(hd, 1, bias=False)
         return torch.softmax(self.v(torch.tanh(self.a(torch.cat((h, eo), 2)))).squeeze(2), 1)
 class Dec(nn.Module):
+    def __init__(self, vs, ed=64, hd=128, drop=0.3):
         super().__init__()
         self.emb = nn.Embedding(vs, ed, padding_idx=0)
+        self.emb_drop = nn.Dropout(drop)
         self.attn = Attn(hd)
         self.rnn = nn.GRU(ed+hd*2, hd, batch_first=True)
         self.fc = nn.Linear(hd, vs)
+        self.drop = nn.Dropout(drop)
     def forward(self, x, h, eo):
+        e = self.emb_drop(self.emb(x.unsqueeze(1)))
         c = torch.bmm(self.attn(h, eo).unsqueeze(1), eo)
         o, h = self.rnn(torch.cat((e,c),2), h)
+        return self.fc(self.drop(o.squeeze(1))), h
 class ExprModel(nn.Module):
+    def __init__(self, svs, tvs, ed=64, hd=128, drop=0.3):
         super().__init__()
+        self.enc = Enc(svs, ed, hd, drop)
+        self.dec = Dec(tvs, ed, hd, drop)
         self.tvs = tvs
     def forward(self, src, tgt, tf=0.5):
         bs, tl = src.shape[0], tgt.shape[1]
             if len(words) > 2 and random.random() < 0.2:
                 di = random.randint(0, len(words)-1)
                 words = words[:di] + words[di+1:]
+            if len(words) >= 2 and random.random() < 0.1:
+                ri = random.randint(0, len(words)-1)
+                words.insert(ri, words[ri])
             aug_s.append(" ".join(words))
             aug_t.append(t)
     return aug_s, aug_t
 @spaces.GPU(duration=600)
+def train_model(epochs=200, batch_size=128, augment_factor=3, dropout=0.3, label_smoothing=0.1):
+    """Train seq2seq v3 on GPU."""
     with open("expression_pairs.json") as f:
         pairs = json.load(f)
     targets = [p["en"] for p in pairs]
     log = f"Expression pairs: {len(pairs)}\n"
     sources, targets = augment(sources, targets, augment_factor)
     log += f"After augmentation (x{augment_factor}): {len(sources)}\n"
     src_vocab, tgt_vocab = Vocab(), Vocab()
+    src_vocab.build(sources, min_freq=2)
+    tgt_vocab.build(targets, min_freq=2)
     log += f"UA vocab: {len(src_vocab)}, EN vocab: {len(tgt_vocab)}\n"
+    # 80/20 split
     idx = list(range(len(sources)))
     random.shuffle(idx)
+    split = int(0.8 * len(idx))
     tr_s = [sources[i] for i in idx[:split]]
     tr_t = [targets[i] for i in idx[:split]]
     va_s = [sources[i] for i in idx[split:]]
     train_dl = DataLoader(DS(tr_s, tr_t, src_vocab, tgt_vocab), batch_size=batch_size, shuffle=True, collate_fn=collate)
     val_dl = DataLoader(DS(va_s, va_t, src_vocab, tgt_vocab), batch_size=batch_size, collate_fn=collate)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = ExprModel(len(src_vocab), len(tgt_vocab), ed=64, hd=128, drop=dropout).to(device)
+    opt = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
+    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=10, factor=0.5)
+    crit = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=label_smoothing)
     params = sum(p.numel() for p in model.parameters())
+    log += f"Parameters: {params:,}\nDevice: {device}\n"
+    log += f"Dropout: {dropout}, Label smoothing: {label_smoothing}\n\n"
     best_vl = float("inf")
     no_imp = 0
         for s, t in train_dl:
             s, t = s.to(device), t.to(device)
             opt.zero_grad()
+            tf = max(0.1, 0.5 - ep * 0.002)
             o = model(s, t, tf)
             o = o[:, 1:].reshape(-1, o.shape[-1])
             loss = crit(o, t[:, 1:].reshape(-1))
                             correct += 1
                         total += 1
             acc = correct / max(total, 1) * 100
+            lr = opt.param_groups[0]["lr"]
+            line = f"Epoch {ep:3d} | train: {tl:.4f} | val: {vl:.4f} | exact: {acc:.1f}% | lr: {lr:.6f}"
             log += line + "\n"
             print(line)
                 json.dump(tgt_vocab.word2idx, f, ensure_ascii=False)
             with open("/tmp/expr_config.json", "w") as f:
                 json.dump({"src_vocab_size": len(src_vocab), "tgt_vocab_size": len(tgt_vocab),
+                           "embed_dim": 64, "hidden_dim": 128, "dropout": dropout,
+                           "pairs_count": len(pairs)}, f)
         else:
             no_imp += 1
+            if no_imp >= 25:
                 log += f"Early stopping at epoch {ep}\n"
                 break
     return log
+with gr.Blocks(title="Dormouse seq2seq v3 Training") as demo:
+    gr.Markdown("# Dormouse seq2seq v3 — Expression UA→EN Training")
+    gr.Markdown("v3: dropout, label smoothing, smaller model (2M params vs 7M).")
     with gr.Row():
+        epochs = gr.Slider(10, 300, value=200, step=10, label="Epochs")
         batch_size = gr.Slider(32, 256, value=128, step=32, label="Batch size")
         aug = gr.Slider(1, 5, value=3, step=1, label="Augmentation factor")
+    with gr.Row():
+        dropout = gr.Slider(0.0, 0.5, value=0.3, step=0.05, label="Dropout")
+        label_smooth = gr.Slider(0.0, 0.3, value=0.1, step=0.05, label="Label smoothing")
     btn = gr.Button("Train", variant="primary")
     output = gr.Textbox(label="Training log", lines=30)
+    btn.click(train_model, inputs=[epochs, batch_size, aug, dropout, label_smooth], outputs=output)
 demo.launch()