Dariachup committed on
Commit
55da3dc
·
verified ·
1 Parent(s): 4352196

v3: dropout 0.3, label smoothing 0.1, embed=64, hidden=128, AdamW

Browse files
Files changed (1) hide show
  1. app.py +52 -38
app.py CHANGED
@@ -1,4 +1,7 @@
1
- """Dormouse seq2seq v2 training on ZeroGPU."""
 
 
 
2
 
3
  import json
4
  import os
@@ -17,13 +20,15 @@ class Vocab:
17
  def __init__(self):
18
  self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
19
  self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
20
- def build(self, texts):
21
  from collections import Counter
22
  counter = Counter()
23
  for t in texts:
24
  for w in t.lower().split():
25
  counter[w] += 1
26
- for w, _ in counter.most_common():
 
 
27
  if w not in self.word2idx:
28
  idx = len(self.word2idx)
29
  self.word2idx[w] = idx
@@ -40,20 +45,22 @@ class Vocab:
40
  return " ".join(words)
41
  def __len__(self): return len(self.word2idx)
42
 
43
- # --- Model ---
44
  class Enc(nn.Module):
45
- def __init__(self, vs, ed=128, hd=256):
46
  super().__init__()
47
  self.emb = nn.Embedding(vs, ed, padding_idx=0)
 
48
  self.rnn = nn.GRU(ed, hd, batch_first=True, bidirectional=True)
49
  self.fc = nn.Linear(hd*2, hd)
 
50
  def forward(self, x):
51
- o, h = self.rnn(self.emb(x))
52
- h = torch.tanh(self.fc(torch.cat((h[-2], h[-1]), 1))).unsqueeze(0)
53
  return o, h
54
 
55
  class Attn(nn.Module):
56
- def __init__(self, hd=256):
57
  super().__init__()
58
  self.a = nn.Linear(hd*3, hd)
59
  self.v = nn.Linear(hd, 1, bias=False)
@@ -62,23 +69,25 @@ class Attn(nn.Module):
62
  return torch.softmax(self.v(torch.tanh(self.a(torch.cat((h, eo), 2)))).squeeze(2), 1)
63
 
64
  class Dec(nn.Module):
65
- def __init__(self, vs, ed=128, hd=256):
66
  super().__init__()
67
  self.emb = nn.Embedding(vs, ed, padding_idx=0)
 
68
  self.attn = Attn(hd)
69
  self.rnn = nn.GRU(ed+hd*2, hd, batch_first=True)
70
  self.fc = nn.Linear(hd, vs)
 
71
  def forward(self, x, h, eo):
72
- e = self.emb(x.unsqueeze(1))
73
  c = torch.bmm(self.attn(h, eo).unsqueeze(1), eo)
74
  o, h = self.rnn(torch.cat((e,c),2), h)
75
- return self.fc(o.squeeze(1)), h
76
 
77
  class ExprModel(nn.Module):
78
- def __init__(self, svs, tvs, ed=128, hd=256):
79
  super().__init__()
80
- self.enc = Enc(svs, ed, hd)
81
- self.dec = Dec(tvs, ed, hd)
82
  self.tvs = tvs
83
  def forward(self, src, tgt, tf=0.5):
84
  bs, tl = src.shape[0], tgt.shape[1]
@@ -132,15 +141,17 @@ def augment(sources, targets, factor=3):
132
  if len(words) > 2 and random.random() < 0.2:
133
  di = random.randint(0, len(words)-1)
134
  words = words[:di] + words[di+1:]
 
 
 
135
  aug_s.append(" ".join(words))
136
  aug_t.append(t)
137
  return aug_s, aug_t
138
 
139
 
140
  @spaces.GPU(duration=600)
141
- def train_model(epochs=100, batch_size=256, augment_factor=2):
142
- """Train seq2seq on GPU."""
143
- # Load data
144
  with open("expression_pairs.json") as f:
145
  pairs = json.load(f)
146
 
@@ -148,20 +159,18 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
148
  targets = [p["en"] for p in pairs]
149
  log = f"Expression pairs: {len(pairs)}\n"
150
 
151
- # Augment
152
  sources, targets = augment(sources, targets, augment_factor)
153
  log += f"After augmentation (x{augment_factor}): {len(sources)}\n"
154
 
155
- # Vocab
156
  src_vocab, tgt_vocab = Vocab(), Vocab()
157
- src_vocab.build(sources)
158
- tgt_vocab.build(targets)
159
  log += f"UA vocab: {len(src_vocab)}, EN vocab: {len(tgt_vocab)}\n"
160
 
161
- # Split
162
  idx = list(range(len(sources)))
163
  random.shuffle(idx)
164
- split = int(0.9 * len(idx))
165
  tr_s = [sources[i] for i in idx[:split]]
166
  tr_t = [targets[i] for i in idx[:split]]
167
  va_s = [sources[i] for i in idx[split:]]
@@ -170,15 +179,15 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
170
  train_dl = DataLoader(DS(tr_s, tr_t, src_vocab, tgt_vocab), batch_size=batch_size, shuffle=True, collate_fn=collate)
171
  val_dl = DataLoader(DS(va_s, va_t, src_vocab, tgt_vocab), batch_size=batch_size, collate_fn=collate)
172
 
173
- # Model
174
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
175
- model = ExprModel(len(src_vocab), len(tgt_vocab)).to(device)
176
- opt = torch.optim.Adam(model.parameters(), lr=0.001)
177
- sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=5, factor=0.5)
178
- crit = nn.CrossEntropyLoss(ignore_index=0)
179
 
180
  params = sum(p.numel() for p in model.parameters())
181
- log += f"Parameters: {params:,}\nDevice: {device}\n\n"
 
182
 
183
  best_vl = float("inf")
184
  no_imp = 0
@@ -189,7 +198,7 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
189
  for s, t in train_dl:
190
  s, t = s.to(device), t.to(device)
191
  opt.zero_grad()
192
- tf = max(0.1, 0.5 - ep * 0.004)
193
  o = model(s, t, tf)
194
  o = o[:, 1:].reshape(-1, o.shape[-1])
195
  loss = crit(o, t[:, 1:].reshape(-1))
@@ -222,7 +231,8 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
222
  correct += 1
223
  total += 1
224
  acc = correct / max(total, 1) * 100
225
- line = f"Epoch {ep:3d} | train: {tl:.4f} | val: {vl:.4f} | exact: {acc:.1f}%"
 
226
  log += line + "\n"
227
  print(line)
228
 
@@ -237,10 +247,11 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
237
  json.dump(tgt_vocab.word2idx, f, ensure_ascii=False)
238
  with open("/tmp/expr_config.json", "w") as f:
239
  json.dump({"src_vocab_size": len(src_vocab), "tgt_vocab_size": len(tgt_vocab),
240
- "embed_dim": 128, "hidden_dim": 256, "pairs_count": len(pairs)}, f)
 
241
  else:
242
  no_imp += 1
243
- if no_imp >= 15:
244
  log += f"Early stopping at epoch {ep}\n"
245
  break
246
 
@@ -271,18 +282,21 @@ def train_model(epochs=100, batch_size=256, augment_factor=2):
271
  return log
272
 
273
 
274
- with gr.Blocks(title="Dormouse seq2seq v2 Training") as demo:
275
- gr.Markdown("# Dormouse seq2seq v2 — Expression UA→EN Training")
276
- gr.Markdown("Train on 28,149 expression pairs (OPUS + Telegram + Claude + support).")
277
 
278
  with gr.Row():
279
- epochs = gr.Slider(10, 200, value=150, step=10, label="Epochs")
280
  batch_size = gr.Slider(32, 256, value=128, step=32, label="Batch size")
281
  aug = gr.Slider(1, 5, value=3, step=1, label="Augmentation factor")
 
 
 
282
 
283
  btn = gr.Button("Train", variant="primary")
284
  output = gr.Textbox(label="Training log", lines=30)
285
 
286
- btn.click(train_model, inputs=[epochs, batch_size, aug], outputs=output)
287
 
288
  demo.launch()
 
1
+ """Dormouse seq2seq v3 training on ZeroGPU.
2
+
3
+ v3: dropout, label smoothing, smaller model (embed=64, hidden=128).
4
+ """
5
 
6
  import json
7
  import os
 
20
  def __init__(self):
21
  self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
22
  self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
23
+ def build(self, texts, min_freq=2):
24
  from collections import Counter
25
  counter = Counter()
26
  for t in texts:
27
  for w in t.lower().split():
28
  counter[w] += 1
29
+ for w, freq in counter.most_common():
30
+ if freq < min_freq:
31
+ continue
32
  if w not in self.word2idx:
33
  idx = len(self.word2idx)
34
  self.word2idx[w] = idx
 
45
  return " ".join(words)
46
  def __len__(self): return len(self.word2idx)
47
 
# --- Model v3: with dropout ---
class Enc(nn.Module):
    """Bidirectional GRU encoder with dropout.

    Dropout is applied twice: to the token embeddings before the GRU, and
    to the projected summary state that is returned as the initial decoder
    hidden state.

    Args:
        vs: Number of rows in the embedding table (source vocabulary size).
        ed: Embedding dimension.
        hd: GRU hidden size per direction.
        drop: Dropout probability shared by both dropout layers.
    """

    def __init__(self, vs: int, ed: int = 64, hd: int = 128, drop: float = 0.3):
        super().__init__()
        # Index 0 is treated as padding by the embedding layer.
        self.emb = nn.Embedding(vs, ed, padding_idx=0)
        self.emb_drop = nn.Dropout(drop)
        self.rnn = nn.GRU(ed, hd, batch_first=True, bidirectional=True)
        # Projects the concatenated forward/backward final states (2*hd) to hd.
        self.fc = nn.Linear(hd * 2, hd)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            A pair ``(o, h)``: ``o`` is the per-step GRU output of shape
            (batch, seq_len, 2*hd); ``h`` is the tanh-projected, dropped-out
            combination of the final forward and backward hidden states,
            shape (1, batch, hd).
        """
        embedded = self.emb_drop(self.emb(x))
        o, h = self.rnn(embedded)
        # h[-2] / h[-1] are the last layer's forward and backward final states.
        both_directions = torch.cat((h[-2], h[-1]), 1)
        h = self.drop(torch.tanh(self.fc(both_directions))).unsqueeze(0)
        return o, h
61
 
62
  class Attn(nn.Module):
63
+ def __init__(self, hd=128):
64
  super().__init__()
65
  self.a = nn.Linear(hd*3, hd)
66
  self.v = nn.Linear(hd, 1, bias=False)
 
69
  return torch.softmax(self.v(torch.tanh(self.a(torch.cat((h, eo), 2)))).squeeze(2), 1)
70
 
class Dec(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Dropout is applied to the input token embedding and to the GRU output
    before the vocabulary projection.

    Args:
        vs: Number of rows in the embedding table and size of the output
            logits (target vocabulary size).
        ed: Embedding dimension.
        hd: GRU hidden size; encoder outputs are expected to be 2*hd wide.
        drop: Dropout probability shared by both dropout layers.
    """

    def __init__(self, vs: int, ed: int = 64, hd: int = 128, drop: float = 0.3):
        super().__init__()
        # Index 0 is treated as padding by the embedding layer.
        self.emb = nn.Embedding(vs, ed, padding_idx=0)
        self.emb_drop = nn.Dropout(drop)
        self.attn = Attn(hd)
        # GRU input is the token embedding (ed) concatenated with the
        # attention context vector (hd*2).
        self.rnn = nn.GRU(ed + hd * 2, hd, batch_first=True)
        self.fc = nn.Linear(hd, vs)
        self.drop = nn.Dropout(drop)

    def forward(self, x: torch.Tensor, h: torch.Tensor, eo: torch.Tensor):
        """Run one decoding step.

        Args:
            x: Token ids for the current step, shape (batch,).
            h: Previous hidden state passed to the GRU, shape (1, batch, hd).
            eo: Encoder outputs, shape (batch, src_len, 2*hd).

        Returns:
            A pair ``(logits, h)``: vocabulary logits of shape (batch, vs)
            and the updated GRU hidden state.
        """
        # Add a length-1 time axis so the embedding matches batch_first GRU input.
        e = self.emb_drop(self.emb(x.unsqueeze(1)))
        # Attention weights (expected (batch, src_len)) are turned into a
        # weighted sum of encoder outputs: (batch, 1, 2*hd).
        weights = self.attn(h, eo).unsqueeze(1)
        c = torch.bmm(weights, eo)
        o, h = self.rnn(torch.cat((e, c), 2), h)
        return self.fc(self.drop(o.squeeze(1))), h
85
 
86
  class ExprModel(nn.Module):
87
+ def __init__(self, svs, tvs, ed=64, hd=128, drop=0.3):
88
  super().__init__()
89
+ self.enc = Enc(svs, ed, hd, drop)
90
+ self.dec = Dec(tvs, ed, hd, drop)
91
  self.tvs = tvs
92
  def forward(self, src, tgt, tf=0.5):
93
  bs, tl = src.shape[0], tgt.shape[1]
 
141
  if len(words) > 2 and random.random() < 0.2:
142
  di = random.randint(0, len(words)-1)
143
  words = words[:di] + words[di+1:]
144
+ if len(words) >= 2 and random.random() < 0.1:
145
+ ri = random.randint(0, len(words)-1)
146
+ words.insert(ri, words[ri])
147
  aug_s.append(" ".join(words))
148
  aug_t.append(t)
149
  return aug_s, aug_t
150
 
151
 
152
  @spaces.GPU(duration=600)
153
+ def train_model(epochs=200, batch_size=128, augment_factor=3, dropout=0.3, label_smoothing=0.1):
154
+ """Train seq2seq v3 on GPU."""
 
155
  with open("expression_pairs.json") as f:
156
  pairs = json.load(f)
157
 
 
159
  targets = [p["en"] for p in pairs]
160
  log = f"Expression pairs: {len(pairs)}\n"
161
 
 
162
  sources, targets = augment(sources, targets, augment_factor)
163
  log += f"After augmentation (x{augment_factor}): {len(sources)}\n"
164
 
 
165
  src_vocab, tgt_vocab = Vocab(), Vocab()
166
+ src_vocab.build(sources, min_freq=2)
167
+ tgt_vocab.build(targets, min_freq=2)
168
  log += f"UA vocab: {len(src_vocab)}, EN vocab: {len(tgt_vocab)}\n"
169
 
170
+ # 80/20 split
171
  idx = list(range(len(sources)))
172
  random.shuffle(idx)
173
+ split = int(0.8 * len(idx))
174
  tr_s = [sources[i] for i in idx[:split]]
175
  tr_t = [targets[i] for i in idx[:split]]
176
  va_s = [sources[i] for i in idx[split:]]
 
179
  train_dl = DataLoader(DS(tr_s, tr_t, src_vocab, tgt_vocab), batch_size=batch_size, shuffle=True, collate_fn=collate)
180
  val_dl = DataLoader(DS(va_s, va_t, src_vocab, tgt_vocab), batch_size=batch_size, collate_fn=collate)
181
 
 
182
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
183
+ model = ExprModel(len(src_vocab), len(tgt_vocab), ed=64, hd=128, drop=dropout).to(device)
184
+ opt = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-5)
185
+ sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=10, factor=0.5)
186
+ crit = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=label_smoothing)
187
 
188
  params = sum(p.numel() for p in model.parameters())
189
+ log += f"Parameters: {params:,}\nDevice: {device}\n"
190
+ log += f"Dropout: {dropout}, Label smoothing: {label_smoothing}\n\n"
191
 
192
  best_vl = float("inf")
193
  no_imp = 0
 
198
  for s, t in train_dl:
199
  s, t = s.to(device), t.to(device)
200
  opt.zero_grad()
201
+ tf = max(0.1, 0.5 - ep * 0.002)
202
  o = model(s, t, tf)
203
  o = o[:, 1:].reshape(-1, o.shape[-1])
204
  loss = crit(o, t[:, 1:].reshape(-1))
 
231
  correct += 1
232
  total += 1
233
  acc = correct / max(total, 1) * 100
234
+ lr = opt.param_groups[0]["lr"]
235
+ line = f"Epoch {ep:3d} | train: {tl:.4f} | val: {vl:.4f} | exact: {acc:.1f}% | lr: {lr:.6f}"
236
  log += line + "\n"
237
  print(line)
238
 
 
247
  json.dump(tgt_vocab.word2idx, f, ensure_ascii=False)
248
  with open("/tmp/expr_config.json", "w") as f:
249
  json.dump({"src_vocab_size": len(src_vocab), "tgt_vocab_size": len(tgt_vocab),
250
+ "embed_dim": 64, "hidden_dim": 128, "dropout": dropout,
251
+ "pairs_count": len(pairs)}, f)
252
  else:
253
  no_imp += 1
254
+ if no_imp >= 25:
255
  log += f"Early stopping at epoch {ep}\n"
256
  break
257
 
 
282
  return log
283
 
284
 
# Gradio UI: hyper-parameter sliders, a "Train" button and a log textbox.
with gr.Blocks(title="Dormouse seq2seq v3 Training") as demo:
    gr.Markdown("# Dormouse seq2seq v3 — Expression UA→EN Training")
    gr.Markdown("v3: dropout, label smoothing, smaller model (2M params vs 7M).")

    # Training schedule and data-augmentation controls.
    with gr.Row():
        epochs = gr.Slider(10, 300, value=200, step=10, label="Epochs")
        batch_size = gr.Slider(32, 256, value=128, step=32, label="Batch size")
        aug = gr.Slider(1, 5, value=3, step=1, label="Augmentation factor")
    # Regularisation controls introduced in v3.
    with gr.Row():
        dropout = gr.Slider(0.0, 0.5, value=0.3, step=0.05, label="Dropout")
        label_smooth = gr.Slider(0.0, 0.3, value=0.1, step=0.05, label="Label smoothing")

    btn = gr.Button("Train", variant="primary")
    output = gr.Textbox(label="Training log", lines=30)

    # The inputs are passed positionally, so their order must match
    # train_model(epochs, batch_size, augment_factor, dropout, label_smoothing).
    btn.click(train_model, inputs=[epochs, batch_size, aug, dropout, label_smooth], outputs=output)

demo.launch()