Dariachup committed on
Commit
fe9dfdb
·
verified ·
1 Parent(s): 65e3742

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +288 -0
app.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dormouse seq2seq v2 training on ZeroGPU."""
2
+
3
+ import json
4
+ import os
5
+ import random
6
+
7
+ import gradio as gr
8
+ import spaces
9
+ import torch
10
+ import torch.nn as nn
11
+ from huggingface_hub import HfApi
12
+ from torch.utils.data import DataLoader, Dataset
13
+
14
+ # --- Vocab ---
15
class Vocab:
    """Word-level vocabulary mapping lower-cased, whitespace-split tokens to ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; every other word receives the next free id in
    descending frequency order when :meth:`build` is called.
    """

    PAD, SOS, EOS, UNK = 0, 1, 2, 3

    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}

    def build(self, texts):
        """Add every word found in ``texts`` to the vocabulary.

        Words are lower-cased and split on whitespace. More frequent words get
        smaller ids; words already present keep their existing id.
        """
        from collections import Counter

        counter = Counter()
        for t in texts:
            counter.update(t.lower().split())
        for w, _ in counter.most_common():
            if w not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[w] = idx
                self.idx2word[idx] = w

    def encode(self, text, max_len=16):
        """Return ``[SOS, ids..., EOS]`` for ``text``.

        At most ``max_len - 2`` words are kept so the two markers fit inside
        ``max_len``. Unknown words map to ``UNK``.
        """
        # Clamp at zero: a negative slice bound would drop words from the END
        # of the sentence instead of keeping none when max_len < 2.
        keep = max(max_len - 2, 0)
        words = text.lower().split()[:keep]
        return [self.SOS] + [self.word2idx.get(w, self.UNK) for w in words] + [self.EOS]

    def decode(self, ids):
        """Turn ids back into a space-joined string.

        Decoding stops at the first ``EOS``; ``PAD`` and ``SOS`` are skipped and
        ids that are not in the vocabulary are rendered as ``<UNK>``.
        """
        words = []
        for idx in ids:
            if idx == self.EOS:
                break
            if idx in (self.PAD, self.SOS):
                continue
            words.append(self.idx2word.get(idx, "<UNK>"))
        return " ".join(words)

    def __len__(self):
        return len(self.word2idx)
42
+
43
+ # --- Model ---
44
class Enc(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Args:
        vs: source vocabulary size.
        ed: embedding dimension.
        hd: GRU hidden size per direction.
    """

    def __init__(self, vs, ed=128, hd=256):
        super().__init__()
        self.emb = nn.Embedding(vs, ed, padding_idx=0)
        self.rnn = nn.GRU(ed, hd, batch_first=True, bidirectional=True)
        # Projects the concatenated forward/backward final states back to hd.
        self.fc = nn.Linear(hd * 2, hd)

    def forward(self, x):
        """Encode a batch of token ids (batch-first).

        Returns:
            A pair ``(outputs, hidden)``: the per-step GRU outputs
            (feature size ``2 * hd``) and a ``(1, batch, hd)`` state built from
            the final forward and backward hidden states.
        """
        embedded = self.emb(x)
        outputs, final_states = self.rnn(embedded)
        last_forward, last_backward = final_states[-2], final_states[-1]
        merged = torch.cat((last_forward, last_backward), 1)
        bridged = torch.tanh(self.fc(merged))
        return outputs, bridged.unsqueeze(0)
54
+
55
class Attn(nn.Module):
    """Additive attention over encoder outputs.

    Args:
        hd: decoder hidden size; encoder outputs are expected to carry
            ``2 * hd`` features (the input to ``self.a`` is ``3 * hd`` wide).
    """

    def __init__(self, hd=256):
        super().__init__()
        self.a = nn.Linear(hd * 3, hd)
        self.v = nn.Linear(hd, 1, bias=False)

    def forward(self, h, eo):
        """Return attention weights of shape ``(batch, src_len)``.

        ``h`` is the decoder state with a leading axis of size 1 and ``eo`` the
        batch-first encoder outputs. Each row of the result sums to 1.
        """
        src_len = eo.shape[1]
        # (1, batch, hd) -> (batch, src_len, hd): one copy of the state per source step.
        query = h.permute(1, 0, 2).repeat(1, src_len, 1)
        energy = torch.tanh(self.a(torch.cat((query, eo), 2)))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, 1)
63
+
64
class Dec(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        vs: target vocabulary size.
        ed: embedding dimension.
        hd: hidden size (encoder outputs carry ``2 * hd`` features).
    """

    def __init__(self, vs, ed=128, hd=256):
        super().__init__()
        self.emb = nn.Embedding(vs, ed, padding_idx=0)
        self.attn = Attn(hd)
        # GRU input = token embedding concatenated with the attention context.
        self.rnn = nn.GRU(ed + hd * 2, hd, batch_first=True)
        self.fc = nn.Linear(hd, vs)

    def forward(self, x, h, eo):
        """Run one decoding step.

        Args:
            x: current input token ids, one per batch element.
            h: previous decoder hidden state.
            eo: encoder outputs (batch-first).

        Returns:
            ``(logits, new_hidden)`` where ``logits`` has one row of
            vocabulary scores per batch element.
        """
        token_emb = self.emb(x.unsqueeze(1))
        weights = self.attn(h, eo).unsqueeze(1)
        context = torch.bmm(weights, eo)
        step_input = torch.cat((token_emb, context), 2)
        step_output, new_hidden = self.rnn(step_input, h)
        logits = self.fc(step_output.squeeze(1))
        return logits, new_hidden
76
+
77
class ExprModel(nn.Module):
    """Encoder-decoder seq2seq model with attention.

    Args:
        svs: source vocabulary size.
        tvs: target vocabulary size.
        ed: embedding dimension shared by encoder and decoder.
        hd: hidden size shared by encoder and decoder.
    """

    def __init__(self, svs, tvs, ed=128, hd=256):
        super().__init__()
        self.enc = Enc(svs, ed, hd)
        self.dec = Dec(tvs, ed, hd)
        self.tvs = tvs

    def forward(self, src, tgt, tf=0.5):
        """Decode ``tgt.shape[1]`` steps and return the per-step logits.

        ``tf`` is the teacher-forcing probability: at each step one random
        draw decides, for the whole batch, whether the next input is the gold
        token or the model's own argmax. Position 0 of the result is left as
        zeros because decoding starts from ``tgt[:, 0]``.
        """
        batch, steps = src.shape[0], tgt.shape[1]
        logits = torch.zeros(batch, steps, self.tvs, device=src.device)
        enc_out, hidden = self.enc(src)
        token = tgt[:, 0]
        for step in range(1, steps):
            step_logits, hidden = self.dec(token, hidden, enc_out)
            logits[:, step] = step_logits
            use_gold = random.random() < tf
            token = tgt[:, step] if use_gold else step_logits.argmax(1)
        return logits

    def translate(self, src, tv, ml=16):
        """Greedily decode a single source sequence into text.

        Args:
            src: 1-D tensor of source token ids.
            tv: target vocabulary providing ``SOS``, ``EOS`` and ``decode``.
            ml: maximum number of tokens to generate.

        Note: switches the module to inference mode and leaves it there.
        """
        self.eval()
        generated = []
        with torch.no_grad():
            enc_out, hidden = self.enc(src.unsqueeze(0))
            token = torch.tensor([tv.SOS], device=src.device)
            for _ in range(ml):
                step_logits, hidden = self.dec(token, hidden, enc_out)
                best = step_logits.argmax(1).item()
                if best == tv.EOS:
                    break
                generated.append(best)
                token = torch.tensor([best], device=src.device)
        return tv.decode(generated)
106
+
107
+ # --- Dataset ---
108
class DS(Dataset):
    """Dataset of parallel sentences that are encoded on access.

    Args:
        s: source sentences.
        t: target sentences (same length as ``s``).
        sv: source vocabulary exposing ``encode``.
        tv: target vocabulary exposing ``encode``.
    """

    def __init__(self, s, t, sv, tv):
        self.s = s
        self.t = t
        self.sv = sv
        self.tv = tv

    def __len__(self):
        return len(self.s)

    def __getitem__(self, i):
        """Return the ``(source_ids, target_ids)`` pair for item ``i``."""
        source_ids = self.sv.encode(self.s[i])
        target_ids = self.tv.encode(self.t[i])
        return source_ids, target_ids
114
+
115
def collate(batch):
    """Pad a batch of ``(source_ids, target_ids)`` pairs into two tensors.

    Each side is right-padded with 0 up to the longest sequence on that side
    of the batch.
    """
    src_seqs, tgt_seqs = zip(*batch)

    def pad_to_longest(seqs):
        width = max(map(len, seqs))
        return torch.tensor([seq + [0] * (width - len(seq)) for seq in seqs])

    return pad_to_longest(src_seqs), pad_to_longest(tgt_seqs)
122
+
123
def augment(sources, targets, factor=3):
    """Return the originals plus ``factor - 1`` noisy passes over them.

    In each extra pass every source sentence with at least two words is
    copied with, independently, a 30% chance of swapping one adjacent word
    pair and (for sentences longer than two words) a 20% chance of dropping
    one word. Targets are never modified. One-word sentences are not
    duplicated.
    """
    out_sources = list(sources)
    out_targets = list(targets)
    extra_passes = factor - 1
    for _ in range(extra_passes):
        for sentence, target in zip(sources, targets):
            tokens = sentence.split()
            count = len(tokens)
            if count < 2:
                continue
            if random.random() < 0.3:
                left = random.randint(0, count - 2)
                tokens[left], tokens[left + 1] = tokens[left + 1], tokens[left]
            if count > 2 and random.random() < 0.2:
                drop = random.randint(0, count - 1)
                del tokens[drop]
            out_sources.append(" ".join(tokens))
            out_targets.append(target)
    return out_sources, out_targets
138
+
139
+
140
@spaces.GPU(duration=300)
def train_model(epochs=100, batch_size=128, augment_factor=3):
    """Train the UA→EN expression seq2seq model and return a text log.

    Reads ``expression_pairs.json`` (a list of objects with ``"ua"`` and
    ``"en"`` keys), holds out 10% of the pairs for validation, augments the
    training part only, trains with early stopping on validation loss, writes
    the best checkpoint plus vocabularies and config to ``/tmp`` and, when the
    ``HF_TOKEN`` environment variable is set, uploads them to the Hub.

    Args:
        epochs: maximum number of training epochs.
        batch_size: mini-batch size for both loaders.
        augment_factor: total copies of the training data after augmentation
            (1 disables augmentation).

    Returns:
        The accumulated training log as a single string.
    """
    # UI sliders may deliver numbers as floats; range() and DataLoader need ints.
    epochs = int(epochs)
    batch_size = int(batch_size)
    augment_factor = int(augment_factor)

    # Load data (explicit UTF-8: the pairs contain Ukrainian text).
    with open("expression_pairs.json", encoding="utf-8") as f:
        pairs = json.load(f)

    sources = [p["ua"] for p in pairs]
    targets = [p["en"] for p in pairs]
    log = f"Expression pairs: {len(pairs)}\n"

    # Split BEFORE augmenting. Augmenting first puts noisy (or identical)
    # copies of training sentences into the validation set, which makes the
    # validation loss, LR schedule and early stopping overly optimistic.
    idx = list(range(len(sources)))
    random.shuffle(idx)
    split = int(0.9 * len(idx))
    tr_s = [sources[i] for i in idx[:split]]
    tr_t = [targets[i] for i in idx[:split]]
    va_s = [sources[i] for i in idx[split:]]
    va_t = [targets[i] for i in idx[split:]]

    # Augment the training part only.
    tr_s, tr_t = augment(tr_s, tr_t, augment_factor)
    log += (
        f"After augmentation (x{augment_factor}): "
        f"{len(tr_s)} train, {len(va_s)} val (not augmented)\n"
    )

    # Vocab: augmentation only swaps or drops words, so the unaugmented
    # sentences already contain every token that can occur.
    src_vocab, tgt_vocab = Vocab(), Vocab()
    src_vocab.build(sources)
    tgt_vocab.build(targets)
    log += f"UA vocab: {len(src_vocab)}, EN vocab: {len(tgt_vocab)}\n"

    train_dl = DataLoader(
        DS(tr_s, tr_t, src_vocab, tgt_vocab),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate,
    )
    val_dl = DataLoader(
        DS(va_s, va_t, src_vocab, tgt_vocab),
        batch_size=batch_size,
        collate_fn=collate,
    )

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ExprModel(len(src_vocab), len(tgt_vocab)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=5, factor=0.5)
    crit = nn.CrossEntropyLoss(ignore_index=0)  # index 0 is <PAD>

    params = sum(p.numel() for p in model.parameters())
    log += f"Parameters: {params:,}\nDevice: {device}\n\n"

    best_vl = float("inf")
    no_imp = 0
    have_checkpoint = False

    for ep in range(1, epochs + 1):
        model.train()
        # Teacher-forcing ratio decays linearly with the epoch, floored at 0.1.
        tf = max(0.1, 0.5 - ep * 0.004)
        tl = 0
        for s, t in train_dl:
            s, t = s.to(device), t.to(device)
            opt.zero_grad()
            o = model(s, t, tf)
            # Position 0 holds no prediction (decoding starts from <SOS>).
            o = o[:, 1:].reshape(-1, o.shape[-1])
            loss = crit(o, t[:, 1:].reshape(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            tl += loss.item()
        tl /= max(len(train_dl), 1)

        model.eval()
        vl = 0
        with torch.no_grad():
            for s, t in val_dl:
                s, t = s.to(device), t.to(device)
                o = model(s, t, 0)  # tf=0: never feed gold tokens
                o = o[:, 1:].reshape(-1, o.shape[-1])
                vl += crit(o, t[:, 1:].reshape(-1)).item()
        vl /= max(len(val_dl), 1)
        sched.step(vl)

        if ep % 10 == 0 or ep == 1:
            correct, total = 0, 0
            with torch.no_grad():
                for s, t in val_dl:
                    s = s.to(device)
                    # Score at most 50 sentences per batch to keep this cheap.
                    for i in range(min(s.shape[0], 50)):
                        pred = model.translate(s[i], tgt_vocab)
                        ref = tgt_vocab.decode(t[i].tolist())
                        # NOTE: compares word SETS, so word order and repeated
                        # words are ignored despite the "exact" label.
                        if set(pred.lower().split()) == set(ref.lower().split()):
                            correct += 1
                        total += 1
            acc = correct / max(total, 1) * 100
            line = f"Epoch {ep:3d} | train: {tl:.4f} | val: {vl:.4f} | exact: {acc:.1f}%"
            log += line + "\n"
            print(line)

        if vl < best_vl:
            best_vl = vl
            no_imp = 0
            have_checkpoint = True
            # Save CPU copies of the weights without moving the live model
            # (and the parameters the optimizer tracks) off the device.
            cpu_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}
            torch.save(cpu_state, "/tmp/expr_seq2seq.pt")
            with open("/tmp/expr_vocab_src.json", "w", encoding="utf-8") as f:
                json.dump(src_vocab.word2idx, f, ensure_ascii=False)
            with open("/tmp/expr_vocab_tgt.json", "w", encoding="utf-8") as f:
                json.dump(tgt_vocab.word2idx, f, ensure_ascii=False)
            with open("/tmp/expr_config.json", "w", encoding="utf-8") as f:
                json.dump({"src_vocab_size": len(src_vocab), "tgt_vocab_size": len(tgt_vocab),
                           "embed_dim": 128, "hidden_dim": 256, "pairs_count": len(pairs)}, f)
        else:
            no_imp += 1
            if no_imp >= 15:
                log += f"Early stopping at epoch {ep}\n"
                break

    if not have_checkpoint:
        # Validation loss never improved (e.g. it was NaN). Loading or
        # uploading now would fail or pick up stale files from an earlier run.
        log += "\nNo checkpoint was saved (validation loss never improved); skipping examples and upload.\n"
        return log

    # Examples
    model.load_state_dict(torch.load("/tmp/expr_seq2seq.pt", map_location=device, weights_only=True))
    model.to(device)
    model.eval()
    log += f"\nBest val_loss: {best_vl:.4f}\n\nExamples:\n"
    for src_text, ref_text in zip(va_s[:20], va_t[:20]):
        si = torch.tensor(src_vocab.encode(src_text), device=device)
        pred = model.translate(si, tgt_vocab)
        log += f" {src_text:<35} -> {pred:<25} (ref: {ref_text})\n"

    # Push to Hub
    token = os.environ.get("HF_TOKEN")
    if token:
        api = HfApi(token=token)
        repo = "Dariachup/dormouse-expression-pairs"
        for fname in ["expr_seq2seq.pt", "expr_vocab_src.json", "expr_vocab_tgt.json", "expr_config.json"]:
            api.upload_file(
                path_or_fileobj=f"/tmp/{fname}",
                path_in_repo=f"model/{fname}",
                repo_id=repo,
                repo_type="dataset",
            )
        log += f"\nModel pushed to {repo}/model/\n"

    return log
272
+
273
+
274
# --- UI ---
# Three sliders feed train_model; its returned log string fills the textbox.
with gr.Blocks(title="Dormouse seq2seq v2 Training") as demo:
    gr.Markdown("# Dormouse seq2seq v2 — Expression UA→EN Training")
    gr.Markdown("Train on 4056 expression pairs from real Telegram + Claude queries data.")

    with gr.Row():
        epochs_slider = gr.Slider(minimum=10, maximum=200, value=100, step=10, label="Epochs")
        batch_slider = gr.Slider(minimum=32, maximum=256, value=128, step=32, label="Batch size")
        aug_slider = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Augmentation factor")

    train_button = gr.Button(value="Train", variant="primary")
    log_box = gr.Textbox(label="Training log", lines=30)

    train_button.click(
        fn=train_model,
        inputs=[epochs_slider, batch_slider, aug_slider],
        outputs=log_box,
    )

demo.launch()