Spaces:
Sleeping
Sleeping
Upload app.py with huggingface_hub
Browse files
app.py
ADDED
|
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dormouse seq2seq v2 training on ZeroGPU."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import spaces
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
from torch.utils.data import DataLoader, Dataset
|
| 13 |
+
|
| 14 |
+
# --- Vocab ---
|
| 15 |
+
class Vocab:
    """Word-level vocabulary mapping lowercase, whitespace-split tokens to ids.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other word receives the next free id the
    first time :meth:`build` encounters it.
    """

    PAD, SOS, EOS, UNK = 0, 1, 2, 3

    def __init__(self):
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}

    def build(self, texts):
        """Add every word occurring in *texts* to the vocabulary.

        Texts are lowercased and split on whitespace. New words are assigned
        ids in descending frequency order; words that are already known keep
        their existing id.
        """
        from collections import Counter

        counter = Counter()
        for text in texts:
            counter.update(text.lower().split())
        for word, _ in counter.most_common():
            if word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word

    def encode(self, text, max_len=16):
        """Return ``[SOS, ids..., EOS]`` for *text*, at most *max_len* long.

        Unknown words map to ``UNK``. Two slots are always taken by SOS and
        EOS, so the result never has fewer than two ids.
        """
        # Clamp at zero: a negative slice bound would silently drop words
        # from the END of the sentence instead of yielding an empty body.
        keep = max(max_len - 2, 0)
        words = text.lower().split()[:keep]
        return [self.SOS] + [self.word2idx.get(w, self.UNK) for w in words] + [self.EOS]

    def decode(self, ids):
        """Turn a sequence of ids back into a space-joined string.

        Decoding stops at the first ``EOS``; ``PAD`` and ``SOS`` are skipped
        and ids missing from the vocabulary are rendered as ``<UNK>``.
        """
        words = []
        for idx in ids:
            if idx == self.EOS:
                break
            if idx in (self.PAD, self.SOS):
                continue
            words.append(self.idx2word.get(idx, "<UNK>"))
        return " ".join(words)

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.word2idx)
|
| 42 |
+
|
| 43 |
+
# --- Model ---
|
| 44 |
+
class Enc(nn.Module):
    """Bidirectional GRU encoder over embedded source token ids.

    Args:
        vs: source vocabulary size.
        ed: embedding dimension.
        hd: GRU hidden size per direction.
    """

    def __init__(self, vs, ed=128, hd=256):
        super().__init__()
        # Index 0 is the padding id, so its embedding stays zero.
        self.emb = nn.Embedding(vs, ed, padding_idx=0)
        self.rnn = nn.GRU(ed, hd, batch_first=True, bidirectional=True)
        # Projects the concatenated forward/backward state down to ``hd``.
        self.fc = nn.Linear(hd * 2, hd)

    def forward(self, x):
        """Encode a batch of id sequences.

        Returns:
            A pair ``(outputs, hidden)``: the per-step GRU outputs (both
            directions concatenated) and a single-layer initial decoder
            state of shape ``(1, batch, hd)``.
        """
        embedded = self.emb(x)
        outputs, final_states = self.rnn(embedded)
        # The last two entries are the final forward and backward states.
        joined = torch.cat((final_states[-2], final_states[-1]), dim=1)
        bridged = torch.tanh(self.fc(joined))
        return outputs, bridged.unsqueeze(0)
|
| 54 |
+
|
| 55 |
+
class Attn(nn.Module):
    """Additive attention scoring each encoder step against the decoder state.

    Args:
        hd: decoder hidden size; encoder outputs are expected to be
            ``2 * hd`` wide (the linear layer takes ``3 * hd`` features).
    """

    def __init__(self, hd=256):
        super().__init__()
        self.a = nn.Linear(hd * 3, hd)
        self.v = nn.Linear(hd, 1, bias=False)

    def forward(self, h, eo):
        """Return attention weights over the source steps.

        Args:
            h: decoder hidden state, layer-first ``(1, batch, hd)``.
            eo: encoder outputs, ``(batch, src_len, 2 * hd)``.

        Returns:
            Tensor ``(batch, src_len)`` whose rows sum to 1.
        """
        src_len = eo.shape[1]
        # Batch-first copy of the decoder state, repeated for every source step.
        query = h.permute(1, 0, 2).repeat(1, src_len, 1)
        energy = torch.tanh(self.a(torch.cat((query, eo), dim=2)))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, dim=1)
|
| 63 |
+
|
| 64 |
+
class Dec(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        vs: target vocabulary size.
        ed: embedding dimension.
        hd: GRU hidden size.
    """

    def __init__(self, vs, ed=128, hd=256):
        super().__init__()
        self.emb = nn.Embedding(vs, ed, padding_idx=0)
        self.attn = Attn(hd)
        # Input is the token embedding concatenated with the attention context.
        self.rnn = nn.GRU(ed + hd * 2, hd, batch_first=True)
        self.fc = nn.Linear(hd, vs)

    def forward(self, x, h, eo):
        """Run one decoding step.

        Args:
            x: current input token ids, one per batch element.
            h: previous decoder hidden state.
            eo: encoder outputs to attend over.

        Returns:
            ``(logits, hidden)``: vocabulary logits for this step and the
            updated hidden state.
        """
        token_emb = self.emb(x.unsqueeze(1))
        weights = self.attn(h, eo)
        # Weighted sum of encoder outputs -> one context vector per example.
        context = torch.bmm(weights.unsqueeze(1), eo)
        rnn_input = torch.cat((token_emb, context), dim=2)
        step_out, new_h = self.rnn(rnn_input, h)
        logits = self.fc(step_out.squeeze(1))
        return logits, new_h
|
| 76 |
+
|
| 77 |
+
class ExprModel(nn.Module):
    """Encoder-decoder (seq2seq) model with attention.

    Args:
        svs: source vocabulary size.
        tvs: target vocabulary size.
        ed: embedding dimension shared by encoder and decoder.
        hd: hidden size shared by encoder and decoder.
    """

    def __init__(self, svs, tvs, ed=128, hd=256):
        super().__init__()
        self.enc = Enc(svs, ed, hd)
        self.dec = Dec(tvs, ed, hd)
        self.tvs = tvs

    def forward(self, src, tgt, tf=0.5):
        """Decode *tgt* step by step and return the logits.

        Args:
            src: batch-first source token ids.
            tgt: batch-first target token ids; column 0 is the start token.
            tf: teacher-forcing probability. At each step the ground-truth
                token is fed with this probability, otherwise the model's
                own argmax prediction is fed.

        Returns:
            Tensor ``(batch, tgt_len, tvs)``. Position 0 is left as zeros
            because decoding starts from the token at column 0.
        """
        bs, tl = src.shape[0], tgt.shape[1]
        out = torch.zeros(bs, tl, self.tvs, device=src.device)
        eo, h = self.enc(src)
        inp = tgt[:, 0]
        for t in range(1, tl):
            o, h = self.dec(inp, h, eo)
            out[:, t] = o
            # One coin flip per step decides the input for the whole batch.
            inp = tgt[:, t] if random.random() < tf else o.argmax(1)
        return out

    def translate(self, src, tv, ml=16):
        """Greedily decode a single source sequence into text.

        Args:
            src: 1-D tensor of source token ids (no batch dimension).
            tv: target vocabulary providing ``SOS``, ``EOS`` and ``decode``.
            ml: maximum number of decoding steps.

        Returns:
            The decoded string as produced by ``tv.decode``.
        """
        # Remember the caller's mode so inference does not permanently flip
        # the module out of training mode as a hidden side effect.
        was_training = self.training
        self.train(False)
        res = []
        try:
            with torch.no_grad():
                eo, h = self.enc(src.unsqueeze(0))
                inp = torch.tensor([tv.SOS], device=src.device)
                for _ in range(ml):
                    o, h = self.dec(inp, h, eo)
                    t = o.argmax(1).item()
                    if t == tv.EOS:
                        break
                    res.append(t)
                    inp = torch.tensor([t], device=src.device)
        finally:
            self.train(was_training)
        return tv.decode(res)
|
| 106 |
+
|
| 107 |
+
# --- Dataset ---
|
| 108 |
+
class DS(Dataset):
    """Dataset of parallel sentences, encoded to id lists on access.

    Args:
        s: source sentences.
        t: target sentences, aligned with *s*.
        sv: source vocabulary (must provide ``encode``).
        tv: target vocabulary (must provide ``encode``).
    """

    def __init__(self, s, t, sv, tv):
        self.s = s
        self.t = t
        self.sv = sv
        self.tv = tv

    def __len__(self):
        """Return the number of source sentences."""
        return len(self.s)

    def __getitem__(self, i):
        """Return ``(source_ids, target_ids)`` for the pair at index *i*."""
        src_ids = self.sv.encode(self.s[i])
        tgt_ids = self.tv.encode(self.t[i])
        return src_ids, tgt_ids
|
| 114 |
+
|
| 115 |
+
def collate(batch):
    """Right-pad a batch of ``(source_ids, target_ids)`` pairs with zeros.

    Sources and targets are padded independently to the longest sequence on
    their own side.

    Returns:
        A ``(sources, targets)`` tuple of integer tensors, one row per pair.
    """
    src_seqs, tgt_seqs = zip(*batch)

    def pad_to_longest(seqs):
        width = max(map(len, seqs))
        rows = []
        for seq in seqs:
            rows.append(seq + [0] * (width - len(seq)))
        return torch.tensor(rows)

    return pad_to_longest(src_seqs), pad_to_longest(tgt_seqs)
|
| 122 |
+
|
| 123 |
+
def augment(sources, targets, factor=3):
    """Grow the dataset with noisy copies of the source sentences.

    The originals are kept as-is. Then, ``factor - 1`` times, every pair
    whose source has at least two words gets one extra copy in which:

    * with probability 0.3 two adjacent words are swapped, and
    * with probability 0.2 one word is dropped (only for sources with more
      than two words).

    Targets are never modified; one-word sources are not duplicated.

    Returns:
        ``(aug_sources, aug_targets)`` as two new, aligned lists.
    """
    aug_s = list(sources)
    aug_t = list(targets)
    for _round in range(factor - 1):
        for src, tgt in zip(sources, targets):
            tokens = src.split()
            count = len(tokens)
            if count < 2:
                continue
            if random.random() < 0.3:
                j = random.randint(0, count - 2)
                tokens[j], tokens[j + 1] = tokens[j + 1], tokens[j]
            if count > 2 and random.random() < 0.2:
                drop = random.randint(0, count - 1)
                del tokens[drop]
            aug_s.append(" ".join(tokens))
            aug_t.append(tgt)
    return aug_s, aug_t
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
@spaces.GPU(duration=300)
def train_model(epochs=100, batch_size=128, augment_factor=3):
    """Train the seq2seq model on ``expression_pairs.json`` and return a log.

    Pipeline: load UA->EN pairs, split 90/10 into train/validation, augment
    the training part only, build vocabularies, train with early stopping
    (15 epochs without validation improvement), reload the best checkpoint,
    print sample translations and, when ``HF_TOKEN`` is set, upload the
    artifacts to the Hub.

    Args:
        epochs: maximum number of training epochs.
        batch_size: mini-batch size for both data loaders.
        augment_factor: total size multiplier passed to ``augment``.

    Returns:
        The accumulated training log as a single string.
    """
    # UI sliders may hand these over as floats; range() and DataLoader
    # require real ints.
    epochs = int(epochs)
    batch_size = int(batch_size)
    augment_factor = int(augment_factor)

    # Load data. Explicit UTF-8: the file holds Ukrainian text and the
    # locale default encoding is not guaranteed to be UTF-8.
    with open("expression_pairs.json", encoding="utf-8") as f:
        pairs = json.load(f)

    sources = [p["ua"] for p in pairs]
    targets = [p["en"] for p in pairs]
    log = f"Expression pairs: {len(pairs)}\n"

    # Split BEFORE augmenting. Augmenting first puts (often identical)
    # copies of training pairs into the validation set, which makes the
    # validation loss, early stopping and accuracy meaningless.
    idx = list(range(len(sources)))
    random.shuffle(idx)
    split = int(0.9 * len(idx))
    tr_s = [sources[i] for i in idx[:split]]
    tr_t = [targets[i] for i in idx[:split]]
    va_s = [sources[i] for i in idx[split:]]
    va_t = [targets[i] for i in idx[split:]]

    # Augment the training part only; validation stays clean.
    tr_s, tr_t = augment(tr_s, tr_t, augment_factor)
    log += (
        f"After augmentation (x{augment_factor}): {len(tr_s)} train / "
        f"{len(va_s)} val (not augmented)\n"
    )

    # Vocab. Augmentation only swaps/drops words, so the original pairs
    # already contain every word that can occur.
    src_vocab, tgt_vocab = Vocab(), Vocab()
    src_vocab.build(sources)
    tgt_vocab.build(targets)
    log += f"UA vocab: {len(src_vocab)}, EN vocab: {len(tgt_vocab)}\n"

    train_dl = DataLoader(DS(tr_s, tr_t, src_vocab, tgt_vocab), batch_size=batch_size, shuffle=True, collate_fn=collate)
    val_dl = DataLoader(DS(va_s, va_t, src_vocab, tgt_vocab), batch_size=batch_size, collate_fn=collate)

    # Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ExprModel(len(src_vocab), len(tgt_vocab)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=5, factor=0.5)
    crit = nn.CrossEntropyLoss(ignore_index=0)  # id 0 is padding

    params = sum(p.numel() for p in model.parameters())
    log += f"Parameters: {params:,}\nDevice: {device}\n\n"

    best_vl = float("inf")
    no_imp = 0
    saved = False  # becomes True once THIS run has written a checkpoint

    for ep in range(1, epochs + 1):
        model.train()
        tl = 0
        # Teacher forcing decays linearly with the epoch, floored at 0.1.
        tf = max(0.1, 0.5 - ep * 0.004)
        for s, t in train_dl:
            s, t = s.to(device), t.to(device)
            opt.zero_grad()
            o = model(s, t, tf)
            # Position 0 is the start token and is never predicted.
            o = o[:, 1:].reshape(-1, o.shape[-1])
            loss = crit(o, t[:, 1:].reshape(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            tl += loss.item()
        tl /= max(len(train_dl), 1)

        model.train(False)
        vl = 0
        with torch.no_grad():
            for s, t in val_dl:
                s, t = s.to(device), t.to(device)
                o = model(s, t, 0)  # no teacher forcing during validation
                o = o[:, 1:].reshape(-1, o.shape[-1])
                vl += crit(o, t[:, 1:].reshape(-1)).item()
        vl /= max(len(val_dl), 1)
        sched.step(vl)

        if ep % 10 == 0 or ep == 1:
            correct, total = 0, 0
            with torch.no_grad():
                for s, t in val_dl:
                    s = s.to(device)
                    # At most 50 examples per batch to keep this cheap.
                    for i in range(min(s.shape[0], 50)):
                        pred = model.translate(s[i], tgt_vocab)
                        ref = tgt_vocab.decode(t[i].tolist())
                        # Compare token sequences: a set comparison would
                        # ignore word order and repeats yet be reported
                        # as an exact match.
                        if pred.lower().split() == ref.lower().split():
                            correct += 1
                        total += 1
            acc = correct / max(total, 1) * 100
            line = f"Epoch {ep:3d} | train: {tl:.4f} | val: {vl:.4f} | exact: {acc:.1f}%"
            log += line + "\n"
            print(line)

        if vl < best_vl:
            best_vl = vl
            no_imp = 0
            # Save CPU copies of the weights without moving the live model
            # off the training device and back.
            cpu_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}
            torch.save(cpu_state, "/tmp/expr_seq2seq.pt")
            with open("/tmp/expr_vocab_src.json", "w", encoding="utf-8") as f:
                json.dump(src_vocab.word2idx, f, ensure_ascii=False)
            with open("/tmp/expr_vocab_tgt.json", "w", encoding="utf-8") as f:
                json.dump(tgt_vocab.word2idx, f, ensure_ascii=False)
            with open("/tmp/expr_config.json", "w", encoding="utf-8") as f:
                json.dump({"src_vocab_size": len(src_vocab), "tgt_vocab_size": len(tgt_vocab),
                           "embed_dim": 128, "hidden_dim": 256, "pairs_count": len(pairs)}, f)
            saved = True
        else:
            no_imp += 1
            if no_imp >= 15:
                log += f"Early stopping at epoch {ep}\n"
                break

    if not saved:
        # Nothing improved (e.g. zero epochs or a NaN loss). Whatever is in
        # /tmp belongs to an earlier run and may not match this vocabulary,
        # so do not load or upload it.
        log += "\nNo checkpoint was saved in this run; skipping examples and upload.\n"
        return log

    # Examples
    model.load_state_dict(torch.load("/tmp/expr_seq2seq.pt", map_location=device, weights_only=True))
    model.to(device)
    model.train(False)
    log += f"\nBest val_loss: {best_vl:.4f}\n\nExamples:\n"
    for i in range(min(20, len(va_s))):
        si = torch.tensor(src_vocab.encode(va_s[i]), device=device)
        pred = model.translate(si, tgt_vocab)
        log += f"  {va_s[i]:<35} -> {pred:<25} (ref: {va_t[i]})\n"

    # Push to Hub (only when a token is configured)
    token = os.environ.get("HF_TOKEN")
    if token:
        api = HfApi(token=token)
        repo = "Dariachup/dormouse-expression-pairs"
        for fname in ["expr_seq2seq.pt", "expr_vocab_src.json", "expr_vocab_tgt.json", "expr_config.json"]:
            api.upload_file(
                path_or_fileobj=f"/tmp/{fname}",
                path_in_repo=f"model/{fname}",
                repo_id=repo,
                repo_type="dataset",
            )
        log += f"\nModel pushed to {repo}/model/\n"

    return log
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# Gradio UI: three hyperparameter sliders, a button that starts training
# and a textbox that receives the returned training log.
with gr.Blocks(title="Dormouse seq2seq v2 Training") as demo:
    gr.Markdown("# Dormouse seq2seq v2 — Expression UA→EN Training")
    gr.Markdown("Train on 4056 expression pairs from real Telegram + Claude queries data.")

    # Components render in creation order, so the sliders share one row.
    with gr.Row():
        epochs = gr.Slider(minimum=10, maximum=200, value=100, step=10, label="Epochs")
        batch_size = gr.Slider(minimum=32, maximum=256, value=128, step=32, label="Batch size")
        aug = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Augmentation factor")

    btn = gr.Button(value="Train", variant="primary")
    output = gr.Textbox(label="Training log", lines=30)

    # Slider values are passed positionally: epochs, batch_size, augment_factor.
    btn.click(fn=train_model, inputs=[epochs, batch_size, aug], outputs=output)

demo.launch()
|