Spaces:

NICOMOSHE
/

RNN

Sleeping

File size: 15,987 Bytes

4794c14

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Traductor RNN - Seq2Seq LSTM\n",
    "## Universidad Autónoma del Caribe (UAC)\n",
    "\n",
    "Implementación con CRISP-ML(Q)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fase 1: Business & Data Understanding\n",
    "\n",
    "**Métricas:**\n",
    "- BLEU Score objetivo: ≥ 0.30\n",
    "- Latencia máxima: 2 segundos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies into the environment of the running kernel (%pip, not !pip).\n",
    "# matplotlib is included because the import cell that follows needs it.\n",
    "%pip install torch numpy matplotlib gradio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from collections import Counter\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "\n",
    "# Seed both random generators the notebook relies on: NumPy drives the\n",
    "# teacher-forcing coin flip, torch drives weight init, dropout and shuffling.\n",
    "SEED = 42\n",
    "np.random.seed(SEED)\n",
    "torch.manual_seed(SEED)\n",
    "\n",
    "# Use the GPU when one is available, otherwise fall back to the CPU.\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "print(f\"Dispositivo: {device}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fase 2: Data Preparation\n",
    "\n",
    "## Corpus de entrenamiento"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base parallel corpus: hand-written (English, Spanish) phrase pairs.\n",
    "CORPUS = [\n",
    "    (\"hello\", \"hola\"), (\"goodbye\", \"adios\"), (\"good morning\", \"buenos dias\"),\n",
    "    (\"good night\", \"buenas noches\"), (\"thank you\", \"gracias\"),\n",
    "    (\"please\", \"por favor\"), (\"yes\", \"si\"), (\"no\", \"no\"),\n",
    "    (\"i am a student\", \"soy estudiante\"),\n",
    "    (\"where is the library\", \"donde esta la biblioteca\"),\n",
    "    (\"the exam is difficult\", \"el examen es dificil\"),\n",
    "    (\"i need to study\", \"necesito estudiar\"),\n",
    "    (\"how are you\", \"como estas\"),\n",
    "    (\"i study at the university\", \"estudio en la universidad\"),\n",
    "    # More phrases...\n",
    "    (\"good evening\", \"buenas tardes\"), (\"see you later\", \"hasta luego\"),\n",
    "    (\"thank you very much\", \"muchas gracias\"),\n",
    "    (\"you are welcome\", \"de nada\"),\n",
    "    (\"excuse me\", \"disculpe\"), (\"sorry\", \"lo siento\"),\n",
    "    (\"maybe\", \"quizas\"), (\"of course\", \"por supuesto\"),\n",
    "    (\"i\", \"yo\"), (\"you\", \"tu\"), (\"he\", \"el\"), (\"she\", \"ella\"),\n",
    "    (\"we\", \"nosotros\"), (\"they\", \"ellos\"),\n",
    "    (\"you are a teacher\", \"tu eres maestro\"),\n",
    "    (\"he is a professor\", \"el es profesor\"),\n",
    "    (\"she is a student\", \"ella es estudiante\"),\n",
    "    (\"we are friends\", \"somos amigos\"),\n",
    "    (\"what is your name\", \"cual es tu nombre\"),\n",
    "    (\"my name is john\", \"me llamo john\"),\n",
    "    (\"nice to meet you\", \"mucho gusto\"),\n",
    "    (\"father\", \"padre\"), (\"mother\", \"madre\"),\n",
    "    (\"brother\", \"hermano\"), (\"sister\", \"hermana\"),\n",
    "    (\"university\", \"universidad\"), (\"class\", \"clase\"),\n",
    "    (\"professor\", \"profesor\"), (\"student\", \"estudiante\"),\n",
    "    (\"exam\", \"examen\"), (\"homework\", \"tarea\"),\n",
    "    (\"the class starts at eight\", \"la clase empieza a las ocho\"),\n",
    "    (\"i need a book\", \"necesito un libro\"),\n",
    "    (\"the professor is strict\", \"el profesor es estricto\"),\n",
    "    (\"i have a class at nine\", \"tengo clase a las nueve\"),\n",
    "    (\"the lecture is interesting\", \"la conferencia es interesante\"),\n",
    "    (\"when is the exam\", \"cuando es el examen\"),\n",
    "    (\"i passed the exam\", \"aprobe el examen\"),\n",
    "    (\"i am late for class\", \"llegue tarde a clase\"),\n",
    "    (\"one\", \"uno\"), (\"two\", \"dos\"), (\"three\", \"tres\"),\n",
    "    (\"four\", \"cuatro\"), (\"five\", \"cinco\"), (\"six\", \"seis\"),\n",
    "    (\"seven\", \"siete\"), (\"eight\", \"ocho\"), (\"nine\", \"nueve\"),\n",
    "    (\"ten\", \"diez\"),\n",
    "    (\"monday\", \"lunes\"), (\"tuesday\", \"martes\"),\n",
    "    (\"wednesday\", \"miercoles\"), (\"thursday\", \"jueves\"),\n",
    "    (\"friday\", \"viernes\"), (\"saturday\", \"sabado\"),\n",
    "    (\"sunday\", \"domingo\"), (\"today\", \"hoy\"), (\"tomorrow\", \"manana\"),\n",
    "    (\"book\", \"libro\"), (\"computer\", \"computadora\"),\n",
    "    (\"good\", \"bueno\"), (\"bad\", \"malo\"),\n",
    "    (\"big\", \"grande\"), (\"small\", \"pequeno\"),\n",
    "    (\"new\", \"nuevo\"), (\"old\", \"viejo\"),\n",
    "    (\"fast\", \"rapido\"), (\"slow\", \"lento\"),\n",
    "    (\"easy\", \"facil\"), (\"difficult\", \"dificil\"),\n",
    "    (\"how much\", \"cuanto\"), (\"what time is it\", \"que hora es\"),\n",
    "]\n",
    "\n",
    "# Append the reversed (Spanish, English) version of every pair so the same model\n",
    "# is trained in both directions. The loop walks a snapshot because CORPUS grows\n",
    "# while it runs; a set keeps the already-present check constant-time.\n",
    "_known_pairs = set(CORPUS)\n",
    "for en, es in list(CORPUS):\n",
    "    flipped = (es, en)\n",
    "    if flipped not in _known_pairs:\n",
    "        CORPUS.append(flipped)\n",
    "        _known_pairs.add(flipped)\n",
    "print(f\"Corpus: {len(CORPUS)} parejas\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Special tokens; Vocab.__init__ pins them to ids 0-3.\n",
    "PAD, UNK, SOS, EOS = \"<PAD>\", \"<UNK>\", \"<SOS>\", \"<EOS>\"\n",
    "\n",
    "\n",
    "class Vocab:\n",
    "    \"\"\"Word-level vocabulary over lower-cased, whitespace-separated tokens.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        self.w2i = {PAD: 0, UNK: 1, SOS: 2, EOS: 3}  # token -> id\n",
    "        self.i2w = {0: PAD, 1: UNK, 2: SOS, 3: EOS}  # id -> token\n",
    "        self.n = 4  # next free id, which is also the vocabulary size\n",
    "\n",
    "    def add(self, text):\n",
    "        \"\"\"Register every token of `text` that is not in the vocabulary yet.\"\"\"\n",
    "        for w in text.lower().split():\n",
    "            if w not in self.w2i:\n",
    "                self.w2i[w] = self.n\n",
    "                self.i2w[self.n] = w\n",
    "                self.n += 1\n",
    "\n",
    "    def encode(self, text, max_len, sos=False, eos=False):\n",
    "        \"\"\"Return a list of exactly `max_len` ids for `text`.\n",
    "\n",
    "        Unknown words map to <UNK>, and the result is right-padded with <PAD>.\n",
    "        When the text is too long, words are dropped from the end, not the markers,\n",
    "        so a requested <EOS> survives truncation whenever `max_len` has room for it.\n",
    "        \"\"\"\n",
    "        # Number of word slots left once the requested markers are accounted for.\n",
    "        budget = max(max_len - int(sos) - int(eos), 0)\n",
    "        words = text.lower().split()[:budget]\n",
    "        ids = [self.w2i[SOS]] if sos else []\n",
    "        ids.extend(self.w2i.get(w, self.w2i[UNK]) for w in words)\n",
    "        if eos:\n",
    "            ids.append(self.w2i[EOS])\n",
    "        ids.extend([self.w2i[PAD]] * (max_len - len(ids)))\n",
    "        return ids[:max_len]\n",
    "\n",
    "    def decode(self, ids):\n",
    "        \"\"\"Turn ids (ints or scalar tensors) into a space-joined string.\n",
    "\n",
    "        <PAD>, <SOS> and <EOS> are dropped; ids missing from the table become <UNK>.\n",
    "        \"\"\"\n",
    "        ws = []\n",
    "        for i in ids:\n",
    "            if torch.is_tensor(i):\n",
    "                i = i.item()\n",
    "            w = self.i2w.get(i, UNK)\n",
    "            if w not in [PAD, SOS, EOS]:\n",
    "                ws.append(w)\n",
    "        return \" \".join(ws)\n",
    "\n",
    "\n",
    "# One vocabulary per side of the corpus pairs.\n",
    "src_v, tgt_v = Vocab(), Vocab()\n",
    "for s, t in CORPUS:\n",
    "    src_v.add(s)\n",
    "    tgt_v.add(t)\n",
    "print(f\"Vocab src: {src_v.n}, tgt: {tgt_v.n}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fase 3: Modeling - Arquitectura Seq2Seq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Encoder(nn.Module):\n",
    "    \"\"\"Embeds a batch of padded source ids and runs it through a stacked LSTM.\"\"\"\n",
    "\n",
    "    def __init__(self, vs, em, hd, ly, dp):\n",
    "        \"\"\"vs: vocabulary size, em: embedding dim, hd: hidden dim, ly: LSTM layers, dp: dropout.\"\"\"\n",
    "        super().__init__()\n",
    "        self.emb = nn.Embedding(vs, em, padding_idx=0)\n",
    "        self.lstm = nn.LSTM(em, hd, ly, batch_first=True, dropout=dp)\n",
    "        self.dp = nn.Dropout(dp)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Encode `x`, a (batch, seq) tensor of ids right-padded with 0.\n",
    "\n",
    "        Returns (outputs, h, c); outputs keeps the padded sequence length of `x`.\n",
    "        \"\"\"\n",
    "        e = self.dp(self.emb(x))\n",
    "        # Pack the batch so the LSTM stops at each sequence's last real token;\n",
    "        # otherwise h and c would be the state after all the trailing padding.\n",
    "        # Lengths must live on the CPU and be at least 1 for packing.\n",
    "        lengths = (x != 0).sum(dim=1).clamp(min=1).cpu()\n",
    "        packed = nn.utils.rnn.pack_padded_sequence(\n",
    "            e, lengths, batch_first=True, enforce_sorted=False\n",
    "        )\n",
    "        o, (h, c) = self.lstm(packed)\n",
    "        o, _ = nn.utils.rnn.pad_packed_sequence(\n",
    "            o, batch_first=True, total_length=x.size(1)\n",
    "        )\n",
    "        return o, h, c\n",
    "\n",
    "\n",
    "class Decoder(nn.Module):\n",
    "    \"\"\"Single-step LSTM decoder that projects the hidden output to vocabulary logits.\"\"\"\n",
    "\n",
    "    def __init__(self, vs, em, hd, ly, dp):\n",
    "        \"\"\"vs: vocabulary size, em: embedding dim, hd: hidden dim, ly: LSTM layers, dp: dropout.\"\"\"\n",
    "        super().__init__()\n",
    "        self.emb = nn.Embedding(vs, em, padding_idx=0)\n",
    "        self.lstm = nn.LSTM(em, hd, ly, batch_first=True, dropout=dp)\n",
    "        self.fc = nn.Linear(hd, vs)\n",
    "        self.dp = nn.Dropout(dp)\n",
    "\n",
    "    def forward(self, x, h, c):\n",
    "        \"\"\"Advance one step from token ids `x` of shape (batch, 1) and state (h, c).\n",
    "\n",
    "        Returns (logits, h, c) with logits of shape (batch, vocabulary size).\n",
    "        \"\"\"\n",
    "        e = self.dp(self.emb(x))\n",
    "        o, (h, c) = self.lstm(e, (h, c))\n",
    "        # Drop the length-1 time axis before the projection.\n",
    "        return self.fc(o.squeeze(1)), h, c\n",
    "\n",
    "\n",
    "class Seq2Seq(nn.Module):\n",
    "    \"\"\"Encoder-decoder wrapper that decodes step by step with teacher forcing.\"\"\"\n",
    "\n",
    "    def __init__(self, enc, dec):\n",
    "        super().__init__()\n",
    "        self.enc = enc\n",
    "        self.dec = dec\n",
    "\n",
    "    def forward(self, src, tgt, tf=0.5):\n",
    "        \"\"\"Return logits of shape (batch, tgt length, target vocabulary size).\n",
    "\n",
    "        `tgt[:, 0]` is the first decoder input; position 0 of the result is never\n",
    "        written and stays all zeros. `tf` is the per-step probability of feeding\n",
    "        the ground-truth token instead of the model's own prediction.\n",
    "        \"\"\"\n",
    "        bs = src.shape[0]\n",
    "        max_len = tgt.shape[1]\n",
    "        # Allocate directly on the input's device instead of creating and then moving.\n",
    "        out = torch.zeros(bs, max_len, self.dec.fc.out_features, device=src.device)\n",
    "\n",
    "        _, h, c = self.enc(src)\n",
    "        dec_in = tgt[:, 0]\n",
    "        for t in range(1, max_len):\n",
    "            o, h, c = self.dec(dec_in.unsqueeze(1), h, c)\n",
    "            out[:, t] = o\n",
    "            top1 = o.argmax(1)\n",
    "            # One coin flip per step, shared by the whole batch.\n",
    "            dec_in = tgt[:, t] if np.random.random() < tf else top1\n",
    "        return out\n",
    "\n",
    "\n",
    "# Hyperparameters\n",
    "EMBED, HIDDEN, LAYERS, DROP = 256, 512, 2, 0.3\n",
    "enc = Encoder(src_v.n, EMBED, HIDDEN, LAYERS, DROP).to(device)\n",
    "dec = Decoder(tgt_v.n, EMBED, HIDDEN, LAYERS, DROP).to(device)\n",
    "model = Seq2Seq(enc, dec).to(device)\n",
    "\n",
    "params = sum(p.numel() for p in model.parameters())\n",
    "print(f\"Parámetros: {params:,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fase 4: Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DS(Dataset):\n",
    "    \"\"\"Dataset of pre-encoded (source ids, target ids) pairs, each `ml` ids long.\"\"\"\n",
    "\n",
    "    def __init__(self, data, sv, tv, ml):\n",
    "        # Vocab exposes `encode` (it has no `enc` method). Targets are wrapped in\n",
    "        # <SOS>/<EOS>; sources are encoded without markers.\n",
    "        self.d = [(sv.encode(s, ml), tv.encode(t, ml, True, True)) for s, t in data]\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.d)\n",
    "\n",
    "    def __getitem__(self, i):\n",
    "        return torch.tensor(self.d[i][0]), torch.tensor(self.d[i][1])\n",
    "\n",
    "\n",
    "MAX_LEN = 20\n",
    "ds = DS(CORPUS, src_v, tgt_v, MAX_LEN)\n",
    "dl = DataLoader(ds, batch_size=16, shuffle=True)\n",
    "\n",
    "# ignore_index=0 keeps <PAD> targets out of the loss.\n",
    "criterion = nn.CrossEntropyLoss(ignore_index=0)\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
    "\n",
    "EPOCHS = 50\n",
    "losses = []  # mean batch loss per epoch\n",
    "\n",
    "model.train()\n",
    "print(\"Entrenando...\")\n",
    "\n",
    "for ep in range(1, EPOCHS + 1):\n",
    "    ep_loss = 0\n",
    "    for src, tgt in dl:\n",
    "        src, tgt = src.to(device), tgt.to(device)\n",
    "        optimizer.zero_grad()\n",
    "        out = model(src, tgt)\n",
    "        # Seq2Seq.forward never writes position 0 (it stays all-zero) and its target\n",
    "        # is <SOS>, so only positions 1.. are scored. reshape is used because the\n",
    "        # sliced tensors are no longer contiguous.\n",
    "        logits = out[:, 1:].reshape(-1, out.shape[-1])\n",
    "        loss = criterion(logits, tgt[:, 1:].reshape(-1))\n",
    "        loss.backward()\n",
    "        # Clip the gradient norm to 1.0 before the optimizer step.\n",
    "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
    "        optimizer.step()\n",
    "        ep_loss += loss.item()\n",
    "    losses.append(ep_loss / len(dl))\n",
    "    if ep % 10 == 0:\n",
    "        print(f\"Epoch {ep}/{EPOCHS} - Loss: {losses[-1]:.4f}\")\n",
    "\n",
    "print(\"Entrenamiento completado!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training loss curve; the x axis is 1-based so it matches the epoch numbers in the log.\n",
    "fig, ax = plt.subplots(figsize=(10, 5))\n",
    "ax.plot(range(1, len(losses) + 1), losses, 'b-')\n",
    "ax.set_xlabel('Epoch')\n",
    "ax.set_ylabel('Loss')\n",
    "ax.set_title('Training Loss vs Epoch')\n",
    "ax.grid(True)\n",
    "fig.savefig('loss_curves.png', dpi=150)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluación - BLEU Score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bleu(ref, hyp):\n",
    "    \"\"\"Clipped unigram precision times a brevity penalty (a BLEU-1 style score).\n",
    "\n",
    "    Both strings are lower-cased and split on whitespace. An empty hypothesis\n",
    "    scores 0.0. This is not the full n-gram BLEU metric.\n",
    "    \"\"\"\n",
    "    rw, hw = ref.lower().split(), hyp.lower().split()\n",
    "    if not hw:\n",
    "        return 0.0\n",
    "    # Clip each hypothesis word by how often it occurs in the reference, so that\n",
    "    # repeating one correct word cannot reach a perfect precision.\n",
    "    ref_counts = Counter(rw)\n",
    "    m = sum(min(cnt, ref_counts[w]) for w, cnt in Counter(hw).items())\n",
    "    p = m / len(hw)\n",
    "    # Penalise hypotheses that are shorter than the reference.\n",
    "    bp = min(1.0, np.exp(1 - len(rw) / len(hw)))\n",
    "    return bp * p\n",
    "\n",
    "\n",
    "model.eval()\n",
    "# NOTE: these samples also appear in CORPUS, so this measures fit to the training\n",
    "# data, not generalisation.\n",
    "test_samples = [\n",
    "    (\"hello\", \"hola\"), (\"thank you\", \"gracias\"),\n",
    "    (\"i am a student\", \"soy estudiante\"),\n",
    "    (\"where is the library\", \"donde esta la biblioteca\"),\n",
    "    (\"the exam is difficult\", \"el examen es dificil\"),\n",
    "]\n",
    "\n",
    "total_bleu = 0\n",
    "test_results = []\n",
    "\n",
    "with torch.no_grad():\n",
    "    for src_text, tgt_text in test_samples:\n",
    "        enc_in = torch.tensor([src_v.encode(src_text, MAX_LEN)], device=device)\n",
    "        _, h, c = enc(enc_in)\n",
    "        dec_in = torch.tensor([tgt_v.w2i[SOS]], device=device)\n",
    "        result = []\n",
    "        # Greedy decoding: feed the arg-max token back in until <EOS>/<PAD> or MAX_LEN steps.\n",
    "        for _ in range(MAX_LEN):\n",
    "            o, h, c = dec(dec_in.unsqueeze(1), h, c)\n",
    "            top = o.argmax(1).item()\n",
    "            if top == tgt_v.w2i[EOS] or top == tgt_v.w2i[PAD]:\n",
    "                break\n",
    "            result.append(top)\n",
    "            dec_in = torch.tensor([top], device=device)\n",
    "        translated = tgt_v.decode(result)\n",
    "        b = bleu(tgt_text, translated)\n",
    "        total_bleu += b\n",
    "        test_results.append((src_text, tgt_text, translated, b))\n",
    "        print(f\"{src_text} -> {translated} (BLEU: {b:.2f})\")\n",
    "\n",
    "avg_bleu = total_bleu / len(test_samples)\n",
    "print(f\"\\nBLEU Score: {avg_bleu:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Guardar modelo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the weights together with both vocabularies' lookup tables, which are\n",
    "# needed to encode inputs and decode outputs after reloading.\n",
    "torch.save({\n",
    "    'model': model.state_dict(),\n",
    "    'src_vocab': src_v.w2i,\n",
    "    'tgt_vocab': tgt_v.w2i,\n",
    "    'src_idx2word': src_v.i2w,\n",
    "    'tgt_idx2word': tgt_v.i2w,\n",
    "}, 'translator.pt')\n",
    "print(\"Modelo guardado: translator.pt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Interfaz Gradio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gradio as gr\n",
    "\n",
    "\n",
    "def translate(text, direction=\"EN->ES\"):\n",
    "    \"\"\"Greedy-decode a translation of `text`; returns \"\" for empty or blank input.\n",
    "\n",
    "    The model was trained on CORPUS, which holds every pair in both orders, with\n",
    "    the encoder indexed by `src_v` and the decoder by `tgt_v`. Both directions\n",
    "    therefore encode with `src_v` and decode with `tgt_v`. `direction` is kept\n",
    "    for the UI but does not change which vocabularies are used.\n",
    "    \"\"\"\n",
    "    if not text or not text.strip():\n",
    "        return \"\"\n",
    "    model.eval()\n",
    "    stop_ids = (tgt_v.w2i[EOS], tgt_v.w2i[PAD])\n",
    "    result = []\n",
    "    with torch.no_grad():\n",
    "        enc_in = torch.tensor([src_v.encode(text, MAX_LEN)], device=device)\n",
    "        _, h, c = enc(enc_in)\n",
    "        dec_in = torch.tensor([tgt_v.w2i[SOS]], device=device)\n",
    "        # Feed the arg-max token back in until a stop token or MAX_LEN steps.\n",
    "        for _ in range(MAX_LEN):\n",
    "            o, h, c = dec(dec_in.unsqueeze(1), h, c)\n",
    "            top = o.argmax(1).item()\n",
    "            if top in stop_ids:\n",
    "                break\n",
    "            result.append(top)\n",
    "            dec_in = torch.tensor([top], device=device)\n",
    "    return tgt_v.decode(result)\n",
    "\n",
    "\n",
    "with gr.Blocks() as demo:\n",
    "    gr.Markdown(\"# Traductor RNN UAC\\n## Seq2Seq LSTM - CRISP-ML(Q)\")\n",
    "    with gr.Row():\n",
    "        inp = gr.Textbox(label=\"Texto\")\n",
    "        # Preselect a direction so the callback never receives None.\n",
    "        direction = gr.Radio([\"EN->ES\", \"ES->EN\"], value=\"EN->ES\", label=\"Dirección\")\n",
    "    btn = gr.Button(\"Traducir\")\n",
    "    out = gr.Textbox(label=\"Traducción\")\n",
    "    btn.click(fn=translate, inputs=[inp, direction], outputs=out)\n",
    "\n",
    "demo.launch()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}