{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Traductor RNN - Seq2Seq LSTM\\n", "## Universidad Autónoma del Caribe (UAC)\\n\\n", "Implementación con CRISP-ML(Q)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fase 1: Business & Data Understanding\\n\\n", "**Métricas:**\\n", "- BLEU Score objetivo: ≥ 0.30\\n", "- Latencia máxima: 2 segundos" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Instalación de dependencias\\n", "!pip install torch numpy gradio" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\\n", "import torch.nn as nn\\n", "import torch.optim as optim\\n", "from torch.utils.data import Dataset, DataLoader\\n", "import numpy as np\\n", "import re\\n", "from collections import Counter\\n", "import matplotlib.pyplot as plt\\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\\n", "print(f\"Dispositivo: {device}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fase 2: Data Preparation\\n\\n", "## Corpus de entrenamiento" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Corpus amplio (322 parejas)\\n", "CORPUS = [\\n", " (\"hello\", \"hola\"), (\"goodbye\", \"adios\"), (\"good morning\", \"buenos dias\"),\\n", " (\"good night\", \"buenas noches\"), (\"thank you\", \"gracias\"),\\n", " (\"please\", \"por favor\"), (\"yes\", \"si\"), (\"no\", \"no\"),\\n", " (\"i am a student\", \"soy estudiante\"),\\n", " (\"where is the library\", \"donde esta la biblioteca\"),\\n", " (\"the exam is difficult\", \"el examen es dificil\"),\\n", " (\"i need to study\", \"necesito estudiar\"),\\n", " (\"how are you\", \"como estas\"),\\n", " (\"i study at the university\", \"estudio en la universidad\"),\\n", " # Más frases...\\n", " (\"good evening\", \"buenas tardes\"), (\"see you later\", \"hasta luego\"),\\n", " (\"thank you very much\", \"muchas gracias\"),\\n", " (\"you are welcome\", \"de nada\"),\\n", " (\"excuse me\", \"disculpe\"), (\"sorry\", \"lo siento\"),\\n", " (\"maybe\", \"quizas\"), (\"of course\", \"por supuesto\"),\\n", " (\"i\", \"yo\"), (\"you\", \"tu\"), (\"he\", \"el\"), (\"she\", \"ella\"),\\n", " (\"we\", \"nosotros\"), (\"they\", \"ellos\"),\\n", " (\"you are a teacher\", \"tu eres maestro\"),\\n", " (\"he is a professor\", \"el es profesor\"),\\n", " (\"she is a student\", \"ella es estudiante\"),\\n", " (\"we are friends\", \"somos amigos\"),\\n", " (\"what is your name\", \"cual es tu nombre\"),\\n", " (\"my name is john\", \"me llamo john\"),\\n", " (\"nice to meet you\", \"mucho gusto\"),\\n", " (\"father\", \"padre\"), (\"mother\", \"madre\"),\\n", " (\"brother\", \"hermano\"), (\"sister\", \"hermana\"),\\n", " (\"university\", \"universidad\"), (\"class\", \"clase\"),\\n", " (\"professor\", \"profesor\"), (\"student\", \"estudiante\"),\\n", " (\"exam\", \"examen\"), (\"homework\", \"tarea\"),\\n", " (\"the class starts at eight\", \"la clase empieza a las ocho\"),\\n", " (\"i need a book\", \"necesito un libro\"),\\n", " (\"the professor is strict\", \"el profesor es estricto\"),\\n", " (\"i have a class at nine\", \"tengo clase a las nueve\"),\\n", " (\"the lecture is interesting\", \"la conferencia es interesante\"),\\n", " (\"when is the exam\", \"cuando es el examen\"),\\n", " (\"i passed the exam\", \"aprobe el examen\"),\\n", " (\"i am late for class\", \"llegue tarde a clase\"),\\n", " (\"one\", \"uno\"), (\"two\", \"dos\"), (\"three\", \"tres\"),\\n", " (\"four\", \"cuatro\"), (\"five\", \"cinco\"), (\"six\", \"seis\"),\\n", " (\"seven\", \"siete\"), (\"eight\", \"ocho\"), (\"nine\", \"nueve\"),\\n", " (\"ten\", \"diez\"),\\n", " (\"monday\", \"lunes\"), (\"tuesday\", \"martes\"),\\n", " (\"wednesday\", \"miercoles\"), (\"thursday\", \"jueves\"),\\n", " (\"friday\", \"viernes\"), (\"saturday\", \"sabado\"),\\n", " (\"sunday\", \"domingo\"), (\"today\", \"hoy\"), (\"tomorrow\", \"manana\"),\\n", " (\"book\", \"libro\"), (\"computer\", \"computadora\"),\\n", " (\"good\", \"bueno\"), (\"bad\", \"malo\"),\\n", " (\"big\", \"grande\"), (\"small\", \"pequeno\"),\\n", " (\"new\", \"nuevo\"), (\"old\", \"viejo\"),\\n", " (\"fast\", \"rapido\"), (\"slow\", \"lento\"),\\n", " (\"easy\", \"facil\"), (\"difficult\", \"dificil\"),\\n", " (\"how much\", \"cuanto\"), (\"what time is it\", \"que hora es\"),\\n", "]\\n\\n", "# Añadir inversión\\n", "for es, en in list(CORPUS):\\n", " if (en, es) not in CORPUS:\\n", " CORPUS.append((en, es))\\n", "print(f\"Corpus: {len(CORPUS)} parejas\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Vocabulario\\n", "PAD, UNK, SOS, EOS = \"\", \"\", \"\", \"\"\\n\\n", "class Vocab:\\n", " def __init__(self):\\n", " self.w2i = {PAD: 0, UNK: 1, SOS: 2, EOS: 3}\\n", " self.i2w = {0: PAD, 1: UNK, 2: SOS, 3: EOS}\\n", " self.n = 4\\n\\n", " def add(self, text):\\n", " for w in text.lower().split():\\n", " if w not in self.w2i:\\n", " self.w2i[w] = self.n\\n", " self.i2w[self.n] = w\\n", " self.n += 1\\n\\n", " def encode(self, text, max_len, sos=False, eos=False):\\n", " ids = []\\n", " if sos: ids.append(self.w2i[SOS])\\n", " for w in text.lower().split():\\n", " ids.append(self.w2i.get(w, self.w2i[UNK]))\\n", " if eos: ids.append(self.w2i[EOS])\\n", " while len(ids) < max_len: ids.append(self.w2i[PAD])\\n", " return ids[:max_len]\\n\\n", " def decode(self, ids):\\n", " ws = []\\n", " for i in ids:\\n", " if torch.is_tensor(i): i = i.item()\\n", " w = self.i2w.get(i, UNK)\\n", " if w not in [PAD, SOS, EOS]: ws.append(w)\\n", " return \" \".join(ws)\\n\\n", "src_v, tgt_v = Vocab(), Vocab()\\n", "for s, t in CORPUS:\\n", " src_v.add(s)\\n", " tgt_v.add(t)\\n", "print(f\"Vocab src: {src_v.n}, tgt: {tgt_v.n}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fase 3: Modeling - Arquitectura Seq2Seq" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Encoder\\n", "class Encoder(nn.Module):\\n", " def __init__(self, vs, em, hd, ly, dp):\\n", " super().__init__()\\n", " self.emb = nn.Embedding(vs, em, padding_idx=0)\\n", " self.lstm = nn.LSTM(em, hd, ly, batch_first=True, dropout=dp)\\n", " self.dp = nn.Dropout(dp)\\n\\n", " def forward(self, x):\\n", " e = self.dp(self.emb(x))\\n", " o, (h, c) = self.lstm(e)\\n", " return o, h, c\\n\\n", "# Decoder\\n", "class Decoder(nn.Module):\\n", " def __init__(self, vs, em, hd, ly, dp):\\n", " super().__init__()\\n", " self.emb = nn.Embedding(vs, em, padding_idx=0)\\n", " self.lstm = nn.LSTM(em, hd, ly, batch_first=True, dropout=dp)\\n", " self.fc = nn.Linear(hd, vs)\\n", " self.dp = nn.Dropout(dp)\\n\\n", " def forward(self, x, h, c):\\n", " e = self.dp(self.emb(x))\\n", " o, (h, c) = self.lstm(e, (h, c))\\n", " return self.fc(o.squeeze(1)), h, c\\n\\n", "# Seq2Seq\\n", "class Seq2Seq(nn.Module):\\n", " def __init__(self, enc, dec):\\n", " super().__init__()\\n", " self.enc = enc\\n", " self.dec = dec\\n\\n", " def forward(self, src, tgt, tf=0.5):\\n", " bs = src.shape[0]\\n", " max_len = tgt.shape[1]\\n", " out = torch.zeros(bs, max_len, self.dec.fc.out_features).to(src.device)\\n\\n", " _, h, c = self.enc(src)\\n", " dec_in = tgt[:, 0]\\n", " for t in range(1, max_len):\\n", " o, h, c = self.dec(dec_in.unsqueeze(1), h, c)\\n", " out[:, t] = o\\n", " top1 = o.argmax(1)\\n", " dec_in = tgt[:, t] if np.random.random() < tf else top1\\n", " return out\\n\\n", "# Parámetros\\n", "EMBED, HIDDEN, LAYERS, DROP = 256, 512, 2, 0.3\\n", "enc = Encoder(src_v.n, EMBED, HIDDEN, LAYERS, DROP).to(device)\\n", "dec = Decoder(tgt_v.n, EMBED, HIDDEN, LAYERS, DROP).to(device)\\n", "model = Seq2Seq(enc, dec).to(device)\\n\\n", "params = sum(p.numel() for p in model.parameters())\\n", "print(f\"Parámetros: {params:,}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fase 4: Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class DS(Dataset):\\n", " def __init__(self, data, sv, tv, ml):\\n", " self.d = [(sv.enc(s, ml), tv.enc(t, ml, True, True)) for s, t in data]\\n\\n", " def __len__(self): return len(self.d)\\n\\n", " def __getitem__(self, i):\\n", " return torch.tensor(self.d[i][0]), torch.tensor(self.d[i][1])\\n\\n", "MAX_LEN = 20\\n", "ds = DS(CORPUS, src_v, tgt_v, MAX_LEN)\\n", "dl = DataLoader(ds, batch_size=16, shuffle=True)\\n\\n", "criterion = nn.CrossEntropyLoss(ignore_index=0)\\n", "optimizer = optim.Adam(model.parameters(), lr=0.001)\\n\\n", "EPOCHS = 50\\n", "losses = []\\n\\n", "model.train()\\n", "print(\"Entrenando...\")\\n\\n", "for ep in range(1, EPOCHS + 1):\\n", " ep_loss = 0\\n", " for src, tgt in dl:\\n", " src, tgt = src.to(device), tgt.to(device)\\n", " optimizer.zero_grad()\\n", " out = model(src, tgt)\\n", " loss = criterion(out.view(-1, out.shape[-1]), tgt.view(-1))\\n", " loss.backward()\\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\\n", " optimizer.step()\\n", " ep_loss += loss.item()\\n", " losses.append(ep_loss / len(dl))\\n", " if ep % 10 == 0:\\n", " print(f\"Epoch {ep}/{EPOCHS} - Loss: {losses[-1]:.4f}\")\\n\\n", "print(\"Entrenamiento completado!\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Curvas de pérdida\\n", "plt.figure(figsize=(10, 5))\\n", "plt.plot(losses, 'b-')\\n", "plt.xlabel('Epoch')\\n", "plt.ylabel('Loss')\\n", "plt.title('Training Loss vs Epoch')\\n", "plt.grid(True)\\n", "plt.savefig('loss_curves.png', dpi=150)\\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Evaluación - BLEU Score" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def bleu(ref, hyp):\\n", " rw, hw = ref.lower().split(), hyp.lower().split()\\n", " if not hw: return 0.0\\n", " m = sum(1 for w in hw if w in rw)\\n", " p = m / len(hw)\\n", " bp = min(1.0, np.exp(1 - len(rw) / max(len(hw), 1)))\\n", " return bp * p\\n\\n", "model.eval()\\n", "test_samples = [\\n", " (\"hello\", \"hola\"), (\"thank you\", \"gracias\"),\\n", " (\"i am a student\", \"soy estudiante\"),\\n", " (\"where is the library\", \"donde esta la biblioteca\"),\\n", " (\"the exam is difficult\", \"el examen es dificil\"),\\n", "]\\n\\n", "total_bleu = 0\\n", "test_results = []\\n\\n", "with torch.no_grad():\\n", " for src_text, tgt_text in test_samples:\\n", " enc_in = torch.tensor([src_v.encode(src_text, MAX_LEN)]).to(device)\\n", " _, h, c = enc(enc_in)\\n", " dec_in = torch.tensor([tgt_v.w2i[SOS]]).to(device)\\n", " result = []\\n", " for _ in range(MAX_LEN):\\n", " o, h, c = dec(dec_in.unsqueeze(1), h, c)\\n", " top = o.argmax(1).item()\\n", " if top == tgt_v.w2i[EOS] or top == tgt_v.w2i[PAD]:\\n", " break\\n", " result.append(top)\\n", " dec_in = torch.tensor([top]).to(device)\\n", " translated = tgt_v.decode(result)\\n", " b = bleu(tgt_text, translated)\\n", " total_bleu += b\\n", " test_results.append((src_text, tgt_text, translated, b))\\n", " print(f\"{src_text} -> {translated} (BLEU: {b:.2f})\")\\n\\n", "avg_bleu = total_bleu / len(test_samples)\\n", "print(f\"\\nBLEU Score: {avg_bleu:.2f}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Guardar modelo" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "torch.save({\\n", " 'model': model.state_dict(),\\n", " 'src_vocab': src_v.w2i,\\n", " 'tgt_vocab': tgt_v.w2i,\\n", " 'src_idx2word': src_v.i2w,\\n", " 'tgt_idx2word': tgt_v.i2w,\\n", "}, 'translator.pt')\\n", "print(\"Modelo guardado: translator.pt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Interfaz Gradio" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gradio as gr\\n\\n", "def translate(text, direction=\"EN->ES\"):\\n", " if not text.strip(): return \"\"\\n", " model.eval()\\n", " with torch.no_grad():\\n", " ml = MAX_LEN\\n", " if direction == \"ES->EN\":\\n", " enc_in = torch.tensor([tgt_v.encode(text, ml)]).to(device)\\n", " _, h, c = enc(enc_in)\\n", " else:\\n", " enc_in = torch.tensor([src_v.encode(text, ml)]).to(device)\\n", " _, h, c = enc(enc_in)\\n", " dec_in = torch.tensor([tgt_v.w2i[SOS] if direction==\"EN->EN\" else src_v.w2i[SOS]]).to(device)\\n", " result = []\\n", " for _ in range(ml):\\n", " o, h, c = dec(dec_in.unsqueeze(1), h, c)\\n", " top = o.argmax(1).item()\\n", " if top == tgt_v.w2i[EOS] or top == tgt_v.w2i[PAD]:\\n", " break\\n", " result.append(top)\\n", " dec_in = torch.tensor([top]).to(device)\\n", " return src_v.decode(result) if direction==\"ES->EN\" else tgt_v.decode(result)\\n\\n", "with gr.Blocks() as demo:\\n", " gr.Markdown(\"# Traductor RNN UAC\\n## Seq2Seq LSTM - CRISP-ML(Q)\")\\n", " with gr.Row():\\n", " inp = gr.Textbox(label=\"Texto\")\\n", " direction = gr.Radio([\"EN->ES\", \"ES->EN\"], label=\"Dirección\")\\n", " btn = gr.Button(\"Traducir\")\\n", " out = gr.Textbox(label=\"Traducción\")\\n", " btn.click(fn=translate, inputs=[inp, direction], outputs=out)\\n\\n", "demo.launch()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 4 }