Spaces:
Sleeping
Sleeping
Commit ·
ac6e07e
1
Parent(s): 72c41ce
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,25 +7,32 @@ import torch.nn.functional as F
|
|
| 7 |
import gradio as gr
|
| 8 |
import nltk
|
| 9 |
from nltk.tokenize import word_tokenize
|
| 10 |
-
import pandas as pd
|
| 11 |
from collections import Counter
|
| 12 |
|
| 13 |
-
# -------------
|
| 14 |
nltk.download(['punkt', 'punkt_tab'], quiet=True)
|
|
|
|
| 15 |
DEVICE = torch.device("cpu")
|
| 16 |
|
| 17 |
-
CACHE_FILE = "ubuntu_data_cache.pt"
|
| 18 |
-
MODEL_FILE = "ubuntu_chatbot_best.pt"
|
|
|
|
| 19 |
|
| 20 |
-
# -------------
|
| 21 |
-
def tokenize(text):
|
| 22 |
return word_tokenize(text.lower())
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
class Vocab:
|
| 26 |
def __init__(self):
|
| 27 |
-
self.word2idx = {'<PAD>':0, '<SOS>':1, '<EOS>':2, '<UNK>':3}
|
| 28 |
-
self.idx2word = {0:'<PAD>', 1:'<SOS>', 2:'<EOS>', 3:'<UNK>'}
|
| 29 |
|
| 30 |
def __len__(self):
|
| 31 |
return len(self.word2idx)
|
|
@@ -42,12 +49,22 @@ class Vocab:
|
|
| 42 |
self.word2idx[w] = idx
|
| 43 |
self.idx2word[idx] = w
|
| 44 |
|
| 45 |
-
|
|
|
|
| 46 |
if not os.path.exists(CACHE_FILE):
|
| 47 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
| 48 |
|
|
|
|
| 49 |
cache = torch.load(CACHE_FILE, map_location="cpu", weights_only=False)
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
vocab = cache["vocab"]
|
| 52 |
|
| 53 |
# safety: rebuild idx2word if needed
|
|
@@ -59,45 +76,77 @@ SOS_IDX = vocab.word2idx["<SOS>"]
|
|
| 59 |
EOS_IDX = vocab.word2idx["<EOS>"]
|
| 60 |
UNK_IDX = vocab.word2idx["<UNK>"]
|
| 61 |
|
| 62 |
-
|
|
|
|
| 63 |
class Encoder(nn.Module):
|
| 64 |
def __init__(self):
|
| 65 |
super().__init__()
|
| 66 |
-
self.emb = nn.Embedding(len(vocab), 256, padding_idx=
|
| 67 |
-
|
| 68 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
def forward(self, x):
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
class Decoder(nn.Module):
|
| 76 |
def __init__(self):
|
| 77 |
super().__init__()
|
| 78 |
-
self.emb = nn.Embedding(len(vocab), 256, padding_idx=
|
| 79 |
-
self.
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
self.out = nn.Linear(512, len(vocab))
|
| 82 |
self.norm = nn.LayerNorm(512)
|
| 83 |
|
| 84 |
def forward(self, inp, hidden, enc_out):
|
| 85 |
"""
|
| 86 |
-
inp: [B, 1]
|
| 87 |
-
hidden: [B, 512]
|
| 88 |
enc_out:[B, T, 512]
|
| 89 |
"""
|
| 90 |
-
e = self.emb(inp)
|
| 91 |
|
| 92 |
-
#
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
x = torch.cat((e, ctx), dim=-1) # [B,1,768]
|
| 98 |
-
out, hidden = self.gru(x, hidden.unsqueeze(0)) # out:[B,1,512]
|
| 99 |
-
out = self.norm(out.squeeze(1)) # [B,512]
|
| 100 |
-
return self.out(out), hidden.squeeze(0) # logits:[B,vocab], hidden:[B,512]
|
| 101 |
|
| 102 |
class Model(nn.Module):
|
| 103 |
def __init__(self):
|
|
@@ -107,84 +156,129 @@ class Model(nn.Module):
|
|
| 107 |
|
| 108 |
def forward(self, src, tgt, tf=0.5):
|
| 109 |
enc_out, h = self.encoder(src)
|
| 110 |
-
dec_in = tgt[:, 0]
|
| 111 |
outs = []
|
| 112 |
for t in range(1, tgt.size(1)):
|
| 113 |
-
dec_in = dec_in.unsqueeze(1)
|
| 114 |
out, h = self.decoder(dec_in, h, enc_out)
|
| 115 |
outs.append(out)
|
| 116 |
use_tf = random.random() < tf
|
| 117 |
dec_in = tgt[:, t] if use_tf else out.argmax(-1).detach()
|
| 118 |
return torch.stack(outs, dim=1)
|
| 119 |
|
| 120 |
-
# ----------------- load trained weights -----------------
|
| 121 |
-
model = Model().to(DEVICE)
|
| 122 |
|
|
|
|
| 123 |
if not os.path.exists(MODEL_FILE):
|
| 124 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
| 125 |
|
|
|
|
| 126 |
ckpt = torch.load(MODEL_FILE, map_location="cpu")
|
| 127 |
model.load_state_dict(ckpt["model"])
|
| 128 |
model.eval()
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
| 132 |
"""
|
| 133 |
-
src_tensor: [1, T] LongTensor
|
| 134 |
-
|
| 135 |
"""
|
| 136 |
model.eval()
|
| 137 |
with torch.no_grad():
|
| 138 |
-
enc_out, h = model.encoder(src_tensor)
|
| 139 |
-
|
|
|
|
|
|
|
| 140 |
|
| 141 |
for _ in range(max_len):
|
| 142 |
candidates = []
|
| 143 |
-
|
|
|
|
|
|
|
| 144 |
if seq[-1] == EOS_IDX:
|
| 145 |
-
candidates.append((
|
| 146 |
continue
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
for val, idx in zip(top.values, top.indices):
|
| 151 |
token = idx.item()
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
break
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
words = [
|
| 163 |
vocab.idx2word.get(i, "<UNK>")
|
| 164 |
-
for i in best_seq[1:]
|
| 165 |
if i not in (SOS_IDX, EOS_IDX)
|
| 166 |
]
|
| 167 |
return " ".join(words)
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
ids = [SOS_IDX] + [vocab.word2idx.get(w, UNK_IDX) for w in tokens] + [EOS_IDX]
|
| 172 |
src = torch.tensor([ids], dtype=torch.long, device=DEVICE)
|
| 173 |
-
reply =
|
| 174 |
if not reply.strip():
|
| 175 |
return "I don't know."
|
| 176 |
return reply
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
|
|
|
| 180 |
reply = generate_reply(message)
|
| 181 |
-
|
| 182 |
-
return history, ""
|
| 183 |
|
| 184 |
demo = gr.ChatInterface(
|
| 185 |
-
fn=
|
| 186 |
title="Ubuntu Chatbot (Seq2Seq + GRU + Attention)",
|
| 187 |
-
description="
|
| 188 |
)
|
| 189 |
|
| 190 |
if __name__ == "__main__":
|
|
|
|
| 7 |
import gradio as gr
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# ------------- basic setup -------------
# Fetch the punkt tokenizer models once at startup (quiet keeps Space logs clean).
nltk.download(['punkt', 'punkt_tab'], quiet=True)

# Inference runs entirely on CPU.
DEVICE = torch.device("cpu")

CACHE_FILE = "ubuntu_data_cache.pt"  # from your notebook
MODEL_FILE = "ubuntu_chatbot_best.pt"  # trained model checkpoint
|
| 19 |
+
|
| 20 |
|
| 21 |
+
# ------------- tokenization + helpers -------------
|
| 22 |
+
def tokenize(text: str):
    """Lower-case *text* and split it into NLTK word tokens."""
    lowered = text.lower()
    return word_tokenize(lowered)
|
| 24 |
|
| 25 |
+
|
| 26 |
+
def reverse(sentence: str) -> str:
    """Reverse word order – same trick used in training."""
    words = sentence.split()
    words.reverse()
    return " ".join(words)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ------------- Vocab class (must match training) -------------
|
| 32 |
class Vocab:
    """Bidirectional token <-> id mapping.

    The four special tokens occupy ids 0-3 in a fixed order; the trained
    checkpoint's embedding rows are indexed by these ids, so the layout
    must stay identical to the one used during training.
    """

    def __init__(self):
        self.word2idx = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.idx2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}

    def __len__(self):
        # Vocabulary size, including the four special tokens.
        return len(self.word2idx)
|
|
|
|
| 49 |
self.word2idx[w] = idx
|
| 50 |
self.idx2word[idx] = w
|
| 51 |
|
| 52 |
+
|
| 53 |
+
# ------------- load vocab from cache -------------
if not os.path.exists(CACHE_FILE):
    raise FileNotFoundError(
        f"{CACHE_FILE} not found in Space. Upload the same file you used locally."
    )

# cache structure in your notebook: {'data': pairs, 'vocab': vocab}
# NOTE(review): weights_only=False unpickles arbitrary objects — acceptable only
# because this file is uploaded by the Space owner, never by end users.
cache = torch.load(CACHE_FILE, map_location="cpu", weights_only=False)

# Fail fast with a descriptive message if the wrong file was uploaded.
if not isinstance(cache, dict) or "vocab" not in cache:
    raise RuntimeError(
        f"{CACHE_FILE} does not contain a 'vocab' key. "
        f"Found keys: {list(cache.keys()) if isinstance(cache, dict) else type(cache)}"
    )

vocab = cache["vocab"]
|
| 69 |
|
| 70 |
# safety: rebuild idx2word if needed
|
|
|
|
| 76 |
# Resolve special-token ids once; reused by encoding and beam search below.
EOS_IDX = vocab.word2idx["<EOS>"]
UNK_IDX = vocab.word2idx["<UNK>"]
|
| 78 |
|
| 79 |
+
|
| 80 |
+
# ------------- model definitions (EXACTLY as in notebook) -------------
|
| 81 |
class Encoder(nn.Module):
    """Two-layer bidirectional GRU encoder.

    Submodule names (emb / gru / fc / norm) and every hyper-parameter must
    stay exactly as in the training notebook so the checkpoint loads.
    """

    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(len(vocab), 256, padding_idx=PAD_IDX)
        # 2-layer bidirectional GRU over the embedded sequence.
        self.gru = nn.GRU(
            input_size=256,
            hidden_size=512,
            num_layers=2,
            batch_first=True,
            dropout=0.3,
            bidirectional=True,
        )
        # Projects the concatenated directions (2 * 512 = 1024) back to 512.
        self.fc = nn.Linear(1024, 512)
        # Unused in forward(), but registered in the notebook — kept so the
        # checkpoint's state_dict keys still match.
        self.norm = nn.LayerNorm(512)

    def forward(self, x):
        """x: [B, T] token ids -> (outputs [B, T, 512], hidden [2, B, 512])."""
        embedded = self.emb(x)                      # [B, T, 256]
        outputs, hidden = self.gru(embedded)        # [B, T, 1024], [4, B, 512]

        # Fold the two directions of each timestep down to 512 dims.
        outputs = self.fc(outputs)                  # [B, T, 512]

        # hidden is [layers * dirs, B, H]; regroup as [layers, dirs, B, H]
        # and sum the forward/backward directions per layer.
        per_layer = hidden.view(2, 2, hidden.size(1), -1)
        hidden = per_layer.sum(dim=1)               # [2, B, 512]

        return outputs, hidden
|
| 111 |
+
|
| 112 |
|
| 113 |
class Decoder(nn.Module):
    """Single-step GRU decoder with dot-product attention over encoder states.

    Submodule names (emb / dropout / gru / attn / out / norm) must match the
    training notebook so the checkpoint's state_dict loads cleanly.
    """

    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(len(vocab), 256, padding_idx=PAD_IDX)
        self.dropout = nn.Dropout(0.3)
        # The GRU consumes [embedding ‖ attention context] = 256 + 512 dims.
        self.gru = nn.GRU(
            input_size=256 + 512,
            hidden_size=512,
            num_layers=2,
            batch_first=True,
        )
        self.attn = nn.Linear(512, 512)
        self.out = nn.Linear(512, len(vocab))
        self.norm = nn.LayerNorm(512)

    def forward(self, inp, hidden, enc_out):
        """
        inp:    [B, 1] token IDs for the current step
        hidden: [2, B, 512] previous hidden state (num_layers, batch, hidden)
        enc_out:[B, T, 512] projected encoder outputs
        returns (logits [B, vocab], hidden [2, B, 512])
        """
        embedded = self.dropout(self.emb(inp))              # [B, 1, 256]

        # Dot-product attention: score each encoder position against the
        # top-layer hidden state, softmax into weights, mix a context vector.
        keys = self.attn(enc_out)                           # [B, T, 512]
        query = hidden[-1].unsqueeze(1)                     # [B, 1, 512]
        scores = torch.bmm(query, keys.transpose(1, 2))     # [B, 1, T]
        weights = F.softmax(scores.squeeze(1), dim=-1)      # [B, T]
        ctx = torch.bmm(weights.unsqueeze(1), enc_out)      # [B, 1, 512]

        gru_input = torch.cat((embedded, ctx), dim=-1)      # [B, 1, 768]
        step_out, hidden = self.gru(gru_input, hidden)      # [B, 1, 512], [2, B, 512]
        normed = self.norm(step_out.squeeze(1))             # [B, 512]
        return self.out(normed), hidden
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
class Model(nn.Module):
|
| 152 |
def __init__(self):
|
|
|
|
| 156 |
|
| 157 |
    def forward(self, src, tgt, tf=0.5):
        """Teacher-forced sequence decoding (training/eval path).

        src: [B, T_src] source token ids
        tgt: [B, T_tgt] target token ids, position 0 being <SOS>
        tf:  probability of feeding the ground-truth token at each step
        Returns logits stacked over time: [B, T_tgt - 1, vocab].
        """
        enc_out, h = self.encoder(src)
        dec_in = tgt[:, 0]  # <SOS>
        outs = []
        for t in range(1, tgt.size(1)):
            dec_in = dec_in.unsqueeze(1)  # [B,1]
            out, h = self.decoder(dec_in, h, enc_out)
            outs.append(out)
            # Per-step coin flip: ground truth vs. the model's own (detached)
            # argmax prediction as the next decoder input.
            use_tf = random.random() < tf
            dec_in = tgt[:, t] if use_tf else out.argmax(-1).detach()
        return torch.stack(outs, dim=1)
|
| 168 |
|
|
|
|
|
|
|
| 169 |
|
| 170 |
+
# ------------- load trained model -------------
if not os.path.exists(MODEL_FILE):
    raise FileNotFoundError(
        f"{MODEL_FILE} not found in Space. Upload your ubuntu_chatbot_best.pt checkpoint."
    )

model = Model().to(DEVICE)
# Checkpoint layout: {'model': state_dict, ...} — see load_state_dict below.
ckpt = torch.load(MODEL_FILE, map_location="cpu")
model.load_state_dict(ckpt["model"])
model.eval()  # inference only: disables dropout

print("✅ Model and vocab loaded. Chatbot ready to serve 🚀")
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ------------- beam search (beam_generate_v2 from notebook) -------------
|
| 185 |
+
def beam_generate_v2(src_tensor, beam=5, max_len=50, alpha=0.7):
    """
    Beam-search decode one reply from the seq2seq model.

    src_tensor: [1, T] LongTensor with <SOS> ... <EOS>
    beam:       number of hypotheses kept after each step
    max_len:    hard cap on generated tokens
    alpha:      length penalty factor (GNMT-style normalization)
    Returns the best hypothesis as a space-joined string (may be empty).
    """
    model.eval()
    with torch.no_grad():
        enc_out, h = model.encoder(src_tensor.to(DEVICE))

        # Beam entry: (normalized_score, raw_score, hidden, sequence_ids)
        beams = [(0.0, 0.0, h, [SOS_IDX])]

        for _ in range(max_len):
            candidates = []

            for norm_score, raw_score, hid, seq in beams:
                # A finished hypothesis (last token is EOS) is carried over
                # unchanged so it still competes with growing ones.
                if seq[-1] == EOS_IDX:
                    candidates.append((norm_score, raw_score, hid, seq))
                    continue

                # One decoder step, fed with the hypothesis' last token.
                dec_in = torch.tensor([[seq[-1]]], device=DEVICE)
                out, new_h = model.decoder(dec_in, hid, enc_out)
                probs = F.log_softmax(out, dim=-1).squeeze(0)  # [vocab]

                # Repetition penalty: flat -2.0 log-prob on every token the
                # hypothesis already contains.
                for prev_token in set(seq):
                    probs[prev_token] -= 2.0

                # Over-sample candidates (beam + 5) so trigram blocking below
                # still leaves enough survivors.
                top = probs.topk(beam + 5)

                for val, idx in zip(top.values, top.indices):
                    token = idx.item()

                    # 3-gram blocking: drop a token that would repeat any
                    # trigram already present in this hypothesis.
                    if len(seq) >= 3:
                        new_trigram = tuple(seq[-2:] + [token])
                        existing_trigrams = set(
                            tuple(seq[i:i+3]) for i in range(len(seq) - 2)
                        )
                        if new_trigram in existing_trigrams:
                            continue

                    new_raw_score = raw_score + val.item()
                    new_seq = seq + [token]

                    # Length normalization so longer replies are not unfairly
                    # punished by their accumulated log-probs.
                    length_penalty = ((5 + len(new_seq)) ** alpha) / (6 ** alpha)
                    new_norm_score = new_raw_score / length_penalty

                    candidates.append((new_norm_score, new_raw_score, new_h, new_seq))

            # Keep the top `beam` hypotheses by normalized score.
            if not candidates:
                break

            candidates = sorted(candidates, key=lambda x: x[0], reverse=True)
            beams = candidates[:beam]

            # Early stop once every surviving beam has emitted EOS.
            if all(b[3][-1] == EOS_IDX for b in beams):
                break

        best_seq = beams[0][3]
        # Convert ids to words, dropping the SOS/EOS markers.
        words = [
            vocab.idx2word.get(i, "<UNK>")
            for i in best_seq[1:]
            if i not in (SOS_IDX, EOS_IDX)
        ]
        return " ".join(words)
|
| 258 |
|
| 259 |
+
|
| 260 |
+
# ------------- wrapper to go from user text → reply -------------
|
| 261 |
+
def generate_reply(user_text: str) -> str:
    """Turn raw user text into a model reply (mirrors the notebook pipeline)."""
    # Training reversed every input sentence, so inference must do the same.
    tokens = tokenize(reverse(user_text))
    ids = [SOS_IDX] + [vocab.word2idx.get(w, UNK_IDX) for w in tokens] + [EOS_IDX]
    src = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    reply = beam_generate_v2(src, beam=5, max_len=50)
    # Fall back to a canned answer when the beam produced nothing usable.
    return reply if reply.strip() else "I don't know."
|
| 271 |
|
| 272 |
+
|
| 273 |
+
# ------------- Gradio ChatInterface -------------
|
| 274 |
+
def respond(message, history):
|
| 275 |
reply = generate_reply(message)
|
| 276 |
+
return reply
|
|
|
|
| 277 |
|
| 278 |
# Chat UI wiring: ChatInterface calls respond(message, history) per turn.
demo = gr.ChatInterface(
    fn=respond,
    title="Ubuntu Chatbot (Seq2Seq + GRU + Attention)",
    description="A generative chatbot trained on Ubuntu dialogue pairs (seq2seq with attention)."
)
|
| 283 |
|
| 284 |
if __name__ == "__main__":
|