Spaces:

robertkm23
/

chatbot-gru

Sleeping

App Files Files Community

Robert Kenzo Medina Monsalve committed on May 25, 2025

Commit

bb3ba7a

1 Parent(s): f2ef497

Deploying gru chatbot: only code w/o weights

Browse files

Files changed (7) hide show

src/.streamlit/config.toml +3 -0
src/00_prepare_cornell.py +53 -0
src/formatted_movie_lines_exporter.py +50 -0
src/requirements.txt +74 -0
src/serve_gru.py +81 -0
src/streamlit_app.py +21 -39
src/tokenizer.json +0 -0

src/.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+[server]
+headless = true
+enableCORS = false

src/00_prepare_cornell.py ADDED Viewed

	@@ -0,0 +1,53 @@

+"""
+Descarga el corpus Cornell, extrae pares (Q,A) limpios
+y guarda en `data/pairs.tsv` (tab-separated).
+"""
+import os, zipfile, urllib.request, re, random, csv, json
+from pathlib import Path
+random.seed(42)
+DATA_DIR = Path("data"); DATA_DIR.mkdir(exist_ok=True)
+ZIP_URL  = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
+ZIP_PATH = DATA_DIR/"cornell.zip"
+if not (DATA_DIR/"cornell movie-dialogs corpus").exists():
+    print("▸ descargando corpus …")
+    urllib.request.urlretrieve(ZIP_URL, ZIP_PATH)
+    with zipfile.ZipFile(ZIP_PATH) as z: z.extractall(DATA_DIR)
+    ZIP_PATH.unlink()
+BASE = DATA_DIR/"cornell movie-dialogs corpus"
+lines_f = BASE/"movie_lines.txt"
+conv_f  = BASE/"movie_conversations.txt"
+# ─── lines a diccionario ─────────────────────────────────────
+id2line = {}
+with open(lines_f, encoding="latin-1") as f:
+    for row in f:
+        _id, *_rest, txt = row.strip().split(" +++$+++ ")
+        id2line[_id] = txt
+# ─── conversaciones → pares Q,A ──────────────────────────────
+pairs = []
+with open(conv_f, encoding="latin-1") as f:
+    for row in f:
+        line_ids = eval(row.strip().split(" +++$+++ ")[-1])
+        for i in range(len(line_ids)-1):
+            q, a = id2line[line_ids[i]], id2line[line_ids[i+1]]
+            pairs.append((q, a))
+# limpieza ligera
+def norm(t:str)->str:
+    t = re.sub(r"[^a-zA-Z0-9.!?]+", " ", t.lower())
+    return re.sub(r"\s+", " ", t).strip()
+pairs = [(norm(q), norm(a)) for q,a in pairs
+         if 2<=len(q.split())<=20 and 2<=len(a.split())<=20]
+random.shuffle(pairs)
+with open(DATA_DIR/"pairs.tsv","w",newline='',encoding="utf-8") as f:
+    wr = csv.writer(f, delimiter="\t")
+    wr.writerows(pairs)
+print(f"Pairs listos → {len(pairs):,} líneas.")

src/formatted_movie_lines_exporter.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# formatted_movie_lines_exporter.py ─ generates formatted_movie_lines.txt ───
+import os, re, zipfile, urllib.request, csv, unicodedata
+# Source archive of the Cornell Movie-Dialogs Corpus.
+URL  = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
+# Local name of the downloaded archive (removed after extraction).
+ZIP  = "cornell.zip"
+# Folder whose presence is checked before downloading; the corpus files are read from it.
+ROOT = "cornell movie-dialogs corpus"
+OUT  = "formatted_movie_lines.txt"   # ← the file we use afterwards
+MAX_SENT = 20                        # utterances with this many tokens or more are discarded
+def ascii(txt):
+    return "".join(c for c in unicodedata.normalize("NFD", txt)
+                   if unicodedata.category(c) != "Mn")
+def norm(s):
+    s = ascii(re.sub(r"[^a-zA-Z0-9?!.]+", " ", s.lower()))
+    s = re.sub(r"([?.!])", r" \1 ", s)
+    return re.sub(r"\s+", " ", s).strip()
+# ─── descarga y des-zip ──────────────────────────────────────
+if not os.path.isdir(ROOT):
+    print("⏬ descargando corpus…")
+    urllib.request.urlretrieve(URL, ZIP)
+    with zipfile.ZipFile(ZIP) as z: z.extractall()
+    os.remove(ZIP)
+# ─── lee líneas y conversaciones ─────────────────────────────
+print("🔧 procesando…")
+lines = {}
+with open(os.path.join(ROOT,"movie_lines.txt"),encoding="latin-1") as f:
+    for ln in f:
+        parts = ln.strip().split(" +++$+++ ")
+        lines[parts[0]] = norm(parts[-1])
+pairs = []
+with open(os.path.join(ROOT,"movie_conversations.txt"),
+          encoding="latin-1") as f:
+    for conv in f:
+        ids = eval(conv.strip().split(" +++$+++ ")[-1])
+        for a,b in zip(ids,ids[1:]):
+            q, r = lines[a], lines[b]
+            if (2<=len(q.split())<MAX_SENT and
+                2<=len(r.split())<MAX_SENT):
+                pairs.append((q,r))
+# ─── guarda en TSV (pregunta[TAB]respuesta) ──────────────────
+with open(OUT,"w",encoding="utf-8",newline="") as f:
+    wr = csv.writer(f,delimiter='\t')
+    wr.writerows(pairs)
+print(f"✅ creado {OUT} con {len(pairs):,} pares")

src/requirements.txt ADDED Viewed

	@@ -0,0 +1,74 @@

+absl-py==2.2.2
+altair==5.5.0
+astunparse==1.6.3
+attrs==25.3.0
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.4.26
+charset-normalizer==3.4.2
+click==8.2.1
+colorama==0.4.6
+contourpy==1.3.2
+cycler==0.12.1
+flatbuffers==25.2.10
+fonttools==4.58.0
+gast==0.6.0
+gitdb==4.0.12
+GitPython==3.1.44
+google-auth==2.40.2
+google-auth-oauthlib==1.2.2
+google-pasta==0.2.0
+grpcio==1.71.0
+h5py==3.13.0
+idna==3.10
+Jinja2==3.1.6
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+kiwisolver==1.4.8
+libclang==18.1.1
+Markdown==3.8
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.7.5
+mdurl==0.1.2
+ml-dtypes==0.2.0
+narwhals==1.40.0
+numpy==1.23.5
+oauthlib==3.2.2
+opt_einsum==3.4.0
+packaging==24.2
+pandas==2.2.3
+pillow==10.4.0
+protobuf==4.25.7
+pyarrow==20.0.0
+pyasn1==0.6.1
+pyasn1-modules==0.4.2
+pydeck==0.9.1
+Pygments==2.19.1
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+pytz==2025.2
+referencing==0.36.2
+requests==2.32.3
+requests-oauthlib==2.0.0
+rich==13.9.4
+rpds-py==0.25.1
+rsa==4.9.1
+setuptools==65.5.1
+six==1.17.0
+smmap==5.0.2
+streamlit==1.33.0
+tenacity==8.5.0
+tensorboard==2.15.2
+tensorboard-data-server==0.7.2
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==3.1.0
+toml==0.10.2
+tornado==6.5.1
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+watchdog==6.0.0
+Werkzeug==3.1.3
+wheel==0.38.4
+wrapt==1.14.1

src/serve_gru.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# serve_gru.py ────────────────────────────────────────────────
+import re, numpy as np, tensorflow as tf
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.text import tokenizer_from_json
+# Relative paths of the saved model and of the tokenizer JSON loaded below.
+MODEL_PATH, TOK_PATH = "chatbot_seq2seq.keras", "tokenizer.json"
+# Length passed to pad_sequences in _pad(); also the default cap on reply tokens.
+MAXLEN    = 22
+# Marker tokens wrapped around the user message; their ids start/stop decoding in reply().
+START, END = "<start>", "<end>"
+# ── utilidades ------------------------------------------------
+def _norm(s: str) -> str:
+    s = re.sub(r"[^a-zA-Z0-9?!.]+", " ", s.lower())
+    s = re.sub(r"([?.!])", r" \1 ", s)
+    return re.sub(r"\s+", " ", s).strip()
+def _pad(seq):
+    return tf.keras.preprocessing.sequence.pad_sequences(
+        seq, maxlen=MAXLEN, padding="post"
+    )
+# ── load model and tokenizer ----------------------------------
+# Runs at import time, so importing this module loads both files from disk.
+print("‣ cargando modelo y tokenizer…", end="", flush=True)
+model = load_model(MODEL_PATH)
+with open(TOK_PATH, encoding="utf-8") as f:
+    tok = tokenizer_from_json(f.read())
+# Layers are looked up by name; get_layer fails if the saved model lacks any of them.
+emb_layer = model.get_layer("emb")
+enc_gru   = model.get_layer("enc_gru")
+dec_gru   = model.get_layer("dec_gru")
+dense     = model.get_layer("dense")
+# Encoder sub-model: first model input → second output of enc_gru
+# (presumably its final hidden state — TODO confirm the layer uses return_state).
+enc_model = tf.keras.Model(model.input[0], enc_gru.output[1])
+# The decoder GRU's cell is used directly so _step() can decode one token at a time.
+dec_cell  = dec_gru.cell
+# Ids of the special tokens; each lookup raises KeyError if the token is not in the vocabulary.
+UNK_ID    = tok.word_index["<unk>"]
+START_ID  = tok.word_index[START]
+END_ID    = tok.word_index[END]
+print(" listo 🟢")
+# ── paso único del decoder ------------------------------------
+def _step(tok_id, state):
+    # token → embedding
+    x = tf.constant([[tok_id]], dtype=tf.int32)    # (1,1)
+    x = emb_layer(x)                                # (1,1,emb)
+    x = tf.squeeze(x, axis=1)                       # (1,emb)
+    h, _ = dec_cell(x, states=state)                # (1,units)
+    logits = dense(h)[0].numpy()                    # (vocab,)
+    logits[UNK_ID] = -1e9                           # nunca <unk>
+    return logits, [h]
+# ── función de inferencia greedy -----------------------------
+def reply(msg: str, max_len: int = MAXLEN) -> str:
+    # normaliza y codifica
+    seq   = _pad(tok.texts_to_sequences([f"{START} {_norm(msg)} {END}"]))
+    h_enc = enc_model.predict(seq, verbose=0)       # (1,units)
+    state = [tf.convert_to_tensor(h_enc)]           # [(1,units)]
+    tok_id, out_ids = START_ID, []
+    for _ in range(max_len):
+        logits, state = _step(tok_id, state)
+        # greedy: la más probable
+        tok_id = int(np.argmax(logits))
+        # condiciones de parada
+        if tok_id in (END_ID, START_ID):
+            break
+        if len(out_ids) >= 2 and tok_id == out_ids[-1] == out_ids[-2]:
+            break
+        out_ids.append(tok_id)
+    # reconstruye texto
+    return " ".join(tok.index_word[i] for i in out_ids) or "(sin respuesta)"
+# ── demo CLI (opcional) ---------------------------------------
+if __name__ == "__main__":
+    while True:
+        q = input("Tú: ").strip()
+        if not q: continue
+        print("Bot:", reply(q))

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,22 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+from serve_gru import reply
+st.set_page_config(page_title="Chatbot GRU", page_icon="🤖")
+st.title("💬 Chatbot GRU (Cornell Movie Dialogs)")
+# Inicializa historial
+if "history" not in st.session_state:
+    st.session_state.history = []
+# Campo de chat integrado
+msg = st.chat_input("Escribe tu mensaje...")
+if msg:
+    # Añade mensaje del usuario
+    st.session_state.history.append(("user", msg))
+    # Obtiene respuesta del modelo
+    bot_resp = reply(msg)
+    st.session_state.history.append(("assistant", bot_resp))
+# Renderiza el chat
+for role, text in st.session_state.history:
+    st.chat_message(role).markdown(text)

src/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff