import re
from collections import Counter

# Reserved vocabulary entries. They must be two *distinct* keys: a dict literal
# that repeats a key keeps only the last value, which would leave id 0 unused
# and make padding indistinguishable from unknown words.
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

# Anything that is not a lowercase ASCII letter, a digit, or a space.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9 ]")


def clean_text(text):
    """Lowercase *text* and drop every character outside ``a-z``, ``0-9``, space.

    Args:
        text: The raw string to normalize.

    Returns:
        The lowercased string with all other characters removed. Spaces are
        kept as-is (runs of spaces are not collapsed).
    """
    return _NON_ALNUM_RE.sub("", text.lower())


def build_vocab(df, min_freq=2):
    """Build a word -> integer-id vocabulary from the ``"question"`` column.

    Args:
        df: Any object where ``df["question"]`` yields strings (presumably a
            pandas DataFrame -- confirm against the caller). Each string is
            split on whitespace; no further cleaning is applied here.
        min_freq: Minimum number of occurrences (inclusive) a word needs in
            order to receive its own id.

    Returns:
        A dict mapping ``PAD_TOKEN`` to 0, ``UNK_TOKEN`` to 1, and every
        sufficiently frequent word to consecutive ids starting at 2, in order
        of first appearance.
    """
    counts = Counter()
    for question in df["question"]:
        counts.update(question.split())

    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for word, count in counts.items():
        # Inclusive threshold: a word seen exactly ``min_freq`` times is kept.
        if count >= min_freq:
            # Ids stay contiguous because each new word takes the next slot.
            vocab[word] = len(vocab)
    return vocab


def encode_question(q, vocab, max_len=20):
    """Encode a question as a list of exactly ``max_len`` token ids.

    Args:
        q: The question string; it is split on whitespace.
        vocab: A mapping as produced by ``build_vocab``; it must contain
            ``PAD_TOKEN`` and ``UNK_TOKEN``.
        max_len: Length of the returned list.

    Returns:
        A list of ids. Words missing from *vocab* map to the ``UNK_TOKEN`` id;
        longer inputs are truncated and shorter ones are right-padded with the
        ``PAD_TOKEN`` id.

    Raises:
        KeyError: If *vocab* lacks ``PAD_TOKEN`` or ``UNK_TOKEN``.
    """
    unk_id = vocab[UNK_TOKEN]
    pad_id = vocab[PAD_TOKEN]
    ids = [vocab.get(word, unk_id) for word in q.split()][:max_len]
    # A non-positive multiplier yields an empty list, so full-length input
    # receives no padding.
    return ids + [pad_id] * (max_len - len(ids))