| import re | |
| from collections import Counter | |
def clean_text(text):
    """Normalize ``text`` to lowercase ASCII letters, digits and spaces.

    The input is lowercased first; every character that is then not one of
    ``a``-``z``, ``0``-``9`` or a plain space is deleted (not replaced), so
    punctuation, tabs, newlines and non-ASCII characters simply vanish.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with all disallowed characters removed.
    """
    # Negated character class: matches anything that is NOT kept.
    disallowed = re.compile(r"[^a-z0-9 ]")
    lowered = text.lower()
    return disallowed.sub("", lowered)
def build_vocab(df, min_freq=2):
    """Build a word-to-index vocabulary from the questions in ``df``.

    Every entry of ``df["question"]`` is split on whitespace and word
    occurrences are counted across all entries. Words seen at least
    ``min_freq`` times receive consecutive indices starting at 2, in order
    of first appearance. Indices 0 and 1 are reserved for ``"<PAD>"`` and
    ``"<UNK>"``.

    Args:
        df: Object whose ``"question"`` item is an iterable of strings
            (presumably a pandas DataFrame -- confirm against callers; a
            plain dict of lists works as well).
        min_freq: Minimum number of occurrences a word needs in order to be
            included. A word occurring exactly ``min_freq`` times is kept.

    Returns:
        A dict mapping each retained word (plus the two special tokens) to
        its integer index.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}

    counter = Counter()
    for question in df["question"]:
        counter.update(question.split())

    # Counter keeps first-insertion order, so index assignment is
    # deterministic for a given input order.
    for word, count in counter.items():
        # ``min_freq`` is an inclusive lower bound: a word seen exactly
        # ``min_freq`` times qualifies.
        if count < min_freq:
            continue
        # Never let a corpus token that happens to be spelled like a special
        # token overwrite the reserved PAD/UNK indices.
        if word in vocab:
            continue
        # All existing indices are 0..len(vocab)-1, so len(vocab) is the
        # next free one.
        vocab[word] = len(vocab)
    return vocab
def encode_question(q, vocab, max_len=20):
    """Convert a question string into a fixed-length list of vocab indices.

    The question is split on whitespace; each word is looked up in ``vocab``
    and words that are missing map to the ``"<UNK>"`` index. The result is
    cut to ``max_len`` entries when longer and right-padded with the
    ``"<PAD>"`` index when shorter.

    Args:
        q: The question text.
        vocab: Mapping from word to integer index; must contain the
            ``"<PAD>"`` key, and ``"<UNK>"`` whenever ``q`` has any words.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` integer indices.
    """
    ids = []
    for word in q.split():
        ids.append(vocab.get(word, vocab["<UNK>"]))

    # A non-positive repeat count yields an empty list, so over-long
    # questions receive no padding at all.
    padding = [vocab["<PAD>"]] * (max_len - len(ids))
    return ids[:max_len] + padding