File size: 305 Bytes
1e5f3d4
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
import re

def clean_text(text):
    """Return *text* lowercased with all disallowed characters removed.

    Only the ASCII letters ``a-z``, the digits ``0-9`` and the space
    character survive; every other character (punctuation, tabs, newlines,
    non-ASCII letters, ...) is deleted outright rather than replaced, so
    the words on either side of a removed character are joined together.
    """
    # Lowercasing happens first so that uppercase ASCII letters are kept
    # (as lowercase) instead of being stripped by the pattern below.
    disallowed = re.compile(r"[^a-z0-9 ]")
    return disallowed.sub("", text.lower())

def encode_question(q, vocab, max_len=20):
    """Encode the whitespace-separated words of *q* as a fixed-length list.

    Each word is looked up in *vocab*; words that are missing map to
    ``vocab["<UNK>"]``. The resulting sequence is cut to at most
    *max_len* entries and, if shorter, right-padded with
    ``vocab["<PAD>"]``. For a non-negative *max_len* the returned list
    has exactly *max_len* entries.

    Args:
        q: The question string; it is split on whitespace as-is (no
            lowercasing or cleaning is done here).
        vocab: Mapping from word to id. Must contain ``"<PAD>"``, and
            ``"<UNK>"`` whenever *q* contains at least one word.
        max_len: Target length of the encoded sequence.

    Returns:
        A list of the values stored in *vocab*.

    Raises:
        KeyError: If a required special token is absent from *vocab*.
    """
    ids = []
    for word in q.split():
        ids.append(vocab.get(word, vocab["<UNK>"]))

    truncated = ids[:max_len]
    # A negative repeat count produces an empty list, so inputs longer
    # than max_len simply receive no padding.
    padding = [vocab["<PAD>"]] * (max_len - len(ids))
    return truncated + padding