File size: 666 Bytes
364daa0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re
from collections import Counter

def clean_text(text):
    """Normalize *text* for tokenization.

    The input is lowercased, then every character that is not an ASCII
    lowercase letter, an ASCII digit, or a space is deleted (not replaced),
    so punctuation, tabs, newlines and non-ASCII letters all disappear.
    """
    strip_disallowed = re.compile(r"[^a-z0-9 ]").sub
    lowered = text.lower()
    return strip_disallowed("", lowered)

def build_vocab(df, min_freq=2):
    """Build a token-to-id vocabulary from the ``"question"`` entries of *df*.

    Every question is split on whitespace and token occurrences are counted
    across all questions. Tokens that occur at least ``min_freq`` times get
    consecutive ids starting at 2, in order of first appearance. Ids 0 and 1
    are reserved for ``"<PAD>"`` and ``"<UNK>"``.

    Args:
        df: Any object where ``df["question"]`` iterates over strings
            (for example a pandas DataFrame or a plain dict of lists).
        min_freq: Minimum number of occurrences (inclusive) a token needs
            to be added to the vocabulary.

    Returns:
        dict mapping each kept token (plus the two reserved tokens) to its
        integer id.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}
    counter = Counter()

    for q in df["question"]:
        counter.update(q.split())

    for word, count in counter.items():
        # The threshold is inclusive: a token seen exactly ``min_freq`` times
        # is kept. Reserved tokens that happen to appear in the data must not
        # be reassigned, otherwise the PAD/UNK ids would be corrupted.
        if count >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

def encode_question(q, vocab, max_len=20):
    """Turn question *q* into a list of exactly ``max_len`` token ids.

    The question is split on whitespace; each token is looked up in
    *vocab*, falling back to the ``"<UNK>"`` id when absent. Sequences
    longer than ``max_len`` are truncated, shorter ones are right-padded
    with the ``"<PAD>"`` id.
    """
    ids = []
    for token in q.split():
        ids.append(vocab.get(token, vocab["<UNK>"]))

    # A negative shortfall yields an empty padding list, so over-long
    # inputs are simply truncated by the slice below.
    shortfall = max_len - len(ids)
    padding = [vocab["<PAD>"]] * shortfall
    return ids[:max_len] + padding