# vqa_project/data/preprocess.py
# Uploaded by PRUTHVIn: "Upload folder using huggingface_hub"
# (commit 364daa0, verified)
import re
from collections import Counter
def clean_text(text):
    """Normalize *text* for tokenization.

    The string is lowercased, then every character that is not an ASCII
    lowercase letter, a digit, or a space is removed.  Runs of spaces are
    left untouched.
    """
    lowered = text.lower()
    # Drop everything outside the [a-z0-9 ] whitelist (punctuation, tabs, ...).
    cleaned = re.sub(r"[^a-z0-9 ]", "", lowered)
    return cleaned
def build_vocab(df, min_freq=2):
    """Build a word -> index vocabulary from the questions in *df*.

    Args:
        df: Any object for which ``df["question"]`` yields an iterable of
            strings (presumably a pandas DataFrame -- confirm with caller).
            Each question is split on whitespace; no cleaning is done here.
        min_freq: Minimum number of occurrences a word needs in order to be
            included.  Words seen exactly ``min_freq`` times are kept.

    Returns:
        A dict mapping words to consecutive integer ids.  ``"<PAD>"`` is
        always 0 and ``"<UNK>"`` is always 1; remaining words follow in
        first-seen order starting at 2.
    """
    vocab = {"<PAD>": 0, "<UNK>": 1}

    counter = Counter()
    for question in df["question"]:
        counter.update(question.split())

    for word, count in counter.items():
        # `min_freq` is an inclusive threshold.  Also skip words that collide
        # with the reserved tokens so their fixed indices are never clobbered.
        if count >= min_freq and word not in vocab:
            vocab[word] = len(vocab)

    return vocab
def encode_question(q, vocab, max_len=20):
    """Convert question string *q* into a list of vocabulary ids.

    The question is split on whitespace and each token is looked up in
    *vocab*, with unknown tokens mapped to the ``"<UNK>"`` id.  The result
    is cut to ``max_len`` entries, or right-padded with the ``"<PAD>"`` id
    when the question is shorter than ``max_len``.
    """
    ids = []
    for token in q.split():
        ids.append(vocab.get(token, vocab["<UNK>"]))

    # A negative repeat count yields an empty list, so over-long questions
    # get no padding and are simply truncated by the slice below.
    padding = [vocab["<PAD>"]] * (max_len - len(ids))
    return ids[:max_len] + padding