PRUTHVIn
/

vqa_project

Model card Files Files and versions

vqa_project / data /preprocess.py

PRUTHVIn's picture

Upload folder using huggingface_hub

364daa0 verified 4 days ago

history blame contribute delete

666 Bytes

	import re
	from collections import Counter

	def clean_text(text):
	    """Lower-case *text* and drop every character outside ``a-z``, ``0-9`` and space.

	    Removed characters are deleted, not replaced by a space, so punctuation
	    inside a word fuses its parts (``"what's"`` becomes ``"whats"``).

	    Args:
	        text: The string to normalise.

	    Returns:
	        The normalised string; it may be empty.
	    """
	    # Lower-case first so the character class only needs the a-z range.
	    return re.sub(r"[^a-z0-9 ]", "", text.lower())

	def build_vocab(df, min_freq=2):
	    """Build a word-to-index vocabulary from the questions in *df*.

	    Each question is split on whitespace and the resulting tokens are
	    counted. Tokens seen at least ``min_freq`` times receive consecutive
	    indices starting at 2, in order of first appearance; indices 0 and 1
	    are reserved for ``"<PAD>"`` and ``"<UNK>"``.

	    Args:
	        df: Object whose ``df["question"]`` yields the question strings
	            (presumably a pandas DataFrame -- confirm against the caller).
	        min_freq: Minimum number of occurrences a token needs in order to
	            be included in the vocabulary.

	    Returns:
	        A dict mapping each kept token (and the two special tokens) to its
	        integer index.
	    """
	    counter = Counter()
	    for question in df["question"]:
	        counter.update(question.split())

	    vocab = {"<PAD>": 0, "<UNK>": 1}
	    for word, count in counter.items():
	        # Inclusive bound: a token occurring exactly min_freq times is kept.
	        if count >= min_freq:
	            # The next free index always equals the current vocabulary size.
	            vocab[word] = len(vocab)

	    return vocab

	def encode_question(q, vocab, max_len=20):
	    """Encode question *q* as a list of vocabulary indices of length *max_len*.

	    The question is split on whitespace. Tokens missing from *vocab* map to
	    the ``"<UNK>"`` index. Sequences longer than *max_len* are truncated and
	    shorter ones are right-padded with the ``"<PAD>"`` index.

	    Args:
	        q: The question string.
	        vocab: Mapping from token to index; it must contain the keys
	            ``"<PAD>"`` and ``"<UNK>"``.
	        max_len: Target length of the encoded sequence.

	    Returns:
	        A list of integer indices.

	    Raises:
	        KeyError: If *vocab* lacks ``"<PAD>"`` or ``"<UNK>"``.
	    """
	    # Look the special indices up once rather than once per token.
	    unk_id = vocab["<UNK>"]
	    pad_id = vocab["<PAD>"]

	    ids = [vocab.get(token, unk_id) for token in q.split()][:max_len]
	    # A non-positive repeat count yields an empty list, so an already
	    # full-length sequence receives no padding.
	    ids += [pad_id] * (max_len - len(ids))
	    return ids