# mvi-ai-engine / export_vocab.py
# Author: Musombi — uploaded via huggingface_hub (revision c1e438c).
# Builds a tokenizer vocabulary from the Arxiv, Opus, and emotion datasets
# and exports it to artifacts/vocab.json.
import os
import json
from tqdm import tqdm
from language.tokenizer import SimpleTokenizer
from data.loaders.arxiv_loader import ArxivLoader
from data.loaders.emotion import EmotionLoader
# -----------------------------
# Opus Loader
# -----------------------------
class OpusLoader:
    """Streams non-empty lines from every ``.txt`` file under a folder tree.

    Used to feed raw multilingual Opus text into vocabulary building.
    """

    def __init__(self, folder_path):
        # Root directory that is walked recursively for .txt files.
        self.folder_path = folder_path

    def samples(self, limit=None):
        """Yield stripped, non-empty lines from all ``.txt`` files.

        Args:
            limit: Maximum number of lines to yield, or ``None`` for no
                limit. Fix: the original tested ``if limit and ...``, which
                silently treated ``limit=0`` as unlimited; we now compare
                explicitly against ``None`` so ``limit=0`` yields nothing.

        Yields:
            str: One stripped, non-empty text line at a time.
        """
        count = 0
        for root, _, files in os.walk(self.folder_path):
            for fname in files:
                if not fname.endswith(".txt"):
                    continue
                with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        # Check the bound *before* yielding so exactly
                        # `limit` items are produced, never limit+1.
                        if limit is not None and count >= limit:
                            return
                        yield line
                        count += 1
# -----------------------------
# INIT TOKENIZER
# -----------------------------
tokenizer = SimpleTokenizer()
texts = []
print("[INFO] Collecting text for vocab...")
# -----------------------------
# Arxiv
# -----------------------------
# The loader may yield either plain strings or dicts carrying a "text" field.
arxiv_loader = ArxivLoader("data/processed/arxiv_train.csv")
for sample in arxiv_loader.samples(limit=10000):
    texts.append(sample["text"] if isinstance(sample, dict) else sample)
# -----------------------------
# Opus multilingual
# -----------------------------
# Pull up to 10k raw multilingual lines from the Opus dump.
opus_loader = OpusLoader("data/raw/opus/opusTCv20230926")
texts.extend(opus_loader.samples(limit=10000))
# -----------------------------
# GoEmotions + IMDB
# -----------------------------
loader = EmotionLoader(
    goemotions_path=r"data/raw/emotion/goemotions",
    imdb_path=r"data/raw/emotion/aclImdb/aclImdb",
)
# Same contract as the other loaders: items are dicts with "text" or raw strings.
for record in loader.samples(limit=10000):
    texts.append(record["text"] if isinstance(record, dict) else record)
# -----------------------------
# SANITY CHECK
# -----------------------------
sample_count = len(texts)
print(f"[INFO] Total text samples collected: {sample_count}")
if sample_count == 0:
    raise RuntimeError("No text samples collected. Check dataset paths.")
# -----------------------------
# BUILD VOCAB
# -----------------------------
tokenizer.build_vocab(texts)
print(f"[TOKENIZER] Built vocab of size {len(tokenizer.vocab)}")
# -----------------------------
# SAVE VOCAB MANUALLY
# -----------------------------
vocab_file = "artifacts/vocab.json"
os.makedirs("artifacts", exist_ok=True)
# ensure_ascii=False keeps multilingual tokens readable in the JSON output.
with open(vocab_file, "w", encoding="utf-8") as out:
    json.dump(tokenizer.vocab, out, ensure_ascii=False, indent=2)
print(f"[INFO] Vocabulary export complete! Saved to {vocab_file}")