# Build a tokenizer vocabulary from Arxiv, OPUS, and emotion datasets,
# then export it to artifacts/vocab.json.
import json
import os

from tqdm import tqdm

from data.loaders.arxiv_loader import ArxivLoader
from data.loaders.emotion import EmotionLoader
from language.tokenizer import SimpleTokenizer
# -----------------------------
# Opus Loader
# -----------------------------
class OpusLoader:
    """Stream non-empty text lines from every ``.txt`` file under a folder."""

    def __init__(self, folder_path):
        # Root directory that is walked recursively for .txt files.
        self.folder_path = folder_path

    def samples(self, limit=None):
        """Yield stripped, non-empty lines from all ``.txt`` files.

        Args:
            limit: Maximum number of lines to yield across all files.
                ``None`` means unlimited; ``0`` yields nothing.
                (The original checked ``if limit``, which silently treated
                ``limit=0`` as unlimited.)

        Yields:
            str: One non-empty, whitespace-stripped line at a time.
        """
        count = 0
        for root, _, files in os.walk(self.folder_path):
            # os.walk returns file names in arbitrary order; sort them so
            # a limited sample is deterministic across platforms/runs.
            for fname in sorted(files):
                if not fname.endswith(".txt"):
                    continue
                with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                    for line in f:
                        line = line.strip()
                        if not line:
                            continue
                        # Stop before yielding once the cap is reached.
                        if limit is not None and count >= limit:
                            return
                        yield line
                        count += 1
# -----------------------------
# INIT TOKENIZER
# -----------------------------
tokenizer = SimpleTokenizer()
texts = []
print("[INFO] Collecting text for vocab...")

# -----------------------------
# Arxiv
# -----------------------------
# Loader samples may be plain strings or dicts carrying a "text" field.
arxiv = ArxivLoader("data/processed/arxiv_train.csv")
texts.extend(
    s["text"] if isinstance(s, dict) else s
    for s in arxiv.samples(limit=10000)
)
# -----------------------------
# Opus multilingual
# -----------------------------
# OPUS lines are already plain strings, so they go straight into the pool.
opus = OpusLoader("data/raw/opus/opusTCv20230926")
texts.extend(opus.samples(limit=10000))
# -----------------------------
# GoEmotions + IMDB
# -----------------------------
loader = EmotionLoader(
    goemotions_path=r"data/raw/emotion/goemotions",
    imdb_path=r"data/raw/emotion/aclImdb/aclImdb"
)
# Same convention as the Arxiv loader: dict samples expose a "text" field.
texts.extend(
    s["text"] if isinstance(s, dict) else s
    for s in loader.samples(limit=10000)
)
# -----------------------------
# SANITY CHECK
# -----------------------------
total_samples = len(texts)
print(f"[INFO] Total text samples collected: {total_samples}")
if total_samples == 0:
    # An empty pool almost always means a dataset path above is wrong.
    raise RuntimeError("No text samples collected. Check dataset paths.")
# -----------------------------
# BUILD VOCAB
# -----------------------------
tokenizer.build_vocab(texts)
print(f"[TOKENIZER] Built vocab of size {len(tokenizer.vocab)}")

# -----------------------------
# SAVE VOCAB MANUALLY
# -----------------------------
os.makedirs("artifacts", exist_ok=True)
vocab_file = "artifacts/vocab.json"
# ensure_ascii=False keeps multilingual tokens human-readable on disk.
with open(vocab_file, "w", encoding="utf-8") as f:
    json.dump(tokenizer.vocab, f, ensure_ascii=False, indent=2)
print(f"[INFO] Vocabulary export complete! Saved to {vocab_file}")