import argparse
import glob
import json
import os
import tempfile
from typing import List

import sentencepiece as sp
import wordsegment as ws
from tqdm import tqdm

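# Load wordsegment's English word statistics once, up front; ws.segment()
# used below depends on this call having run.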
ws.load()

# fmt: off
parser = argparse.ArgumentParser(
    description="""Build a vocabulary out of a captions corpus. This vocabulary
    will be a file which our tokenizer can understand.
    """
)
parser.add_argument(
    "-f", "--files", nargs="+", default=["datasets/redcaps/annotations/*.json"],
    help="Path(s) to SBU, Conceptual Captions, or RedCaps annotation files.",
)
parser.add_argument(
    "-s", "--vocab-size", type=int, default=32000,
    help="Total desired size of our vocabulary.",
)
parser.add_argument(
    "-o", "--output-prefix", default="datasets/vocab/redcaps_32k",
    help="Prefix of the files to be saved. Two files will be saved: "
    "[prefix].model and [prefix].vocab",
)
# fmt: on
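
# Example invocation (script name and paths are hypothetical):
#   python build_vocabulary.py --files "datasets/redcaps/annotations/*.json" \
#       --vocab-size 32000 --output-prefix datasets/vocab/redcaps_32k
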
def read_captions_from_file(annotations_path: str) -> List[str]:
    r"""
    Given a path to an annotation file, read it and return a list of captions.

    Parameters
    ----------
    annotations_path: str
        Path to an annotations file containing captions.

    Returns
    -------
    List[str]
        List of captions from this annotation file.
    """
    with open(annotations_path) as f:
        _annotations = json.load(f)

    captions: List[str] = []
    for ann in tqdm(_annotations["annotations"], desc=annotations_path):
        # This field only exists in RedCaps. Perform word segmentation on the
        # subreddit name to add appropriate whitespace.
        if "subreddit" in ann:
            subreddit_seg = _segment_subreddit(ann["subreddit"].lower())
            caption = f"{subreddit_seg} {ann['caption']}"
        else:
            caption = ann["caption"]
        captions.append(caption.lower())
    return captions


def _segment_subreddit(subreddit: str) -> str:
    # ws.clean() strips non-alphanumeric characters and lowercases;
    # ws.segment() splits a concatenated string into likely English words.
    return " ".join(ws.segment(ws.clean(subreddit)))


if __name__ == "__main__":
    _A = parser.parse_args()

    all_filepaths: List[str] = []
    for f in _A.files:
        all_filepaths.extend(glob.glob(f))

    captions: List[str] = []
    for path in tqdm(all_filepaths, desc="Reading captions"):
        captions.extend(read_captions_from_file(path))

    # Create a temporary directory and dump the captions corpus as a text file
    # with one caption per line. That's how sentencepiece wants its input.
    tmpdir_path = tempfile.mkdtemp()
    with open(os.path.join(tmpdir_path, "captions.txt"), "w") as captions_file:
        for caption in captions:
            captions_file.write(caption + "\n")
    # Padding/out-of-vocab token will be "<unk>" with ID 0 by default.
    # Add [SOS], [EOS] and [SEP] tokens. [SEP] will not be used during
    # captioning, but it is good to have so this vocabulary can be reused
    # across pretext tasks.
    sp.SentencePieceTrainer.train(
        f" --input={os.path.join(tmpdir_path, 'captions.txt')}"
        f" --vocab_size={_A.vocab_size}"
        f" --model_prefix={_A.output_prefix}"
        " --model_type=bpe --character_coverage=1.0"
        " --bos_id=-1 --eos_id=-1"
        " --control_symbols=[SOS],[EOS],[SEP]"
        " --user_defined_symbols=<usr>"
    )
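
# A minimal sketch (not part of this script) of loading and using the trained
# tokenizer, assuming the default --output-prefix above:
#
#   spm = sp.SentencePieceProcessor()
#   spm.load("datasets/vocab/redcaps_32k.model")
#   spm.encode_as_pieces("i took a picture of my cat sleeping in the sun")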