# Hugging Face Space: "Dataset token counter"
import token
import tokenize

import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer
def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train", progress=gr.Progress()):
    """Count the total number of tokens in a Hugging Face dataset split.

    Every column of the split is tokenized field-by-field and the token
    counts are summed.

    Args:
        dataset_name: Hub id of the dataset to load.
        tokenizer_name: Hub id of the tokenizer to use (default GPT-2).
        split: Dataset split to count (default "train").
        progress: Gradio progress tracker; injected by Gradio at call time.

    Returns:
        Total token count over all string fields of all examples.
    """
    progress(0, desc="Starting")
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Load dataset
    dataset = load_dataset(dataset_name, split=split)
    progress(0, desc="Loaded \"{}\"".format(dataset_name))
    total_tokens = 0
    # Columns present in the first example are assumed to exist for the
    # whole split (true for Hub datasets with a fixed schema).
    for field in dataset[0].keys():
        column = dataset[field]
        for value in progress.tqdm(column, desc=f"Tokenizing \"{field}\""):
            # Skip non-string columns (labels, ids, nested features):
            # tokenizer.tokenize() raises TypeError on non-text input.
            if isinstance(value, str):
                total_tokens += len(tokenizer.tokenize(value))
    return total_tokens
# Token-counter UI: three text inputs (dataset, tokenizer, split) feeding
# one Label output via the shared ReturnTokens handler.
with gr.Blocks(title="Dataset token counter") as demo:
    gr.Markdown("# Token Counter")
    with gr.Row():
        dataset_box = gr.Textbox(label="Dataset", elem_id="dataset", info="", placeholder="")
        tokenizer_box = gr.Textbox(
            label="Tokenizer",
            elem_id="tokenizer",
            info="",
            placeholder="openai-community/gpt2",
            value="openai-community/gpt2",
        )
        split_box = gr.Textbox(
            label="Split (default: train)",
            elem_id="split",
            info="",
            placeholder="train",
            value="train",
        )
    token_count = gr.Label(label="Tokens", elem_id="tokens")

    # Pressing Enter in any of the three inputs re-runs the count.
    submit_triggers = [dataset_box.submit, tokenizer_box.submit, split_box.submit]
    gr.on(
        triggers=submit_triggers,
        fn=ReturnTokens,
        inputs=[dataset_box, tokenizer_box, split_box],
        outputs=[token_count],
        api_name="run",
    )

demo.launch()