Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tiktoken | |
| import json | |
| import os | |
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tokens per conversation and in total for a JSON/JSONL dataset.

    Each entry is expected to carry a "messages" list of {"role", "content"}
    dicts (OpenAI chat fine-tuning format); entries without a "messages" key
    contribute an empty conversation with a count of 0.

    Args:
        json_file: uploaded-file object exposing a ``.name`` path attribute
            (as provided by ``gr.File``).
        encoding_name: tiktoken encoding name, e.g. "cl100k_base".

    Returns:
        Tuple ``(token_counts, total_token_count)`` where ``token_counts`` is a
        list of ``{'conversation': str, 'token_count': int}`` dicts and
        ``total_token_count`` is the sum over all conversations.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Load the JSON or JSONL data. Open as UTF-8 explicitly so user uploads
    # decode the same way on every platform. For JSONL, skip blank lines —
    # a trailing newline would otherwise raise JSONDecodeError.
    with open(json_file.name, 'r', encoding='utf-8') as f:
        if json_file.name.endswith('.json'):
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]
    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                # `or ""` guards against an explicit null "content" field
                # (e.g. assistant tool-call messages), which .get() with a
                # default would not catch.
                content = message.get("content") or ""
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        # Add conversation token count to the total
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts, total_token_count
# Gradio interface function
def token_counter(json_file, encoding_with_model):
    """Gradio callback: tokenize the uploaded dataset with the chosen encoding.

    The dropdown value looks like ``"cl100k_base (gpt-4, ...)"``; only the
    first whitespace-separated word is the actual tiktoken encoding name,
    the parenthesized model list is purely informational.
    """
    # Strip the trailing model annotation from the dropdown label.
    encoding_name = encoding_with_model.split(maxsplit=1)[0]
    # Delegate the actual counting and pass both results straight through.
    per_conversation, grand_total = count_tokens(json_file, encoding_name)
    return per_conversation, grand_total
# Define the encoding choices with model information.
# Each dropdown label pairs a tiktoken encoding name with the OpenAI models
# that use it; token_counter() parses only the first word (the encoding
# name) — the parenthesized model list is informational for the user.
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)"
]
# Gradio UI setup: build the page top-to-bottom (creation order defines layout).
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")

    # Input row: file upload next to the encoding selector.
    with gr.Row():
        file_upload = gr.File(label="Upload JSON/JSONL File")
        encoding_choice = gr.Dropdown(
            choices=encoding_options,
            label="Select Encoding",
            value="o200k_base (gpt-4o, gpt-4o-mini)",
        )

    # Offer the bundled sample dataset, but only when it ships with the Space.
    sample_path = "keivalyaMedQuad-MedicalQnADataset_valid.jsonl"
    if os.path.exists(sample_path):
        gr.Examples(
            examples=[sample_path],
            inputs=file_upload,
            label="Click here to load the example file",
        )

    # Display credits for the dataset author.
    gr.Markdown("### Dataset Credits")
    gr.Markdown(
        """
        This dataset is provided by the [MedQuad-MedicalQnADataset](https://huggingface.co/datasets/keivalya/MedQuad-MedicalQnADataset) on Hugging Face.
        All credit goes to the original creator, [keivalya](https://huggingface.co/keivalya).
        """
    )

    # Outputs: per-conversation breakdown plus the grand total.
    per_conversation_output = gr.JSON(label="Token Counts per Conversation")
    grand_total_output = gr.Number(label="Total Token Count", interactive=False)

    # Submit button wires the inputs through token_counter to the outputs.
    run_button = gr.Button("Submit")
    run_button.click(
        token_counter,
        inputs=[file_upload, encoding_choice],
        outputs=[per_conversation_output, grand_total_output],
    )

# Launch the app
app.launch()