Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -30,21 +30,42 @@ def count_tokens(json_file, encoding_name):
|
|
| 30 |
'token_count': conversation_token_count
|
| 31 |
})
|
| 32 |
|
| 33 |
-
return
|
| 34 |
|
| 35 |
# Gradio interface function
|
| 36 |
-
def token_counter(json_file,
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Gradio UI setup
|
| 41 |
-
gr.
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
gr.
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
'token_count': conversation_token_count
|
| 31 |
})
|
| 32 |
|
| 33 |
+
return token_counts, total_token_count
|
| 34 |
|
| 35 |
# Gradio interface function
|
| 36 |
+
def token_counter(json_file, encoding_with_model):
|
| 37 |
+
# Split encoding name and model type from the dropdown input
|
| 38 |
+
encoding_name = encoding_with_model.split()[0]
|
| 39 |
+
|
| 40 |
+
# Get token counts
|
| 41 |
+
token_data, total_token_count = count_tokens(json_file, encoding_name)
|
| 42 |
+
|
| 43 |
+
return token_data, total_token_count
|
| 44 |
+
|
| 45 |
+
# Define the encoding choices with model information
|
| 46 |
+
encoding_options = [
|
| 47 |
+
"o200k_base (gpt-4o, gpt-4o-mini)",
|
| 48 |
+
"cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
|
| 49 |
+
"p50k_base (Codex models, text-davinci-002, text-davinci-003)",
|
| 50 |
+
"r50k_base (GPT-3 models like davinci)"
|
| 51 |
+
]
|
| 52 |
|
| 53 |
# Gradio UI setup
|
| 54 |
+
with gr.Blocks() as app:
|
| 55 |
+
gr.Markdown("# Token Counter for JSON/JSONL Datasets")
|
| 56 |
+
|
| 57 |
+
with gr.Row():
|
| 58 |
+
json_input = gr.File(label="Upload JSON/JSONL File")
|
| 59 |
+
encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
|
| 60 |
+
|
| 61 |
+
# Output for individual conversation token counts
|
| 62 |
+
conversation_output = gr.JSON(label="Token Counts per Conversation")
|
| 63 |
+
|
| 64 |
+
# Output for total token count
|
| 65 |
+
total_output = gr.Number(label="Total Token Count", interactive=False)
|
| 66 |
+
|
| 67 |
+
# Link the inputs and outputs to the function
|
| 68 |
+
json_input.change(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
|
| 69 |
+
|
| 70 |
+
# Launch the app
|
| 71 |
+
app.launch()
|