Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,6 +10,7 @@ def count_tokens(json_file, encoding_name):
|
|
| 10 |
with open(json_file.name, 'r') as f:
|
| 11 |
data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]
|
| 12 |
|
|
|
|
| 13 |
token_counts = []
|
| 14 |
for entry in data:
|
| 15 |
conversation_token_count = 0
|
|
@@ -18,14 +19,18 @@ def count_tokens(json_file, encoding_name):
|
|
| 18 |
for message in entry["messages"]:
|
| 19 |
content = message.get("content", "")
|
| 20 |
conversation_texts.append(content)
|
| 21 |
-
|
|
|
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
token_counts.append({
|
| 24 |
'conversation': ' '.join(conversation_texts),
|
| 25 |
'token_count': conversation_token_count
|
| 26 |
})
|
| 27 |
|
| 28 |
-
return token_counts
|
| 29 |
|
| 30 |
# Gradio interface function
|
| 31 |
def token_counter(json_file, encoding_name):
|
|
@@ -39,5 +44,7 @@ gr.Interface(
|
|
| 39 |
gr.File(label="Upload JSON/JSONL File"),
|
| 40 |
gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
|
| 41 |
],
|
| 42 |
-
outputs=
|
|
|
|
|
|
|
| 43 |
).launch()
|
|
|
|
| 10 |
with open(json_file.name, 'r') as f:
|
| 11 |
data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]
|
| 12 |
|
| 13 |
+
total_token_count = 0
|
| 14 |
token_counts = []
|
| 15 |
for entry in data:
|
| 16 |
conversation_token_count = 0
|
|
|
|
| 19 |
for message in entry["messages"]:
|
| 20 |
content = message.get("content", "")
|
| 21 |
conversation_texts.append(content)
|
| 22 |
+
tokens = len(encoding.encode(content))
|
| 23 |
+
conversation_token_count += tokens
|
| 24 |
|
| 25 |
+
# Add conversation token count to the total
|
| 26 |
+
total_token_count += conversation_token_count
|
| 27 |
+
|
| 28 |
token_counts.append({
|
| 29 |
'conversation': ' '.join(conversation_texts),
|
| 30 |
'token_count': conversation_token_count
|
| 31 |
})
|
| 32 |
|
| 33 |
+
return {"conversations": token_counts, "total_token_count": total_token_count}
|
| 34 |
|
| 35 |
# Gradio interface function
|
| 36 |
def token_counter(json_file, encoding_name):
|
|
|
|
| 44 |
gr.File(label="Upload JSON/JSONL File"),
|
| 45 |
gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
|
| 46 |
],
|
| 47 |
+
outputs=[
|
| 48 |
+
gr.JSON(label="Token Counts per Conversation and Total"),
|
| 49 |
+
]
|
| 50 |
).launch()
|