# Page header captured with this file: Spaces status "Sleeping"; file size 8,405 bytes.
# Revision ids from the viewer's blame gutter: 631ecad, 3e2007c, 336902e
# (the viewer's 1-238 line-number gutter is not part of the program and is dropped).
import gradio as gr
from pathlib import Path
# ---------- Tokenizers ----------
def count_openai(text, encoding_name="cl100k_base"):
    """Count tokens for OpenAI models using tiktoken.

    Args:
        text: String to tokenize.
        encoding_name: Name of the tiktoken encoding to look up
            (defaults to ``"cl100k_base"``).

    Returns:
        Number of token ids tiktoken produces for ``text``.
    """
    # Function-scope import: tiktoken is only loaded when this counter runs.
    import tiktoken

    enc = tiktoken.get_encoding(encoding_name)
    # By default tiktoken raises ValueError when the text contains a special
    # token string such as "<|endoftext|>". This is arbitrary user text, so
    # disable that check and encode such strings as ordinary text.
    return len(enc.encode(text, disallowed_special=()))
def count_openai_gpt41(text):
    """Count GPT-4.1 tokens by delegating to ``count_openai`` with ``o200k_base``."""
    encoding = "o200k_base"
    return count_openai(text, encoding_name=encoding)
def count_openai_gpt4o(text):
    """Count GPT-4o tokens by delegating to ``count_openai`` with ``o200k_base``."""
    encoding = "o200k_base"
    return count_openai(text, encoding_name=encoding)
def count_openai_gpt5(text):
    """Count GPT-5 tokens by delegating to ``count_openai`` with ``o200k_base``."""
    encoding = "o200k_base"
    return count_openai(text, encoding_name=encoding)
# Tokenizers already loaded by count_hf, keyed by Hub repo id, so repeated
# calls (e.g. one per keystroke from the UI) do not reload them.
_HF_TOKENIZER_CACHE = {}


def count_hf(text, model_name):
    """Count tokens with a Hugging Face tokenizer loaded via ``AutoTokenizer``.

    Args:
        text: String to tokenize.
        model_name: Repo id (or local path) passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The number of ids returned by ``tokenizer.encode(text)``, or a string
        of the form ``"Error: <message>"`` if importing ``transformers``,
        loading the tokenizer, or encoding fails.
    """
    try:
        tok = _HF_TOKENIZER_CACHE.get(model_name)
        if tok is None:
            # Imported inside the try so a missing ``transformers`` install is
            # reported like any other failure instead of crashing the caller.
            from transformers import AutoTokenizer

            # NOTE(security): trust_remote_code=True allows tokenizer code
            # shipped in the model repo to execute locally; only use this
            # with repos you trust.
            tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
            _HF_TOKENIZER_CACHE[model_name] = tok
        return len(tok.encode(text))
    except Exception as e:
        # Callers treat a str result as an error message to display.
        return f"Error: {str(e)}"
def count_claude_approx(text):
    """Estimate a Claude token count from length alone (3.5 characters per token).

    Returns:
        ``len(text) / 3.5`` truncated to an int.
    """
    chars_per_token = 3.5
    estimate = len(text) / chars_per_token
    return int(estimate)
def count_claude_tokens(text, model_name="claude-3-5-sonnet"):
    """Estimate a Claude token count for ``text``.

    Tries to tokenize with the Hugging Face tokenizer loaded from
    ``meta-llama/Meta-Llama-3-7b-hf`` as a stand-in for Claude. If importing
    ``transformers``, loading the tokenizer, or encoding fails, falls back to
    ``count_claude_approx``.

    Args:
        text: String to measure.
        model_name: Accepted so callers can label the Claude variant; it is
            not used to select a tokenizer.

    Returns:
        An int token estimate.
    """
    try:
        from transformers import AutoTokenizer

        # NOTE(review): confirm this repo id resolves on the Hub; if it does
        # not, every call ends up in the character-based fallback below.
        tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-7b-hf", use_fast=True)
        return len(tok.encode(text))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return count_claude_approx(text)
def count_mistral(text, model_name="mistralai/Mistral-7B-v0.1"):
    """Count tokens for a Mistral model, with a tiktoken fallback.

    Args:
        text: String to tokenize.
        model_name: Repo id passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Token count from the Hugging Face tokenizer when it can be loaded;
        otherwise the ``cl100k_base`` tiktoken count as an approximation.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        return len(tok.encode(text))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        import tiktoken

        enc = tiktoken.get_encoding("cl100k_base")  # Close approximation
        # disallowed_special=() keeps tiktoken from raising ValueError when
        # the text happens to contain a special-token string.
        return len(enc.encode(text, disallowed_special=()))
def count_qwen(text, model_name="Qwen/Qwen3"):
    """Count tokens for a Qwen model, falling back to ``cl100k_base``.

    Args:
        text: String to tokenize.
        model_name: Repo id passed to ``AutoTokenizer.from_pretrained``.
            NOTE(review): confirm the default id resolves on the Hub;
            otherwise the default path always uses the fallback.

    Returns:
        Token count from the Hugging Face tokenizer when it can be loaded;
        otherwise ``count_openai(text, "cl100k_base")``.
    """
    try:
        from transformers import AutoTokenizer

        # NOTE(security): trust_remote_code=True allows tokenizer code shipped
        # in the model repo to execute locally.
        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        return len(tok.encode(text))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return count_openai(text, "cl100k_base")  # Fallback
def count_deepseek(text, model_name="deepseek-ai/DeepSeek-R1"):
    """Count tokens for a DeepSeek model, falling back to ``cl100k_base``.

    Args:
        text: String to tokenize.
        model_name: Repo id passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Token count from the Hugging Face tokenizer when it can be loaded;
        otherwise ``count_openai(text, "cl100k_base")``.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        return len(tok.encode(text))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return count_openai(text, "cl100k_base")  # Fallback
def count_mixtral(text):
    """Count Mixtral tokens by delegating to ``count_mistral`` with the Mixtral-8x7B repo id."""
    mixtral_repo = "mistralai/Mixtral-8x7B-v0.1"
    return count_mistral(text, model_name=mixtral_repo)
def count_qwen_coder(text):
    """Count tokens for the Qwen coder entry by delegating to ``count_qwen``.

    NOTE(review): the provider label says "Qwen3 Coder" while the repo id
    below says "Qwen3.5-Coder" - confirm which one is intended.
    """
    coder_repo = "Qwen/Qwen3.5-Coder"
    return count_qwen(text, model_name=coder_repo)
def count_mistral_nemo(text):
    """Count tokens for Mistral NeMo, falling back to the Mistral 7B counter.

    Loads the tokenizer from ``mistralai/Mistral-Nemo-Base-2407``. If that
    fails for any reason, delegates to
    ``count_mistral(text, "mistralai/Mistral-7B-v0.1")``.

    Returns:
        An int token count.
    """
    try:
        from transformers import AutoTokenizer

        # NOTE(security): trust_remote_code=True allows tokenizer code shipped
        # in the model repo to execute locally.
        tok = AutoTokenizer.from_pretrained("mistralai/Mistral-Nemo-Base-2407", trust_remote_code=True)
        return len(tok.encode(text))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return count_mistral(text, "mistralai/Mistral-7B-v0.1")
def count_stepfun(text, model_name="stepfun-ai/Step-1-8B"):
    """Count tokens for a Stepfun model, falling back to ``cl100k_base``.

    Args:
        text: String to tokenize.
        model_name: Repo id passed to ``AutoTokenizer.from_pretrained``.
            NOTE(review): confirm the default id resolves on the Hub;
            otherwise the default path always uses the fallback.

    Returns:
        Token count from the Hugging Face tokenizer when it can be loaded;
        otherwise ``count_openai(text, "cl100k_base")``.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        return len(tok.encode(text))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return count_openai(text, "cl100k_base")  # Fallback
def count_perplexity_sonar(text):
    """Count tokens for the Perplexity Sonar entry via the DeepSeek-V3 tokenizer path.

    NOTE(review): this reuses ``count_deepseek`` as a stand-in; confirm that
    it is an acceptable proxy for Sonar models.
    """
    proxy_repo = "deepseek-ai/DeepSeek-V3"
    return count_deepseek(text, model_name=proxy_repo)
# Updated PROVIDERS with many more models
# Maps the label shown in the provider dropdown to a callable that takes the
# text and returns either an int token count or an error-message string.
# Insertion order is the order the choices appear in the UI.
PROVIDERS = {
    # OpenAI Models
    "GPT-4o": lambda t: count_openai_gpt4o(t),
    "GPT-4.1": lambda t: count_openai_gpt41(t),
    "GPT-5": lambda t: count_openai_gpt5(t),
    "ChatGPT (GPT-3.5)": lambda t: count_openai(t, "cl100k_base"),
    # Anthropic Claude Models (all share the same estimate; the second
    # argument is only a label)
    "Claude Sonnet 3.5": lambda t: count_claude_tokens(t, "claude-3-5-sonnet"),
    "Claude Sonnet 4.5": lambda t: count_claude_tokens(t, "claude-4-5-sonnet"),  # Approximation
    "Claude Opus": lambda t: count_claude_tokens(t, "claude-opus"),
    "Claude Haiku": lambda t: count_claude_tokens(t, "claude-haiku"),
    # Mistral Models
    "Mistral 7B": lambda t: count_mistral(t, "mistralai/Mistral-7B-v0.1"),
    # NOTE(review): confirm this repo id exists on the Hub; count_mistral
    # silently falls back to cl100k_base when loading fails.
    "Mistral Large": lambda t: count_mistral(t, "mistralai/Mistral-Large-2407"),
    "Mistral NeMo": lambda t: count_mistral_nemo(t),
    "Mixtral 8x7B": lambda t: count_mixtral(t),
    "Mixtral 8x22B": lambda t: count_mistral(t, "mistralai/Mixtral-8x22B-Instruct-v0.1"),
    # Qwen Models
    # NOTE(review): confirm "Qwen/Qwen3" and "Qwen/Qwen-Max" resolve on the
    # Hub; count_qwen silently falls back to cl100k_base when loading fails.
    "Qwen3 (Base)": lambda t: count_qwen(t, "Qwen/Qwen3"),
    "Qwen3 Coder": lambda t: count_qwen_coder(t),
    "Qwen2.5 Coder": lambda t: count_qwen(t, "Qwen/Qwen2.5-Coder-7B-Instruct"),
    "Qwen Max": lambda t: count_qwen(t, "Qwen/Qwen-Max"),
    # DeepSeek Models
    "DeepSeek R1": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-R1"),
    "DeepSeek V3": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-V3"),
    "DeepSeek Coder": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
    # Other Models
    # The Llama repo ids use a dot in the version ("3.1", "3.2"); the
    # hyphenated spellings do not resolve, and count_hf has no fallback.
    "Llama-3.1 8B": lambda t: count_hf(t, "meta-llama/Llama-3.1-8B-Instruct"),
    "Llama-3.2 3B": lambda t: count_hf(t, "meta-llama/Llama-3.2-3B-Instruct"),
    "Gemma 2 9B": lambda t: count_hf(t, "google/gemma-2-9b-it"),
    "Gemma 2 27B": lambda t: count_hf(t, "google/gemma-2-27b-it"),
    "StarCoder 2": lambda t: count_hf(t, "bigcode/starcoder2-15b-instruct-v0.1"),
    "CodeLlama": lambda t: count_hf(t, "codellama/CodeLlama-7b-Instruct-hf"),
    # Specialized Models
    "Perplexity Sonar": lambda t: count_perplexity_sonar(t),
    "Stepfun Step-1": lambda t: count_stepfun(t, "stepfun-ai/Step-1-8B"),
    # Approximation methods
    "Claude (approx)": count_claude_approx,
}
# ---------- Core logic ----------
def count_tokens(provider, text, file):
    """Count tokens for an uploaded file or, if none is given, the pasted text.

    Args:
        provider: Key into ``PROVIDERS`` selecting the counter to use.
        text: Pasted text; ignored when ``file`` is provided.
        file: ``None``, a filesystem path (``str`` or ``Path``), or an object
            whose ``name`` attribute is the path of the uploaded file.

    Returns:
        ``"No input text."`` when there is nothing but whitespace to count,
        the counter's own error string if it returned one, or a short report
        with the provider, character, word and token counts.
    """
    if file is not None:
        # Uploads may arrive as a plain path or as a file-like wrapper that
        # exposes the path via ``.name``; accept both.
        upload_path = Path(file if isinstance(file, (str, Path)) else file.name)
        try:
            text = upload_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Fallback for different encodings; latin-1 decodes any byte.
            text = upload_path.read_text(encoding="latin-1")

    # ``not text`` also covers None, which would otherwise fail on .strip().
    if not text or not text.strip():
        return "No input text."

    counter = PROVIDERS[provider]
    result = counter(text)
    if isinstance(result, str):  # Error message from the counter
        return result

    return (
        f"\nProvider: {provider}\n"
        f"Characters: {len(text):,}\n"
        f"Words: {len(text.split()):,}\n"
        f"Tokens: {result:,}\n"
    )
def count_tokens_for_input(provider, text_input_val):
    """Build the token report for text typed directly into the textbox.

    Args:
        provider: Key into ``PROVIDERS`` selecting the counter to use.
        text_input_val: Current contents of the text box.

    Returns:
        ``"No input text."`` for empty or whitespace-only input, the
        counter's own error string if it returned one, or a short report
        with the provider, character, word and token counts.
    """
    if text_input_val.strip() == "":
        return "No input text."

    token_total = PROVIDERS[provider](text_input_val)
    if isinstance(token_total, str):
        # A string result is an error message from the counter; show it as-is.
        return token_total

    char_total = len(text_input_val)
    word_total = len(text_input_val.split())
    report_lines = [
        "",
        f"Provider: {provider}",
        f"Characters: {char_total:,}",
        f"Words: {word_total:,}",
        f"Tokens: {token_total:,}",
        "",
    ]
    return "\n".join(report_lines)
# ---------- UI ----------
# Build the Gradio interface: a provider dropdown, a file upload, a text box,
# and a Markdown area that shows the resulting report.
with gr.Blocks(title="Advanced LLM Token Counter") as demo:
    gr.Markdown("## Advanced LLM Token Counter\nUpload a file or paste text to count tokens for various LLMs.")
    provider = gr.Dropdown(
        choices=list(PROVIDERS.keys()),
        value="GPT-4o",
        label="LLM Provider",
        multiselect=False,
    )
    file_input = gr.File(
        label="Upload .txt or .md file",
        file_types=[".txt", ".md"],
    )
    text_input = gr.Textbox(
        label="Or paste text here",
        lines=8,
        placeholder="Paste text here if not uploading a file...",
    )
    output = gr.Markdown()
    btn = gr.Button("Count Tokens")

    # Button path: an uploaded file, when present, takes priority over the text box.
    btn.click(
        fn=count_tokens,
        inputs=[provider, text_input, file_input],
        outputs=output,
    )
    # Add event for real-time token counting as user types
    # (this path looks only at the text box, never at the uploaded file).
    text_input.change(
        fn=count_tokens_for_input,
        inputs=[provider, text_input],
        outputs=output,
    )

# Launch only when run as a script, so importing this module to reuse the
# counters does not start a server.
if __name__ == "__main__":
    demo.launch()