import gradio as gr
from pathlib import Path
# ---------- Tokenizers ----------
def count_openai(text, encoding_name="cl100k_base"):
"""Count tokens for OpenAI models using tiktoken"""
import tiktoken
enc = tiktoken.get_encoding(encoding_name)
return len(enc.encode(text))
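# For reference: cl100k_base splits "Hello, world!" into 4 tokens
# ("Hello", ",", " world", "!"); o200k_base may segment the same text differently.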
def count_openai_gpt41(text):
"""GPT-4.1 uses o200k_base encoding"""
return count_openai(text, "o200k_base")
def count_openai_gpt4o(text):
"""GPT-4o and similar models"""
return count_openai(text, "o200k_base")
def count_openai_gpt5(text):
"""GPT-5 uses o200k_base encoding"""
return count_openai(text, "o200k_base")
def count_hf(text, model_name):
"""Generic HuggingFace tokenizer"""
from transformers import AutoTokenizer
try:
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
return len(tok.encode(text))
except Exception as e:
return f"Error: {str(e)}"
def count_claude_approx(text):
"""Approximate Claude token count (1 token ≈ 3.5 characters)"""
return int(len(text) / 3.5)
def count_claude_tokens(text, model_name="claude-3-5-sonnet"):
    """Approximate Claude token counting (Anthropic publishes no tokenizer)"""
    # model_name is kept for API symmetry but unused: every Claude variant
    # shares the same approximation here. A Llama tokenizer is tried first,
    # since a real BPE vocabulary estimates better than a character ratio.
    try:
        from transformers import AutoTokenizer
        # Llama 3 8B tokenizer as a rough stand-in for Claude (the repo is
        # gated, so this often falls through to the character estimate)
        tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_fast=True)
        return len(tok.encode(text))
    except Exception:
        return count_claude_approx(text)
def count_mistral(text, model_name="mistralai/Mistral-7B-v0.1"):
"""Mistral models (use tiktoken or sentencepiece-style tokenizer)"""
try:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
return len(tok.encode(text))
    except Exception:
        # Fallback: tiktoken's cl100k_base as a rough approximation (newer
        # Mistral models use the tiktoken-based Tekken tokenizer, older ones SentencePiece)
        import tiktoken
        enc = tiktoken.get_encoding("cl100k_base")
return len(enc.encode(text))
def count_qwen(text, model_name="Qwen/Qwen3-8B"):
    """Qwen models using their specific tokenizer"""
    try:
        from transformers import AutoTokenizer
        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        return len(tok.encode(text))
    except Exception:
        return count_openai(text, "cl100k_base")  # Fallback
def count_deepseek(text, model_name="deepseek-ai/DeepSeek-R1"):
"""DeepSeek models"""
try:
from transformers import AutoTokenizer
# DeepSeek models often use Llama-style tokenizers
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
return len(tok.encode(text))
    except Exception:
        return count_openai(text, "cl100k_base")  # Fallback
def count_mixtral(text):
"""Mixtral models"""
return count_mistral(text, "mistralai/Mixtral-8x7B-v0.1")
def count_qwen_coder(text):
    """Qwen3-Coder specific tokenizer (only the tokenizer files are downloaded)"""
    return count_qwen(text, "Qwen/Qwen3-Coder-480B-A35B-Instruct")
def count_mistral_nemo(text):
"""Mistral NeMo uses Tekken tokenizer"""
try:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("mistralai/Mistral-Nemo-Base-2407", trust_remote_code=True)
return len(tok.encode(text))
    except Exception:
        return count_mistral(text, "mistralai/Mistral-7B-v0.1")
def count_stepfun(text, model_name="stepfun-ai/Step-1-8B"):
    """Stepfun models (repo may be unavailable, in which case the fallback applies)"""
try:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
return len(tok.encode(text))
    except Exception:
        return count_openai(text, "cl100k_base")  # Fallback
def count_perplexity_sonar(text):
    """Perplexity Sonar models (no public tokenizer; DeepSeek-V3's is used as a proxy)"""
    return count_deepseek(text, "deepseek-ai/DeepSeek-V3")
# Updated PROVIDERS with many more models
PROVIDERS = {
# OpenAI Models
"GPT-4o": lambda t: count_openai_gpt4o(t),
"GPT-4.1": lambda t: count_openai_gpt41(t),
"GPT-5": lambda t: count_openai_gpt5(t),
"ChatGPT (GPT-3.5)": lambda t: count_openai(t, "cl100k_base"),
# Anthropic Claude Models
"Claude Sonnet 3.5": lambda t: count_claude_tokens(t, "claude-3-5-sonnet"),
"Claude Sonnet 4.5": lambda t: count_claude_tokens(t, "claude-4-5-sonnet"), # Approximation
"Claude Opus": lambda t: count_claude_tokens(t, "claude-opus"),
"Claude Haiku": lambda t: count_claude_tokens(t, "claude-haiku"),
# Mistral Models
"Mistral 7B": lambda t: count_mistral(t, "mistralai/Mistral-7B-v0.1"),
"Mistral Large": lambda t: count_mistral(t, "mistralai/Mistral-Large-2407"),
"Mistral NeMo": lambda t: count_mistral_nemo(t),
"Mixtral 8x7B": lambda t: count_mixtral(t),
"Mixtral 8x22B": lambda t: count_mistral(t, "mistralai/Mixtral-8x22B-Instruct-v0.1"),
# Qwen Models
"Qwen3 (Base)": lambda t: count_qwen(t, "Qwen/Qwen3"),
"Qwen3 Coder": lambda t: count_qwen_coder(t),
"Qwen2.5 Coder": lambda t: count_qwen(t, "Qwen/Qwen2.5-Coder-7B-Instruct"),
"Qwen Max": lambda t: count_qwen(t, "Qwen/Qwen-Max"),
# DeepSeek Models
"DeepSeek R1": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-R1"),
"DeepSeek V3": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-V3"),
"DeepSeek Coder": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
# Other Models
"Llama-3.1 8B": lambda t: count_hf(t, "meta-llama/Meta-Llama-3-1-8B-Instruct"),
"Llama-3.2 3B": lambda t: count_hf(t, "meta-llama/Meta-Llama-3-2-3B-Instruct"),
"Gemma 2 9B": lambda t: count_hf(t, "google/gemma-2-9b-it"),
"Gemma 2 27B": lambda t: count_hf(t, "google/gemma-2-27b-it"),
"StarCoder 2": lambda t: count_hf(t, "bigcode/starcoder2-15b-instruct-v0.1"),
"CodeLlama": lambda t: count_hf(t, "codellama/CodeLlama-7b-Instruct-hf"),
# Specialized Models
"Perplexity Sonar": lambda t: count_perplexity_sonar(t),
"Stepfun Step-1": lambda t: count_stepfun(t, "stepfun-ai/Step-1-8B"),
# Approximation methods
"Claude (approx)": count_claude_approx,
}
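# Each entry maps a display name to a one-argument callable, so the UI can
# dispatch with PROVIDERS[name](text); the result is an int token count, or
# an error string when a tokenizer could not be loaded.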
# ---------- Core logic ----------
def count_tokens(provider, text, file):
    # An uploaded file takes precedence over pasted text.
    if file is not None:
        # Depending on Gradio version, `file` is either a filepath string or a
        # tempfile-like object with a .name attribute; handle both.
        path = Path(getattr(file, "name", file))
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Fallback for non-UTF-8 files
            text = path.read_text(encoding="latin-1")
    if not text or not text.strip():
        return "No input text."
counter = PROVIDERS[provider]
result = counter(text)
if isinstance(result, str): # Error message
return result
return f"""
Provider: {provider}
Characters: {len(text):,}
Words: {len(text.split()):,}
Tokens: {result:,}
"""
def count_tokens_for_input(provider, text_input_val):
    """Count tokens for text entered directly; same logic as count_tokens with no file."""
    return count_tokens(provider, text_input_val, None)
# ---------- UI ----------
with gr.Blocks(title="Advanced LLM Token Counter") as demo:
gr.Markdown("## Advanced LLM Token Counter\nUpload a file or paste text to count tokens for various LLMs.")
provider = gr.Dropdown(
choices=list(PROVIDERS.keys()),
value="GPT-4o",
label="LLM Provider",
multiselect=False
)
file_input = gr.File(
label="Upload .txt or .md file",
file_types=[".txt", ".md"]
)
text_input = gr.Textbox(
label="Or paste text here",
lines=8,
placeholder="Paste text here if not uploading a file..."
)
output = gr.Markdown()
btn = gr.Button("Count Tokens")
btn.click(
fn=count_tokens,
inputs=[provider, text_input, file_input],
outputs=output
)
    # Recompute the count live as the user types. Note: HF-backed providers
    # reload their tokenizer on every keystroke unless a cache
    # (e.g. _cached_tokenizer above) is wired in.
text_input.change(
fn=count_tokens_for_input,
inputs=[provider, text_input],
outputs=output
)
demo.launch()