import gradio as gr from pathlib import Path # ---------- Tokenizers ---------- def count_openai(text, encoding_name="cl100k_base"): """Count tokens for OpenAI models using tiktoken""" import tiktoken enc = tiktoken.get_encoding(encoding_name) return len(enc.encode(text)) def count_openai_gpt41(text): """GPT-4.1 uses o200k_base encoding""" return count_openai(text, "o200k_base") def count_openai_gpt4o(text): """GPT-4o and similar models""" return count_openai(text, "o200k_base") def count_openai_gpt5(text): """GPT-5 uses o200k_base encoding""" return count_openai(text, "o200k_base") def count_hf(text, model_name): """Generic HuggingFace tokenizer""" from transformers import AutoTokenizer try: tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True) return len(tok.encode(text)) except Exception as e: return f"Error: {str(e)}" def count_claude_approx(text): """Approximate Claude token count (1 token ≈ 3.5 characters)""" return int(len(text) / 3.5) def count_claude_tokens(text, model_name="claude-3-5-sonnet"): """More accurate Claude token counting using HuggingFace tokenizer when available""" # For Claude models, we use approximation as Anthropic doesn't provide public tokenizers # But we can try using Llama tokenizer as it's often similar try: from transformers import AutoTokenizer # Using meta-llama tokenizer as approximation for Claude tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-7b-hf", use_fast=True) return len(tok.encode(text)) except: return count_claude_approx(text) def count_mistral(text, model_name="mistralai/Mistral-7B-v0.1"): """Mistral models (use tiktoken or sentencepiece-style tokenizer)""" try: from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(model_name, use_fast=True) return len(tok.encode(text)) except: # Fallback: use tiktoken (some Mistral models use tiktoken) import tiktoken enc = tiktoken.get_encoding("cl100k_base") # Close approximation return len(enc.encode(text)) def count_qwen(text, model_name="Qwen/Qwen3"): """Qwen models using their specific tokenizer""" try: from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) return len(tok.encode(text)) except: return count_openai(text, "cl100k_base") # Fallback def count_deepseek(text, model_name="deepseek-ai/DeepSeek-R1"): """DeepSeek models""" try: from transformers import AutoTokenizer # DeepSeek models often use Llama-style tokenizers tok = AutoTokenizer.from_pretrained(model_name, use_fast=True) return len(tok.encode(text)) except: return count_openai(text, "cl100k_base") # Fallback def count_mixtral(text): """Mixtral models""" return count_mistral(text, "mistralai/Mixtral-8x7B-v0.1") def count_qwen_coder(text): """Qwen3-Coder specific tokenizer""" return count_qwen(text, "Qwen/Qwen3.5-Coder") def count_mistral_nemo(text): """Mistral NeMo uses Tekken tokenizer""" try: from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained("mistralai/Mistral-Nemo-Base-2407", trust_remote_code=True) return len(tok.encode(text)) except: return count_mistral(text, "mistralai/Mistral-7B-v0.1") def count_stepfun(text, model_name="stepfun-ai/Step-1-8B"): """Stepfun models""" try: from transformers import AutoTokenizer tok = AutoTokenizer.from_pretrained(model_name, use_fast=True) return len(tok.encode(text)) except: return count_openai(text, "cl100k_base") # Fallback def count_perplexity_sonar(text): """Perplexity Sonar models (often based on DeepSeek)""" return count_deepseek(text, "deepseek-ai/DeepSeek-V3") # Updated PROVIDERS with many more models PROVIDERS = { # OpenAI Models "GPT-4o": lambda t: count_openai_gpt4o(t), "GPT-4.1": lambda t: count_openai_gpt41(t), "GPT-5": lambda t: count_openai_gpt5(t), "ChatGPT (GPT-3.5)": lambda t: count_openai(t, "cl100k_base"), # Anthropic Claude Models "Claude Sonnet 3.5": lambda t: count_claude_tokens(t, "claude-3-5-sonnet"), "Claude Sonnet 4.5": lambda t: count_claude_tokens(t, "claude-4-5-sonnet"), # Approximation "Claude Opus": lambda t: count_claude_tokens(t, "claude-opus"), "Claude Haiku": lambda t: count_claude_tokens(t, "claude-haiku"), # Mistral Models "Mistral 7B": lambda t: count_mistral(t, "mistralai/Mistral-7B-v0.1"), "Mistral Large": lambda t: count_mistral(t, "mistralai/Mistral-Large-2407"), "Mistral NeMo": lambda t: count_mistral_nemo(t), "Mixtral 8x7B": lambda t: count_mixtral(t), "Mixtral 8x22B": lambda t: count_mistral(t, "mistralai/Mixtral-8x22B-Instruct-v0.1"), # Qwen Models "Qwen3 (Base)": lambda t: count_qwen(t, "Qwen/Qwen3"), "Qwen3 Coder": lambda t: count_qwen_coder(t), "Qwen2.5 Coder": lambda t: count_qwen(t, "Qwen/Qwen2.5-Coder-7B-Instruct"), "Qwen Max": lambda t: count_qwen(t, "Qwen/Qwen-Max"), # DeepSeek Models "DeepSeek R1": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-R1"), "DeepSeek V3": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-V3"), "DeepSeek Coder": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"), # Other Models "Llama-3.1 8B": lambda t: count_hf(t, "meta-llama/Meta-Llama-3-1-8B-Instruct"), "Llama-3.2 3B": lambda t: count_hf(t, "meta-llama/Meta-Llama-3-2-3B-Instruct"), "Gemma 2 9B": lambda t: count_hf(t, "google/gemma-2-9b-it"), "Gemma 2 27B": lambda t: count_hf(t, "google/gemma-2-27b-it"), "StarCoder 2": lambda t: count_hf(t, "bigcode/starcoder2-15b-instruct-v0.1"), "CodeLlama": lambda t: count_hf(t, "codellama/CodeLlama-7b-Instruct-hf"), # Specialized Models "Perplexity Sonar": lambda t: count_perplexity_sonar(t), "Stepfun Step-1": lambda t: count_stepfun(t, "stepfun-ai/Step-1-8B"), # Approximation methods "Claude (approx)": count_claude_approx, } # ---------- Core logic ---------- def count_tokens(provider, text, file): if file is not None: try: text = Path(file.name).read_text(encoding="utf-8") except UnicodeDecodeError: # Fallback for different encodings text = Path(file.name).read_text(encoding="latin-1") if not text.strip(): return "No input text." counter = PROVIDERS[provider] result = counter(text) if isinstance(result, str): # Error message return result return f""" Provider: {provider} Characters: {len(text):,} Words: {len(text.split()):,} Tokens: {result:,} """ def count_tokens_for_input(provider, text_input_val): """Function to count tokens when text is entered directly""" if not text_input_val.strip(): return "No input text." counter = PROVIDERS[provider] result = counter(text_input_val) if isinstance(result, str): # Error message return result return f""" Provider: {provider} Characters: {len(text_input_val):,} Words: {len(text_input_val.split()):,} Tokens: {result:,} """ # ---------- UI ---------- with gr.Blocks(title="Advanced LLM Token Counter") as demo: gr.Markdown("## Advanced LLM Token Counter\nUpload a file or paste text to count tokens for various LLMs.") provider = gr.Dropdown( choices=list(PROVIDERS.keys()), value="GPT-4o", label="LLM Provider", multiselect=False ) file_input = gr.File( label="Upload .txt or .md file", file_types=[".txt", ".md"] ) text_input = gr.Textbox( label="Or paste text here", lines=8, placeholder="Paste text here if not uploading a file..." ) output = gr.Markdown() btn = gr.Button("Count Tokens") btn.click( fn=count_tokens, inputs=[provider, text_input, file_input], outputs=output ) # Add event for real-time token counting as user types text_input.change( fn=count_tokens_for_input, inputs=[provider, text_input], outputs=output ) demo.launch()