# Hugging Face Space: "Advanced LLM Token Counter" (page-status residue removed)
from pathlib import Path

import gradio as gr

# ---------- Tokenizers ----------
def count_openai(text, encoding_name="cl100k_base"):
    """Return the token count of *text* under the given tiktoken encoding."""
    import tiktoken

    encoder = tiktoken.get_encoding(encoding_name)
    tokens = encoder.encode(text)
    return len(tokens)
def count_openai_gpt41(text):
    """Token count for GPT-4.1 (uses the o200k_base encoding)."""
    return count_openai(text, encoding_name="o200k_base")
def count_openai_gpt4o(text):
    """Token count for GPT-4o and similar models (o200k_base encoding)."""
    return count_openai(text, encoding_name="o200k_base")
def count_openai_gpt5(text):
    """Token count for GPT-5 (uses the o200k_base encoding)."""
    return count_openai(text, encoding_name="o200k_base")
def count_hf(text, model_name):
    """Count tokens with an arbitrary HuggingFace tokenizer.

    Returns an int on success, or an "Error: ..." string on failure
    (callers detect errors via ``isinstance(result, str)``).
    """
    from transformers import AutoTokenizer

    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, use_fast=True, trust_remote_code=True
        )
        return len(tokenizer.encode(text))
    except Exception as e:
        return f"Error: {str(e)}"
def count_claude_approx(text):
    """Approximate a Claude token count (heuristic: 1 token ≈ 3.5 characters)."""
    estimated = len(text) / 3.5
    return int(estimated)
def count_claude_tokens(text, model_name="claude-3-5-sonnet"):
    """Approximate a Claude token count for *text*.

    Anthropic publishes no tokenizers, so a Llama tokenizer is used as a
    stand-in when transformers can load one, with the character-ratio
    heuristic as fallback. ``model_name`` is kept for interface
    compatibility but does not change the tokenizer used.
    """
    try:
        from transformers import AutoTokenizer

        # Bug fix: "Meta-Llama-3-7b-hf" is not a real repo (Llama 3 has no
        # 7B size), so loading always failed; use the 8B checkpoint.
        tok = AutoTokenizer.from_pretrained(
            "meta-llama/Meta-Llama-3-8B", use_fast=True
        )
        return len(tok.encode(text))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt still propagates.
        return count_claude_approx(text)
def count_mistral(text, model_name="mistralai/Mistral-7B-v0.1"):
    """Count tokens for Mistral models via their HuggingFace tokenizer.

    Falls back to tiktoken's cl100k_base encoding (a close approximation)
    when the HF tokenizer cannot be loaded.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        return len(tok.encode(text))
    except Exception:  # narrowed from bare except so Ctrl-C propagates
        # Fallback: use tiktoken (some Mistral models use tiktoken)
        import tiktoken

        enc = tiktoken.get_encoding("cl100k_base")  # Close approximation
        return len(enc.encode(text))
def count_qwen(text, model_name="Qwen/Qwen3"):
    """Count tokens for Qwen models using their HuggingFace tokenizer.

    NOTE(review): the default repo id "Qwen/Qwen3" may not exist on the
    Hub (Qwen3 repos carry a size suffix), in which case this always
    takes the cl100k_base fallback — verify against the Hub.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        return len(tok.encode(text))
    except Exception:  # narrowed from bare except so Ctrl-C propagates
        return count_openai(text, "cl100k_base")  # Fallback
def count_deepseek(text, model_name="deepseek-ai/DeepSeek-R1"):
    """Count tokens for DeepSeek models.

    DeepSeek models often ship Llama-style tokenizers; falls back to
    tiktoken's cl100k_base if the HF tokenizer cannot be loaded.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        return len(tok.encode(text))
    except Exception:  # narrowed from bare except so Ctrl-C propagates
        return count_openai(text, "cl100k_base")  # Fallback
def count_mixtral(text):
    """Token count for Mixtral 8x7B (delegates to the Mistral counter)."""
    mixtral_repo = "mistralai/Mixtral-8x7B-v0.1"
    return count_mistral(text, mixtral_repo)
def count_qwen_coder(text):
    """Token count for Qwen3-Coder.

    NOTE(review): "Qwen/Qwen3.5-Coder" does not look like a real Hub repo
    id, so this likely always hits count_qwen's fallback — verify.
    """
    coder_repo = "Qwen/Qwen3.5-Coder"
    return count_qwen(text, coder_repo)
def count_mistral_nemo(text):
    """Count tokens for Mistral NeMo (Tekken tokenizer).

    Falls back to the Mistral 7B tokenizer path when loading fails.
    """
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(
            "mistralai/Mistral-Nemo-Base-2407", trust_remote_code=True
        )
        return len(tok.encode(text))
    except Exception:  # narrowed from bare except so Ctrl-C propagates
        return count_mistral(text, "mistralai/Mistral-7B-v0.1")
def count_stepfun(text, model_name="stepfun-ai/Step-1-8B"):
    """Count tokens for Stepfun models; cl100k_base approximation on failure."""
    try:
        from transformers import AutoTokenizer

        tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        return len(tok.encode(text))
    except Exception:  # narrowed from bare except so Ctrl-C propagates
        return count_openai(text, "cl100k_base")  # Fallback
def count_perplexity_sonar(text):
    """Token count for Perplexity Sonar (often based on DeepSeek V3)."""
    sonar_base = "deepseek-ai/DeepSeek-V3"
    return count_deepseek(text, sonar_base)
# Updated PROVIDERS with many more models.
# Maps a display name to a callable(text) returning a token count (int) or
# an error string. Zero-argument wrappers are referenced directly; lambdas
# are kept only where a model id must be bound.
PROVIDERS = {
    # OpenAI Models
    "GPT-4o": count_openai_gpt4o,
    "GPT-4.1": count_openai_gpt41,
    "GPT-5": count_openai_gpt5,
    "ChatGPT (GPT-3.5)": lambda t: count_openai(t, "cl100k_base"),
    # Anthropic Claude Models
    "Claude Sonnet 3.5": lambda t: count_claude_tokens(t, "claude-3-5-sonnet"),
    "Claude Sonnet 4.5": lambda t: count_claude_tokens(t, "claude-4-5-sonnet"),  # Approximation
    "Claude Opus": lambda t: count_claude_tokens(t, "claude-opus"),
    "Claude Haiku": lambda t: count_claude_tokens(t, "claude-haiku"),
    # Mistral Models
    "Mistral 7B": lambda t: count_mistral(t, "mistralai/Mistral-7B-v0.1"),
    "Mistral Large": lambda t: count_mistral(t, "mistralai/Mistral-Large-2407"),
    "Mistral NeMo": count_mistral_nemo,
    "Mixtral 8x7B": count_mixtral,
    "Mixtral 8x22B": lambda t: count_mistral(t, "mistralai/Mixtral-8x22B-Instruct-v0.1"),
    # Qwen Models
    "Qwen3 (Base)": lambda t: count_qwen(t, "Qwen/Qwen3"),
    "Qwen3 Coder": count_qwen_coder,
    "Qwen2.5 Coder": lambda t: count_qwen(t, "Qwen/Qwen2.5-Coder-7B-Instruct"),
    "Qwen Max": lambda t: count_qwen(t, "Qwen/Qwen-Max"),
    # DeepSeek Models
    "DeepSeek R1": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-R1"),
    "DeepSeek V3": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-V3"),
    "DeepSeek Coder": lambda t: count_deepseek(t, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"),
    # Other Models
    # Bug fix: the Llama repo ids used "Meta-Llama-3-1"/"3-2" which do not
    # exist on the Hub; corrected to the official 3.1 / 3.2 instruct repos.
    "Llama-3.1 8B": lambda t: count_hf(t, "meta-llama/Llama-3.1-8B-Instruct"),
    "Llama-3.2 3B": lambda t: count_hf(t, "meta-llama/Llama-3.2-3B-Instruct"),
    "Gemma 2 9B": lambda t: count_hf(t, "google/gemma-2-9b-it"),
    "Gemma 2 27B": lambda t: count_hf(t, "google/gemma-2-27b-it"),
    "StarCoder 2": lambda t: count_hf(t, "bigcode/starcoder2-15b-instruct-v0.1"),
    "CodeLlama": lambda t: count_hf(t, "codellama/CodeLlama-7b-Instruct-hf"),
    # Specialized Models
    "Perplexity Sonar": count_perplexity_sonar,
    "Stepfun Step-1": lambda t: count_stepfun(t, "stepfun-ai/Step-1-8B"),
    # Approximation methods
    "Claude (approx)": count_claude_approx,
}
# ---------- Core logic ----------
def count_tokens(provider, text, file):
    """Count tokens in pasted text or an uploaded file.

    An uploaded file takes precedence over the textbox contents.
    Returns a markdown summary string, or a plain error/notice string.
    """
    if file is not None:
        # gr.File may supply a tempfile-like object (with .name) or, in
        # newer Gradio versions, a plain filepath string — accept both.
        path = Path(getattr(file, "name", file))
        try:
            text = path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Fallback for non-UTF-8 files; latin-1 decodes any byte sequence.
            text = path.read_text(encoding="latin-1")
    # Guard against None: Gradio may pass None before any input exists.
    if not text or not text.strip():
        return "No input text."
    counter = PROVIDERS[provider]
    result = counter(text)
    if isinstance(result, str):  # error message from a tokenizer backend
        return result
    return f"""
Provider: {provider}
Characters: {len(text):,}
Words: {len(text.split()):,}
Tokens: {result:,}
"""
def count_tokens_for_input(provider, text_input_val):
    """Count tokens for text typed directly into the textbox.

    Returns a markdown summary string, or a plain error/notice string.
    """
    # Guard against None: Gradio may emit None before any input exists.
    if not text_input_val or not text_input_val.strip():
        return "No input text."
    counter = PROVIDERS[provider]
    result = counter(text_input_val)
    if isinstance(result, str):  # error message from a tokenizer backend
        return result
    return f"""
Provider: {provider}
Characters: {len(text_input_val):,}
Words: {len(text_input_val.split()):,}
Tokens: {result:,}
"""
# ---------- UI ----------
with gr.Blocks(title="Advanced LLM Token Counter") as demo:
    gr.Markdown("## Advanced LLM Token Counter\nUpload a file or paste text to count tokens for various LLMs.")
    provider = gr.Dropdown(
        choices=list(PROVIDERS.keys()),
        value="GPT-4o",
        label="LLM Provider",
        multiselect=False,
    )
    file_input = gr.File(
        label="Upload .txt or .md file",
        file_types=[".txt", ".md"],
    )
    text_input = gr.Textbox(
        label="Or paste text here",
        lines=8,
        placeholder="Paste text here if not uploading a file...",
    )
    output = gr.Markdown()
    btn = gr.Button("Count Tokens")
    btn.click(
        fn=count_tokens,
        inputs=[provider, text_input, file_input],
        outputs=output,
    )
    # Re-count live as the user types.
    text_input.change(
        fn=count_tokens_for_input,
        inputs=[provider, text_input],
        outputs=output,
    )
    # Bug fix: also re-count when the provider changes, otherwise the
    # displayed count goes stale after switching models.
    provider.change(
        fn=count_tokens_for_input,
        inputs=[provider, text_input],
        outputs=output,
    )

if __name__ == "__main__":
    # Guarded so importing this module does not start the server.
    demo.launch()