"""Gradio app that counts tokens in text using a Hugging Face model's two
sub-folder tokenizers (the layout used by diffusion models such as FLUX/SDXL).
"""

import json
from functools import lru_cache

import gradio as gr
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer


def get_tokenizer_names(model_name):
    """Return the class names of the model's two tokenizers.

    First tries the repo's ``model_index.json`` (a small metadata file that
    maps component names to ``[library, class_name]`` pairs); if that fails,
    falls back to loading the first tokenizer and inspecting its init kwargs.

    Args:
        model_name: Hugging Face repo id, e.g. "black-forest-labs/FLUX.1-dev".

    Returns:
        A ``(tokenizer_1_class, tokenizer_2_class)`` tuple of strings;
        "Unknown" is used for anything that cannot be determined.
    """
    try:
        # Cheap path: model_index.json is a tiny file, no tokenizer download.
        model_info_path = hf_hub_download(model_name, filename="model_index.json")
        with open(model_info_path, "r", encoding="utf-8") as f:
            model_info = json.load(f)
        # Entries look like {"tokenizer": ["transformers", "CLIPTokenizer"], ...}
        tokenizer_1_class = model_info.get("tokenizer", ["", "Unknown"])[1]
        tokenizer_2_class = model_info.get("tokenizer_2", ["", "Unknown"])[1]
        return tokenizer_1_class, tokenizer_2_class
    except Exception:
        # Fallback: load the tokenizer and read its recorded init kwargs.
        # (The original code passed the private `_from_auto=True` kwarg,
        # which is internal transformers API and unnecessary here.)
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="tokenizer")
            config = tokenizer.init_kwargs
            return (
                config.get("tokenizer_class", "Unknown"),
                config.get("tokenizer_2_class", "Unknown"),
            )
        except Exception:
            return "Unknown", "Unknown"


@lru_cache(maxsize=8)
def _load_tokenizers(model_name):
    """Load and memoize the two sub-folder tokenizers for *model_name*.

    Caching avoids re-downloading/re-instantiating the tokenizers on every
    button press for the same model.
    """
    tokenizer_1 = AutoTokenizer.from_pretrained(model_name, subfolder="tokenizer")
    tokenizer_2 = AutoTokenizer.from_pretrained(model_name, subfolder="tokenizer_2")
    return tokenizer_1, tokenizer_2


def count_tokens(model_name, text):
    """Tokenize *text* with both of the model's tokenizers and report counts.

    Args:
        model_name: Hugging Face repo id entered by the user.
        text: The text to tokenize.

    Returns:
        Two display strings, one per tokenizer, of the form
        "<TokenizerClass>: <N> tokens". On a load failure both outputs
        carry the error message instead of crashing the UI.
    """
    model_name = model_name.strip()
    try:
        tokenizer_1, tokenizer_2 = _load_tokenizers(model_name)
    except Exception as exc:
        # Surface the problem (bad repo id, missing subfolder, network error)
        # in the UI rather than raising into Gradio.
        message = f"Error loading tokenizers: {exc}"
        return message, message

    tokenizer_1_name, tokenizer_2_name = get_tokenizer_names(model_name)

    # NOTE: .tokenize() counts content tokens only; special tokens (BOS/EOS)
    # added at encode time are not included, matching the original behavior.
    count_1 = len(tokenizer_1.tokenize(text))
    count_2 = len(tokenizer_2.tokenize(text))

    return (
        f"{tokenizer_1_name}: {count_1} tokens",
        f"{tokenizer_2_name}: {count_2} tokens",
    )


# Build the Gradio interface: two text inputs, two text outputs.
iface = gr.Interface(
    fn=count_tokens,
    inputs=[
        gr.Textbox(label="Model Name", placeholder="e.g., black-forest-labs/FLUX.1-dev"),
        gr.Textbox(label="Text", placeholder="Enter text here..."),
    ],
    outputs=[
        gr.Textbox(label="Tokenizer 1"),
        gr.Textbox(label="Tokenizer 2"),
    ],
    title="Token Counter",
    description="Enter a Hugging Face model name and text to count tokens using the model's tokenizers.",
)

# Launch only when run as a script, not when imported as a module.
if __name__ == "__main__":
    iface.launch()