HeshamHaroon committed on
Commit
f32f063
·
verified ·
1 Parent(s): 2b73d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1502 -92
app.py CHANGED
@@ -1,111 +1,1521 @@
1
- from gradio import Interface
 
 
 
 
 
 
 
 
 
 
 
2
  import gradio as gr
3
- import aranizer
4
- from aranizer import (
5
- aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k,
6
- aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
7
- )
8
- from transformers import AutoTokenizer, logging
9
- from huggingface_hub import login
10
  import os
11
 
12
- # Retrieve your Hugging Face token from the environment variable
13
  HF_TOKEN = os.getenv('HF_TOKEN')
14
-
15
  if HF_TOKEN:
16
- HF_TOKEN = HF_TOKEN.strip() # Remove any leading or trailing whitespace/newlines
 
17
  login(token=HF_TOKEN)
18
 
19
- # Load additional tokenizers from transformers
20
- gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
21
- gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
22
- jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
23
- arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
24
-
25
- # Try to load the gated tokenizer
26
- try:
27
- meta_llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
28
- except Exception as e:
29
- meta_llama_tokenizer = None
30
- logging.warning(f"Could not load meta-llama/Meta-Llama-3-8B tokenizer: {e}")
31
-
32
- # List of available tokenizers and a dictionary to load them
33
- tokenizer_options = [
34
- "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
35
- "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
36
- "FreedomIntelligence/AceGPT-13B",
37
- "FreedomIntelligence/AceGPT-7B",
38
- "inception-mbzuai/jais-13b",
39
- "aubmindlab/bert-base-arabertv2"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ]
41
 
42
- if meta_llama_tokenizer:
43
- tokenizer_options.append("meta-llama/Meta-Llama-3-8B")
44
-
45
- tokenizers = {
46
- "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
47
- "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
48
- "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
49
- "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
50
- "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
51
- "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
52
- "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
53
- "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
54
- "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
55
- "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
56
- "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer
57
- }
58
-
59
- if meta_llama_tokenizer:
60
- tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer
61
-
62
- def compare_tokenizers(tokenizer_index, text):
63
- tokenizer_name = tokenizer_options[tokenizer_index]
64
- tokenizer = tokenizers[tokenizer_name]()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  tokens = tokenizer.tokenize(text)
66
- encoded_output = tokenizer.encode(text, add_special_tokens=True)
67
- decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- # Ensure the tokens are properly decoded
70
- tokens_display = [token.encode('utf-8').decode('utf-8') if isinstance(token, bytes) else token for token in tokens]
 
71
 
72
- # Prepare the results to be displayed in HTML format
73
- tokens_html = "".join([
74
- f"<span style='background-color:#eeeeee; color: #333333; padding:4px; margin:2px; border-radius:3px; border:1px solid #cccccc;'>{token}</span>"
75
- for token in tokens_display
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  ])
77
- encoded_html = "".join([
78
- f"<span style='background-color:#e0e0e0; color: #000000; padding:4px; margin:2px; border-radius:3px; border:1px solid #aaaaaa;'>{token}</span>"
79
- for token in encoded_output
 
80
  ])
81
- decoded_html = f"<div style='background-color:#f5f5f5; color: #444444; padding:10px; border-radius:3px; border:1px solid #999999;'>{decoded_text}</div>"
82
-
83
- results_html = f"""
84
- <div style='font-family: Arial, sans-serif;'>
85
- <h3 style='color: #2e7d32;'>Tokenizer: {tokenizer_name}</h3>
86
- <p><strong>Tokens:</strong> {tokens_html}</p>
87
- <p><strong>Encoded:</strong> {encoded_html}</p>
88
- <p><strong>Decoded:</strong> {decoded_html}</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  </div>
90
- """
91
- return results_html
92
 
93
- # Define the Gradio interface components with a dropdown for model selection
94
- inputs_component = [
95
- gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer", type="index"),
96
- gr.Textbox(lines=2, placeholder="اكتب النص هنا...", label="Input Text")
97
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- outputs_component = gr.HTML(label="Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # Setting up the interface
102
- iface = Interface(
103
- fn=compare_tokenizers,
104
- inputs=inputs_component,
105
- outputs=outputs_component,
106
- title="Arabic Tokenizer Arena",
107
- live=True
108
- )
109
 
110
- # Launching the Gradio app
111
- iface.launch()
 
 
1
+ """
2
+ Arabic Tokenizer Arena Pro - Advanced Arabic Tokenization Analysis Platform
3
+ ============================================================================
4
+ A comprehensive research and production-grade tool for evaluating Arabic tokenizers
5
+ across multiple dimensions: efficiency, coverage, morphological awareness, and more.
6
+
7
+ Supports:
8
+ - Arabic-specific tokenizers (Aranizer, AraBERT, CAMeLBERT, MARBERT, etc.)
9
+ - Major LLM tokenizers (Jais, AceGPT, Falcon-Arabic, ALLaM, Qwen, Llama, Mistral, GPT)
10
+ - Comprehensive evaluation metrics based on latest research
11
+ """
12
+
13
  import gradio as gr
14
+ import json
15
+ import re
16
+ import time
17
+ import unicodedata
18
+ from typing import Dict, List, Tuple, Optional, Any
19
+ from dataclasses import dataclass, field
20
+ from enum import Enum
21
  import os
22
 
23
# Hugging Face authentication
# A token is only required for gated repos (e.g. meta-llama); without one the
# public tokenizers below still load fine.
HF_TOKEN = os.getenv('HF_TOKEN')

if HF_TOKEN:
    HF_TOKEN = HF_TOKEN.strip()  # env values often carry stray whitespace/newlines
    # Imported here so huggingface_hub login is only touched when a token exists.
    from huggingface_hub import login
    login(token=HF_TOKEN)

from transformers import AutoTokenizer, logging
logging.set_verbosity_error()  # silence per-model load warnings in the console
32
+
33
+ # ============================================================================
34
+ # DATA CLASSES AND ENUMS
35
+ # ============================================================================
36
+
37
class TokenizerType(Enum):
    """High-level category of the model a tokenizer belongs to.

    Used purely for display/grouping in the UI; the values are the
    human-readable labels shown to the user.
    """
    ARABIC_SPECIFIC = "Arabic-Specific"
    MULTILINGUAL_LLM = "Multilingual LLM"
    ARABIC_LLM = "Arabic LLM"
    ENCODER_ONLY = "Encoder-Only (BERT)"
    DECODER_ONLY = "Decoder-Only (GPT)"
43
+
44
class TokenizerAlgorithm(Enum):
    """Subword segmentation algorithm family a tokenizer implements.

    Display-only metadata; values are the labels shown in the UI.
    """
    BPE = "Byte-Pair Encoding (BPE)"
    BBPE = "Byte-Level BPE"
    WORDPIECE = "WordPiece"
    SENTENCEPIECE = "SentencePiece"
    UNIGRAM = "Unigram"
    TIKTOKEN = "Tiktoken"
51
+
52
@dataclass
class TokenizerInfo:
    """Static metadata describing one tokenizer entry in the registry.

    Instances are declared in TOKENIZER_REGISTRY / GATED_MODELS and are used
    for dropdown labels and the information cards in the UI.
    """
    name: str            # human-readable display name (e.g. "AraBERT v2")
    model_id: str        # Hugging Face Hub repo id used to load the tokenizer
    type: TokenizerType
    algorithm: TokenizerAlgorithm
    vocab_size: int      # nominal vocabulary size (as documented, not probed)
    description: str
    organization: str    # publishing org, shown next to the name in dropdowns
    arabic_support: str  # Native, Adapted, Limited
    dialect_support: List[str] = field(default_factory=list)    # e.g. ["MSA", "Gulf"]
    special_features: List[str] = field(default_factory=list)   # free-form bullet points
65
+
66
@dataclass
class TokenizationMetrics:
    """Comprehensive tokenization evaluation metrics.

    A plain result container filled by analyze_tokenization(); every field is
    computed for one (text, tokenizer) pair.
    """
    # Basic counts
    total_tokens: int
    total_words: int       # whitespace-separated chunks of the input text
    total_characters: int
    total_bytes: int       # UTF-8 byte length of the input

    # Efficiency metrics
    fertility: float           # tokens per word (lower is better, 1.0 is ideal)
    compression_ratio: float   # bytes per token (higher is better)
    char_per_token: float      # characters per token

    # Coverage metrics
    oov_count: int                       # out-of-vocabulary tokens (UNK)
    oov_percentage: float                # oov_count / total_tokens * 100
    single_token_words: int              # words tokenized as single token
    single_token_retention_rate: float   # STRR metric: single_token_words / total_words

    # Morphological metrics
    avg_subwords_per_word: float
    max_subwords_per_word: int
    continued_words_ratio: float   # fraction of words split into multiple tokens

    # Arabic-specific metrics
    arabic_char_count: int
    arabic_token_count: int      # tokens containing at least one Arabic character
    arabic_fertility: float      # arabic tokens per arabic word
    diacritic_preservation: bool # True if decode round-trips tashkeel presence

    # Performance metrics
    tokenization_time_ms: float

    # Token details (raw material the metrics were derived from)
    tokens: List[str] = field(default_factory=list)
    token_ids: List[int] = field(default_factory=list)
    decoded_text: str = ""
104
+
105
+ # ============================================================================
106
+ # TOKENIZER REGISTRY - Comprehensive list of Arabic tokenizers
107
+ # ============================================================================
108
+
109
+ TOKENIZER_REGISTRY: Dict[str, TokenizerInfo] = {
110
+ # ========== ARABIC-SPECIFIC BERT MODELS ==========
111
+ "aubmindlab/bert-base-arabertv2": TokenizerInfo(
112
+ name="AraBERT v2",
113
+ model_id="aubmindlab/bert-base-arabertv2",
114
+ type=TokenizerType.ENCODER_ONLY,
115
+ algorithm=TokenizerAlgorithm.WORDPIECE,
116
+ vocab_size=64000,
117
+ description="Arabic BERT with Farasa segmentation, optimized for MSA",
118
+ organization="AUB MIND Lab",
119
+ arabic_support="Native",
120
+ dialect_support=["MSA"],
121
+ special_features=["Farasa preprocessing", "Morphological segmentation"]
122
+ ),
123
+ "aubmindlab/bert-large-arabertv2": TokenizerInfo(
124
+ name="AraBERT v2 Large",
125
+ model_id="aubmindlab/bert-large-arabertv2",
126
+ type=TokenizerType.ENCODER_ONLY,
127
+ algorithm=TokenizerAlgorithm.WORDPIECE,
128
+ vocab_size=64000,
129
+ description="Large Arabic BERT with enhanced capacity",
130
+ organization="AUB MIND Lab",
131
+ arabic_support="Native",
132
+ dialect_support=["MSA"],
133
+ special_features=["Large model", "Farasa preprocessing"]
134
+ ),
135
+ "CAMeL-Lab/bert-base-arabic-camelbert-mix": TokenizerInfo(
136
+ name="CAMeLBERT Mix",
137
+ model_id="CAMeL-Lab/bert-base-arabic-camelbert-mix",
138
+ type=TokenizerType.ENCODER_ONLY,
139
+ algorithm=TokenizerAlgorithm.WORDPIECE,
140
+ vocab_size=30000,
141
+ description="Pre-trained on MSA, DA, and Classical Arabic mix",
142
+ organization="CAMeL Lab NYU Abu Dhabi",
143
+ arabic_support="Native",
144
+ dialect_support=["MSA", "DA", "CA"],
145
+ special_features=["Multi-variant Arabic", "Classical Arabic support"]
146
+ ),
147
+ "CAMeL-Lab/bert-base-arabic-camelbert-msa": TokenizerInfo(
148
+ name="CAMeLBERT MSA",
149
+ model_id="CAMeL-Lab/bert-base-arabic-camelbert-msa",
150
+ type=TokenizerType.ENCODER_ONLY,
151
+ algorithm=TokenizerAlgorithm.WORDPIECE,
152
+ vocab_size=30000,
153
+ description="Specialized for Modern Standard Arabic",
154
+ organization="CAMeL Lab NYU Abu Dhabi",
155
+ arabic_support="Native",
156
+ dialect_support=["MSA"],
157
+ special_features=["MSA optimized"]
158
+ ),
159
+ "CAMeL-Lab/bert-base-arabic-camelbert-da": TokenizerInfo(
160
+ name="CAMeLBERT DA",
161
+ model_id="CAMeL-Lab/bert-base-arabic-camelbert-da",
162
+ type=TokenizerType.ENCODER_ONLY,
163
+ algorithm=TokenizerAlgorithm.WORDPIECE,
164
+ vocab_size=30000,
165
+ description="Specialized for Dialectal Arabic",
166
+ organization="CAMeL Lab NYU Abu Dhabi",
167
+ arabic_support="Native",
168
+ dialect_support=["Egyptian", "Gulf", "Levantine", "Maghrebi"],
169
+ special_features=["Dialect optimized"]
170
+ ),
171
+ "CAMeL-Lab/bert-base-arabic-camelbert-ca": TokenizerInfo(
172
+ name="CAMeLBERT CA",
173
+ model_id="CAMeL-Lab/bert-base-arabic-camelbert-ca",
174
+ type=TokenizerType.ENCODER_ONLY,
175
+ algorithm=TokenizerAlgorithm.WORDPIECE,
176
+ vocab_size=30000,
177
+ description="Specialized for Classical Arabic",
178
+ organization="CAMeL Lab NYU Abu Dhabi",
179
+ arabic_support="Native",
180
+ dialect_support=["Classical"],
181
+ special_features=["Classical Arabic", "Religious texts"]
182
+ ),
183
+ "UBC-NLP/MARBERT": TokenizerInfo(
184
+ name="MARBERT",
185
+ model_id="UBC-NLP/MARBERT",
186
+ type=TokenizerType.ENCODER_ONLY,
187
+ algorithm=TokenizerAlgorithm.WORDPIECE,
188
+ vocab_size=100000,
189
+ description="Multi-dialectal Arabic BERT trained on Twitter data",
190
+ organization="UBC NLP",
191
+ arabic_support="Native",
192
+ dialect_support=["MSA", "Egyptian", "Gulf", "Levantine", "Maghrebi"],
193
+ special_features=["Twitter data", "100K vocabulary", "Multi-dialect"]
194
+ ),
195
+ "UBC-NLP/ARBERT": TokenizerInfo(
196
+ name="ARBERT",
197
+ model_id="UBC-NLP/ARBERT",
198
+ type=TokenizerType.ENCODER_ONLY,
199
+ algorithm=TokenizerAlgorithm.WORDPIECE,
200
+ vocab_size=100000,
201
+ description="Arabic BERT focused on MSA with large vocabulary",
202
+ organization="UBC NLP",
203
+ arabic_support="Native",
204
+ dialect_support=["MSA"],
205
+ special_features=["100K vocabulary", "MSA focused"]
206
+ ),
207
+
208
+ # ========== ARABIC-SPECIFIC LLMs ==========
209
+ "inception-mbzuai/jais-13b": TokenizerInfo(
210
+ name="Jais 13B",
211
+ model_id="inception-mbzuai/jais-13b",
212
+ type=TokenizerType.ARABIC_LLM,
213
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
214
+ vocab_size=84992,
215
+ description="World's most advanced Arabic LLM, trained from scratch",
216
+ organization="Inception/MBZUAI",
217
+ arabic_support="Native",
218
+ dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
219
+ special_features=["Arabic-first", "Lowest fertility", "UAE-native"]
220
+ ),
221
+ "inceptionai/jais-family-30b-8k-chat": TokenizerInfo(
222
+ name="Jais 30B Chat",
223
+ model_id="inceptionai/jais-family-30b-8k-chat",
224
+ type=TokenizerType.ARABIC_LLM,
225
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
226
+ vocab_size=84992,
227
+ description="Enhanced 30B version with chat capabilities",
228
+ organization="Inception AI",
229
+ arabic_support="Native",
230
+ dialect_support=["MSA", "Gulf", "Egyptian", "Levantine"],
231
+ special_features=["30B parameters", "Chat optimized", "8K context"]
232
+ ),
233
+ "FreedomIntelligence/AceGPT-13B": TokenizerInfo(
234
+ name="AceGPT 13B",
235
+ model_id="FreedomIntelligence/AceGPT-13B",
236
+ type=TokenizerType.ARABIC_LLM,
237
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
238
+ vocab_size=32000,
239
+ description="Arabic-enhanced LLaMA with cultural alignment",
240
+ organization="Freedom Intelligence",
241
+ arabic_support="Adapted",
242
+ dialect_support=["MSA"],
243
+ special_features=["LLaMA-based", "Cultural alignment", "RLHF"]
244
+ ),
245
+ "FreedomIntelligence/AceGPT-7B": TokenizerInfo(
246
+ name="AceGPT 7B",
247
+ model_id="FreedomIntelligence/AceGPT-7B",
248
+ type=TokenizerType.ARABIC_LLM,
249
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
250
+ vocab_size=32000,
251
+ description="Smaller Arabic-enhanced LLaMA variant",
252
+ organization="Freedom Intelligence",
253
+ arabic_support="Adapted",
254
+ dialect_support=["MSA"],
255
+ special_features=["LLaMA-based", "Efficient"]
256
+ ),
257
+
258
+ # ========== MULTILINGUAL LLMs WITH ARABIC ==========
259
+ "Qwen/Qwen2.5-7B": TokenizerInfo(
260
+ name="Qwen 2.5 7B",
261
+ model_id="Qwen/Qwen2.5-7B",
262
+ type=TokenizerType.MULTILINGUAL_LLM,
263
+ algorithm=TokenizerAlgorithm.BPE,
264
+ vocab_size=151936,
265
+ description="Alibaba's multilingual LLM with 30+ language support",
266
+ organization="Alibaba Qwen",
267
+ arabic_support="Supported",
268
+ dialect_support=["MSA"],
269
+ special_features=["152K vocab", "128K context", "30+ languages"]
270
+ ),
271
+ "google/gemma-2-9b": TokenizerInfo(
272
+ name="Gemma 2 9B",
273
+ model_id="google/gemma-2-9b",
274
+ type=TokenizerType.MULTILINGUAL_LLM,
275
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
276
+ vocab_size=256000,
277
+ description="Google's efficient multilingual model",
278
+ organization="Google",
279
+ arabic_support="Supported",
280
+ dialect_support=["MSA"],
281
+ special_features=["256K vocab", "Efficient architecture"]
282
+ ),
283
+ "mistralai/Mistral-7B-v0.3": TokenizerInfo(
284
+ name="Mistral 7B v0.3",
285
+ model_id="mistralai/Mistral-7B-v0.3",
286
+ type=TokenizerType.MULTILINGUAL_LLM,
287
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
288
+ vocab_size=32768,
289
+ description="Efficient multilingual model with sliding window attention",
290
+ organization="Mistral AI",
291
+ arabic_support="Limited",
292
+ dialect_support=["MSA"],
293
+ special_features=["Sliding window", "Efficient"]
294
+ ),
295
+ "mistralai/Mistral-Nemo-Base-2407": TokenizerInfo(
296
+ name="Mistral Nemo",
297
+ model_id="mistralai/Mistral-Nemo-Base-2407",
298
+ type=TokenizerType.MULTILINGUAL_LLM,
299
+ algorithm=TokenizerAlgorithm.TIKTOKEN,
300
+ vocab_size=131072,
301
+ description="Uses Tekken tokenizer, optimized for multilingual",
302
+ organization="Mistral AI + NVIDIA",
303
+ arabic_support="Supported",
304
+ dialect_support=["MSA"],
305
+ special_features=["Tekken tokenizer", "131K vocab", "Multilingual optimized"]
306
+ ),
307
+ "google/mt5-base": TokenizerInfo(
308
+ name="mT5 Base",
309
+ model_id="google/mt5-base",
310
+ type=TokenizerType.MULTILINGUAL_LLM,
311
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
312
+ vocab_size=250112,
313
+ description="Multilingual T5 covering 101 languages",
314
+ organization="Google",
315
+ arabic_support="Supported",
316
+ dialect_support=["MSA"],
317
+ special_features=["250K vocab", "101 languages", "Seq2Seq"]
318
+ ),
319
+ "xlm-roberta-base": TokenizerInfo(
320
+ name="XLM-RoBERTa Base",
321
+ model_id="xlm-roberta-base",
322
+ type=TokenizerType.MULTILINGUAL_LLM,
323
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
324
+ vocab_size=250002,
325
+ description="Cross-lingual model covering 100 languages",
326
+ organization="Facebook AI",
327
+ arabic_support="Supported",
328
+ dialect_support=["MSA"],
329
+ special_features=["250K vocab", "100 languages"]
330
+ ),
331
+ "bert-base-multilingual-cased": TokenizerInfo(
332
+ name="mBERT",
333
+ model_id="bert-base-multilingual-cased",
334
+ type=TokenizerType.MULTILINGUAL_LLM,
335
+ algorithm=TokenizerAlgorithm.WORDPIECE,
336
+ vocab_size=119547,
337
+ description="Original multilingual BERT, baseline for comparison",
338
+ organization="Google",
339
+ arabic_support="Limited",
340
+ dialect_support=["MSA"],
341
+ special_features=["Baseline model", "104 languages"]
342
+ ),
343
+ }
344
+
345
+ # Try to load gated models
346
+ GATED_MODELS = [
347
+ ("meta-llama/Meta-Llama-3-8B", TokenizerInfo(
348
+ name="Llama 3 8B",
349
+ model_id="meta-llama/Meta-Llama-3-8B",
350
+ type=TokenizerType.MULTILINGUAL_LLM,
351
+ algorithm=TokenizerAlgorithm.BPE,
352
+ vocab_size=128256,
353
+ description="Meta's latest LLM with improved multilingual",
354
+ organization="Meta AI",
355
+ arabic_support="Limited",
356
+ dialect_support=["MSA"],
357
+ special_features=["128K vocab", "Improved tokenizer"]
358
+ )),
359
+ ("meta-llama/Llama-2-7b-hf", TokenizerInfo(
360
+ name="Llama 2 7B",
361
+ model_id="meta-llama/Llama-2-7b-hf",
362
+ type=TokenizerType.MULTILINGUAL_LLM,
363
+ algorithm=TokenizerAlgorithm.SENTENCEPIECE,
364
+ vocab_size=32000,
365
+ description="Meta's Llama 2 base model",
366
+ organization="Meta AI",
367
+ arabic_support="Limited",
368
+ dialect_support=["MSA"],
369
+ special_features=["32K vocab", "Foundation model"]
370
+ )),
371
+ ("tiiuae/falcon-7b", TokenizerInfo(
372
+ name="Falcon 7B",
373
+ model_id="tiiuae/falcon-7b",
374
+ type=TokenizerType.MULTILINGUAL_LLM,
375
+ algorithm=TokenizerAlgorithm.BPE,
376
+ vocab_size=65024,
377
+ description="TII's powerful open-source LLM",
378
+ organization="Technology Innovation Institute",
379
+ arabic_support="Limited",
380
+ dialect_support=["MSA"],
381
+ special_features=["65K vocab", "RefinedWeb trained"]
382
+ )),
383
  ]
384
 
385
+ # ============================================================================
386
+ # TOKENIZER LOADER AND CACHE
387
+ # ============================================================================
388
+
389
class TokenizerManager:
    """Loads, caches, and exposes the tokenizers that are usable at runtime.

    On construction it probes every entry in TOKENIZER_REGISTRY plus the
    gated models in GATED_MODELS by actually loading the tokenizer; entries
    that fail (gated access denied, network error, ...) are dropped so the UI
    only ever offers working choices.

    Fix vs. original: the availability probe used to load each tokenizer and
    then throw the instance away, forcing a second full download on first
    real use. The probe result is now stored in the cache directly.
    """

    def __init__(self):
        # model_id -> loaded tokenizer instance (filled by the probe and lazily)
        self._cache: Dict[str, Any] = {}
        # model_id -> TokenizerInfo for every tokenizer that loaded successfully
        self._available: Dict[str, "TokenizerInfo"] = {}
        self._initialize_available_tokenizers()

    def _initialize_available_tokenizers(self):
        """Probe which tokenizers can actually be loaded and record them."""
        print("Initializing tokenizer registry...")

        # Public registry entries first, then gated models — same probe logic.
        for model_id, info in TOKENIZER_REGISTRY.items():
            self._probe(model_id, info)
        for model_id, info in GATED_MODELS:
            self._probe(model_id, info, gated=True)

        print(f"\nTotal available tokenizers: {len(self._available)}")

    def _probe(self, model_id: str, info: "TokenizerInfo", gated: bool = False):
        """Try to load one tokenizer; on success keep both metadata and instance."""
        suffix = " (gated)" if gated else ""
        try:
            tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception as e:
            # Best-effort: an unavailable tokenizer is simply not offered.
            print(f" ✗ {info.name}{suffix}: {str(e)[:50]}")
        else:
            self._cache[model_id] = tok  # reuse the probe result — no reload later
            self._available[model_id] = info
            print(f" ✓ {info.name}{suffix}")

    def get_tokenizer(self, model_id: str):
        """Return a cached tokenizer for *model_id*, loading it on first use."""
        if model_id not in self._cache:
            self._cache[model_id] = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True
            )
        return self._cache[model_id]

    def get_available_tokenizers(self) -> Dict[str, "TokenizerInfo"]:
        """Mapping of model_id -> TokenizerInfo for all loadable tokenizers."""
        return self._available

    def get_tokenizer_choices(self) -> List[str]:
        """Display labels ("Name (Organization)") for the UI dropdown."""
        return [f"{info.name} ({info.organization})" for info in self._available.values()]

    def get_model_id_from_choice(self, choice: str) -> str:
        """Map a dropdown label back to its model id.

        Unknown labels fall back to the first available model id (matches the
        dropdown's insertion order); raises IndexError if nothing is available.
        """
        for model_id, info in self._available.items():
            if f"{info.name} ({info.organization})" == choice:
                return model_id
        return list(self._available.keys())[0]
444
+
445
+ # Global tokenizer manager
446
+ tokenizer_manager = TokenizerManager()
447
+
448
+ # ============================================================================
449
+ # ARABIC TEXT UTILITIES
450
+ # ============================================================================
451
+
452
def is_arabic_char(char: str) -> bool:
    """Return True when *char* is exactly one character in an Arabic Unicode block."""
    if len(char) != 1:
        return False
    cp = ord(char)
    ranges = (
        (0x0600, 0x06FF),  # Arabic
        (0x0750, 0x077F),  # Arabic Supplement
        (0x08A0, 0x08FF),  # Arabic Extended-A
        (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
        (0xFE70, 0xFEFF),  # Arabic Presentation Forms-B
    )
    return any(lo <= cp <= hi for lo, hi in ranges)
464
+
465
+ def count_arabic_chars(text: str) -> int:
466
+ """Count Arabic characters in text"""
467
+ return sum(1 for c in text if is_arabic_char(c))
468
+
469
def has_diacritics(text: str) -> bool:
    """Return True if *text* contains any Arabic diacritic (tashkeel) mark."""
    tashkeel = frozenset('ًٌٍَُِّْٰ')
    # Non-empty intersection with the tashkeel set <=> at least one diacritic.
    return not tashkeel.isdisjoint(text)
473
+
474
def normalize_arabic(text: str) -> str:
    """Basic Arabic orthographic normalization.

    Collapses common spelling variants to a canonical form:
    hamzated/madda alef (إ أ آ) -> bare alef (ا), alef maqsura (ى) -> yeh (ي),
    and teh marbuta (ة) -> heh (ه). All other characters pass through.
    """
    # All replacements are single-char -> single-char, so one C-level
    # str.translate pass replaces the three chained re.sub calls of the
    # original while producing identical output.
    table = str.maketrans({
        'إ': 'ا', 'أ': 'ا', 'آ': 'ا',  # alef variants -> bare alef
        'ى': 'ي',                       # alef maqsura -> yeh
        'ة': 'ه',                       # teh marbuta -> heh
    })
    return text.translate(table)
483
+
484
def get_arabic_words(text: str) -> List[str]:
    """Whitespace-split *text* and keep only words with at least one Arabic char."""
    def _contains_arabic(word: str) -> bool:
        return any(is_arabic_char(ch) for ch in word)

    return [word for word in text.split() if _contains_arabic(word)]
489
+
490
+ # ============================================================================
491
+ # TOKENIZATION ANALYSIS ENGINE
492
+ # ============================================================================
493
+
494
def analyze_tokenization(
    text: str,
    model_id: str,
    tokenizer_info: TokenizerInfo
) -> TokenizationMetrics:
    """Perform comprehensive tokenization analysis.

    Tokenizes *text* with the tokenizer identified by *model_id* (loaded via
    the global tokenizer_manager) and derives efficiency, coverage,
    morphological, and Arabic-specific metrics from the result.

    NOTE(review): *tokenizer_info* is accepted but never read in this body —
    confirm whether callers rely on it before removing.
    """

    tokenizer = tokenizer_manager.get_tokenizer(model_id)

    # Time the tokenization (tokenize + encode only; decode is excluded)
    start_time = time.perf_counter()
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    tokenization_time = (time.perf_counter() - start_time) * 1000  # ms

    decoded = tokenizer.decode(token_ids, skip_special_tokens=True)

    # Basic counts — "words" are whitespace-separated chunks, punctuation included.
    words = text.split()
    total_words = len(words)
    total_tokens = len(tokens)
    total_characters = len(text)
    total_bytes = len(text.encode('utf-8'))

    # Efficiency metrics (max(..., 1) guards empty input against ZeroDivisionError)
    fertility = total_tokens / max(total_words, 1)
    compression_ratio = total_bytes / max(total_tokens, 1)
    char_per_token = total_characters / max(total_tokens, 1)

    # OOV analysis — counts explicit UNK tokens emitted by the tokenizer.
    unk_token = tokenizer.unk_token if hasattr(tokenizer, 'unk_token') else '[UNK]'
    oov_count = sum(1 for t in tokens if t == unk_token or '[UNK]' in str(t))
    oov_percentage = (oov_count / max(total_tokens, 1)) * 100

    # Single Token Retention Rate (STRR): share of words kept as one token.
    # Each word is re-tokenized in isolation, which can differ slightly from
    # its tokenization in context (context-sensitive merges).
    single_token_words = 0
    subwords_per_word = []

    for word in words:
        word_tokens = tokenizer.tokenize(word)
        subwords_per_word.append(len(word_tokens))
        if len(word_tokens) == 1:
            single_token_words += 1

    strr = single_token_words / max(total_words, 1)
    avg_subwords = sum(subwords_per_word) / max(len(subwords_per_word), 1)
    max_subwords = max(subwords_per_word) if subwords_per_word else 0
    continued_ratio = (total_words - single_token_words) / max(total_words, 1)

    # Arabic-specific metrics: a token counts as "Arabic" when it contains at
    # least one Arabic-script character.
    arabic_char_count = count_arabic_chars(text)
    arabic_words = get_arabic_words(text)
    arabic_tokens_count = 0

    for token in tokens:
        if any(is_arabic_char(c) for c in str(token)):
            arabic_tokens_count += 1

    arabic_fertility = arabic_tokens_count / max(len(arabic_words), 1) if arabic_words else 0
    # True when decoding round-trips the presence/absence of tashkeel marks.
    diacritic_preserved = has_diacritics(text) == has_diacritics(decoded)

    return TokenizationMetrics(
        total_tokens=total_tokens,
        total_words=total_words,
        total_characters=total_characters,
        total_bytes=total_bytes,
        fertility=fertility,
        compression_ratio=compression_ratio,
        char_per_token=char_per_token,
        oov_count=oov_count,
        oov_percentage=oov_percentage,
        single_token_words=single_token_words,
        single_token_retention_rate=strr,
        avg_subwords_per_word=avg_subwords,
        max_subwords_per_word=max_subwords,
        continued_words_ratio=continued_ratio,
        arabic_char_count=arabic_char_count,
        arabic_token_count=arabic_tokens_count,
        arabic_fertility=arabic_fertility,
        diacritic_preservation=diacritic_preserved,
        tokenization_time_ms=tokenization_time,
        tokens=tokens,
        token_ids=token_ids,
        decoded_text=decoded
    )
579
 
580
+ # ============================================================================
581
+ # UI GENERATION FUNCTIONS
582
+ # ============================================================================
583
 
584
def generate_token_visualization(tokens: List[str], token_ids: List[int]) -> str:
    """Generate an HTML visualization of the token sequence.

    Each token is rendered as a colored <span> (cycling palette) with its
    token id shown in a nested badge and in the hover title. Arabic tokens
    are rendered right-to-left.

    Fix vs. original: tokens are now fully HTML-escaped — the original only
    escaped '<' and '>', so a token containing '&' could produce malformed
    or ambiguous markup.
    """

    # Color palette for tokens (alternating for clarity): (background, foreground)
    colors = [
        ('#1a1a2e', '#eaeaea'),  # Dark blue bg, light text
        ('#16213e', '#f0f0f0'),
        ('#0f3460', '#ffffff'),
        ('#533483', '#f5f5f5'),
        ('#e94560', '#ffffff'),
        ('#0f4c75', '#f0f0f0'),
        ('#3282b8', '#ffffff'),
        ('#bbe1fa', '#1a1a2e'),
    ]

    html_parts = []
    for i, (token, tid) in enumerate(zip(tokens, token_ids)):
        bg, fg = colors[i % len(colors)]
        # Escape HTML entities; '&' must be replaced FIRST so the entities
        # introduced for '<' and '>' are not themselves double-escaped.
        display_token = (
            token.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        )

        # Render Arabic tokens right-to-left
        is_arabic = any(is_arabic_char(c) for c in token)
        direction = 'rtl' if is_arabic else 'ltr'

        html_parts.append(f'''
        <span class="token" style="
            background: {bg};
            color: {fg};
            direction: {direction};
        " title="ID: {tid}">
            {display_token}
            <span class="token-id">{tid}</span>
        </span>
        ''')

    return f'''
    <div class="token-container">
        {''.join(html_parts)}
    </div>
    '''
625
+
626
def generate_metrics_card(metrics: TokenizationMetrics, info: TokenizerInfo) -> str:
    """Render the metrics grid (HTML) for one tokenization run.

    Each metric becomes a card; fertility, compression, and STRR cards carry a
    quality CSS class ("excellent"/"good"/"poor") chosen by fixed thresholds.

    NOTE(review): *info* is accepted but never read in this body — confirm
    whether callers rely on it before removing.
    """

    # Quality thresholds: fertility lower-is-better, the other two higher-is-better.
    fertility_quality = "excellent" if metrics.fertility < 1.5 else "good" if metrics.fertility < 2.5 else "poor"
    strr_quality = "excellent" if metrics.single_token_retention_rate > 0.5 else "good" if metrics.single_token_retention_rate > 0.3 else "poor"
    compression_quality = "excellent" if metrics.compression_ratio > 4 else "good" if metrics.compression_ratio > 2.5 else "poor"

    return f'''
    <div class="metrics-grid">
        <div class="metric-card primary">
            <div class="metric-icon">📊</div>
            <div class="metric-value">{metrics.total_tokens}</div>
            <div class="metric-label">Total Tokens</div>
        </div>

        <div class="metric-card {fertility_quality}">
            <div class="metric-icon">🎯</div>
            <div class="metric-value">{metrics.fertility:.3f}</div>
            <div class="metric-label">Fertility (tokens/word)</div>
            <div class="metric-hint">Lower is better (1.0 ideal)</div>
        </div>

        <div class="metric-card {compression_quality}">
            <div class="metric-icon">📦</div>
            <div class="metric-value">{metrics.compression_ratio:.2f}</div>
            <div class="metric-label">Compression (bytes/token)</div>
            <div class="metric-hint">Higher is better</div>
        </div>

        <div class="metric-card {strr_quality}">
            <div class="metric-icon">✨</div>
            <div class="metric-value">{metrics.single_token_retention_rate:.1%}</div>
            <div class="metric-label">Single Token Rate (STRR)</div>
            <div class="metric-hint">Higher is better</div>
        </div>

        <div class="metric-card">
            <div class="metric-icon">📝</div>
            <div class="metric-value">{metrics.char_per_token:.2f}</div>
            <div class="metric-label">Characters/Token</div>
        </div>

        <div class="metric-card">
            <div class="metric-icon">⚡</div>
            <div class="metric-value">{metrics.tokenization_time_ms:.2f}ms</div>
            <div class="metric-label">Processing Time</div>
        </div>

        <div class="metric-card arabic">
            <div class="metric-icon">🔤</div>
            <div class="metric-value">{metrics.arabic_fertility:.3f}</div>
            <div class="metric-label">Arabic Fertility</div>
            <div class="metric-hint">Arabic-specific efficiency</div>
        </div>

        <div class="metric-card">
            <div class="metric-icon">{"✅" if metrics.oov_percentage == 0 else "⚠️"}</div>
            <div class="metric-value">{metrics.oov_percentage:.1f}%</div>
            <div class="metric-label">OOV Rate</div>
            <div class="metric-hint">Lower is better (0% ideal)</div>
        </div>
    </div>
    '''
690
+
691
def generate_tokenizer_info_card(info: TokenizerInfo) -> str:
    """Render the descriptive HTML card for a tokenizer.

    Shows name/organization, description, algorithm metadata, the Arabic
    support level (as a colour-coded badge) and the dialect/feature badges.
    """

    def _badges(values, css_class: str) -> str:
        # One <span> per value, space-separated, sharing a CSS class.
        return ' '.join(f'<span class="{css_class}">{v}</span>' for v in values)

    dialect_badges = _badges(info.dialect_support, "dialect-badge")
    feature_badges = _badges(info.special_features, "feature-badge")

    # e.g. "Native Support" -> "native-support", matching the CSS selectors.
    support_class = info.arabic_support.lower().replace(' ', '-')

    return f'''
    <div class="tokenizer-info">
        <div class="tokenizer-header">
            <h3>{info.name}</h3>
            <span class="org-badge">{info.organization}</span>
        </div>
        <p class="tokenizer-desc">{info.description}</p>
        <div class="tokenizer-meta">
            <div class="meta-row">
                <span class="meta-label">Type:</span>
                <span class="meta-value">{info.type.value}</span>
            </div>
            <div class="meta-row">
                <span class="meta-label">Algorithm:</span>
                <span class="meta-value">{info.algorithm.value}</span>
            </div>
            <div class="meta-row">
                <span class="meta-label">Vocab Size:</span>
                <span class="meta-value">{info.vocab_size:,}</span>
            </div>
            <div class="meta-row">
                <span class="meta-label">Arabic Support:</span>
                <span class="support-badge {support_class}">{info.arabic_support}</span>
            </div>
        </div>
        <div class="tokenizer-badges">
            <div class="badge-group">
                <span class="badge-label">Dialects:</span>
                {dialect_badges}
            </div>
            <div class="badge-group">
                <span class="badge-label">Features:</span>
                {feature_badges}
            </div>
        </div>
    </div>
    '''
 
743
 
744
+ # ============================================================================
745
+ # MAIN ANALYSIS FUNCTION
746
+ # ============================================================================
747
+
748
def analyze_single_tokenizer(tokenizer_choice: str, text: str) -> Tuple[str, str, str, str]:
    """Analyze *text* with one tokenizer and render the four HTML result panels.

    Args:
        tokenizer_choice: Display name of the tokenizer as shown in the dropdown.
        text: Input text (typically Arabic) to tokenize.

    Returns:
        ``(info_html, metrics_html, tokens_html, decoded_html)``. On empty
        input or on failure, the first element is a warning/error card and
        the remaining elements are empty strings.
    """
    # Local import keeps the escaping fix self-contained within this function.
    import html

    if not text.strip():
        return (
            "<p class='warning'>Please enter some text to analyze.</p>",
            "",
            "",
            ""
        )

    # Resolve the choice up front; an unknown choice now yields an error card
    # instead of an uncaught exception bubbling into the UI layer.
    try:
        model_id = tokenizer_manager.get_model_id_from_choice(tokenizer_choice)
        info = tokenizer_manager.get_available_tokenizers()[model_id]
    except Exception as e:
        return (
            f'''
            <div class="error-card">
                <h4>Unknown tokenizer: {html.escape(str(tokenizer_choice))}</h4>
                <p>{html.escape(str(e))}</p>
            </div>
            ''',
            "", "", ""
        )

    try:
        metrics = analyze_tokenization(text, model_id, info)

        # Generate all outputs
        info_html = generate_tokenizer_info_card(info)
        metrics_html = generate_metrics_card(metrics, info)
        tokens_html = generate_token_visualization(metrics.tokens, metrics.token_ids)

        # The round-tripped text is user-derived and rendered via gr.HTML, so
        # it must be escaped (the token visualizer already escapes < and >).
        safe_decoded = html.escape(metrics.decoded_text)
        decoded_html = f'''
        <div class="decoded-section">
            <h4>Decoded Output</h4>
            <div class="decoded-text" dir="auto">{safe_decoded}</div>
            <div class="decoded-meta">
                <span>Diacritics preserved: {"✅ Yes" if metrics.diacritic_preservation else "❌ No"}</span>
            </div>
        </div>
        '''

        return info_html, metrics_html, tokens_html, decoded_html

    except Exception as e:
        # Exception text can contain markup-significant characters; escape it.
        error_html = f'''
        <div class="error-card">
            <h4>Error analyzing with {html.escape(info.name)}</h4>
            <p>{html.escape(str(e))}</p>
        </div>
        '''
        return error_html, "", "", ""
791
+
792
def compare_tokenizers(tokenizer_choices: List[str], text: str) -> str:
    """Compare multiple tokenizers side by side on the same text.

    Runs each selected tokenizer, ranks the successful runs by fertility
    (lower is better) and renders an HTML comparison table. Tokenizers that
    raise during analysis are skipped best-effort, but — unlike the previous
    silent ``except: continue`` — their names are now reported in the output.

    Args:
        tokenizer_choices: Display names of the tokenizers to compare (>= 2).
        text: Input text to tokenize.

    Returns:
        An HTML fragment: the ranked table plus legend (and a skip warning if
        any tokenizer failed), or a warning/error paragraph for bad input.
    """
    if not text.strip():
        return "<p class='warning'>Please enter some text to analyze.</p>"

    if not tokenizer_choices or len(tokenizer_choices) < 2:
        return "<p class='warning'>Please select at least 2 tokenizers to compare.</p>"

    results = []
    failed = []  # display names of tokenizers whose analysis raised

    for choice in tokenizer_choices:
        model_id = tokenizer_manager.get_model_id_from_choice(choice)
        info = tokenizer_manager.get_available_tokenizers()[model_id]

        try:
            metrics = analyze_tokenization(text, model_id, info)
        except Exception:
            # Best-effort: keep comparing the rest, surface the failure below.
            failed.append(info.name)
            continue
        results.append((info, metrics))

    if not results:
        return "<p class='error'>Failed to analyze with any selected tokenizers.</p>"

    # Sort by fertility (best first)
    results.sort(key=lambda x: x[1].fertility)

    # Generate comparison table rows; top three ranks get highlight classes.
    table_rows = []
    for i, (info, metrics) in enumerate(results):
        rank_class = "rank-1" if i == 0 else "rank-2" if i == 1 else "rank-3" if i == 2 else ""

        table_rows.append(f'''
        <tr class="{rank_class}">
            <td class="rank-cell">{i + 1}</td>
            <td class="name-cell">
                <strong>{info.name}</strong>
                <span class="org-small">{info.organization}</span>
            </td>
            <td class="metric-cell">{metrics.total_tokens}</td>
            <td class="metric-cell highlight">{metrics.fertility:.3f}</td>
            <td class="metric-cell">{metrics.compression_ratio:.2f}</td>
            <td class="metric-cell">{metrics.single_token_retention_rate:.1%}</td>
            <td class="metric-cell">{metrics.arabic_fertility:.3f}</td>
            <td class="metric-cell">{metrics.oov_percentage:.1f}%</td>
            <td class="metric-cell">{metrics.tokenization_time_ms:.2f}ms</td>
        </tr>
        ''')

    # Report skipped tokenizers instead of dropping them silently.
    skipped_html = (
        f'<p class="warning">Skipped (analysis failed): {", ".join(failed)}</p>'
        if failed else ""
    )

    return f'''
    <div class="comparison-container">
        <h3>Tokenizer Comparison Results</h3>
        <p class="comparison-subtitle">Ranked by fertility (lower is better)</p>
        <table class="comparison-table">
            <thead>
                <tr>
                    <th>#</th>
                    <th>Tokenizer</th>
                    <th>Tokens</th>
                    <th>Fertility ↓</th>
                    <th>Compression</th>
                    <th>STRR</th>
                    <th>Arabic Fertility</th>
                    <th>OOV %</th>
                    <th>Time</th>
                </tr>
            </thead>
            <tbody>
                {''.join(table_rows)}
            </tbody>
        </table>
        <div class="comparison-legend">
            <span class="legend-item"><span class="legend-color rank-1"></span> Best</span>
            <span class="legend-item"><span class="legend-color rank-2"></span> Runner-up</span>
            <span class="legend-item"><span class="legend-color rank-3"></span> Third</span>
        </div>
        {skipped_html}
    </div>
    '''
870
+
871
+ # ============================================================================
872
+ # CSS STYLES
873
+ # ============================================================================
874
+
875
+ CUSTOM_CSS = """
876
+ /* ===== GLOBAL STYLES ===== */
877
+ :root {
878
+ --primary: #0d47a1;
879
+ --primary-light: #1976d2;
880
+ --primary-dark: #002171;
881
+ --accent: #ff6f00;
882
+ --accent-light: #ffa040;
883
+ --success: #2e7d32;
884
+ --warning: #f57c00;
885
+ --error: #c62828;
886
+ --bg-dark: #0a0a0f;
887
+ --bg-card: #12121a;
888
+ --bg-elevated: #1a1a24;
889
+ --text-primary: #f5f5f5;
890
+ --text-secondary: #b0b0b0;
891
+ --border: #2a2a3a;
892
+ --gradient-1: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
893
+ --gradient-2: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
894
+ --gradient-arabic: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
895
+ }
896
+
897
+ .gradio-container {
898
+ background: var(--bg-dark) !important;
899
+ font-family: 'IBM Plex Sans Arabic', 'Segoe UI', system-ui, sans-serif !important;
900
+ }
901
+
902
+ /* ===== HEADER STYLES ===== */
903
+ .header-section {
904
+ text-align: center;
905
+ padding: 2rem;
906
+ background: var(--gradient-1);
907
+ border-radius: 16px;
908
+ margin-bottom: 2rem;
909
+ }
910
+
911
+ .header-section h1 {
912
+ font-size: 2.5rem;
913
+ font-weight: 700;
914
+ color: white;
915
+ margin-bottom: 0.5rem;
916
+ text-shadow: 0 2px 10px rgba(0,0,0,0.3);
917
+ }
918
+
919
+ .header-section p {
920
+ color: rgba(255,255,255,0.9);
921
+ font-size: 1.1rem;
922
+ }
923
+
924
+ /* ===== TOKEN VISUALIZATION ===== */
925
+ .token-container {
926
+ display: flex;
927
+ flex-wrap: wrap;
928
+ gap: 8px;
929
+ padding: 1.5rem;
930
+ background: var(--bg-card);
931
+ border-radius: 12px;
932
+ border: 1px solid var(--border);
933
+ direction: rtl;
934
+ }
935
+
936
+ .token {
937
+ display: inline-flex;
938
+ flex-direction: column;
939
+ align-items: center;
940
+ padding: 8px 12px;
941
+ border-radius: 8px;
942
+ font-family: 'IBM Plex Mono', monospace;
943
+ font-size: 0.95rem;
944
+ transition: transform 0.2s, box-shadow 0.2s;
945
+ cursor: default;
946
+ }
947
+
948
+ .token:hover {
949
+ transform: translateY(-2px);
950
+ box-shadow: 0 4px 12px rgba(0,0,0,0.3);
951
+ }
952
+
953
+ .token-id {
954
+ font-size: 0.7rem;
955
+ opacity: 0.7;
956
+ margin-top: 4px;
957
+ }
958
+
959
+ /* ===== METRICS GRID ===== */
960
+ .metrics-grid {
961
+ display: grid;
962
+ grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
963
+ gap: 1rem;
964
+ padding: 1rem;
965
+ }
966
+
967
+ .metric-card {
968
+ background: var(--bg-card);
969
+ border: 1px solid var(--border);
970
+ border-radius: 12px;
971
+ padding: 1.25rem;
972
+ text-align: center;
973
+ transition: transform 0.2s, border-color 0.2s;
974
+ }
975
+
976
+ .metric-card:hover {
977
+ transform: translateY(-3px);
978
+ border-color: var(--primary-light);
979
+ }
980
+
981
+ .metric-card.excellent {
982
+ border-color: var(--success);
983
+ background: linear-gradient(to bottom, rgba(46, 125, 50, 0.1), transparent);
984
+ }
985
+
986
+ .metric-card.good {
987
+ border-color: var(--primary-light);
988
+ background: linear-gradient(to bottom, rgba(25, 118, 210, 0.1), transparent);
989
+ }
990
+
991
+ .metric-card.poor {
992
+ border-color: var(--warning);
993
+ background: linear-gradient(to bottom, rgba(245, 124, 0, 0.1), transparent);
994
+ }
995
+
996
+ .metric-card.primary {
997
+ background: var(--gradient-1);
998
+ }
999
+
1000
+ .metric-card.arabic {
1001
+ background: linear-gradient(to bottom, rgba(17, 153, 142, 0.2), transparent);
1002
+ border-color: #11998e;
1003
+ }
1004
+
1005
+ .metric-icon {
1006
+ font-size: 1.5rem;
1007
+ margin-bottom: 0.5rem;
1008
+ }
1009
+
1010
+ .metric-value {
1011
+ font-size: 1.75rem;
1012
+ font-weight: 700;
1013
+ color: var(--text-primary);
1014
+ margin-bottom: 0.25rem;
1015
+ }
1016
+
1017
+ .metric-label {
1018
+ font-size: 0.85rem;
1019
+ color: var(--text-secondary);
1020
+ margin-bottom: 0.25rem;
1021
+ }
1022
+
1023
+ .metric-hint {
1024
+ font-size: 0.7rem;
1025
+ color: var(--text-secondary);
1026
+ opacity: 0.7;
1027
+ }
1028
+
1029
+ /* ===== TOKENIZER INFO ===== */
1030
+ .tokenizer-info {
1031
+ background: var(--bg-card);
1032
+ border: 1px solid var(--border);
1033
+ border-radius: 12px;
1034
+ padding: 1.5rem;
1035
+ }
1036
+
1037
+ .tokenizer-header {
1038
+ display: flex;
1039
+ align-items: center;
1040
+ gap: 1rem;
1041
+ margin-bottom: 1rem;
1042
+ }
1043
+
1044
+ .tokenizer-header h3 {
1045
+ margin: 0;
1046
+ color: var(--text-primary);
1047
+ font-size: 1.5rem;
1048
+ }
1049
+
1050
+ .org-badge {
1051
+ background: var(--gradient-1);
1052
+ padding: 4px 12px;
1053
+ border-radius: 20px;
1054
+ font-size: 0.8rem;
1055
+ color: white;
1056
+ }
1057
+
1058
+ .tokenizer-desc {
1059
+ color: var(--text-secondary);
1060
+ margin-bottom: 1rem;
1061
+ line-height: 1.6;
1062
+ }
1063
+
1064
+ .tokenizer-meta {
1065
+ display: grid;
1066
+ grid-template-columns: repeat(2, 1fr);
1067
+ gap: 0.75rem;
1068
+ margin-bottom: 1rem;
1069
+ }
1070
+
1071
+ .meta-row {
1072
+ display: flex;
1073
+ gap: 0.5rem;
1074
+ }
1075
+
1076
+ .meta-label {
1077
+ color: var(--text-secondary);
1078
+ font-size: 0.85rem;
1079
+ }
1080
+
1081
+ .meta-value {
1082
+ color: var(--text-primary);
1083
+ font-weight: 500;
1084
+ }
1085
+
1086
+ .support-badge {
1087
+ padding: 2px 8px;
1088
+ border-radius: 4px;
1089
+ font-size: 0.8rem;
1090
+ }
1091
+
1092
+ .support-badge.native {
1093
+ background: var(--success);
1094
+ color: white;
1095
+ }
1096
+
1097
+ .support-badge.adapted {
1098
+ background: var(--primary-light);
1099
+ color: white;
1100
+ }
1101
+
1102
+ .support-badge.supported {
1103
+ background: var(--warning);
1104
+ color: white;
1105
+ }
1106
+
1107
+ .support-badge.limited {
1108
+ background: var(--error);
1109
+ color: white;
1110
+ }
1111
+
1112
+ .tokenizer-badges {
1113
+ display: flex;
1114
+ flex-direction: column;
1115
+ gap: 0.75rem;
1116
+ }
1117
+
1118
+ .badge-group {
1119
+ display: flex;
1120
+ flex-wrap: wrap;
1121
+ align-items: center;
1122
+ gap: 0.5rem;
1123
+ }
1124
+
1125
+ .badge-label {
1126
+ color: var(--text-secondary);
1127
+ font-size: 0.85rem;
1128
+ }
1129
+
1130
+ .dialect-badge, .feature-badge {
1131
+ background: var(--bg-elevated);
1132
+ border: 1px solid var(--border);
1133
+ padding: 4px 10px;
1134
+ border-radius: 6px;
1135
+ font-size: 0.75rem;
1136
+ color: var(--text-primary);
1137
+ }
1138
+
1139
+ /* ===== COMPARISON TABLE ===== */
1140
+ .comparison-container {
1141
+ background: var(--bg-card);
1142
+ border-radius: 12px;
1143
+ padding: 1.5rem;
1144
+ border: 1px solid var(--border);
1145
+ }
1146
+
1147
+ .comparison-container h3 {
1148
+ color: var(--text-primary);
1149
+ margin-bottom: 0.25rem;
1150
+ }
1151
+
1152
+ .comparison-subtitle {
1153
+ color: var(--text-secondary);
1154
+ font-size: 0.9rem;
1155
+ margin-bottom: 1.5rem;
1156
+ }
1157
+
1158
+ .comparison-table {
1159
+ width: 100%;
1160
+ border-collapse: collapse;
1161
+ font-size: 0.9rem;
1162
+ }
1163
+
1164
+ .comparison-table th {
1165
+ background: var(--bg-elevated);
1166
+ color: var(--text-secondary);
1167
+ padding: 12px 8px;
1168
+ text-align: left;
1169
+ font-weight: 500;
1170
+ border-bottom: 2px solid var(--border);
1171
+ }
1172
+
1173
+ .comparison-table td {
1174
+ padding: 12px 8px;
1175
+ border-bottom: 1px solid var(--border);
1176
+ color: var(--text-primary);
1177
+ }
1178
+
1179
+ .comparison-table tr.rank-1 {
1180
+ background: linear-gradient(90deg, rgba(46, 125, 50, 0.2), transparent);
1181
+ }
1182
+
1183
+ .comparison-table tr.rank-2 {
1184
+ background: linear-gradient(90deg, rgba(25, 118, 210, 0.15), transparent);
1185
+ }
1186
+
1187
+ .comparison-table tr.rank-3 {
1188
+ background: linear-gradient(90deg, rgba(245, 124, 0, 0.1), transparent);
1189
+ }
1190
+
1191
+ .rank-cell {
1192
+ font-weight: 700;
1193
+ text-align: center;
1194
+ }
1195
+
1196
+ .name-cell strong {
1197
+ display: block;
1198
+ }
1199
+
1200
+ .org-small {
1201
+ font-size: 0.75rem;
1202
+ color: var(--text-secondary);
1203
+ }
1204
+
1205
+ .metric-cell {
1206
+ text-align: center;
1207
+ }
1208
+
1209
+ .metric-cell.highlight {
1210
+ font-weight: 700;
1211
+ color: var(--accent-light);
1212
+ }
1213
+
1214
+ .comparison-legend {
1215
+ display: flex;
1216
+ gap: 1.5rem;
1217
+ margin-top: 1rem;
1218
+ padding-top: 1rem;
1219
+ border-top: 1px solid var(--border);
1220
+ }
1221
+
1222
+ .legend-item {
1223
+ display: flex;
1224
+ align-items: center;
1225
+ gap: 0.5rem;
1226
+ font-size: 0.85rem;
1227
+ color: var(--text-secondary);
1228
+ }
1229
+
1230
+ .legend-color {
1231
+ width: 16px;
1232
+ height: 16px;
1233
+ border-radius: 4px;
1234
+ }
1235
+
1236
+ .legend-color.rank-1 { background: var(--success); }
1237
+ .legend-color.rank-2 { background: var(--primary-light); }
1238
+ .legend-color.rank-3 { background: var(--warning); }
1239
+
1240
+ /* ===== DECODED SECTION ===== */
1241
+ .decoded-section {
1242
+ background: var(--bg-card);
1243
+ border: 1px solid var(--border);
1244
+ border-radius: 12px;
1245
+ padding: 1.5rem;
1246
+ }
1247
+
1248
+ .decoded-section h4 {
1249
+ color: var(--text-primary);
1250
+ margin-bottom: 1rem;
1251
+ }
1252
+
1253
+ .decoded-text {
1254
+ background: var(--bg-elevated);
1255
+ padding: 1rem;
1256
+ border-radius: 8px;
1257
+ font-family: 'IBM Plex Sans Arabic', serif;
1258
+ font-size: 1.1rem;
1259
+ line-height: 1.8;
1260
+ color: var(--text-primary);
1261
+ }
1262
+
1263
+ .decoded-meta {
1264
+ margin-top: 1rem;
1265
+ font-size: 0.85rem;
1266
+ color: var(--text-secondary);
1267
+ }
1268
+
1269
+ /* ===== UTILITY CLASSES ===== */
1270
+ .warning {
1271
+ background: linear-gradient(to right, rgba(245, 124, 0, 0.1), transparent);
1272
+ border-left: 4px solid var(--warning);
1273
+ padding: 1rem;
1274
+ border-radius: 0 8px 8px 0;
1275
+ color: var(--text-primary);
1276
+ }
1277
+
1278
+ .error-card {
1279
+ background: linear-gradient(to right, rgba(198, 40, 40, 0.1), transparent);
1280
+ border-left: 4px solid var(--error);
1281
+ padding: 1rem;
1282
+ border-radius: 0 8px 8px 0;
1283
+ }
1284
+
1285
+ .error-card h4 {
1286
+ color: var(--error);
1287
+ margin-bottom: 0.5rem;
1288
+ }
1289
+
1290
+ .error-card p {
1291
+ color: var(--text-secondary);
1292
+ }
1293
+ """
1294
+
1295
+ # ============================================================================
1296
+ # SAMPLE TEXTS FOR TESTING
1297
+ # ============================================================================
1298
+
1299
# Preset inputs for the sample-text dropdowns, covering the registers and
# varieties a tokenizer should be stress-tested on: MSA, major dialects,
# Classical Arabic with diacritics, poetry, technical prose and code-switching.
SAMPLE_TEXTS = {
    # Modern Standard Arabic — news register
    "MSA News": "أعلنت وزارة التربية والتعليم عن بدء العام الدراسي الجديد في الأول من سبتمبر، حيث ستعود المدارس لاستقبال الطلاب بعد العطلة الصيفية الطويلة.",
    # Modern Standard Arabic — formal/academic register
    "MSA Formal": "إن تطوير تقنيات الذكاء الاصطناعي يمثل نقلة نوعية في مجال معالجة اللغات الطبيعية، وخاصة فيما يتعلق باللغة العربية ذات الخصائص المورفولوجية الغنية.",
    # Colloquial dialect samples
    "Egyptian Dialect": "ازيك يا صاحبي؟ إيه أخبارك؟ عامل إيه النهارده؟ قولي هنروح فين بكره؟",
    "Gulf Dialect": "شلونك؟ شخبارك؟ وش تسوي الحين؟ ودك تروح وياي للسوق؟",
    "Levantine Dialect": "كيفك؟ شو أخبارك؟ شو عم تعمل هلق؟ بدك تيجي معي على السوق؟",
    # Fully vocalized Classical Arabic — exercises diacritic handling
    "Classical Arabic (Quran)": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ ۝ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ",
    "Poetry": "وما من كاتبٍ إلا سيفنى ويُبقي الدهرُ ما كتبت يداهُ",
    # Technical NLP vocabulary
    "Technical": "يستخدم نموذج المحولات آلية الانتباه الذاتي لمعالجة تسلسلات النصوص بشكل متوازي.",
    # Code-switching — mixed-script segmentation
    "Mixed Arabic-English": "The Arabic language العربية is a Semitic language with over 400 million speakers worldwide.",
    # Short fully-diacritized sample — tests tashkeel preservation
    "With Diacritics": "إِنَّ اللَّهَ وَمَلَائِكَتَهُ يُصَلُّونَ عَلَى النَّبِيِّ",
}
1311
+
1312
+ # ============================================================================
1313
+ # GRADIO INTERFACE
1314
+ # ============================================================================
1315
 
1316
def create_interface():
    """Create the Gradio interface.

    Builds a four-tab Blocks app: single-tokenizer analysis, multi-tokenizer
    comparison, a static metrics reference, and an about page. Returns the
    ``gr.Blocks`` instance so the caller can ``launch()`` it.
    """

    available_tokenizers = tokenizer_manager.get_tokenizer_choices()

    # Group tokenizers by type for better organization.
    # NOTE(review): grouping is by substring match on the display name —
    # a tokenizer whose name doesn't contain any keyword lands in
    # `multilingual` by default.
    arabic_specific = [t for t in available_tokenizers if any(x in t for x in ['AraBERT', 'CAMeL', 'MARBERT', 'ARBERT'])]
    arabic_llms = [t for t in available_tokenizers if any(x in t for x in ['Jais', 'AceGPT'])]
    multilingual = [t for t in available_tokenizers if t not in arabic_specific and t not in arabic_llms]

    with gr.Blocks(css=CUSTOM_CSS, title="Arabic Tokenizer Arena Pro", theme=gr.themes.Base(
        primary_hue="blue",
        secondary_hue="purple",
        neutral_hue="slate",
        font=["IBM Plex Sans Arabic", "system-ui", "sans-serif"]
    )) as demo:

        # Header
        gr.HTML("""
        <div class="header-section">
            <h1>🏟️ Arabic Tokenizer Arena Pro</h1>
            <p>Advanced research & production platform for Arabic tokenization analysis</p>
        </div>
        """)

        with gr.Tabs():
            # ===== TAB 1: Single Tokenizer Analysis =====
            with gr.TabItem("🔬 Single Analysis", id="single"):
                with gr.Row():
                    # Left column: tokenizer + sample selection and input.
                    with gr.Column(scale=1):
                        tokenizer_dropdown = gr.Dropdown(
                            choices=available_tokenizers,
                            value=available_tokenizers[0] if available_tokenizers else None,
                            label="Select Tokenizer",
                            info="Choose a tokenizer to analyze"
                        )

                        sample_dropdown = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts",
                            info="Select a sample or enter custom text"
                        )

                        input_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...\nEnter Arabic text here...",
                            label="Input Text",
                            rtl=True
                        )

                        analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")

                    # Right column: result panels filled by analyze_single_tokenizer.
                    with gr.Column(scale=2):
                        info_output = gr.HTML(label="Tokenizer Information")

                        # NOTE(review): exact nesting of the following three panels
                        # is ambiguous in the original layout — placed with
                        # info_output in the results column; verify visually.
                        metrics_output = gr.HTML(label="Evaluation Metrics")
                        tokens_output = gr.HTML(label="Token Visualization")
                        decoded_output = gr.HTML(label="Decoded Output")

                # Event handlers
                # Picking a sample overwrites the input textbox with its text.
                sample_dropdown.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[sample_dropdown],
                    outputs=[input_text]
                )

                analyze_btn.click(
                    analyze_single_tokenizer,
                    inputs=[tokenizer_dropdown, input_text],
                    outputs=[info_output, metrics_output, tokens_output, decoded_output]
                )

            # ===== TAB 2: Comparison Mode =====
            with gr.TabItem("⚖️ Compare Tokenizers", id="compare"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Pre-select the first five tokenizers (or all, if fewer).
                        compare_tokenizers_select = gr.CheckboxGroup(
                            choices=available_tokenizers,
                            value=available_tokenizers[:5] if len(available_tokenizers) >= 5 else available_tokenizers,
                            label="Select Tokenizers to Compare",
                            info="Choose 2 or more tokenizers"
                        )

                        compare_sample = gr.Dropdown(
                            choices=list(SAMPLE_TEXTS.keys()),
                            label="Sample Texts"
                        )

                        compare_text = gr.Textbox(
                            lines=4,
                            placeholder="اكتب النص العربي هنا...",
                            label="Input Text",
                            rtl=True
                        )

                        compare_btn = gr.Button("⚖️ Compare", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        comparison_output = gr.HTML(label="Comparison Results")

                # Same sample-to-textbox wiring as the single-analysis tab.
                compare_sample.change(
                    lambda x: SAMPLE_TEXTS.get(x, ""),
                    inputs=[compare_sample],
                    outputs=[compare_text]
                )

                compare_btn.click(
                    compare_tokenizers,
                    inputs=[compare_tokenizers_select, compare_text],
                    outputs=[comparison_output]
                )

            # ===== TAB 3: Metrics Reference =====
            # Static documentation tab — no event handlers.
            with gr.TabItem("📖 Metrics Guide", id="guide"):
                gr.Markdown("""
                ## Tokenization Evaluation Metrics Guide

                ### Efficiency Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **Fertility** | Tokens per word | 1.0 | Lower fertility = fewer tokens = faster inference & lower cost |
                | **Compression Ratio** | Bytes per token | Higher is better | Better compression = more efficient encoding |
                | **Chars/Token** | Characters per token | Higher is better | More characters per token = better vocabulary utilization |

                ### Coverage Metrics

                | Metric | Description | Ideal Value | Why It Matters |
                |--------|-------------|-------------|----------------|
                | **OOV Rate** | Out-of-vocabulary percentage | 0% | Lower OOV = better vocabulary coverage |
                | **STRR** | Single Token Retention Rate | Higher is better | More words preserved as single tokens = better semantic boundaries |
                | **Continued Words Ratio** | Words split into multiple tokens | Lower is better | Fewer splits = better word boundary preservation |

                ### Arabic-Specific Metrics

                | Metric | Description | Why It Matters |
                |--------|-------------|----------------|
                | **Arabic Fertility** | Tokens per Arabic word | Arabic-specific efficiency measure |
                | **Diacritic Preservation** | Whether tashkeel is preserved | Important for religious & educational texts |

                ### Research Background

                These metrics are based on recent research including:
                - *"A Comprehensive Analysis of Various Tokenizers for Arabic LLMs"* (2024)
                - *"Evaluating Various Tokenizers for Arabic Text Classification"* (Alyafeai et al.)
                - *"Beyond Fertility: STRR as a Metric for Multilingual Tokenization"* (2025)
                - *"Arabic Stable LM: Adapting Stable LM to Arabic"* (2024)

                ### Tokenizer Algorithm Types

                - **BPE (Byte-Pair Encoding)**: Iteratively merges frequent character pairs
                - **Byte-Level BPE**: BPE applied to UTF-8 bytes instead of characters
                - **WordPiece**: Google's variant, used in BERT models
                - **SentencePiece**: Language-independent, uses unigram model
                - **Unigram**: Probabilistic subword model
                - **Tiktoken**: OpenAI's optimized BPE implementation
                """)

            # ===== TAB 4: About =====
            # f-string Markdown: interpolates the live tokenizer groupings.
            with gr.TabItem("ℹ️ About", id="about"):
                gr.Markdown(f"""
                ## Arabic Tokenizer Arena Pro

                A comprehensive platform for evaluating Arabic tokenizers across multiple dimensions.

                ### Available Tokenizers: {len(available_tokenizers)}

                **Arabic-Specific Models:**
                {chr(10).join(['- ' + t for t in arabic_specific])}

                **Arabic LLMs:**
                {chr(10).join(['- ' + t for t in arabic_llms])}

                **Multilingual LLMs:**
                {chr(10).join(['- ' + t for t in multilingual])}

                ### Features

                ✅ Comprehensive efficiency metrics (fertility, compression, STRR)
                ✅ Arabic-specific analysis (dialect support, diacritic preservation)
                ✅ Side-by-side tokenizer comparison
                ✅ Beautiful token visualization
                ✅ Support for MSA, dialectal Arabic, and Classical Arabic
                ✅ Research-backed evaluation methodology

                ### Use Cases

                - **Research**: Compare tokenizers for Arabic NLP experiments
                - **Production**: Select optimal tokenizer for deployment
                - **Education**: Understand how different algorithms handle Arabic
                - **Optimization**: Identify cost-efficient tokenizers for API usage

                ---

                Built with ❤️ for the Arabic NLP community
                """)

    return demo
1514
 
1515
+ # ============================================================================
1516
+ # MAIN
1517
+ # ============================================================================
 
 
 
 
 
1518
 
1519
+ if __name__ == "__main__":
1520
+ demo = create_interface()
1521
+ demo.launch(share=True)