Spaces:
Runtime error
Runtime error
| """ | |
| TDD tests for workbench/tokenizer.py — Tokenizer Inspector module. | |
| Test order matches implementation phases: | |
| Phase 1: get_tokenizer, tokenize_text, fragmentation_ratio, flag_oov_words, | |
| detect_language, efficiency_score, translate_to_english | |
| Phase 2: render_tokens_html | |
| Phase 3: build_tokenizer_ui (smoke test) | |
| """ | |
| import pytest | |
| from unittest.mock import patch, MagicMock | |
| # --------------------------------------------------------------------------- | |
| # Phase 1 — get_tokenizer | |
| # --------------------------------------------------------------------------- | |
class TestGetTokenizer:
    """Unit tests for get_tokenizer(name) and the SUPPORTED_TOKENIZERS registry.

    Every test that loads a tokenizer runs against an emptied
    ``_tokenizer_cache`` (``patch.dict`` restores the real contents on exit),
    so the outcome does not depend on test order and no MagicMock is left
    behind in the module-level cache for later tests to pick up.
    """

    def _load_with_mocked_hf(self, name):
        """Call get_tokenizer(name) with an empty cache and a patched HF loader.

        Returns a ``(result, mock_tok, mock_fp)`` tuple: the value returned by
        get_tokenizer, the object the patched ``from_pretrained`` hands back,
        and the patch mock itself (for call assertions).
        """
        import tokenizer as tok_module
        mock_tok = MagicMock()
        with patch.dict(tok_module._tokenizer_cache, {}, clear=True):
            with patch("tokenizer.AutoTokenizer.from_pretrained", return_value=mock_tok) as mock_fp:
                result = tok_module.get_tokenizer(name)
        return result, mock_tok, mock_fp

    def test_returns_tokenizer_for_gpt2(self):
        """get_tokenizer('gpt2') must return a tokenizer object."""
        result, mock_tok, _ = self._load_with_mocked_hf("gpt2")
        assert result is mock_tok

    def test_returns_tokenizer_for_llama3(self):
        """get_tokenizer('llama-3') must return a tokenizer object."""
        result, mock_tok, _ = self._load_with_mocked_hf("llama-3")
        assert result is mock_tok

    def test_returns_tokenizer_for_mistral(self):
        """get_tokenizer('mistral') must return a tokenizer object."""
        result, mock_tok, _ = self._load_with_mocked_hf("mistral")
        assert result is mock_tok

    def test_raises_for_unknown_name(self):
        """get_tokenizer with unrecognised name must raise ValueError."""
        from tokenizer import get_tokenizer
        with pytest.raises(ValueError, match="unknown"):
            get_tokenizer("nonexistent-model-xyz")

    def test_calls_from_pretrained_with_correct_repo_id_for_gpt2(self):
        """get_tokenizer('gpt2') must call from_pretrained with 'gpt2'."""
        _, _, mock_fp = self._load_with_mocked_hf("gpt2")
        mock_fp.assert_called_once_with("gpt2")

    def test_calls_from_pretrained_with_correct_repo_id_for_llama3(self):
        """get_tokenizer('llama-3') must use NousResearch/Meta-Llama-3-8B."""
        _, _, mock_fp = self._load_with_mocked_hf("llama-3")
        mock_fp.assert_called_once_with("NousResearch/Meta-Llama-3-8B")

    def test_calls_from_pretrained_with_correct_repo_id_for_mistral(self):
        """get_tokenizer('mistral') must use mistralai/Mistral-7B-v0.1."""
        _, _, mock_fp = self._load_with_mocked_hf("mistral")
        mock_fp.assert_called_once_with("mistralai/Mistral-7B-v0.1")

    def test_caches_tokenizer_on_second_call(self):
        """Second call with same name must not call from_pretrained again."""
        import tokenizer as tok_module
        mock_tok = MagicMock()
        # Empty the cache in place instead of reloading the module: a reload
        # re-creates the module's classes and would leave the mock cached
        # under "gpt2" after this test finishes.
        with patch.dict(tok_module._tokenizer_cache, {}, clear=True):
            with patch("tokenizer.AutoTokenizer.from_pretrained", return_value=mock_tok) as mock_fp:
                first = tok_module.get_tokenizer("gpt2")
                second = tok_module.get_tokenizer("gpt2")
        assert mock_fp.call_count == 1
        assert first is second

    def test_supported_tokenizers_registry_has_expected_entries(self):
        """SUPPORTED_TOKENIZERS must have at least 8 entries (v2 expansion)."""
        from tokenizer import SUPPORTED_TOKENIZERS
        assert len(SUPPORTED_TOKENIZERS) >= 8

    def test_supported_tokenizers_keys(self):
        """SUPPORTED_TOKENIZERS must contain all v2 tokenizer keys."""
        from tokenizer import SUPPORTED_TOKENIZERS
        expected = {"gpt2", "llama-3", "mistral", "o200k_base", "cl100k_base",
                    "qwen-2.5", "gemma-2", "command-r"}
        assert expected.issubset(set(SUPPORTED_TOKENIZERS.keys()))

    def test_tiktoken_entries_have_tiktoken_prefix(self):
        """Tiktoken-backed tokenizers must have 'tiktoken:' prefix in their value."""
        from tokenizer import SUPPORTED_TOKENIZERS
        assert SUPPORTED_TOKENIZERS["o200k_base"].startswith("tiktoken:")
        assert SUPPORTED_TOKENIZERS["cl100k_base"].startswith("tiktoken:")

    def test_hf_entries_do_not_have_tiktoken_prefix(self):
        """HuggingFace-backed tokenizers must NOT have 'tiktoken:' prefix."""
        from tokenizer import SUPPORTED_TOKENIZERS
        for key in ("gpt2", "llama-3", "mistral"):
            assert not SUPPORTED_TOKENIZERS[key].startswith("tiktoken:")
class TestGetTokenizerErrorHandling:
    """Behaviour of get_tokenizer when the patched HF loader raises OSError."""

    def test_from_pretrained_failure_raises_clear_message(self):
        """An OSError from from_pretrained must surface as a RuntimeError naming 'gpt2'."""
        import tokenizer as tok_module

        empty_cache = patch.dict(tok_module._tokenizer_cache, {}, clear=True)
        failing_loader = patch(
            "tokenizer.AutoTokenizer.from_pretrained",
            side_effect=OSError("Connection timeout"),
        )
        with empty_cache, failing_loader:
            with pytest.raises(RuntimeError, match="Failed to load tokenizer 'gpt2'"):
                tok_module.get_tokenizer("gpt2")

    def test_from_pretrained_failure_does_not_cache(self):
        """After a failed load, 'gpt2' must be absent from the tokenizer cache."""
        import tokenizer as tok_module

        empty_cache = patch.dict(tok_module._tokenizer_cache, {}, clear=True)
        failing_loader = patch(
            "tokenizer.AutoTokenizer.from_pretrained",
            side_effect=OSError("timeout"),
        )
        with empty_cache:
            with failing_loader, pytest.raises(RuntimeError):
                tok_module.get_tokenizer("gpt2")
            # Checked while the cache is still the emptied one, i.e. before
            # patch.dict restores its previous contents.
            assert "gpt2" not in tok_module._tokenizer_cache
class TestTiktokenAdapter:
    """Checks the TiktokenAdapter methods exercised here: encode, decode, convert_ids_to_tokens."""

    def _cl100k(self):
        """Return a fresh TiktokenAdapter built for the 'cl100k_base' encoding."""
        from tokenizer import TiktokenAdapter
        return TiktokenAdapter("cl100k_base")

    def test_adapter_encode_returns_list_of_ints(self):
        """encode() yields a non-empty list whose items are all ints."""
        ids = self._cl100k().encode("hello world")
        assert isinstance(ids, list)
        assert all(isinstance(token_id, int) for token_id in ids)
        assert len(ids) > 0

    def test_adapter_convert_ids_to_tokens(self):
        """convert_ids_to_tokens() yields one string per input id."""
        adapter = self._cl100k()
        ids = adapter.encode("hello world")
        pieces = adapter.convert_ids_to_tokens(ids)
        assert isinstance(pieces, list)
        assert all(isinstance(piece, str) for piece in pieces)
        assert len(pieces) == len(ids)

    def test_adapter_decode_roundtrips(self):
        """decode(encode(text)) must equal the original text."""
        adapter = self._cl100k()
        original = "hello world"
        assert adapter.decode(adapter.encode(original)) == original

    def test_adapter_encode_with_add_special_tokens_false(self):
        """encode() accepts add_special_tokens=False and still returns a non-empty list."""
        ids = self._cl100k().encode("test", add_special_tokens=False)
        assert isinstance(ids, list)
        assert len(ids) > 0

    def test_get_tokenizer_loads_tiktoken_for_o200k(self):
        """get_tokenizer('o200k_base') returns a TiktokenAdapter instance."""
        import tokenizer as tok_module
        with patch.dict(tok_module._tokenizer_cache, {}, clear=True):
            loaded = tok_module.get_tokenizer("o200k_base")
            assert isinstance(loaded, tok_module.TiktokenAdapter)

    def test_get_tokenizer_loads_tiktoken_for_cl100k(self):
        """get_tokenizer('cl100k_base') returns a TiktokenAdapter instance."""
        import tokenizer as tok_module
        with patch.dict(tok_module._tokenizer_cache, {}, clear=True):
            loaded = tok_module.get_tokenizer("cl100k_base")
            assert isinstance(loaded, tok_module.TiktokenAdapter)

    def test_tokenize_text_works_with_tiktoken_adapter(self):
        """tokenize_text() with an adapter gives a non-empty list of dicts with 'token' and 'id'."""
        from tokenizer import tokenize_text
        entries = tokenize_text("hello world", self._cl100k())
        assert isinstance(entries, list)
        assert len(entries) > 0
        assert all("token" in entry and "id" in entry for entry in entries)

    def test_fragmentation_ratio_works_with_tiktoken_adapter(self):
        """fragmentation_ratio() with an adapter reports positive 'ratio' and 'token_count'."""
        from tokenizer import fragmentation_ratio
        stats = fragmentation_ratio("hello world foo bar", self._cl100k())
        for key in ("ratio", "token_count"):
            assert key in stats
        assert stats["ratio"] > 0
        assert stats["token_count"] > 0
| # --------------------------------------------------------------------------- | |
| # Phase 1 — tokenize_text | |
| # --------------------------------------------------------------------------- | |
class TestTokenizeText:
    """Tests for tokenize_text(text, tokenizer), driven by a mocked tokenizer."""

    def _mock_tokenizer(self, token_ids, tokens):
        """Return a MagicMock whose encode / convert_ids_to_tokens give the supplied lists."""
        stub = MagicMock()
        stub.configure_mock(**{
            "encode.return_value": token_ids,
            "convert_ids_to_tokens.return_value": tokens,
        })
        return stub

    def test_returns_list(self):
        """The return value is a list."""
        from tokenizer import tokenize_text
        entries = tokenize_text("hello", self._mock_tokenizer([123], ["hello"]))
        assert isinstance(entries, list)

    def test_each_entry_has_token_and_id_keys(self):
        """Every returned entry carries both a 'token' and an 'id' key."""
        from tokenizer import tokenize_text
        stub = self._mock_tokenizer([10, 20], ["Hello", " world"])
        for item in tokenize_text("Hello world", stub):
            assert "token" in item
            assert "id" in item

    def test_token_values_are_strings(self):
        """Every 'token' value is a str."""
        from tokenizer import tokenize_text
        stub = self._mock_tokenizer([10, 20], ["Hello", " world"])
        for item in tokenize_text("Hello world", stub):
            assert isinstance(item["token"], str)

    def test_id_values_are_ints(self):
        """Every 'id' value is an int."""
        from tokenizer import tokenize_text
        stub = self._mock_tokenizer([10, 20], ["Hello", " world"])
        for item in tokenize_text("Hello world", stub):
            assert isinstance(item["id"], int)

    def test_token_and_id_values_correct(self):
        """Entries pair each mocked token string with its mocked id, in order."""
        from tokenizer import tokenize_text
        stub = self._mock_tokenizer([7, 42], ["Hi", "!"])
        entries = tokenize_text("Hi!", stub)
        first, second = entries[0], entries[1]
        assert first == {"token": "Hi", "id": 7}
        assert second == {"token": "!", "id": 42}

    def test_empty_text_returns_empty_list(self):
        """With a tokenizer that returns no ids for '', the result is []."""
        from tokenizer import tokenize_text
        assert tokenize_text("", self._mock_tokenizer([], [])) == []

    def test_length_matches_number_of_tokens(self):
        """Four mocked token ids produce four entries."""
        from tokenizer import tokenize_text
        stub = self._mock_tokenizer([1, 2, 3, 4], ["a", "b", "c", "d"])
        assert len(tokenize_text("a b c d", stub)) == 4
| # --------------------------------------------------------------------------- | |
| # Phase 1 — fragmentation_ratio | |
| # --------------------------------------------------------------------------- | |
class TestFragmentationRatio:
    """Tests for fragmentation_ratio(text, tokenizer), driven by a mocked tokenizer."""

    def _mock_tokenizer(self, token_ids, tokens):
        """Return a MagicMock whose encode / convert_ids_to_tokens give the supplied lists."""
        return MagicMock(**{
            "encode.return_value": token_ids,
            "convert_ids_to_tokens.return_value": tokens,
        })

    def test_returns_dict(self):
        """The return value is a dict."""
        from tokenizer import fragmentation_ratio
        stub = self._mock_tokenizer([1, 2], ["Hello", " world"])
        assert isinstance(fragmentation_ratio("Hello world", stub), dict)

    def test_contains_ratio_key(self):
        """The result exposes a 'ratio' key."""
        from tokenizer import fragmentation_ratio
        stub = self._mock_tokenizer([1, 2], ["Hello", " world"])
        assert "ratio" in fragmentation_ratio("Hello world", stub)

    def test_ratio_is_float(self):
        """The 'ratio' value is a float."""
        from tokenizer import fragmentation_ratio
        stub = self._mock_tokenizer([1, 2], ["Hello", " world"])
        stats = fragmentation_ratio("Hello world", stub)
        assert isinstance(stats["ratio"], float)

    def test_ratio_is_tokens_per_word(self):
        """Four mocked tokens over the two-word input 'Hello world' give ratio 2.0."""
        from tokenizer import fragmentation_ratio
        stub = self._mock_tokenizer([1, 2, 3, 4], ["Hel", "lo", " wor", "ld"])
        stats = fragmentation_ratio("Hello world", stub)
        assert stats["ratio"] == pytest.approx(2.0)

    def test_empty_text_returns_ratio_zero(self):
        """Empty text with no tokens gives ratio 0.0 rather than raising."""
        from tokenizer import fragmentation_ratio
        stats = fragmentation_ratio("", self._mock_tokenizer([], []))
        assert stats["ratio"] == pytest.approx(0.0)

    def test_contains_token_count_key(self):
        """The result exposes a 'token_count' key."""
        from tokenizer import fragmentation_ratio
        stub = self._mock_tokenizer([1, 2, 3], ["a", "b", "c"])
        assert "token_count" in fragmentation_ratio("a b c", stub)

    def test_token_count_value_correct(self):
        """Three mocked tokens are reported as token_count == 3."""
        from tokenizer import fragmentation_ratio
        stub = self._mock_tokenizer([1, 2, 3], ["a", "b", "c"])
        stats = fragmentation_ratio("a b c", stub)
        assert stats["token_count"] == 3
| # --------------------------------------------------------------------------- | |
| # Phase 1 — flag_oov_words | |
| # --------------------------------------------------------------------------- | |
class TestFlagOovWords:
    """Tests for flag_oov_words(text, tokenizer, threshold), driven by a mocked tokenizer."""

    def _mock_tokenizer_with_word_encoding(self, word_token_counts: dict):
        """Return a mock tokenizer whose encode() output length is looked up per word.

        encode(text) strips the text, looks it up in ``word_token_counts``
        (defaulting to 1) and returns that many ids; convert_ids_to_tokens
        always returns an empty list.
        """
        def fake_encode(text, add_special_tokens=True):
            # Parameter names are kept so keyword calls from the code under test still bind.
            n_ids = word_token_counts.get(text.strip(), 1)
            return list(range(n_ids))

        stub = MagicMock()
        stub.convert_ids_to_tokens.return_value = []
        stub.encode.side_effect = fake_encode
        return stub

    def test_returns_set(self):
        """The return value is a set."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({"hello": 1})
        assert isinstance(flag_oov_words("hello", stub), set)

    def test_word_above_threshold_is_flagged(self):
        """A word encoded as 5 tokens is flagged when threshold=3."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({"supercalifragilistic": 5})
        flagged = flag_oov_words("supercalifragilistic", stub, threshold=3)
        assert "supercalifragilistic" in flagged

    def test_word_at_threshold_is_flagged(self):
        """A word encoded as exactly 3 tokens is flagged when threshold=3."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({"hello": 3})
        flagged = flag_oov_words("hello", stub, threshold=3)
        assert "hello" in flagged

    def test_word_below_threshold_not_flagged(self):
        """A word encoded as 1 token is not flagged when threshold=3."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({"hello": 1})
        flagged = flag_oov_words("hello", stub, threshold=3)
        assert "hello" not in flagged

    def test_default_threshold_is_3(self):
        """Without an explicit threshold, a 3-token word is flagged."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({"word": 3})
        assert "word" in flag_oov_words("word", stub)

    def test_empty_text_returns_empty_set(self):
        """Empty text yields an empty set."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({})
        assert flag_oov_words("", stub) == set()

    def test_multiple_words_only_oov_flagged(self):
        """Of 'cat' (1 token) and 'superlongword' (5 tokens), only the latter is flagged."""
        from tokenizer import flag_oov_words
        stub = self._mock_tokenizer_with_word_encoding({"cat": 1, "superlongword": 5})
        flagged = flag_oov_words("cat superlongword", stub, threshold=3)
        assert "superlongword" in flagged
        assert "cat" not in flagged
| # --------------------------------------------------------------------------- | |
| # Phase 1 — detect_language | |
| # --------------------------------------------------------------------------- | |
class TestDetectLanguage:
    """Tests for detect_language(text) with ``tokenizer.detect`` patched out."""

    def test_returns_string(self):
        """The return value is a str."""
        from tokenizer import detect_language
        with patch("tokenizer.detect", return_value="en"):
            code = detect_language("Hello world")
        assert isinstance(code, str)

    def test_returns_detected_language_code(self):
        """The code produced by the patched detect() is passed through."""
        from tokenizer import detect_language
        with patch("tokenizer.detect", return_value="fr"):
            code = detect_language("Bonjour le monde")
        assert code == "fr"

    def test_returns_en_on_lang_detect_exception(self):
        """A LangDetectException from detect() results in 'en'."""
        from langdetect import LangDetectException
        from tokenizer import detect_language
        failure = LangDetectException(0, "error")
        with patch("tokenizer.detect", side_effect=failure):
            code = detect_language("???")
        assert code == "en"

    def test_english_text_returns_en(self):
        """When the patched detect() reports 'en', so does detect_language."""
        from tokenizer import detect_language
        with patch("tokenizer.detect", return_value="en"):
            code = detect_language("The quick brown fox")
        assert code == "en"

    def test_empty_text_returns_en(self):
        """Empty text with detect() raising LangDetectException results in 'en'."""
        from langdetect import LangDetectException
        from tokenizer import detect_language
        failure = LangDetectException(0, "empty")
        with patch("tokenizer.detect", side_effect=failure):
            code = detect_language("")
        assert code == "en"
| # --------------------------------------------------------------------------- | |
| # Phase 1 — efficiency_score | |
| # --------------------------------------------------------------------------- | |
class TestEfficiencyScore:
    """Tests for efficiency_score(input_tokens, english_tokens)."""

    def test_returns_float(self):
        """The return value is a float."""
        from tokenizer import efficiency_score
        score = efficiency_score(10, 8)
        assert isinstance(score, float)

    def test_equal_tokens_returns_one(self):
        """Equal counts (10, 10) score 1.0."""
        from tokenizer import efficiency_score
        score = efficiency_score(10, 10)
        assert score == pytest.approx(1.0)

    def test_fewer_input_tokens_than_english_gives_score_above_one(self):
        """5 input tokens against 10 English tokens score 2.0."""
        from tokenizer import efficiency_score
        score = efficiency_score(5, 10)
        assert score == pytest.approx(2.0)

    def test_more_input_tokens_than_english_gives_score_below_one(self):
        """20 input tokens against 10 English tokens score 0.5."""
        from tokenizer import efficiency_score
        score = efficiency_score(20, 10)
        assert score == pytest.approx(0.5)

    def test_zero_english_tokens_returns_one(self):
        """With english_tokens == 0 the score is 1.0."""
        from tokenizer import efficiency_score
        score = efficiency_score(10, 0)
        assert score == pytest.approx(1.0)

    def test_zero_input_tokens_returns_one(self):
        """With both counts 0 the score is 1.0."""
        from tokenizer import efficiency_score
        score = efficiency_score(0, 0)
        assert score == pytest.approx(1.0)
| # --------------------------------------------------------------------------- | |
| # Token tax metrics (GH-3) | |
| # --------------------------------------------------------------------------- | |
class TestRelativeTokenizationCost:
    """Tests for relative_tokenization_cost(source_tokens, english_tokens)."""

    def test_returns_float(self):
        """The return value is a float."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(10, 5)
        assert isinstance(rtc, float)

    def test_equal_tokens_returns_one(self):
        """Equal counts (5, 5) give 1.0."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(5, 5)
        assert rtc == pytest.approx(1.0)

    def test_source_higher_than_english_returns_above_one(self):
        """10 source tokens against 5 English tokens give 2.0."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(10, 5)
        assert rtc == pytest.approx(2.0)

    def test_source_lower_than_english_returns_below_one(self):
        """3 source tokens against 6 English tokens give 0.5."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(3, 6)
        assert rtc == pytest.approx(0.5)

    def test_zero_english_tokens_returns_one(self):
        """With english_tokens == 0 the result is 1.0."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(10, 0)
        assert rtc == pytest.approx(1.0)

    def test_zero_source_tokens_returns_zero(self):
        """With source_tokens == 0 and 5 English tokens the result is 0.0."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(0, 5)
        assert rtc == pytest.approx(0.0)

    def test_both_zero_returns_one(self):
        """With both counts 0 the result is 1.0."""
        from tokenizer import relative_tokenization_cost
        rtc = relative_tokenization_cost(0, 0)
        assert rtc == pytest.approx(1.0)
class TestBytePremium:
    """Tests for byte_premium(text, english_text)."""

    def test_returns_float(self):
        """The return value is a float."""
        from tokenizer import byte_premium
        premium = byte_premium("hello", "hello")
        assert isinstance(premium, float)

    def test_identical_text_returns_one(self):
        """Identical source and English text give 1.0."""
        from tokenizer import byte_premium
        premium = byte_premium("hello", "hello")
        assert premium == pytest.approx(1.0)

    def test_arabic_vs_english_above_one(self):
        """The Arabic sample against 'hello world' gives a premium above 1.0."""
        from tokenizer import byte_premium
        premium = byte_premium("مرحبا بالعالم", "hello world")
        assert premium > 1.0

    def test_empty_english_returns_one(self):
        """With empty English text the result is 1.0."""
        from tokenizer import byte_premium
        premium = byte_premium("hello", "")
        assert premium == pytest.approx(1.0)

    def test_empty_source_returns_zero(self):
        """With empty source text and non-empty English the result is 0.0."""
        from tokenizer import byte_premium
        premium = byte_premium("", "hello")
        assert premium == pytest.approx(0.0)

    def test_both_empty_returns_one(self):
        """With both texts empty the result is 1.0."""
        from tokenizer import byte_premium
        premium = byte_premium("", "")
        assert premium == pytest.approx(1.0)
class TestContextWindowUsage:
    """Tests for context_window_usage(token_count, window_size)."""

    def test_returns_float(self):
        """The return value is a float."""
        from tokenizer import context_window_usage
        usage = context_window_usage(1000, 128_000)
        assert isinstance(usage, float)

    def test_known_fraction(self):
        """1000 tokens in a 128_000 window give 1000 / 128_000."""
        from tokenizer import context_window_usage
        usage = context_window_usage(1000, 128_000)
        assert usage == pytest.approx(1000 / 128_000)

    def test_full_window(self):
        """A token count equal to the window size gives 1.0."""
        from tokenizer import context_window_usage
        usage = context_window_usage(128_000, 128_000)
        assert usage == pytest.approx(1.0)

    def test_zero_tokens(self):
        """Zero tokens give 0.0."""
        from tokenizer import context_window_usage
        usage = context_window_usage(0, 128_000)
        assert usage == pytest.approx(0.0)

    def test_default_window_size(self):
        """With window_size omitted, 128_000 tokens give 1.0."""
        from tokenizer import context_window_usage
        usage = context_window_usage(128_000)
        assert usage == pytest.approx(1.0)

    def test_zero_window_returns_one(self):
        """With window_size == 0 the result is 1.0."""
        from tokenizer import context_window_usage
        usage = context_window_usage(100, 0)
        assert usage == pytest.approx(1.0)
class TestQualityRiskLevel:
    """Tests for quality_risk_level(rtc) and its 1.5 / 2.5 / 4.0 band boundaries."""

    def test_returns_string(self):
        """The return value is a str."""
        from tokenizer import quality_risk_level
        level = quality_risk_level(1.0)
        assert isinstance(level, str)

    def test_low_risk(self):
        """1.0 and 1.4 are 'low'."""
        from tokenizer import quality_risk_level
        for rtc in (1.0, 1.4):
            assert quality_risk_level(rtc) == "low"

    def test_moderate_risk(self):
        """1.5, 2.0 and 2.4 are 'moderate'."""
        from tokenizer import quality_risk_level
        for rtc in (1.5, 2.0, 2.4):
            assert quality_risk_level(rtc) == "moderate"

    def test_high_risk(self):
        """2.5, 3.0 and 3.9 are 'high'."""
        from tokenizer import quality_risk_level
        for rtc in (2.5, 3.0, 3.9):
            assert quality_risk_level(rtc) == "high"

    def test_severe_risk(self):
        """4.0, 5.0 and 10.0 are 'severe'."""
        from tokenizer import quality_risk_level
        for rtc in (4.0, 5.0, 10.0):
            assert quality_risk_level(rtc) == "severe"

    def test_boundary_1_5(self):
        """Exactly 1.5 falls in 'moderate'."""
        from tokenizer import quality_risk_level
        level = quality_risk_level(1.5)
        assert level == "moderate"

    def test_boundary_2_5(self):
        """Exactly 2.5 falls in 'high'."""
        from tokenizer import quality_risk_level
        level = quality_risk_level(2.5)
        assert level == "high"

    def test_boundary_4_0(self):
        """Exactly 4.0 falls in 'severe'."""
        from tokenizer import quality_risk_level
        level = quality_risk_level(4.0)
        assert level == "severe"
| # --------------------------------------------------------------------------- | |
| # Phase 1 — translate_to_english | |
| # --------------------------------------------------------------------------- | |
class TestTranslateToEnglish:
    """Tests for translate_to_english(text, api_key) with ``tokenizer.call_openrouter`` patched."""

    def _make_response(self, translated_text: str) -> dict:
        """Build a response dict with one choice holding ``translated_text`` plus fixed usage counts."""
        message = {"content": translated_text}
        usage = {"prompt_tokens": 10, "completion_tokens": 20}
        return {"choices": [{"message": message}], "usage": usage}

    def test_returns_string(self):
        """The return value is a str."""
        from tokenizer import translate_to_english
        canned = self._make_response("Hello")
        with patch("tokenizer.call_openrouter", return_value=canned):
            translated = translate_to_english("Bonjour", "sk-key")
        assert isinstance(translated, str)

    def test_calls_call_openrouter(self):
        """call_openrouter is invoked at least once."""
        from tokenizer import translate_to_english
        canned = self._make_response("Hi")
        with patch("tokenizer.call_openrouter", return_value=canned) as mock_call:
            translate_to_english("Hola", "sk-key")
        mock_call.assert_called()

    def test_passes_api_key_to_call_openrouter(self):
        """The API key is the first positional argument given to call_openrouter."""
        from tokenizer import translate_to_english
        canned = self._make_response("Hi")
        with patch("tokenizer.call_openrouter", return_value=canned) as mock_call:
            translate_to_english("Hola", "my-key")
        positional, _keyword = mock_call.call_args
        assert positional[0] == "my-key"

    def test_returns_translated_content(self):
        """The returned text is the message content of the patched response."""
        from tokenizer import translate_to_english
        canned = self._make_response("Hello world")
        with patch("tokenizer.call_openrouter", return_value=canned):
            translated = translate_to_english("Bonjour monde", "key")
        assert translated == "Hello world"

    def test_prompt_contains_source_text(self):
        """The third positional argument given to call_openrouter contains the source text."""
        from tokenizer import translate_to_english
        canned = self._make_response("ok")
        with patch("tokenizer.call_openrouter", return_value=canned) as mock_call:
            translate_to_english("Guten Tag", "key")
        positional, _keyword = mock_call.call_args
        assert "Guten Tag" in positional[2]
| # --------------------------------------------------------------------------- | |
| # Phase 2 — render_tokens_html | |
| # --------------------------------------------------------------------------- | |
class TestRenderTokensHtml:
    """Unit tests for render_tokens_html(tokens, oov_words) -> str.

    The first group exercises plain rendering (return type, escaping, OOV
    highlight, alternating backgrounds).  The second group drives the
    ``decoded_view=True`` mode with a mocked tokenizer.

    Non-ASCII fixtures are written as unicode escapes on purpose: byte-level
    BPE token strings look like mojibake and get silently "repaired" by text
    clean-up tools, which turns the assertions into contradictions.
    """

    @staticmethod
    def _render_decoded(tokens, tokenizer):
        """Render *tokens* in decoded view with special tokens hidden."""
        from tokenizer import render_tokens_html

        return render_tokens_html(
            tokens,
            set(),
            tokenizer=tokenizer,
            decoded_view=True,
            hide_special_tokens=True,
        )

    def test_returns_string(self):
        """render_tokens_html must return a string."""
        from tokenizer import render_tokens_html

        tokens = [{"token": "Hello", "id": 1}, {"token": " world", "id": 2}]
        result = render_tokens_html(tokens, set())
        assert isinstance(result, str)

    def test_each_token_appears_in_output(self):
        """Each token text must appear somewhere in the HTML output."""
        from tokenizer import render_tokens_html

        tokens = [{"token": "Hello", "id": 1}, {"token": " world", "id": 2}]
        result = render_tokens_html(tokens, set())
        assert "Hello" in result
        assert "world" in result

    def test_span_style_preserves_whitespace(self):
        """Token span style should preserve visible spaces between decoded chunks."""
        from tokenizer import render_tokens_html

        tokens = [{"token": " hello", "id": 1}]
        result = render_tokens_html(tokens, set())
        assert "white-space:pre" in result
        assert "color:#000" in result

    def test_oov_tokens_have_highlight_colour(self):
        """OOV tokens must be rendered with #ffcccc background."""
        from tokenizer import render_tokens_html

        tokens = [{"token": "superlongword", "id": 99}]
        result = render_tokens_html(tokens, {"superlongword"})
        assert "#ffcccc" in result

    def test_normal_tokens_do_not_have_oov_highlight(self):
        """Non-OOV tokens must NOT be rendered with #ffcccc background."""
        from tokenizer import render_tokens_html

        tokens = [{"token": "Hello", "id": 1}]
        result = render_tokens_html(tokens, set())
        assert "#ffcccc" not in result

    def test_alternating_bg_colours_for_normal_tokens(self):
        """Normal tokens alternate between two distinct background colours."""
        import re

        from tokenizer import render_tokens_html

        tokens = [
            {"token": "a", "id": 1},
            {"token": "b", "id": 2},
            {"token": "c", "id": 3},
        ]
        result = render_tokens_html(tokens, set())
        # Must have at least two different background colour values.
        colours = re.findall(r"background[^;\"]*?:([^;\"]+)", result)
        unique = {c.strip() for c in colours}
        assert len(unique) >= 2

    def test_html_escapes_special_chars_in_token(self):
        """Token text with < > & must be HTML-escaped."""
        from tokenizer import render_tokens_html

        tokens = [{"token": "<br>", "id": 5}]
        result = render_tokens_html(tokens, set())
        assert "<br>" not in result
        # A bare "<" occurs in any HTML output, so check for the entity that
        # escaping must have produced instead.
        assert "&lt;" in result

    def test_empty_tokens_returns_string(self):
        """Empty token list returns an empty or valid HTML string (no crash)."""
        from tokenizer import render_tokens_html

        result = render_tokens_html([], set())
        assert isinstance(result, str)

    def test_oov_word_matching_is_case_insensitive_or_exact(self):
        """OOV matching uses the exact word from the oov_words set."""
        from tokenizer import render_tokens_html

        tokens = [{"token": "Cat", "id": 10}]
        result = render_tokens_html(tokens, {"Cat"})
        assert "#ffcccc" in result

    def test_decoded_view_hides_special_tokens_by_default(self):
        """Decoded view should skip special tokens like BOS when configured."""
        tokens = [
            {"token": "<|begin_of_text|>", "id": 1},
            {"token": "hello", "id": 2},
        ]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = [1]
        mock_tok.decode.side_effect = (
            lambda ids, **kwargs: "" if ids == [1] else "hello"
        )
        result = self._render_decoded(tokens, mock_tok)
        assert "begin_of_text" not in result
        assert "hello" in result

    def test_decoded_view_can_show_readable_decoded_text(self):
        """Decoded view should prefer tokenizer.decode output over raw token text."""
        tokens = [{"token": "Ġhello", "id": 42}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        mock_tok.decode.return_value = " hello"
        result = self._render_decoded(tokens, mock_tok)
        assert "Ġhello" not in result
        assert "hello" in result

    def test_decoded_view_handles_multibyte_text_via_cumulative_decode(self):
        """Readable mode should use cumulative decode chunks for multibyte scripts."""
        # Byte-level spellings of the UTF-8 bytes E0 AE B5 / E0 AE A3 / E0 AE 95
        # (Tamil letters).  Individually they are unreadable; only the
        # cumulative decode below yields real text.
        raw_va = "\u00e0\u00ae\u00b5"
        raw_nna = "\u00e0\u00ae\u00a3"
        raw_ka = "\u00e0\u00ae\u0137"
        tamil_va = "\u0bb5"
        tamil_nna = "\u0ba3"
        tokens = [
            {"token": raw_va, "id": 10},
            {"token": raw_nna, "id": 11},
            {"token": raw_ka, "id": 12},
        ]
        cumulative = {
            (10,): "",
            (10, 11): tamil_va,
            (10, 11, 12): tamil_va + tamil_nna,
        }
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        mock_tok.decode.side_effect = (
            lambda ids, **kwargs: cumulative.get(tuple(ids), "")
        )
        result = self._render_decoded(tokens, mock_tok)
        # The raw byte-level spelling must be gone, the decoded letter present.
        assert raw_va not in result
        assert tamil_va in result

    def test_decoded_view_uses_byte_decoder_path_when_available(self):
        """Readable mode should prefer byte-decoder accumulation for byte-level tokens."""
        tokens = [{"token": "A", "id": 1}, {"token": "B", "id": 2}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        mock_tok.byte_decoder = {"A": 65, "B": 66}
        # If decode() were used, we'd see replacement chars; byte path should avoid this.
        mock_tok.decode.return_value = "\ufffd\ufffd"
        result = self._render_decoded(tokens, mock_tok)
        assert "\ufffd\ufffd" not in result
        assert ">A</span>" in result
        assert ">B</span>" in result

    def test_decoded_view_prefers_convert_tokens_to_string_for_readable_output(self):
        """Readable mode should use convert_tokens_to_string when available."""
        tamil_va = "\u0bb5"
        # Byte-level token spellings, so the final assertion can only pass via
        # the decoded text and not via the raw token strings.
        tokens = [
            {"token": "\u00e0\u00ae\u00b5", "id": 1},
            {"token": "\u00e0\u00ae\u00a3", "id": 2},
        ]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        mock_tok.convert_tokens_to_string.side_effect = ["", tamil_va]
        # If this decode path were used directly, we'd likely see noise.
        mock_tok.decode.return_value = "\ufffd\ufffd"
        result = self._render_decoded(tokens, mock_tok)
        assert "\ufffd\ufffd" not in result
        assert tamil_va in result

    def test_decoded_view_handles_replacement_prefix_drift(self):
        """If previous decoded text contains replacement chars, we should still recover new readable chars."""
        tamil_va = "\u0bb5"
        tokens = [{"token": "x", "id": 1}, {"token": "y", "id": 2}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        # Step 1 has replacement char, step 2 resolves to a real Tamil letter.
        mock_tok.convert_tokens_to_string.side_effect = ["\ufffd", tamil_va]
        result = self._render_decoded(tokens, mock_tok)
        assert tamil_va in result
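# ---------------------------------------------------------------------------
# Phase 2 (continued) — render_tokens_html decoded-view edge cases.
# Tests for _handle_single, _handle_compare and the Phase 3
# build_tokenizer_ui smoke test follow below.
# ---------------------------------------------------------------------------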
class TestDecodedViewGenericFallbackEdgeCases:
    """Cover edge cases in the generic fallback decode path of render_tokens_html.

    Every test renders with ``decoded_view=True`` and
    ``hide_special_tokens=True`` against a MagicMock tokenizer whose
    ``decode`` / ``convert_tokens_to_string`` / ``byte_decoder`` are rigged to
    trigger one specific branch.
    """

    @staticmethod
    def _render_decoded(tokens, tokenizer):
        """Render *tokens* in decoded view with special tokens hidden."""
        from tokenizer import render_tokens_html

        return render_tokens_html(
            tokens,
            set(),
            tokenizer=tokenizer,
            decoded_view=True,
            hide_special_tokens=True,
        )

    def test_decode_exception_falls_back_to_prev(self):
        """When tokenizer.decode raises, use previous decoded text."""
        tokens = [{"token": "a", "id": 1}, {"token": "b", "id": 2}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        # First call ok, second raises.
        mock_tok.decode.side_effect = ["a", Exception("decode error")]
        result = self._render_decoded(tokens, mock_tok)
        assert isinstance(result, str)

    def test_non_prefix_stable_decode_single_token_fallback(self):
        """When cumulative decode is not prefix-stable, fall back to single-token decode."""
        tokens = [{"token": "a", "id": 1}, {"token": "b", "id": 2}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []

        def _decode(ids, **kwargs):
            if len(ids) == 1 and ids[0] == 1:
                return "a"
            # Two ids -> text that does not start with "a" (non-prefix-stable);
            # anything else is the single-token decode of id 2.
            return "XY" if len(ids) == 2 else "b"

        mock_tok.decode.side_effect = _decode
        result = self._render_decoded(tokens, mock_tok)
        assert "b" in result

    def test_non_prefix_stable_single_decode_exception(self):
        """When single-token decode also raises, chunk is empty string."""
        tokens = [{"token": "a", "id": 1}, {"token": "b", "id": 2}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []

        def _decode(ids, **kwargs):
            if len(ids) == 1 and ids[0] == 1:
                return "a"
            if len(ids) == 2:
                return "XY"  # non-prefix-stable
            # Single-token fallback for id=2.
            raise Exception("single decode failed")

        mock_tok.decode.side_effect = _decode
        result = self._render_decoded(tokens, mock_tok)
        assert isinstance(result, str)

    def test_replacement_char_stripped_in_generic_fallback(self):
        """Replacement characters should be stripped in the generic fallback path."""
        tokens = [{"token": "x", "id": 1}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        mock_tok.decode.return_value = "he\ufffdllo"
        result = self._render_decoded(tokens, mock_tok)
        assert "\ufffd" not in result
        assert "hello" in result

    def test_byte_decoder_hides_special_tokens(self):
        """Byte decoder path should hide special tokens when configured."""
        tokens = [{"token": "<s>", "id": 1}, {"token": "A", "id": 2}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = [1]
        mock_tok.byte_decoder = {"A": 65}
        result = self._render_decoded(tokens, mock_tok)
        # Check the raw and the HTML-escaped spelling: a leaked special token
        # would normally show up escaped, which the raw check alone misses.
        assert "<s>" not in result
        assert "&lt;s&gt;" not in result
        assert ">A</span>" in result

    def test_byte_decoder_non_mapped_char(self):
        """Chars not in byte_decoder should encode via UTF-8 fallback."""
        tokens = [{"token": "\u00e9", "id": 1}]  # e-acute
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        # Non-empty mapping that lacks the token's character.  An empty dict
        # is falsy, so it is avoided here (presumably the byte path is chosen
        # by a truthiness check on byte_decoder -- confirm in tokenizer.py).
        mock_tok.byte_decoder = {"A": 65}
        result = self._render_decoded(tokens, mock_tok)
        assert isinstance(result, str)

    def test_convert_tokens_to_string_exception_falls_to_next_path(self):
        """When convert_tokens_to_string raises, should fall through to byte or generic path."""
        tokens = [{"token": "hi", "id": 1}]
        mock_tok = MagicMock()
        mock_tok.all_special_ids = []
        mock_tok.convert_tokens_to_string.side_effect = Exception("not supported")
        mock_tok.decode.return_value = "hi"
        result = self._render_decoded(tokens, mock_tok)
        assert "hi" in result
class TestHandleSingle:
    """Tests for _handle_single extracted handler.

    ``_handle_single(model, text, <int>, decoded_view, english_text=...)`` is
    expected to return an ``(html, stats_markdown)`` pair; the tokenizer is
    always replaced by a mock via ``tokenizer.get_tokenizer``.
    """

    def test_returns_html_and_stats(self):
        """A successful call yields HTML plus the three core stat lines."""
        from tokenizer import _handle_single

        mock_tok = MagicMock()
        mock_tok.encode.return_value = [1, 2, 3]
        mock_tok.convert_ids_to_tokens.return_value = ["hello", "world", "!"]
        mock_tok.all_special_ids = []
        with patch("tokenizer.get_tokenizer", return_value=mock_tok):
            html, stats = _handle_single("gpt2", "hello world!", 3, False)
        assert isinstance(html, str)
        assert "**Tokens:** 3" in stats
        assert "**Fragmentation ratio:**" in stats
        assert "**Detected language:**" in stats

    def test_error_returns_empty_html_and_error_message(self):
        """A tokenizer lookup failure is reported in stats, not raised."""
        from tokenizer import _handle_single

        with patch("tokenizer.get_tokenizer", side_effect=ValueError("unknown")):
            html, stats = _handle_single("bad_model", "text", 3, False)
        assert html == ""
        assert "Error:" in stats

    def test_decoded_view_passed_through(self):
        """decoded_view=True still produces HTML and the token count."""
        from tokenizer import _handle_single

        mock_tok = MagicMock()
        mock_tok.encode.return_value = [1]
        mock_tok.convert_ids_to_tokens.return_value = ["hi"]
        mock_tok.all_special_ids = []
        mock_tok.decode.return_value = "hi"
        with patch("tokenizer.get_tokenizer", return_value=mock_tok):
            html, stats = _handle_single("gpt2", "hi", 3, True)
        assert isinstance(html, str)
        assert "**Tokens:** 1" in stats

    # --- GH-7: enriched stats ---

    def _make_mock_tok(self, token_count: int):
        """Return a mock tokenizer that encodes any text to *token_count* tokens."""
        tok = MagicMock()
        tok.encode.return_value = list(range(token_count))
        tok.convert_ids_to_tokens.return_value = [
            f"t{i}" for i in range(token_count)
        ]
        tok.all_special_ids = []
        return tok

    def test_context_usage_always_shown(self):
        """Stats should always include context usage line."""
        from tokenizer import _handle_single

        with patch("tokenizer.get_tokenizer", return_value=self._make_mock_tok(3)):
            _, stats = _handle_single("gpt2", "hello world!", 3, False)
        assert "Context usage" in stats

    def test_english_text_shows_rtc_one(self):
        """When detected language is English, RTC should be 1.0x."""
        from tokenizer import _handle_single

        with patch(
            "tokenizer.get_tokenizer", return_value=self._make_mock_tok(3)
        ), patch("tokenizer.detect_language", return_value="en"):
            _, stats = _handle_single("gpt2", "hello world", 3, False)
        assert "RTC" in stats
        assert "1.0" in stats
        assert "low" in stats.lower()

    def test_non_english_with_english_text_shows_rtc(self):
        """When English equivalent provided, compute and show RTC."""
        from tokenizer import _handle_single

        source_tok = self._make_mock_tok(6)
        with patch("tokenizer.get_tokenizer", return_value=source_tok), patch(
            "tokenizer.detect_language", return_value="ar"
        ), patch("tokenizer.tokenize_text") as mock_tt:
            # tokenize_text is patched directly: 6 source tokens vs 3 English
            # tokens, so the expected ratio is 2.0.
            mock_tt.side_effect = [
                [{"token": f"t{i}", "id": i} for i in range(6)],  # source
                [{"token": f"t{i}", "id": i} for i in range(3)],  # english
            ]
            _, stats = _handle_single(
                "gpt2", "مرحبا بالعالم", 3, False,
                english_text="hello world",
            )
        assert "RTC" in stats
        assert "2.0" in stats

    def test_non_english_no_english_text_shows_placeholder(self):
        """When non-English and no English text, show placeholder."""
        from tokenizer import _handle_single

        with patch(
            "tokenizer.get_tokenizer", return_value=self._make_mock_tok(3)
        ), patch("tokenizer.detect_language", return_value="ar"):
            _, stats = _handle_single("gpt2", "مرحبا", 3, False)
        assert "RTC" in stats
        assert "English" in stats  # placeholder message mentioning English

    def test_existing_callers_still_work_without_english_text(self):
        """Backward compat: calling without english_text still works."""
        from tokenizer import _handle_single

        with patch("tokenizer.get_tokenizer", return_value=self._make_mock_tok(3)):
            html, stats = _handle_single("gpt2", "hello", 3, False)
        assert isinstance(html, str)
        assert "**Tokens:** 3" in stats
class TestHandleCompare:
    """Exercise the extracted _handle_compare handler.

    The handler is called as
    ``_handle_compare(text, name_a, name_b, decoded_view, english_text=...)``
    and is expected to hand back ``(html_a, html_b, ratio_markdown)``.
    """

    def test_returns_two_html_and_ratio_markdown(self):
        """Both panes render and each tokenizer's count appears in the summary."""
        from tokenizer import _handle_compare

        first = MagicMock()
        first.encode.return_value = [1, 2]
        first.convert_ids_to_tokens.return_value = ["he", "llo"]
        first.all_special_ids = []

        second = MagicMock()
        second.encode.return_value = [1, 2, 3, 4]
        second.convert_ids_to_tokens.return_value = ["h", "e", "l", "lo"]
        second.all_special_ids = []

        with patch("tokenizer.get_tokenizer", side_effect=[first, second]):
            outcome = _handle_compare("hello", "gpt2", "mistral", False)

        pane_a, pane_b, summary = outcome
        assert isinstance(pane_a, str)
        assert isinstance(pane_b, str)
        assert "**gpt2:** 2 tokens" in summary
        assert "**mistral:** 4 tokens" in summary

    def test_error_returns_empty_and_error_message(self):
        """A failing tokenizer lookup blanks both panes and reports the error."""
        from tokenizer import _handle_compare

        with patch("tokenizer.get_tokenizer", side_effect=ValueError("bad")):
            pane_a, pane_b, summary = _handle_compare("text", "bad", "bad2", False)

        assert pane_a == ""
        assert pane_b == ""
        assert "Error:" in summary

    # --- GH-7: enriched compare stats ---

    def _make_mock_tok(self, token_count: int):
        """Build a mock tokenizer yielding *token_count* ids named t0..tN."""
        ids = list(range(token_count))
        fake = MagicMock()
        fake.encode.return_value = ids
        fake.convert_ids_to_tokens.return_value = [f"t{i}" for i in range(token_count)]
        fake.all_special_ids = []
        return fake

    def test_compare_with_english_text_shows_rtc(self):
        """When English text provided, compare should show RTC for each tokenizer."""
        from tokenizer import _handle_compare

        pair = [self._make_mock_tok(4), self._make_mock_tok(6)]
        with patch("tokenizer.get_tokenizer", side_effect=pair), patch(
            "tokenizer.tokenize_text"
        ) as fake_tokenize:
            fake_tokenize.side_effect = [
                [{"token": f"t{i}", "id": i} for i in range(4)],  # source A
                [{"token": f"t{i}", "id": i} for i in range(6)],  # source B
                [{"token": f"t{i}", "id": i} for i in range(3)],  # english A
                [{"token": f"t{i}", "id": i} for i in range(3)],  # english B
            ]
            _, _, summary = _handle_compare(
                "مرحبا", "gpt2", "mistral", False,
                english_text="hello",
            )
        assert "RTC" in summary

    def test_compare_shows_which_is_more_efficient(self):
        """Compare should note which tokenizer has lower RTC."""
        from tokenizer import _handle_compare

        pair = [self._make_mock_tok(4), self._make_mock_tok(8)]
        with patch("tokenizer.get_tokenizer", side_effect=pair), patch(
            "tokenizer.tokenize_text"
        ) as fake_tokenize:
            fake_tokenize.side_effect = [
                [{"token": f"t{i}", "id": i} for i in range(4)],  # source A
                [{"token": f"t{i}", "id": i} for i in range(8)],  # source B
                [{"token": f"t{i}", "id": i} for i in range(3)],  # english A
                [{"token": f"t{i}", "id": i} for i in range(3)],  # english B
            ]
            _, _, summary = _handle_compare(
                "مرحبا", "gpt2", "mistral", False,
                english_text="hello",
            )
        lowered = summary.lower()
        assert "efficient" in lowered or "better" in lowered

    def test_compare_without_english_text_still_works(self):
        """Backward compat: compare without english_text works as before."""
        from tokenizer import _handle_compare

        pair = [self._make_mock_tok(2), self._make_mock_tok(4)]
        with patch("tokenizer.get_tokenizer", side_effect=pair):
            _, _, summary = _handle_compare("hello", "gpt2", "mistral", False)
        assert "**gpt2:**" in summary
        assert "**mistral:**" in summary
class TestBuildTokenizerUi:
    """Smoke coverage for build_tokenizer_ui(), which should yield gr.Blocks."""

    def test_returns_gradio_blocks(self):
        """Constructing the UI must not raise and must produce a Gradio Blocks."""
        from gradio import Blocks
        from tokenizer import build_tokenizer_ui

        ui = build_tokenizer_ui()
        assert isinstance(ui, Blocks)