Spaces:
Runtime error
Runtime error
| import tiktoken | |
| from transformers import AutoTokenizer | |
| # ... existing code ... | |
def _word_spans(text):
    """Return the ``(start, end)`` character offsets of each ``text.split()`` word."""
    spans = []
    cursor = 0
    for word in text.split():
        # Words contain no whitespace and arrive in text order, so the first
        # occurrence at or after the previous word's end is this very word.
        start = text.find(word, cursor)
        cursor = start + len(word)
        spans.append((start, cursor))
    return spans


def _map_tokens_to_words(text, tokens, spans):
    """Greedily align each decoded token with the index of a word in *spans*.

    The alignment is approximate: a token's stripped text is looked up at the
    current position in *text* and, failing that, anywhere up to the end of
    the following word. Tokens whose text cannot be located (for example
    replacement characters produced by decoding a partial multi-byte
    sequence) are attributed to the word at the current position without
    consuming any text. Whitespace-only tokens map to ``-1``.

    *spans* must be non-empty. Every loop is bounded by ``len(text)`` or
    ``len(spans)``, so the function always terminates.
    """
    text_len = len(text)
    last_word = len(spans) - 1
    cursor = 0
    word_idx = 0
    token_word_map = []
    for token in tokens:
        piece = token.strip()
        if not piece:
            # Pure whitespace belongs to no word.
            token_word_map.append(-1)
            continue
        # Move past inter-word whitespace, then onto the word under the cursor.
        while cursor < text_len and text[cursor].isspace():
            cursor += 1
        while word_idx < last_word and spans[word_idx][1] <= cursor:
            word_idx += 1
        if text.startswith(piece, cursor):
            hit = cursor
        else:
            # Limit the search to the current and next word so that a token
            # which merely resembles later text cannot drag the cursor ahead.
            window_end = spans[min(word_idx + 1, last_word)][1]
            hit = text.find(piece, cursor, window_end)
        if hit >= 0:
            while word_idx < last_word and spans[word_idx][1] <= hit:
                word_idx += 1
            cursor = hit + len(piece)
        token_word_map.append(word_idx)
    return token_word_map


def analyze_tokens_detailed(text, model):
    """Tokenize *text* with the tokenizer for *model* and describe each token.

    Model names containing ``'gpt'`` are tokenized with
    ``tiktoken.encoding_for_model``; all other names are loaded with
    ``AutoTokenizer.from_pretrained`` and encoded without special tokens.

    Args:
        text: The string to tokenize.
        model: Model name used to select and load the tokenizer.

    Returns:
        A dict with the keys

        - ``'model'``: the *model* argument, unchanged.
        - ``'decoded_output'``: the full token id sequence decoded at once.
        - ``'tokens'``: a list with one dict per token containing
          ``'token'`` (the token id decoded on its own), ``'token_id'``,
          ``'decoded'`` (same value as ``'token'``), ``'token_length'``,
          ``'nsl'`` (token length / max token length in the sequence) and
          ``'subword_fertility'`` (number of tokens aligned to the same
          whitespace-separated word; 0 for whitespace-only tokens, and 1
          for every token when *text* contains no words).
    """
    from collections import Counter

    # Tokenize
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)
    tokens = [tokenizer.decode([tid]) for tid in token_ids]

    # Decoded output for the entire sequence
    decoded_output = tokenizer.decode(token_ids)

    # Token lengths; `or 1` keeps the NSL division safe when there are no
    # tokens or every token decodes to an empty string.
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths, default=0) or 1
    nsl_values = [length / max_token_length for length in token_lengths]

    # Subword fertility: number of tokens per word (approximate alignment)
    spans = _word_spans(text)
    if spans:
        token_word_map = _map_tokens_to_words(text, tokens, spans)
        fertility_counter = Counter(idx for idx in token_word_map if idx >= 0)
        token_fertility = [
            fertility_counter[idx] if idx >= 0 else 0
            for idx in token_word_map
        ]
    else:
        token_fertility = [1] * len(tokens)

    # Build table
    table = [
        {
            'token': token,
            'token_id': tid,
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert,
        }
        for token, tid, length, nsl, fert in zip(
            tokens, token_ids, token_lengths, nsl_values, token_fertility
        )
    ]
    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table,
    }
| # ... existing code ... | |