| | from datasets import load_dataset |
| | from tokenizers import Tokenizer |
| |
|
# --- helpers -----------------------------------------------------------------

def batch_iterator(stream, bs):
    """Yield lists of up to *bs* raw text strings pulled from *stream*.

    *stream* is any iterable of examples carrying a ``"text"`` field
    (e.g. a streaming HuggingFace dataset). The final, possibly smaller,
    batch is flushed at the end so no example is dropped.
    """
    buf = []
    for ex in stream:
        buf.append(ex["text"])
        if len(buf) == bs:
            yield buf
            buf = []
    if buf:  # flush the trailing partial batch
        yield buf


def compute_metrics(total_tokens, total_words, unk_tokens, seen_ids, vocab_size):
    """Return ``(oov_rate_pct, fragmentation, coverage_pct, ttr)``.

    - oov_rate_pct: % of produced tokens that are the [UNK] id
    - fragmentation: tokens per whitespace-delimited word
    - coverage_pct: % of the vocabulary observed at least once
    - ttr: type-token ratio (distinct ids / total tokens)

    Returns all zeros when no tokens or words were seen, instead of
    raising ZeroDivisionError on an empty stream.
    """
    if total_tokens == 0 or total_words == 0:
        return 0.0, 0.0, 0.0, 0.0
    oov_rate = unk_tokens / total_tokens * 100
    frag = total_tokens / total_words
    coverage = len(seen_ids) / vocab_size * 100
    ttr = len(seen_ids) / total_tokens
    return oov_rate, frag, coverage, ttr


if __name__ == "__main__":
    dataset_stream = load_dataset(
        "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
        split="train",
        streaming=True,
    )
    tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
    # NOTE(review): token_to_id returns None when "[UNK]" is not in the
    # vocabulary, and list.count(None) is then always 0, silently reporting
    # a 0% OOV rate — confirm the tokenizer actually defines [UNK].
    unk_id = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()

    print("Tamanho do vocabulário:", vocab_size)
    # Smoke test: round-trip one sentence through encode/decode.
    enc = tokenizer.encode("Apostas combinadas: Fantástico exibe mensagens exclusivas da investigação contra Bruno Henrique, do Flamengo")
    print(tokenizer.decode(enc.ids, skip_special_tokens=True))

    total_tokens = 0   # tokens produced by the tokenizer
    total_words = 0    # whitespace-delimited words in the raw text
    unk_tokens = 0     # occurrences of the [UNK] token id
    seen_ids = set()   # distinct token ids observed so far

    batch_size = 512
    batch_counter = 0

    for texts in batch_iterator(dataset_stream, batch_size):
        encs = tokenizer.encode_batch(texts)

        total_words += sum(len(t.split()) for t in texts)

        for enc in encs:
            total_tokens += len(enc.ids)
            unk_tokens += enc.ids.count(unk_id)
            seen_ids.update(enc.ids)

        # Progress report every 100 batches (first report on batch 0).
        if batch_counter % 100 == 0:
            oov_rate, frag, coverage, ttr = compute_metrics(
                total_tokens, total_words, unk_tokens, seen_ids, vocab_size
            )
            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} t/palavra | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")
        batch_counter += 1

    oov_rate, frag, coverage, ttr = compute_metrics(
        total_tokens, total_words, unk_tokens, seen_ids, vocab_size
    )

    print("\n=== Métricas Finais ===")
    print(f"Total de tokens: {total_tokens}")
    print(f"Total de palavras: {total_words}")
    print(f"OOV rate: {oov_rate:.3f}%")
    print(f"Fragmentação: {frag:.3f} tokens/palavra")
    print(f"Voc. coverage: {coverage:.2f}% do vocabulário usado")
    print(f"Type–Token Ratio: {ttr:.4f}")
|