from datasets import load_dataset
from tokenizers import Tokenizer
if __name__ == "__main__":
    # Open the corpus as a streaming dataset (no full download) and load the
    # pre-trained 36k BPE tokenizer from disk.
    dataset_stream = load_dataset(
        "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
        split="train",
        streaming=True,
    )
    tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
    encode = tokenizer.encode
    unk_id = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()
    print("Tamanho do vocabulário:", tokenizer.get_vocab_size())

    # Quick smoke test: round-trip a sample sentence through encode/decode.
    enc = tokenizer.encode("Apostas combinadas: Fantástico exibe mensagens exclusivas da investigação contra Bruno Henrique, do Flamengo")
    print(tokenizer.decode(enc.ids, skip_special_tokens=True))

    # Running accumulators for the corpus-wide metrics.
    total_tokens = 0     # tokens emitted by the tokenizer
    total_words = 0      # whitespace-split words in the raw text
    unk_tokens = 0       # occurrences of the [UNK] token id
    seen_ids = set()     # distinct token ids observed so far
    batch_size = 512
    batch_counter = 0
def batch_iterator(stream, bs):
    """Yield lists of up to *bs* raw text strings pulled from *stream*.

    Each element of *stream* is expected to be a mapping with a "text" key.
    The final batch may be shorter than *bs* when the stream is exhausted.
    """
    pending = []
    for example in stream:
        pending.append(example["text"])
        if len(pending) < bs:
            continue
        yield pending
        pending = []
    # Flush whatever is left over after the stream ends.
    if pending:
        yield pending
if __name__ == "__main__":
    for texts in batch_iterator(dataset_stream, batch_size):
        # Tokenize the whole batch in one call — much faster than per-text encode.
        encs = tokenizer.encode_batch(texts)

        # Whitespace-split word count is the denominator for fragmentation.
        total_words += sum(len(t.split()) for t in texts)
        for enc in encs:
            total_tokens += len(enc.ids)
            unk_tokens += enc.ids.count(unk_id)
            seen_ids.update(enc.ids)

        # Progress report every 100 batches (including the very first batch).
        if batch_counter % 100 == 0:
            # Guard every ratio so an all-empty batch cannot abort the run
            # with ZeroDivisionError.
            oov_rate = unk_tokens / total_tokens * 100 if total_tokens else 0.0
            frag = total_tokens / total_words if total_words else 0.0
            coverage = len(seen_ids) / vocab_size * 100
            ttr = len(seen_ids) / total_tokens if total_tokens else 0.0
            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} t/palavra | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")
        batch_counter += 1

    # Final corpus-wide metrics. Divisions are guarded so an empty stream
    # prints zeros instead of raising ZeroDivisionError.
    oov_rate = unk_tokens / total_tokens * 100 if total_tokens else 0.0
    frag = total_tokens / total_words if total_words else 0.0
    coverage = len(seen_ids) / vocab_size * 100
    ttr = len(seen_ids) / total_tokens if total_tokens else 0.0
    print("\n=== Métricas Finais ===")
    print(f"Total de tokens: {total_tokens}")
    print(f"Total de palavras: {total_words}")
    print(f"OOV rate: {oov_rate:.3f}%")
    print(f"Fragmentação: {frag:.3f} tokens/palavra")
    print(f"Voc. coverage: {coverage:.2f}% do vocabulário usado")
    print(f"Type–Token Ratio: {ttr:.4f}")