from datasets import load_dataset
from tokenizers import Tokenizer

if __name__ == "__main__":
    # Load the dataset as a stream and the trained tokenizer
    dataset_stream = load_dataset(
        "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
        split="train",
        streaming=True,
    )

    tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
    unk_id = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()

    print("Vocabulary size:", vocab_size)

    # Quick sanity check: encode and decode a sample sentence
    sample = tokenizer.encode(
        "Apostas combinadas: Fantástico exibe mensagens exclusivas da "
        "investigação contra Bruno Henrique, do Flamengo"
    )
    print(tokenizer.decode(sample.ids, skip_special_tokens=True))

    # Counters
    total_tokens = 0
    total_words = 0
    unk_tokens = 0
    seen_ids = set()

    batch_size = 512
    batch_counter = 0

    def batch_iterator(stream, bs):
        """Group streamed examples into batches of `bs` texts."""
        buf = []
        for ex in stream:
            buf.append(ex["text"])
            if len(buf) == bs:
                yield buf
                buf = []
        if buf:
            yield buf

    for texts in batch_iterator(dataset_stream, batch_size):
        # Tokenize the whole batch at once
        encs = tokenizer.encode_batch(texts)

        # Count whitespace-separated words and tokens in the batch
        words_in_batch = sum(len(t.split()) for t in texts)
        total_words += words_in_batch

        for enc in encs:
            total_tokens += len(enc.ids)
            unk_tokens += enc.ids.count(unk_id)
            seen_ids.update(enc.ids)

        # Partial report every 100 batches
        if batch_counter % 100 == 0:
            oov_rate = unk_tokens / total_tokens * 100
            frag = total_tokens / total_words
            coverage = len(seen_ids) / vocab_size * 100
            ttr = len(seen_ids) / total_tokens

            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} tokens/word | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")

        batch_counter += 1

    # Final results
    oov_rate = unk_tokens / total_tokens * 100
    frag = total_tokens / total_words
    coverage = len(seen_ids) / vocab_size * 100
    ttr = len(seen_ids) / total_tokens

    print("\n=== Final Metrics ===")
    print(f"Total tokens: {total_tokens}")
    print(f"Total words: {total_words}")
    print(f"OOV rate: {oov_rate:.3f}%")
    print(f"Fragmentation: {frag:.3f} tokens/word")
    print(f"Vocab coverage: {coverage:.2f}% of the vocabulary used")
    print(f"Type–Token Ratio: {ttr:.4f}")
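
If you only want a quick smoke test before streaming the full split, a minimal sketch (assuming a `datasets` version whose iterable datasets expose `take(n)`; the 5,000-example cap is an arbitrary illustration):

from datasets import load_dataset

# Cap the stream at the first few thousand examples for a fast dry run
# (assumes IterableDataset.take is available in the installed datasets version).
small_stream = load_dataset(
    "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
    split="train",
    streaming=True,
).take(5_000)

# `small_stream` can then be passed to batch_iterator() in place of dataset_stream,
# so the metric loop runs end to end in minutes instead of hours.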