File size: 2,692 Bytes
58d9159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from datasets import load_dataset
from tokenizers import Tokenizer

if __name__ == "__main__":
    # Stream the corpus (no local download) and load the trained BPE tokenizer.
    dataset_stream = load_dataset(
        "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
        split="train",
        streaming=True
    )
    tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
    # token_to_id returns None when the token is absent from the vocab;
    # list.count(None) is then 0, so the OOV count degrades gracefully.
    unk_id     = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()

    print("Tamanho do vocabulário:", vocab_size)
    # Quick sanity check: round-trip one sample sentence through the tokenizer.
    enc = tokenizer.encode("Apostas combinadas: Fantástico exibe mensagens exclusivas da investigação contra Bruno Henrique, do Flamengo")
    print(tokenizer.decode(enc.ids, skip_special_tokens=True))

    # Running counters accumulated over the whole stream.
    total_tokens = 0      # subword tokens emitted
    total_words  = 0      # whitespace-separated words read
    unk_tokens   = 0      # occurrences of [UNK]
    seen_ids     = set()  # distinct token ids observed so far

    batch_size    = 512
    batch_counter = 0

    def batch_iterator(stream, bs):
        """Yield lists of up to `bs` texts drawn from the streaming dataset."""
        buf = []
        for ex in stream:
            buf.append(ex["text"])
            if len(buf) == bs:
                yield buf
                buf = []
        if buf:  # flush the last, possibly short, batch
            yield buf

    def current_metrics():
        """Return (oov %, tokens/word, vocab coverage %, type-token ratio)
        computed from the counters accumulated so far.

        Caller must ensure total_tokens > 0; total_words is clamped to 1
        to survive whitespace-only texts that still produce tokens.
        """
        return (
            unk_tokens / total_tokens * 100,
            total_tokens / max(total_words, 1),
            len(seen_ids) / vocab_size * 100,
            len(seen_ids) / total_tokens,
        )

    for texts in batch_iterator(dataset_stream, batch_size):
        # Tokenize the whole batch in a single call.
        encs = tokenizer.encode_batch(texts)

        # Word count uses plain whitespace splitting (defines the
        # denominator of the fragmentation metric).
        total_words += sum(len(t.split()) for t in texts)

        for enc in encs:
            total_tokens += len(enc.ids)
            unk_tokens   += enc.ids.count(unk_id)
            seen_ids.update(enc.ids)

        # Progress report every 100 batches (including the very first).
        if batch_counter % 100 == 0:
            oov_rate, frag, coverage, ttr = current_metrics()
            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} t/palavra | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")
        batch_counter += 1

    # Final report; fail with a clear message instead of ZeroDivisionError
    # if the stream produced no tokens at all.
    if total_tokens == 0:
        raise SystemExit("No text processed; metrics unavailable.")
    oov_rate, frag, coverage, ttr = current_metrics()

    print("\n=== Métricas Finais ===")
    print(f"Total de tokens: {total_tokens}")
    print(f"Total de palavras: {total_words}")
    print(f"OOV rate:         {oov_rate:.3f}%")
    print(f"Fragmentação:     {frag:.3f} tokens/palavra")
    print(f"Voc. coverage:    {coverage:.2f}% do vocabulário usado")
    print(f"Type–Token Ratio: {ttr:.4f}")