| | from datasets import load_dataset |
| | from tokenizers import Tokenizer |
| |
|
# --- helpers -----------------------------------------------------------------

def batch_iterator(stream, bs):
    """Yield lists of up to *bs* raw text strings pulled from *stream*.

    *stream* is any iterable of examples carrying a ``"text"`` field
    (e.g. a streaming HuggingFace dataset). The final, possibly smaller,
    batch is flushed at the end so no example is dropped.
    """
    buf = []
    for ex in stream:
        buf.append(ex["text"])
        if len(buf) == bs:
            yield buf
            buf = []
    if buf:  # flush the trailing partial batch
        yield buf


def compute_metrics(total_tokens, total_words, unk_tokens, seen_ids, vocab_size):
    """Return ``(oov_rate_pct, fragmentation, coverage_pct, ttr)``.

    - oov_rate_pct: % of produced tokens that are the [UNK] id
    - fragmentation: tokens per whitespace-delimited word
    - coverage_pct: % of the vocabulary observed at least once
    - ttr: type-token ratio (distinct ids / total tokens)

    Returns all zeros when no tokens or words were seen, instead of
    raising ZeroDivisionError on an empty stream.
    """
    if total_tokens == 0 or total_words == 0:
        return 0.0, 0.0, 0.0, 0.0
    oov_rate = unk_tokens / total_tokens * 100
    frag = total_tokens / total_words
    coverage = len(seen_ids) / vocab_size * 100
    ttr = len(seen_ids) / total_tokens
    return oov_rate, frag, coverage, ttr


if __name__ == "__main__":
    dataset_stream = load_dataset(
        "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens",
        split="train",
        streaming=True,
    )
    tokenizer = Tokenizer.from_file("tokens-bpe-36k.json")
    # NOTE(review): token_to_id returns None when "[UNK]" is not in the
    # vocabulary, and list.count(None) is then always 0, silently reporting
    # a 0% OOV rate — confirm the tokenizer actually defines [UNK].
    unk_id = tokenizer.token_to_id("[UNK]")
    vocab_size = tokenizer.get_vocab_size()

    print("Tamanho do vocabulário:", vocab_size)
    # Smoke test: round-trip one sentence through encode/decode.
    enc = tokenizer.encode("Apostas combinadas: Fantástico exibe mensagens exclusivas da investigação contra Bruno Henrique, do Flamengo")
    print(tokenizer.decode(enc.ids, skip_special_tokens=True))

    total_tokens = 0   # tokens produced by the tokenizer
    total_words = 0    # whitespace-delimited words in the raw text
    unk_tokens = 0     # occurrences of the [UNK] token id
    seen_ids = set()   # distinct token ids observed so far

    batch_size = 512
    batch_counter = 0

    for texts in batch_iterator(dataset_stream, batch_size):
        encs = tokenizer.encode_batch(texts)

        total_words += sum(len(t.split()) for t in texts)

        for enc in encs:
            total_tokens += len(enc.ids)
            unk_tokens += enc.ids.count(unk_id)
            seen_ids.update(enc.ids)

        # Progress report every 100 batches (first report on batch 0).
        if batch_counter % 100 == 0:
            oov_rate, frag, coverage, ttr = compute_metrics(
                total_tokens, total_words, unk_tokens, seen_ids, vocab_size
            )
            print(f"[Batch {batch_counter:04d}] "
                  f"OOV: {oov_rate:.3f}% | "
                  f"Frag: {frag:.3f} t/palavra | "
                  f"Coverage: {coverage:.2f}% | "
                  f"TTR: {ttr:.4f}")
        batch_counter += 1

    oov_rate, frag, coverage, ttr = compute_metrics(
        total_tokens, total_words, unk_tokens, seen_ids, vocab_size
    )

    print("\n=== Métricas Finais ===")
    print(f"Total de tokens: {total_tokens}")
    print(f"Total de palavras: {total_words}")
    print(f"OOV rate: {oov_rate:.3f}%")
    print(f"Fragmentação: {frag:.3f} tokens/palavra")
    print(f"Voc. coverage: {coverage:.2f}% do vocabulário usado")
    print(f"Type–Token Ratio: {ttr:.4f}")
|