Spaces:

wldmr
/

transcriptifier-st-hf7

Runtime error

App Files Files Community

transcriptifier-st-hf7 / metrics.py

wldmr

init

68d26c9 over 2 years ago

raw

history blame contribute delete

1.82 kB

	# Import nltk library for natural language processing
	import nltk
	import os
	from transformers import AutoTokenizer

	def load_nltk():
	    """Make sure the NLTK 'punkt' tokenizer data is installed.

	    The resource is located through NLTK's own data search path instead of
	    a single hard-coded file, so the check is not tied to one particular
	    home directory. When the resource cannot be found it is downloaded with
	    nltk.download('punkt').

	    Returns:
	        None. Progress is reported with print().
	    """
	    try:
	        # nltk.data.find raises LookupError when the resource is missing.
	        punkt_location = nltk.data.find('tokenizers/punkt')
	    except LookupError:
	        print("downloading punkt file")
	        nltk.download('punkt')
	    else:
	        print('nltk punkt file exists in ', punkt_location)


	def token_count(text):
	    """Fit a fresh BPE encoder on *text* and return its tokenization of *text*.

	    NOTE: despite the function name, the value returned is the result of
	    Encoder.tokenize(text) itself, not its length; callers that need a
	    count have to take len() of the result.

	    Args:
	        text: The string to tokenize. It is split on whitespace to build
	            the training input for the encoder.

	    Returns:
	        Whatever Encoder.tokenize() produces for *text*.
	    """
	    # Imported lazily so the bpe package is only needed when this is called.
	    from bpe import Encoder

	    # A new encoder is built and trained on every call.
	    bpe_encoder = Encoder(vocab_size=14735746)
	    bpe_encoder.fit(text.split())

	    return bpe_encoder.tokenize(text)

	# Lazily created GPT-2 tokenizer, shared by every num_tokens() call.
	_GPT2_TOKENIZER = None


	def _get_gpt2_tokenizer():
	    """Return the shared GPT-2 tokenizer, loading it on first use."""
	    global _GPT2_TOKENIZER
	    if _GPT2_TOKENIZER is None:
	        _GPT2_TOKENIZER = AutoTokenizer.from_pretrained("gpt2")
	    return _GPT2_TOKENIZER


	def num_tokens(text):
	    """Return how many token ids the GPT-2 tokenizer produces for *text*.

	    The tokenizer is loaded once and reused, rather than being re-created
	    with AutoTokenizer.from_pretrained() on every call.

	    Args:
	        text: The string to encode.

	    Returns:
	        The length of the id sequence returned by tokenizer.encode(text).
	    """
	    token_ids = _get_gpt2_tokenizer().encode(text)
	    return len(token_ids)

	def num_words(text):
	    """Return the total number of NLTK word tokens in *text*.

	    The text is split into sentences with nltk.sent_tokenize(), each
	    sentence is tokenized with nltk.word_tokenize(), and the token counts
	    of all sentences are added together.

	    Args:
	        text: The string to analyse.

	    Returns:
	        The summed token count as an int.
	    """
	    per_sentence_counts = (
	        len(nltk.word_tokenize(sentence))
	        for sentence in nltk.sent_tokenize(text)
	    )
	    return sum(per_sentence_counts)

	def num_sentences(text):
	    """Return how many sentences nltk.sent_tokenize() finds in *text*.

	    Args:
	        text: The string to analyse.

	    Returns:
	        The number of sentences as an int.
	    """
	    return len(nltk.sent_tokenize(text))


	def num_chars(text):
	    """Return the length of *text* as reported by len().

	    Args:
	        text: The string to measure.

	    Returns:
	        len(text) as an int.
	    """
	    return len(text)


	# Print out the results
	# print(f"Number of sentences: {num_sentences}")
	# print(f"Number of words: {num_words}")
	# print(f"Number of tokens: {num_tokens}")
	# print(f"Number of trans_tokens: {trans_tokens}")
	# print(f"Number of characters: {num_characters}")