Spaces:
Runtime error
Runtime error
| # Import nltk library for natural language processing | |
| import nltk | |
| import os | |
| from transformers import AutoTokenizer | |
def load_nltk():
    """Make sure the NLTK ``punkt`` tokenizer data is available locally.

    The resource is looked up with ``nltk.data.find`` instead of testing one
    hard-coded absolute path, so a copy installed in any directory on NLTK's
    data search path is detected and not downloaded again.  When the lookup
    fails, the data is fetched with ``nltk.download('punkt')``.

    Returns:
        None.  Progress is reported on stdout.
    """
    try:
        nltk_file = nltk.data.find('tokenizers/punkt')
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError.
        print("downloading punkt file")
        nltk.download('punkt')
    else:
        print('nltk punkt file exists in ', nltk_file)
def token_count(text):
    """Train a BPE encoder on ``text`` and return the tokens it yields.

    NOTE: despite the function's name, the value handed back is the token
    sequence produced by ``Encoder.tokenize``, not its length; callers that
    want a count must take ``len()`` of the result.

    Args:
        text: String that is used both to fit the encoder (split on
            whitespace) and as the input to tokenize.

    Returns:
        Whatever ``Encoder.tokenize(text)`` returns for the fitted encoder.
    """
    # Imported lazily so the bpe package is only needed when this is called.
    from bpe import Encoder

    bpe_encoder = Encoder(vocab_size=14735746)
    # The encoder is fitted on the whitespace-separated pieces of the input.
    bpe_encoder.fit(text.split())
    return bpe_encoder.tokenize(text)
# Tokenizers already loaded by num_tokens, keyed by model name, so each
# pretrained tokenizer is loaded at most once per process.
_TOKENIZER_CACHE = {}


def num_tokens(text, model_name="gpt2"):
    """Return how many token ids the pretrained tokenizer produces for text.

    Args:
        text: String to encode.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            Defaults to ``"gpt2"``, the tokenizer this function has always
            used, so existing callers are unaffected.

    Returns:
        The length of the id list returned by ``tokenizer.encode(text)``.
    """
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        # Loading a pretrained tokenizer is expensive; do it once and reuse
        # the instance on later calls instead of reloading every time.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
    return len(tokenizer.encode(text))
def num_words(text):
    """Count the word tokens in ``text``.

    The text is first split into sentences with ``nltk.sent_tokenize``; each
    sentence is then passed to ``nltk.word_tokenize`` and the sizes of the
    resulting token lists are added together.

    Args:
        text: String to analyse.

    Returns:
        Total number of tokens emitted by ``nltk.word_tokenize`` across all
        sentences.
    """
    return sum(
        len(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(text)
    )
def num_sentences(text):
    """Count the sentences ``nltk.sent_tokenize`` finds in ``text``.

    Args:
        text: String to analyse.

    Returns:
        Length of the sentence list returned by ``nltk.sent_tokenize``.
    """
    return len(nltk.sent_tokenize(text))
def num_chars(text):
    """Return the length of ``text`` as reported by ``len``.

    Args:
        text: String (or any sized object) to measure.

    Returns:
        ``len(text)``.
    """
    return len(text)
| # Print out the results | |
| # print(f"Number of sentences: {num_sentences}") | |
| # print(f"Number of words: {num_words}") | |
| # print(f"Number of tokens: {num_tokens}") | |
| # print(f"Number of trans_tokens: {trans_tokens}") | |
| # print(f"Number of characters: {num_characters}") |