Spaces:

CalebKoster
/

Translation_Note_Alignment

Sleeping

Koster

Upload folder using huggingface_hub

b5f1359 verified almost 2 years ago

1.54 kB

	import nltk
	from nltk import FreqDist
	from nltk.util import ngrams
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	import string

	# Script: count 2-, 3- and 4-gram frequencies in a Bible text, print the 30
	# most common n-grams, then report the count and rank of 'in christ'.

	# Step 0: Ensure necessary NLTK resources are downloaded
	# NOTE(review): recent NLTK releases load the tokenizer models from
	# 'punkt_tab' rather than 'punkt' -- confirm which one the installed
	# version needs.
	nltk.download('punkt')
	nltk.download('stopwords')

	# Step 1: Load your Bible text
	with open('bibles/eng-engkjvcpb.txt', 'r', encoding='utf-8') as file:
	    bible_text = file.read()

	# Step 2: Preprocess the text
	tokens = word_tokenize(bible_text.lower())  # Tokenize and normalize case
	# The stop-word set is loaded but NOT applied to the tokens: filtering stop
	# words would drop words such as 'in', so the ('in', 'christ') bigram looked
	# up below could never be formed.
	stop_words = set(stopwords.words('english'))
	# Remove punctuation. A token is dropped when every character in it is a
	# punctuation character, so multi-character tokens such as '--', '...' or
	# the tokenizer's quote marks are removed too (a plain substring test
	# against string.punctuation lets those through).
	punctuation_chars = set(string.punctuation)
	tokens = [token for token in tokens if not punctuation_chars.issuperset(token)]

	# Step 3: Generate n-grams (bigrams, trigrams and 4-grams)
	all_ngrams = []
	for n in range(2, 5):
	    all_ngrams.extend(ngrams(tokens, n))

	# Step 4: Analyze frequency of n-grams
	freq_dist = FreqDist(all_ngrams)
	most_common_ngrams = freq_dist.most_common(30)  # Adjust the number to get more or fewer common n-grams

	# Display the most common n-grams
	for ngram, occurrence in most_common_ngrams:
	    print("{}: {}".format(' '.join(ngram), occurrence))

	# Report how often 'in christ' occurs and where it ranks among all n-grams.
	# The rank is computed only after confirming the n-gram is present:
	# list.index() raises ValueError for a missing item, which would otherwise
	# crash the script before the membership check is reached.
	target_ngram = ('in', 'christ')
	if target_ngram in freq_dist:
	    # Rank = 1-based position when all n-grams are sorted by descending count.
	    ranked_ngrams = sorted(freq_dist, key=freq_dist.get, reverse=True)
	    ngram_rank = ranked_ngrams.index(target_ngram) + 1
	    print("The n-gram 'in christ' occurs {} times and is ranked {} among the most common n-grams.".format(freq_dist[target_ngram], ngram_rank))