"""Report the most frequent 2- to 4-word n-grams in a Bible text.

The script tokenizes the text with NLTK, drops punctuation-only tokens,
counts every bigram, trigram and 4-gram in a single frequency distribution,
prints the 30 most common, and finally reports how often the bigram
"in christ" occurs and where it ranks among all counted n-grams.
"""
import string

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# Individual punctuation characters, for O(1) per-character membership tests.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _is_punctuation(token):
    """Return True when *token* consists solely of punctuation characters.

    A plain ``token in string.punctuation`` is a substring test, so it lets
    multi-character punctuation tokens through (for example the quote tokens
    and "--" that word_tokenize can emit). Checking every character avoids
    that. An empty token is treated as punctuation and therefore dropped.
    """
    return all(char in _PUNCTUATION_CHARS for char in token)


def _rank_of(distribution, target):
    """Return the 1-based frequency rank of *target*, or None if it is absent.

    Rank 1 is the most frequent key. Keys with equal counts keep their
    insertion order because the sort is stable.
    """
    if target not in distribution:
        return None
    ranked = sorted(distribution, key=distribution.get, reverse=True)
    return ranked.index(target) + 1


# Step 0: Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Step 1: Load your Bible text
with open('bibles/eng-engkjvcpb.txt', 'r', encoding='utf-8') as file:
    bible_text = file.read()

# Step 2: Preprocess the text
tokens = word_tokenize(bible_text.lower())  # Tokenize and normalize case

# NOTE(review): stop_words is built but never applied to `tokens`. Filtering
# on it would remove "in" (an NLTK English stop word) and with it the
# ('in', 'christ') bigram reported below, so it is left unapplied here --
# confirm intent.
stop_words = set(stopwords.words('english'))

# Remove punctuation-only tokens
tokens = [token for token in tokens if not _is_punctuation(token)]

# Step 3: Generate n-grams (bigrams, trigrams and 4-grams in one list)
all_ngrams = []
for n in range(2, 5):
    all_ngrams.extend(ngrams(tokens, n))

# Step 4: Analyze frequency of n-grams
freq_dist = FreqDist(all_ngrams)
most_common_ngrams = freq_dist.most_common(30)  # Adjust the number to get more or fewer common n-grams

# Display the most common n-grams
for ngram, occurrence in most_common_ngrams:
    print("{}: {}".format(' '.join(ngram), occurrence))

# Find the rank of the specific n-gram in freq_dist. The lookup is guarded:
# calling list.index() for an n-gram that never occurs raises ValueError.
ngram_rank = _rank_of(freq_dist, ('in', 'christ'))

# If 'in christ' was found, print the number of occurrences and how it ranks
# among all counted n-grams; otherwise say so instead of crashing.
if ngram_rank is not None:
    print("The n-gram 'in christ' occurs {} times and is ranked {} among the most common n-grams.".format(freq_dist[('in', 'christ')], ngram_rank))
else:
    print("The n-gram 'in christ' does not occur in the text.")