Koster
Upload folder using huggingface_hub
b5f1359 verified
import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
# Step 0: Ensure necessary NLTK resources are downloaded.
# word_tokenize needs the Punkt sentence-tokenizer data: older NLTK releases
# read it from 'punkt', recent releases read it from 'punkt_tab', so both are
# fetched. 'stopwords' backs stopwords.words('english').
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# Step 1: Load your Bible text
# The whole corpus is read into memory as a single UTF-8 decoded string.
with open('bibles/eng-engkjvcpb.txt', encoding='utf-8') as source:
    bible_text = source.read()
# Step 2: Preprocess the text
tokens = word_tokenize(bible_text.lower())  # Tokenize and normalize case

# The English stop-word set is loaded but deliberately NOT filtered out of
# `tokens`: the n-gram looked up at the end of this script, ('in', 'christ'),
# contains the stop word 'in' and could never match if stop words were removed.
stop_words = set(stopwords.words('english'))

# Remove tokens that consist solely of punctuation characters.
# Each character is checked individually: `token in string.punctuation` would
# be a substring test against the punctuation string, which lets
# multi-character tokens such as '--', '...' or the `` and '' quote tokens
# emitted by word_tokenize slip through.
punctuation_chars = frozenset(string.punctuation)
tokens = [
    token for token in tokens
    if not all(char in punctuation_chars for char in token)
]
# Step 3: Generate n-grams
# One flat list holding every bigram, then every trigram, then every 4-gram
# of the token stream (n runs over 2, 3, 4 in that order).
all_ngrams = [gram for size in range(2, 5) for gram in ngrams(tokens, size)]
# Step 4: Analyze frequency of n-grams
# Count how many times each n-gram tuple occurs.
freq_dist = FreqDist(all_ngrams)

# Adjust the number to get more or fewer common n-grams
most_common_ngrams = freq_dist.most_common(30)

# Display the most common n-grams, one "<words>: <count>" line each
for words, count in most_common_ngrams:
    phrase = ' '.join(words)
    print("{}: {}".format(phrase, count))
# Report how often the n-gram 'in christ' occurs and where it ranks among all
# counted n-grams (rank 1 = most frequent), if it occurs at all.
target_ngram = ('in', 'christ')

# Membership is checked BEFORE computing the rank: list.index() raises
# ValueError for a missing element, so looking up the rank unconditionally
# would crash on any text that does not contain this n-gram.
if target_ngram in freq_dist:
    # Rank = 1-based position in the n-grams ordered by descending count.
    ranked_ngrams = sorted(freq_dist, key=freq_dist.get, reverse=True)
    ngram_rank = ranked_ngrams.index(target_ngram) + 1
    print("The n-gram 'in christ' occurs {} times and is ranked {} among the most common n-grams.".format(freq_dist[target_ngram], ngram_rank))