import nltk
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Step 0: Ensure necessary NLTK resources are downloaded.
# Newer NLTK releases make word_tokenize() load the 'punkt_tab' tables instead
# of the older pickled 'punkt' models, so both are fetched to keep the script
# working regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Step 1: Read the whole Bible text into memory as a single UTF-8 string
with open('bibles/eng-engkjvcpb.txt', 'r', encoding='utf-8') as bible_file:
    bible_text = bible_file.read()

# Step 2: Preprocess the text
# Lower-case first so that e.g. "In Christ" and "in Christ" count as the same n-gram.
tokens = word_tokenize(bible_text.lower())

# The English stop-word list is loaded but deliberately NOT applied to the
# tokens: filtering stop words would delete words such as "in" and make
# phrases like "in christ" impossible to count.
stop_words = set(stopwords.words('english'))

# Drop tokens made up solely of punctuation characters. Testing each character
# against a set (rather than `token in string.punctuation`, which is a
# substring test on one long string) also removes multi-character punctuation
# tokens such as "``", "''", "--" and "...".
punctuation_chars = set(string.punctuation)
tokens = [
    token
    for token in tokens
    if not all(char in punctuation_chars for char in token)
]

# Step 3: Collect every 2-, 3- and 4-gram of the token stream into one list
all_ngrams = [
    gram
    for size in range(2, 5)
    for gram in ngrams(tokens, size)
]

# Step 4: Count how often each n-gram occurs
freq_dist = FreqDist(all_ngrams)

# Keep the 30 most frequent entries; change the argument for a longer or shorter list
most_common_ngrams = freq_dist.most_common(30)

# Print one "<words>: <count>" line per n-gram
for words, count in most_common_ngrams:
    phrase = ' '.join(words)
    print("{}: {}".format(phrase, count))

# Report how often the n-gram 'in christ' occurs and where it ranks.
# NOTE: keep the hard-coded phrase in the messages below in sync with this tuple.
target_ngram = ('in', 'christ')

if target_ngram in freq_dist:
    # Rank is the 1-based position among ALL counted n-grams (sizes 2-4 mixed),
    # ordered by descending frequency. It is computed only after the membership
    # check because list.index() raises ValueError for a missing n-gram.
    ngram_rank = sorted(freq_dist, key=freq_dist.get, reverse=True).index(target_ngram) + 1
    print("The n-gram 'in christ' occurs {} times and is ranked {} among the most common n-grams.".format(freq_dist[target_ngram], ngram_rank))
else:
    print("The n-gram 'in christ' does not occur in the text.")