|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from itertools import islice |
|
|
from romanize import uroman |
|
|
|
|
|
|
|
|
# 1-based line numbers at which each verse/document begins in the corpus
# file. Kept as a read-only tuple; the module-level ``verses`` list below
# is the 0-based form the rest of this module consumes.
_VERSE_START_LINES_1_BASED = (
    1, 1534, 2747, 3606, 4895, 5854, 6512, 7130, 7215, 8026,
    8721, 9538, 10257, 11200, 12022, 12302, 12707, 12874, 13944, 16471,
    17608, 17725, 19016, 20380, 20534, 21807, 22164, 22361, 22434, 22580,
    22601, 22649, 22754, 22857, 22910, 22948, 23159, 23214, 24285, 24963,
    26114, 26993, 27999, 28432, 28869, 29125, 29274, 29429, 29533, 29628,
    29717, 29764, 29877, 29960, 30006, 30031, 30334, 30442, 30547, 30608,
    30713, 30726, 30741, 30766, 31171,
)

# Shift to 0-based indices (first entry becomes 0).
verses = [n - 1 for n in _VERSE_START_LINES_1_BASED]
|
|
|
|
|
|
|
|
def extract_interested_verse(file_path, line_number, romanize=False):
    """Return the stripped text of one line of a text file.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 text file.
    line_number : int
        0-based index of the line to extract.
    romanize : bool
        When True, pass the stripped line through ``uroman`` before
        returning it.

    Returns
    -------
    str or None
        The stripped (optionally romanized) line, or ``None`` when the
        file has fewer than ``line_number + 1`` lines.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # islice lazily skips the first line_number lines; next() with a
        # default avoids StopIteration on short files.
        line = next(islice(file, line_number, line_number + 1), None)
    if line is None:
        return None
    text = line.strip()
    return uroman(text) if romanize else text
|
|
|
|
|
|
|
|
|
|
|
def segment_corpus(file_path, romanize=False, boundaries=None):
    """Split a text file into documents at known verse-start lines.

    A new document begins at every line whose 1-based position (as seen by
    ``enumerate(file, start=1)``) appears in *boundaries*.  Each document is
    the space-joined, stripped lines between two consecutive boundaries.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 text file.
    romanize : bool
        When True, each joined document is passed through ``uroman``.
    boundaries : iterable of int, optional
        Line numbers at which documents start.  Defaults to the
        module-level ``verses`` list.

    Returns
    -------
    list of str
        One string per document, in file order.
    """
    # NOTE(review): the module-level ``verses`` was shifted to 0-based,
    # yet the enumeration below is 1-based — this looks like an off-by-one
    # (documents would start one line early); confirm intended behavior.
    #
    # Membership is tested once per corpus line, so build a set for O(1)
    # lookups instead of scanning a list each time.
    starts = set(verses if boundaries is None else boundaries)

    documents = []
    current_document = []

    def _flush():
        # Join and emit the accumulated lines, if any, then reset.
        if current_document:
            joined = " ".join(current_document)
            documents.append(uroman(joined) if romanize else joined)
            current_document.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file, start=1):
            if i in starts:
                _flush()
            current_document.append(line.strip())

    _flush()  # emit the trailing document after the last boundary
    return documents
|
|
|
|
|
|
|
|
def analyze_verse_in_corpus(file_path, interested_line, romanize=False):
    """Score the n-grams of one verse against its containing document.

    The corpus is segmented into documents (see ``segment_corpus``), a
    TF-IDF model over 2- to 4-grams is fit on all documents, and the
    n-grams of the verse at *interested_line* are looked up in the TF-IDF
    row of the document that contains it.

    Parameters
    ----------
    file_path : str
        Path to the UTF-8 corpus file.
    interested_line : int
        Line number of the verse of interest.  Treated as 1-based here
        (note the ``- 1`` when extracting the verse text).
    romanize : bool
        Forwarded to ``segment_corpus`` / ``extract_interested_verse``.

    Returns
    -------
    dict or str
        Mapping of the verse's n-grams to their TF-IDF scores in the
        containing document, sorted by descending score; or the string
        ``"Verse not found."`` when the line does not exist.
    """
    documents = segment_corpus(file_path, romanize=romanize)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 4))
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # The containing document is the one whose boundary is the last not
    # exceeding interested_line.  Supplying len(verses) as the default
    # prevents a StopIteration crash when interested_line lies past the
    # final boundary: it then maps to the last document.
    # NOTE(review): ``verses`` holds 0-based values while interested_line
    # appears 1-based — confirm the `>` comparison boundary is intended.
    document_index = next(
        (i for i, v in enumerate(verses) if v > interested_line), len(verses)
    ) - 1

    scores = np.array(tfidf_matrix[document_index].todense()).flatten()
    scores_dict = dict(zip(feature_names, scores))

    interested_verse = extract_interested_verse(
        file_path, interested_line - 1, romanize=romanize
    )
    if not interested_verse:
        # Line missing (or empty) — preserve the original sentinel string.
        return "Verse not found."

    # Fit on the single verse solely to enumerate its 2- to 4-grams; the
    # scores themselves come from the corpus-level model above.
    tfidf_vectorizer_verse = TfidfVectorizer(ngram_range=(2, 4))
    tfidf_vectorizer_verse.fit([interested_verse])
    verse_ngrams = tfidf_vectorizer_verse.get_feature_names_out()

    # N-grams absent from the corpus vocabulary score 0.
    verse_scores = {ngram: scores_dict.get(ngram, 0) for ngram in verse_ngrams}
    return dict(sorted(verse_scores.items(), key=lambda item: item[1], reverse=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|