|
|
import json |
|
|
import csv |
|
|
import re |
|
|
from langdetect import detect |
|
|
import pycountry |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from guidance import models, gen, select, instruction, system, user, assistant |
|
|
import openai |
|
|
from romanize import uroman |
|
|
from ScriptureReference import ScriptureReference as SR |
|
|
import stanza |
|
|
import difflib |
|
|
import requests |
|
|
from TrainingData import greek_to_lang |
|
|
|
|
|
|
|
|
class TranslationNoteFinder:
    """Attach unfoldingWord translation notes to spans of a target-language Bible.

    The finder loads a Greek Bible text and a target Bible text (one verse per
    line), builds a TF-IDF model of word n-grams over the target text, and, for
    a requested verse, asks an OpenAI model which n-gram of the target verse
    best translates the Greek term each translation note is attached to.

    Class attributes:
        verses: Line numbers at which ``segment_corpus`` starts a new document
            (taken from ``ScriptureReference.verse_ones``).
        greek_bible_path: Location of the Greek source text.
        nt_sample_line_index: Zero-based line of the target text that is fed to
            language detection when no ``lang_code`` is given.
        request_timeout: Timeout in seconds applied to every HTTP download.
    """

    verses = SR.verse_ones

    greek_bible_path = 'bibles/grc-grctcgnt.txt'

    # NOTE(review): the original code called this line `first_line_nt`, so it is
    # presumably the first New Testament verse -- confirm against the corpus.
    nt_sample_line_index = 23213

    # Without a timeout `requests.get` can block indefinitely on a stalled server.
    request_timeout = 30

    def __init__(self, bible_text_path, api_key, model_path=None, lang_code=None):
        """Load both Bible texts, set up tokenization and fit the TF-IDF model.

        Args:
            bible_text_path: File path or ``http(s)`` URL of the target Bible.
            api_key: OpenAI API key used by ``best_ngram_for_note``.
            model_path: Optional path, stored on the instance as ``model_path``.
            lang_code: Optional two-letter language code of the target text.
                When omitted, the language is detected from the text itself.

        Raises:
            IndexError: If the language has to be detected but the target text
                has no line at ``nt_sample_line_index``.
        """
        self.verses = TranslationNoteFinder.verses
        self.greek_bible_text = self.load_bible(self.greek_bible_path)
        # Loaded exactly once; the text may come from a (slow) network download.
        self.target_bible_text = self.load_bible(bible_text_path)

        if lang_code:
            self.language = lang_code
            self.lang_name = self._language_name(self.language)
            print(f'Language of target Bible text: {self.lang_name}')
        else:
            # The sample line is only needed for detection, so a short text no
            # longer fails when the caller already supplied the language.
            target_lines = self.target_bible_text.splitlines()
            if len(target_lines) <= self.nt_sample_line_index:
                raise IndexError(
                    f'Target Bible text has {len(target_lines)} lines; line index '
                    f'{self.nt_sample_line_index} is required to detect its language.'
                )
            self.language = detect(target_lines[self.nt_sample_line_index])
            self.lang_name = self._language_name(self.language)
            print(f'Detected language of target Bible text: {self.lang_name}')

        # Always defined so later attribute access cannot raise AttributeError.
        self.model_path = model_path

        stanza.download(self.language)
        self.nlp = stanza.Pipeline(lang=self.language, processors='tokenize')

        self.api_key = api_key

        self.tfidf_vectorizer, self.tfidf_matrix = self.create_tfidf_vectorizer_matrix()

    @staticmethod
    def _language_name(lang_code):
        """Return the pycountry name for a two-letter code, or the code itself if unknown."""
        try:
            language = pycountry.languages.get(alpha_2=lang_code)
        except LookupError:
            # NOTE(review): some pycountry releases raise instead of returning None.
            language = None
        return language.name if language is not None else lang_code

    def parse_tsv_to_json(self, file_content, book_abbrev):
        """Convert tab-separated translation notes into a list of dicts.

        Args:
            file_content: Raw TSV text.
            book_abbrev: Book code that is prefixed to column 0 of every row to
                form the ``verse`` value.

        Returns:
            A list of ``{"source_term", "translation_note", "verse"}`` dicts,
            built from columns 4, 6 and 0 respectively. Rows that have fewer
            than seven columns or a blank column 4 are skipped.
        """
        result = []
        tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')
        for row in tsv_reader:
            # Column 6 is read below, so the row needs at least seven fields;
            # the previous `len(row) > 3` guard let short rows raise IndexError.
            if len(row) > 6 and row[4].strip():
                result.append({
                    "source_term": row[4].strip(),
                    "translation_note": row[6].strip(),
                    "verse": book_abbrev + row[0].strip(),
                })
        return result

    def load_translation_notes(self, book_abbrev):
        """Download and parse the unfoldingWord ``en_tn`` notes for one book.

        Args:
            book_abbrev: Book code used both in the file name and as the prefix
                of each note's ``verse`` value.

        Returns:
            The parsed notes (see ``parse_tsv_to_json``); an empty list when
            the server does not answer with HTTP 200.
        """
        translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
        response = requests.get(translation_notes_path, timeout=self.request_timeout)
        # Best effort: a missing file simply yields no notes.
        translation_notes_raw = response.text if response.status_code == 200 else ''
        return self.parse_tsv_to_json(translation_notes_raw, book_abbrev)

    def load_bible(self, bible_path):
        """Return the text of a Bible from a local file or an ``http(s)`` URL.

        Args:
            bible_path: Path of a UTF-8 text file, or a URL starting with
                ``http``.

        Returns:
            The full text; an empty string when a URL does not answer with
            HTTP 200.
        """
        if bible_path.startswith('http'):
            response = requests.get(bible_path, timeout=self.request_timeout)
            return response.text if response.status_code == 200 else ''

        with open(bible_path, 'r', encoding='utf-8') as file:
            return file.read()

    def segment_corpus(self, bible_text):
        """Split a one-verse-per-line text into documents.

        A new document starts at every 1-based line number contained in
        ``self.verses``; the stripped lines of a document are joined with single
        spaces.

        Args:
            bible_text: The text to segment.

        Returns:
            A list of document strings, in corpus order.
        """
        # Membership is tested once per line, so build the lookup set up front.
        document_starts = set(self.verses)
        documents = []
        current_document = []
        for line_number, line in enumerate(bible_text.splitlines(), start=1):
            if line_number in document_starts and current_document:
                documents.append(" ".join(current_document))
                current_document = []
            current_document.append(line.strip())

        if current_document:
            documents.append(" ".join(current_document))
        return documents

    def stanza_tokenizer(self, text):
        """Tokenize ``text`` with the stanza pipeline and return the word strings."""
        doc = self.nlp(text)
        return [word.text for sent in doc.sentences for word in sent.words]

    def create_tfidf_vectorizer_matrix(self):
        """Fit a TF-IDF model of 1- to 10-grams on the segmented target Bible.

        Returns:
            A ``(vectorizer, matrix)`` tuple; the matrix has one row per
            document produced by ``segment_corpus``.
        """
        tfidf_vectorizer = TfidfVectorizer(
            tokenizer=self.stanza_tokenizer,
            # The default pattern is unused once a tokenizer is supplied;
            # None silences scikit-learn's warning about it.
            token_pattern=None,
            ngram_range=(1, 10),
        )
        segmented_corpus = self.segment_corpus(self.target_bible_text)
        tfidf_matrix = tfidf_vectorizer.fit_transform(segmented_corpus)
        return tfidf_vectorizer, tfidf_matrix

    def get_tfidf_book_features(self, book_code):
        """Return the n-grams of one book with their positive TF-IDF scores.

        The matrix row is chosen by the position of ``book_code`` among the
        keys of ``ScriptureReference.book_codes``.

        Args:
            book_code: A key of ``ScriptureReference.book_codes``.

        Returns:
            A dict mapping n-gram to score, ordered by descending score (ties
            keep vocabulary order).

        Raises:
            ValueError: If ``book_code`` is not a known book code.
        """
        book_index = list(SR.book_codes.keys()).index(book_code)
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        # Only the stored (non-zero) entries are read; densifying a row of a
        # 1-10-gram vocabulary would allocate one float per vocabulary entry.
        row = self.tfidf_matrix[book_index].tocoo()
        scored_columns = [
            (int(column), float(score))
            for column, score in zip(row.col, row.data)
            if score > 0
        ]
        scored_columns.sort(key=lambda item: (-item[1], item[0]))
        return {feature_names[column]: score for column, score in scored_columns}

    def best_ngram_for_note(self, note, verse_ngrams, language):
        """Pick the verse n-gram that best translates a note's Greek term.

        An OpenAI model proposes a translation of the term; the candidate that
        is textually closest to that proposal is returned.

        Args:
            note: Note dict with a ``source_term`` entry.
            verse_ngrams: Mapping whose keys are the candidate n-grams.
            language: Name of the target language, used in the prompt.

        Returns:
            The chosen n-gram, or the string ``"No close match found"`` when no
            candidate is close enough (or there are no candidates at all).
        """
        candidates = list(verse_ngrams)
        print(f'All ngrams in verse guidance is selecting from: {candidates}')

        if candidates:
            openai_lm = models.OpenAI("gpt-4", api_key=self.api_key)
            source_term = note['source_term'].strip()

            with system():
                openai_lm += f'You are an expert at translating from Greek into {language}.'
                openai_lm += 'When asked to translate, provide only the translation of the term. Nothing else. Do not provide any additional information or context.'
                openai_lm += 'Be extrememly succinct in your translations.'
                openai_lm += 'You must choose only from the list of translation options you are given. Choose the single best option.'

            with user():
                # A plain list is interpolated; the dict view would render as
                # "dict_keys([...])" inside the prompt.
                openai_lm += f'What is a good translation of {source_term} from Greek into {language} and is found here: {candidates}?'

            with assistant():
                openai_lm += gen('openai_translation', stop='.')

            translation = openai_lm["openai_translation"]
            print(f'OpenAI translation: {translation}')

            matches = difflib.get_close_matches(translation.strip(), candidates, n=1, cutoff=0.3)
        else:
            # Nothing to choose from: skip the API call entirely.
            matches = []

        ngram = matches[0] if matches else "No close match found"
        print(f'Best ngram found for note: {ngram}')
        return ngram

    def verse_notes(self, verse_ref):
        """Collect the translation notes of a verse and locate them in the target text.

        Args:
            verse_ref: Verse reference accepted by ``ScriptureReference``.

        Returns:
            A dict with ``target_verse_text``, ``verse_ref`` (the structured
            reference), ``line_number`` and ``ngrams``. Each element of
            ``ngrams`` holds ``ngram``, ``start_pos``, ``end_pos``,
            ``source_term`` and ``trans_note``; ``start_pos`` and ``end_pos``
            are both ``-1`` when the n-gram does not occur in the verse.
        """
        v_ref = SR(verse_ref)
        line_index = v_ref.line_number - 1
        book_code = v_ref.structured_ref['bookCode']

        gk_verse_text = self.greek_bible_text.splitlines()[line_index]
        gk_verse_lower = gk_verse_text.lower()
        print(f"Let's see if there are any translation notes for this verse: \n\t {gk_verse_text}")

        translation_notes_in_verse = []
        for note in self.load_translation_notes(book_code):
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number != v_ref.line_number:
                continue
            print('Note verse:', note_v_ref.structured_ref)
            print(f'Checking for existence of: {note["source_term"]}')
            # Keep only notes whose Greek term literally occurs in the verse.
            if note['source_term'].lower() in gk_verse_lower:
                translation_notes_in_verse.append(note)
        print(f'Greek terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        target_verse_text = self.target_bible_text.splitlines()[line_index]
        target_verse_lower = target_verse_text.lower()

        book_ngrams = self.get_tfidf_book_features(book_code)
        print(f'First 30 n-grams of the book: {list(book_ngrams.keys())[:30]}')
        verse_ngrams = {
            feature: score
            for feature, score in book_ngrams.items()
            if feature.lower() in target_verse_lower
        }
        print(f'First five n-grams of the verse along with their scores: {list(verse_ngrams.items())[:5]}')

        ngrams = []
        for note in translation_notes_in_verse:
            ngram = self.best_ngram_for_note(note, verse_ngrams, self.lang_name)
            # NOTE: offsets refer to the lower-cased verse; str.lower() can
            # change the length of a few characters, shifting them slightly.
            start_pos = target_verse_lower.find(ngram.lower())
            # A miss used to yield `-1 + len(ngram)`, a meaningless offset.
            end_pos = start_pos + len(ngram) if start_pos != -1 else -1
            ngrams.append({
                'ngram': ngram,
                'start_pos': start_pos,
                'end_pos': end_pos,
                'source_term': note['source_term'],
                'trans_note': note['translation_note'],
            })

        print(f'Verse notes to be returned: {ngrams}')
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams,
        }
|
|
|
|
|
|
|
|
|