File size: 11,460 Bytes
b5f1359 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
import json
import csv
import re
from langdetect import detect
import pycountry
from sklearn.feature_extraction.text import TfidfVectorizer
from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
import openai
from romanize import uroman
from ScriptureReference import ScriptureReference as SR
import stanza
import difflib
import requests
from TrainingData import greek_to_lang
class TranslationNoteFinder:
    """Match unfoldingWord translation notes to n-grams of a target-language verse.

    The finder loads a Greek Bible and a target-language Bible, builds a
    tf-idf model over the target text (one document per book), and, for a
    given verse reference, asks an OpenAI model to translate each note's
    Greek term so the closest n-gram in the target verse can be located.

    Bibles in various languages can be downloaded from
    https://github.com/BibleNLP/ebible/tree/main/corpus
    """

    # Line numbers (1-based) on which a new document starts; see segment_corpus.
    verses = SR.verse_ones
    greek_bible_path = 'bibles/grc-grctcgnt.txt'
    # Zero-based index of the line sampled for language auto-detection
    # (presumably the first New Testament verse -- the original local was
    # named ``first_line_nt``; confirm against the corpus layout).
    nt_sample_line_index = 23213
    # Seconds to wait for an HTTP response before giving up.
    request_timeout = 30
    # Location of the English translation notes, one TSV file per book.
    translation_notes_url = (
        'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/'
        'tn_{book_abbrev}.tsv'
    )

    def __init__(self, bible_text_path, api_key, model_path=None, lang_code=None):
        """Load both Bibles, resolve the target language and fit the tf-idf model.

        Args:
            bible_text_path: Local path or http(s) URL of the target Bible.
            api_key: OpenAI API key used by ``best_ngram_for_note``.
            model_path: Path of a local model. Currently not in use; stored
                on the instance (``None`` when omitted).
            lang_code: ISO 639-1 code of the target language. Auto-detection
                is occasionally incorrect, so the code can be passed in.

        Raises:
            IndexError: If ``lang_code`` is omitted and the target text is too
                short to contain the line used for language detection.
        """
        self.verses = TranslationNoteFinder.verses
        self.greek_bible_text = self.load_bible(self.greek_bible_path)
        # Loaded exactly once: the source may be a remote URL.
        self.target_bible_text = self.load_bible(bible_text_path)
        self.api_key = api_key
        self.model_path = model_path

        if lang_code:
            self.language = lang_code
            self.lang_name = self._language_name(self.language)
            print(f'Language of target Bible text: {self.lang_name}')
        else:
            self.language = detect(self._language_sample())
            self.lang_name = self._language_name(self.language)
            print(f'Detected language of target Bible text: {self.lang_name}')

        # Download target language data for use in the tokenizer.
        stanza.download(self.language)
        self.nlp = stanza.Pipeline(lang=self.language, processors='tokenize')

        # tf-idf vectorizer and matrix for the target Bible text.
        self.tfidf_vectorizer, self.tfidf_matrix = self.create_tfidf_vectorizer_matrix()

    @staticmethod
    def _language_name(code):
        """Return pycountry's name for an ISO 639-1 code, or the code itself if unknown."""
        try:
            language = pycountry.languages.get(alpha_2=code)
        except LookupError:
            # NOTE(review): older pycountry releases appear to raise instead
            # of returning None for unknown codes -- handled either way.
            language = None
        return language.name if language is not None else code

    def _language_sample(self):
        """Return the target-text line that is fed to language detection."""
        lines = self.target_bible_text.splitlines()
        try:
            return lines[self.nt_sample_line_index]
        except IndexError:
            raise IndexError(
                f'Target Bible text has only {len(lines)} lines; line index '
                f'{self.nt_sample_line_index} is needed for language detection. '
                'Pass lang_code explicitly or check that the text loaded correctly.'
            ) from None

    def _fetch_text(self, url):
        """Return the body of ``url``, or an empty string on a non-200 response."""
        response = requests.get(url, timeout=self.request_timeout)
        if response.status_code == 200:
            return response.text
        # Best effort: callers treat an empty string as "nothing available".
        print(f'Request for {url} failed with status {response.status_code}; using empty text')
        return ''

    def parse_tsv_to_json(self, file_content, book_abbrev):
        """Convert translation-note TSV content into a list of note dictionaries.

        Only rows that have at least seven columns and a non-blank source term
        in column 4 are kept; shorter rows are skipped rather than raising.

        Args:
            file_content: The raw TSV text.
            book_abbrev: Book abbreviation prefixed to column 0 to form the
                verse reference.

        Returns:
            A list of dicts with the keys ``source_term`` (column 4),
            ``translation_note`` (column 6) and ``verse``.
        """
        tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')
        return [
            {
                "source_term": row[4].strip(),
                "translation_note": row[6].strip(),
                "verse": book_abbrev + row[0].strip(),
            }
            for row in tsv_reader
            # Column 6 is read above, so the row must have more than six cells.
            if len(row) > 6 and row[4].strip()
        ]

    def load_translation_notes(self, book_abbrev):
        """Download and parse the translation notes of one book.

        Args:
            book_abbrev: Book abbreviation used in the notes file name.

        Returns:
            The parsed notes (see ``parse_tsv_to_json``); an empty list when
            the download does not succeed.
        """
        url = self.translation_notes_url.format(book_abbrev=book_abbrev)
        return self.parse_tsv_to_json(self._fetch_text(url), book_abbrev)

    def load_bible(self, bible_path):
        """Return the Bible text found at a local path or an http(s) URL.

        Args:
            bible_path: File path, or a URL starting with ``http://`` or
                ``https://``.

        Returns:
            The text content; an empty string if a URL does not answer with
            status 200.
        """
        if bible_path.startswith(('http://', 'https://')):
            return self._fetch_text(bible_path)
        with open(bible_path, 'r', encoding='utf-8') as file:
            return file.read()

    def segment_corpus(self, bible_text):
        """Split Bible text into one document per book (prep for tf-idf).

        A new document starts on every 1-based line number contained in
        ``self.verses``, i.e. documents = [Genesis content, Exodus content, ...].

        Args:
            bible_text: The Bible text, one verse per line.

        Returns:
            A list of strings, each the space-joined stripped lines of a document.
        """
        documents = []
        current_document = []
        for line_number, line in enumerate(bible_text.splitlines(), start=1):
            # Close the running document when the next one begins.
            if line_number in self.verses and current_document:
                documents.append(" ".join(current_document))
                current_document = []
            current_document.append(line.strip())
        # Add the last document.
        if current_document:
            documents.append(" ".join(current_document))
        return documents

    def stanza_tokenizer(self, text):
        """Tokenize ``text`` with the Stanza pipeline.

        Written for the ``tokenizer`` argument of ``TfidfVectorizer``; see
        ``create_tfidf_vectorizer_matrix``.

        Returns:
            The text of every word of every sentence, in order.
        """
        doc = self.nlp(text)
        return [word.text for sent in doc.sentences for word in sent.words]

    def create_tfidf_vectorizer_matrix(self, ngram_range=(1, 10)):
        """Fit a tf-idf vectorizer on the target Bible, one document per book.

        Args:
            ngram_range: ``(min_n, max_n)`` n-gram sizes used as features.

        Returns:
            A ``(vectorizer, matrix)`` tuple.
        """
        tfidf_vectorizer = TfidfVectorizer(
            tokenizer=self.stanza_tokenizer,
            # The default pattern is ignored once a tokenizer is supplied;
            # None avoids scikit-learn's "unused parameter" warning.
            token_pattern=None,
            ngram_range=ngram_range,
        )
        segmented_corpus = self.segment_corpus(self.target_bible_text)
        tfidf_matrix = tfidf_vectorizer.fit_transform(segmented_corpus)
        return tfidf_vectorizer, tfidf_matrix

    def get_tfidf_book_features(self, book_code):
        """Return the n-grams of one book with their tf-idf scores.

        Args:
            book_code: A key of ``SR.book_codes``; its position among the keys
                selects the row of the tf-idf matrix.

        Returns:
            A dict mapping n-gram to score, holding only scores above zero and
            ordered by descending score.

        Raises:
            ValueError: If ``book_code`` is not a key of ``SR.book_codes``.
        """
        book_index = list(SR.book_codes.keys()).index(book_code)
        feature_names = self.tfidf_vectorizer.get_feature_names_out()
        scores = self.tfidf_matrix[book_index].toarray().ravel()
        # Select the non-zero entries first so that no dictionary spanning the
        # whole vocabulary has to be built.
        positive = (scores > 0).nonzero()[0]
        ranked = sorted(
            zip(feature_names[positive].tolist(), scores[positive].tolist()),
            key=lambda item: item[1],
            reverse=True,
        )
        return dict(ranked)

    def best_ngram_for_note(self, note, verse_ngrams, language):
        """Pick the verse n-gram that best matches a note's Greek term.

        The OpenAI model is asked to translate the term, choosing from the
        verse n-grams; difflib then selects the n-gram closest to the answer.

        Args:
            note: Note dict with a ``source_term`` entry.
            verse_ngrams: Mapping whose keys are the candidate n-grams.
            language: Name of the target language, used in the prompt.

        Returns:
            The chosen n-gram, or ``"No close match found"`` when there is no
            candidate or none is close enough.
        """
        no_match = "No close match found"
        candidates = list(verse_ngrams)
        print(f'All ngrams in verse guidance is selecting from: {candidates}')
        if not candidates:
            # Nothing to choose from, so the model call would be wasted.
            print(f'Best ngram found for note: {no_match}')
            return no_match

        # To use the OPENAI_API_KEY environment variable, omit the api_key argument.
        openai_lm = models.OpenAI("gpt-4", api_key=self.api_key)
        source_term = note['source_term'].strip()
        with system():
            openai_lm += (
                f'You are an expert at translating from Greek into {language}. '
                'When asked to translate, provide only the translation of the term. '
                'Nothing else. Do not provide any additional information or context. '
                'Be extremely succinct in your translations. '
                'You must choose only from the list of translation options you are given. '
                'Choose the single best option.'
            )
        with user():
            # Pass a plain list so the prompt does not contain "dict_keys(...)".
            openai_lm += f'What is a good translation of {source_term} from Greek into {language} and is found here: {candidates}?'
        with assistant():
            openai_lm += gen('openai_translation', stop='.')

        translation = openai_lm["openai_translation"]
        print(f'OpenAI translation: {translation}')
        matches = difflib.get_close_matches((translation or '').strip(), candidates, n=1, cutoff=0.3)
        ngram = matches[0] if matches else no_match
        print(f'Best ngram found for note: {ngram}')
        return ngram

    def verse_notes(self, verse_ref):
        """Collect the translation notes of a verse with their matching n-grams.

        Args:
            verse_ref: Verse reference accepted by ``ScriptureReference``.

        Returns:
            A dict with the keys ``target_verse_text``, ``verse_ref``,
            ``line_number`` and ``ngrams``. Each ``ngrams`` item holds
            ``ngram``, ``start_pos``, ``end_pos``, ``source_term`` and
            ``trans_note``; both positions are ``-1`` when the n-gram does not
            occur in the verse.
        """
        v_ref = SR(verse_ref)
        line_index = v_ref.line_number - 1
        book_code = v_ref.structured_ref['bookCode']

        # Greek form of the verse.
        gk_verse_text = self.greek_bible_text.splitlines()[line_index]
        gk_verse_lower = gk_verse_text.lower()
        print(f"Let's see if there are any translation notes for this verse: \n\t {gk_verse_text}")

        # Keep the notes of this verse whose Greek term occurs in the Greek verse.
        translation_notes_in_verse = []
        for note in self.load_translation_notes(book_code):
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number != v_ref.line_number:
                continue
            print('Note verse:', note_v_ref.structured_ref)
            print(f'Checking for existence of: {note["source_term"]}')
            if note['source_term'].lower() in gk_verse_lower:
                translation_notes_in_verse.append(note)
        print(f'Greek terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Target language form of the verse.
        target_verse_text = self.target_bible_text.splitlines()[line_index]
        target_verse_lower = target_verse_text.lower()

        # n-grams of the verse's book which also occur in the verse itself.
        book_ngrams = self.get_tfidf_book_features(book_code)
        print(f'First 30 n-grams of the book: {list(book_ngrams.keys())[:30]}')
        verse_ngrams = {
            feature: score
            for feature, score in book_ngrams.items()
            if feature.lower() in target_verse_lower
        }
        print(f'First five n-grams of the verse along with their scores: {list(verse_ngrams.items())[:5]}')

        ngrams = []
        for note in translation_notes_in_verse:
            ngram = self.best_ngram_for_note(note, verse_ngrams, self.lang_name)
            start_pos = target_verse_lower.find(ngram.lower())
            # Without a match there is no meaningful end offset either.
            end_pos = start_pos + len(ngram) if start_pos != -1 else -1
            ngrams.append(
                {
                    'ngram': ngram,
                    'start_pos': start_pos,
                    'end_pos': end_pos,
                    'source_term': note['source_term'],
                    'trans_note': note['translation_note'],
                }
            )
        print(f'Verse notes to be returned: {ngrams}')
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams,
        }
|