File size: 9,587 Bytes
b5f1359 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
import json
import csv
import re
from langdetect import detect
import pycountry
from LanguageTool import Lang
from sklearn.feature_extraction.text import TfidfVectorizer
from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
import openai
from romanize import uroman
from ScriptureReference import ScriptureReference as SR
import stanza
import difflib
import requests
# from TrainingData import greek_to_lang
class TranslationNoteFinder:
    """Locate unfoldingWord translation notes inside a target-language verse.

    For a given verse reference the finder downloads the English translation
    notes (``en_tn``) of the corresponding book, keeps the notes attached to
    that verse, and asks an OpenAI model which n-gram of the target-language
    verse corresponds to each note's source term.

    Bibles in various languages can be downloaded from
    https://github.com/BibleNLP/ebible/tree/main/corpus
    """

    # Consulted by ``segment_corpus`` with 1-based line numbers: a line whose
    # number is in this collection starts a new document.
    # NOTE(review): presumably the line number of every book's first verse --
    # confirm against ScriptureReference.
    verses = SR.verse_ones

    # 0-based index of the line sampled for language auto-detection.  The
    # original code called this line ``first_line_nt``.
    # NOTE(review): presumably the first New Testament verse in the eBible
    # corpus layout -- confirm.
    _FIRST_NT_LINE_INDEX = 23213

    # Seconds to wait for an HTTP response before giving up; without a
    # timeout ``requests`` may block forever on an unresponsive server.
    _REQUEST_TIMEOUT = 30

    def __init__(self, bible_text_path, api_key, lang_code=None):
        """Load the target Bible and determine its language.

        Args:
            bible_text_path: Local file path or ``http(s)://`` URL of the
                target Bible text.
            api_key: OpenAI API key handed to the model in
                ``best_ngram_for_note``.
            lang_code: Optional ISO 639-1 code of the target language.
                Auto-detection is occasionally incorrect, so the language can
                be passed in explicitly; when omitted it is detected from a
                sample line of the Bible text.

        Raises:
            ValueError: If no ``lang_code`` is given and the Bible text has
                no non-empty line to detect the language from.
        """
        self.verses = TranslationNoteFinder.verses
        # Load the Bible once; the text is reused for detection and lookups.
        self.target_bible_text = self.load_bible(bible_text_path)
        self.api_key = api_key

        if lang_code:
            self.language = lang_code
            self.lang_name = self._language_name(self.language)
            print(f'Language of target Bible text: {self.lang_name}')
        else:
            sample = self._detection_sample(self.target_bible_text)
            self.language = detect(sample)
            self.lang_name = self._language_name(self.language)
            print(f'Detected language of target Bible text: {self.lang_name}')

    @classmethod
    def _detection_sample(cls, bible_text):
        """Return the line of ``bible_text`` used for language detection.

        Prefers the line at ``_FIRST_NT_LINE_INDEX``; when the text is too
        short or that line is blank, falls back to the first non-empty line.

        Raises:
            ValueError: If the text contains no non-empty line.
        """
        lines = bible_text.splitlines()
        index = cls._FIRST_NT_LINE_INDEX
        if index < len(lines) and lines[index].strip():
            return lines[index]
        for line in lines:
            if line.strip():
                return line
        raise ValueError(
            'Cannot detect the language: the target Bible text is empty.'
        )

    @staticmethod
    def _language_name(lang_code):
        """Return the language name for ``lang_code``, or the code itself.

        Only the part before the first ``-`` is looked up, so region-tagged
        codes such as ``zh-cn`` still resolve.  Unknown codes fall back to the
        code instead of failing on a missing lookup result.
        """
        primary_subtag = lang_code.split('-')[0]
        try:
            language = pycountry.languages.get(alpha_2=primary_subtag)
        except LookupError:
            language = None
        return language.name if language is not None else lang_code

    def parse_tsv_to_json(self, file_content, book_abbrev):
        """Convert translation-note TSV content into a list of dictionaries.

        Column 0 is read as the verse reference, column 4 as the source term
        and column 6 as the note text.  Rows with fewer than seven columns or
        an empty source term are skipped, as is the header row.

        Args:
            file_content: Raw TSV text.
            book_abbrev: Book code prepended to each row's reference to form
                the ``verse`` value.

        Returns:
            A list of dicts with the keys ``source_term``,
            ``translation_note`` and ``verse``.
        """
        # NOTE(review): the default csv dialect treats '"' at the start of a
        # field as a quote character; if the notes are never CSV-quoted,
        # quoting=csv.QUOTE_NONE would be the safer setting -- confirm.
        tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')
        result = []
        for row in tsv_reader:
            # Columns 0, 4 and 6 are all read below, so seven are required.
            if len(row) <= 6:
                continue
            reference = row[0].strip()
            source_term = row[4].strip()
            if not source_term:
                continue
            # The header row names its columns; it is not a note.
            if reference == 'Reference' and source_term == 'Quote':
                continue
            result.append({
                "source_term": source_term,
                "translation_note": row[6].strip(),
                "verse": book_abbrev + reference
            })
        return result

    def load_translation_notes(self, book_abbrev):
        """Download and parse the English translation notes of one book.

        Args:
            book_abbrev: Book code used both in the download URL and as the
                prefix of each note's verse reference.

        Returns:
            The parsed notes (see ``parse_tsv_to_json``); an empty list when
            the server does not answer with status 200.
        """
        translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
        response = requests.get(
            translation_notes_path, timeout=self._REQUEST_TIMEOUT
        )
        translation_notes_raw = (
            response.text if response.status_code == 200 else ''
        )
        return self.parse_tsv_to_json(translation_notes_raw, book_abbrev)

    def load_bible(self, bible_path):
        """Return the Bible text found at a URL or local path.

        Args:
            bible_path: ``http://`` / ``https://`` URL, or the path of a
                UTF-8 encoded local file.

        Returns:
            The text; an empty string when a URL does not answer with
            status 200.
        """
        # Match the full scheme so a local file whose name merely starts
        # with "http" is not mistaken for a URL.
        if bible_path.startswith(('http://', 'https://')):
            response = requests.get(bible_path, timeout=self._REQUEST_TIMEOUT)
            return response.text if response.status_code == 200 else ''
        with open(bible_path, 'r', encoding='utf-8') as file:
            return file.read()

    def segment_corpus(self, bible_text):
        """Split Bible text into a list of documents (prep for tf-idf).

        i.e., documents = [Genesis content, Exodus content, ...]

        A new document starts at every line whose 1-based number is in
        ``self.verses``; the stripped lines of a document are joined with
        single spaces.

        Args:
            bible_text: The Bible text, one verse per line.

        Returns:
            The list of document strings, in order of appearance.
        """
        documents = []
        current_document = []
        for line_number, line in enumerate(bible_text.splitlines(), start=1):
            # Close the running document before starting the next one.
            if line_number in self.verses and current_document:
                documents.append(" ".join(current_document))
                current_document = []
            current_document.append(line.strip())
        # Add the last document
        if current_document:
            documents.append(" ".join(current_document))
        return documents

    def best_ngram_for_note(self, note, target_verse_text, language):
        """Ask the LLM which part of the verse translates the note's term.

        Args:
            note: Note dict; only its ``source_term`` is used.
            target_verse_text: The verse in the target language.
            language: Name of the target language, used in the prompt.

        Returns:
            The model's answer (stripped) when it occurs verbatim in
            ``target_verse_text``, otherwise an empty string.
        """
        # To use the OPENAI_API_KEY environment variable, omit api_key.
        openai_lm = models.OpenAI("gpt-4", api_key=self.api_key)
        source_term = note['source_term'].strip()
        # Can only choose between English, Hebrew, and Greek
        source_lang = Lang(source_term, options=['en', 'he', 'el']).lang_name
        print(f'Source term: {source_term}, \nSource language: {source_lang}')

        with system():
            openai_lm += f'You are an expert at translating between {source_lang} and {language}.'
            openai_lm += f'When asked to translate, provide only the {language} translation of the {source_lang} term found in the {language} verse.'
            openai_lm += 'Nothing else. Do not provide any additional information or context. Be extrememly succinct in your translations.'
            openai_lm += f'You must choose only an N-gram which already exists in the {language} verse.'
        with user():
            openai_lm += f'What is a good translation of {source_term} from {source_lang} into {language} and is also found within this verse: {target_verse_text}?'
        with assistant():
            openai_lm += gen('openai_translation', stop='.')
        print(f'OpenAI translation: {openai_lm["openai_translation"]}')

        # Only accept the answer when it really is part of the verse.
        llm_output = openai_lm["openai_translation"].strip()
        print(f'LLM output: {llm_output}')
        if llm_output in target_verse_text:
            print(f'LLM output found in verse: {llm_output}')
            return llm_output
        print(f'LLM output not found in verse: {llm_output}')
        return ''

    def verse_notes(self, verse_ref):
        """Collect the translation notes of a verse with their n-grams.

        Args:
            verse_ref: Verse reference accepted by ``ScriptureReference``.

        Returns:
            A dict with the keys ``target_verse_text``, ``verse_ref`` (the
            structured reference), ``line_number`` and ``ngrams``.  Each
            entry of ``ngrams`` holds ``ngram``, ``start_pos``, ``end_pos``,
            ``source_term`` and ``trans_note``.  When no n-gram was found the
            ``ngram`` is empty and both positions are 0.
        """
        v_ref = SR(verse_ref)
        translation_notes = self.load_translation_notes(
            v_ref.structured_ref['bookCode']
        )
        # Notes are matched by verse reference only; the source term is not
        # checked against a source-language verse.
        translation_notes_in_verse = [
            note for note in translation_notes
            if SR(note['verse']).line_number == v_ref.line_number
        ]
        print(f'Source terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Get the target language form of the verse
        target_verse_text = self.target_bible_text.splitlines()[v_ref.line_number - 1]
        ngrams = []
        for note in translation_notes_in_verse:
            ngram = self.best_ngram_for_note(
                note, target_verse_text, self.lang_name
            )
            # Prefer the exact occurrence so the offsets index into the
            # original verse; lowercasing can shift positions or hit an
            # earlier, differently-cased occurrence.
            start_pos = target_verse_text.find(ngram)
            if start_pos == -1:
                start_pos = target_verse_text.lower().find(ngram.lower())
            ngrams.append(
                {
                    'ngram': ngram,
                    'start_pos': start_pos,
                    'end_pos': start_pos + len(ngram),
                    'source_term': note['source_term'],
                    'trans_note': note['translation_note']
                })
        print('Verse notes to be returned:')
        print(json.dumps(ngrams, indent=4))
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': v_ref.line_number,
            'ngrams': ngrams
        }
|