File size: 11,460 Bytes
b5f1359
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import json
import csv
import re
from langdetect import detect
import pycountry
from sklearn.feature_extraction.text import TfidfVectorizer
from guidance import models, gen, select, instruction, system, user, assistant # use llama-cpp-python==0.2.26
import openai
from romanize import uroman
from ScriptureReference import ScriptureReference as SR
import stanza
import difflib
import requests
from TrainingData import greek_to_lang


class TranslationNoteFinder:
    """Match unfoldingWord translation notes to n-grams of a target-language verse.

    The finder loads a Greek Bible and a target-language Bible (one verse per
    line), builds a tf-idf model over the target text with one document per
    book, and, for a requested verse, asks an OpenAI model to pick the verse
    n-gram that best translates each note's Greek term.

    Progress and diagnostics are written to stdout with ``print``.
    """

    # Collection of 1-based line numbers supplied by ScriptureReference;
    # segment_corpus starts a new document on each of them.
    verses = SR.verse_ones

    greek_bible_path = 'bibles/grc-grctcgnt.txt'

    # Seconds a stalled HTTP request may wait before requests raises a timeout.
    request_timeout = 30

    # 0-based index of the line sampled for language auto-detection
    # (presumably the first New Testament verse -- TODO confirm against the corpus layout).
    nt_sample_line_index = 23213

    # Bibles in various languages can be downloaded from https://github.com/BibleNLP/ebible/tree/main/corpus
    # lang_code follows ISO 639-1 standard
    def __init__(self, bible_text_path, api_key, model_path=None, lang_code=None):
        """Load both Bibles, set up the tokenizer and fit the tf-idf model.

        Args:
            bible_text_path: Local file path or http(s) URL of the target Bible.
            api_key: OpenAI API key used by ``best_ngram_for_note``.
            model_path: Optional path to a local model (currently not in use).
            lang_code: Optional ISO 639-1 code of the target language. When
                omitted, the language is auto-detected from the target text.

        Raises:
            IndexError: If auto-detection is needed but the target text is too
                short to contain the sampled line.
        """
        self.api_key = api_key
        # Local model currently not in use; always set so the attribute exists.
        self.model_path = model_path

        # Load Bibles (each exactly once -- the target may be a remote download)
        self.verses = TranslationNoteFinder.verses
        self.greek_bible_text = self.load_bible(self.greek_bible_path)
        self.target_bible_text = self.load_bible(bible_text_path)

        # Auto-detect language of target Bible text (occasionally incorrect, so lang_code can be passed in)
        if lang_code:
            self.language = lang_code
            self.lang_name = self._language_name(self.language)
            print(f'Language of target Bible text: {self.lang_name}')
        else:
            target_lines = self.target_bible_text.splitlines()
            if len(target_lines) <= self.nt_sample_line_index:
                raise IndexError(
                    f'Target Bible text has only {len(target_lines)} lines; line '
                    f'{self.nt_sample_line_index + 1} is needed to auto-detect the '
                    'language (pass lang_code to skip detection)'
                )
            first_line_nt = target_lines[self.nt_sample_line_index]
            self.language = detect(first_line_nt)
            self.lang_name = self._language_name(self.language)
            print(f'Detected language of target Bible text: {self.lang_name}')

        # Download target language data for use in tokenizer
        stanza.download(self.language)
        self.nlp = stanza.Pipeline(lang=self.language, processors='tokenize')

        # Get tf-idf vectorizer, matrix for target Bible text
        self.tfidf_vectorizer, self.tfidf_matrix = self.create_tfidf_vectorizer_matrix()

    @staticmethod
    def _language_name(code):
        """Return pycountry's name for the alpha-2 ``code``, or ``code`` itself if unknown."""
        language = pycountry.languages.get(alpha_2=code)
        # An unknown code yields None; fall back to the raw code instead of
        # failing on ``None.name``.
        return language.name if language is not None else code

    def parse_tsv_to_json(self, file_content, book_abbrev):
        """Convert translation-notes TSV text into a list of note dictionaries.

        Args:
            file_content: Tab-separated text, one note per line.
            book_abbrev: Book code prepended to each row's reference.

        Returns:
            A list of dicts with keys ``source_term`` (column 4),
            ``translation_note`` (column 6) and ``verse`` (``book_abbrev`` +
            column 0). Rows with fewer than seven columns or an empty
            column 4 are skipped.
        """
        tsv_reader = csv.reader(file_content.splitlines(), delimiter='\t')

        result = []
        for row in tsv_reader:
            # Columns 0, 4 and 6 are read below, so shorter rows cannot be used.
            if len(row) <= 6:
                continue
            source_term = row[4].strip()
            # Keep only rows that carry a (non-empty) Greek term.
            if not source_term:
                continue
            result.append({
                "source_term": source_term,
                "translation_note": row[6].strip(),
                "verse": book_abbrev + row[0].strip()
            })

        return result

    def load_translation_notes(self, book_abbrev):
        """Download and parse the unfoldingWord translation notes for one book.

        Args:
            book_abbrev: Book code used to build the ``tn_<book>.tsv`` URL.

        Returns:
            The parsed notes (see ``parse_tsv_to_json``); an empty list when
            the download does not return HTTP 200.
        """
        translation_notes_path = f'https://git.door43.org/unfoldingWord/en_tn/raw/branch/master/tn_{book_abbrev}.tsv'
        response = requests.get(translation_notes_path, timeout=self.request_timeout)
        if response.status_code == 200:
            translation_notes_raw = response.text
        else:
            # Best effort: a missing notes file means "no notes", but say so.
            print(f'Could not download translation notes from {translation_notes_path} '
                  f'(HTTP {response.status_code})')
            translation_notes_raw = ''

        return self.parse_tsv_to_json(translation_notes_raw, book_abbrev)

    def load_bible(self, bible_path):
        """Return the Bible text found at a local path or an http(s) URL.

        Args:
            bible_path: File path, or a URL starting with ``http://`` or
                ``https://``.

        Returns:
            The full text. For a URL that does not answer with HTTP 200 an
            empty string is returned and a warning is printed.
        """
        # Match the full scheme so local paths such as "httpd/bible.txt" are
        # still opened as files.
        if bible_path.startswith(('http://', 'https://')):
            response = requests.get(bible_path, timeout=self.request_timeout)
            if response.status_code == 200:
                return response.text
            print(f'Could not download Bible text from {bible_path} '
                  f'(HTTP {response.status_code})')
            return ''

        # Load the Bible text from a local file
        with open(bible_path, 'r', encoding='utf-8') as file:
            return file.read()

    # Transforms loaded Bible text from file into a list of documents/books (prep for tf-idf)
    # i.e., documents = [Genesis content, Exodus content, ...]
    def segment_corpus(self, bible_text):
        """Split verse-per-line text into one space-joined string per book.

        A new document starts at every 1-based line number contained in
        ``self.verses``.

        Args:
            bible_text: Bible text with one verse per line.

        Returns:
            A list of document strings in file order.
        """
        # Hoisted so the membership test is O(1) for every line
        # (assumes self.verses holds hashable line numbers -- TODO confirm).
        book_start_lines = set(self.verses)

        documents = []
        current_document = []
        for line_number, line in enumerate(bible_text.splitlines(), start=1):
            if line_number in book_start_lines and current_document:
                documents.append(" ".join(current_document))
                current_document = []
            current_document.append(line.strip())
        # Add the last document
        if current_document:
            documents.append(" ".join(current_document))
        return documents

    # A method created for the tokenizer arg of the TfidfVectorizer class constructor
    # See create_tfidf_vectorizer_matrix method
    def stanza_tokenizer(self, text):
        """Tokenize ``text`` with the Stanza pipeline and return the word strings."""
        doc = self.nlp(text)
        return [word.text for sent in doc.sentences for word in sent.words]

    # Create a tf-idf vectorizer and matrix for the target Bible text
    def create_tfidf_vectorizer_matrix(self):
        """Fit a 1- to 10-gram tf-idf model on the per-book target documents.

        Returns:
            A ``(vectorizer, matrix)`` tuple; the matrix has one row per
            document produced by ``segment_corpus``.
        """
        tfidf_vectorizer = TfidfVectorizer(tokenizer=self.stanza_tokenizer, ngram_range=(1, 10))
        segmented_corpus = self.segment_corpus(self.target_bible_text)
        tfidf_matrix = tfidf_vectorizer.fit_transform(segmented_corpus)
        return tfidf_vectorizer, tfidf_matrix

    # Use the tf-idf matrix to get the tf-idf scores for the features (n-grams) of a specific book
    def get_tfidf_book_features(self, book_code):
        """Return the n-grams of one book with their non-zero tf-idf scores.

        Args:
            book_code: A key of ``ScriptureReference.book_codes``; its position
                among the keys selects the matrix row.

        Returns:
            A dict mapping n-gram to score, ordered by descending score.

        Raises:
            ValueError: If ``book_code`` is not among the known book codes.
        """
        book_index = list(SR.book_codes.keys()).index(book_code)
        feature_names = self.tfidf_vectorizer.get_feature_names_out()
        document_tfidf_scores = self.tfidf_matrix[book_index].todense().tolist()[0]

        # Drop zero scores while pairing, so no full-vocabulary dict is built
        # only to be filtered afterwards.
        scored_features = (
            (feature, score)
            for feature, score in zip(feature_names, document_tfidf_scores)
            if score > 0
        )
        # Sort by score in descending order (stable, so ties keep feature order)
        return dict(sorted(scored_features, key=lambda item: item[1], reverse=True))

    # For each translation note in verse, use difflib to select the verse ngram which best matches the AI-translated Greek term
    def best_ngram_for_note(self, note, verse_ngrams, language):
        """Pick the verse n-gram closest to the model's translation of a Greek term.

        Args:
            note: Note dict; its ``source_term`` is the Greek term to translate.
            verse_ngrams: Mapping whose keys are the candidate n-grams.
            language: Name of the target language, used in the prompt.

        Returns:
            The best-matching n-gram, or the string ``"No close match found"``
            when there are no candidates or none is similar enough.
        """
        no_match = "No close match found"
        # A plain list keeps the prompt free of the ``dict_keys(...)`` wrapper.
        candidates = list(verse_ngrams)

        print(f'All ngrams in verse guidance is selecting from: {candidates}')
        if not candidates:
            # Nothing to choose from: difflib could not match anything, so skip
            # the remote model call entirely.
            print(f'Best ngram found for note: {no_match}')
            return no_match

        source_term = note['source_term'].strip()

        # To use OPENAI_API_KEY environment variable, omit api_key argument
        openai_lm = models.OpenAI("gpt-4", api_key=self.api_key)

        with system():
            openai_lm += f'You are an expert at translating from Greek into {language}.'
            openai_lm += 'When asked to translate, provide only the translation of the term. Nothing else. Do not provide any additional information or context.'
            openai_lm += 'Be extrememly succinct in your translations.'
            openai_lm += 'You must choose only from the list of translation options you are given. Choose the single best option.'
        with user():
            openai_lm += f'What is a good translation of {source_term} from Greek into {language} and is found here: {candidates}?'
        with assistant():
            openai_lm += gen('openai_translation', stop='.')
        translation = openai_lm["openai_translation"]
        print(f'OpenAI translation: {translation}')

        # The model may answer with something not literally in the list, so
        # snap its answer to the nearest candidate.
        matches = difflib.get_close_matches(translation.strip(), candidates, n=1, cutoff=0.3)
        ngram = matches[0] if matches else no_match

        print(f'Best ngram found for note: {ngram}')
        return ngram

    def verse_notes(self, verse_ref):
        """Collect the translation notes of a verse and locate them in the target text.

        Args:
            verse_ref: Verse reference accepted by ``ScriptureReference``.

        Returns:
            A dict with ``target_verse_text``, ``verse_ref`` (structured
            reference), ``line_number`` and ``ngrams``. Each entry of
            ``ngrams`` holds ``ngram``, ``start_pos``, ``end_pos``,
            ``source_term`` and ``trans_note``; both positions are ``-1`` when
            the n-gram does not occur in the verse.
        """
        # Get the Greek form of the verse
        v_ref = SR(verse_ref)
        line_number = v_ref.line_number
        book_code = v_ref.structured_ref['bookCode']
        gk_verse_text = self.greek_bible_text.splitlines()[line_number - 1]
        gk_verse_lower = gk_verse_text.lower()

        # Get all relevant translation notes for the verse (based on Greek terms found in Greek verse)
        translation_notes_in_verse = []
        print(f'Let\'s see if there are any translation notes for this verse: \n\t {gk_verse_text}')
        for note in self.load_translation_notes(book_code):
            note_v_ref = SR(note['verse'])
            if note_v_ref.line_number != line_number:
                continue
            print('Note verse:', note_v_ref.structured_ref)
            print(f'Checking for existence of: {note["source_term"]}')
            if note['source_term'].lower() in gk_verse_lower:
                translation_notes_in_verse.append(note)
        print(f'Greek terms for all translation notes in verse: {[note["source_term"] for note in translation_notes_in_verse]}')

        # Get the target language form of the verse
        target_verse_text = self.target_bible_text.splitlines()[line_number - 1]
        # Lower-cased once; it is compared against every n-gram of the book.
        target_verse_lower = target_verse_text.lower()

        # Find n-grams from the book of the verse which exist in the verse
        book_ngrams = self.get_tfidf_book_features(book_code)
        print(f'First 30 n-grams of the book: {list(book_ngrams.keys())[:30]}')
        verse_ngrams = {
            feature: score
            for feature, score in book_ngrams.items()
            if feature.lower() in target_verse_lower
        }
        print(f'First five n-grams of the verse along with their scores: {list(verse_ngrams.items())[:5]}')

        ngrams = []
        for note in translation_notes_in_verse:
            ngram = self.best_ngram_for_note(note, verse_ngrams, self.lang_name)
            start_pos = target_verse_lower.find(ngram.lower())
            # str.find gives -1 when absent (e.g. for the "No close match found"
            # sentinel); report -1 for the end too instead of -1 + len(ngram).
            end_pos = start_pos + len(ngram) if start_pos != -1 else -1
            ngrams.append(
            {
                'ngram': ngram,
                'start_pos': start_pos,
                'end_pos': end_pos,
                'source_term': note['source_term'],
                'trans_note': note['translation_note']
            })

        print(f'Verse notes to be returned: {ngrams}')
        return {
            'target_verse_text': target_verse_text,
            'verse_ref': v_ref.structured_ref,
            'line_number': line_number,
            'ngrams': ngrams
        }