Roland Szabo committed
Commit e7e42e1 · 0 Parent(s)

Initial commit
Files changed (5)
  1. README.md +13 -0
  2. bert_bible.py +164 -0
  3. esv.txt +0 -0
  4. esv_tags.txt +0 -0
  5. strongs_defs.json +0 -0
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Bible_bert
+ emoji: 🦀
+ colorFrom: indigo
+ colorTo: indigo
+ sdk: streamlit
+ sdk_version: 1.9.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
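Note that app_file points at app.py while this commit only adds bert_bible.py, so Spaces will look for app.py as the entrypoint. For local testing, a plausible setup would be the following (the package list is an assumption, since no requirements.txt is part of this commit):

    pip install streamlit==1.9.0 torch transformers umap-learn plotly pandas
    streamlit run bert_bible.py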
bert_bible.py ADDED
@@ -0,0 +1,164 @@
+ import json
+ import re
+ import textwrap
+ from collections import defaultdict
+ from typing import Dict
+
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ import umap
+ import plotly.express as px
+
+ st.title('Bible analysis with BERT')
+
+ @st.cache
+ def load_verses() -> Dict[str, str]:
+     # esv.txt is expected to hold one verse per line: "<citation>\t<verse text>"
+     verses = {}
+     count = 0
+     with open('esv.txt', 'r', encoding='utf8') as f:
+         for line in f:
+             try:
+                 citation, raw_sentence = line.strip().split('\t')
+                 verses[citation] = raw_sentence
+             except ValueError:
+                 count += 1
+     print(count)
+     return verses
+
+
+ @st.cache
+ def load_tags():
+     # Index each Strong's number to the (verse, word position) pairs where it
+     # occurs. Each tag joins 1-based word positions with "+", then "=", then
+     # the marked-up Strong's numbers.
+     index = defaultdict(list)
+     with open("esv_tags.txt", encoding='utf8') as f:
+         for line in f:
+             verse, tags = line.split("\t", maxsplit=1)
+             tokens = tags.strip().split("\t")
+             for t in tokens:
+                 if "=" in t:
+                     words, numbers = t.split("=")
+                     words = [(verse, int(x)) for x in words.split("+")]
+                     # drop the marker characters around each Strong's number
+                     numbers = [x[1:-1] for x in numbers.split("+")]
+                     for n in numbers:
+                         index[n].extend(words)
+     return index
+
+
+ @st.cache
+ def get_strong_defs():
+     # strongs_defs.json maps Strong's numbers to short definition strings
+     with open("strongs_defs.json", encoding='utf8') as f:
+         return json.load(f)
+
+
+ def get_word_idx(sent: str, word: str):
+     # split on whitespace and punctuation, then locate the word in the sentence
+     tokens = re.split('([ .,!?:;""()\'-])', sent)
+     tokens = [x for x in tokens if x != " " and x != ""]
+     return tokens.index(word)
+
+
+ def get_embedding(tokenizer, model, sent, word, layers=None) -> torch.Tensor:
+     """Get a word vector by tokenizing the input sentence, finding all token
+     indices that make up the word of interest, and averaging their hidden
+     states summed over the requested layers."""
+     layers = [-4, -3, -2, -1] if layers is None else layers
+
+     encoded = tokenizer.encode_plus(sent, return_tensors="pt")
+
+     idx = get_word_idx(sent, word)
+     # get all token idxs that belong to the word of interest
+     token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
+
+     with torch.no_grad():
+         output = model(**encoded)
+
+     # Get all hidden states
+     states = output.hidden_states
+     # Stack and sum all requested layers
+     output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
+     # Only select the tokens that constitute the requested word
+     word_tokens_output = output[token_ids_word]
+
+     return word_tokens_output.mean(dim=0)
+
+
+ verses = load_verses()
+ strongs_tags = load_tags()
+ strongs_defs = get_strong_defs()
+ print(len(strongs_tags))
+ st.text('Loaded {} verses'.format(len(verses)))
+ st.text('Loaded {} tags'.format(len(strongs_tags)))
+
+ books = []
+ for k in verses:
+     # the book name is everything before the chapter, e.g. "1 John" in "1 John 4:8"
+     book = k[:k.index(" ", 2)]
+     if book not in books:
+         books.append(book)
+ print(books)
+
+ all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}
+ option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(),
+                          ['0025', '0026'], format_func=lambda x: strongs_defs[x])
+ option2 = st.multiselect('Select Strongs numbers for second concept', all_defs.keys(),
+                          ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"],
+                          format_func=lambda x: strongs_defs[x])
+
+
+ @st.cache(allow_output_mutation=True)
+ def get_models():
+     tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+     model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()
+     return tokenizer, model
+
+
+ @st.cache
+ def get_all_embeddings(greek_words):
+     embeddings = []
+     tokenizer, model = get_models()
+
+     for word in greek_words:
+         for number in greek_words[word]:
+             if number in strongs_tags:
+                 for verse, idx in strongs_tags[number]:
+                     if verse in verses:
+                         text = verses[verse]
+                         print(text, idx)
+                         words = [x for x in re.split('([ \'])', text) if x != " " and x != "" and x != "'"]
+                         if len(words) <= idx - 1:
+                             continue
+                         ew = words[idx - 1].strip(",.!?;:()\"'-")
+                         print(ew)
+                         emb = get_embedding(tokenizer, model, text, ew).numpy()
+                         # book of the current verse, e.g. "John" in "John 3:16"
+                         book = verse[:verse.index(" ", 2)]
+                         embeddings.append((emb, f"{verse} {text}", ew, book))
+     return embeddings
+
+
+ def get_book_type(idx):
+     if idx < 4:
+         return 'Gospels'
+     if idx == 4:
+         return 'Acts'
+     if idx < 19:
+         return 'Pauline letters'
+     if idx < 26:
+         return 'Short letters'
+     return 'Revelation'
+
+
+ # default Strong's numbers for the two main Greek words for love
+ strongs_numbers = {
+     "agape": ["0025", "0026"],
+     "phileo": ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"],
+ }
+ word_list = ["lovers", "loved", "loves", "love", "Love"]
+
+ embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
+
+ # project the BERT embeddings down to 2D with UMAP for plotting
+ mapper = umap.UMAP().fit([x[0] for x in embeddings])
+ ts = mapper.embedding_
+
+ x = ts[:, 0]
+ y = ts[:, 1]
+ df = pd.DataFrame({"x": x, "y": y,
+                    "verse": ["<br>".join(textwrap.wrap(x[1], 80)) for x in embeddings],
+                    "greek word": [x[2] for x in embeddings]})
+ fig = px.scatter(df, x="x", y="y", hover_data=['verse'], color="greek word")
+ # fig.write_html("book_love.html")
+ st.plotly_chart(fig)
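As a quick sanity check, get_embedding can also be exercised outside Streamlit. A minimal sketch, assuming get_word_idx and get_embedding are copied out of the script above (importing bert_bible directly would run the whole app), with made-up sentences:

    import torch
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()

    # contextual vectors for the same surface word in two different sentences
    a = get_embedding(tokenizer, model, "God is love.", "love")
    b = get_embedding(tokenizer, model, "They love their brother.", "love")

    # cosine similarity between the two word vectors
    print(torch.cosine_similarity(a, b, dim=0).item())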
esv.txt ADDED
The diff for this file is too large to render. See raw diff
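Judging by load_verses() in bert_bible.py, each line of this file holds a citation and the verse text separated by a single tab; an illustrative (not verbatim) line:

    Genesis 1:1	In the beginning, God created the heavens and the earth.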
 
esv_tags.txt ADDED
The diff for this file is too large to render. See raw diff
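Judging by load_tags(), each line of this file starts with a citation, followed by tab-separated tags: 1-based word positions joined by "+", then "=", then Strong's numbers each wrapped in one marker character per side (the exact markers are an assumption here). An illustrative line:

    John 3:16	2+3=<5368>	7=<0025>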
 
strongs_defs.json ADDED
The diff for this file is too large to render. See raw diff
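Judging by get_strong_defs() and the way strongs_defs[k] is interpolated straight into widget labels, this file is a flat JSON object mapping Strong's numbers to definition strings; illustrative (hypothetical) entries:

    {
        "0025": "agapao - to love",
        "5368": "phileo - to love as a friend"
    }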