Initial commit
Roland Szabo committed · Commit e7e42e1 · 0 parent(s)

Files changed:
- README.md +13 -0
- bert_bible.py +164 -0
- esv.txt +0 -0
- esv_tags.txt +0 -0
- strongs_defs.json +0 -0
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Bible_bert
+emoji: 🦀
+colorFrom: indigo
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.9.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
bert_bible.py
ADDED
@@ -0,0 +1,164 @@
+import json
+import re
+from collections import defaultdict
+from typing import Dict
+
+import streamlit as st
+import pandas as pd
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModel
+import umap
+import plotly.express as px
+import textwrap
+
+st.title('Bible analysis with BERT')
+
+
+@st.cache
+def load_verses() -> Dict[str, str]:
+    verses = {}
+    count = 0
+    with open('esv.txt', 'r', encoding='utf8') as f:
+        lines = f.readlines()
+    for line in lines:
+        try:
+            citation, raw_sentence = line.strip().split('\t')
+            verses[citation] = raw_sentence
+        except ValueError:
+            count += 1
+    print(count)
+    return verses
+
+
+@st.cache
+def load_tags():
+    index = defaultdict(list)
+    with open("esv_tags.txt", encoding='utf8') as f:
+        lines = f.readlines()
+    for line in lines:
+        verse, strongs = line.split("\t", maxsplit=1)
+        tokens = strongs.strip().split("\t")
+        for t in tokens:
+            if "=" in t:
+                words, strongs = t.split("=")
+                words = [(verse, int(x)) for x in words.split("+")]
+                strongs = [x[1:-1] for x in strongs.split("+")]
+                for s in strongs:
+                    index[s].extend(words)
+    return index
+
+
+@st.cache
+def get_strong_defs():
+    with open("strongs_defs.json", encoding='utf8') as f:
+        return json.load(f)
+
+
+def get_word_idx(sent: str, word: str):
+    l = re.split('([ .,!?:;""()\'-])', sent)
+    l = [x for x in l if x != " " and x != ""]
+    return l.index(word)
+
+
+def get_embedding(tokenizer, model, sent, word, layers=None) -> torch.Tensor:
+    """Get a word vector by first tokenizing the input sentence, finding all token idxs
+    that make up the word of interest, and then averaging the hidden states of those
+    tokens, summed over the requested layers."""
+    layers = [-4, -3, -2, -1] if layers is None else layers
+
+    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
+
+    idx = get_word_idx(sent, word)
+    # get all token idxs that belong to the word of interest
+    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
+
+    with torch.no_grad():
+        output = model(**encoded)
+
+    # Get all hidden states
+    states = output.hidden_states
+    # Stack and sum all requested layers
+    output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
+    # Only select the tokens that constitute the requested word
+    word_tokens_output = output[token_ids_word]
+
+    return word_tokens_output.mean(dim=0)
+
+
+verses = load_verses()
+strongs_tags = load_tags()
+strongs_defs = get_strong_defs()
+print(len(strongs_tags))
+st.text('Loaded {} verses'.format(len(verses)))
+st.text('Loaded {} tags'.format(len(strongs_tags)))
+
+books = []
+for k in verses:
+    book = k[:k.index(" ", 2)]
+    if book not in books:
+        books.append(book)
+print(books)
+
+all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}
+option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(), ['0025', '0026'], format_func=lambda x: strongs_defs[x])
+option2 = st.multiselect('Select Strongs numbers for second concept', all_defs.keys(),
+                         ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"], format_func=lambda x: strongs_defs[x])
+
+
+@st.cache(allow_output_mutation=True)
+def get_models():
+    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+    model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()
+    return tokenizer, model
+
+
+@st.cache
+def get_all_embeddings(greek_words):
+    embeddings = []
+    tokenizer, model = get_models()
+
+    for word in greek_words:
+        for number in greek_words[word]:
+            if number in strongs_tags:
+                gw = word
+                for verse, idx in strongs_tags[number]:
+                    if verse in verses:
+                        text = verses[verse]
+                        print(text, idx)
+                        words = [x for x in re.split('([ \'])', text) if x != " " and x != "" and x != "'"]
+                        if len(words) <= idx - 1:
+                            continue
+                        ew = words[idx - 1].strip(",.!?;:()\"'-")
+                        print(ew)
+                        emb = get_embedding(tokenizer, model, text, ew).numpy()
+                        embeddings.append((emb, f"{verse} {text}", ew, gw))
+    return embeddings
+
+
+def get_book_type(idx):
+    if idx < 4:
+        return 'Gospels'
+    if idx == 4:
+        return 'Acts'
+    if idx < 19:
+        return 'Pauline letters'
+    if idx < 26:
+        return 'Short letters'
+    return 'Revelation'
+
+
+strongs_numbers = {
+    "agape": ["0025", "0026"],
+    "phileo": ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"],
+}
+word_list = ["lovers", "loved", "loves", "love", "Love"]
+
+embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
+
+mapper = umap.UMAP().fit([x[0] for x in embeddings])
+ts = mapper.embedding_
+
+x = ts[:, 0]
+y = ts[:, 1]
+df = pd.DataFrame({"x": x, "y": y,
+                   "verse": ["<br>".join(textwrap.wrap(x[1], 80)) for x in embeddings],
+                   "greek word": [x[2] for x in embeddings]})
+fig = px.scatter(df, x="x", y="y", hover_data=['verse'], color="greek word")
+# fig.write_html("book_love.html")
+st.plotly_chart(fig)
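
As a quick sanity check of the embedding helper above, here is a minimal sketch assuming the functions from bert_bible.py are in scope; the verse text and target word are illustrative, not taken from the data files:

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()

sent = "Jesus wept."  # illustrative verse text
vec = get_embedding(tokenizer, model, sent, "wept")
print(vec.shape)  # torch.Size([768]): last four layers summed, averaged over the word's subword tokens

Note that get_word_idx must split the sentence into the same words the fast tokenizer's pre-tokenizer produces, otherwise word_ids() will point at a different word.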
esv.txt
ADDED
The diff for this file is too large to render. See raw diff.
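
The file contents are not shown here, but load_verses() in bert_bible.py implies the format: one verse per line, a citation and the verse text separated by a tab. A hypothetical line, parsed the way the app does (the exact wording in the file is an assumption):

line = "John 11:35\tJesus wept.\n"
citation, raw_sentence = line.strip().split('\t')
# citation == "John 11:35", raw_sentence == "Jesus wept."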
esv_tags.txt
ADDED
The diff for this file is too large to render. See raw diff.
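
Likewise not rendered, but load_tags() implies each line holds a verse citation followed by tab-separated "word_positions=strongs_codes" groups, with positions joined by "+" and each Strong's code wrapped in one extra character on each side (stripped via x[1:-1]). A made-up example, parsed the same way the app does; the wrapping characters, positions, and codes shown are assumptions, not taken from the file:

line = "John 11:35\t2={1145}\n"
verse, strongs = line.split("\t", maxsplit=1)
for t in strongs.strip().split("\t"):
    if "=" in t:
        words, codes = t.split("=")
        positions = [(verse, int(x)) for x in words.split("+")]  # [("John 11:35", 2)]
        numbers = [x[1:-1] for x in codes.split("+")]            # ["1145"]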
strongs_defs.json
ADDED
The diff for this file is too large to render. See raw diff.
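
The JSON itself is too large to render, but the way the app reads it (get_strong_defs(), strongs_defs[x] in the multiselect format_func, keys such as '0025' and '5368') suggests a flat object mapping zero-padded Strong's numbers to definition strings. A hypothetical fragment; the definition wording is illustrative:

import json

sample = '{"0025": "agapao - to love", "5368": "phileo - to be fond of, have affection for"}'
strongs_defs = json.loads(sample)
print(strongs_defs["0025"])  # "agapao - to love"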