Initial commit
Roland Szabo committed · Commit e7e42e1 · 0 parent(s)

Files changed:
- README.md +13 -0
- bert_bible.py +164 -0
- esv.txt +0 -0
- esv_tags.txt +0 -0
- strongs_defs.json +0 -0
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Bible_bert
+emoji: 🦀
+colorFrom: indigo
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.9.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
bert_bible.py
ADDED
@@ -0,0 +1,164 @@
+import json
+import re
+from collections import defaultdict
+from typing import Dict
+
+import streamlit as st
+import pandas as pd
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModel
+import umap
+import plotly.express as px
+import textwrap
+
+st.title('Bible analysis with BERT')
+
+
+@st.cache
+def load_verses() -> Dict[str, str]:
+    verses = {}
+    count = 0
+    with open('esv.txt', 'r', encoding='utf8') as f:
+        lines = f.readlines()
+    for line in lines:
+        try:
+            citation, raw_sentence = line.strip().split('\t')
+            verses[citation] = raw_sentence
+        except ValueError:
+            count += 1
+    print(count)
+    return verses
+
+
+@st.cache
+def load_tags():
+    index = defaultdict(list)
+    with open("esv_tags.txt", encoding='utf8') as f:
+        lines = f.readlines()
+    for line in lines:
+        verse, strongs = line.split("\t", maxsplit=1)
+        tokens = strongs.strip().split("\t")
+        for t in tokens:
+            if "=" in t:
+                words, strongs = t.split("=")
+                words = [(verse, int(x)) for x in words.split("+")]
+                strongs = [x[1:-1] for x in strongs.split("+")]
+                for s in strongs:
+                    index[s].extend(words)
+    return index
+
+
+@st.cache
+def get_strong_defs():
+    with open("strongs_defs.json", encoding='utf8') as f:
+        return json.load(f)
+
+
+def get_word_idx(sent: str, word: str):
+    l = re.split('([ .,!?:;""()\'-])', sent)
+    l = [x for x in l if x != " " and x != ""]
+    return l.index(word)
+
+
+def get_embedding(tokenizer, model, sent, word, layers=None) -> torch.Tensor:
+    """Get a word vector by first tokenizing the input sentence, finding all token idxs
+    that make up the word of interest, and then averaging the hidden states of those
+    tokens, summed over the requested layers."""
+    layers = [-4, -3, -2, -1] if layers is None else layers
+
+    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
+
+    idx = get_word_idx(sent, word)
+    # get all token idxs that belong to the word of interest
+    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
+
+    with torch.no_grad():
+        output = model(**encoded)
+
+    # Get all hidden states
+    states = output.hidden_states
+    # Stack and sum all requested layers
+    output = torch.stack([states[i] for i in layers]).sum(0).squeeze()
+    # Only select the tokens that constitute the requested word
+    word_tokens_output = output[token_ids_word]
+
+    return word_tokens_output.mean(dim=0)
+
+
+verses = load_verses()
+strongs_tags = load_tags()
+strongs_defs = get_strong_defs()
+print(len(strongs_tags))
+st.text('Loaded {} verses'.format(len(verses)))
+st.text('Loaded {} tags'.format(len(strongs_tags)))
+
+books = []
+for k in verses:
+    book = k[:k.index(" ", 2)]
+    if book not in books:
+        books.append(book)
+print(books)
+
+all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}
+option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(), ['0025', '0026'], format_func=lambda x: strongs_defs[x])
+option2 = st.multiselect('Select Strongs numbers for second concept', all_defs.keys(),
+                         ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"], format_func=lambda x: strongs_defs[x])
+
+
+@st.cache(allow_output_mutation=True)
+def get_models():
+    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+    model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()
+    return tokenizer, model
+
+
+@st.cache
+def get_all_embeddings(greek_words):
+    embeddings = []
+    tokenizer, model = get_models()
+
+    for word in greek_words:
+        for number in greek_words[word]:
+            if number in strongs_tags:
+                gw = word
+                for verse, idx in strongs_tags[number]:
+                    if verse in verses:
+                        text = verses[verse]
+                        print(text, idx)
+                        words = [x for x in re.split('([ \'])', text) if x != " " and x != "" and x != "'"]
+                        if len(words) <= idx - 1:
+                            continue
+                        ew = words[idx - 1].strip(",.!?;:()\"'-")
+                        print(ew)
+                        emb = get_embedding(tokenizer, model, text, ew).numpy()
+                        embeddings.append((emb, f"{verse} {text}", ew, gw))
+    return embeddings
+
+
+def get_book_type(idx):
+    if idx < 4:
+        return 'Gospels'
+    if idx == 4:
+        return 'Acts'
+    if idx < 19:
+        return 'Pauline letters'
+    if idx < 26:
+        return 'Short letters'
+    return 'Revelation'
+
+
+strongs_numbers = {
+    "agape": ["0025", "0026"],
+    "phileo": ["5368", "5360", "5363", "5362", "5361", "5366", "5365", "5377"],
+}
+word_list = ["lovers", "loved", "loves", "love", "Love"]
+
+embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
+
+mapper = umap.UMAP().fit([x[0] for x in embeddings])
+ts = mapper.embedding_
+
+x = ts[:, 0]
+y = ts[:, 1]
+df = pd.DataFrame({"x": x, "y": y,
+                   "verse": ["<br>".join(textwrap.wrap(x[1], 80)) for x in embeddings],
+                   "greek word": [x[2] for x in embeddings]})
+fig = px.scatter(df, x="x", y="y", hover_data=['verse'], color="greek word")
+# fig.write_html("book_love.html")
+st.plotly_chart(fig)
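
As a quick sanity check of the embedding helper above, here is a minimal sketch assuming the functions from bert_bible.py are in scope; the verse text and target word are illustrative, not taken from the data files:

from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()

sent = "Jesus wept."  # illustrative verse text
vec = get_embedding(tokenizer, model, sent, "wept")
print(vec.shape)  # torch.Size([768]): last four layers summed, averaged over the word's subword tokens

Note that get_word_idx must split the sentence into the same words the fast tokenizer's pre-tokenizer produces, otherwise word_ids() will point at a different word.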
esv.txt
ADDED
The diff for this file is too large to render. See raw diff.
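
The file contents are not shown here, but load_verses() in bert_bible.py implies the format: one verse per line, a citation and the verse text separated by a tab. A hypothetical line, parsed the way the app does (the exact wording in the file is an assumption):

line = "John 11:35\tJesus wept.\n"
citation, raw_sentence = line.strip().split('\t')
# citation == "John 11:35", raw_sentence == "Jesus wept."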
esv_tags.txt
ADDED
The diff for this file is too large to render. See raw diff.
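
Likewise not rendered, but load_tags() implies each line holds a verse citation followed by tab-separated "word_positions=strongs_codes" groups, with positions joined by "+" and each Strong's code wrapped in one extra character on each side (stripped via x[1:-1]). A made-up example, parsed the same way the app does; the wrapping characters, positions, and codes shown are assumptions, not taken from the file:

line = "John 11:35\t2={1145}\n"
verse, strongs = line.split("\t", maxsplit=1)
for t in strongs.strip().split("\t"):
    if "=" in t:
        words, codes = t.split("=")
        positions = [(verse, int(x)) for x in words.split("+")]  # [("John 11:35", 2)]
        numbers = [x[1:-1] for x in codes.split("+")]            # ["1145"]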
strongs_defs.json
ADDED
The diff for this file is too large to render. See raw diff.
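
The JSON itself is too large to render, but the way the app reads it (get_strong_defs(), strongs_defs[x] in the multiselect format_func, keys such as '0025' and '5368') suggests a flat object mapping zero-padded Strong's numbers to definition strings. A hypothetical fragment; the definition wording is illustrative:

import json

sample = '{"0025": "agapao - to love", "5368": "phileo - to be fond of, have affection for"}'
strongs_defs = json.loads(sample)
print(strongs_defs["0025"])  # "agapao - to love"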