Spaces:
Runtime error
Runtime error
import functools
import json
import re
import textwrap
from collections import defaultdict
from typing import Dict

import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
import torch
import umap
from transformers import AutoModel, AutoTokenizer
# Page title shown at the top of the Streamlit app.
st.title('Bible analysis with BERT')
def load_verses() -> Dict[str, str]:
    """Load the verse texts from ``esv.txt``.

    Each line is expected to be ``<citation>\\t<text>``.  Lines that do not
    split into exactly two tab-separated fields are counted and skipped.

    Returns:
        Mapping from citation (e.g. "John 3:16") to verse text.
    """
    verses: Dict[str, str] = {}
    skipped = 0
    # Stream the file instead of materializing all lines with readlines().
    with open('esv.txt', 'r', encoding='utf8') as f:
        for line in f:
            # Keep the try body minimal: only the split can raise ValueError.
            try:
                citation, raw_sentence = line.strip().split('\t')
            except ValueError:
                skipped += 1
            else:
                verses[citation] = raw_sentence
    print(skipped)
    return verses
def load_tags():
    """Parse ``esv_tags.txt`` into Strong's number -> [(verse, word_pos), ...].

    Each line looks like ``<verse>\\t<tok>\\t<tok>...`` where a token has the
    form ``i+j+...=<GNNNN>+<GNNNN>...``: 1-based English word positions on
    the left, bracketed Strong's numbers on the right (brackets stripped).

    Returns:
        defaultdict(list) mapping a Strong's number to every
        (verse citation, 1-based word position) tagged with it.
    """
    index = defaultdict(list)
    with open("esv_tags.txt", encoding='utf8') as f:
        # Stream the file; no need to hold all lines in memory.
        for line in f:
            verse, tag_field = line.split("\t", maxsplit=1)
            for token in tag_field.strip().split("\t"):
                if "=" not in token:
                    continue
                # Distinct names (the original reused `strongs` for both the
                # raw field and the parsed numbers, which was confusing).
                word_part, strongs_part = token.split("=")
                positions = [(verse, int(p)) for p in word_part.split("+")]
                numbers = [s[1:-1] for s in strongs_part.split("+")]
                for number in numbers:
                    index[number].extend(positions)
    return index
def get_strong_defs():
    """Read the Strong's-number -> definition mapping from strongs_defs.json."""
    with open("strongs_defs.json", encoding='utf8') as handle:
        definitions = json.load(handle)
    return definitions
def get_word_idx(sent: str, word: str):
    """Return the position of `word` in `sent` after splitting on delimiters.

    The sentence is split on spaces and punctuation; spaces and empty pieces
    are dropped, but punctuation marks are kept as tokens of their own, so
    the returned index counts them.  Raises ValueError if `word` is absent.
    """
    pieces = re.split('([ .,!?:;""()\'-])', sent)
    tokens = [piece for piece in pieces if piece not in ("", " ")]
    return tokens.index(word)
def get_embedding(sent, word, layers=None):
    """Compute a contextual BERT vector for `word` as it occurs in `sent`.

    The sentence is tokenized, the hidden states of the requested layers
    (default: the last four) are summed, and the sub-token vectors that make
    up `word` are averaged into a single embedding.

    Returns:
        A 1-D numpy array (the word embedding).
    """
    if layers is None:
        layers = [-4, -3, -2, -1]
    tokenizer, model = get_models()
    encoded = tokenizer.encode_plus(sent, return_tensors="pt")
    word_position = get_word_idx(sent, word)
    # Indices of every sub-token that belongs to the word of interest.
    token_ids_word = np.where(np.array(encoded.word_ids()) == word_position)
    with torch.no_grad():
        output = model(**encoded)
    hidden_states = output.hidden_states
    # Sum the selected layers, then drop the batch dimension.
    summed = torch.stack([hidden_states[i] for i in layers]).sum(0).squeeze()
    # Average the word's sub-token vectors into one vector.
    return summed[token_ids_word].mean(dim=0).numpy()
# ---- Load data and derive the list of book names ---------------------------
verses = load_verses()
strongs_tags = load_tags()
strongs_defs = get_strong_defs()
st.text('Loaded {} verses'.format(len(verses)))
st.text('Loaded {} tags'.format(len(strongs_tags)))
# Collect book names in order of first appearance.  (Debug prints removed.)
books = []
for citation in verses:
    # Book name is everything before the chapter:verse part; starting the
    # space search at index 2 keeps numbered books like "1 John" intact.
    book = citation[:citation.index(" ", 2)]
    if book not in books:
        books.append(book)
# Display labels for the multiselect widgets; keys mirror strongs_defs.
all_defs = {k: f"{k} - {strongs_defs[k]}" for k in strongs_defs}
def format_strong(number):
    """Render a Strong's number together with its definition for display."""
    definition = strongs_defs[number]
    return f"{number} - {definition}"
@functools.lru_cache(maxsize=1)
def get_models():
    """Load (once) and return the cased BERT tokenizer and model.

    The model is put in eval mode with ``output_hidden_states=True`` so
    `get_embedding` can read per-layer hidden states.  Caching matters:
    `get_embedding` calls this for every single word, and without the cache
    BERT was re-loaded from disk on each call.
    """
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    model = AutoModel.from_pretrained('bert-base-cased', output_hidden_states=True).eval()
    return tokenizer, model
def get_all_embeddings(greek_words):
    """Collect a BERT embedding for every occurrence of the given numbers.

    Args:
        greek_words: mapping from a concept label (e.g. "concept1") to a
            list of Strong's numbers.

    Returns:
        List of tuples ``(embedding, "verse text", concept_label, book)``.
    """
    embeddings = []
    for concept, numbers in greek_words.items():
        for number in numbers:
            if number not in strongs_tags:
                continue
            for verse, idx in strongs_tags[number]:
                if verse not in verses:
                    continue
                text = verses[verse]
                # Split the verse the same way the tagging data counts words.
                words = [w for w in re.split('([ \'])', text)
                         if w not in (" ", "", "'")]
                # idx is 1-based; skip tags pointing past the word list.
                if len(words) <= idx - 1:
                    continue
                english_word = words[idx - 1].strip(",.!?;:()\"'-")
                if "-" in english_word:
                    english_word = english_word.split("-")[0]
                emb = get_embedding(text, english_word)
                # Bug fix: the original appended the stale module-level
                # `book` (last book of an unrelated loop); derive the book
                # from the verse citation instead.
                book = verse[:verse.index(" ", 2)]
                embeddings.append((emb, f"{verse} {text}", concept, book))
    return embeddings
def get_book_type(idx):
    """Map a 0-based New Testament book index (Matthew == 0) to its section.

    0-3: Gospels, 4: Acts, 5-18: Pauline letters, 19-25: short letters,
    26+: Revelation.
    """
    if idx < 4:
        return 'Gospels'
    if idx == 4:
        return 'Acts'
    if idx < 19:
        return 'Pauline letters'
    if idx < 26:
        # Typo fix: was 'Short lettters'.
        return 'Short letters'
    return 'Revelation'
# Introductory copy explaining what the app does and how to use it.
st.markdown("""
This app is a demo of using BERT to analyze the Greek New Testament. It allows you to compare two
clusters of Greek words (identified by their Strong's Numbers) and compare the embeddings for them.
To use it, select the words you want to use for the first cluster (eg. G0025 and G0026, which are
forms of agape), then select the words you want to use for the second cluster (eg. G5368, G5360,
G5363, which are forms of phileo) and then hit Submit.
For an explanation of what's going on here you can read my [post](https://rolisz.com/analyzing-the-bible-with-bert-models/)
where I compare the words soul and spirit and
the words agape and phileo.
""")
# Input form: two sets of Strong's numbers, one per concept cluster.
with st.form("my_form"):
    option1 = st.multiselect('Select Strongs numbers for first concept', all_defs.keys(),
                             ['0025', '0026'], format_func=format_strong)
    option2 = st.multiselect('Select Strongs numbers for second concept', all_defs.keys(),
                             ["5368", "5360", "5363", "5362", "5361", "5366", "5377"],
                             format_func=format_strong)
    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
    if submitted:
        with st.spinner('Calculating embeddings...'):
            embeddings = get_all_embeddings({"concept1": option1, "concept2": option2})
        if not embeddings:
            # Guard: UMAP raises on empty input, so warn the user instead.
            st.warning("No occurrences found for the selected Strong's numbers.")
        else:
            with st.spinner('Reducing dimensionality...'):
                mapper = umap.UMAP().fit([e[0] for e in embeddings])
            coords = mapper.embedding_
            df = pd.DataFrame({
                "x": coords[:, 0],
                "y": coords[:, 1],
                # Wrap long verse text so the hover tooltip stays readable.
                "verse": ["<br>".join(textwrap.wrap(e[1], 80)) for e in embeddings],
                "greek word": [e[2] for e in embeddings],
            })
            fig = px.scatter(df, x="x", y="y",
                             hover_data=['verse'], color="greek word")
            st.plotly_chart(fig)