Spaces:
Runtime error
Runtime error
Commit
·
f38dbc0
1
Parent(s):
b60285f
Styling fixes
Browse files- app.py +14 -11
- tapas_visualizer.py +55 -40
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import io
|
| 2 |
|
| 3 |
import pandas as pd
|
| 4 |
import streamlit as st
|
|
@@ -7,21 +7,19 @@ from transformers import AutoTokenizer
|
|
| 7 |
|
| 8 |
from tapas_visualizer import TapasVisualizer
|
| 9 |
|
| 10 |
-
st.set_page_config(page_title="Tapas Tokenizer", page_icon=
|
|
|
|
| 11 |
|
| 12 |
def set_file_input():
|
| 13 |
st.session_state.input_stream = "file"
|
| 14 |
|
|
|
|
| 15 |
def set_text_input():
|
| 16 |
st.session_state.input_stream = "text"
|
| 17 |
|
| 18 |
|
| 19 |
def main():
|
| 20 |
-
|
| 21 |
-
models = [
|
| 22 |
-
"google/tapas-base",
|
| 23 |
-
"deepset/tapas-large-nq-hn-reader"
|
| 24 |
-
]
|
| 25 |
|
| 26 |
@st.cache()
|
| 27 |
def load_tokenizer():
|
|
@@ -31,7 +29,9 @@ def main():
|
|
| 31 |
col1, col2 = st.columns([1, 2])
|
| 32 |
with col1:
|
| 33 |
selected_model = st.selectbox("Select a tokenizer", models, key=1)
|
| 34 |
-
text = st.text_area(
|
|
|
|
|
|
|
| 35 |
uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
|
| 36 |
button_clicked = st.button("Tokenize")
|
| 37 |
|
|
@@ -42,7 +42,10 @@ def main():
|
|
| 42 |
if text or uploaded_file or button_clicked:
|
| 43 |
df: pd.DataFrame
|
| 44 |
|
| 45 |
-
if
|
|
|
|
|
|
|
|
|
|
| 46 |
df = pd.read_csv(io.StringIO(text), sep=",")
|
| 47 |
elif st.session_state.input_stream == "file":
|
| 48 |
df = pd.read_csv(uploaded_file)
|
|
@@ -51,5 +54,5 @@ def main():
|
|
| 51 |
st.components.v1.html(visualizer(df.astype(str)), height=1500)
|
| 52 |
|
| 53 |
|
| 54 |
-
if __name__ ==
|
| 55 |
-
main()
|
|
|
|
| 1 |
+
import io
|
| 2 |
|
| 3 |
import pandas as pd
|
| 4 |
import streamlit as st
|
|
|
|
| 7 |
|
| 8 |
from tapas_visualizer import TapasVisualizer
|
| 9 |
|
| 10 |
+
st.set_page_config(page_title="Tapas Tokenizer", page_icon="🍽️", layout="wide")
|
| 11 |
+
|
| 12 |
|
| 13 |
def set_file_input():
|
| 14 |
st.session_state.input_stream = "file"
|
| 15 |
|
| 16 |
+
|
| 17 |
def set_text_input():
|
| 18 |
st.session_state.input_stream = "text"
|
| 19 |
|
| 20 |
|
| 21 |
def main():
|
| 22 |
+
models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
@st.cache()
|
| 25 |
def load_tokenizer():
|
|
|
|
| 29 |
col1, col2 = st.columns([1, 2])
|
| 30 |
with col1:
|
| 31 |
selected_model = st.selectbox("Select a tokenizer", models, key=1)
|
| 32 |
+
text = st.text_area(
|
| 33 |
+
label="", placeholder="Table to tokenize; csv", on_change=set_text_input
|
| 34 |
+
)
|
| 35 |
uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
|
| 36 |
button_clicked = st.button("Tokenize")
|
| 37 |
|
|
|
|
| 42 |
if text or uploaded_file or button_clicked:
|
| 43 |
df: pd.DataFrame
|
| 44 |
|
| 45 |
+
if (
|
| 46 |
+
"input_stream" not in st.session_state
|
| 47 |
+
or st.session_state.input_stream == "text"
|
| 48 |
+
):
|
| 49 |
df = pd.read_csv(io.StringIO(text), sep=",")
|
| 50 |
elif st.session_state.input_stream == "file":
|
| 51 |
df = pd.read_csv(uploaded_file)
|
|
|
|
| 54 |
st.components.v1.html(visualizer(df.astype(str)), height=1500)
|
| 55 |
|
| 56 |
|
| 57 |
+
if __name__ == "__main__":
|
| 58 |
+
main()
|
tapas_visualizer.py
CHANGED
|
@@ -65,8 +65,8 @@ class TapasVisualizer:
|
|
| 65 |
str: html with styling for the tokens
|
| 66 |
"""
|
| 67 |
if len(tokens) == 0:
|
| 68 |
-
print(f
|
| 69 |
-
return
|
| 70 |
|
| 71 |
cur_token_id = 0
|
| 72 |
cur_token = self.normalize_token_str(tokens[cur_token_id])
|
|
@@ -77,17 +77,19 @@ class TapasVisualizer:
|
|
| 77 |
spans = []
|
| 78 |
|
| 79 |
while next_start < len(org_text):
|
| 80 |
-
candidate = org_text[next_start: next_start + len(cur_token)]
|
| 81 |
|
| 82 |
# The tokenizer performs lowercasing; so check against lowercase
|
| 83 |
if candidate.lower() == cur_token:
|
| 84 |
if last_end != next_start:
|
| 85 |
# There was token-less text (probably whitespace)
|
| 86 |
# in the middle
|
| 87 |
-
spans.append(
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
odd_or_even =
|
| 90 |
-
spans.append(self.style_span(candidate, [
|
| 91 |
next_start += len(cur_token)
|
| 92 |
last_end = next_start
|
| 93 |
cur_token_id += 1
|
|
@@ -96,20 +98,21 @@ class TapasVisualizer:
|
|
| 96 |
cur_token = self.normalize_token_str(tokens[cur_token_id])
|
| 97 |
else:
|
| 98 |
next_start += 1
|
| 99 |
-
|
| 100 |
if last_end != len(org_text):
|
| 101 |
-
spans.append(self.style_span(org_text[last_end:
|
| 102 |
|
| 103 |
return spans
|
| 104 |
|
| 105 |
-
def cells_to_html(
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
for row_id, row in enumerate(cell_vals, start=row_id_start):
|
| 114 |
row_html = ""
|
| 115 |
row_token_cnt = 0
|
|
@@ -120,42 +123,54 @@ class TapasVisualizer:
|
|
| 120 |
row_html += f"<{cell_element}>{cell_html}</{cell_element}>"
|
| 121 |
row_token_cnt += len(cur_cell_tokens)
|
| 122 |
cumulative_cnt += row_token_cnt
|
| 123 |
-
cnt_html = (
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
row_html = cnt_html + row_html
|
| 126 |
-
table_html += f
|
| 127 |
|
| 128 |
return table_html, cumulative_cnt
|
| 129 |
|
| 130 |
-
|
| 131 |
def __call__(self, table: pd.DataFrame) -> Any:
|
| 132 |
tokenized = self.tokenizer(table)
|
| 133 |
|
| 134 |
cell_tokens = defaultdict(list)
|
| 135 |
|
| 136 |
-
for id_ind, input_id in enumerate(tokenized[
|
| 137 |
input_id = int(input_id)
|
| 138 |
-
# 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation'
|
| 139 |
-
|
|
|
|
| 140 |
token_text = self.tokenizer._convert_id_to_token(input_id)
|
| 141 |
if int(segment_id) == 1:
|
| 142 |
cell_tokens[(row_id, col_id)].append(token_text)
|
| 143 |
|
| 144 |
-
table_html =
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
return HTMLBody(table_html)
|
|
|
|
| 65 |
str: html with styling for the tokens
|
| 66 |
"""
|
| 67 |
if len(tokens) == 0:
|
| 68 |
+
print(f"Empty tokens for: {org_text}")
|
| 69 |
+
return ""
|
| 70 |
|
| 71 |
cur_token_id = 0
|
| 72 |
cur_token = self.normalize_token_str(tokens[cur_token_id])
|
|
|
|
| 77 |
spans = []
|
| 78 |
|
| 79 |
while next_start < len(org_text):
|
| 80 |
+
candidate = org_text[next_start : next_start + len(cur_token)]
|
| 81 |
|
| 82 |
# The tokenizer performs lowercasing; so check against lowercase
|
| 83 |
if candidate.lower() == cur_token:
|
| 84 |
if last_end != next_start:
|
| 85 |
# There was token-less text (probably whitespace)
|
| 86 |
# in the middle
|
| 87 |
+
spans.append(
|
| 88 |
+
self.style_span(org_text[last_end:next_start], ["non-token"])
|
| 89 |
+
)
|
| 90 |
|
| 91 |
+
odd_or_even = "even-token" if cur_token_id % 2 == 0 else "odd-token"
|
| 92 |
+
spans.append(self.style_span(candidate, ["token", odd_or_even]))
|
| 93 |
next_start += len(cur_token)
|
| 94 |
last_end = next_start
|
| 95 |
cur_token_id += 1
|
|
|
|
| 98 |
cur_token = self.normalize_token_str(tokens[cur_token_id])
|
| 99 |
else:
|
| 100 |
next_start += 1
|
| 101 |
+
|
| 102 |
if last_end != len(org_text):
|
| 103 |
+
spans.append(self.style_span(org_text[last_end:next_start], ["non-token"]))
|
| 104 |
|
| 105 |
return spans
|
| 106 |
|
| 107 |
+
def cells_to_html(
|
| 108 |
+
self,
|
| 109 |
+
cell_vals: List[List[str]],
|
| 110 |
+
cell_tokens: Dict,
|
| 111 |
+
row_id_start: int = 0,
|
| 112 |
+
cell_element: str = "td",
|
| 113 |
+
cumulative_cnt: int = 0,
|
| 114 |
+
table_html: str = "",
|
| 115 |
+
) -> str:
|
| 116 |
for row_id, row in enumerate(cell_vals, start=row_id_start):
|
| 117 |
row_html = ""
|
| 118 |
row_token_cnt = 0
|
|
|
|
| 123 |
row_html += f"<{cell_element}>{cell_html}</{cell_element}>"
|
| 124 |
row_token_cnt += len(cur_cell_tokens)
|
| 125 |
cumulative_cnt += row_token_cnt
|
| 126 |
+
cnt_html = (
|
| 127 |
+
f'<td style="border: none;" align="right">'
|
| 128 |
+
f'{self.style_span(str(cumulative_cnt), ["non-token", "count"])}'
|
| 129 |
+
'</td>'
|
| 130 |
+
f'<td style="border: none;" align="right">'
|
| 131 |
+
f'{self.style_span(f"<+{row_token_cnt}", ["non-token", "count"])}'
|
| 132 |
+
'</td>'
|
| 133 |
+
)
|
| 134 |
row_html = cnt_html + row_html
|
| 135 |
+
table_html += f"<tr>{row_html}</tr>"
|
| 136 |
|
| 137 |
return table_html, cumulative_cnt
|
| 138 |
|
|
|
|
| 139 |
def __call__(self, table: pd.DataFrame) -> Any:
|
| 140 |
tokenized = self.tokenizer(table)
|
| 141 |
|
| 142 |
cell_tokens = defaultdict(list)
|
| 143 |
|
| 144 |
+
for id_ind, input_id in enumerate(tokenized["input_ids"]):
|
| 145 |
input_id = int(input_id)
|
| 146 |
+
# 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation'
|
| 147 |
+
# not required
|
| 148 |
+
segment_id, col_id, row_id, *_ = tokenized["token_type_ids"][id_ind]
|
| 149 |
token_text = self.tokenizer._convert_id_to_token(input_id)
|
| 150 |
if int(segment_id) == 1:
|
| 151 |
cell_tokens[(row_id, col_id)].append(token_text)
|
| 152 |
|
| 153 |
+
table_html = (
|
| 154 |
+
'<tr><td style="border: none;" colspan="2" align="left">#Tokens</td></tr>'
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
table_html, cumulative_cnt = self.cells_to_html(
|
| 158 |
+
cell_vals=[table.columns],
|
| 159 |
+
cell_tokens=cell_tokens,
|
| 160 |
+
row_id_start=0,
|
| 161 |
+
cell_element="th",
|
| 162 |
+
cumulative_cnt=0,
|
| 163 |
+
table_html=table_html,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
table_html, cumulative_cnt = self.cells_to_html(
|
| 167 |
+
cell_vals=table.values,
|
| 168 |
+
cell_tokens=cell_tokens,
|
| 169 |
+
row_id_start=1,
|
| 170 |
+
cell_element="td",
|
| 171 |
+
cumulative_cnt=cumulative_cnt,
|
| 172 |
+
table_html=table_html,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
table_html = f"<table>{table_html}</table>"
|
| 176 |
return HTMLBody(table_html)
|