Spaces:

bglearning
/

tapas-tokenizer-viz

Runtime error

App Files Files Community

bglearning committed on May 21, 2023

Commit

f38dbc0

1 Parent(s): b60285f

Styling fixes

Browse files

Files changed (2) hide show

app.py +14 -11
tapas_visualizer.py +55 -40

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import io
 import pandas as pd
 import streamlit as st
@@ -7,21 +7,19 @@ from transformers import AutoTokenizer
 from tapas_visualizer import TapasVisualizer
-st.set_page_config(page_title="Tapas Tokenizer", page_icon='‍🍽️', layout="wide")
 def set_file_input():
     st.session_state.input_stream = "file"
 def set_text_input():
     st.session_state.input_stream = "text"
 def main():
-    models = [
-        "google/tapas-base",
-        "deepset/tapas-large-nq-hn-reader"
-    ]
     @st.cache()
     def load_tokenizer():
@@ -31,7 +29,9 @@ def main():
     col1, col2 = st.columns([1, 2])
     with col1:
         selected_model = st.selectbox("Select a tokenizer", models, key=1)
-        text = st.text_area(label="", placeholder="Table to tokenize; csv", on_change=set_text_input)
         uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
         button_clicked = st.button("Tokenize")
@@ -42,7 +42,10 @@ def main():
         if text or uploaded_file or button_clicked:
             df: pd.DataFrame
-            if 'input_stream' not in st.session_state or st.session_state.input_stream == "text":
                 df = pd.read_csv(io.StringIO(text), sep=",")
             elif st.session_state.input_stream == "file":
                 df = pd.read_csv(uploaded_file)
@@ -51,5 +54,5 @@ def main():
                 st.components.v1.html(visualizer(df.astype(str)), height=1500)
-if __name__ == '__main__':
-    main()

+import io
 import pandas as pd
 import streamlit as st
 from tapas_visualizer import TapasVisualizer
+st.set_page_config(page_title="Tapas Tokenizer", page_icon="‍🍽️", layout="wide")
 def set_file_input():
     st.session_state.input_stream = "file"
 def set_text_input():
     st.session_state.input_stream = "text"
 def main():
+    models = ["google/tapas-base", "deepset/tapas-large-nq-hn-reader"]
     @st.cache()
     def load_tokenizer():
     col1, col2 = st.columns([1, 2])
     with col1:
         selected_model = st.selectbox("Select a tokenizer", models, key=1)
+        text = st.text_area(
+            label="", placeholder="Table to tokenize; csv", on_change=set_text_input
+        )
         uploaded_file = st.file_uploader("(Or) Choose a file", on_change=set_file_input)
         button_clicked = st.button("Tokenize")
         if text or uploaded_file or button_clicked:
             df: pd.DataFrame
+            if (
+                "input_stream" not in st.session_state
+                or st.session_state.input_stream == "text"
+            ):
                 df = pd.read_csv(io.StringIO(text), sep=",")
             elif st.session_state.input_stream == "file":
                 df = pd.read_csv(uploaded_file)
                 st.components.v1.html(visualizer(df.astype(str)), height=1500)
+if __name__ == "__main__":
+    main()

tapas_visualizer.py CHANGED Viewed

@@ -65,8 +65,8 @@ class TapasVisualizer:
             str: html with styling for the tokens
         """
         if len(tokens) == 0:
-            print(f'Empty tokens for: {org_text}')
-            return ''
         cur_token_id = 0
         cur_token = self.normalize_token_str(tokens[cur_token_id])
@@ -77,17 +77,19 @@ class TapasVisualizer:
         spans = []
         while next_start < len(org_text):
-            candidate = org_text[next_start: next_start + len(cur_token)]
             # The tokenizer performs lowercasing; so check against lowercase
             if candidate.lower() == cur_token:
                 if last_end != next_start:
                     # There was token-less text (probably whitespace)
                     # in the middle
-                    spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))
-                odd_or_even = 'even-token' if cur_token_id % 2 == 0 else 'odd-token'
-                spans.append(self.style_span(candidate, ['token', odd_or_even]))
                 next_start += len(cur_token)
                 last_end = next_start
                 cur_token_id += 1
@@ -96,20 +98,21 @@ class TapasVisualizer:
                 cur_token = self.normalize_token_str(tokens[cur_token_id])
             else:
                 next_start += 1
         if last_end != len(org_text):
-            spans.append(self.style_span(org_text[last_end: next_start], ['non-token']))
         return spans
-    def cells_to_html(self,
-                      cell_vals: List[List[str]],
-                      cell_tokens: Dict,
-                      row_id_start: int=0,
-                      cell_element: str="td",
-                      cumulative_cnt: int=0,
-                      table_html: str="") -> str:
         for row_id, row in enumerate(cell_vals, start=row_id_start):
             row_html = ""
             row_token_cnt = 0
@@ -120,42 +123,54 @@ class TapasVisualizer:
                 row_html += f"<{cell_element}>{cell_html}</{cell_element}>"
                 row_token_cnt += len(cur_cell_tokens)
             cumulative_cnt += row_token_cnt
-            cnt_html = (f'<td style="border: none;" align="right">{self.style_span(str(cumulative_cnt), ["non-token", "count"])}</td>'
-                        f'<td style="border: none;" align="right">{self.style_span(f"<+{row_token_cnt}", ["non-token", "count"])}</td>')
             row_html = cnt_html + row_html
-            table_html += f'<tr>{row_html}</tr>'
         return table_html, cumulative_cnt
     def __call__(self, table: pd.DataFrame) -> Any:
         tokenized = self.tokenizer(table)
         cell_tokens = defaultdict(list)
-        for id_ind, input_id in enumerate(tokenized['input_ids']):
             input_id = int(input_id)
-            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation' not required
-            segment_id, col_id, row_id, *_ = tokenized['token_type_ids'][id_ind]
             token_text = self.tokenizer._convert_id_to_token(input_id)
             if int(segment_id) == 1:
                 cell_tokens[(row_id, col_id)].append(token_text)
-        table_html = '<tr><td style="border: none;" colspan="2" align="left">#Tokens</td></tr>'
-        table_html, cumulative_cnt = self.cells_to_html(cell_vals=[table.columns],
-                                                        cell_tokens=cell_tokens,
-                                                        row_id_start=0,
-                                                        cell_element="th",
-                                                        cumulative_cnt=0,
-                                                        table_html=table_html)
-        table_html, cumulative_cnt = self.cells_to_html(cell_vals=table.values,
-                                                        cell_tokens=cell_tokens,
-                                                        row_id_start=1,
-                                                        cell_element="td",
-                                                        cumulative_cnt=cumulative_cnt,
-                                                        table_html=table_html)
-        table_html = f'<table>{table_html}</table>'
         return HTMLBody(table_html)

             str: html with styling for the tokens
         """
         if len(tokens) == 0:
+            print(f"Empty tokens for: {org_text}")
+            return ""
         cur_token_id = 0
         cur_token = self.normalize_token_str(tokens[cur_token_id])
         spans = []
         while next_start < len(org_text):
+            candidate = org_text[next_start : next_start + len(cur_token)]
             # The tokenizer performs lowercasing; so check against lowercase
             if candidate.lower() == cur_token:
                 if last_end != next_start:
                     # There was token-less text (probably whitespace)
                     # in the middle
+                    spans.append(
+                        self.style_span(org_text[last_end:next_start], ["non-token"])
+                    )
+                odd_or_even = "even-token" if cur_token_id % 2 == 0 else "odd-token"
+                spans.append(self.style_span(candidate, ["token", odd_or_even]))
                 next_start += len(cur_token)
                 last_end = next_start
                 cur_token_id += 1
                 cur_token = self.normalize_token_str(tokens[cur_token_id])
             else:
                 next_start += 1
         if last_end != len(org_text):
+            spans.append(self.style_span(org_text[last_end:next_start], ["non-token"]))
         return spans
+    def cells_to_html(
+        self,
+        cell_vals: List[List[str]],
+        cell_tokens: Dict,
+        row_id_start: int = 0,
+        cell_element: str = "td",
+        cumulative_cnt: int = 0,
+        table_html: str = "",
+    ) -> str:
         for row_id, row in enumerate(cell_vals, start=row_id_start):
             row_html = ""
             row_token_cnt = 0
                 row_html += f"<{cell_element}>{cell_html}</{cell_element}>"
                 row_token_cnt += len(cur_cell_tokens)
             cumulative_cnt += row_token_cnt
+            cnt_html = (
+                f'<td style="border: none;" align="right">'
+                f'{self.style_span(str(cumulative_cnt), ["non-token", "count"])}'
+                '</td>'
+                f'<td style="border: none;" align="right">'
+                f'{self.style_span(f"<+{row_token_cnt}", ["non-token", "count"])}'
+                '</td>'
+            )
             row_html = cnt_html + row_html
+            table_html += f"<tr>{row_html}</tr>"
         return table_html, cumulative_cnt
     def __call__(self, table: pd.DataFrame) -> Any:
         tokenized = self.tokenizer(table)
         cell_tokens = defaultdict(list)
+        for id_ind, input_id in enumerate(tokenized["input_ids"]):
             input_id = int(input_id)
+            # 'prev_label', 'column_rank', 'inv_column_rank', 'numeric_relation'
+            # not required
+            segment_id, col_id, row_id, *_ = tokenized["token_type_ids"][id_ind]
             token_text = self.tokenizer._convert_id_to_token(input_id)
             if int(segment_id) == 1:
                 cell_tokens[(row_id, col_id)].append(token_text)
+        table_html = (
+            '<tr><td style="border: none;" colspan="2" align="left">#Tokens</td></tr>'
+        )
+        table_html, cumulative_cnt = self.cells_to_html(
+            cell_vals=[table.columns],
+            cell_tokens=cell_tokens,
+            row_id_start=0,
+            cell_element="th",
+            cumulative_cnt=0,
+            table_html=table_html,
+        )
+        table_html, cumulative_cnt = self.cells_to_html(
+            cell_vals=table.values,
+            cell_tokens=cell_tokens,
+            row_id_start=1,
+            cell_element="td",
+            cumulative_cnt=cumulative_cnt,
+            table_html=table_html,
+        )
+        table_html = f"<table>{table_html}</table>"
         return HTMLBody(table_html)