Spaces:
Build error
Build error
Update pages/15_Plus_Detokenizer.py
Browse files- pages/15_Plus_Detokenizer.py +15 -13
pages/15_Plus_Detokenizer.py
CHANGED
|
@@ -122,20 +122,7 @@ components.html(html_content, height=700, scrolling=True)
|
|
| 122 |
# Load the tokenizer
|
| 123 |
tokenizer = AutoTokenizer.from_pretrained('gpt2')
|
| 124 |
|
| 125 |
-
# Tokenization section
|
| 126 |
-
st.header("Tokenization")
|
| 127 |
-
sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg cnvs html js hlds 9 wbs")
|
| 128 |
|
| 129 |
-
def format_token_ids(token_ids):
|
| 130 |
-
formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
|
| 131 |
-
return ''.join(formatted_ids)
|
| 132 |
-
|
| 133 |
-
if st.button("Tokenize"):
|
| 134 |
-
input_ids = tokenizer(sentence, return_tensors='pt').input_ids
|
| 135 |
-
token_ids_list = input_ids[0].tolist()
|
| 136 |
-
formatted_token_ids = format_token_ids(token_ids_list)
|
| 137 |
-
st.write("Tokenized input IDs (formatted):")
|
| 138 |
-
st.write(formatted_token_ids)
|
| 139 |
|
| 140 |
# Detokenization section
|
| 141 |
st.header("Detokenization")
|
|
@@ -158,6 +145,21 @@ if st.button("Detokenize"):
|
|
| 158 |
st.write("Detokenized sentence:")
|
| 159 |
st.write(detokenized_sentence)
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
# Load the model
|
| 162 |
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')
|
| 163 |
|
|
|
|
| 122 |
# Load the tokenizer
|
| 123 |
tokenizer = AutoTokenizer.from_pretrained('gpt2')
|
| 124 |
|
|
|
|
|
|
|
|
|
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# Detokenization section
|
| 128 |
st.header("Detokenization")
|
|
|
|
| 145 |
st.write("Detokenized sentence:")
|
| 146 |
st.write(detokenized_sentence)
|
| 147 |
|
| 148 |
+
# Tokenization section
|
| 149 |
+
st.header("Tokenization")
|
| 150 |
+
sentence = st.text_input("Enter a sentence to tokenize:", "cr8 lg")
|
| 151 |
+
|
| 152 |
+
def format_token_ids(token_ids):
|
| 153 |
+
formatted_ids = [str(token_id).zfill(5) for token_id in token_ids]
|
| 154 |
+
return ''.join(formatted_ids)
|
| 155 |
+
|
| 156 |
+
if st.button("Tokenize"):
|
| 157 |
+
input_ids = tokenizer(sentence, return_tensors='pt').input_ids
|
| 158 |
+
token_ids_list = input_ids[0].tolist()
|
| 159 |
+
formatted_token_ids = format_token_ids(token_ids_list)
|
| 160 |
+
st.write("Tokenized input IDs (formatted):")
|
| 161 |
+
st.write(formatted_token_ids)
|
| 162 |
+
|
| 163 |
# Load the model
|
| 164 |
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')
|
| 165 |
|