mathminakshi commited on
Commit
5fa0c2c
·
verified ·
1 Parent(s): a384cc1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -112
app.py CHANGED
@@ -1,113 +1,113 @@
1
- import streamlit as st
2
- from bpe import Tokenizer
3
- import random
4
- import colorsys
5
-
6
- # Set page config
7
- st.set_page_config(
8
- page_title="English BPE Tokenizer Visualizer",
9
- layout="wide"
10
- )
11
-
12
- # Load the trained tokenizer
13
- @st.cache_resource
14
- def load_tokenizer():
15
- tokenizer = Tokenizer()
16
- tokenizer.load("models/EnglishBPE_6999.model.model")
17
- return tokenizer
18
-
19
- # Load example texts
20
- @st.cache_data
21
- def load_examples():
22
- try:
23
- with open("data/testdata1.txt", "r", encoding="utf-8") as f:
24
- example1 = f.read().strip()
25
- with open("data/testdata2.txt", "r", encoding="utf-8") as f:
26
- example2 = f.read().strip()
27
- except Exception as e:
28
- st.error(f"Error loading example texts: {str(e)}")
29
- # Fallback examples in case files can't be loaded
30
-
31
- return example1, example2
32
-
33
- def generate_distinct_colors(n):
34
- colors = []
35
- for i in range(n):
36
- hue = i / n
37
- saturation = 0.7 + random.random() * 0.3
38
- value = 0.8 + random.random() * 0.2
39
- rgb = colorsys.hsv_to_rgb(hue, saturation, value)
40
- hex_color = "#{:02x}{:02x}{:02x}".format(
41
- int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
42
- )
43
- colors.append(hex_color)
44
- return colors
45
-
46
- def process_text(text, tokenizer):
47
- try:
48
- # Get tokens
49
- tokens = tokenizer.encode(text)
50
-
51
- # Generate colors for visualization
52
- unique_tokens = list(set(tokens))
53
- colors = generate_distinct_colors(len(unique_tokens))
54
- token_colors = dict(zip(unique_tokens, colors))
55
-
56
- # Create HTML visualization
57
- html_parts = []
58
- decoded_tokens = [tokenizer.decode([token]) for token in tokens]
59
-
60
- for token, token_text in zip(tokens, decoded_tokens):
61
- color = token_colors[token]
62
- html_parts.append(f'<span style="background-color: {color}; padding: 0 2px; border-radius: 3px;" title="Token ID: {token}">{token_text}</span>')
63
-
64
- return ''.join(html_parts), tokens
65
- except Exception as e:
66
- return f"<span style='color: red'>Error processing text: {str(e)}</span>", None
67
-
68
- def main():
69
- # Load tokenizer and examples
70
- tokenizer = load_tokenizer()
71
- example1, example2 = load_examples()
72
-
73
- # Title and description
74
- st.title("English BPE Tokenizer Visualizer")
75
- st.markdown("Enter text to see how it gets tokenized, with color-coded visualization")
76
-
77
- # Example selector
78
- example_option = st.selectbox(
79
- "Choose an example or enter your own text below:",
80
- ["Custom Input", "Example 1", "Example 2"]
81
- )
82
-
83
- # Text input
84
- if example_option == "Example 1":
85
- text = st.text_area("Enter Text", value=example1, height=100)
86
- elif example_option == "Example 2":
87
- text = st.text_area("Enter Text", value=example2, height=100)
88
- else:
89
- text = st.text_area("Enter Text", height=100)
90
-
91
- # Process button
92
- if st.button("Process Text") or text:
93
- if text.strip():
94
- # Create two columns for output
95
- col1, col2 = st.columns([2, 1])
96
-
97
- # Process the text
98
- visualization, tokens = process_text(text, tokenizer)
99
-
100
- with col1:
101
- st.subheader("Visualization")
102
- st.markdown(visualization, unsafe_allow_html=True)
103
-
104
- with col2:
105
- if tokens is not None:
106
- st.subheader("Token Information")
107
- st.write(f"Token count: {len(tokens)}")
108
- st.write("Tokens:", tokens)
109
- else:
110
- st.warning("Please enter some text to process.")
111
-
112
- if __name__ == "__main__":
113
  main()
 
1
import colorsys
import html
import random

import streamlit as st

from bpe import Tokenizer
5
+
6
# Configure the Streamlit page.
# NOTE: per Streamlit docs, set_page_config must be the first Streamlit
# command executed in the script, so it stays at the top of the module.
st.set_page_config(
    page_title="English BPE Tokenizer Visualizer",
    layout="wide"  # full browser width; main() renders a two-column layout
)
11
+
12
# Load the trained tokenizer
@st.cache_resource
def load_tokenizer():
    """Build a Tokenizer and load the trained BPE model from disk.

    Cached with st.cache_resource so the model file is read only once
    per server process, not on every Streamlit rerun.
    """
    # NOTE(review): the double ".model.model" extension looks odd but is
    # reproduced verbatim — confirm it matches the file actually on disk.
    bpe_tokenizer = Tokenizer()
    bpe_tokenizer.load("models/EnglishBPE_5000.model.model")
    return bpe_tokenizer
18
+
19
# Load example texts
@st.cache_data
def load_examples():
    """Load the two example texts shown in the example selector.

    Returns:
        tuple[str, str]: (example1, example2) read from data/testdata1.txt
        and data/testdata2.txt, or built-in fallback snippets if either
        file cannot be read.
    """
    # Fallback examples in case files can't be loaded. Bug fix: the
    # original assigned nothing on the error path, so the final `return`
    # raised UnboundLocalError and hid the real I/O error.
    example1 = "The quick brown fox jumps over the lazy dog."
    example2 = "Byte pair encoding merges the most frequent pairs of symbols."
    try:
        with open("data/testdata1.txt", "r", encoding="utf-8") as f:
            example1 = f.read().strip()
        with open("data/testdata2.txt", "r", encoding="utf-8") as f:
            example2 = f.read().strip()
    except Exception as e:
        # Surface the problem in the UI but keep the app usable.
        st.error(f"Error loading example texts: {str(e)}")

    return example1, example2
32
+
33
+ def generate_distinct_colors(n):
34
+ colors = []
35
+ for i in range(n):
36
+ hue = i / n
37
+ saturation = 0.7 + random.random() * 0.3
38
+ value = 0.8 + random.random() * 0.2
39
+ rgb = colorsys.hsv_to_rgb(hue, saturation, value)
40
+ hex_color = "#{:02x}{:02x}{:02x}".format(
41
+ int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
42
+ )
43
+ colors.append(hex_color)
44
+ return colors
45
+
46
def process_text(text, tokenizer):
    """Tokenize ``text`` and build a color-coded HTML visualization.

    Args:
        text: The input string to tokenize.
        tokenizer: A loaded BPE Tokenizer with ``encode``/``decode`` methods.

    Returns:
        tuple: ``(html_string, tokens)`` on success, where each token's
        decoded text is wrapped in a colored ``<span>``; on failure,
        ``(error_html, None)``.
    """
    try:
        # Get tokens
        tokens = tokenizer.encode(text)

        # Assign one distinct color per unique token id.
        unique_tokens = list(set(tokens))
        colors = generate_distinct_colors(len(unique_tokens))
        token_colors = dict(zip(unique_tokens, colors))

        # Create HTML visualization.
        html_parts = []
        decoded_tokens = [tokenizer.decode([token]) for token in tokens]

        for token, token_text in zip(tokens, decoded_tokens):
            color = token_colors[token]
            # Bug fix: escape the decoded text — it is user-controlled and is
            # rendered with unsafe_allow_html=True, so raw <, >, & would break
            # or inject markup into the visualization.
            safe_text = html.escape(token_text)
            html_parts.append(
                f'<span style="background-color: {color}; padding: 0 2px; '
                f'border-radius: 3px;" title="Token ID: {token}">{safe_text}</span>'
            )

        return ''.join(html_parts), tokens
    except Exception as e:
        # Escape the exception text too — it may echo fragments of the input.
        return f"<span style='color: red'>Error processing text: {html.escape(str(e))}</span>", None
67
+
68
def main():
    """Render the Streamlit UI: input area, tokenization, and token details."""
    # Load tokenizer and example texts (both cached across reruns).
    tokenizer = load_tokenizer()
    example1, example2 = load_examples()

    # Title and description
    st.title("English BPE Tokenizer Visualizer")
    st.markdown("Enter text to see how it gets tokenized, with color-coded visualization")

    # Example selector
    example_option = st.selectbox(
        "Choose an example or enter your own text below:",
        ["Custom Input", "Example 1", "Example 2"]
    )

    # Text input: pre-fill the text area when a preset example is chosen.
    presets = {"Example 1": example1, "Example 2": example2}
    if example_option in presets:
        text = st.text_area("Enter Text", value=presets[example_option], height=100)
    else:
        text = st.text_area("Enter Text", height=100)

    # Process when the button is clicked or when any text is present.
    if st.button("Process Text") or text:
        if not text.strip():
            st.warning("Please enter some text to process.")
            return

        # Two-column output: visualization (wide) and token details (narrow).
        viz_col, info_col = st.columns([2, 1])

        visualization, tokens = process_text(text, tokenizer)

        with viz_col:
            st.subheader("Visualization")
            st.markdown(visualization, unsafe_allow_html=True)

        with info_col:
            # tokens is None when process_text hit an error.
            if tokens is not None:
                st.subheader("Token Information")
                st.write(f"Token count: {len(tokens)}")
                st.write("Tokens:", tokens)

if __name__ == "__main__":
    main()