muhsin committed on
Commit
78c640c
·
1 Parent(s): 8988399

remove space saving

Browse files
Files changed (3) hide show
  1. app.py +5 -17
  2. dev_notebooks/01_out_of_box.ipynb +9 -14
  3. utils/metrics.py +3 -7
app.py CHANGED
@@ -29,11 +29,12 @@ text = st.text_area(
29
  )
30
 
31
  if text:
32
- col1, col2, col3 = st.columns(3)
33
 
34
  unicode_bytes = text.encode("utf-8")
35
  enc = tiktoken.get_encoding("cl100k_base")
36
  tokens = enc.encode(text)
 
37
 
38
  with col1:
39
  st.metric("Raw Text Length", f"{len(text)} chars")
@@ -44,25 +45,13 @@ if text:
44
  with col3:
45
  st.metric("Token Count", f"{len(tokens)} tokens")
46
 
47
- compression = calculate_compression(text, tokens)
48
- st.markdown("### Compression Analysis")
49
-
50
- comp_col1, comp_col2 = st.columns(2)
51
- with comp_col1:
52
  st.metric(
53
  "Compression Ratio",
54
- f"{compression['compression_ratio']:.2f}x",
55
  help="Higher ratio means better compression. Shows how many characters are represented by each token on average."
56
  )
57
 
58
- with comp_col2:
59
- st.metric(
60
- "Space Saving",
61
- f"{compression['space_saving_percentage']:.1f}%",
62
- help="Percentage of space saved by using tokens instead of raw text."
63
- )
64
-
65
- st.markdown("### Token Details")
66
  token_col1, token_col2 = st.columns(2)
67
 
68
  with token_col1:
@@ -78,6 +67,5 @@ if text:
78
  - **Raw Text Length**: Number of characters in your input text
79
  - **UTF-8 Encoded Length**: Size of text when encoded in UTF-8 format
80
  - **Token Count**: Number of tokens the text is broken into
81
- - **Compression Ratio**: How many characters are represented by each token on average
82
- - **Space Saving**: Percentage reduction in size when using tokens vs raw text
83
  """)
 
29
  )
30
 
31
  if text:
32
+ col1, col2, col3, col4 = st.columns(4)
33
 
34
  unicode_bytes = text.encode("utf-8")
35
  enc = tiktoken.get_encoding("cl100k_base")
36
  tokens = enc.encode(text)
37
+ compression_ratio = calculate_compression(unicode_bytes, tokens)
38
 
39
  with col1:
40
  st.metric("Raw Text Length", f"{len(text)} chars")
 
45
  with col3:
46
  st.metric("Token Count", f"{len(tokens)} tokens")
47
 
48
+ with col4:
 
 
 
 
49
  st.metric(
50
  "Compression Ratio",
51
+ f"{compression_ratio:.2f}x",
52
  help="Higher ratio means better compression. Shows how many characters are represented by each token on average."
53
  )
54
 
 
 
 
 
 
 
 
 
55
  token_col1, token_col2 = st.columns(2)
56
 
57
  with token_col1:
 
67
  - **Raw Text Length**: Number of characters in your input text
68
  - **UTF-8 Encoded Length**: Size of text when encoded in UTF-8 format
69
  - **Token Count**: Number of tokens the text is broken into
70
+ - **Compression Ratio**: How many UTF-8 bytes are represented by each token on average (bytes/token)
 
71
  """)
dev_notebooks/01_out_of_box.ipynb CHANGED
@@ -44,10 +44,7 @@
44
  "metadata": {},
45
  "outputs": [],
46
  "source": [
47
- "text = \"\"\"ഇതൊരു Malayalam ടോക്കനൈസർ ആണ് 🪧. \n",
48
- "വാചകത്തെ ടോക്കണുകൾ എന്ന് വിളിക്കുന്ന ചെറിയ കഷണങ്ങളായി വിഭജിക്കുന്ന ഒരു method aanu ടോക്കനൈസർ. \n",
49
- "ഈ ടോക്കണുകൾ വാക്കുകളോ വാക്കുകളുടെ ഭാഗങ്ങളോ പ്രതീകങ്ങളോ ആകാം. \n",
50
- "ടെക്‌സ്‌റ്റ് കൂടുതൽ കാര്യക്ഷമമായി മനസ്സിലാക്കാനും പ്രോസസ്സ് ചെയ്യാനും ഇത് language modelukale സഹായിക്കുന്നു.🎂\"\"\""
51
  ]
52
  },
53
  {
@@ -59,7 +56,7 @@
59
  "name": "stdout",
60
  "output_type": "stream",
61
  "text": [
62
- "Length of the input text: 300\n"
63
  ]
64
  }
65
  ],
@@ -76,8 +73,8 @@
76
  "name": "stdout",
77
  "output_type": "stream",
78
  "text": [
79
- "Length of the encoded text: 750\n",
80
- "First few bytes: [224, 180, 135, 224, 180, 164, 224, 181, 138, 224]\n"
81
  ]
82
  }
83
  ],
@@ -123,8 +120,8 @@
123
  "name": "stdout",
124
  "output_type": "stream",
125
  "text": [
126
- "Length of the tokenized text: 428\n",
127
- "First few token ids: [34839, 229, 34839, 97, 51211, 232, 34839, 108, 51211, 223]\n"
128
  ]
129
  }
130
  ],
@@ -136,22 +133,20 @@
136
  },
137
  {
138
  "cell_type": "code",
139
- "execution_count": 11,
140
  "metadata": {},
141
  "outputs": [
142
  {
143
  "name": "stdout",
144
  "output_type": "stream",
145
  "text": [
146
- "Compression ratio: 0.7009345794392523\n",
147
- "Space saving percentage: -42.66666666666667\n"
148
  ]
149
  }
150
  ],
151
  "source": [
152
  "compression = calculate_compression(text, tiktoken_ids)\n",
153
- "print('Compression ratio: ', compression[\"compression_ratio\"])\n",
154
- "print('Space saving percentage: ', compression[\"space_saving_percentage\"])"
155
  ]
156
  },
157
  {
 
44
  "metadata": {},
45
  "outputs": [],
46
  "source": [
47
+ "text = \"\"\"english\"\"\""
 
 
 
48
  ]
49
  },
50
  {
 
56
  "name": "stdout",
57
  "output_type": "stream",
58
  "text": [
59
+ "Length of the input text: 7\n"
60
  ]
61
  }
62
  ],
 
73
  "name": "stdout",
74
  "output_type": "stream",
75
  "text": [
76
+ "Length of the encoded text: 7\n",
77
+ "First few bytes: [101, 110, 103, 108, 105, 115, 104]\n"
78
  ]
79
  }
80
  ],
 
120
  "name": "stdout",
121
  "output_type": "stream",
122
  "text": [
123
+ "Length of the tokenized text: 1\n",
124
+ "First few token ids: [30220]\n"
125
  ]
126
  }
127
  ],
 
133
  },
134
  {
135
  "cell_type": "code",
136
+ "execution_count": 12,
137
  "metadata": {},
138
  "outputs": [
139
  {
140
  "name": "stdout",
141
  "output_type": "stream",
142
  "text": [
143
+ "Compression ratio: 7.0\n"
 
144
  ]
145
  }
146
  ],
147
  "source": [
148
  "compression = calculate_compression(text, tiktoken_ids)\n",
149
+ "print('Compression ratio: ', compression)\n"
 
150
  ]
151
  },
152
  {
utils/metrics.py CHANGED
@@ -1,6 +1,6 @@
1
- from typing import List, Dict
2
 
3
- def calculate_compression(unicode_bytes: List[int], tokens: List[int]) -> Dict[str, float]:
4
  raw_size = len(unicode_bytes)
5
  token_size = len(tokens)
6
 
@@ -8,9 +8,5 @@ def calculate_compression(unicode_bytes: List[int], tokens: List[int]) -> Dict[s
8
  raise ValueError("Raw text size cannot be zero.")
9
 
10
  compression_ratio = raw_size / token_size if token_size > 0 else float('inf')
11
- space_saving = (1 - (token_size / raw_size)) * 100 if token_size > 0 else 100.0
12
 
13
- return {
14
- "compression_ratio": compression_ratio,
15
- "space_saving_percentage": space_saving
16
- }
 
1
+ from typing import List
2
 
3
+ def calculate_compression(unicode_bytes: List[int], tokens: List[int]) -> float:
4
  raw_size = len(unicode_bytes)
5
  token_size = len(tokens)
6
 
 
8
  raise ValueError("Raw text size cannot be zero.")
9
 
10
  compression_ratio = raw_size / token_size if token_size > 0 else float('inf')
 
11
 
12
+ return compression_ratio