muhsin committed on
Commit
78c640c
·
1 Parent(s): 8988399

remove space saving

Browse files
Files changed (3) hide show
  1. app.py +5 -17
  2. dev_notebooks/01_out_of_box.ipynb +9 -14
  3. utils/metrics.py +3 -7
app.py CHANGED
@@ -29,11 +29,12 @@ text = st.text_area(
29
  )
30
 
31
  if text:
32
- col1, col2, col3 = st.columns(3)
33
 
34
  unicode_bytes = text.encode("utf-8")
35
  enc = tiktoken.get_encoding("cl100k_base")
36
  tokens = enc.encode(text)
 
37
 
38
  with col1:
39
  st.metric("Raw Text Length", f"{len(text)} chars")
@@ -44,25 +45,13 @@ if text:
44
  with col3:
45
  st.metric("Token Count", f"{len(tokens)} tokens")
46
 
47
- compression = calculate_compression(text, tokens)
48
- st.markdown("### Compression Analysis")
49
-
50
- comp_col1, comp_col2 = st.columns(2)
51
- with comp_col1:
52
  st.metric(
53
  "Compression Ratio",
54
- f"{compression['compression_ratio']:.2f}x",
55
  help="Higher ratio means better compression. Shows how many characters are represented by each token on average."
56
  )
57
 
58
- with comp_col2:
59
- st.metric(
60
- "Space Saving",
61
- f"{compression['space_saving_percentage']:.1f}%",
62
- help="Percentage of space saved by using tokens instead of raw text."
63
- )
64
-
65
- st.markdown("### Token Details")
66
  token_col1, token_col2 = st.columns(2)
67
 
68
  with token_col1:
@@ -78,6 +67,5 @@ if text:
78
  - **Raw Text Length**: Number of characters in your input text
79
  - **UTF-8 Encoded Length**: Size of text when encoded in UTF-8 format
80
  - **Token Count**: Number of tokens the text is broken into
81
- - **Compression Ratio**: How many characters are represented by each token on average
82
- - **Space Saving**: Percentage reduction in size when using tokens vs raw text
83
  """)
 
29
  )
30
 
31
  if text:
32
+ col1, col2, col3, col4 = st.columns(4)
33
 
34
  unicode_bytes = text.encode("utf-8")
35
  enc = tiktoken.get_encoding("cl100k_base")
36
  tokens = enc.encode(text)
37
+ compression_ratio = calculate_compression(unicode_bytes, tokens)
38
 
39
  with col1:
40
  st.metric("Raw Text Length", f"{len(text)} chars")
 
45
  with col3:
46
  st.metric("Token Count", f"{len(tokens)} tokens")
47
 
48
+ with col4:
 
 
 
 
49
  st.metric(
50
  "Compression Ratio",
51
+ f"{compression_ratio:.2f}x",
52
  help="Higher ratio means better compression. Shows how many characters are represented by each token on average."
53
  )
54
 
 
 
 
 
 
 
 
 
55
  token_col1, token_col2 = st.columns(2)
56
 
57
  with token_col1:
 
67
  - **Raw Text Length**: Number of characters in your input text
68
  - **UTF-8 Encoded Length**: Size of text when encoded in UTF-8 format
69
  - **Token Count**: Number of tokens the text is broken into
70
+ - **Compression Ratio**: How many UTF-8 bytes are represented by each token on average (bytes/token)
 
71
  """)
dev_notebooks/01_out_of_box.ipynb CHANGED
@@ -44,10 +44,7 @@
44
  "metadata": {},
45
  "outputs": [],
46
  "source": [
47
- "text = \"\"\"ഇതൊരു Malayalam ടോക്കനൈസർ ആണ് 🪧. \n",
48
- "വാചകത്തെ ടോക്കണുകൾ എന്ന് വിളിക്കുന്ന ചെറിയ കഷണങ്ങളായി വിഭജിക്കുന്ന ഒരു method aanu ടോക്കനൈസർ. \n",
49
- "ഈ ടോക്കണുകൾ വാക്കുകളോ വാക്കുകളുടെ ഭാഗങ്ങളോ പ്രതീകങ്ങളോ ആകാം. \n",
50
- "ടെക്‌സ്‌റ്റ് കൂടുതൽ കാര്യക്ഷമമായി മനസ്സിലാക്കാനും പ്രോസസ്സ് ചെയ്യാനും ഇത് language modelukale സഹായിക്കുന്നു.🎂\"\"\""
51
  ]
52
  },
53
  {
@@ -59,7 +56,7 @@
59
  "name": "stdout",
60
  "output_type": "stream",
61
  "text": [
62
- "Length of the input text: 300\n"
63
  ]
64
  }
65
  ],
@@ -76,8 +73,8 @@
76
  "name": "stdout",
77
  "output_type": "stream",
78
  "text": [
79
- "Length of the encoded text: 750\n",
80
- "First few bytes: [224, 180, 135, 224, 180, 164, 224, 181, 138, 224]\n"
81
  ]
82
  }
83
  ],
@@ -123,8 +120,8 @@
123
  "name": "stdout",
124
  "output_type": "stream",
125
  "text": [
126
- "Length of the tokenized text: 428\n",
127
- "First few token ids: [34839, 229, 34839, 97, 51211, 232, 34839, 108, 51211, 223]\n"
128
  ]
129
  }
130
  ],
@@ -136,22 +133,20 @@
136
  },
137
  {
138
  "cell_type": "code",
139
- "execution_count": 11,
140
  "metadata": {},
141
  "outputs": [
142
  {
143
  "name": "stdout",
144
  "output_type": "stream",
145
  "text": [
146
- "Compression ratio: 0.7009345794392523\n",
147
- "Space saving percentage: -42.66666666666667\n"
148
  ]
149
  }
150
  ],
151
  "source": [
152
  "compression = calculate_compression(text, tiktoken_ids)\n",
153
- "print('Compression ratio: ', compression[\"compression_ratio\"])\n",
154
- "print('Space saving percentage: ', compression[\"space_saving_percentage\"])"
155
  ]
156
  },
157
  {
 
44
  "metadata": {},
45
  "outputs": [],
46
  "source": [
47
+ "text = \"\"\"english\"\"\""
 
 
 
48
  ]
49
  },
50
  {
 
56
  "name": "stdout",
57
  "output_type": "stream",
58
  "text": [
59
+ "Length of the input text: 7\n"
60
  ]
61
  }
62
  ],
 
73
  "name": "stdout",
74
  "output_type": "stream",
75
  "text": [
76
+ "Length of the encoded text: 7\n",
77
+ "First few bytes: [101, 110, 103, 108, 105, 115, 104]\n"
78
  ]
79
  }
80
  ],
 
120
  "name": "stdout",
121
  "output_type": "stream",
122
  "text": [
123
+ "Length of the tokenized text: 1\n",
124
+ "First few token ids: [30220]\n"
125
  ]
126
  }
127
  ],
 
133
  },
134
  {
135
  "cell_type": "code",
136
+ "execution_count": 12,
137
  "metadata": {},
138
  "outputs": [
139
  {
140
  "name": "stdout",
141
  "output_type": "stream",
142
  "text": [
143
+ "Compression ratio: 7.0\n"
 
144
  ]
145
  }
146
  ],
147
  "source": [
148
  "compression = calculate_compression(text, tiktoken_ids)\n",
149
+ "print('Compression ratio: ', compression)\n"
 
150
  ]
151
  },
152
  {
utils/metrics.py CHANGED
@@ -1,6 +1,6 @@
1
- from typing import List, Dict
2
 
3
- def calculate_compression(unicode_bytes: List[int], tokens: List[int]) -> Dict[str, float]:
4
  raw_size = len(unicode_bytes)
5
  token_size = len(tokens)
6
 
@@ -8,9 +8,5 @@ def calculate_compression(unicode_bytes: List[int], tokens: List[int]) -> Dict[s
8
  raise ValueError("Raw text size cannot be zero.")
9
 
10
  compression_ratio = raw_size / token_size if token_size > 0 else float('inf')
11
- space_saving = (1 - (token_size / raw_size)) * 100 if token_size > 0 else 100.0
12
 
13
- return {
14
- "compression_ratio": compression_ratio,
15
- "space_saving_percentage": space_saving
16
- }
 
1
+ from typing import List
2
 
3
+ def calculate_compression(unicode_bytes: List[int], tokens: List[int]) -> float:
4
  raw_size = len(unicode_bytes)
5
  token_size = len(tokens)
6
 
 
8
  raise ValueError("Raw text size cannot be zero.")
9
 
10
  compression_ratio = raw_size / token_size if token_size > 0 else float('inf')
 
11
 
12
+ return compression_ratio