alessio-vertemati committed on
Commit
dfd9c92
·
1 Parent(s): 0a85b87

Add basic token counter

Browse files
Files changed (5) hide show
  1. .gitignore +13 -0
  2. .python-version +1 -0
  3. app.py +222 -0
  4. pyproject.toml +12 -0
  5. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+
13
+ .env
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
app.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import asyncio
4
+ import json
5
+ import tiktoken
6
+ import requests
7
+ from typing import List, Tuple, Optional
8
+ from dataclasses import dataclass
9
+ from dotenv import load_dotenv
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+
14
def count_tokens(text: str, model: str) -> Tuple[int, str]:
    """Count the tokens in ``text`` using the tiktoken encoding for ``model``.

    Args:
        text: The input text to tokenize.
        model: The model name used to look up the encoding.

    Returns:
        Tuple of (token_count, status_message). When the text is empty the
        count is 0 and the message is "No text provided"; when the encoding
        lookup or tokenization fails the count is 0 and the message starts
        with "Error:".
    """
    if not text:
        return 0, "No text provided"

    try:
        encoding = tiktoken.encoding_for_model(model)
        # Arbitrary user text may literally contain special-token markers
        # (e.g. "<|endoftext|>"). tiktoken rejects those by default, so
        # disable the check and let them be encoded as ordinary text.
        token_total = len(encoding.encode(text, disallowed_special=()))
    except Exception as e:
        # UI boundary: surface the problem in the status box instead of
        # letting the event handler crash.
        return 0, f"Error: {str(e)}"

    return token_total, f"✓ Counted {token_total} tokens using {model} encoding"
33
+
34
def count_tokens_from_url(url: str, model: str) -> Tuple[int, int, str]:
    """Fetch a URL as HTML and as Markdown and count the tokens of each.

    The URL is requested twice, once with ``Accept: text/html`` and once
    with ``Accept: text/markdown``; each response body is tokenized with the
    tiktoken encoding for ``model``.

    Args:
        url: The URL to fetch. Surrounding whitespace is ignored.
        model: The model name used to look up the encoding.

    Returns:
        Tuple of (html_token_count, markdown_token_count, status_message).
        Both counts are 0 when no URL is given or when fetching/tokenizing
        fails; the status message then describes the problem.
    """
    # Treat None and whitespace-only input the same as an empty field.
    url = (url or "").strip()
    if not url:
        return 0, 0, "No URL provided"

    try:
        # Resolve the encoding first so an unsupported model fails before
        # any network request is made.
        encoding = tiktoken.encoding_for_model(model)

        bodies = []
        for accept in ("text/html", "text/markdown"):
            response = requests.get(
                url,
                headers={"Accept": accept},
                timeout=10,
            )
            response.raise_for_status()
            bodies.append(response.text)
        html_content, markdown_content = bodies

        # Fetched pages may contain literal special-token markers such as
        # "<|endoftext|>"; encode them as plain text instead of raising.
        html_tokens = len(encoding.encode(html_content, disallowed_special=()))
        markdown_tokens = len(
            encoding.encode(markdown_content, disallowed_special=())
        )

        status = f"✓ Fetched from {url}\n"
        status += f"HTML: {html_tokens} tokens ({len(html_content)} chars)\n"
        status += f"Markdown: {markdown_tokens} tokens ({len(markdown_content)} chars)"

        return html_tokens, markdown_tokens, status

    except requests.exceptions.RequestException as e:
        return 0, 0, f"Error fetching URL: {str(e)}"
    except Exception as e:
        # UI boundary: report unexpected failures in the status box.
        return 0, 0, f"Error: {str(e)}"
81
+
82
def main():
    """Create and launch the Gradio interface.

    Builds a two-tab UI: "Text Input" counts tokens of pasted text via
    ``count_tokens`` and "URL Input" counts tokens of a fetched page via
    ``count_tokens_from_url``. Both tabs offer the same model choices.
    """
    # Single source of truth for the model names offered by both dropdowns.
    model_choices = [
        # reasoning
        "o1",
        "o3",
        "o4-mini",
        # chat
        "gpt-5",
        "gpt-4.1",
        "gpt-4o",
        "gpt-4",
        "gpt-3.5-turbo",
        "gpt-3.5",
        "gpt-35-turbo",

        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large",

        "davinci-002",
        "babbage-002",
    ]
    default_model = "gpt-4.1"

    with gr.Blocks(title="Token counter") as demo:
        gr.Markdown("""
        # Token Counter
        Count tokens in your text supporting different model encodings. Uses `tiktoken` to estimate the token count.
        """)

        with gr.Tabs():
            with gr.Tab("Text Input"):
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            label="Input Text",
                            placeholder="Enter your text here...",
                            lines=10,
                            max_lines=20,
                        )

                        model_dropdown = gr.Dropdown(
                            choices=model_choices,
                            value=default_model,
                            label="Model",
                        )

                        count_btn = gr.Button("Count Tokens", variant="primary")

                    with gr.Column():
                        token_count = gr.Number(
                            label="Token Count",
                            value=0,
                            interactive=False,
                        )

                        status_msg = gr.Textbox(
                            label="Status",
                            interactive=False,
                        )

                # Count on button click, on every text change (real-time
                # feedback), and when the model changes so the displayed
                # count never refers to a previously selected model.
                for register in (
                    count_btn.click,
                    text_input.change,
                    model_dropdown.change,
                ):
                    register(
                        fn=count_tokens,
                        inputs=[text_input, model_dropdown],
                        outputs=[token_count, status_msg],
                    )

            with gr.Tab("URL Input"):
                with gr.Row():
                    with gr.Column():
                        url_input = gr.Textbox(
                            label="URL",
                            placeholder="https://oneofftech.xyz/blog/parxing-week-2025/?utm=token-counter",
                            lines=1,
                        )

                        url_model_dropdown = gr.Dropdown(
                            choices=model_choices,
                            value=default_model,
                            label="Model",
                        )

                        url_count_btn = gr.Button(
                            "Count Tokens from URL", variant="primary"
                        )

                    with gr.Column():
                        html_token_count = gr.Number(
                            label="HTML Token Count",
                            value=0,
                            interactive=False,
                        )

                        markdown_token_count = gr.Number(
                            label="Markdown Token Count",
                            value=0,
                            interactive=False,
                        )

                        url_status_msg = gr.Textbox(
                            label="Status",
                            interactive=False,
                            lines=3,
                        )

                # Fetching is only triggered explicitly: it performs two
                # network requests, so it is not wired to change events.
                url_count_btn.click(
                    fn=count_tokens_from_url,
                    inputs=[url_input, url_model_dropdown],
                    outputs=[html_token_count, markdown_token_count, url_status_msg],
                )

    demo.launch(theme=gr.themes.Soft())
220
+
221
+ if __name__ == "__main__":
222
+ main()
pyproject.toml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "token-counter"
3
+ version = "0.1.0"
4
+ description = "Text token counter"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "gradio[mcp]>=6.0.0",
9
+ "requests>=2.28",
10
+ "python-dotenv>=1.2.1",
11
+ "tiktoken>=0.12.0",
12
+ ]
uv.lock ADDED
The diff for this file is too large to render. See raw diff