# Token Counter — Gradio app (Hugging Face Space)
| import gradio as gr | |
| import json | |
| import pandas as pd | |
| import tiktoken | |
| import anthropic | |
| def process_csv(file, calculate_openai, openai_model, calculate_anthropic, anthropic_model): | |
| # Check if file is uploaded | |
| if file is None: | |
| return "Please upload a CSV file." | |
| # Read the CSV file | |
| try: | |
| df = pd.read_csv(file)#.name) | |
| except Exception as e: | |
| return f"Error reading CSV file: {e}" | |
| # Initialize output string | |
| output = "" | |
| if calculate_openai: | |
| # Get the OpenAI tokenizer for the selected model | |
| try: | |
| openai_encoding = tiktoken.encoding_for_model(openai_model) | |
| except KeyError: | |
| # Default encoding if model is not found | |
| openai_encoding = tiktoken.get_encoding("cl100k_base") | |
| token_counts_openai = {} | |
| try: | |
| total_tokens_openai = len(openai_encoding.encode(df.to_csv(index=False))) | |
| except Exception as e: | |
| return f"Error counting tokens with OpenAI model: {e}" | |
| # Iterate over columns | |
| for col in df.columns: | |
| #tokens_col_openai = 0 | |
| try: | |
| tokens_openai = openai_encoding.encode('\n'.join([col]+list(df[col].astype(str).values))) | |
| except Exception as e: | |
| return f"Error counting tokens with OpenAI model: {e}" | |
| # for cell in df[col].astype(str): | |
| # tokens_openai = openai_encoding.encode(cell) | |
| # tokens_col_openai += len(tokens_openai) | |
| token_counts_openai[col] = len(tokens_openai) | |
| #total_tokens_openai += tokens_openai | |
| # Prepare OpenAI output | |
| output += f"\n**Total OpenAI Tokens ({openai_model}): {total_tokens_openai}**\n" | |
| output += f"\n**OpenAI Token Counts per Column ({openai_model}):**\n\n" | |
| for col, count in token_counts_openai.items(): | |
| output += f"- {col}: {count} tokens\n" | |
| if calculate_anthropic: | |
| # Get the Anthropic API key from environment variables | |
| #anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY") | |
| #if not anthropic_api_key: | |
| # return "Please set the ANTHROPIC_API_KEY environment variable." | |
| # Initialize the Anthropic client | |
| #client = anthropic.Anthropic(api_key=anthropic_api_key) | |
| client = anthropic.Anthropic() | |
| token_counts_anthropic = {} | |
| #total_tokens_anthropic = client.count_tokens(df.to_csv(index=False)) | |
| try: | |
| response = client.beta.messages.count_tokens( | |
| betas=["token-counting-2024-11-01"], | |
| model=anthropic_model, #"claude-3-5-sonnet-20241022", | |
| #system="You are a scientist", | |
| messages=[{ | |
| "role": "user", | |
| "content": df.to_csv(index=False) | |
| }], | |
| ) | |
| total_tokens_anthropic = json.loads(response.json())['input_tokens'] | |
| except Exception as e: | |
| return f"Error counting tokens with Anthropic model: {e}" | |
| # Iterate over columns | |
| for col in df.columns: | |
| #tokens_col_anthropic = 0 | |
| try: | |
| #tokens_anthropic = client.count_tokens('\n'.join([col]+list(df[col].astype(str).values))) #0.37.1 version | |
| response = client.beta.messages.count_tokens( | |
| betas=["token-counting-2024-11-01"], | |
| model=anthropic_model, | |
| messages=[{ | |
| "role": "user", | |
| "content": '\n'.join([col]+list(df[col].astype(str).values)) | |
| }], | |
| ) | |
| tokens_anthropic = json.loads(response.json())['input_tokens'] | |
| except Exception as e: | |
| return f"Error counting tokens with Anthropic model: {e}" | |
| # for cell in df[col].astype(str): | |
| # try: | |
| # tokens_anthropic = client.count_tokens(cell) | |
| # except Exception as e: | |
| # return f"Error counting tokens with Anthropic model: {e}" | |
| # tokens_col_anthropic += tokens_anthropic | |
| token_counts_anthropic[col] = tokens_anthropic | |
| #total_tokens_anthropic += tokens_anthropic | |
| # Prepare Anthropic output | |
| output += f"\n**Total Anthropic Tokens ({anthropic_model}): {total_tokens_anthropic}**\n" | |
| output += f"\n**Anthropic Token Counts per Column ({anthropic_model}):**\n" | |
| for col, count in token_counts_anthropic.items(): | |
| output += f"- {col}: {count} tokens\n" | |
| if not calculate_openai and not calculate_anthropic: | |
| output = "Please select at least one model to calculate tokens." | |
| return output | |
def main():
    """Assemble the Gradio interface and start the local server."""

    def _toggle(checked):
        # Show a model dropdown only while its checkbox is ticked.
        return gr.update(visible=checked)

    with gr.Blocks() as app:
        gr.Markdown("# Token Counter")
        gr.Markdown("Upload a CSV file to see token counts per column and total tokens.")
        gr.Markdown("""
For OpenAI models Python package `tiktoken` is used.
For Anthropic models beta version of [Token counting](https://docs.anthropic.com/en/docs/build-with-claude/token-counting) is used.
""")
        with gr.Row():
            csv_file = gr.File(label="Upload CSV File", type="filepath")
        with gr.Row():
            use_openai = gr.Checkbox(label="Calculate tokens for OpenAI models")
            use_anthropic = gr.Checkbox(label="Calculate tokens for Anthropic models")
        with gr.Row():
            openai_choice = gr.Dropdown(
                choices=['gpt-4o', 'gpt-4o-mini', 'gpt-4'],
                label="Select OpenAI Model",
                visible=False,
            )
            anthropic_choice = gr.Dropdown(
                choices=['claude-3-5-sonnet-latest', 'claude-3-5-haiku-latest', 'claude-3-opus-latest', 'claude-3-haiku-20240307'],
                label="Select Anthropic Model",
                visible=False,
            )

        # One shared callback drives both dropdowns' visibility.
        use_openai.change(fn=_toggle, inputs=use_openai, outputs=openai_choice)
        use_anthropic.change(fn=_toggle, inputs=use_anthropic, outputs=anthropic_choice)

        run_btn = gr.Button("Calculate Tokens")
        result = gr.Markdown()
        run_btn.click(
            fn=process_csv,
            inputs=[csv_file, use_openai, openai_choice, use_anthropic, anthropic_choice],
            outputs=result,
        )

    app.launch()
# Script entry point: build and launch the Gradio app.
if __name__ == "__main__":
    main()