"""Streamlit app that counts cl100k_base tokens in typed text or uploaded files.

Supported uploads are CSV, TXT and PDF. Every counting helper reports
problems to the user through ``st.error`` and returns 0 on failure instead
of raising, so the UI never crashes on bad input.
"""

import logging
import os

import pandas as pd
import PyPDF2
import streamlit as st
import tiktoken

# Configure logging
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize encoding for the model
encoding = tiktoken.get_encoding("cl100k_base")


def count_tokens(text):
    """Return the number of cl100k_base tokens in ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        The token count, or 0 if tokenization fails (the failure is logged
        and shown to the user via ``st.error``).
    """
    try:
        # By default tiktoken raises ValueError when the input contains a
        # special-token string such as "<|endoftext|>". User-supplied text
        # may legitimately contain those characters, so treat them as
        # ordinary text instead of failing the whole count.
        return len(encoding.encode(text, disallowed_special=()))
    except Exception as e:
        logging.error("Tokenization error: %s", e)
        st.error("Error counting tokens.")
        return 0


def count_tokens_in_csv(file):
    """Display an uploaded CSV and return the token count of its cell values.

    Args:
        file: A path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The token count of all cell values joined by single spaces, or 0 if
        the file cannot be read or parsed.
    """
    try:
        # Load CSV file into a DataFrame
        df = pd.read_csv(file)
        st.write("Uploaded CSV Data:")
        st.dataframe(df)

        # Concatenate all cell values (row by row) into one string. Column
        # headers are not part of ``df.values`` and so are not counted;
        # NOTE(review): missing cells are stringified by ``astype(str)``
        # (typically as "nan") and therefore contribute tokens -- confirm
        # this is intended.
        text = " ".join(df.astype(str).values.flatten())

        # Calculate the number of tokens
        return count_tokens(text)
    except pd.errors.EmptyDataError:
        logging.error("CSV file is empty.")
        st.error("The CSV file is empty or cannot be read.")
    except pd.errors.ParserError:
        logging.error("CSV file parsing error.")
        st.error("Error parsing the CSV file.")
    except Exception as e:
        logging.error("Error processing CSV file: %s", e)
        st.error("An error occurred while processing the CSV file.")
    # Reached only when one of the handlers above ran.
    return 0


def count_tokens_in_txt(file):
    """Return the token count of an uploaded UTF-8 text file.

    Args:
        file: A binary file-like object whose ``read()`` returns bytes.

    Returns:
        The token count of the decoded text, or 0 if the file cannot be
        decoded or processed.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but also strips a
        # leading byte-order mark, which would otherwise be counted as text.
        text = file.read().decode('utf-8-sig')
        return count_tokens(text)
    except UnicodeDecodeError:
        logging.error("TXT file decoding error.")
        st.error("Error decoding the TXT file. Ensure it's in UTF-8 format.")
    except Exception as e:
        logging.error("Error processing TXT file: %s", e)
        st.error("An error occurred while processing the TXT file.")
    # Reached only when one of the handlers above ran.
    return 0


def count_tokens_in_pdf(file):
    """Return the token count of the text extracted from an uploaded PDF.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The token count of the text of all pages concatenated in order, or
        0 if the PDF cannot be read or processed.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        # ``extract_text()`` may yield nothing for a page; substitute "" so
        # the join never sees a non-string value.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        return count_tokens(text)
    except PyPDF2.errors.PdfReadError:
        logging.error("PDF file reading error.")
        st.error("Error reading the PDF file. Ensure it's not corrupted.")
    except Exception as e:
        logging.error("Error processing PDF file: %s", e)
        st.error("An error occurred while processing the PDF file.")
    # Reached only when one of the handlers above ran.
    return 0


# Streamlit app
def main():
    """Render the Token Counter page: a text box plus a file uploader."""
    st.title('Token Counter')
    st.write('This product belongs to: NAMA AI')

    # Text input
    text_input = st.text_area("Enter text to calculate tokens:")
    # The button is only rendered once some text is present, and the text is
    # tokenized only when the button is actually pressed (Streamlit reruns
    # this function on every interaction).
    if text_input and st.button("Count"):
        token_count = count_tokens(text_input)
        st.write(f'The input text contains {token_count} tokens.')

    # File uploader
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt", "pdf"])
    if uploaded_file is not None:
        # Maps a lowercase extension to (label used in the message, counter).
        handlers = {
            ".csv": ("CSV", count_tokens_in_csv),
            ".txt": ("TXT", count_tokens_in_txt),
            ".pdf": ("PDF", count_tokens_in_pdf),
        }
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        handler = handlers.get(file_extension)
        if handler is None:
            st.error("Unsupported file type.")
        else:
            label, counter = handler
            token_count = counter(uploaded_file)
            st.write(f'The {label} file contains {token_count} tokens.')


if __name__ == "__main__":
    main()