Spaces:

namaai
/

tokenCounter

Runtime error

App Files Files Community

tokenCounter / app.py

namaai

Update app.py

a3a2a4b verified over 1 year ago

raw

history blame contribute delete

3.79 kB

	import streamlit as st
	import pandas as pd
	import tiktoken
	import PyPDF2
	import os
	import logging

	# Configure logging: only ERROR-level (and above) records are emitted, each
	# prefixed with its timestamp and level name.
	_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
	logging.basicConfig(format=_LOG_FORMAT, level=logging.ERROR)

	# Tokenizer for the "cl100k_base" encoding, created once at import time and
	# used by count_tokens.
	encoding = tiktoken.get_encoding("cl100k_base")

	# Function to count tokens in a text
	def count_tokens(text):
	    """Return the number of tokens the module-level ``encoding`` yields for ``text``.

	    Special-token markers that happen to appear in the input (for example
	    ``<|endoftext|>``) are counted as ordinary text. With tiktoken's default
	    settings ``encode`` raises ``ValueError`` on such input, which would
	    otherwise surface as an error and a count of 0.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        The token count, or 0 if tokenization fails (the failure is logged
	        and reported in the Streamlit page).
	    """
	    try:
	        # disallowed_special=() switches off the special-token check so
	        # user-supplied text is always encoded as plain text.
	        tokens = encoding.encode(text, disallowed_special=())
	    except Exception as e:
	        logging.error(f"Tokenization error: {e}")
	        st.error("Error counting tokens.")
	        return 0
	    return len(tokens)

	# Function to load CSV and count tokens
	def count_tokens_in_csv(file):
	    """Display an uploaded CSV and count the tokens in its cell values.

	    The file is parsed with pandas and shown in the Streamlit page. Every
	    cell is then converted to a string and the cells are joined with single
	    spaces before being passed to ``count_tokens``. Column headers are not
	    part of the joined text.

	    Args:
	        file: A path or file-like object accepted by ``pd.read_csv``.

	    Returns:
	        The token count of the joined cell text, or 0 when the file cannot
	        be read or processed (an error is logged and shown to the user).
	    """
	    try:
	        frame = pd.read_csv(file)
	        st.write("Uploaded CSV Data:")
	        st.dataframe(frame)
	        # Flatten the table into one space-separated string of cell values.
	        cells = frame.astype(str).values.flatten()
	        return count_tokens(" ".join(cells))
	    except pd.errors.EmptyDataError:
	        logging.error("CSV file is empty.")
	        st.error("The CSV file is empty or cannot be read.")
	    except pd.errors.ParserError:
	        logging.error("CSV file parsing error.")
	        st.error("Error parsing the CSV file.")
	    except Exception as exc:
	        logging.error(f"Error processing CSV file: {exc}")
	        st.error("An error occurred while processing the CSV file.")
	    # Reached only after one of the handlers above has reported a failure.
	    return 0

	# Function to load TXT and count tokens
	def count_tokens_in_txt(file):
	    """Count the tokens in an uploaded UTF-8 text file.

	    The content is decoded with ``utf-8-sig`` so that a leading byte-order
	    mark is dropped instead of being tokenized as part of the text. Files
	    without a BOM decode exactly as they would with plain ``utf-8``.

	    Args:
	        file: A binary file-like object whose ``read()`` returns bytes.

	    Returns:
	        The token count of the decoded text, or 0 when the file cannot be
	        decoded or processed (an error is logged and shown to the user).
	    """
	    try:
	        text = file.read().decode('utf-8-sig')
	        return count_tokens(text)
	    except UnicodeDecodeError:
	        logging.error("TXT file decoding error.")
	        st.error("Error decoding the TXT file. Ensure it's in UTF-8 format.")
	    except Exception as e:
	        logging.error(f"Error processing TXT file: {e}")
	        st.error("An error occurred while processing the TXT file.")
	    # Reached only after one of the handlers above has reported a failure.
	    return 0

	# Function to load PDF and count tokens
	def count_tokens_in_pdf(file):
	    """Count the tokens in the text extracted from an uploaded PDF.

	    The text of every page is extracted with PyPDF2 and concatenated in page
	    order, with no separator, before being passed to ``count_tokens``.

	    Args:
	        file: A path or binary file-like object accepted by
	            ``PyPDF2.PdfReader``.

	    Returns:
	        The token count of the extracted text, or 0 when the PDF cannot be
	        read or processed (an error is logged and shown to the user).
	    """
	    try:
	        pdf_reader = PyPDF2.PdfReader(file)
	        # A page whose extraction yields nothing contributes an empty string.
	        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
	        return count_tokens(text)
	    except PyPDF2.errors.PdfReadError:
	        logging.error("PDF file reading error.")
	        st.error("Error reading the PDF file. Ensure it's not corrupted.")
	    except Exception as e:
	        logging.error(f"Error processing PDF file: {e}")
	        st.error("An error occurred while processing the PDF file.")
	    # Reached only after one of the handlers above has reported a failure.
	    return 0

	# Streamlit app
	def main():
	    """Render the Token Counter page: a free-text box and a file uploader.

	    Text typed into the box is tokenized as soon as it is non-empty, and the
	    result is shown when the "Count" button is pressed. An uploaded file is
	    routed to the counting helper that matches its extension, and the
	    resulting count is written to the page.
	    """
	    st.title('Token Counter')
	    st.write('This product belongs to: NAMA AI')

	    # Free-text input.
	    text_input = st.text_area("Enter text to calculate tokens:")
	    if text_input:
	        token_count = count_tokens(text_input)
	        if st.button("Count"):
	            st.write(f'The input text contains {token_count} tokens.')

	    # File upload.
	    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt", "pdf"])
	    if uploaded_file is None:
	        return

	    # Map each supported extension to its display label and counting helper.
	    handlers = {
	        ".csv": ("CSV", count_tokens_in_csv),
	        ".txt": ("TXT", count_tokens_in_txt),
	        ".pdf": ("PDF", count_tokens_in_pdf),
	    }
	    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
	    handler = handlers.get(file_extension)
	    if handler is None:
	        st.error("Unsupported file type.")
	        return

	    label, counter = handler
	    token_count = counter(uploaded_file)
	    st.write(f'The {label} file contains {token_count} tokens.')

	# Launch the app only when this file is executed as the main module, not
	# when it is imported.
	if __name__ == "__main__":
	    main()