File size: 3,792 Bytes
d5109cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3a2a4b
 
d5109cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import streamlit as st
import pandas as pd
import tiktoken
import PyPDF2
import os
import logging

# Configure the root logger: only records at ERROR level or above are emitted,
# each formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the "cl100k_base" tiktoken encoding once at import time; count_tokens()
# uses this shared object for every tokenization.
# NOTE(review): the target model is not named anywhere in this file — confirm
# that cl100k_base is the encoding the intended model actually uses.
encoding = tiktoken.get_encoding("cl100k_base")

# Function to count tokens in a text
def count_tokens(text):
    """Return the number of tokens in ``text`` under the module-level encoding.

    Special-token markers that appear literally in the text (for example the
    string ``<|endoftext|>``) are tokenized as ordinary text. With its default
    arguments ``Encoding.encode`` raises ``ValueError`` on such input, which
    previously made any user text containing one of those markers report an
    error and a count of 0.

    Args:
        text: The string to tokenize.

    Returns:
        The token count, or 0 if tokenization fails. A failure is logged and
        also reported in the Streamlit UI.
    """
    try:
        # disallowed_special=() switches off the special-token guard so that
        # arbitrary user-supplied text can always be counted.
        return len(encoding.encode(text, disallowed_special=()))
    except Exception as e:
        logging.error(f"Tokenization error: {e}")
        st.error("Error counting tokens.")
        return 0

# Function to load CSV and count tokens
def count_tokens_in_csv(file):
    """Display an uploaded CSV and count the tokens in its cell values.

    The file is parsed with ``pd.read_csv`` and shown in the app. Every cell
    is then converted to ``str`` and the cells are joined with single spaces
    before tokenizing. Only the cell values are joined; column names are not
    part of the counted text.

    Args:
        file: A path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The token count of the joined cell text, or 0 if the file is empty,
        cannot be parsed, or any other error occurs (each case is logged and
        reported in the Streamlit UI).
    """
    try:
        frame = pd.read_csv(file)

        # Echo the parsed table back to the user.
        st.write("Uploaded CSV Data:")
        st.dataframe(frame)

        # Flatten the stringified table into one space-separated string.
        cells = frame.astype(str).to_numpy().ravel()
        return count_tokens(" ".join(cells))
    except pd.errors.EmptyDataError:
        logging.error("CSV file is empty.")
        st.error("The CSV file is empty or cannot be read.")
        return 0
    except pd.errors.ParserError:
        logging.error("CSV file parsing error.")
        st.error("Error parsing the CSV file.")
        return 0
    except Exception as e:
        logging.error(f"Error processing CSV file: {e}")
        st.error("An error occurred while processing the CSV file.")
        return 0

# Function to load TXT and count tokens
def count_tokens_in_txt(file):
    """Count the tokens in an uploaded UTF-8 text file.

    Args:
        file: A binary file-like object whose ``read()`` returns ``bytes``
            (in this app, the object returned by ``st.file_uploader``).

    Returns:
        The token count of the decoded text, or 0 if the bytes are not valid
        UTF-8 or any other error occurs (each case is logged and reported in
        the Streamlit UI).
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops
        # a leading byte-order mark, so files saved with a BOM do not get the
        # invisible U+FEFF character counted as extra tokens.
        text = file.read().decode('utf-8-sig')
        return count_tokens(text)
    except UnicodeDecodeError:
        logging.error("TXT file decoding error.")
        st.error("Error decoding the TXT file. Ensure it's in UTF-8 format.")
    except Exception as e:
        logging.error(f"Error processing TXT file: {e}")
        st.error("An error occurred while processing the TXT file.")
    return 0

# Function to load PDF and count tokens
def count_tokens_in_pdf(file):
    """Count the tokens in the text extracted from an uploaded PDF.

    The text of every page is extracted and concatenated in page order with
    no separator, then tokenized.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The token count of the extracted text, or 0 if the PDF cannot be read
        or any other error occurs (each case is logged and reported in the
        Streamlit UI).
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly and join once instead of indexing by
        # position and growing a string with "+=". The `or ""` keeps the
        # original guard against a falsy result from extract_text().
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        return count_tokens(text)
    except PyPDF2.errors.PdfReadError:
        logging.error("PDF file reading error.")
        st.error("Error reading the PDF file. Ensure it's not corrupted.")
    except Exception as e:
        logging.error(f"Error processing PDF file: {e}")
        st.error("An error occurred while processing the PDF file.")
    return 0

# Streamlit app
def main():
    """Render the Token Counter page.

    The page offers two independent ways to count tokens:

    * a text area with a "Count" button (the button is only shown once some
      text has been entered), and
    * a file uploader for CSV, TXT and PDF files, whose token count is shown
      as soon as a file is uploaded.
    """
    st.title('Token Counter')
    st.write('This product belongs to: NAMA AI')

    # Text input. Tokenization is deferred until the button is clicked so that
    # unrelated reruns of the script (e.g. a file upload) do not re-tokenize
    # the text area content.
    text_input = st.text_area("Enter text to calculate tokens:")
    if text_input and st.button("Count"):
        token_count = count_tokens(text_input)
        st.write(f'The input text contains {token_count} tokens.')

    # File uploader
    uploaded_file = st.file_uploader("Choose a file", type=["csv", "txt", "pdf"])
    if uploaded_file is None:
        return

    # Map each supported extension to its display label and counting function.
    counters = {
        ".csv": ("CSV", count_tokens_in_csv),
        ".txt": ("TXT", count_tokens_in_txt),
        ".pdf": ("PDF", count_tokens_in_pdf),
    }
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    handler = counters.get(file_extension)
    if handler is None:
        st.error("Unsupported file type.")
        return

    label, counter = handler
    token_count = counter(uploaded_file)
    st.write(f'The {label} file contains {token_count} tokens.')

# Call main() only when this module is executed as the main program, not when
# it is imported by another module.
if __name__ == "__main__":
    main()