File size: 3,507 Bytes
bbb1e4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import gradio as gr
import tiktoken
import pandas as pd
import PyPDF2
import docx
import pptx
import openpyxl
from pathlib import Path
import csv
import io

def get_encoding():
    """Return the tiktoken encoding used for every token count in this app.

    The encoding is looked up by its registry name, ``cl100k_base``.
    """
    encoding_name = "cl100k_base"
    return tiktoken.get_encoding(encoding_name)

def count_tokens_text(text):
    """Return the number of tokens *text* encodes to.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token sequence produced by the encoding that
        ``get_encoding()`` returns.
    """
    enc = get_encoding()
    # Uploaded documents are arbitrary text and may literally contain
    # special-token markers such as "<|endoftext|>".  With tiktoken's default
    # (disallowed_special="all") encode() raises ValueError on them; an empty
    # tuple makes such markers encode as ordinary text instead.
    return len(enc.encode(text, disallowed_special=()))

def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF source; it is handed to ``PyPDF2.PdfReader`` unchanged.

    Returns:
        The concatenated page texts, each page followed by a newline.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # ``or ""`` is defensive: if extract_text() ever yields None for a page
    # (NOTE(review): presumably possible for pages without a text layer on
    # some library versions -- confirm), that page contributes an empty line
    # instead of raising TypeError and discarding the whole document.
    page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
    # Join once rather than growing a string with += inside the loop.
    return "".join(page_texts)

def read_docx(file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Only ``doc.paragraphs`` is read; each paragraph's text is followed by a
    newline.
    """
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_pptx(file):
    """Return the text of every shape that exposes ``.text``, slide by slide.

    Shapes without a ``text`` attribute are skipped; each collected text is
    followed by a newline.
    """
    presentation = pptx.Presentation(file)
    pieces = []
    for slide in presentation.slides:
        pieces.extend(
            shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
        )
    return "".join(pieces)

def read_excel(file):
    """Render every sheet of an Excel workbook as text.

    Args:
        file: The workbook source; it is handed to ``pandas.read_excel``
            unchanged.

    Returns:
        The ``DataFrame.to_string()`` rendering of each sheet, in workbook
        order, separated by newlines.  For a single-sheet workbook this is
        exactly that sheet's rendering.
    """
    # sheet_name=None asks pandas for a {sheet name: DataFrame} mapping of all
    # sheets; the default (sheet_name=0) would read only the first sheet and
    # leave the rest of the workbook out of the token count.
    sheets = pd.read_excel(file, sheet_name=None)
    return "\n".join(frame.to_string() for frame in sheets.values())

def read_csv(file):
    """Parse a CSV with pandas and return its ``DataFrame.to_string()`` text."""
    return pd.read_csv(file).to_string()

def _read_text_upload(upload, path):
    """Return the text of a ``.txt`` upload given as a file object or a path."""
    reader = getattr(upload, "read", None)
    if callable(reader):
        data = reader()
        # Binary handles yield bytes; text handles already yield str.
        return data.decode("utf-8") if isinstance(data, bytes) else data
    # No read() method: treat the upload as a filesystem path.
    return path.read_text(encoding="utf-8")


def process_files(files):
    """Count tokens in each uploaded file and build a plain-text report.

    Args:
        files: An iterable of uploads, or ``None``/empty when nothing is
            selected.  Each upload is either an object with a ``name``
            attribute holding its path, or a plain path string.

    Returns:
        A newline-joined report.  When the combined token count is positive
        the report starts with a total line and a separator; it then has one
        line per file giving its token count, an "Unsupported file format"
        notice, or an error message.  Returns ``""`` when there are no files.
    """
    # Guard against None (no selection), which would otherwise make the loop
    # below raise TypeError.
    if not files:
        return ""

    results = []
    total_tokens = 0

    for file in files:
        # Accept both objects carrying a .name path and bare path strings.
        source = getattr(file, "name", file)
        try:
            path = Path(source)
            file_ext = path.suffix.lower()
            file_name = path.name

            if file_ext == '.pdf':
                text = read_pdf(file)
            elif file_ext == '.docx':
                text = read_docx(file)
            elif file_ext == '.pptx':
                text = read_pptx(file)
            elif file_ext in ('.xlsx', '.xls'):
                text = read_excel(file)
            elif file_ext == '.csv':
                text = read_csv(file)
            elif file_ext == '.txt':
                text = _read_text_upload(file, path)
            else:
                results.append(f"Unsupported file format: {file_name}")
                continue

            token_count = count_tokens_text(text)
            total_tokens += token_count
            results.append(f"File: {file_name} - Token count: {token_count:,}")

        except Exception as e:
            # Per-file boundary: one unreadable file is reported in the
            # output and must not abort the rest of the batch.
            results.append(f"Error processing {source}: {str(e)}")

    # Put the grand total and a separator ahead of the per-file lines.
    if total_tokens > 0:
        results.insert(0, f"\nTotal tokens across all files: {total_tokens:,}\n")
        results.insert(1, "-" * 50)

    return "\n".join(results)

# Custom CSS handed to gr.Blocks(css=...) below.  It imports the Source Sans
# Pro web font (weights 400 and 600) from Google Fonts, applies it to the page
# body and the Gradio container, and styles elements carrying the
# "output-text" class (font, 16px size, 1.5 line height).
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Source Sans Pro', sans-serif !important;
}

.output-text {
    font-family: 'Source Sans Pro', monospace !important;
    font-size: 16px !important;
    line-height: 1.5 !important;
}
"""

# Create Gradio interface: a Markdown header, a multi-file upload widget, and
# a results textbox, all styled with custom_css.
with gr.Blocks(css=custom_css) as iface:
    # Title and short usage hint shown at the top of the page.
    gr.Markdown(
        """
        # 📚 Bulk Token Counter
        Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.
        """
    )
    
    # Upload widget; file_count="multiple" lets several files be selected.
    with gr.Row():
        file_input = gr.File(
            file_count="multiple",
            label="Upload Files"
        )
    
    # Report area; the "output-text" class ties it to the rule in custom_css.
    with gr.Row():
        output = gr.Textbox(
            label="Results",
            lines=10,
            elem_classes=["output-text"]
        )
    
    # Run process_files on the widget's value whenever its change event
    # fires, and write the returned report into the textbox.
    file_input.change(
        fn=process_files,
        inputs=[file_input],
        outputs=[output]
    )

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()