# token_tortoise / app.py
# Guilherme Favaron
# Add application file
# bbb1e4b
import gradio as gr
import tiktoken
import pandas as pd
import PyPDF2
import docx
import pptx
import openpyxl
from pathlib import Path
import csv
import io
def get_encoding(encoding_name="cl100k_base"):
    """Return the tiktoken encoding used for token counting.

    Args:
        encoding_name: Name of the tiktoken encoding to load. Defaults to
            ``"cl100k_base"``, the value this function previously had
            hard-coded, so zero-argument callers behave exactly as before.

    Returns:
        The encoding object produced by ``tiktoken.get_encoding``.
    """
    return tiktoken.get_encoding(encoding_name)
def count_tokens_text(text):
    """Return the number of tokens in ``text``.

    Args:
        text: The string to tokenize. ``None`` or an empty string counts as
            zero tokens.

    Returns:
        The token count as an ``int``, using the encoding returned by
        ``get_encoding()``.
    """
    if not text:
        return 0
    enc = get_encoding()
    # By default ``encode`` raises ValueError when the text contains a
    # special-token marker (e.g. "<|endoftext|>"). Uploaded documents are
    # arbitrary text, so such markers must be tokenized as ordinary
    # characters instead of aborting the count for the whole file.
    return len(enc.encode(text, disallowed_special=()))
def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of each page followed by a newline, concatenated
        in page order.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join(page.extract_text() + "\n" for page in reader.pages)
def read_docx(file):
    """Extract paragraph text from a Word document.

    Args:
        file: The document source, passed unchanged to ``docx.Document``.

    Returns:
        The text of each entry in ``Document.paragraphs`` followed by a
        newline, concatenated in order.
    """
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)
def read_pptx(file):
    """Extract the text of every text-bearing shape in a presentation.

    Args:
        file: The presentation source, passed unchanged to
            ``pptx.Presentation``.

    Returns:
        The ``text`` of each shape that has such an attribute, each followed
        by a newline, in slide order and then shape order.
    """
    presentation = pptx.Presentation(file)
    pieces = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Shapes without a ``text`` attribute contribute nothing.
            if not hasattr(shape, "text"):
                continue
            pieces.append(shape.text + "\n")
    return "".join(pieces)
def read_excel(file):
    """Render every worksheet of an Excel workbook as plain text.

    Args:
        file: The workbook source, passed unchanged to ``pd.read_excel``.

    Returns:
        The ``DataFrame.to_string()`` rendering of each worksheet, in
        workbook order, joined by a newline. For a single-sheet workbook this
        is exactly that sheet's rendering.
    """
    # sheet_name=None makes pandas return all sheets as a {name: DataFrame}
    # mapping. The default (first sheet only) would silently drop the rest of
    # the workbook and undercount its tokens.
    sheets = pd.read_excel(file, sheet_name=None)
    return "\n".join(frame.to_string() for frame in sheets.values())
def read_csv(file):
    """Render a CSV file as plain text.

    Args:
        file: The CSV source, passed unchanged to ``pd.read_csv``.

    Returns:
        The ``DataFrame.to_string()`` rendering of the parsed table.
    """
    return pd.read_csv(file).to_string()
def process_files(files):
    """Count tokens in each uploaded file and build a plain-text report.

    Args:
        files: The uploads from the Gradio ``File`` component. Each item may
            be a filesystem path (``str`` or ``Path``) or a file-like object
            exposing ``.name``. ``None`` or an empty list (for example after
            the upload widget is cleared) means "no files".

    Returns:
        A newline-joined report with one line per file: its token count, an
        "Unsupported file format" notice, or an "Error processing" message.
        When at least one token was counted, the report starts with a
        grand-total line (preceded by a blank line) and a dashed separator.
        Returns an empty string when there are no files.
    """
    if not files:
        return ""

    results = []
    total_tokens = 0
    for file in files:
        # Resolve the display name before entering the ``try`` so that the
        # error handler can never fail itself on a missing ``.name``.
        is_path = isinstance(file, (str, Path))
        source_name = str(file) if is_path else str(getattr(file, "name", file))
        try:
            source_path = Path(source_name)
            file_ext = source_path.suffix.lower()
            file_name = source_path.name
            if file_ext == '.pdf':
                text = read_pdf(file)
            elif file_ext == '.docx':
                text = read_docx(file)
            elif file_ext == '.pptx':
                text = read_pptx(file)
            elif file_ext in ['.xlsx', '.xls']:
                text = read_excel(file)
            elif file_ext == '.csv':
                text = read_csv(file)
            elif file_ext == '.txt':
                # Paths are read from disk; file objects are read directly.
                # A text-mode file object already yields ``str``.
                raw = Path(file).read_bytes() if is_path else file.read()
                text = raw.decode('utf-8') if isinstance(raw, bytes) else raw
            else:
                results.append(f"Unsupported file format: {file_name}")
                continue
            token_count = count_tokens_text(text)
            total_tokens += token_count
            results.append(f"File: {file_name} - Token count: {token_count:,}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the batch.
            results.append(f"Error processing {source_name}: {e}")

    # Put the grand total and a separator ahead of the per-file lines.
    if total_tokens > 0:
        results.insert(0, f"\nTotal tokens across all files: {total_tokens:,}\n")
        results.insert(1, "-" * 50)
    return "\n".join(results)
# Stylesheet handed to gr.Blocks(css=...) below. It imports Source Sans Pro
# (weights 400 and 600) from Google Fonts, applies it to the page body and the
# Gradio container, and gives elements carrying the "output-text" class (the
# results textbox) a 16px font with 1.5 line height, falling back to a
# monospace face if Source Sans Pro is unavailable.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');
body, .gradio-container {
font-family: 'Source Sans Pro', sans-serif !important;
}
.output-text {
font-family: 'Source Sans Pro', monospace !important;
font-size: 16px !important;
line-height: 1.5 !important;
}
"""
# Assemble the Gradio UI: a heading, a multi-file upload row, and a results row.
with gr.Blocks(css=custom_css) as iface:
    # Page title and a one-line usage hint.
    gr.Markdown(
        "\n"
        "# 📚 Bulk Token Counter\n"
        "Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.\n"
    )

    # Upload widget accepting several files at once.
    with gr.Row():
        file_input = gr.File(label="Upload Files", file_count="multiple")

    # Report area; the "output-text" class hooks it up to the rule in custom_css.
    with gr.Row():
        output = gr.Textbox(label="Results", lines=10, elem_classes=["output-text"])

    # Recompute the report whenever the uploaded file set changes.
    file_input.change(fn=process_files, inputs=[file_input], outputs=[output])

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()