Spaces:

guifav
/

token_tortoise

Sleeping

token_tortoise / app.py

Guilherme Favaron

Add application file

bbb1e4b about 1 year ago

3.51 kB

	import gradio as gr
	import tiktoken
	import pandas as pd
	import PyPDF2
	import docx
	import pptx
	import openpyxl
	from pathlib import Path
	import csv
	import io

	def get_encoding(encoding_name="cl100k_base"):
	    """Return the tiktoken encoding used to count tokens.

	    Args:
	        encoding_name: Name of the tiktoken encoding to load. Defaults to
	            ``"cl100k_base"``, so existing zero-argument callers get the
	            same encoding as before.

	    Returns:
	        The encoding object returned by ``tiktoken.get_encoding``.
	    """
	    return tiktoken.get_encoding(encoding_name)

	def count_tokens_text(text):
	    """Return the number of tokens in *text* under the default encoding.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        The length of the token sequence produced by the encoding returned
	        from ``get_encoding()``.
	    """
	    encoding = get_encoding()
	    # By default tiktoken's encode() raises ValueError when the text contains
	    # a special-token marker such as "<|endoftext|>". Uploaded documents are
	    # arbitrary text, so disable that check and count such markers as
	    # ordinary text instead of failing the whole file.
	    return len(encoding.encode(text, disallowed_special=()))

	def read_pdf(file):
	    """Extract the text of every page of a PDF.

	    Args:
	        file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

	    Returns:
	        The extracted text of all pages in order, each page followed by a
	        newline.
	    """
	    reader = PyPDF2.PdfReader(file)
	    # Build the result in one join instead of growing a string page by page.
	    return "".join(page.extract_text() + "\n" for page in reader.pages)

	def read_docx(file):
	    """Extract the paragraph text of a Word document.

	    Args:
	        file: Whatever ``docx.Document`` accepts as its source.

	    Returns:
	        The text of each entry in the document's ``paragraphs`` collection,
	        in order, each followed by a newline.
	    """
	    document = docx.Document(file)
	    return "".join(para.text + "\n" for para in document.paragraphs)

	def read_pptx(file):
	    """Extract the text of every text-bearing shape in a presentation.

	    Args:
	        file: Whatever ``pptx.Presentation`` accepts as its source.

	    Returns:
	        The ``text`` of each shape that has such an attribute, slide by
	        slide, each followed by a newline.
	    """
	    presentation = pptx.Presentation(file)
	    pieces = []
	    for slide in presentation.slides:
	        # Shapes without a ``text`` attribute are skipped.
	        pieces.extend(
	            shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
	        )
	    return "".join(pieces)

	def read_excel(file):
	    """Render a spreadsheet as plain text.

	    Args:
	        file: Whatever ``pandas.read_excel`` accepts as its source.

	    Returns:
	        ``DataFrame.to_string()`` of the frame loaded by
	        ``pandas.read_excel`` with its default arguments.
	    """
	    return pd.read_excel(file).to_string()

	def read_csv(file):
	    """Render a CSV file as plain text.

	    Args:
	        file: Whatever ``pandas.read_csv`` accepts as its source.

	    Returns:
	        ``DataFrame.to_string()`` of the frame loaded by ``pandas.read_csv``
	        with its default arguments.
	    """
	    return pd.read_csv(file).to_string()

	def _source_path(file):
	    """Return the path of an uploaded file as a string.

	    Accepts a path (``str`` or ``Path``) or an object exposing its path as
	    ``.name``; anything else falls back to ``str(file)``. Never raises.
	    """
	    if isinstance(file, (str, Path)):
	        return str(file)
	    return str(getattr(file, "name", file))


	def _extract_text(path, ext):
	    """Return the text of the file at *path*, or None if *ext* is unsupported."""
	    if ext == ".pdf":
	        return read_pdf(path)
	    if ext == ".docx":
	        return read_docx(path)
	    if ext == ".pptx":
	        return read_pptx(path)
	    if ext in (".xlsx", ".xls"):
	        return read_excel(path)
	    if ext == ".csv":
	        return read_csv(path)
	    if ext == ".txt":
	        # Read raw bytes and decode explicitly so line endings are kept as
	        # stored and non-UTF-8 files surface as a per-file error.
	        return Path(path).read_bytes().decode("utf-8")
	    return None


	def process_files(files):
	    """Count tokens in each uploaded file and build a text report.

	    Args:
	        files: Iterable of uploads, or None. Each upload may be a path
	            (``str`` / ``Path``) or an object exposing its path as ``.name``.
	            NOTE(review): which of these Gradio delivers depends on the
	            installed version and the File component's ``type`` setting --
	            confirm against the deployed version.

	    Returns:
	        A newline-joined report with one line per file (token count,
	        "Unsupported file format", or "Error processing"). When at least one
	        token was counted, the report starts with the grand total and a
	        separator line. Returns an empty string when there are no files.
	    """
	    # The upload component may hand over None (e.g. when the selection is
	    # cleared); treat that like an empty upload instead of raising.
	    if not files:
	        return ""

	    results = []
	    total_tokens = 0

	    for file in files:
	        # Resolve the path once, outside the try, so the error branch never
	        # has to touch the upload object again.
	        file_path = _source_path(file)
	        file_name = Path(file_path).name
	        file_ext = Path(file_path).suffix.lower()

	        try:
	            text = _extract_text(file_path, file_ext)
	            if text is None:
	                results.append(f"Unsupported file format: {file_name}")
	                continue

	            token_count = count_tokens_text(text)
	            total_tokens += token_count
	            results.append(f"File: {file_name} - Token count: {token_count:,}")
	        except Exception as e:
	            # Best-effort boundary: one unreadable file must not abort the
	            # rest of the batch, so report it and move on.
	            results.append(f"Error processing {file_path}: {e}")

	    # Put the grand total and a separator line ahead of the per-file lines.
	    if total_tokens > 0:
	        header = [
	            f"\nTotal tokens across all files: {total_tokens:,}\n",
	            "-" * 50,
	        ]
	        results = header + results

	    return "\n".join(results)

	# Stylesheet passed to gr.Blocks(css=...) below. It imports the Source Sans
	# Pro font (weights 400 and 600) from Google Fonts, applies it to the page
	# body and the .gradio-container element, and styles the .output-text class
	# (attached to the Results textbox via elem_classes) with a 16px font size
	# and 1.5 line height. Note that .output-text falls back to a generic
	# monospace font when Source Sans Pro is unavailable.
	custom_css = """
	@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');

	body, .gradio-container {
	font-family: 'Source Sans Pro', sans-serif !important;
	}

	.output-text {
	font-family: 'Source Sans Pro', monospace !important;
	font-size: 16px !important;
	line-height: 1.5 !important;
	}
	"""

	# Assemble the page: a Markdown heading, a multi-file upload row, and a
	# results textbox row, all styled with custom_css.
	with gr.Blocks(css=custom_css) as iface:
	    gr.Markdown(
	        """
	# 📚 Bulk Token Counter
	Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.
	"""
	    )

	    with gr.Row():
	        file_input = gr.File(label="Upload Files", file_count="multiple")

	    with gr.Row():
	        output = gr.Textbox(elem_classes=["output-text"], lines=10, label="Results")

	    # Re-run the token count whenever the set of uploaded files changes and
	    # show the report in the results textbox.
	    file_input.change(process_files, inputs=[file_input], outputs=[output])

	# Start the server only when executed as a script, not when imported.
	if __name__ == "__main__":
	    iface.launch()