"""Gradio app that counts tokens (tiktoken ``cl100k_base``) in uploaded files.

Supported formats: PDF, DOCX, PPTX, XLSX/XLS, CSV and TXT.  Each file is
converted to plain text, tokenized, and reported with a per-file count plus
a grand total.
"""

import csv
import io
from pathlib import Path

import docx
import gradio as gr
import openpyxl
import pandas as pd
import pptx
import PyPDF2
import tiktoken


def get_encoding():
    """Return the tiktoken ``cl100k_base`` encoding."""
    return tiktoken.get_encoding("cl100k_base")


def count_tokens_text(text):
    """Return the number of ``cl100k_base`` tokens in *text*.

    Text that happens to look like a special token (e.g. ``<|endoftext|>``)
    is counted as ordinary text; with tiktoken's default settings such input
    would raise ``ValueError`` instead of being counted.
    """
    enc = get_encoding()
    return len(enc.encode(text, disallowed_special=()))


def read_pdf(file):
    """Extract text from every page of a PDF, one newline after each page.

    *file* is handed straight to ``PyPDF2.PdfReader`` (a path or a binary
    stream).
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # extract_text() may yield None for pages without extractable text;
    # treat those pages as empty instead of failing on None + str.
    return "".join(
        (page.extract_text() or "") + "\n" for page in pdf_reader.pages
    )


def read_docx(file):
    """Return the paragraph text of a .docx document, one paragraph per line."""
    doc = docx.Document(file)
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)


def read_pptx(file):
    """Return the text of every shape exposing ``.text``, one shape per line."""
    prs = pptx.Presentation(file)
    return "".join(
        shape.text + "\n"
        for slide in prs.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )


def read_excel(file):
    """Render every worksheet of a workbook as text.

    All sheets are read (not only the first) so multi-sheet workbooks are
    not undercounted.  For a single-sheet workbook the result is identical
    to ``pd.read_excel(file).to_string()``.
    """
    sheets = pd.read_excel(file, sheet_name=None)
    return "\n".join(df.to_string() for df in sheets.values())


def read_csv(file):
    """Render a CSV file as the string form of its DataFrame."""
    df = pd.read_csv(file)
    return df.to_string()


def _read_txt(path):
    """Read a plain-text file from *path*, decoding it as UTF-8."""
    return Path(path).read_text(encoding="utf-8")


def _file_path(file):
    """Return the filesystem path of an uploaded file as a string.

    Accepts either an object carrying the path in ``.name`` (temp-file
    wrappers) or a plain path string, so both upload styles are handled.
    """
    return str(getattr(file, "name", file))


# Maps a lower-cased file suffix to the function that turns it into text.
_READERS = {
    ".pdf": read_pdf,
    ".docx": read_docx,
    ".pptx": read_pptx,
    ".xlsx": read_excel,
    ".xls": read_excel,
    ".csv": read_csv,
    ".txt": _read_txt,
}


def process_files(files):
    """Count tokens for each uploaded file and build a text report.

    Args:
        files: Uploaded files as delivered by the ``gr.File`` component;
            each item is a path string or an object whose ``.name`` is the
            path.  May be ``None`` or empty (e.g. when the upload is
            cleared).

    Returns:
        A newline-joined report: one line per file (token count, unsupported
        format notice, or error message).  When the total is greater than
        zero, a total line and a separator are placed at the top.  An empty
        string is returned when there are no files.
    """
    if not files:
        return ""

    results = []
    total_tokens = 0

    for file in files:
        # Resolve the path before the try block so the error message below
        # can never fail itself.
        path = _file_path(file)
        file_name = Path(path).name
        reader = _READERS.get(Path(path).suffix.lower())

        if reader is None:
            results.append(f"Unsupported file format: {file_name}")
            continue

        # Per-file boundary: one unreadable file must not abort the batch,
        # so any failure is reported in the output and processing continues.
        try:
            token_count = count_tokens_text(reader(path))
        except Exception as e:
            results.append(f"Error processing {path}: {str(e)}")
            continue

        total_tokens += token_count
        results.append(f"File: {file_name} - Token count: {token_count:,}")

    # Add total tokens to the beginning of results
    if total_tokens > 0:
        results.insert(0, f"\nTotal tokens across all files: {total_tokens:,}\n")
        results.insert(1, "-" * 50)  # Adding a separator line

    return "\n".join(results)


# Custom CSS for Source Sans Pro font
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Source Sans Pro', sans-serif !important;
}

.output-text {
    font-family: 'Source Sans Pro', monospace !important;
    font-size: 16px !important;
    line-height: 1.5 !important;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
        # 📚 Bulk Token Counter
        Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.
        """
    )

    with gr.Row():
        file_input = gr.File(
            file_count="multiple",
            label="Upload Files"
        )

    with gr.Row():
        output = gr.Textbox(
            label="Results",
            lines=10,
            elem_classes=["output-text"]
        )

    # Recompute the report whenever the set of uploaded files changes.
    file_input.change(
        fn=process_files,
        inputs=[file_input],
        outputs=[output]
    )

# Launch the app
if __name__ == "__main__":
    iface.launch()