Spaces:

guifav
/

token_tortoise

Sleeping

File size: 3,507 Bytes

bbb1e4b

import gradio as gr
import tiktoken
import pandas as pd
import PyPDF2
import docx
import pptx
import openpyxl
from pathlib import Path
import csv
import io

def get_encoding():
    """Return the tiktoken encoding object used for every token count.

    The encoding is looked up by the fixed name ``cl100k_base``.
    """
    encoding_name = "cl100k_base"
    return tiktoken.get_encoding(encoding_name)

def count_tokens_text(text):
    """Return the number of tokens in *text* under the app's encoding.

    Args:
        text: The string to tokenize. ``None`` or an empty string counts
            as zero tokens.

    Returns:
        The token count as an ``int``.
    """
    if not text:
        # Nothing to tokenize (e.g. a document with no extractable text).
        return 0
    enc = get_encoding()
    # Uploaded documents are arbitrary text and may literally contain
    # special-token markers such as "<|endoftext|>". ``encode`` rejects those
    # by default with ValueError; ``encode_ordinary`` tokenizes them as plain
    # text, which is what a token counter wants.
    return len(enc.encode_ordinary(text))

def read_pdf(file):
    """Extract the text of a PDF, with a newline appended after each page.

    *file* is handed straight to ``PyPDF2.PdfReader``.
    """
    reader = PyPDF2.PdfReader(file)
    # Pages are consumed lazily and in document order.
    return "".join(page.extract_text() + "\n" for page in reader.pages)

def read_docx(file):
    """Return the text of each paragraph in ``Document.paragraphs``.

    Every paragraph's text is followed by a newline. *file* is handed
    straight to ``docx.Document``.
    """
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_pptx(file):
    """Collect the text of every slide shape that exposes a ``text`` attribute.

    Shapes are visited slide by slide in order; each shape's text is followed
    by a newline. Shapes without a ``text`` attribute are skipped.
    """
    presentation = pptx.Presentation(file)
    chunks = [
        shape.text + "\n"
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "".join(chunks)

def read_excel(file):
    """Render every worksheet of an Excel workbook as plain text.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.

    Returns:
        The ``DataFrame.to_string()`` rendering of each sheet, in workbook
        order, joined by newlines. For a single-sheet workbook this is just
        that sheet's rendering.
    """
    # sheet_name=None loads all sheets as a {sheet name: DataFrame} mapping;
    # the default would silently read only the first sheet and under-count
    # multi-sheet workbooks.
    sheets = pd.read_excel(file, sheet_name=None)
    return "\n".join(frame.to_string() for frame in sheets.values())

def read_csv(file):
    """Parse a CSV with pandas and return the table rendered as plain text.

    *file* is handed straight to ``pandas.read_csv``; the result is the
    ``DataFrame.to_string()`` rendering of the parsed frame.
    """
    return pd.read_csv(file).to_string()

def process_files(files):
    """Count tokens for each uploaded file and build a plain-text report.

    Args:
        files: The uploads delivered by the file input. Each item may be a
            filesystem path string or an object whose ``name`` attribute
            holds the path. ``None`` or an empty list (for example after the
            upload widget is cleared) produces an empty report.

    Returns:
        A newline-joined report with one line per file: its token count, an
        "Unsupported file format" notice, or an "Error processing" message.
        When at least one token was counted, the report starts with the
        grand total followed by a separator line.
    """
    if not files:
        # Nothing uploaded (or the widget was cleared, which passes None).
        return ""

    results = []
    total_tokens = 0

    for file in files:
        # Resolve the on-disk path before entering the try block so the error
        # message below never has to dereference ``file.name`` on an item
        # that may not have it (e.g. a plain path string).
        file_path = str(getattr(file, "name", file))
        try:
            path = Path(file_path)
            file_ext = path.suffix.lower()
            file_name = path.name

            # Readers get the path rather than the upload object.
            # NOTE(review): depending on the Gradio version the upload object
            # is either a str subclass or a temp-file wrapper whose handle
            # may not be positioned on the uploaded content - the path is
            # the reliable common denominator; confirm against the pinned
            # Gradio version.
            if file_ext == '.pdf':
                text = read_pdf(file_path)
            elif file_ext == '.docx':
                text = read_docx(file_path)
            elif file_ext == '.pptx':
                text = read_pptx(file_path)
            elif file_ext in ['.xlsx', '.xls']:
                text = read_excel(file_path)
            elif file_ext == '.csv':
                text = read_csv(file_path)
            elif file_ext == '.txt':
                # Decode the raw bytes (no newline translation) as UTF-8; a
                # decoding failure is reported per file by the handler below.
                text = path.read_bytes().decode('utf-8')
            else:
                results.append(f"Unsupported file format: {file_name}")
                continue

            token_count = count_tokens_text(text)
            total_tokens += token_count
            results.append(f"File: {file_name} - Token count: {token_count:,}")

        except Exception as e:
            # Best-effort per file: one bad upload must not abort the batch.
            results.append(f"Error processing {file_path}: {str(e)}")

    # Add total tokens to the beginning of results
    if total_tokens > 0:
        results.insert(0, f"\nTotal tokens across all files: {total_tokens:,}\n")
        results.insert(1, "-" * 50)  # Adding a separator line

    return "\n".join(results)

# Custom CSS passed to gr.Blocks below: imports the Source Sans Pro web font
# from Google Fonts, applies it to the page body and the Gradio container,
# and styles elements carrying the "output-text" class (font, 16px size,
# 1.5 line height).
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Source Sans Pro', sans-serif !important;
}

.output-text {
    font-family: 'Source Sans Pro', monospace !important;
    font-size: 16px !important;
    line-height: 1.5 !important;
}
"""

# Create Gradio interface: a heading, a multi-file upload widget, and a
# read-out textbox, wired so that the report is recomputed whenever the
# upload widget's value changes.
with gr.Blocks(css=custom_css) as iface:
    # Page title and short usage instructions.
    gr.Markdown(
        """
        # 📚 Bulk Token Counter
        Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.
        """
    )
    
    with gr.Row():
        # file_count="multiple" lets the user upload several files at once;
        # process_files iterates over what this component delivers.
        file_input = gr.File(
            file_count="multiple",
            label="Upload Files"
        )
    
    with gr.Row():
        # The "output-text" class hooks this textbox up to the rule of the
        # same name in custom_css.
        output = gr.Textbox(
            label="Results",
            lines=10,
            elem_classes=["output-text"]
        )
    
    # Run process_files on the uploads whenever the file input changes and
    # show its returned report in the textbox.
    file_input.change(
        fn=process_files,
        inputs=[file_input],
        outputs=[output]
    )

# Launch the app only when this file is executed as a script, so importing
# the module (e.g. to reuse the reader functions) does not start a server.
if __name__ == "__main__":
    iface.launch()