import gradio as gr
import tiktoken
import pandas as pd
import PyPDF2
import docx
import pptx
import openpyxl
from pathlib import Path
import csv
import io

def get_encoding():
    """Return the tiktoken encoding used for all token counts in this file."""
    encoding_name = "cl100k_base"
    return tiktoken.get_encoding(encoding_name)

def count_tokens_text(text):
    """Count the tokens in ``text`` using the encoding from ``get_encoding``.

    Args:
        text: The string to tokenize. ``None`` or an empty string counts
            as zero tokens.

    Returns:
        The number of tokens produced by encoding ``text``.
    """
    if not text:
        return 0
    enc = get_encoding()
    # Uploaded documents are arbitrary text: a literal such as
    # "<|endoftext|>" must be counted as ordinary characters instead of
    # making encode() raise, which is what the default
    # disallowed_special="all" would do.
    return len(enc.encode(text, disallowed_special=()))

def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages in order, each page followed by
        a newline.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join(page.extract_text() + "\n" for page in reader.pages)

def read_docx(file):
    """Extract the paragraph text of a Word document.

    Args:
        file: The document source, passed unchanged to ``docx.Document``.

    Returns:
        The text of each entry in ``doc.paragraphs`` in order, each
        followed by a newline.
    """
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)

def read_pptx(file):
    """Extract the text of every text-bearing shape in a presentation.

    Args:
        file: The presentation source, passed unchanged to
            ``pptx.Presentation``.

    Returns:
        The ``text`` of each shape that has such an attribute, in slide
        order, each followed by a newline.
    """
    deck = pptx.Presentation(file)
    pieces = []
    for slide in deck.slides:
        # Shapes without a ``text`` attribute are skipped.
        pieces.extend(
            shape.text + "\n"
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    return "".join(pieces)

def read_excel(file):
    """Render every worksheet of an Excel workbook as plain text.

    Args:
        file: The workbook source, passed unchanged to ``pd.read_excel``.

    Returns:
        The ``DataFrame.to_string()`` rendering of each worksheet, in
        workbook order, joined by newlines. For a single-sheet workbook
        this is just that sheet's rendering.
    """
    # sheet_name=None loads all worksheets as a {sheet name: DataFrame}
    # mapping; the default would read only the first sheet and silently
    # leave the rest of the workbook out of the token count.
    sheets = pd.read_excel(file, sheet_name=None)
    return "\n".join(frame.to_string() for frame in sheets.values())

def read_csv(file):
    """Render a CSV file as plain text.

    Args:
        file: The CSV source, passed unchanged to ``pd.read_csv``.

    Returns:
        The ``DataFrame.to_string()`` rendering of the parsed table.
    """
    return pd.read_csv(file).to_string()

def _upload_path(file):
    """Return the on-disk path of one uploaded item as a plain string.

    Accepts either a path (``str`` or ``Path``) or an object that exposes
    the path through a ``.name`` attribute, such as a temp-file wrapper.
    """
    if isinstance(file, (str, Path)):
        return str(file)
    return str(file.name)


def process_files(files):
    """Count tokens in each uploaded file and build a text report.

    Args:
        files: The uploaded items, or ``None``/empty when nothing is
            selected. Each item is a path or an object whose ``.name``
            is the path.

    Returns:
        A newline-joined report with one line per file (token count,
        "Unsupported file format", or "Error processing"). When at least
        one token was counted, the report starts with the grand total
        and a separator line. An empty string when there are no files.
    """
    # The change event can deliver None (e.g. after the upload box is
    # cleared); iterating over it would raise TypeError.
    if not files:
        return ""

    results = []
    total_tokens = 0

    for file in files:
        # Fallback label so the error message below never fails itself,
        # even when the path of the item cannot be determined.
        file_name = str(file)
        try:
            # Always work from the path: not every upload object is a
            # readable stream, but every reader below accepts a path.
            path = _upload_path(file)
            file_name = Path(path).name
            file_ext = Path(path).suffix.lower()

            if file_ext == '.pdf':
                text = read_pdf(path)
            elif file_ext == '.docx':
                text = read_docx(path)
            elif file_ext == '.pptx':
                text = read_pptx(path)
            elif file_ext in ['.xlsx', '.xls']:
                text = read_excel(path)
            elif file_ext == '.csv':
                text = read_csv(path)
            elif file_ext == '.txt':
                text = Path(path).read_text(encoding='utf-8')
            else:
                results.append(f"Unsupported file format: {file_name}")
                continue

            token_count = count_tokens_text(text)
            total_tokens += token_count
            results.append(f"File: {file_name} - Token count: {token_count:,}")

        except Exception as e:
            # Best-effort per file: one unreadable upload must not abort
            # the whole batch, so report it and move on.
            results.append(f"Error processing {file_name}: {str(e)}")

    # Add total tokens to the beginning of results
    if total_tokens > 0:
        results.insert(0, f"\nTotal tokens across all files: {total_tokens:,}\n")
        results.insert(1, "-" * 50)  # Adding a separator line

    return "\n".join(results)

# Stylesheet handed to gr.Blocks(css=...) below.
# - Imports the Source Sans Pro font (weights 400 and 600) from Google Fonts
#   and applies it to the page body and the Gradio container.
# - The .output-text class is attached to the results textbox through
#   elem_classes and sets its font, size and line height.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Source Sans Pro', sans-serif !important;
}

.output-text {
    font-family: 'Source Sans Pro', monospace !important;
    font-size: 16px !important;
    line-height: 1.5 !important;
}
"""

# Build the Gradio interface: a heading, a multi-file upload box and a
# read-only style results area, styled with custom_css.
with gr.Blocks(css=custom_css) as iface:
    # Page heading and short usage hint. Note that process_files also
    # accepts .xls in addition to the formats listed here.
    gr.Markdown(
        """
        # 📚 Bulk Token Counter
        Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.
        """
    )
    
    with gr.Row():
        # file_count="multiple" lets the user select several files at once,
        # so process_files receives a collection of uploads.
        file_input = gr.File(
            file_count="multiple",
            label="Upload Files"
        )
    
    with gr.Row():
        # The "output-text" class links this textbox to the .output-text
        # rule defined in custom_css.
        output = gr.Textbox(
            label="Results",
            lines=10,
            elem_classes=["output-text"]
        )
    
    # Recompute the report whenever the upload selection changes and show
    # the returned text in the results textbox.
    file_input.change(
        fn=process_files,
        inputs=[file_input],
        outputs=[output]
    )

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()