Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tiktoken | |
| import pandas as pd | |
| import PyPDF2 | |
| import docx | |
| import pptx | |
| import openpyxl | |
| from pathlib import Path | |
| import csv | |
| import io | |
def get_encoding():
    """Return the tiktoken encoding used for every token count in this file.

    Returns:
        The encoding object that ``tiktoken.get_encoding`` provides for
        ``cl100k_base``.
    """
    encoding_name = "cl100k_base"
    return tiktoken.get_encoding(encoding_name)
def count_tokens_text(text):
    """Count the tokens in ``text`` using the encoding from ``get_encoding``.

    Args:
        text: The string to tokenize. ``None`` or an empty string counts as
            zero tokens.

    Returns:
        The number of tokens, as an int.
    """
    if not text:
        # Nothing to tokenize; this also avoids a TypeError on None.
        return 0
    # Uploaded documents are arbitrary text. With tiktoken's default
    # disallowed_special="all", a literal special-token marker such as
    # "<|endoftext|>" inside the document makes encode() raise ValueError.
    # An empty tuple makes such markers encode as ordinary text instead.
    return len(get_encoding().encode(text, disallowed_special=()))
def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF source handed straight to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages in order, each page followed by a
        newline.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join(page.extract_text() + "\n" for page in reader.pages)
def read_docx(file):
    """Extract the paragraph text of a Word document.

    Args:
        file: The document source handed straight to ``docx.Document``.

    Returns:
        The text of every entry in ``doc.paragraphs``, each followed by a
        newline.
    """
    document = docx.Document(file)
    return "".join(para.text + "\n" for para in document.paragraphs)
def read_pptx(file):
    """Extract the text of every text-bearing shape in a presentation.

    Args:
        file: The presentation source handed straight to
            ``pptx.Presentation``.

    Returns:
        The text of each shape that exposes a ``text`` attribute, in slide
        order, each followed by a newline.
    """
    deck = pptx.Presentation(file)
    return "".join(
        shape.text + "\n"
        for slide in deck.slides
        for shape in slide.shapes
        # Shapes without a ``text`` attribute are skipped.
        if hasattr(shape, "text")
    )
def read_excel(file):
    """Render every worksheet of an Excel workbook as plain text.

    Args:
        file: The workbook source handed straight to ``pd.read_excel``.

    Returns:
        The ``DataFrame.to_string()`` rendering of each worksheet, in
        workbook order, joined by newlines. For a single-sheet workbook this
        is exactly that sheet's rendering.
    """
    # sheet_name=None makes pandas return a {sheet name: DataFrame} mapping
    # covering all sheets. The default (sheet 0) would silently drop every
    # sheet after the first and under-count the workbook.
    sheets = pd.read_excel(file, sheet_name=None)
    return "\n".join(frame.to_string() for frame in sheets.values())
def read_csv(file):
    """Render a CSV file as plain text.

    Args:
        file: The CSV source handed straight to ``pd.read_csv``.

    Returns:
        The ``DataFrame.to_string()`` rendering of the parsed table.
    """
    return pd.read_csv(file).to_string()
def process_files(files):
    """Count tokens for each uploaded file and build a plain-text report.

    Args:
        files: Iterable of uploads, or ``None``/empty when nothing is
            selected. Each item is either an object whose ``name`` attribute
            holds the file's path, or a plain path string.

    Returns:
        A newline-joined report with one line per file (its token count, an
        "Unsupported file format" notice, or an error message). When at
        least one token was counted, the report starts with a grand-total
        line and a separator. Returns an empty string when there are no
        files.
    """
    if not files:
        # No uploads (e.g. the upload widget was cleared): nothing to
        # iterate, so return an empty report instead of raising.
        return ""

    results = []
    total_tokens = 0

    for file in files:
        # Resolve the path up front so the error message below never has to
        # touch an attribute that might be missing.
        source = getattr(file, "name", file)
        try:
            path = Path(source)
            file_ext = path.suffix.lower()
            file_name = path.name
            # Readers get the path rather than the upload object: each
            # underlying library can open a path itself, so nothing depends
            # on the upload being an open, correctly positioned binary handle.
            location = str(path)

            if file_ext == '.pdf':
                text = read_pdf(location)
            elif file_ext == '.docx':
                text = read_docx(location)
            elif file_ext == '.pptx':
                text = read_pptx(location)
            elif file_ext in ('.xlsx', '.xls'):
                text = read_excel(location)
            elif file_ext == '.csv':
                text = read_csv(location)
            elif file_ext == '.txt':
                # Decode the raw bytes so line endings are counted as stored.
                text = path.read_bytes().decode('utf-8')
            else:
                results.append(f"Unsupported file format: {file_name}")
                continue

            token_count = count_tokens_text(text)
            total_tokens += token_count
            results.append(f"File: {file_name} - Token count: {token_count:,}")
        except Exception as e:
            # Best effort per file: report the failure and keep going so one
            # bad upload does not hide the counts of the others.
            results.append(f"Error processing {source}: {e}")

    # Put the grand total (and a separator) ahead of the per-file lines.
    if total_tokens > 0:
        results.insert(0, f"\nTotal tokens across all files: {total_tokens:,}\n")
        results.insert(1, "-" * 50)

    return "\n".join(results)
# Stylesheet passed to gr.Blocks(css=...). It imports the Source Sans Pro
# web font from Google Fonts, applies it to the page body and the Gradio
# container, and sets the font, size and line height for elements carrying
# the "output-text" class (the results textbox uses that class).
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap');
body, .gradio-container {
font-family: 'Source Sans Pro', sans-serif !important;
}
.output-text {
font-family: 'Source Sans Pro', monospace !important;
font-size: 16px !important;
line-height: 1.5 !important;
}
"""
# Assemble the Gradio UI: a heading, a multi-file upload widget, and a
# textbox that shows the report produced by process_files.
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
# 📚 Bulk Token Counter
Upload multiple files (PDF, DOCX, PPTX, XLSX, CSV, TXT) to count their tokens.
"""
    )

    with gr.Row():
        file_input = gr.File(file_count="multiple", label="Upload Files")

    with gr.Row():
        output = gr.Textbox(label="Results", lines=10, elem_classes=["output-text"])

    # Recompute the report whenever the uploaded file selection changes.
    file_input.change(fn=process_files, inputs=[file_input], outputs=[output])

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()