import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertModel
import dash
from dash import dcc, html
from pdfminer.high_level import extract_text
import json
import yaml

# Define the Dash app
dash_app = dash.Dash(__name__)


# Read PDF files
def extract_text_from_pdf(uploaded_file):
    # pdfminer's extract_text expects a path or file-like object,
    # so the upload is written to a temporary file first
    with open("temp.pdf", "wb") as f:
        f.write(uploaded_file.read())
    text = extract_text("temp.pdf")
    return text


# Read TXT files
def read_txt(uploaded_file):
    return uploaded_file.getvalue().decode("utf-8")


# Parse YAML files
def read_yaml(uploaded_file):
    return yaml.safe_load(uploaded_file)


# Parse JSON files
def read_json(uploaded_file):
    # Decode the uploaded bytes before parsing
    return json.loads(uploaded_file.getvalue().decode("utf-8"))


# Tokenize the text and produce embedding vectors with BERT
def process_text(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    model = TFBertModel.from_pretrained('bert-base-uncased')
    outputs = model(**inputs)
    # Shape: (batch, sequence_length, hidden_size)
    return outputs.last_hidden_state


# Main interface
def main():
    st.title("Token and Text Vector Counter 📊📑")
    st.markdown("Drag your file below to start processing:")

    # File selector
    uploaded_file = st.file_uploader("Choose a PDF, TXT, YAML or JSON file", type=["pdf", "txt", "yaml", "json"])

    if uploaded_file is not None:
        file_type = uploaded_file.name.split('.')[-1].lower()

        if file_type == "pdf":
            st.write("Processing PDF...")
            text = extract_text_from_pdf(uploaded_file)
        elif file_type == "txt":
            st.write("Processing TXT...")
            text = read_txt(uploaded_file)
        elif file_type == "yaml":
            st.write("Processing YAML...")
            text = str(read_yaml(uploaded_file))
        elif file_type == "json":
            st.write("Processing JSON...")
            text = json.dumps(read_json(uploaded_file), indent=4)

        # Show the raw text
        st.text_area("Raw Text:", text, height=300)

        # Token count (Keras Tokenizer)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts([text])
        tokens = tokenizer.texts_to_sequences([text])[0]
        st.write(f"Total Tokens: {len(tokens)}")

        # Vector count (sequence length of BERT's last hidden state)
        vectors = process_text(text)
        st.write(f"Total Vectors: {vectors.shape[1]}")

        # Build the Dash graph
        dash_app.layout = html.Div([
            dcc.Graph(
                id='tokens-graph',
                figure={
                    'data': [
                        {'x': list(range(len(tokens))), 'y': tokens, 'type': 'line', 'name': 'Tokens'},
                    ],
                    'layout': {
                        'title': 'Token Count 🧩'
                    }
                }
            )
        ])

        st.write("🔄 Starting real-time processing...")

        # Start the Dash server; note that this call blocks the Streamlit script
        dash_app.run_server(port=8050, use_reloader=False, debug=True)


if __name__ == "__main__":
    main()