# app.py — Streamlit/Dash token & vector counter
# (uploaded by chaos4455, commit 8294205, verified)
# Standard library
import base64
import json
import os
import tempfile
import time
from functools import lru_cache

# Third-party
import dash
import numpy as np
import pandas as pd
import pdfminer
import streamlit as st
import tensorflow as tf
import yaml
from dash import dcc, html
from dash.dependencies import Input, Output
from pdfminer.high_level import extract_text
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertModel
# Dash application used later to render the token chart (served on port 8050).
dash_app = dash.Dash(__name__)
# Read uploaded PDF files.
def extract_text_from_pdf(uploaded_file):
    """Extract and return the raw text of an uploaded PDF file.

    The PDF bytes are spooled to a temporary file because pdfminer's
    ``extract_text`` expects a filesystem path.  The original version
    wrote a fixed ``temp.pdf`` into the current directory and never
    removed it; a proper temp file is used and cleaned up here.

    Args:
        uploaded_file: file-like object with a ``read()`` method
            returning the PDF bytes (e.g. a Streamlit UploadedFile).

    Returns:
        str: the text content extracted by pdfminer.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.read())
        tmp_path = tmp.name
    try:
        return extract_text(tmp_path)
    finally:
        # Always remove the temp file, even if extraction raises.
        os.remove(tmp_path)
# Read uploaded TXT files.
def read_txt(uploaded_file):
    """Return the contents of an uploaded text file decoded as UTF-8."""
    raw_bytes = uploaded_file.getvalue()
    return raw_bytes.decode("utf-8")
# Parse uploaded YAML files.
def read_yaml(uploaded_file):
    """Parse an uploaded YAML file and return the resulting Python object."""
    parsed = yaml.safe_load(uploaded_file)
    return parsed
# Parse uploaded JSON files.
def read_json(uploaded_file):
    """Parse an uploaded JSON file and return the resulting Python object.

    Bug fix: ``json.loads`` requires a ``str``/``bytes`` payload and
    raises ``TypeError`` when handed a file-like object, which is what
    the caller passes in.  ``json.load`` reads directly from the stream.

    Args:
        uploaded_file: readable file-like object containing JSON text.

    Returns:
        The deserialized Python object (dict, list, str, ...).
    """
    return json.load(uploaded_file)
# Turn text into token embeddings with BERT.
@lru_cache(maxsize=1)
def _load_bert():
    """Load and cache the BERT tokenizer/model pair (downloaded once)."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = TFBertModel.from_pretrained('bert-base-uncased')
    return tokenizer, model


def process_text(text):
    """Encode *text* with bert-base-uncased and return its embeddings.

    Performance fix: the original re-downloaded/re-instantiated both the
    tokenizer and the model on every call; they are now loaded once and
    cached by ``_load_bert``.

    Args:
        text: the input string; truncated/padded to 512 tokens.

    Returns:
        tf.Tensor: ``outputs.last_hidden_state`` — per-token embeddings
        of shape (1, seq_len, hidden_size).
    """
    tokenizer, model = _load_bert()
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state
# Main interface.
def main():
    """Streamlit entry point: upload a file, count its tokens, embed it
    with BERT and render a token chart through the embedded Dash app."""
    st.title("Contador de Tokens e Vetores de Texto 📊📑")
    st.markdown("Arraste seu arquivo abaixo para iniciar o processamento:")

    # File selector — Streamlit only offers these four extensions.
    uploaded_file = st.file_uploader("Escolha um arquivo PDF, TXT, YAML ou JSON", type=["pdf", "txt", "yaml", "json"])
    if uploaded_file is None:
        return  # wait for an upload before doing any work

    file_type = uploaded_file.name.split('.')[-1].lower()
    if file_type == "pdf":
        st.write("Processando PDF...")
        text = extract_text_from_pdf(uploaded_file)
    elif file_type == "txt":
        st.write("Processando TXT...")
        text = read_txt(uploaded_file)
    elif file_type == "yaml":
        st.write("Processando YAML...")
        text = str(read_yaml(uploaded_file))
    elif file_type == "json":
        st.write("Processando JSON...")
        text = json.dumps(read_json(uploaded_file), indent=4)
    else:
        # Bug fix: previously `text` was left unbound here, so any
        # extension outside the four cases above raised a NameError.
        st.error(f"Tipo de arquivo não suportado: {file_type}")
        return

    # Show the raw text.
    st.text_area("Texto Bruto:", text, height=300)

    # Token count using the Keras word-level tokenizer.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    tokens = tokenizer.texts_to_sequences([text])[0]
    st.write(f"Total de Tokens: {len(tokens)}")

    # BERT embeddings; shape[1] is the (truncated/padded) sequence length,
    # i.e. the number of embedding vectors produced for the text.
    vectors = process_text(text)
    st.write(f"Total de Vetores: {vectors.shape[1]}")

    # Token chart served by the Dash app.
    dash_app.layout = html.Div([
        dcc.Graph(
            id='tokens-graph',
            figure={
                'data': [
                    {'x': list(range(len(tokens))), 'y': tokens, 'type': 'line', 'name': 'Tokens'},
                ],
                'layout': {
                    'title': 'Contagem de Tokens 🧩'
                }
            }
        )
    ])
    st.write("🔄 Iniciando o processamento em tempo real...")
    # NOTE(review): run_server() blocks the Streamlit script on port 8050 —
    # confirm this is intentional; running Dash in a separate process or
    # thread is the usual pattern when combining it with Streamlit.
    dash_app.run_server(port=8050, use_reloader=False, debug=True)
# Script entry point.
if __name__ == "__main__":
    main()