# Hugging Face Spaces app: Streamlit front-end with an embedded Dash chart
# that counts tokens and BERT vectors for uploaded PDF/TXT/YAML/JSON files.
# UI framework
import streamlit as st
# Data handling
import pandas as pd
import numpy as np
# Tokenization / embeddings
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertModel
# Embedded Dash app used for the token chart
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
# File decoding / parsing
import base64
import pdfminer
from pdfminer.high_level import extract_text
import json
import yaml
# Stdlib utilities
import os
import time
# Module-level Dash app; main() assigns its layout and starts its server.
dash_app = dash.Dash(__name__)
# Read an uploaded PDF and return its full extracted text.
def extract_text_from_pdf(uploaded_file):
    """Extract all text from an uploaded PDF file-like object.

    The original implementation dumped the upload to a hard-coded
    "temp.pdf" in the working directory, never deleted it, and raced
    between concurrent sessions. pdfminer's ``extract_text`` accepts a
    binary file-like object directly, so the stream is passed straight in.
    """
    return extract_text(uploaded_file)
# Read an uploaded plain-text file.
def read_txt(uploaded_file):
    """Return the uploaded file's content decoded as UTF-8 text."""
    raw_bytes = uploaded_file.getvalue()
    return raw_bytes.decode("utf-8")
# Parse an uploaded YAML document.
def read_yaml(uploaded_file):
    """Safely parse YAML from an uploaded stream and return the result."""
    parsed = yaml.safe_load(uploaded_file)
    return parsed
# Parse an uploaded JSON document.
def read_json(uploaded_file):
    """Parse JSON from an uploaded file-like object (or raw str/bytes).

    Bug fix: the caller passes a file-like upload object, but
    ``json.loads`` only accepts ``str``/``bytes``/``bytearray`` and would
    raise ``TypeError``. File-likes are now routed through ``json.load``;
    raw strings/bytes still go through ``json.loads`` for compatibility.
    """
    if hasattr(uploaded_file, "read"):
        return json.load(uploaded_file)
    return json.loads(uploaded_file)
# Lazily-initialized BERT components: loading the tokenizer files and the
# full model weights is expensive, so do it once per process instead of on
# every call (the original reloaded both for each uploaded file).
_BERT_CACHE = {}

def _get_bert():
    """Return a cached ``(tokenizer, model)`` pair, loading on first use."""
    if not _BERT_CACHE:
        _BERT_CACHE["tokenizer"] = BertTokenizer.from_pretrained('bert-base-uncased')
        _BERT_CACHE["model"] = TFBertModel.from_pretrained('bert-base-uncased')
    return _BERT_CACHE["tokenizer"], _BERT_CACHE["model"]

# Turn raw text into BERT token embeddings.
def process_text(text):
    """Tokenize *text* with BERT and return the last hidden states.

    Input is truncated/padded to at most 512 tokens (``max_length=512``).
    Returns ``outputs.last_hidden_state`` — presumably shaped
    (batch=1, seq_len, hidden_size); callers read ``shape[1]``.
    """
    tokenizer, model = _get_bert()
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state
# Main Streamlit interface.
def main():
    """Streamlit UI: upload a document, show its raw text, then token and
    BERT-vector counts, plus a Dash line chart of the token id sequence."""
    st.title("Contador de Tokens e Vetores de Texto 📊📑")
    st.markdown("Arraste seu arquivo abaixo para iniciar o processamento:")

    # File picker (Streamlit already filters on these extensions).
    uploaded_file = st.file_uploader("Escolha um arquivo PDF, TXT, YAML ou JSON", type=["pdf", "txt", "yaml", "json"])
    if uploaded_file is None:
        return

    file_type = uploaded_file.name.split('.')[-1].lower()
    if file_type == "pdf":
        st.write("Processando PDF...")
        text = extract_text_from_pdf(uploaded_file)
    elif file_type == "txt":
        st.write("Processando TXT...")
        text = read_txt(uploaded_file)
    elif file_type == "yaml":
        st.write("Processando YAML...")
        text = str(read_yaml(uploaded_file))
    elif file_type == "json":
        st.write("Processando JSON...")
        text = json.dumps(read_json(uploaded_file), indent=4)
    else:
        # Bug fix: without this guard, `text` was unbound below (NameError)
        # if an unexpected extension ever slipped past the uploader filter.
        st.error(f"Tipo de arquivo não suportado: {file_type}")
        return

    # Raw extracted text.
    st.text_area("Texto Bruto:", text, height=300)

    # Token count via a Keras tokenizer fitted on this text only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    tokens = tokenizer.texts_to_sequences([text])[0]
    st.write(f"Total de Tokens: {len(tokens)}")

    # Vector count: size of axis 1 of the BERT hidden-state tensor.
    vectors = process_text(text)
    st.write(f"Total de Vetores: {vectors.shape[1]}")

    # Dash chart of the token id sequence.
    dash_app.layout = html.Div([
        dcc.Graph(
            id='tokens-graph',
            figure={
                'data': [
                    {'x': list(range(len(tokens))), 'y': tokens, 'type': 'line', 'name': 'Tokens'},
                ],
                'layout': {
                    'title': 'Contagem de Tokens 🧩'
                }
            }
        )
    ])

    st.write("🔄 Iniciando o processamento em tempo real...")
    # NOTE(review): run_server blocks the Streamlit script thread while Dash
    # serves on :8050, and debug=True is unsuitable for production — confirm
    # this embedded-server design is intentional.
    dash_app.run_server(port=8050, use_reloader=False, debug=True)
# Script entry point.
if __name__ == "__main__":
    main()