# Hugging Face Spaces app: Streamlit front-end with an embedded Dash chart
# that counts tokens and BERT vectors for uploaded PDF/TXT/YAML/JSON files.
# UI framework
import streamlit as st
# Data handling
import pandas as pd
import numpy as np
# Tokenization / embeddings
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertModel
# Embedded Dash app used for the token chart
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
# File decoding / parsing
import base64
import pdfminer
from pdfminer.high_level import extract_text
import json
import yaml
# Stdlib utilities
import os
import time
# Module-level Dash app; main() assigns its layout and starts its server.
dash_app = dash.Dash(__name__)
# Read an uploaded PDF and return its full extracted text.
def extract_text_from_pdf(uploaded_file):
    """Extract all text from an uploaded PDF file-like object.

    The original implementation dumped the upload to a hard-coded
    "temp.pdf" in the working directory, never deleted it, and raced
    between concurrent sessions. pdfminer's ``extract_text`` accepts a
    binary file-like object directly, so the stream is passed straight in.
    """
    return extract_text(uploaded_file)
# Read an uploaded plain-text file.
def read_txt(uploaded_file):
    """Return the uploaded file's content decoded as UTF-8 text."""
    raw_bytes = uploaded_file.getvalue()
    return raw_bytes.decode("utf-8")
# Parse an uploaded YAML document.
def read_yaml(uploaded_file):
    """Safely parse YAML from an uploaded stream and return the result."""
    parsed = yaml.safe_load(uploaded_file)
    return parsed
# Parse an uploaded JSON document.
def read_json(uploaded_file):
    """Parse JSON from an uploaded file-like object (or raw str/bytes).

    Bug fix: the caller passes a file-like upload object, but
    ``json.loads`` only accepts ``str``/``bytes``/``bytearray`` and would
    raise ``TypeError``. File-likes are now routed through ``json.load``;
    raw strings/bytes still go through ``json.loads`` for compatibility.
    """
    if hasattr(uploaded_file, "read"):
        return json.load(uploaded_file)
    return json.loads(uploaded_file)
# Lazily-initialized BERT components: loading the tokenizer files and the
# full model weights is expensive, so do it once per process instead of on
# every call (the original reloaded both for each uploaded file).
_BERT_CACHE = {}

def _get_bert():
    """Return a cached ``(tokenizer, model)`` pair, loading on first use."""
    if not _BERT_CACHE:
        _BERT_CACHE["tokenizer"] = BertTokenizer.from_pretrained('bert-base-uncased')
        _BERT_CACHE["model"] = TFBertModel.from_pretrained('bert-base-uncased')
    return _BERT_CACHE["tokenizer"], _BERT_CACHE["model"]

# Turn raw text into BERT token embeddings.
def process_text(text):
    """Tokenize *text* with BERT and return the last hidden states.

    Input is truncated/padded to at most 512 tokens (``max_length=512``).
    Returns ``outputs.last_hidden_state`` — presumably shaped
    (batch=1, seq_len, hidden_size); callers read ``shape[1]``.
    """
    tokenizer, model = _get_bert()
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state
# Main Streamlit interface.
def main():
    """Streamlit UI: upload a document, show its raw text, then token and
    BERT-vector counts, plus a Dash line chart of the token id sequence."""
    st.title("Contador de Tokens e Vetores de Texto 📊📑")
    st.markdown("Arraste seu arquivo abaixo para iniciar o processamento:")

    # File picker (Streamlit already filters on these extensions).
    uploaded_file = st.file_uploader("Escolha um arquivo PDF, TXT, YAML ou JSON", type=["pdf", "txt", "yaml", "json"])
    if uploaded_file is None:
        return

    file_type = uploaded_file.name.split('.')[-1].lower()
    if file_type == "pdf":
        st.write("Processando PDF...")
        text = extract_text_from_pdf(uploaded_file)
    elif file_type == "txt":
        st.write("Processando TXT...")
        text = read_txt(uploaded_file)
    elif file_type == "yaml":
        st.write("Processando YAML...")
        text = str(read_yaml(uploaded_file))
    elif file_type == "json":
        st.write("Processando JSON...")
        text = json.dumps(read_json(uploaded_file), indent=4)
    else:
        # Bug fix: without this guard, `text` was unbound below (NameError)
        # if an unexpected extension ever slipped past the uploader filter.
        st.error(f"Tipo de arquivo não suportado: {file_type}")
        return

    # Raw extracted text.
    st.text_area("Texto Bruto:", text, height=300)

    # Token count via a Keras tokenizer fitted on this text only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    tokens = tokenizer.texts_to_sequences([text])[0]
    st.write(f"Total de Tokens: {len(tokens)}")

    # Vector count: size of axis 1 of the BERT hidden-state tensor.
    vectors = process_text(text)
    st.write(f"Total de Vetores: {vectors.shape[1]}")

    # Dash chart of the token id sequence.
    dash_app.layout = html.Div([
        dcc.Graph(
            id='tokens-graph',
            figure={
                'data': [
                    {'x': list(range(len(tokens))), 'y': tokens, 'type': 'line', 'name': 'Tokens'},
                ],
                'layout': {
                    'title': 'Contagem de Tokens 🧩'
                }
            }
        )
    ])

    st.write("🔄 Iniciando o processamento em tempo real...")
    # NOTE(review): run_server blocks the Streamlit script thread while Dash
    # serves on :8050, and debug=True is unsuitable for production — confirm
    # this embedded-server design is intentional.
    dash_app.run_server(port=8050, use_reloader=False, debug=True)
# Script entry point.
if __name__ == "__main__":
    main()