sp / app.py
salomonsky's picture
Update app.py
5ed86e6 verified
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the pretrained BERT tokenizer and encoder at module load so that
# obtener_representaciones can reuse the same two instances on every call.
_CHECKPOINT_BERT = 'bert-base-uncased'
tokenizador_bert = BertTokenizer.from_pretrained(_CHECKPOINT_BERT)
modelo_bert = BertModel.from_pretrained(_CHECKPOINT_BERT)
def obtener_representaciones(texto):
    """Return BERT's last-layer hidden states for ``texto``.

    Args:
        texto: Text to embed.

    Returns:
        The ``last_hidden_state`` tensor produced by ``modelo_bert`` for a
        batch containing only ``texto``.
    """
    # Encode in a single call instead of the previous
    # encode -> decode -> tokenize -> convert_tokens_to_ids round trip, which
    # could only reproduce these ids (or drift from them if the decoded string
    # tokenized differently). Truncation keeps long inputs within the model's
    # maximum sequence length instead of failing inside the encoder.
    codificado = tokenizador_bert(texto, return_tensors='pt', truncation=True)
    with torch.no_grad():  # inference only; no gradients are needed
        salida = modelo_bert(**codificado)
    return salida.last_hidden_state
def calcular_similitud(representaciones):
    """Return the pairwise dot-product matrix of the rows, with a zero diagonal.

    Args:
        representaciones: Two-dimensional array-like (ndarray, DataFrame,
            list of lists, ...) holding one vector per row.

    Returns:
        A new square ``numpy.ndarray`` whose entry ``[i, j]`` is the dot
        product of rows ``i`` and ``j``. The diagonal (each row with itself)
        is set to 0. The products are not normalised, so this is not a cosine
        similarity.

    Raises:
        ValueError: If ``representaciones`` is not two-dimensional.
    """
    # Convert up front so inputs without a ``.T`` attribute (plain lists) work
    # and the result is always an ndarray that fill_diagonal can write to.
    matriz = np.asarray(representaciones)
    if matriz.ndim != 2:
        raise ValueError(
            f"representaciones must be 2-dimensional, got {matriz.ndim} dimension(s)"
        )
    similarity_matrix = np.dot(matriz, matriz.T)
    np.fill_diagonal(similarity_matrix, 0)
    return similarity_matrix
# Lazily created sentiment pipeline, shared by every analizar_sentimiento call.
_analizador_sentimientos = None


def _obtener_analizador_sentimientos():
    """Return the shared sentiment-analysis pipeline, building it on first use."""
    global _analizador_sentimientos
    if _analizador_sentimientos is None:
        _analizador_sentimientos = pipeline("sentiment-analysis")
    return _analizador_sentimientos


def analizar_sentimiento(texto):
    """Classify the sentiment of ``texto``.

    Args:
        texto: Text to classify.

    Returns:
        A ``(score, label)`` tuple taken from the first result returned by the
        sentiment-analysis pipeline.
    """
    # Building the pipeline loads a model, so reuse one instance instead of
    # constructing a new pipeline on every call.
    resultado = _obtener_analizador_sentimientos()(texto)
    primero = resultado[0]
    return primero['score'], primero['label']
def obtener_tamano_color_relevancia_esferas(df, relevancia_minima):
    """Compute one marker size and one marker colour per row of ``df``.

    The relevance of a row is the maximum over all of its columns except the
    last four, so ``df`` must carry exactly four trailing non-feature columns
    when this is called.

    Args:
        df: DataFrame whose columns, except the last four, are numeric
            features.
        relevancia_minima: Threshold; rows whose relevance is below it are
            coloured blue, all others red.

    Returns:
        A ``(sizes, colors)`` tuple of lists with one entry per row. Each size
        is ``2 + relevance * 7`` and each colour is an ``rgba(...)`` string.
    """
    # Take the row-wise maximum in one pass over the feature columns instead
    # of materialising every full (mixed-dtype) row with iloc inside a loop.
    relevancias = df.iloc[:, :-4].max(axis=1).tolist()
    sizes = [2 + relevancia * 7 for relevancia in relevancias]
    colors = [
        'rgba(0, 0, 255, 0.8)' if relevancia < relevancia_minima else 'rgba(255, 0, 0, 0.8)'
        for relevancia in relevancias
    ]
    return sizes, colors
def obtener_color_transparencia_lineas(df):
    """Compute the style of the line joining every pair of rows of ``df``.

    Pairs are visited in the order ``(0, 1), (0, 2), ..., (1, 2), ...``.
    Colours cycle through a fixed six-colour palette in that order.

    Args:
        df: DataFrame whose columns, except the last four, are numeric
            features.

    Returns:
        A list with one ``(color, width, opacity)`` tuple per pair, where
        ``width`` is the Euclidean distance between the two rows' feature
        vectors and ``opacity`` is always 0.6.
    """
    from itertools import combinations

    colores = ['green', 'blue', 'yellow', 'red', 'purple', 'orange']
    # Extract the feature matrix once instead of slicing two rows per pair.
    rasgos = df.iloc[:, :-4].to_numpy(dtype=float)
    # The previous version also ran sentiment analysis on both hashtags of
    # every pair and discarded the result; that dead (and expensive) work is
    # removed, so the returned tuples are unchanged.
    return [
        (
            colores[posicion % len(colores)],
            float(np.linalg.norm(rasgos[i] - rasgos[j])),
            0.6,
        )
        for posicion, (i, j) in enumerate(combinations(range(len(rasgos)), 2))
    ]
def generar_nube_palabras_3d(hashtags, relevancia_minima):
    """Render a 3-D scatter plot of the words in ``hashtags`` in Streamlit.

    Each whitespace-separated word is embedded with
    ``obtener_representaciones`` (mean over dimension 1), the embeddings are
    standardised and projected to three PCA components, and the result is
    drawn with Plotly: one labelled point per word, one line per pair of
    words, and one extra marker per word sized and coloured by relevance.

    Args:
        hashtags: Whitespace-separated words to plot.
        relevancia_minima: Relevance threshold forwarded to
            ``obtener_tamano_color_relevancia_esferas``.

    Returns:
        None. The figure is shown with ``st.plotly_chart``; when fewer than
        three words are supplied a warning is shown instead.
    """
    from itertools import combinations

    palabras = hashtags.split()
    # A three-component PCA needs at least three samples; stop with a
    # user-visible message instead of letting PCA (or np.vstack on an empty
    # list) raise.
    if len(palabras) < 3:
        st.warning("Se necesitan al menos 3 palabras para generar la nube.")
        return

    representaciones = [
        obtener_representaciones(palabra).mean(dim=1).squeeze().numpy()
        for palabra in palabras
    ]
    matriz = np.vstack(representaciones)
    # Name the columns after the actual embedding width rather than assuming 768.
    df = pd.DataFrame(matriz, columns=[f'Feature_{i}' for i in range(matriz.shape[1])])
    df['Hashtag'] = palabras

    # Standardise every column except the trailing 'Hashtag' one, then project.
    df_scaled = StandardScaler().fit_transform(df.iloc[:, :-1])
    pca_result = PCA(n_components=3).fit_transform(df_scaled)
    df['PCA1'], df['PCA2'], df['PCA3'] = pca_result[:, 0], pca_result[:, 1], pca_result[:, 2]

    # Both helpers slice off the last four columns (Hashtag, PCA1..PCA3) to
    # reach the features, so they must run before 'CuartaDim' is appended.
    sizes, colors = obtener_tamano_color_relevancia_esferas(df, relevancia_minima)
    color_and_opacity = obtener_color_transparencia_lineas(df)

    # Random value per word, used only to colour the labelled points.
    df['CuartaDim'] = np.random.rand(len(df))
    fig = px.scatter_3d(df, x='PCA1', y='PCA2', z='PCA3', text='Hashtag', color='CuartaDim',
                        title="Nube de Palabras 3D", opacity=0.9, width=1000, height=1000,
                        color_continuous_scale='Viridis')

    # One line per pair of words. The styles were produced in this same pair
    # order, so they are consumed in lockstep rather than with list.pop(0).
    pares = combinations(range(len(df)), 2)
    for (i, j), (color, width, opacity) in zip(pares, color_and_opacity):
        fig.add_trace(go.Scatter3d(x=[pca_result[i, 0], pca_result[j, 0]],
                                   y=[pca_result[i, 1], pca_result[j, 1]],
                                   z=[pca_result[i, 2], pca_result[j, 2]],
                                   mode='lines',
                                   line=dict(width=width, color=color),
                                   opacity=opacity))

    # One relevance marker per word, drawn at the same position as its point.
    for (x, y, z), size, color, palabra in zip(pca_result, sizes, colors, palabras):
        fig.add_trace(go.Scatter3d(x=[x],
                                   y=[y],
                                   z=[z],
                                   mode='markers',
                                   marker=dict(size=size, color=color, opacity=0.1),
                                   text=[palabra]))
    st.plotly_chart(fig)
def main():
    """Build the Streamlit page and generate the word cloud on request.

    Shows a keyword input and a minimum-relevance slider. Once a keyword has
    been entered and the button is pressed, a fill-mask pipeline proposes
    eight tokens to follow the keyword and those tokens are plotted with
    ``generar_nube_palabras_3d``.
    """
    st.title("Nube de Palabras 3D con BERT")
    # Strip the input so a whitespace-only keyword counts as empty instead of
    # producing a prompt that contains nothing but the mask token.
    palabra_clave = st.text_input("Ingrese una palabra clave").strip()
    relevancia_minima = st.slider("Seleccione la relevancia mínima", 0.0, 1.0, 0.5, step=0.01)

    # Short-circuit: the button is only created once a keyword is present.
    if not (palabra_clave and st.button("Generar Nube de Palabras")):
        return

    fill_mask = pipeline('fill-mask', model='bert-base-uncased')
    resultados = fill_mask(f"{palabra_clave} [MASK]", top_k=8)
    keywords_str = ' '.join(resultado['token_str'] for resultado in resultados)
    generar_nube_palabras_3d(keywords_str, relevancia_minima)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()