import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Heavy models are loaded once at import time so Streamlit reruns reuse them.
modelo_bert = BertModel.from_pretrained('bert-base-uncased')
tokenizador_bert = BertTokenizer.from_pretrained('bert-base-uncased')

# FIX: the original built a fresh sentiment-analysis pipeline on EVERY call to
# analizar_sentimiento() (which ran inside an O(n^2) loop). Create it lazily,
# exactly once, and share it.
_analizador_sentimientos = None


def _obtener_analizador_sentimientos():
    """Return the process-wide sentiment-analysis pipeline, creating it once."""
    global _analizador_sentimientos
    if _analizador_sentimientos is None:
        _analizador_sentimientos = pipeline("sentiment-analysis")
    return _analizador_sentimientos


def obtener_representaciones(texto):
    """Return BERT's last hidden states for ``texto``.

    FIX: the original did a ``tokenize(decode(encode(texto)))`` round trip and
    assembled the input tensor by hand. Calling the tokenizer directly yields
    the same ids (special tokens included) as a ready-made batch tensor.

    Args:
        texto: input string (a single word/hashtag in this app).

    Returns:
        torch.Tensor of shape (1, seq_len, 768) — ``last_hidden_state``.
    """
    entradas = tokenizador_bert(texto, return_tensors='pt')
    with torch.no_grad():
        salidas = modelo_bert(entradas['input_ids'])
    return salidas.last_hidden_state


def calcular_similitud(representaciones):
    """Dot-product similarity matrix of row vectors, with the diagonal
    (self-similarity) zeroed out.

    Args:
        representaciones: 2-D array-like, one embedding per row.

    Returns:
        np.ndarray of shape (n, n).
    """
    similarity_matrix = np.dot(representaciones, representaciones.T)
    np.fill_diagonal(similarity_matrix, 0)
    return similarity_matrix


def analizar_sentimiento(texto):
    """Run sentiment analysis on ``texto`` via the shared pipeline.

    Returns:
        (score, label) tuple from the top pipeline result.
    """
    resultado = _obtener_analizador_sentimientos()(texto)
    return resultado[0]['score'], resultado[0]['label']


def obtener_tamano_color_relevancia_esferas(df, relevancia_minima):
    """Compute per-row marker sizes and colors for the 3-D scatter.

    Relevance is the max over the 768 feature columns (``iloc[:, :-4]``
    excludes the trailing Hashtag/PCA1/PCA2/PCA3 columns). Rows below
    ``relevancia_minima`` are drawn blue, the rest red.

    Returns:
        (sizes, colors) — two lists parallel to ``df``'s rows.
    """
    sizes = []
    colors = []
    for i in range(len(df)):
        relevance = np.max(df.iloc[i, :-4])
        sizes.append(2 + relevance * 7)
        colors.append('rgba(0, 0, 255, 0.8)' if relevance < relevancia_minima
                      else 'rgba(255, 0, 0, 0.8)')
    return sizes, colors


def obtener_color_transparencia_lineas(df):
    """Return one (color, width, opacity) triple per unordered row pair.

    Width is the Euclidean distance between the pair's 768-dim feature
    vectors; colors cycle through a fixed palette; opacity is constant.

    FIX: the original also ran the sentiment pipeline twice per pair into a
    ``sentimiento_promedio`` variable that was never used — O(n^2) model
    inferences with zero effect on the return value. Removed.
    """
    colores = ['green', 'blue', 'yellow', 'red', 'purple', 'orange']
    color_and_opacity = []
    color_index = 0
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            relevance = np.linalg.norm(df.iloc[i, :-4] - df.iloc[j, :-4])
            color_and_opacity.append((colores[color_index], relevance, 0.6))
            color_index = (color_index + 1) % len(colores)
    return color_and_opacity


def generar_nube_palabras_3d(hashtags, relevancia_minima):
    """Embed each whitespace-separated word of ``hashtags`` with BERT,
    project to 3-D via standardized PCA, and render an interactive Plotly
    word cloud (spheres + pairwise connecting lines) in Streamlit.

    Args:
        hashtags: whitespace-separated words to plot.
        relevancia_minima: threshold steering sphere colors.
    """
    palabras = hashtags.split()
    representaciones = [obtener_representaciones(p).mean(dim=1).squeeze().numpy()
                        for p in palabras]
    df = pd.DataFrame(np.vstack(representaciones),
                      columns=[f'Feature_{i}' for i in range(768)])
    df['Hashtag'] = palabras

    # Standardize the 768 features, then reduce to 3 PCA components.
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df.iloc[:, :-1])
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(df_scaled)
    df['PCA1'], df['PCA2'], df['PCA3'] = (pca_result[:, 0],
                                          pca_result[:, 1],
                                          pca_result[:, 2])

    # FIX: the original computed calcular_similitud(df.iloc[:, :-4]) here and
    # never used the result — dead O(n^2 * 768) work. Removed.
    sizes, colors = obtener_tamano_color_relevancia_esferas(df, relevancia_minima)
    color_and_opacity = obtener_color_transparencia_lineas(df)

    # Fourth "dimension" is random noise, used only to color the base scatter.
    df['CuartaDim'] = np.random.rand(len(df))
    fig = px.scatter_3d(df, x='PCA1', y='PCA2', z='PCA3', text='Hashtag',
                        color='CuartaDim', title="Nube de Palabras 3D",
                        opacity=0.9, width=1000, height=1000,
                        color_continuous_scale='Viridis')

    # One line trace per pair, consuming the precomputed styles in order.
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            color, width, opacity = color_and_opacity.pop(0)
            fig.add_trace(go.Scatter3d(
                x=[df.iloc[i]['PCA1'], df.iloc[j]['PCA1']],
                y=[df.iloc[i]['PCA2'], df.iloc[j]['PCA2']],
                z=[df.iloc[i]['PCA3'], df.iloc[j]['PCA3']],
                mode='lines',
                line=dict(width=width, color=color),
                opacity=opacity))

    # One semi-transparent sphere per word.
    for i in range(len(df)):
        fig.add_trace(go.Scatter3d(
            x=[df.iloc[i]['PCA1']],
            y=[df.iloc[i]['PCA2']],
            z=[df.iloc[i]['PCA3']],
            mode='markers',
            marker=dict(size=sizes[i], color=colors[i], opacity=0.1),
            text=[df.iloc[i]['Hashtag']]))

    st.plotly_chart(fig)


def main():
    """Streamlit entry point: ask for a keyword, expand it with BERT's
    fill-mask head, and plot the suggested words as a 3-D cloud."""
    st.title("Nube de Palabras 3D con BERT")
    palabra_clave = st.text_input("Ingrese una palabra clave")
    relevancia_minima = st.slider("Seleccione la relevancia mínima",
                                  0.0, 1.0, 0.5, step=0.01)
    if palabra_clave and st.button("Generar Nube de Palabras"):
        fill_mask = pipeline('fill-mask', model='bert-base-uncased')
        resultados = fill_mask(f"{palabra_clave} [MASK]", top_k=8)
        keywords = [resultado['token_str'] for resultado in resultados]
        # FIX: the original joined with a string containing a stray embedded
        # newline; the consumer splits on whitespace, so a single space is
        # the clear intent.
        generar_nube_palabras_3d(' '.join(keywords), relevancia_minima)


if __name__ == "__main__":
    main()