Spaces:

marcossuzuki
/

TCC_PoliUSPPro

Sleeping

App Files Files Community

marcossuzuki committed on May 30, 2025

Commit

4b035a2

1 Parent(s): 387bf78

optimize load dataframes

Browse files

Files changed (2) hide show

requirements.txt +2 -1
src/streamlit_app.py +83 -85

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ streamlit
 shap
 matplotlib
 huggingface_hub
-IPython

 shap
 matplotlib
 huggingface_hub
+IPython
+plotly

src/streamlit_app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-import altair as alt
 import numpy as np
 import pandas as pd
 import streamlit as st
@@ -11,53 +12,58 @@ fs = HfFileSystem()
 from datetime import datetime, timedelta, time
 import re
-arquivos_processados_bb=[]
-arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb1t24.save')
-arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb2t24.save')
-arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb3t24.save')
-arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb4t24.save')
-arquivos_processados_vale=[]
-arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale1t24.save')
-arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale2t24.save')
-arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale3t24.save')
-arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale4t24.save')
-arquivos_processados_petr=[]
-arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr1t24.save')
-arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr2t24.save')
-arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr3t24.save')
-arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr4t24.save')
-if 'bb' not in st.session_state:
-    st.session_state['bb'] = []
-    for path in arquivos_processados_bb:
-        with fs.open(path, 'rb') as inp:
-            st.session_state['bb'].append(pickle.load(inp))
-            inp.close()
-if 'petr' not in st.session_state:
-    st.session_state['petr'] = []
-    for path in arquivos_processados_petr:
-        with fs.open(path, 'rb') as inp:
-            st.session_state['petr'].append(pickle.load(inp))
-            inp.close()
-if 'vale' not in st.session_state:
-    st.session_state['vale'] = []
-    for path in arquivos_processados_vale:
-        with fs.open(path, 'rb') as inp:
-            st.session_state['vale'].append(pickle.load(inp))
-            inp.close()
-shap_values = {'bb':st.session_state['bb'], 'petr': st.session_state['petr'], 'vale':st.session_state['vale']}
 st.header("Sentimento da fala e Valores de Shapley")
 col1, col2, col3 = st.columns(3)
-empresa_dict = {'petr':'Petrobrás', 'vale':'Vale', 'bb':'Banco do Brasil'}
 empresa = col1.selectbox(
     "Qual empresa quer analisar: ",
     ("vale", "bb", "petr"),
@@ -71,61 +77,52 @@ trim = col2.number_input(
 text_num = col3.number_input(
     "Fala número:",
-    0, max_value = len(shap_values[empresa][trim])-1
 )
-st.text("Incluir total de tokens")
-st.text("Incluir gráfico de sentimentos por frase")
-def join_text(lista):
-    return np.array([''.join(x) for x in lista])
-def count_token(lista):
-    return np.array([len(x) for x in lista])
-def calc_score(empresa, trimestre, text_num, lista):
-    positive = lista[empresa][trimestre-1][text_num,:,'POSITIVE'].base_values + lista[empresa][trimestre-1][text_num,:,'POSITIVE'].values.sum()
-    negative = lista[empresa][trimestre-1][text_num,:,'NEGATIVE'].base_values + lista[empresa][trimestre-1][text_num,:,'NEGATIVE'].values.sum()
-    neutral = lista[empresa][trimestre-1][text_num,:,'NEUTRAL'].base_values + lista[empresa][trimestre-1][text_num,:,'NEUTRAL'].values.sum()
-    return positive, negative, neutral
-def calc_allscore(lista):
-    score = []
-    for x in lista:
-        positive = x[:,'POSITIVE'].base_values + x[:,'POSITIVE'].values.sum()
-        negative = x[:,'NEGATIVE'].base_values + x[:,'NEGATIVE'].values.sum()
-        neutral = x[:,'NEUTRAL'].base_values + x[:,'NEUTRAL'].values.sum()
-        score.append([positive, negative, neutral])
-    return score
 score_positive, score_negative, score_neutral = calc_score(empresa, trim, text_num, shap_values)
-scores = calc_allscore(shap_values[empresa][trim-1])
-texto_junto = join_text(shap_values[empresa][trim-1].data)
-token_qtde = count_token(shap_values[empresa][trim-1].data)
-df = np.stack((texto_junto, token_qtde), axis=-1)
-df = np.hstack((df, scores))
-df = pd.DataFrame(df, columns=['fala', 'qtde_tokens', 'positive_score', 'negative_score', 'neutral_score'])
-st.dataframe(df)
 proc_time=timedelta(seconds=shap_values[empresa][trim-1].compute_time)
 h,m,s = re.split(':', str(proc_time))
-#s, ms=re.split('.', str(s))
 st.text(f"Compute time: {h}h {m}m {s:.2}s")
 option_map = {'POSITIVE':f'Positivo: {score_positive:.4}', 'NEGATIVE':f'Negativo: {score_negative:.4}', 'NEUTRAL':f'Neutro: {score_neutral:.4}'}
-selection = st.segmented_control(
-    "Sentimento",
-    options=option_map.keys(),
-    default = 'POSITIVE',
     format_func=lambda option: option_map[option],
-    selection_mode="single",
 )
-fig = shap.plots.text(shap_values[empresa][trim-1][text_num, :, selection], display = False)
 components.html(fig, height=200, scrolling = True)
 #st.text(np.shape(shap_values[empresa][0][:,:].data))
@@ -134,9 +131,10 @@ st.header("Gráfico waterfall dos termos e Valores de Shapley")
 max_display = st.slider(
     "Máximo de exibição:",
-    1, max_value = len(shap_values[empresa][trim-1][text_num].data), value=int(len(shap_values[empresa][trim-1][text_num].data)/3)+1
 )
 fig2, ax = plt.subplots()
-shap.plots.waterfall(shap_values[empresa][trim-1][text_num, :, selection], show=False, max_display=max_display)
 st.pyplot(fig2)

+import plotly.graph_objects as go
+import plotly.express as px
 import numpy as np
 import pandas as pd
 import streamlit as st
 from datetime import datetime, timedelta, time
 import re
+def join_text(lista):
+    return np.array([''.join(x) for x in lista])
+def count_token(lista):
+    return np.array([len(x) for x in lista])
+def calc_score(empresa, trimestre, text_num, lista):
+    positive = lista[empresa][trimestre-1][text_num,:,'POSITIVE'].base_values + lista[empresa][trimestre-1][text_num,:,'POSITIVE'].values.sum()
+    negative = lista[empresa][trimestre-1][text_num,:,'NEGATIVE'].base_values + lista[empresa][trimestre-1][text_num,:,'NEGATIVE'].values.sum()
+    neutral = lista[empresa][trimestre-1][text_num,:,'NEUTRAL'].base_values + lista[empresa][trimestre-1][text_num,:,'NEUTRAL'].values.sum()
+    return positive, negative, neutral
+def calc_allscore(lista):
+    # score = []
+    base_values = pd.DataFrame(lista.base_values, columns=['POSITIVE', 'NEGATIVE', 'NEUTRAL'])
+    a = pd.DataFrame()
+    a['NEUTRAL'] = pd.DataFrame(lista[:,:,'NEUTRAL'].values, columns=['NEUTRAL'])['NEUTRAL'].apply(lambda x: x.sum()) + base_values['NEUTRAL']
+    a['POSITIVE'] = pd.DataFrame(lista[:,:,'POSITIVE'].values, columns=['POSITIVE'])['POSITIVE'].apply(lambda x: x.sum()) + base_values['POSITIVE']
+    a['NEGATIVE'] = pd.DataFrame(lista[:,:,'NEGATIVE'].values, columns=['NEGATIVE'])['NEGATIVE'].apply(lambda x: x.sum()) + base_values['NEGATIVE']
+    # for x in lista:
+    #     positive = x[:,'POSITIVE'].base_values + x[:,'POSITIVE'].values.sum()
+    #     negative = x[:,'NEGATIVE'].base_values + x[:,'NEGATIVE'].values.sum()
+    #     neutral = x[:,'NEUTRAL'].base_values + x[:,'NEUTRAL'].values.sum()
+    #     score.append([positive, negative, neutral])
+    return a
+pasta = {'vale':'VALE', 'petr':'Petrobras', 'bb':'BB'}
+for key, val in pasta.items():
+    if key not in st.session_state:
+        st.session_state[key] = []
+        st.session_state[f'df_{key}']=[]
+        for i in range(1,5):
+            arquivo=f'spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/{val}/valores_shap-{key}{i}t24.save'
+            with fs.open(arquivo, 'rb') as inp:
+                st.session_state[key].append(pickle.load(inp))
+                inp.close()
+            scores = calc_allscore(st.session_state[key][i-1])
+            texto_junto = join_text(st.session_state[key][i-1].data)
+            token_qtde = count_token(st.session_state[key][i-1].data)
+            df = np.stack((texto_junto, token_qtde), axis=-1)
+            df = np.hstack((df, scores))
+            df = pd.DataFrame(df, columns=['fala', 'qtde_tokens',  'neutral_score', 'positive_score', 'negative_score'])
+            st.session_state[f'df_{key}'].append(df)
+shap_values = {'bb':st.session_state['bb'], 'petr': st.session_state['petr'], 'vale':st.session_state['vale']}
 st.header("Sentimento da fala e Valores de Shapley")
 col1, col2, col3 = st.columns(3)
+empresa_dict = {'petr':'Petrobras', 'vale':'Vale', 'bb':'Banco do Brasil'}
 empresa = col1.selectbox(
     "Qual empresa quer analisar: ",
     ("vale", "bb", "petr"),
 text_num = col3.number_input(
     "Fala número:",
+    0, max_value = len(shap_values[empresa][trim-1])-1
 )
 score_positive, score_negative, score_neutral = calc_score(empresa, trim, text_num, shap_values)
+df=st.session_state[f'df_{empresa}'][trim-1]
+fig3 = go.Figure(go.Bar(x=df.index, y=df['negative_score'], name='Negativo'))
+fig3.add_trace(go.Bar(x=df.index, y=df['neutral_score'], name='Neutro'))
+fig3.add_trace(go.Bar(x=df.index, y=df['positive_score'], name='Positivo'))
+fig3.update_layout(barmode='stack',  title='Score por fala')
+tab1, tab2 = st.tabs(["Data Frame", "Gráfico"])
+with tab1:
+    # Per-utterance table: the column_config below only sets display labels and number formats.
+    # (st.dataframe takes no theme argument, so none is passed here.)
+    st.dataframe(df, column_config={'fala':'Fala',
+                                    'qtde_tokens':st.column_config.NumberColumn("Quantidade Tokens",
+                                                                                format='%d'),
+                                     'positive_score':st.column_config.NumberColumn("Score Positivo",),
+                                     'negative_score':st.column_config.NumberColumn("Score Negativo",),
+                                     'neutral_score':st.column_config.NumberColumn("Score Neutro",),
+                                    },)
+with tab2:
+    # st.plotly_chart applies the Streamlit theme by default; pass theme=None for the native Plotly theme.
+    st.plotly_chart(fig3)
+total_tokens = df['qtde_tokens'].astype('int64').sum()
 proc_time=timedelta(seconds=shap_values[empresa][trim-1].compute_time)
 h,m,s = re.split(':', str(proc_time))
+st.text(f"Total tokens: {total_tokens}")
 st.text(f"Compute time: {h}h {m}m {s:.2}s")
+#st.text(df.describe())
 option_map = {'POSITIVE':f'Positivo: {score_positive:.4}', 'NEGATIVE':f'Negativo: {score_negative:.4}', 'NEUTRAL':f'Neutro: {score_neutral:.4}'}
+sentimento = st.radio(
+    "**Sentimento**",
+    option_map.keys(),
+    horizontal=True,
+    # default = 'POSITIVE',
     format_func=lambda option: option_map[option],
+    # selection_mode="single",
 )
+fig = shap.plots.text(shap_values[empresa][trim-1][text_num, :, sentimento], display = False)
 components.html(fig, height=200, scrolling = True)
 #st.text(np.shape(shap_values[empresa][0][:,:].data))
 max_display = st.slider(
     "Máximo de exibição:",
+    1, max_value = int(df['qtde_tokens'][text_num]),
+                    value=int(int(df['qtde_tokens'][text_num])/3)+1
 )
 fig2, ax = plt.subplots()
+shap.plots.waterfall(shap_values[empresa][trim-1][text_num, :, sentimento], show=False, max_display=max_display)
 st.pyplot(fig2)