marcossuzuki committed on
Commit
4b035a2
·
1 Parent(s): 387bf78

optimize load dataframes

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. src/streamlit_app.py +83 -85
requirements.txt CHANGED
@@ -4,4 +4,5 @@ streamlit
4
  shap
5
  matplotlib
6
  huggingface_hub
7
- IPython
 
 
4
  shap
5
  matplotlib
6
  huggingface_hub
7
+ IPython
8
+ plotly
src/streamlit_app.py CHANGED
@@ -1,4 +1,5 @@
1
- import altair as alt
 
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
@@ -11,53 +12,58 @@ fs = HfFileSystem()
11
  from datetime import datetime, timedelta, time
12
  import re
13
 
14
- arquivos_processados_bb=[]
15
- arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb1t24.save')
16
- arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb2t24.save')
17
- arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb3t24.save')
18
- arquivos_processados_bb.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/BB/valores_shap-bb4t24.save')
19
-
20
- arquivos_processados_vale=[]
21
- arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale1t24.save')
22
- arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale2t24.save')
23
- arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale3t24.save')
24
- arquivos_processados_vale.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/VALE/valores_shap-vale4t24.save')
25
-
26
- arquivos_processados_petr=[]
27
- arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr1t24.save')
28
- arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr2t24.save')
29
- arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr3t24.save')
30
- arquivos_processados_petr.append('spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/Petrobras/valores_shap-petr4t24.save')
31
-
32
- if 'bb' not in st.session_state:
33
- st.session_state['bb'] = []
34
- for path in arquivos_processados_bb:
35
- with fs.open(path, 'rb') as inp:
36
- st.session_state['bb'].append(pickle.load(inp))
37
- inp.close()
38
-
39
- if 'petr' not in st.session_state:
40
- st.session_state['petr'] = []
41
- for path in arquivos_processados_petr:
42
- with fs.open(path, 'rb') as inp:
43
- st.session_state['petr'].append(pickle.load(inp))
44
- inp.close()
45
-
46
- if 'vale' not in st.session_state:
47
- st.session_state['vale'] = []
48
- for path in arquivos_processados_vale:
49
- with fs.open(path, 'rb') as inp:
50
- st.session_state['vale'].append(pickle.load(inp))
51
- inp.close()
52
 
53
- shap_values = {'bb':st.session_state['bb'], 'petr': st.session_state['petr'], 'vale':st.session_state['vale']}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
 
55
 
56
  st.header("Sentimento da fala e Valores de Shapley")
57
 
58
  col1, col2, col3 = st.columns(3)
59
 
60
- empresa_dict = {'petr':'Petrobrás', 'vale':'Vale', 'bb':'Banco do Brasil'}
61
  empresa = col1.selectbox(
62
  "Qual empresa quer analisar: ",
63
  ("vale", "bb", "petr"),
@@ -71,61 +77,52 @@ trim = col2.number_input(
71
 
72
  text_num = col3.number_input(
73
  "Fala número:",
74
- 0, max_value = len(shap_values[empresa][trim])-1
75
  )
76
 
77
- st.text("Incluir total de tokens")
78
- st.text("Incluir gráfico de sentimentos por frase")
79
-
80
- def join_text(lista):
81
- return np.array([''.join(x) for x in lista])
82
-
83
- def count_token(lista):
84
- return np.array([len(x) for x in lista])
85
-
86
- def calc_score(empresa, trimestre, text_num, lista):
87
- positive = lista[empresa][trimestre-1][text_num,:,'POSITIVE'].base_values + lista[empresa][trimestre-1][text_num,:,'POSITIVE'].values.sum()
88
- negative = lista[empresa][trimestre-1][text_num,:,'NEGATIVE'].base_values + lista[empresa][trimestre-1][text_num,:,'NEGATIVE'].values.sum()
89
- neutral = lista[empresa][trimestre-1][text_num,:,'NEUTRAL'].base_values + lista[empresa][trimestre-1][text_num,:,'NEUTRAL'].values.sum()
90
- return positive, negative, neutral
91
-
92
- def calc_allscore(lista):
93
- score = []
94
- for x in lista:
95
- positive = x[:,'POSITIVE'].base_values + x[:,'POSITIVE'].values.sum()
96
- negative = x[:,'NEGATIVE'].base_values + x[:,'NEGATIVE'].values.sum()
97
- neutral = x[:,'NEUTRAL'].base_values + x[:,'NEUTRAL'].values.sum()
98
- score.append([positive, negative, neutral])
99
- return score
100
-
101
  score_positive, score_negative, score_neutral = calc_score(empresa, trim, text_num, shap_values)
102
 
103
- scores = calc_allscore(shap_values[empresa][trim-1])
104
- texto_junto = join_text(shap_values[empresa][trim-1].data)
105
- token_qtde = count_token(shap_values[empresa][trim-1].data)
106
-
107
- df = np.stack((texto_junto, token_qtde), axis=-1)
108
- df = np.hstack((df, scores))
109
- df = pd.DataFrame(df, columns=['fala', 'qtde_tokens', 'positive_score', 'negative_score', 'neutral_score'])
110
- st.dataframe(df)
111
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  proc_time=timedelta(seconds=shap_values[empresa][trim-1].compute_time)
113
  h,m,s = re.split(':', str(proc_time))
114
- #s, ms=re.split('.', str(s))
115
  st.text(f"Compute time: {h}h {m}m {s:.2}s")
116
-
117
 
118
  option_map = {'POSITIVE':f'Positivo: {score_positive:.4}', 'NEGATIVE':f'Negativo: {score_negative:.4}', 'NEUTRAL':f'Neutro: {score_neutral:.4}'}
119
 
120
- selection = st.segmented_control(
121
- "Sentimento",
122
- options=option_map.keys(),
123
- default = 'POSITIVE',
 
124
  format_func=lambda option: option_map[option],
125
- selection_mode="single",
126
  )
127
 
128
- fig = shap.plots.text(shap_values[empresa][trim-1][text_num, :, selection], display = False)
129
 
130
  components.html(fig, height=200, scrolling = True)
131
  #st.text(np.shape(shap_values[empresa][0][:,:].data))
@@ -134,9 +131,10 @@ st.header("Gráfico waterfall dos termos e Valores de Shapley")
134
 
135
  max_display = st.slider(
136
  "Máximo de exibição:",
137
- 1, max_value = len(shap_values[empresa][trim-1][text_num].data), value=int(len(shap_values[empresa][trim-1][text_num].data)/3)+1
 
138
  )
139
 
140
  fig2, ax = plt.subplots()
141
- shap.plots.waterfall(shap_values[empresa][trim-1][text_num, :, selection], show=False, max_display=max_display)
142
  st.pyplot(fig2)
 
1
+ import plotly.graph_objects as go
2
+ import plotly.express as px
3
  import numpy as np
4
  import pandas as pd
5
  import streamlit as st
 
12
  from datetime import datetime, timedelta, time
13
  import re
14
 
15
def join_text(lista):
    """Concatenate the tokens of each entry into a single string.

    Args:
        lista: Iterable whose items are iterables of string tokens.

    Returns:
        NumPy array holding one joined string per item of ``lista``.
    """
    juntar = ''.join
    return np.array(list(map(juntar, lista)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
def count_token(lista):
    """Count how many tokens each entry holds.

    Args:
        lista: Iterable whose items are sized collections of tokens.

    Returns:
        NumPy array with the length of every item of ``lista``.
    """
    tamanhos = list(map(len, lista))
    return np.array(tamanhos)
20
+
21
def calc_score(empresa, trimestre, text_num, lista):
    """Return the positive, negative and neutral scores of one text.

    Each score is the label's base value plus the sum of the values of every
    token of the selected text.

    Args:
        empresa: Key of the company inside ``lista``.
        trimestre: 1-based quarter number; entry ``trimestre - 1`` is used.
        text_num: Index of the text inside the selected explanation.
        lista: Mapping from company key to a sequence of explanation objects
            that can be sliced as ``obj[text_num, :, label]``.

    Returns:
        Tuple ``(positive, negative, neutral)``.
    """
    explicacao = lista[empresa][trimestre - 1]

    def _total(rotulo):
        # Slice once per label and reuse it for both terms of the sum.
        fatia = explicacao[text_num, :, rotulo]
        return fatia.base_values + fatia.values.sum()

    return _total('POSITIVE'), _total('NEGATIVE'), _total('NEUTRAL')
26
+
27
def calc_allscore(lista):
    """Compute the total score of every text for each sentiment label.

    For each label, the score of a text is that label's base value plus the
    sum of the values of all tokens of the text.

    Args:
        lista: Explanation-like object exposing ``base_values`` (one row per
            text, three columns) and supporting ``lista[:, :, label].values``
            for the labels ``'NEUTRAL'``, ``'POSITIVE'`` and ``'NEGATIVE'``.

    Returns:
        DataFrame with one row per text and the columns ``NEUTRAL``,
        ``POSITIVE`` and ``NEGATIVE``, in that order.
    """
    # NOTE(review): the three base_values columns are assumed to be ordered
    # POSITIVE, NEGATIVE, NEUTRAL (kept from the previous implementation) --
    # confirm against the label order of the model that produced the values.
    base_values = pd.DataFrame(lista.base_values, columns=['POSITIVE', 'NEGATIVE', 'NEUTRAL'])

    scores = pd.DataFrame(index=base_values.index)
    for label in ('NEUTRAL', 'POSITIVE', 'NEGATIVE'):
        per_text = lista[:, :, label].values
        # Summing row by row works for ragged object arrays (texts with
        # different token counts) and for regular 2-D arrays (all texts the
        # same length, or a single text), which a one-column DataFrame
        # constructor cannot hold.
        token_sums = np.array([np.sum(row) for row in per_text], dtype=float)
        scores[label] = token_sums + base_values[label].to_numpy()
    return scores
40
+
41
# Folder (inside the Space repository) that stores each company's files.
pasta = {'vale': 'VALE', 'petr': 'Petrobras', 'bb': 'BB'}

for key, val in pasta.items():
    # Streamlit reruns the whole script on every interaction, so the files are
    # loaded only once per session and kept in st.session_state afterwards.
    if key in st.session_state and f'df_{key}' in st.session_state:
        continue

    # Collect into local lists and publish them only after all four quarters
    # loaded: a failure midway must not leave partial lists in the session,
    # because the membership check above would then never retry the load.
    explicacoes = []
    tabelas = []
    for i in range(1, 5):
        arquivo = f'spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/{val}/valores_shap-{key}{i}t24.save'
        # NOTE(security): pickle.load runs arbitrary code embedded in the file;
        # this is only acceptable while the repository contents are trusted.
        with fs.open(arquivo, 'rb') as inp:
            explicacao = pickle.load(inp)

        scores = calc_allscore(explicacao)
        # Build the table column by column so each column keeps its own dtype;
        # stacking text and numbers in one NumPy array turns all of them into
        # strings.
        tabelas.append(pd.DataFrame({
            'fala': join_text(explicacao.data),
            'qtde_tokens': count_token(explicacao.data),
            'neutral_score': scores['NEUTRAL'].to_numpy(),
            'positive_score': scores['POSITIVE'].to_numpy(),
            'negative_score': scores['NEGATIVE'].to_numpy(),
        }))
        explicacoes.append(explicacao)

    st.session_state[key] = explicacoes
    st.session_state[f'df_{key}'] = tabelas

shap_values = {'bb': st.session_state['bb'], 'petr': st.session_state['petr'], 'vale': st.session_state['vale']}
61
 
62
  st.header("Sentimento da fala e Valores de Shapley")
63
 
64
  col1, col2, col3 = st.columns(3)
65
 
66
+ empresa_dict = {'petr':'Petrobras', 'vale':'Vale', 'bb':'Banco do Brasil'}
67
  empresa = col1.selectbox(
68
  "Qual empresa quer analisar: ",
69
  ("vale", "bb", "petr"),
 
77
 
78
  text_num = col3.number_input(
79
  "Fala número:",
80
+ 0, max_value = len(shap_values[empresa][trim-1])-1
81
  )
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  score_positive, score_negative, score_neutral = calc_score(empresa, trim, text_num, shap_values)
84
 
85
# Table of the selected company/quarter. The cached table may carry its numeric
# columns as text, so work on a copy cast to real numbers; the cached frame in
# the session is left untouched.
df = st.session_state[f'df_{empresa}'][trim-1].astype({
    'qtde_tokens': 'int64',
    'positive_score': 'float64',
    'negative_score': 'float64',
    'neutral_score': 'float64',
})

# Stacked bars: one bar per speech, one segment per sentiment.
fig3 = go.Figure(go.Bar(x=df.index, y=df['negative_score'], name='Negativo'))
fig3.add_trace(go.Bar(x=df.index, y=df['neutral_score'], name='Neutro'))
fig3.add_trace(go.Bar(x=df.index, y=df['positive_score'], name='Positivo'))
fig3.update_layout(barmode='stack', title='Score por fala')

tab1, tab2 = st.tabs(["Data Frame", "Gráfico"])
with tab1:
    st.dataframe(
        df,
        column_config={
            'fala': 'Fala',
            'qtde_tokens': st.column_config.NumberColumn("Quantidade Tokens", format='%d'),
            'positive_score': st.column_config.NumberColumn("Score Positivo"),
            'negative_score': st.column_config.NumberColumn("Score Negativo"),
            'neutral_score': st.column_config.NumberColumn("Score Neutro"),
        },
    )
with tab2:
    st.plotly_chart(fig3)

total_tokens = df['qtde_tokens'].sum()
proc_time = timedelta(seconds=shap_values[empresa][trim-1].compute_time)
# str(timedelta) looks like "H:MM:SS[.ffffff]"; the pieces stay strings, so the
# ".2" format below keeps only the first two characters of the seconds field.
h, m, s = re.split(':', str(proc_time))
st.text(f"Total tokens: {total_tokens}")
st.text(f"Compute time: {h}h {m}m {s:.2}s")

# Radio labels show each sentiment together with its score for the selected speech.
option_map = {'POSITIVE':f'Positivo: {score_positive:.4}', 'NEGATIVE':f'Negativo: {score_negative:.4}', 'NEUTRAL':f'Neutro: {score_neutral:.4}'}

sentimento = st.radio(
    "**Sentimento**",
    option_map.keys(),
    horizontal=True,
    format_func=lambda option: option_map[option],
)

# display=False makes shap return the rendering instead of displaying it, so it
# can be embedded through the HTML component.
fig = shap.plots.text(shap_values[empresa][trim-1][text_num, :, sentimento], display=False)

components.html(fig, height=200, scrolling=True)
128
  #st.text(np.shape(shap_values[empresa][0][:,:].data))
 
131
 
132
# Token count of the selected speech; it bounds how many terms can be shown.
n_tokens = int(df['qtde_tokens'][text_num])
max_display = st.slider(
    "Máximo de exibição:",
    1, max_value=n_tokens,
    # Start at roughly one third of the tokens.
    value=n_tokens // 3 + 1,
)

fig2, ax = plt.subplots()
# show=False keeps the plot on the current figure (fig2) instead of displaying it.
shap.plots.waterfall(shap_values[empresa][trim-1][text_num, :, sentimento], show=False, max_display=max_display)
st.pyplot(fig2)
# The script reruns on every interaction; close the figure so Matplotlib does
# not keep one more open figure per rerun.
plt.close(fig2)