Spaces:

marcossuzuki
/

TCC_PoliUSPPro

Sleeping

App Files Files Community

marcossuzuki committed on Jun 18, 2025

Commit

acf89de

1 Parent(s): ae315c9

Using oop

Browse files

Files changed (3) hide show

src/datashap/DataSHAP.py +148 -0
src/datashap/__init__.py +0 -0
src/streamlit_app.py +28 -137

src/datashap/DataSHAP.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import pickle
+from datetime import timedelta
+import re
+import plotly.express as px
+from plotly.subplots import make_subplots
+import numpy as np
+from pandas import DataFrame
+import shap
+import matplotlib.pyplot as plt
+from huggingface_hub import HfFileSystem
+import warnings
+warnings.simplefilter("ignore", category=DeprecationWarning)
+class DataSHAP:
+    """SHAP explanation of one transcript plus the tables and figures derived from it.
+
+    The constructor loads a pickled SHAP object through ``HfFileSystem`` and
+    eagerly computes everything the instance exposes:
+
+    * ``shap_value`` -- the unpickled object.
+    * ``df`` -- one row per entry of ``shap_value.data`` (joined text, token
+      count, per-class score, winning ``tag`` and its ``score``).
+    * ``total_tokens``, ``h``, ``m``, ``s`` -- token total and compute time.
+    * ``statistic`` -- ``describe()`` of the winning score, one column per class.
+    * ``plot`` -- Plotly figure built by ``three_plot``.
+    * ``plot_bar``, ``axis``, ``rank`` -- output of ``plot_rank``.
+    """
+
+    # Default class-key -> display-label mapping used by ``plot_rank``.
+    # Kept at class level so the method needs no mutable default argument.
+    _TAG_LABELS = {'NEUTRAL': 'Neutro',
+                   'POSITIVE': 'Positivo',
+                   'NEGATIVE': 'Negativo'}
+
+    def __init__(self, file_addr, company, trim):
+        """Load ``file_addr`` and build every derived table and figure.
+
+        Args:
+            file_addr: path understood by ``HfFileSystem.open``.
+            company: company name interpolated into the figure title.
+            trim: quarter number interpolated into the figure title.
+        """
+        self.file_addr = file_addr
+        self.shap_value = self.load_file(self.file_addr)
+        self.df = self.mount_df(self.shap_value)
+        self.total_tokens, self.h, self.m, self.s = self.calc_performance(
+            self.shap_value.compute_time, self.df['qty_tokens'])
+        self.statistic = self.get_statistic(self.df)
+        # three_plot() reads self.company and self.trim for its title, so both
+        # must be assigned before the figure is built.
+        self.trim = trim
+        self.company = company
+        self.plot = self.three_plot(self.df)
+        self.plot_bar, self.axis, self.rank = self.plot_rank()
+
+    def load_file(self, file_addr):
+        """Return the object unpickled from ``file_addr`` on the Hugging Face file system."""
+        fs = HfFileSystem()
+        # NOTE(security): pickle.load can execute arbitrary code while loading.
+        # Only point this at files whose origin is trusted.
+        with fs.open(file_addr, 'rb') as inp:
+            # The context manager closes the handle; no explicit close() needed.
+            return pickle.load(inp)
+
+    def join_text(self, list):
+        """Concatenate the pieces of each entry into one string; returns a NumPy array."""
+        return np.array([''.join(x) for x in list])
+
+    def count_token(self, list):
+        """Return a NumPy array holding ``len()`` of each entry."""
+        return np.array([len(x) for x in list])
+
+    def calc_allscore(self, shap_value):
+        """Return per-row class scores: summed SHAP values plus the class base value.
+
+        The result has the columns ``NEUTRAL``, ``POSITIVE``, ``NEGATIVE`` in
+        that order; ``mount_df`` relies on this order when it names them.
+        """
+        # NOTE(review): this column order is assumed to match the output order
+        # of the explained model -- confirm against the model's label mapping.
+        base_values = DataFrame(shap_value.base_values, columns=['POSITIVE', 'NEGATIVE', 'NEUTRAL'])
+        df = DataFrame()
+        for label in ('NEUTRAL', 'POSITIVE', 'NEGATIVE'):
+            contributions = DataFrame(shap_value[:, :, label].values, columns=[label])[label]
+            df[label] = contributions.apply(lambda x: x.sum()) + base_values[label]
+        return df
+
+    def mount_df(self, shap_value):
+        """Assemble the per-row table: text, token count, class scores, tag and score."""
+        scores = self.calc_allscore(shap_value)
+        text = self.join_text(shap_value.data)
+        token_qty = self.count_token(shap_value.data)
+        # Text and numbers are stacked into one array here; the numeric columns
+        # are cast back explicitly later (scores in df_idxmax_score, token
+        # counts in calc_performance).
+        df = np.stack((text, token_qty), axis=-1)
+        df = np.hstack((df, scores))
+        df = DataFrame(df, columns=['speech', 'qty_tokens',
+                                    'neutral_score', 'positive_score', 'negative_score'])
+        title_score = ['positive_score', 'negative_score', 'neutral_score']
+        df = self.df_idxmax_score(df, title_score)
+        return df
+
+    def df_idxmax_score(self, df, title_score):
+        """Add ``tag`` (class with the highest score) and ``score`` (that maximum) to ``df``.
+
+        ``df`` is modified in place and also returned. The ``title_score``
+        columns are cast to float first.
+        """
+        df[title_score] = df[title_score].astype('float')
+        df['tag'] = df[title_score].idxmax(axis="columns")
+        df['score'] = df[title_score].max(axis='columns')
+        df['tag'] = df['tag'].replace({'positive_score': 'POSITIVE', 'negative_score': 'NEGATIVE', 'neutral_score': 'NEUTRAL'})
+        return df
+
+    def calc_performance(self, time_s, df: DataFrame):
+        """Return ``(total_tokens, h, m, s)`` for a run of ``time_s`` seconds.
+
+        ``df`` is the token-count column; it is cast to ``int64`` and summed.
+        ``h``, ``m`` and ``s`` are strings obtained by splitting
+        ``str(timedelta(seconds=time_s))`` on ``':'``, so for durations of one
+        day or more ``h`` also carries the ``"N day(s), "`` prefix.
+        """
+        total_tokens = df.astype('int64').sum()
+        proc_time = timedelta(seconds=time_s)
+        h, m, s = str(proc_time).split(':')
+        return total_tokens, h, m, s
+
+    def get_statistic(self, df) -> DataFrame:
+        """Return ``describe()`` of the winning score, one column per class."""
+        statistic = DataFrame()
+        for column, label in (('positive_score', 'POSITIVE'),
+                              ('negative_score', 'NEGATIVE'),
+                              ('neutral_score', 'NEUTRAL')):
+            statistic[column] = df[df['tag'] == label]['score'].describe()
+        return statistic
+
+    def get_performance(self):
+        """Return the ``(total_tokens, h, m, s)`` tuple computed by the constructor."""
+        return self.total_tokens, self.h, self.m, self.s
+
+    def three_plot(self, df):
+        """Build the combined histogram / scatter / box figure for ``df``.
+
+        Layout: histogram in row 1 col 1, scatter in row 2 col 1, box plot in
+        row 2 col 2. Works for any number of distinct tags instead of assuming
+        exactly three traces per sub-figure.
+        """
+        fig = make_subplots(rows=2, cols=2, horizontal_spacing = 0.0, vertical_spacing = 0.05,
+                            shared_xaxes=True, shared_yaxes=True,
+                            row_heights=[0.4, 0.6], column_widths=[0.8, 0.2])
+        fig_scatter = px.scatter(df, x=df.index, y=['score'], color="tag",)
+        fig_histogram = px.histogram(df, x=df.index, color='tag', nbins=20,)
+        fig_box = px.box(df, x='tag', y="score", color='tag',)
+        # Recolour the second trace of each sub-figure, when there is one.
+        second_trace_markers = (
+            (fig_scatter, {'color': '#000007'}),
+            (fig_histogram, {'color': '#000007', 'pattern': {'shape': ''}}),
+            (fig_box, {'color': '#000007'}),
+        )
+        for sub_fig, marker in second_trace_markers:
+            if len(sub_fig.data) > 1:
+                sub_fig.data[1]['marker'] = marker
+        for scatter_trace, histogram_trace, box_trace in zip(
+                fig_scatter.data, fig_histogram.data, fig_box.data):
+            # Only the scatter traces keep their legend entries.
+            histogram_trace['showlegend'] = False
+            box_trace['showlegend'] = False
+            fig.add_trace(scatter_trace, row=2, col=1)
+            fig.add_trace(histogram_trace, row=1, col=1)
+            fig.add_trace(box_trace, row=2, col=2,)
+        fig.update_layout(barmode='overlay', title=f'''Estatísticas: {self.company}<br>Trimestre {self.trim} de 2024''',
+                        xaxis3_rangeslider=dict(visible=True, bgcolor="#636EFA", thickness=0.03),
+                        legend=dict(orientation="h", yanchor="top",
+                                    y=1.3, xanchor="center", x=0.5),
+                        scene = dict(yaxis = dict(title=''),))
+        fig.update_xaxes(showticklabels=False, showgrid=True, row=1, col=1)
+        fig.update_xaxes(title_text='#Fala', showgrid=True, row=2, col=1)
+        fig.update_yaxes(title_text='Score', row=2, col=1)
+        fig.update_yaxes(title_text='Frequência', row=1, col=1)
+        fig.update_traces(marker={"opacity": 0.7})
+        return fig
+
+    def plot_rank(self, tag=None, max_display=11):
+        """Draw one SHAP bar plot per class and collect the tick labels in a table.
+
+        Args:
+            tag: mapping of class key (used to index ``shap_value``) to the
+                column label used in ``rank``. ``None`` selects the built-in
+                NEUTRAL / POSITIVE / NEGATIVE mapping.
+            max_display: forwarded to ``shap.plots.bar``.
+
+        Returns:
+            ``(plot_bar, axis, rank)``: figures and axes keyed by class key,
+            and a DataFrame with one column per label.
+        """
+        if tag is None:
+            tag = self._TAG_LABELS
+        plot_bar = dict()
+        axis = dict()
+        rank = DataFrame()
+        for key, val in tag.items():
+            plot, ax = plt.subplots()
+            shap.plots.bar(self.shap_value[:,:,key], show=False, max_display=max_display,)
+            plot_bar[key] = plot
+            axis[key] = ax
+            # Keeps the y tick labels except the last ``max_display + 1`` ones.
+            # NOTE(review): which labels that leaves depends on how shap lays
+            # out the axis -- confirm the slice is the intended one.
+            rank[val] = DataFrame(ax.get_yticklabels()[:-max_display-1]).astype(str)
+            # Close this figure explicitly so pyplot does not keep accumulating
+            # open figures; the Figure object itself stays in ``plot_bar``.
+            plt.close(plot)
+        labels = list(tag.values())
+        # Strips pieces of the stringified tick labels; presumably what is left
+        # is the bare term -- TODO confirm against real output.
+        rank[labels] = rank[labels].replace(r"(([T])\w+|(\d+,)|(\d+.\d+,)|(['\(\)\s]))", '', regex=True)
+        return plot_bar, axis, rank
+
+    def get_plot_rank(self, max_display=11):
+        """Recompute the rank plots with ``max_display``, store them, and return them."""
+        self.plot_bar, self.axis, self.rank = self.plot_rank(max_display=max_display)
+        return self.plot_bar, self.axis, self.rank

src/datashap/__init__.py ADDED Viewed

File without changes

src/streamlit_app.py CHANGED Viewed

@@ -1,133 +1,34 @@
-import pickle
-from datetime import timedelta
-import re
-import plotly.express as px
-from plotly.subplots import make_subplots
-import numpy as np
-import pandas as pd
 import streamlit as st
 import streamlit.components.v1 as components
 import shap
 import matplotlib.pyplot as plt
-from huggingface_hub import HfFileSystem
 import warnings
 warnings.simplefilter("ignore", category=DeprecationWarning)
-fs = HfFileSystem()
-def join_text(lista):
-    return np.array([''.join(x) for x in lista])
-def count_token(lista):
-    return np.array([len(x) for x in lista])
-def calc_allscore(lista):
-    base_values = pd.DataFrame(lista.base_values, columns=['POSITIVE', 'NEGATIVE', 'NEUTRAL'])
-    a = pd.DataFrame()
-    a['NEUTRAL'] = pd.DataFrame(lista[:,:,'NEUTRAL'].values, columns=['NEUTRAL'])['NEUTRAL'].apply(lambda x: x.sum()) + base_values['NEUTRAL']
-    a['POSITIVE'] = pd.DataFrame(lista[:,:,'POSITIVE'].values, columns=['POSITIVE'])['POSITIVE'].apply(lambda x: x.sum()) + base_values['POSITIVE']
-    a['NEGATIVE'] = pd.DataFrame(lista[:,:,'NEGATIVE'].values, columns=['NEGATIVE'])['NEGATIVE'].apply(lambda x: x.sum()) + base_values['NEGATIVE']
-    return a
 def select_fala():
     if st.session_state.df_falas['selection']['rows']:
         st.session_state.num_fala = st.session_state.df_falas['selection']['rows'][0]
         num = st.session_state.num_fala
-        df=st.session_state[f'df_{st.session_state.empresa}'][st.session_state.trimestre-1]
-        rotulo = df.iloc[num]['rotulo']
         option_map = {'Neutro':'NEUTRAL', 'Positivo':'POSITIVE', 'Negativo':'NEGATIVE',}
         st.session_state.rotulo = option_map[rotulo]
 @st.cache_resource
-def plot_rank(empresa, trim, rotulo, max_display):
-    plot = dict()
-    axis = dict()
-    rank = pd.DataFrame()
-    for key, val in rotulo.items():
-        plot_bar, ax = plt.subplots()
-        shap.plots.bar(shap_values[empresa][trim][:,:,key], show=False, max_display=max_display,)
-        plot[key] = plot_bar
-        axis[key] = ax
-        rank[val] = pd.DataFrame(ax.get_yticklabels()[:-max_display-1]).astype(str)
-    rank[list(rotulo.values())] = rank[list(rotulo.values())].replace(r"(([T])\w+|(\d+,)|(\d+.\d+,)|(['\(\)\s]))", '', regex=True)
-    return plot, axis, rank
-@st.cache_data
-def load_file(arquivo):
-    shap_value = 0
-    with fs.open(arquivo, 'rb') as inp:
-        shap_value = pickle.load(inp)
-        inp.close()
     return shap_value
-@st.cache_data
-def mount_df(_shap_value, val, i):
-    scores      = calc_allscore(_shap_value)
-    texto_junto = join_text(_shap_value.data)
-    token_qtde  = count_token(_shap_value.data)
-    df = np.stack((texto_junto, token_qtde), axis=-1)
-    df = np.hstack((df, scores))
-    df = pd.DataFrame(df, columns=['fala', 'qtde_tokens',
-                                    'neutral_score', 'positive_score', 'negative_score'])
-    return df
-@st.cache_data
-def calc_performance(time_s, df):
-    total_tokens = df.astype('int64').sum()
-    proc_time=timedelta(seconds=time_s)
-    h,m,s = re.split(':', str(proc_time))
-    return total_tokens, h, m, s
-@st.cache_data
-def get_statistic(df):
-    estatistica = pd.DataFrame()
-    estatistica['Positivo'] = df[df['rotulo']=='Positivo']['score'].describe()
-    estatistica['Negativo'] = df[df['rotulo']=='Negativo']['score'].describe()
-    estatistica['Neutro'] = df[df['rotulo']=='Neutro']['score'].describe()
-    return estatistica
-@st.cache_resource
-def three_plot(df):
-    fig = make_subplots(rows=2, cols=2, horizontal_spacing = 0.0, vertical_spacing = 0.05,
-                        shared_xaxes=True, shared_yaxes=True,
-                        row_heights=[0.4, 0.6], column_widths=[0.8, 0.2])
-    fig_scatter = px.scatter(df, x=df.index, y=['score'], color="rotulo",)
-    fig_histogram = px.histogram(df, x=df.index, color='rotulo', nbins=20,)
-    fig_box = px.box(df, x='rotulo', y="score", color='rotulo',)
-    fig_scatter.data[1]['marker']={'color': '#000007'}
-    fig_histogram.data[1]['marker']={'color': '#000007', 'pattern': {'shape': ''}}
-    fig_box.data[1]['marker']={'color': '#000007'}
-    for x in range(3):
-        fig_histogram.data[x]['showlegend']=False
-        fig_box.data[x]['showlegend']=False
-        fig.add_trace(fig_scatter.data[x], row=2, col=1)
-        fig.add_trace(fig_histogram.data[x], row=1, col=1)
-        fig.add_trace(fig_box.data[x], row=2, col=2,)
-    fig.update_layout(barmode='overlay', title=f'''Estatísticas: {empresa_dict[empresa]}<br>Trimestre {trim} de 2024''',
-                    xaxis3_rangeslider=dict(visible=True, bgcolor="#636EFA", thickness=0.03),
-                    legend=dict(orientation="h", yanchor="top",
-                                y=1.3, xanchor="center", x=0.5),
-                    scene = dict(yaxis = dict(title=''),))
-    fig.update_xaxes(showticklabels=False, showgrid=True, row=1, col=1)
-    fig.update_xaxes(title_text='#Fala', showgrid=True, row=2, col=1)
-    fig.update_yaxes(title_text='Score', row=2, col=1)
-    fig.update_yaxes(title_text='Frequência', row=1, col=1)
-    fig.update_traces(marker={"opacity": 0.7})
-    return fig
-@st.cache_data
-def df_idxmax_score(df, empresa, trim, title_score):
-    df[title_score] = df[title_score].astype('float')
-    df['rotulo'] = df[title_score].idxmax(axis="columns")
-    df['score'] = df[title_score].max(axis='columns')
-    df['rotulo'] = df['rotulo'].replace({'positive_score': 'Positivo', 'negative_score': 'Negativo', 'neutral_score': 'Neutro'})
-    return df
 st.set_page_config(page_title="TCCPoliUSPPro", )
@@ -138,16 +39,7 @@ shap_values = {}
 title_score = ['positive_score', 'negative_score', 'neutral_score']
 for key, val in pasta.items():
-    if key not in st.session_state:
-        st.session_state[key] = []
-        st.session_state[f'df_{key}']=[]
-        for i in range(1,5):
-            arquivo=f'spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/{val}/valores_shap-{key}{i}t24.save'
-            shap_value  = load_file(arquivo)
-            df = mount_df(shap_value, val, i)
-            df = df_idxmax_score(df, val, i, title_score)
-            st.session_state[key].append(shap_value)
-            st.session_state[f'df_{key}'].append(df)
     shap_values[key] = st.session_state[key]
 st.header("Sentimento da fala e Scores")
@@ -165,12 +57,12 @@ trim = col2.number_input("**Trimestre de 2024:**", 1, max_value = 4, key='trimes
 text_num = col3.number_input(
     "**Fala número:**",
-    0, max_value = len(shap_values[empresa][trim-1])-1,
     key='num_fala',)
-df=st.session_state[f'df_{empresa}'][trim-1]
-total_tokens, h, m, s = calc_performance(shap_values[empresa][trim-1].compute_time, df['qtde_tokens'])
 col4.write(f"**Total tokens:** {total_tokens} \
         \n**Compute time:** {h}h {m}m {s:.2}s")
@@ -182,22 +74,20 @@ with tab1:
                 selection_mode = 'single-row',
                 key='df_falas',
                 on_select=select_fala,
-                column_config={'fala':st.column_config.Column('Fala', width=100),
-                                    'qtde_tokens':st.column_config.NumberColumn("Qtde. Tokens", format='%d'),
                                     'positive_score':st.column_config.NumberColumn("Score Positivo",),
                                     'negative_score':st.column_config.NumberColumn("Score Negativo",),
                                     'neutral_score':st.column_config.NumberColumn("Score Neutro",),
-                                    'rotulo':"Rótulo",
                                     },
                 height=200,)
 with tab2:
-    estatistica = get_statistic(df)
-    st.dataframe(estatistica, )
 with tab3:
-    fig = three_plot(df)
-    st.plotly_chart(fig)
 score_positive, score_negative, score_neutral = df.loc[text_num, title_score]
@@ -211,7 +101,7 @@ rotulo = st.radio(
     key='rotulo'
 )
-plot_text = shap.plots.text(shap_values[empresa][trim-1][text_num, :, rotulo], display = False)
 components.html(plot_text, height = 180, scrolling = True)
 st.header("Gráfico waterfall dos termos e Valores de Shapley")
@@ -219,18 +109,19 @@ st.header("Gráfico waterfall dos termos e Valores de Shapley")
 with st.expander("Expand"):
     max_display = st.slider(
         "**Máximo de exibição:**",
-        1, max_value = int(df['qtde_tokens'][text_num]),
-        value=int(int(df['qtde_tokens'][text_num])/3)+1
     )
     plot_waterfall, ax = plt.subplots()
-    shap.plots.waterfall(shap_values[empresa][trim-1][text_num, :, rotulo], show=False, max_display=max_display)
     st.pyplot(plot_waterfall)
 st.header('Rank de termos do documento em Gráfico Barra')
 with st.expander("Expand"):
-    plot_bar, ax, rank = plot_rank(empresa, trim-1, option_map, 11)
     for key, val in option_map.items():
         st.subheader(val)
         st.pyplot(plot_bar[key])

 import streamlit as st
 import streamlit.components.v1 as components
 import shap
+from datashap import DataSHAP as ds
 import matplotlib.pyplot as plt
 import warnings
 warnings.simplefilter("ignore", category=DeprecationWarning)
 def select_fala():
+    """Selection callback: copy the selected table row into session state.
+
+    When the ``df_falas`` selection holds at least one row, the first selected
+    row index is stored in ``st.session_state.num_fala`` and
+    ``st.session_state.rotulo`` is set to the class key mapped from that row's
+    ``tag``. Nothing happens when the selection is empty.
+    """
     if st.session_state.df_falas['selection']['rows']:
         st.session_state.num_fala = st.session_state.df_falas['selection']['rows'][0]
         num = st.session_state.num_fala
+        # Table of the currently chosen company and (1-based) quarter.
+        df=st.session_state[st.session_state.empresa][st.session_state.trimestre-1].df
+        rotulo = df.iloc[num]['tag']
+        # The lookup expects ``tag`` to hold one of these Portuguese labels;
+        # any other value raises KeyError.
         option_map = {'Neutro':'NEUTRAL', 'Positivo':'POSITIVE', 'Negativo':'NEGATIVE',}
         st.session_state.rotulo = option_map[rotulo]
 @st.cache_resource
+def get_dataSHAP(file, company, trim):
+    """Build a ``DataSHAP`` for ``file`` and relabel its ``tag`` column in Portuguese.
+
+    The three arguments are forwarded unchanged to ``ds.DataSHAP``. The result
+    is wrapped by ``st.cache_resource``.
+    """
+    shap_value=ds.DataSHAP(file, company, trim)
+    # The relabelling happens after construction, so whatever the constructor
+    # already derived from the original tags is not updated by this line.
+    shap_value.df['tag'] = shap_value.df['tag'].replace({'POSITIVE':'Positivo', 'NEGATIVE':'Negativo', 'NEUTRAL':'Neutro'})
     return shap_value
+def init_session(key, val):
+    """Fill ``st.session_state[key]`` with four ``DataSHAP`` objects, once per session.
+
+    Does nothing when ``key`` is already in the session state. Otherwise one
+    object is loaded for each ``i`` in 1..4 and appended in that order.
+    ``val`` is the folder segment of the file path and ``key`` is part of the
+    file name. Reads the module-level ``empresa_dict`` to obtain the company
+    name passed to ``get_dataSHAP``.
+    """
+    if key not in st.session_state:
+        st.session_state[key] = []
+        for i in range(1,5):
+            arquivo=f'spaces/marcossuzuki/TCC_PoliUSPPro/transcrição audio RI/{val}/valores_shap-{key}{i}t24.save'
+            shap_value = get_dataSHAP(arquivo, empresa_dict[key], i)
+            st.session_state[key].append(shap_value)
 st.set_page_config(page_title="TCCPoliUSPPro", )
 title_score = ['positive_score', 'negative_score', 'neutral_score']
 for key, val in pasta.items():
+    init_session(key, val)
     shap_values[key] = st.session_state[key]
 st.header("Sentimento da fala e Scores")
 text_num = col3.number_input(
     "**Fala número:**",
+    0, max_value = len(shap_values[empresa][trim-1].shap_value)-1,
     key='num_fala',)
+df=shap_values[empresa][trim-1].df
+total_tokens, h, m, s = shap_values[empresa][trim-1].get_performance()
 col4.write(f"**Total tokens:** {total_tokens} \
         \n**Compute time:** {h}h {m}m {s:.2}s")
                 selection_mode = 'single-row',
                 key='df_falas',
                 on_select=select_fala,
+                column_config={'speech':st.column_config.Column('Fala', width=100),
+                                    'qty_tokens':st.column_config.NumberColumn("Qtde. Tokens", format='%d'),
                                     'positive_score':st.column_config.NumberColumn("Score Positivo",),
                                     'negative_score':st.column_config.NumberColumn("Score Negativo",),
                                     'neutral_score':st.column_config.NumberColumn("Score Neutro",),
+                                    'tag':"Rótulo",
                                     },
                 height=200,)
 with tab2:
+    st.dataframe(shap_values[empresa][trim-1].statistic, )
 with tab3:
+    st.plotly_chart(shap_values[empresa][trim-1].plot)
 score_positive, score_negative, score_neutral = df.loc[text_num, title_score]
     key='rotulo'
 )
+plot_text = shap.plots.text(shap_values[empresa][trim-1].shap_value[text_num, :, rotulo], display = False)
 components.html(plot_text, height = 180, scrolling = True)
 st.header("Gráfico waterfall dos termos e Valores de Shapley")
 with st.expander("Expand"):
     max_display = st.slider(
         "**Máximo de exibição:**",
+        1, max_value = int(df['qty_tokens'][text_num]),
+        value=int(int(df['qty_tokens'][text_num])/3)+1
     )
     plot_waterfall, ax = plt.subplots()
+    shap.plots.waterfall(shap_values[empresa][trim-1].shap_value[text_num, :, rotulo], show=False, max_display=max_display)
     st.pyplot(plot_waterfall)
+    plt.close()
 st.header('Rank de termos do documento em Gráfico Barra')
 with st.expander("Expand"):
+    plot_bar, ax, rank = shap_values[empresa][trim-1].get_plot_rank()
     for key, val in option_map.items():
         st.subheader(val)
         st.pyplot(plot_bar[key])