Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from PIL import Image | |
| import os | |
| import ast | |
| import contextlib | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from wordcloud import WordCloud | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from gensim import corpora | |
| import networkx as nx | |
| from sklearn.manifold import TSNE | |
| from gensim.models import KeyedVectors | |
| from translate_app import tr | |
# Page metadata used by the multi-page Streamlit app framework.
title = "Data Vizualization"
sidebar_name = "Data Vizualization"

# Data directory path, shared across pages via Streamlit session state.
dataPath = st.session_state.DataPath

# Silence the NLTK downloader's console output.
with contextlib.redirect_stdout(open(os.devnull, "w")):
    nltk.download('stopwords')

# First line (of the corpus) to load
first_line = 0
# Maximum number of lines to load; the corpus holds 137 860 lines total.
max_lines = 140000
# Clamp so the requested slice never runs past the end of the corpus.
if ((first_line+max_lines)>137860):
    max_lines = max(137860-first_line ,0)
# Maximum number of rows to display for DataFrames
max_lines_to_display = 50
def load_data(path, start=None, count=None):
    """Read a UTF-8 text file, lower-case it, and return a slice of its lines.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    start : int, optional
        Index of the first line to keep. Defaults to the module-level
        ``first_line`` (kept for backward compatibility with existing callers).
    count : int, optional
        Maximum number of lines to keep. Defaults to the module-level
        ``max_lines``.

    Returns
    -------
    list[str]
        At most ``count`` lower-cased lines starting at ``start``.
    """
    # Fall back to the module-level slice bounds when not given explicitly.
    if start is None:
        start = first_line
    if count is None:
        count = max_lines
    with open(path, "r", encoding="utf-8") as f:
        # Lower-case the whole file before splitting it into lines.
        data = f.read().lower().split('\n')
    return data[start:min(len(data), start + count)]
def load_preprocessed_data(path, data_type):
    """Load one preprocessed artefact from disk.

    ``data_type`` selects the on-disk format:

    * ``1`` -- CSV with an index column, returned as a ``pandas.DataFrame``;
    * ``0`` -- plain text, returned as a list of lines (the trailing empty
      line produced by the final ``'\\n'`` is dropped);
    * ``2`` or ``3`` -- one Python literal per line (e.g. a token list),
      parsed and returned as a list of objects;
    * any other value -- raw list of lines, trailing empty line included.

    Returns
    -------
    pandas.DataFrame | list
    """
    if data_type == 1:
        return pd.read_csv(path, encoding="utf-8", index_col=0)
    with open(path, "r", encoding="utf-8") as f:
        data = f.read().split('\n')
    if data_type == 0:
        data = data[:-1]
    elif data_type in (2, 3):
        # ast.literal_eval replaces the original eval() used for data_type 2:
        # identical result on literal lines (lists, numbers, strings...) but
        # it cannot execute arbitrary code from the file. This also unifies
        # the previously duplicated branches for types 2 and 3.
        data = [ast.literal_eval(line) for line in data[:-1]]
    return data
def load_all_preprocessed_data(lang):
    # Load every preprocessed artefact for one language code ('en' or 'fr'):
    # raw text, corpus, tokenised text, the word-count DataFrame (stored on
    # disk in two halves and concatenated here), per-sentence lengths, and
    # the aligned fastText word vectors (MUSE mini-dictionary embeddings).
    txt =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
    corpus =load_preprocessed_data(dataPath+'/preprocess_corpus_'+lang,0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
    # The word-count matrix is split into two CSV files; stack them back.
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
    sent_len =load_preprocessed_data(dataPath+'/preprocess_sent_len_'+lang,2)
    vec_model= KeyedVectors.load_word2vec_format(dataPath+'/mini.wiki.'+lang+'.align.vec')
    return txt, corpus, txt_split, df_count_word,sent_len, vec_model
# Load the complete texts (and all preprocessed artefacts) in both languages
# once at module import; run() later slices these full datasets.
full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en = load_all_preprocessed_data('en')
full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr = load_all_preprocessed_data('fr')
def plot_word_cloud(text, title, masque, stop_words, background_color = "white"):
    """Render a word cloud of *text* shaped by the image *masque* and show it in Streamlit.

    Parameters: *title* is translated via tr(); *stop_words* is the set of
    words excluded from the cloud; *background_color* defaults to white.
    """
    shape = np.array(Image.open(str(masque)))
    # Cloud layout: at most 200 words, fixed seed for a reproducible layout.
    cloud = WordCloud(background_color=background_color, max_words=200,
                      stopwords=stop_words, mask=shape,
                      max_font_size=50, random_state=42)
    fig = plt.figure(figsize=(20, 10))
    plt.title(tr(title), fontsize=25, color="green")
    cloud.generate(text)
    # Hide both axes — only the image itself should be visible.
    axes = plt.gca()
    axes.axes.get_xaxis().set_visible(False)
    axes.axes.get_yaxis().set_visible(False)
    plt.imshow(cloud)
    st.pyplot(fig)
def drop_df_null_col(df):
    """Return *df* restricted to the columns that contain at least one non-zero value."""
    # Boolean mask per column: True where every entry equals zero.
    all_zero = df.eq(0).all()
    # Keep only the columns where the mask is False.
    return df.loc[:, ~all_zero]
def calcul_occurence(df_count_word):
    """Total occurrences per word, as a DataFrame sorted in descending order.

    The result has the word as index (named 'mot'), an 'occurences' column
    with the summed counts, and a 'mots' column duplicating the index
    (convenient for plotting libraries that want a plain column).
    """
    totals = df_count_word.sum().sort_values(ascending=False)
    nb_occurences = totals.to_frame(name='occurences')
    nb_occurences.index.name = 'mot'
    nb_occurences['mots'] = nb_occurences.index
    return nb_occurences
def dist_frequence_mots(df_count_word):
    """Bar-plot (in Streamlit) the 40 most frequent words of a word-count DataFrame."""
    # Drop all-zero columns, then rank the remaining words by total count.
    counts = calcul_occurence(drop_df_null_col(df_count_word))
    sns.set()
    fig = plt.figure()
    plt.title(tr("Nombre d'apparitions des mots"), fontsize=16)
    ax = sns.barplot(x='mots', y='occurences', data=counts.head(40))
    # Slanted labels so the 40 words stay readable.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
    st.pyplot(fig)
def dist_longueur_phrase(sent_len,sent_len2, lang1, lang2 ):
    """Plot (in Streamlit) the sentence-length distributions of two languages.

    Parameters
    ----------
    sent_len, sent_len2 : sequence[int]
        Number of words per sentence for *lang1* and *lang2* respectively
        (same length: the sentences are aligned pairs).
    lang1, lang2 : str
        Display names used as the series labels.
    """
    # NOTE: two large blocks of dead plotly code (kept as no-op triple-quoted
    # strings) were removed here; the seaborn path below was the only code
    # that ever executed.
    df = pd.DataFrame({lang1:sent_len,lang2:sent_len2})
    sns.set()
    fig = plt.figure()
    fig.tight_layout()
    # Overlaid step histograms, one bin per word count in [2, 22],
    # normalised per language (common_norm=False) as proportions.
    chart = sns.histplot(df, color=['r','b'], label=[lang1,lang2], binwidth=1, binrange=[2,22], element="step",
                         common_norm=False, multiple="layer", discrete=True, stat='proportion')
    plt.xticks([2,4,6,8,10,12,14,16,18,20,22])
    chart.set(title=tr('Distribution du nombre de mots sur '+str(len(sent_len))+' phrase(s)'))
    st.pyplot(fig)
| def find_color(x,min_w,max_w): | |
| b_min = 0.0*(max_w-min_w)+min_w | |
| b_max = 0.05*(max_w-min_w)+min_w | |
| x = max(x,b_min) | |
| x = min(b_max, x) | |
| c = (x - b_min)/(b_max-b_min) | |
| return round(c) | |
def graphe_co_occurence(txt_split,corpus):
    # Build and display (via Streamlit) a word co-occurrence graph: nodes are
    # vocabulary words, edges link words appearing in the same sentence,
    # weighted by how often they co-occur.
    # NOTE(review): `corpus` is accepted but never read in this body.
    dic = corpora.Dictionary(txt_split)  # dictionary of every remaining token
    # Rough equivalent of a DTM: DFM, Document Feature Matrix (bag-of-words per sentence)
    dfm = [dic.doc2bow(tok) for tok in txt_split]
    mes_labels = [k for k, v in dic.token2id.items()]

    from gensim.matutils import corpus2csc
    term_matrice = corpus2csc(dfm)

    # term x term product gives co-occurrence counts; zero the diagonal
    # (a word trivially co-occurs with itself) then drop explicit zeros.
    term_matrice = np.dot(term_matrice, term_matrice.T)
    for i in range(len(mes_labels)):
        term_matrice[i,i]= 0
    term_matrice.eliminate_zeros()

    # NOTE(review): nx.from_scipy_sparse_matrix was removed in networkx 3.0
    # (renamed from_scipy_sparse_array) — confirm the pinned networkx version.
    G = nx.from_scipy_sparse_matrix(term_matrice)
    # NOTE(review): this line assigns an *attribute* named `add_nodes`; it
    # does not add any nodes (they already come from the matrix). Probably
    # intended G.add_nodes_from(...) or a relabeling — confirm before changing.
    G.add_nodes = dic
    pos=nx.spring_layout(G, k=5)  # node positions

    # Node importance = degree**1.3, reused for both node colour and size.
    importance = dict(nx.degree(G))
    importance = [round((v**1.3)) for v in importance.values()]

    # Per-edge rendering attributes derived from the co-occurrence weight:
    # binary colour (see find_color), width in [0.2, 3.6], alpha in [0.3, 0.6].
    edges,weights = zip(*nx.get_edge_attributes(G,'weight').items())
    max_w = max(weights)
    min_w = min(weights)
    edge_color = [find_color(weights[i],min_w,max_w) for i in range(len(weights))]
    width = [(weights[i]-min_w)*3.4/(max_w-min_w)+0.2 for i in range(len(weights))]
    alpha = [(weights[i]-min_w)*0.3/(max_w-min_w)+0.3 for i in range(len(weights))]

    fig = plt.figure();
    nx.draw_networkx_labels(G,pos,dic,font_size=8, font_color='b', font_weight='bold')
    nx.draw_networkx_nodes(G,pos, dic, \
                           node_color= importance, \
                           node_size=importance, \
                           cmap=plt.cm.RdYlGn, \
                           alpha=0.4);
    nx.draw_networkx_edges(G,pos,width=width,edge_color=edge_color, alpha=alpha,edge_cmap=plt.cm.RdYlGn)
    plt.axis("off");
    st.pyplot(fig)
def proximite():
    """Show a 2-D t-SNE projection of aligned word embeddings in Streamlit.

    English words (green) are plotted next to their French translations
    (blue); with aligned MUSE vectors, translated pairs should sit close
    together.
    """
    global vec_model_en,vec_model_fr

    nb_words = st.slider(tr('Nombre de mots à afficher')+' :',10,50, value=20)

    # Reference bilingual dictionary: English word index -> French column.
    df = pd.read_csv(dataPath+'/dict_we_en_fr',header=0,index_col=0, encoding ="utf-8", keep_default_na=False)
    words_en = df.index.to_list()[:nb_words]
    words_fr = df['Francais'].to_list()[:nb_words]

    # Gather vectors/labels: all English words first, then the French ones,
    # so index < nb_words identifies the English half below.
    labels = list(words_en) + list(words_fr)
    vectors = [vec_model_en[w] for w in words_en] + [vec_model_fr[w] for w in words_fr]

    # Project the embeddings to 2-D (fixed seed => reproducible layout).
    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2000, random_state=23)
    coords = tsne_model.fit_transform(pd.DataFrame(vectors))

    fig = plt.figure(figsize=(16, 16))
    for i, (px, py) in enumerate(coords):
        color = 'green' if i < nb_words else 'blue'
        plt.scatter(px, py)
        plt.annotate(labels[i],
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     color= color,
                     size=20)
    plt.title(tr("Proximité des mots anglais avec leur traduction"), fontsize=30, color="green")
    plt.legend(loc='best')
    st.pyplot(fig)
def run():
    # Streamlit page entry point for the "Data Vizualization" page.
    # Lets the user pick a language and a slice of the parallel corpus, then
    # shows five analysis tabs: word cloud, word frequency, sentence-length
    # distribution, co-occurrence graph, and embedding proximity.
    global max_lines, first_line, Langue
    global full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en,full_sent_len_en, vec_model_en
    global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr

    st.write("")
    st.title(tr(title))
    #
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser')+' :',0,137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser')+' :',
                                 options=[1,5,10,15,100, 500, 1000,'Max'])
    if max_lines=='Max':
        max_lines=137860
    # Clamp so the selected slice stays inside the 137 860-line corpus.
    if ((first_line+max_lines)>137860):
        max_lines = max(137860-first_line,0)

    # Slice the preloaded full datasets for the selected language. The other
    # language's sentence lengths are also sliced, for the comparison plot.
    last_line = first_line+max_lines
    if (Langue == 'Anglais'):
        txt_en = full_txt_en[first_line:last_line]
        corpus_en = full_corpus_en[first_line:last_line]
        txt_split_en = full_txt_split_en[first_line:last_line]
        # .loc is label-based and end-inclusive, hence last_line-1.
        df_count_word_en =full_df_count_word_en.loc[first_line:last_line-1]
        sent_len_en = full_sent_len_en[first_line:last_line]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
    else:
        txt_fr = full_txt_fr[first_line:last_line]
        corpus_fr = full_corpus_fr[first_line:last_line]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        df_count_word_fr =full_df_count_word_fr.loc[first_line:last_line-1]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
        sent_len_en = full_sent_len_en[first_line:last_line]

    # Preview the selected text slice (capped at max_lines_to_display rows).
    if (Langue=='Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    tab1, tab2, tab3, tab4, tab5 = st.tabs([tr("World Cloud"), tr("Frequence"),tr("Distribution longueur"), tr("Co-occurence"), tr("Proximité")])

    # Tab 1: word cloud of the selected slice.
    with tab1:
        st.subheader(tr("World Cloud"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mot de taille importante dans une langue,
            apparaissent avec une taille identique dans l'autre langue.
            La traduction mot à mot sera donc peut-être bonne.
            """)
        )
        if (Langue == 'Anglais'):
            text = ""
            # Initialize the stop-words set for the selected language.
            stop_words = set(stopwords.words('english'))
            for e in txt_en : text += e
            plot_word_cloud(text, "English words corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
        else:
            text = ""
            # Initialize the stop-words set for the selected language.
            stop_words = set(stopwords.words('french'))
            for e in txt_fr : text += e
            plot_word_cloud(text,"Mots français du corpus", st.session_state.ImagePath+"/coeur.png", stop_words)

    # Tab 2: bar chart of the most frequent words.
    with tab2:
        st.subheader(tr("Frequence d'apparition des mots"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mot fréquents dans une langue,
            apparaissent aussi fréquemment dans l'autre langue.
            Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
            """)
        )
        if (Langue == 'Anglais'):
            dist_frequence_mots(df_count_word_en)
        else:
            dist_frequence_mots(df_count_word_fr)

    # Tab 3: sentence-length histograms, both languages overlaid.
    with tab3:
        st.subheader(tr("Distribution des longueurs de phrases"))
        st.markdown(tr(
            """
            Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
            on constate une certaine similitude dans les ditributions de longueur de phrases.
            Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
            """)
        )
        if (Langue == 'Anglais'):
            dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais','Français')
        else:
            dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')

    # Tab 4: co-occurrence graph. Only the first 1000 sentences are passed,
    # presumably to keep the graph computation tractable — confirm.
    with tab4:
        st.subheader(tr("Co-occurence des mots dans une phrase"))
        if (Langue == 'Anglais'):
            graphe_co_occurence(txt_split_en[:1000],corpus_en)
        else:
            graphe_co_occurence(txt_split_fr[:1000],corpus_fr)

    # Tab 5: t-SNE proximity of aligned English/French embeddings.
    with tab5:
        st.subheader(tr("Proximité sémantique des mots (Word Embedding)") )
        st.markdown(tr(
            """
            MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
            notamment des "Word Embedding" multilingues
            Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia pour 30 langues qui ont été alignés dans un espace espace vectoriel unique.
            Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).
            """)
        )
        st.markdown(tr(
            """
            En novembre 2015, l'équipe de recherche de Facebook a créé fastText qui est une extension de la bibliothèque word2vec.
            Elle s'appuie sur Word2Vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
            """)
        )
        st.write("")
        proximite()