import streamlit as st
from PIL import Image
import os
import ast
import contextlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from gensim import corpora
import networkx as nx
from sklearn.manifold import TSNE
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from translate_app import tr

title = "Sentence Similarity"
sidebar_name = "Sentence Similarity"
dataPath = st.session_state.DataPath
'''
with contextlib.redirect_stdout(open(os.devnull, "w")):
    nltk.download('stopwords')

# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if first_line + max_lines > 137860:
    max_lines = max(137860 - first_line, 0)
# Maximum number of lines to display for the DataFrames
max_lines_to_display = 50

@st.cache_data
def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert uppercase letters to lowercase
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data), first_line + max_lines)]
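# load_data returns at most max_lines lowercased lines of the file, starting
# at first_line (the corpus used here has 137,860 lines in total).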
@st.cache_data
def load_preprocessed_data(path, data_type):
    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
            data = data.split('\n')
            if data_type == 0:
                data = data[:-1]
            elif data_type == 2:
                # ast.literal_eval is a safe replacement for eval here
                data = [ast.literal_eval(i) for i in data[:-1]]
            elif data_type == 3:
                data2 = []
                for d in data[:-1]:
                    data2.append(ast.literal_eval(d))
                data = data2
        return data
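# data_type values accepted by load_preprocessed_data:
#   0 -> plain text, one entry per line
#   1 -> CSV read into a DataFrame
#   2 / 3 -> one Python literal per line (e.g. token lists or sentence
#            lengths), parsed with ast.literal_eval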
@st.cache_data
def load_all_preprocessed_data(lang):
    txt           = load_preprocessed_data(dataPath+'/preprocess_txt_'+lang, 0)
    corpus        = load_preprocessed_data(dataPath+'/preprocess_corpus_'+lang, 0)
    txt_split     = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang, 3)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang, 1),
                               load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang, 1)])
    sent_len      = load_preprocessed_data(dataPath+'/preprocess_sent_len_'+lang, 2)
    vec_model     = KeyedVectors.load_word2vec_format(dataPath+'/mini.wiki.'+lang+'.align.vec')
    return txt, corpus, txt_split, df_count_word, sent_len, vec_model

# Load the complete texts in both languages
full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en, full_sent_len_en, vec_model_en = load_all_preprocessed_data('en')
full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr, full_sent_len_fr, vec_model_fr = load_all_preprocessed_data('fr')
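# Note: the mini.wiki.<lang>.align.vec files are aligned word vectors
# (MUSE-style fastText embeddings, per the description in run() below), so the
# English and French vectors live in one shared space and can be compared directly.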
def plot_word_cloud(text, title, masque, stop_words, background_color="white"):
    mask_coloring = np.array(Image.open(str(masque)))

    # Define the word-cloud layer
    wc = WordCloud(background_color=background_color, max_words=200,
                   stopwords=stop_words, mask=mask_coloring,
                   max_font_size=50, random_state=42)
    # Generate and display the word cloud
    fig = plt.figure(figsize=(20, 10))
    plt.title(tr(title), fontsize=25, color="green")
    wc.generate(text)

    # Hide both axes of the current figure
    plt.gca().get_xaxis().set_visible(False)
    plt.gca().get_yaxis().set_visible(False)

    plt.imshow(wc)
    st.pyplot(fig)
def drop_df_null_col(df):
    # Drop the columns whose values are all 0
    columns_to_drop = df.columns[df.eq(0).all()]
    return df.drop(columns=columns_to_drop)

def calcul_occurence(df_count_word):
    nb_occurences = pd.DataFrame(df_count_word.sum().sort_values(axis=0, ascending=False))
    nb_occurences.columns = ['occurences']
    nb_occurences.index.name = 'mot'
    nb_occurences['mots'] = nb_occurences.index
    return nb_occurences

def dist_frequence_mots(df_count_word):
    df_count_word = drop_df_null_col(df_count_word)
    nb_occurences = calcul_occurence(df_count_word)

    sns.set()
    fig = plt.figure()  # figsize=(4,4)
    plt.title(tr("Nombre d'apparitions des mots"), fontsize=16)
    chart = sns.barplot(x='mots', y='occurences', data=nb_occurences.iloc[:40])
    chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
    st.pyplot(fig)
def dist_longueur_phrase(sent_len, sent_len2, lang1, lang2):
    df = pd.DataFrame({lang1: sent_len, lang2: sent_len2})

    sns.set()
    fig = plt.figure()  # figsize=(12, 6*row_nb)
    fig.tight_layout()
    # Wide-form data: one histogram per column; multiple colors go through `palette`
    chart = sns.histplot(df, palette=['r', 'b'], binwidth=1, binrange=[2, 22], element="step",
                         common_norm=False, multiple="layer", discrete=True, stat='proportion')
    plt.xticks([2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22])
    chart.set(title=tr('Distribution du nombre de mots sur ' + str(len(sent_len)) + ' phrase(s)'))
    st.pyplot(fig)
def find_color(x, min_w, max_w):
    # Clamp x to the bottom 5% of the weight range, then normalise to [0, 1]
    b_min = 0.0 * (max_w - min_w) + min_w
    b_max = 0.05 * (max_w - min_w) + min_w
    x = max(x, b_min)
    x = min(b_max, x)
    c = (x - b_min) / (b_max - b_min)
    return round(c)
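# Note: because of the clamping above, find_color returns 0 for weights in the
# bottom ~2.5% of the range and 1 otherwise, i.e. a binary edge-colour index.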
def graphe_co_occurence(txt_split, corpus):
    dic = corpora.Dictionary(txt_split)  # dictionary of all the words remaining in the tokens

    # (Near) equivalent of the DTM: DFM, Document-Feature Matrix
    dfm = [dic.doc2bow(tok) for tok in txt_split]

    mes_labels = [k for k, v in dic.token2id.items()]

    from gensim.matutils import corpus2csc
    term_matrice = corpus2csc(dfm)

    # Term-term co-occurrence matrix, with the diagonal (self co-occurrence) zeroed out
    term_matrice = term_matrice @ term_matrice.T
    term_matrice.setdiag(0)
    term_matrice.eliminate_zeros()

    # from_scipy_sparse_matrix was removed in networkx 3.x
    G = nx.from_scipy_sparse_array(term_matrice)

    pos = nx.spring_layout(G, k=5)  # node positions

    importance = dict(nx.degree(G))
    importance = [round(v ** 1.3) for v in importance.values()]

    edges, weights = zip(*nx.get_edge_attributes(G, 'weight').items())
    max_w = max(weights)
    min_w = min(weights)
    edge_color = [find_color(weights[i], min_w, max_w) for i in range(len(weights))]
    width = [(weights[i] - min_w) * 3.4 / (max_w - min_w) + 0.2 for i in range(len(weights))]
    alpha = [(weights[i] - min_w) * 0.3 / (max_w - min_w) + 0.3 for i in range(len(weights))]

    fig = plt.figure()
    # Node labels come from the gensim dictionary (token id -> token)
    nx.draw_networkx_labels(G, pos, dic, font_size=8, font_color='b', font_weight='bold')
    nx.draw_networkx_nodes(G, pos, dic,
                           node_color=importance,
                           node_size=importance,
                           cmap=plt.cm.RdYlGn,
                           alpha=0.4)
    nx.draw_networkx_edges(G, pos, width=width, edge_color=edge_color, alpha=alpha, edge_cmap=plt.cm.RdYlGn)
    plt.axis("off")
    st.pyplot(fig)
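# In the resulting graph, node colour and size encode a word's degree (how many
# distinct words it co-occurs with), while edge width, colour and opacity encode
# the co-occurrence counts.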
def proximite():
    global vec_model_en, vec_model_fr

    # Create a t-SNE model of the selected word embeddings and plot it
    labels = []
    tokens = []

    nb_words = st.slider(tr('Nombre de mots à afficher')+' :', 10, 50, value=20)
    df = pd.read_csv(dataPath+'/dict_we_en_fr', header=0, index_col=0, encoding="utf-8", keep_default_na=False)
    words_en = df.index.to_list()[:nb_words]
    words_fr = df['Francais'].to_list()[:nb_words]

    for word in words_en:
        tokens.append(vec_model_en[word])
        labels.append(word)
    for word in words_fr:
        tokens.append(vec_model_fr[word])
        labels.append(word)
    tokens = pd.DataFrame(tokens)

    # Project the word vectors down to 2 dimensions for plotting
    # (NB: newer scikit-learn versions rename TSNE's n_iter to max_iter)
    tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=2000, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    fig = plt.figure(figsize=(16, 16))
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    for i in range(len(x)):
        # English words in green, their French translations in blue
        color = 'green' if i < nb_words else 'blue'
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     color=color,
                     size=20)
    plt.title(tr("Proximité des mots anglais avec leur traduction"), fontsize=30, color="green")
    st.pyplot(fig)
'''
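# NB: the triple-quoted block above (and the one inside run() below) turns the
# exploratory-analysis code into a plain string literal, i.e. it is deliberately
# disabled; only the sentence-similarity demo at the end of run() executes.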
def run():
    '''
    global max_lines, first_line, Langue
    global full_txt_en, full_corpus_en, full_txt_split_en, full_df_count_word_en, full_sent_len_en, vec_model_en
    global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr, full_sent_len_fr, vec_model_fr

    st.write("")
    st.title(tr(title))

    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'), ('Anglais', 'Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser')+' :', 0, 137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser')+' :',
                                 options=[1, 5, 10, 15, 100, 500, 1000, 'Max'])
    if max_lines == 'Max':
        max_lines = 137860
    if first_line + max_lines > 137860:
        max_lines = max(137860 - first_line, 0)
    # Load the selected texts (max lines = max_lines)
    last_line = first_line + max_lines
    if Langue == 'Anglais':
        txt_en = full_txt_en[first_line:last_line]
        corpus_en = full_corpus_en[first_line:last_line]
        txt_split_en = full_txt_split_en[first_line:last_line]
        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
        sent_len_en = full_sent_len_en[first_line:last_line]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
    else:
        txt_fr = full_txt_fr[first_line:last_line]
        corpus_fr = full_corpus_fr[first_line:last_line]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]
        sent_len_fr = full_sent_len_fr[first_line:last_line]
        sent_len_en = full_sent_len_en[first_line:last_line]

    if Langue == 'Anglais':
        st.dataframe(pd.DataFrame(data=full_txt_en, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")
    tab1, tab2, tab3, tab4, tab5 = st.tabs([tr("Word Cloud"), tr("Frequence"), tr("Distribution longueur"), tr("Co-occurence"), tr("Proximité")])

    with tab1:
        st.subheader(tr("Word Cloud"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mots de taille importante dans une langue
            apparaissent avec une taille identique dans l'autre langue.
            La traduction mot à mot sera donc peut-être bonne.
            """)
        )
        if Langue == 'Anglais':
            text = ""
            # Initialise the stop-words set
            stop_words = set(stopwords.words('english'))
            for e in txt_en: text += e
            plot_word_cloud(text, "English words corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
        else:
            text = ""
            # Initialise the stop-words set
            stop_words = set(stopwords.words('french'))
            for e in txt_fr: text += e
            plot_word_cloud(text, "Mots français du corpus", st.session_state.ImagePath+"/coeur.png", stop_words)
    with tab2:
        st.subheader(tr("Frequence d'apparition des mots"))
        st.markdown(tr(
            """
            On remarque, en changeant de langue, que certains mots fréquents dans une langue
            apparaissent aussi fréquemment dans l'autre langue.
            Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
            """)
        )
        if Langue == 'Anglais':
            dist_frequence_mots(df_count_word_en)
        else:
            dist_frequence_mots(df_count_word_fr)

    with tab3:
        st.subheader(tr("Distribution des longueurs de phrases"))
        st.markdown(tr(
            """
            Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
            on constate une certaine similitude dans les distributions de longueur de phrases.
            Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
            """)
        )
        if Langue == 'Anglais':
            dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais', 'Français')
        else:
            dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')
    with tab4:
        st.subheader(tr("Co-occurence des mots dans une phrase"))
        if Langue == 'Anglais':
            graphe_co_occurence(txt_split_en[:1000], corpus_en)
        else:
            graphe_co_occurence(txt_split_fr[:1000], corpus_fr)

    with tab5:
        st.subheader(tr("Proximité sémantique des mots (Word Embedding)"))
        st.markdown(tr(
            """
            MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
            notamment des "Word Embedding" multilingues.
            Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia
            pour 30 langues, alignés dans un espace vectoriel unique.
            Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).
            """)
        )
        st.markdown(tr(
            """
            En novembre 2015, l'équipe de recherche de Facebook a créé fastText, qui est une extension de la bibliothèque word2vec.
            Elle s'appuie sur word2vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
            """)
        )
        st.write("")
        proximite()
    '''
| st.write("") | |
| st.title(tr(title)) | |
| sentences = ["This is an example sentence", "Each sentence is converted"] | |
| sentences[0] = st.text_area(label=tr("Saisir un élément issu de la proposition de valeur (quelque soit la langue):"), value="This is an example sentence") | |
| sentences[1] = st.text_area(label=tr("Saisir une phrase issue de l'acte de vente (quelque soit la langue):"), value="Each sentence is converted", height=200) | |
| st.button(label=tr("Validez"), type="primary") | |
    # Encode both sentences with a multilingual sentence-embedding model
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embeddings = model.encode(sentences)
    st.write("Transformation de chaque phrase en vecteur (dimension = 384) :")
    st.write(embeddings)
| st.write("") | |
| # Calculate cosine similarity between the two sentences | |
| similarity = cosine_similarity([embeddings[0]], [embeddings[1]]) | |
| st.write(f"Cosine similarity comprise entre 0 et 1: {similarity[0][0]}") | |
| st.write("") | |
| st.write("") | |
| st.write("") |