# Core utilities and data handling
import json
import re
import string
from operator import itemgetter
from pprint import pprint

import pandas as pd
import tqdm
import emoji

# NLP and topic-modeling stack
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from wordcloud import STOPWORDS

# Web UI
import gradio as gr

def give_emoji_free_text(text):
    """
    Removes emojis from tweets.
    Accepts:
        text (str): a tweet
    Returns:
        str: the tweet with emoji-bearing tokens removed
    """
    emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
    clean_text = ' '.join([word for word in text.split() if not any(ch in word for ch in emoji_list)])
    return clean_text
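# Illustrative example (assumes emoji>=2.0, where emoji.EMOJI_DATA maps emoji characters):
# give_emoji_free_text("kape time ☕ sa Katip") -> "kape time sa Katip"
# The whole whitespace-delimited token containing an emoji is dropped, not just the emoji itself.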


def url_free_text(text):
    '''
    Removes URLs from text.
    '''
    text = re.sub(r'http\S+', '', text)
    return text
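# Illustrative example:
# url_free_text("traffic update https://t.co/abc123") -> "traffic update "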


def tokenize(text):
    """
    Parses a string into a list of semantic units (words).
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Each cleaning pass feeds the result of the previous one
    tokens = re.sub(r"http\S+", "", text)                                 # strip URLs
    tokens = re.sub(r"[^a-zA-Z 0-9]", "", tokens)                         # keep alphanumerics and spaces
    tokens = re.sub("[%s]" % re.escape(string.punctuation), "", tokens)   # strip punctuation
    tokens = re.sub(r"\w*\d\w*", "", tokens)                              # drop digit-bearing words

    # Lowercase and split on whitespace
    tokens = tokens.lower().split()

    return tokens
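# Illustrative example:
# tokenize("Grabe ang traffic!!! 2 hours sa EDSA https://t.co/xyz")
# -> ['grabe', 'ang', 'traffic', 'hours', 'sa', 'edsa']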


def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """Trains one LDA model per topic count in [start, limit) and records its c_v coherence."""
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True,
                                                id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


def compute_coherence_values2(corpus, dictionary, k, a, b):
    """Trains a single LDA model with k topics, alpha=a, and eta=b, then returns its c_v
    coherence. Reads the module-level df for the lemmatized texts."""
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                alpha=a,
                                                eta=b,
                                                per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['lemma_tokens'], dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()


def assignTopic(l):
    """Returns the id of the highest-probability topic in a list of (topic_id, probability) pairs."""
    return max(l, key=itemgetter(1))[0]
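# Illustrative example:
# assignTopic([(0, 0.12), (3, 0.71), (4, 0.17)]) -> 3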


def get_topic_value(row, i):
    """Returns the probability stored at position i of a row of (topic_id, probability)
    pairs, falling back to the sole entry when only one topic was assigned."""
    if len(row) == 1:
        return row[0][1]
    return row[i][1]
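# Illustrative example: with row = [(3, 0.71), (0, 0.12), (4, 0.17)] already sorted by
# probability, get_topic_value(row, 1) returns 0.12, the value at position 1, not the
# probability of topic id 1.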


def dataframeProcessing(dataset):
    # Build the stopword set: wordcloud's defaults, a Tagalog list, and common Taglish fillers
    with open('stopwords-tl.json') as f:
        tlStopwords = json.loads(f.read())
    stopwords = set(STOPWORDS)
    stopwords.update(tlStopwords)
    stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])

    # Load the selected dataset and keep only tweets detected as English
    global df
    df = pd.read_csv(dataset + '.csv')
    df.rename(columns={'tweet': 'original_tweets'}, inplace=True)
    df = df[df['language'].isin(['en'])]
    df.reset_index(inplace=True)

    # Strip emojis, then URLs, from the raw tweets
    df['emoji_free_tweets'] = df['original_tweets'].apply(give_emoji_free_text)
    df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

    # Load spaCy's large English model and build a whitespace tokenizer over its vocab
    nlp = spacy.load('en_core_web_lg')
    tokenizer = Tokenizer(nlp.vocab)

    # Merge spaCy's stop words with custom noise tokens, gensim's list, and the Tagalog set
    custom_stopwords = ['hi', '\n', '\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m',
                        'im', 'want', 'like', '$', '@']
    STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
    ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)

    # Tokenize each tweet, keeping only lowercase non-stop-word tokens
    tokens = []
    for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if token.text.lower() not in ALL_STOP_WORDS:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    df['tokens'] = tokens

    # Rejoin tokens into strings so spaCy can lemmatize whole documents
    df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]

    def get_lemmas(text):
        '''Lemmatizes the processed tweets, skipping stop words, punctuation, and pronouns.'''
        lemmas = []
        doc = nlp(text)
        for token in doc:
            if (not token.is_stop) and (not token.is_punct) and (token.pos_ != 'PRON'):
                lemmas.append(token.lemma_)
        return lemmas

    df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)

    # Rejoin the lemmas, then re-tokenize them with the regex-based cleaner above
    df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
    df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

    # Build the gensim dictionary, pruning terms that appear in fewer than 2 tweets
    # or in more than 99% of them
    global id2word
    id2word = Dictionary(df['lemma_tokens'])
    id2word.filter_extremes(no_below=2, no_above=0.99)
    print(len(id2word))

    # Bag-of-words representation of each tweet
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
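    # Illustrative shape of one corpus entry: a list of (token_id, count) pairs, e.g.
    # id2word.doc2bow(['traffic', 'edsa', 'traffic']) might yield [(4, 1), (12, 2)],
    # with the actual ids depending on the fitted dictionary.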

    # Baseline LDA model with an arbitrary 5 topics
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=5,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True)

    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Baseline c_v coherence, for comparison against the tuned models below
    coherence_model_lda = CoherenceModel(model=lda_model, texts=df['lemma_tokens'], dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # Sweep topic counts from 2 to 9 and keep the count with the best coherence
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus,
                                                            texts=df['lemma_tokens'],
                                                            start=2,
                                                            limit=10,
                                                            step=1)
    k_max = max(coherence_values)

    # Index 0 of coherence_values corresponds to 2 topics, hence the +2 offset
    global num_topics
    num_topics = coherence_values.index(k_max) + 2

    # Retrain with the chosen topic count
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True)

    # Grid-search alpha and eta over a 75% sample of the corpus and the full corpus
    alpha = [0.05, 0.1, 0.5, 1, 5, 10]
    beta = [0.05, 0.1, 0.5, 1, 5, 10]

    num_of_docs = len(corpus)
    corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs * 0.75)),
                   corpus]
    corpus_title = ['75% Corpus', '100% Corpus']
    model_results = {'Validation_Set': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                     }

    pbar = tqdm.tqdm(total=len(corpus_sets) * len(alpha) * len(beta))
    for i in range(len(corpus_sets)):
        for a in alpha:
            for b in beta:
                cv = compute_coherence_values2(corpus=corpus_sets[i], dictionary=id2word,
                                               k=num_topics, a=a, b=b)
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
    pbar.close()

    # Keep only full-corpus results and pick the alpha/eta pair with the highest coherence
    params_df = pd.read_csv('lda_tuning_results_new.csv')
    params_df = params_df[params_df.Validation_Set == '100% Corpus']
    params_df.reset_index(inplace=True)

    max_params = params_df.loc[params_df['Coherence'].idxmax()]
    max_coherence = max_params['Coherence']
    max_alpha = max_params['Alpha']
    max_beta = max_params['Beta']

    # Final model: the tuned topic count, alpha, and eta
    lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                      id2word=id2word,
                                                      num_topics=num_topics,
                                                      random_state=100,
                                                      chunksize=200,
                                                      passes=10,
                                                      alpha=max_alpha,
                                                      eta=max_beta,
                                                      per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    # Collect the top-10 words per topic, both as a display string and as token lists
    lda_topics = lda_model_final.show_topics(num_words=10)

    topics = []
    filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
    lda_topics_string = ''

    for topic in lda_topics:
        print(topic)
        lda_topics_string += 'Topic ' + str(topic[0]) + '\n' + str(topic[1]) + '\n\n'
        topics.append(preprocess_string(topic[1], filters))

    # Per-tweet topic distribution: lda_model_final[corpus][i][0] is a list of
    # (topic_id, probability) pairs because per_word_topics=True
    df['topic'] = [sorted(lda_model_final[corpus][text][0]) for text in range(len(df['original_tweets']))]
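    # Illustrative shape of one df['topic'] entry (values depend on the data):
    # [(0, 0.081), (2, 0.644), (4, 0.229)], sorted here by topic id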

    def sort_topics(l):
        return sorted(l, key=lambda x: x[1], reverse=True)

    # Sort each tweet's topics by probability, keep a printable copy, and drop empty rows
    df['topic'] = df['topic'].apply(sort_topics)
    df['topic_string'] = df['topic'].astype(str)
    df = df[df['topic'].map(lambda d: len(d)) > 0]

    # Assign each tweet to its single most probable topic
    df['max_topic'] = df['topic'].map(assignTopic)

    # Group the original tweets by their dominant topic
    topic_clusters = []
    for i in range(num_topics):
        topic_clusters.append(df[df['max_topic'].isin([i])])
        topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()

    # Print the five most representative (highest-probability) tweets per topic
    for i in range(len(topic_clusters)):
        tweets = df.loc[df['max_topic'] == i].copy()
        tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
        tweets_sorted = tweets.sort_values('topic', ascending=False)
        tweets_sorted = tweets_sorted.drop_duplicates(subset=['original_tweets'])
        rep_tweets = tweets_sorted['original_tweets'].tolist()
        print('Topic ', i)
        print(rep_tweets[:5])

    output_df = df[['topic_string', 'original_tweets']].copy()

    return lda_topics_string, output_df


# Gradio UI: choose a dataset; get back the topic summary string and the labeled tweets
iface = gr.Interface(fn=dataframeProcessing,
                     inputs=gr.Dropdown(["katip-december",
                                         "katipunan-december",
                                         "bgc-december",
                                         "bonifacio global city-december"],
                                        label="Dataset"),
                     outputs=["text",
                              gr.Dataframe(headers=['topic_string', 'original_tweets'])])
iface.launch()