separate function for cleaning
Browse files
app.py
CHANGED
|
@@ -185,9 +185,7 @@ def get_topic_value(row, i):
|
|
| 185 |
except Exception as e:
|
| 186 |
print(e)
|
| 187 |
|
| 188 |
-
def
|
| 189 |
-
|
| 190 |
-
print('cleaning')
|
| 191 |
df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
|
| 192 |
|
| 193 |
# Apply the function above and get tweets free of emojis
|
|
@@ -246,6 +244,13 @@ def full_lda(df):
|
|
| 246 |
# Apply tokenizer
|
| 247 |
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
print('base model setup')
|
| 250 |
# Create an id2word dictionary
|
| 251 |
global id2word
|
|
@@ -532,11 +537,13 @@ def main(dataset, model, progress=gr.Progress(track_tqdm=True)):
|
|
| 532 |
print(df)
|
| 533 |
|
| 534 |
if model == 'LDA':
|
|
|
|
| 535 |
print('doing lda')
|
| 536 |
top_tweets = full_lda(df)
|
| 537 |
print('done lda')
|
| 538 |
place_data = 'test'
|
| 539 |
else:
|
|
|
|
| 540 |
base_bertopic(df)
|
| 541 |
top_tweets = optimized_bertopic()
|
| 542 |
|
|
|
|
| 185 |
except Exception as e:
|
| 186 |
print(e)
|
| 187 |
|
| 188 |
+
def cleaning(df):
|
|
|
|
|
|
|
| 189 |
df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
|
| 190 |
|
| 191 |
# Apply the function above and get tweets free of emojis
|
|
|
|
| 244 |
# Apply tokenizer
|
| 245 |
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
|
| 246 |
|
| 247 |
+
return df
|
| 248 |
+
|
| 249 |
+
def full_lda(df):
|
| 250 |
+
|
| 251 |
+
print('cleaning')
|
| 252 |
+
|
| 253 |
+
|
| 254 |
print('base model setup')
|
| 255 |
# Create an id2word dictionary
|
| 256 |
global id2word
|
|
|
|
| 537 |
print(df)
|
| 538 |
|
| 539 |
if model == 'LDA':
|
| 540 |
+
df = cleaning(df)
|
| 541 |
print('doing lda')
|
| 542 |
top_tweets = full_lda(df)
|
| 543 |
print('done lda')
|
| 544 |
place_data = 'test'
|
| 545 |
else:
|
| 546 |
+
df = cleaning(df)
|
| 547 |
base_bertopic(df)
|
| 548 |
top_tweets = optimized_bertopic()
|
| 549 |
|