In [1]:
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Phrases, LdaModel
from gensim.models.phrases import Phraser
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Fetch the NLTK stopword corpus used by stopwords.words('english') further down.
# The call's return value is the cell output.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vldth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the review corpus from CSV (path is relative to the notebook's working
# directory) and preview the first rows under a heading.
df = pd.read_csv('SentiTaglish_ProductsAndServices.csv')
print("Original dataset:", df.head(), sep="\n")

Original dataset:
                                              review  sentiment
0  at first gumagana cya..pero pagnalowbat cya nd...          1
1  grabi pangalawa ko ng order sa shapee pero pur...          1
2  2l gray/black order ko. bakit 850ml lang po pi...          1
3  walang silbing product.. bwesit. di gumagana d...          1
4  d po maganda naman po yung neck fan, pero po n...          4


In [4]:
# Work on a copy of the frame without the 'sentiment' label; the cells shown
# below only use the review text.
reviews_df = df.drop('sentiment', axis='columns')
print(reviews_df.head())

                                              review
0  at first gumagana cya..pero pagnalowbat cya nd...
1  grabi pangalawa ko ng order sa shapee pero pur...
2  2l gray/black order ko. bakit 850ml lang po pi...
3  walang silbing product.. bwesit. di gumagana d...
4  d po maganda naman po yung neck fan, pero po n...


In [5]:
# Flatten the review column into a plain list of strings, one per row.
# Missing reviews are replaced with '' before the string cast: casting NaN
# directly would produce the literal text 'nan', which would then be tokenized
# as a real word. The list keeps the same length and order as reviews_df.
documents = reviews_df['review'].fillna('').astype(str).tolist()

In [6]:
# Load a stopword list (one word per line) from a text file
def load_stopwords(filepath):
    """Read a stopword file and return its entries as a set.

    Args:
        filepath: Path to a UTF-8 text file containing one stopword per line.

    Returns:
        set[str]: Every non-blank line with surrounding whitespace removed.

    Raises:
        OSError: If the file cannot be opened (propagated from ``open``).
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also discards a
    # leading byte-order mark; with plain 'utf-8' the BOM would stay attached
    # to the first word, which would then never match any token.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return {word for word in stripped_lines if word}

In [7]:
# Build the stopword vocabulary for both languages found in the reviews.

# Tagalog/Filipino entries come from a local text file (one word per line).
tagalog_stopwords = load_stopwords("stopwords-new.txt")

# English entries come from the NLTK stopwords corpus.
english_stopwords = stopwords.words('english')

# Single lookup set covering both languages, used when filtering tokens.
combined_stopwords = set(english_stopwords) | set(tagalog_stopwords)

In [8]:
# Tokenize documents and strip stopwords
def preprocess_data(documents):
    """Turn raw documents into stopword-free token lists.

    Each entry of ``documents`` is coerced to ``str``, tokenized with
    ``simple_preprocess``, and every token found in the module-level
    ``combined_stopwords`` collection is discarded.

    Returns:
        list[list[str]]: One token list per input document, in input order.
    """
    tokenized_docs = []
    for doc in documents:
        kept_tokens = []
        for word in simple_preprocess(str(doc)):
            if word in combined_stopwords:
                continue
            kept_tokens.append(word)
        tokenized_docs.append(kept_tokens)
    return tokenized_docs

In [9]:
# 1. Tokenize the raw documents and drop stopwords (one token list per document)
processed_texts = preprocess_data(documents)

# 2. Learn bigram collocations, then freeze them into an efficient Phraser
bigram = Phrases(processed_texts, threshold=5, min_count=3)
bigram_mod = Phraser(bigram)

# 3. Learn trigrams on the bigram-merged corpus and freeze those as well.
# NOTE(review): min_count is not passed here, so this model relies on the
# library default instead of the 3 used for bigrams — confirm that is intended.
trigram = Phrases(bigram[processed_texts], threshold=5)
trigram_mod = Phraser(trigram)

# 4. Merge detected phrases into single tokens
def make_ngrams(texts):
    """Apply the bigram and then the trigram phraser to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        list: One entry per document, each the result of passing the document
        through the module-level ``bigram_mod`` and then ``trigram_mod``.
    """
    return [trigram_mod[bigram_mod[tokens]] for tokens in texts]

# Run every tokenized document through the bigram/trigram phrasers
ngrammed_texts = make_ngrams(processed_texts)

# 5. Collapse each token list into one space-separated string per document
texts_for_bertopic = list(map(' '.join, ngrammed_texts))

In [10]:
# Smoke test: the message below is only printed if the bertopic import succeeds.
from bertopic import BERTopic
print("BERTopic is working!")

BERTopic is working!


In [11]:
# Initialize BERTopic with its "multilingual" language setting
# (the reviews mix Tagalog and English).
# NOTE(review): no random seed is fixed anywhere in this cell, so topic ids and
# counts may differ between runs — confirm whether reproducibility is required.
topic_model = BERTopic(language="multilingual")

# Fit the model on the preprocessed texts.
# `topics` and `probs` are the two values returned by fit_transform for the
# input documents.
# NOTE(review): the input has already had stopwords removed and phrases merged;
# BERTopic's own guidance generally favours embedding raw text — TODO confirm
# this preprocessing is intended.
topics, probs = topic_model.fit_transform(texts_for_bertopic)

In [12]:
# Summary table of the topics found by the fitted model
topic_info = topic_model.get_topic_info()

# Show the first 20 rows after ranking by the Count column, largest first
display(topic_info.sort_values('Count', ascending=False).iloc[:20])

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3614,-1_item_seller_product_kasi,"[item, seller, product, kasi, order, maganda, ...","[maganda mura thank order next_time, ganda ite..."
1,0,993,0_color_black_white_pink,"[color, black, white, pink, blue, green, wrong...","[disappointed wrong_color purple pink, pink_bl..."
2,1,514,1_size_maliit_size_sizes_add_size,"[size, maliit_size, sizes, add_size, maliit, l...","[wrong_size binigay order size size binigay, c..."
3,2,463,2_dumating_sira_lng_ok,"[dumating, sira, lng, ok, agad, sana, maganda,...",[sobrang nakakadismaya basta maka deliver lng ...
4,3,336,3_price_worth_worth_price_good_price,"[price, worth, worth_price, good_price, sakto_...","[maganda worth_price, ganda worth, okay lng pr..."
5,4,194,4_food_place_masarap_service,"[food, place, masarap, service, staff, rice, c...",[service good energy crew poor took mins food ...
6,5,181,5_order_shop_thank_seller_order_received,"[order, shop, thank_seller, order_received, or...",[goods order_received maganda gumagana thank_s...
7,6,175,6_damage_item_damaged_items,"[damage, item, damaged, items, box, product, t...","[ganda damage thankyou_seller, damage dumating..."
8,7,158,7_good_quality_good_maganda_quality_quality,"[good_quality, good, maganda_quality, quality,...","[good_quality ganda, good_quality, maganda goo..."
9,8,156,8_shoes_sandals_socks_boots,"[shoes, sandals, socks, boots, slippers, legs,...","[maganda shoes price see_get, maganda shoes se..."


In [15]:
# Get and sort topics (largest Count first)
all_topics = topic_model.get_topic_info().sort_values(by='Count', ascending=False)

# Style with borders, aligned text, and background gradient.
# Styler prefixes every selector with the table's own "#T_<uuid> " id, so the
# table element itself is addressed with an empty selector. A 'table' selector
# would only match a table nested inside this one, leaving the
# border-collapse / width rules without effect.
styled_html = (
    all_topics.style
    .set_table_attributes('border="1" class="dataframe"')
    .set_table_styles([
        {'selector': 'th', 'props': [('border', '1px solid black'), ('background-color', '#f2f2f2'), ('padding', '8px')]},
        {'selector': 'td', 'props': [('border', '1px solid black'), ('padding', '8px'), ('text-align', 'left')]},
        {'selector': '', 'props': [('border-collapse', 'collapse'), ('width', '100%')]},
    ])
    # Shade the Count column so larger topics stand out.
    .background_gradient(subset=['Count'], cmap='Blues')
    # Wrap long cell contents instead of stretching the table.
    .set_properties(**{
        'white-space': 'pre-wrap',
        'text-align': 'left',
    })
)

# Save to HTML file. doctype_html=True writes a complete HTML document with a
# charset declaration, so non-ASCII review text renders correctly when the file
# is opened directly in a browser.
styled_html.to_html("all_bertopic_topics.html", doctype_html=True)

In [None]:
# Render the fitted model's topic visualization; the returned figure is the cell output.
topic_model.visualize_topics()

In [None]:
# Bar-chart view of the fitted topics, limited to 20 via top_n_topics.
topic_model.visualize_barchart(top_n_topics=20)