Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +564 -35
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,569 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
-
import
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Topic Modelling menggunakan LDA (Latent Dirichlet Allocation) untuk Teks Bahasa Indonesia
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
# 1. Import library yang diperlukan
|
| 6 |
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
import re
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
from pprint import pprint
|
| 11 |
|
| 12 |
+
# NLP processing
|
| 13 |
+
import nltk
|
| 14 |
+
from nltk.tokenize import word_tokenize
|
| 15 |
+
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
| 16 |
|
| 17 |
+
# Gensim
|
| 18 |
+
import gensim
|
| 19 |
+
import gensim.corpora as corpora
|
| 20 |
+
from gensim.models import CoherenceModel
|
| 21 |
+
import pyLDAvis
|
| 22 |
+
import pyLDAvis.gensim_models
|
| 23 |
|
| 24 |
+
# Scikit-learn
|
| 25 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 26 |
+
|
| 27 |
+
# Memastikan paket NLTK yang diperlukan sudah terunduh
|
| 28 |
+
# nltk.download('punkt')
|
| 29 |
+
|
| 30 |
+
# 2. Load dataset
|
| 31 |
+
def load_data(file_path):
    """Read the input dataset from a semicolon-separated CSV file.

    Args:
        file_path: path to the CSV file (uses ';' as the field separator).

    Returns:
        A pandas DataFrame with the raw data.
    """
    # The source export uses ';' rather than the default ',' separator.
    return pd.read_csv(file_path, sep=';')
|
| 37 |
+
|
| 38 |
+
# 3. Preprocessing text
|
| 39 |
+
def preprocess_text(text):
    """Normalise one raw text string for topic modelling.

    Applies, in order: lowercasing, URL removal, punctuation/emoji
    removal, digit removal, and whitespace collapsing.

    Args:
        text: the raw value; anything that is not a str yields "".

    Returns:
        The cleaned, stripped string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()
    # Order matters: URLs must be stripped before punctuation, otherwise
    # 'https' fragments would survive as ordinary tokens.
    passes = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'[^\w\s]', ''),                # punctuation / emoji
        (r'\d+', ''),                    # digits
        (r'\s+', ' '),                   # collapse whitespace runs
    )
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
| 65 |
+
|
| 66 |
+
# 4. Stopwords Bahasa Indonesia
|
| 67 |
+
def get_stopwords():
    """Return the set of Indonesian stopwords used by the pipeline.

    Combines a general-purpose Indonesian stopword list with extra
    colloquial/slang terms observed in this dataset. Returned as a set
    so token filtering gets O(1) membership tests.
    """
    stopwords = [
        'ada', 'adalah', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 'akankah', 'akhir',
        'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 'anda', 'andalah', 'antar', 'antara',
        'antaranya', 'apa', 'apaan', 'apabila', 'apakah', 'apalagi', 'apatah', 'artinya', 'asal',
        'asalkan', 'atas', 'atau', 'ataukah', 'ataupun', 'awal', 'awalnya', 'bagai', 'bagaikan',
        'bagaimana', 'bagaimanakah', 'bagaimanapun', 'bagi', 'bagian', 'bahkan', 'bahwa', 'bahwasanya',
        'baik', 'bakal', 'bakalan', 'balik', 'banyak', 'bapak', 'baru', 'bawah', 'beberapa', 'begini',
        'beginian', 'beginikah', 'beginilah', 'begitu', 'begitukah', 'begitulah', 'begitupun', 'bekerja',
        'belakang', 'belakangan', 'belum', 'belumlah', 'benar', 'benarkah', 'benarlah', 'berada',
        'berakhir', 'berakhirlah', 'berakhirnya', 'berapa', 'berapakah', 'berapalah', 'berapapun',
        'berarti', 'berawal', 'berbagai', 'berdatangan', 'beri', 'berikan', 'berikut', 'berikutnya',
        'berjumlah', 'berkali-kali', 'berkata', 'berkehendak', 'berkeinginan', 'berkenaan', 'berlainan',
        'berlalu', 'berlangsung', 'berlebihan', 'bermacam', 'bermacam-macam', 'bermaksud', 'bermula',
        'bersama', 'bersama-sama', 'bersiap', 'bersiap-siap', 'bertanya', 'bertanya-tanya', 'berturut',
        'berturut-turut', 'bertutur', 'berujar', 'berupa', 'besar', 'betul', 'betulkah', 'biasa',
        'biasanya', 'bila', 'bilakah', 'bisa', 'bisakah', 'boleh', 'bolehkah', 'bolehlah', 'buat',
        'bukan', 'bukankah', 'bukanlah', 'bukannya', 'bulan', 'bung', 'cara', 'caranya', 'cukup',
        'cukupkah', 'cukuplah', 'cuma', 'dahulu', 'dalam', 'dan', 'dapat', 'dari', 'daripada',
        'datang', 'dekat', 'demi', 'demikian', 'demikianlah', 'dengan', 'depan', 'di', 'dia',
        'diakhiri', 'diakhirinya', 'dialah', 'diantara', 'diantaranya', 'diberi', 'diberikan',
        'diberikannya', 'dibuat', 'dibuatnya', 'didapat', 'didatangkan', 'digunakan', 'diibaratkan',
        'diibaratkannya', 'diingat', 'diingatkan', 'diinginkan', 'dijawab', 'dijelaskan', 'dijelaskannya',
        'dikarenakan', 'dikatakan', 'dikatakannya', 'dikerjakan', 'diketahui', 'diketahuinya',
        'dikira', 'dilakukan', 'dilalui', 'dilihat', 'dimaksud', 'dimaksudkan', 'dimaksudkannya',
        'dimaksudnya', 'diminta', 'dimintai', 'dimisalkan', 'dimulai', 'dimulailah', 'dimulainya',
        'dimungkinkan', 'dini', 'dipastikan', 'diperbuat', 'diperbuatnya', 'dipergunakan',
        'diperkirakan', 'diperlihatkan', 'diperlukan', 'diperlukannya', 'dipersoalkan',
        'dipertanyakan', 'dipunyai', 'diri', 'dirinya', 'disampaikan', 'disebut', 'disebutkan',
        'disebutkannya', 'disini', 'disinilah', 'ditambahkan', 'ditandaskan', 'ditanya', 'ditanyai',
        'ditanyakan', 'ditegaskan', 'ditujukan', 'ditunjuk', 'ditunjuki', 'ditunjukkan',
        'ditunjukkannya', 'ditunjuknya', 'dituturkan', 'dituturkannya', 'diucapkan', 'diucapkannya',
        'diungkapkan', 'dong', 'dua', 'dulu', 'empat', 'enggak', 'enggaknya', 'entah', 'entahlah',
        'guna', 'gunakan', 'hal', 'hampir', 'hanya', 'hanyalah', 'hari', 'harus', 'haruslah',
        'harusnya', 'hendak', 'hendaklah', 'hendaknya', 'hingga', 'ia', 'ialah', 'ibarat',
        'ibaratkan', 'ibaratnya', 'ibu', 'ikut', 'ingat', 'ingat-ingat', 'ingin', 'inginkah',
        'inginkan', 'ini', 'inikah', 'inilah', 'itu', 'itukah', 'itulah', 'jadi', 'jadilah',
        'jadinya', 'jangan', 'jangankan', 'janganlah', 'jauh', 'jawab', 'jawaban', 'jawabnya',
        'jelas', 'jelaskan', 'jelaslah', 'jelasnya', 'jika', 'jikalau', 'juga', 'jumlah',
        'jumlahnya', 'justru', 'kala', 'kalau', 'kalaulah', 'kalaupun', 'kalian', 'kami',
        'kamilah', 'kamu', 'kamulah', 'kan', 'kapan', 'kapankah', 'kapanpun', 'karena',
        'karenanya', 'kasus', 'kata', 'katakan', 'katakanlah', 'katanya', 'ke', 'keadaan',
        'kebetulan', 'kecil', 'kedua', 'keduanya', 'keinginan', 'kelamaan', 'kelihatan',
        'kelihatannya', 'kelima', 'keluar', 'kembali', 'kemudian', 'kemungkinan', 'kemungkinannya',
        'kenapa', 'kepada', 'kepadanya', 'kesamaannya', 'keseluruhan', 'keseluruhannya',
        'keterlaluan', 'ketika', 'khususnya', 'kini', 'kinilah', 'kira', 'kira-kira', 'kiranya',
        'kita', 'kitalah', 'kok', 'kurang', 'lagi', 'lagian', 'lah', 'lain', 'lainnya', 'lalu',
        'lama', 'lamanya', 'lanjut', 'lanjutnya', 'lebih', 'lewat', 'lima', 'luar', 'macam',
        'maka', 'makanya', 'makin', 'malah', 'malahan', 'mampu', 'mampukah', 'mana', 'manakala',
        'manalagi', 'masa', 'masalah', 'masalahnya', 'masih', 'masihkah', 'masing', 'masing-masing',
        'mau', 'maupun', 'melainkan', 'melakukan', 'melalui', 'melihat', 'melihatnya', 'memang',
        'memastikan', 'memberi', 'memberikan', 'membuat', 'memerlukan', 'memihak', 'meminta',
        'memintakan', 'memisalkan', 'memperbuat', 'mempergunakan', 'memperkirakan', 'memperlihatkan',
        'mempersiapkan', 'mempersoalkan', 'mempertanyakan', 'mempunyai', 'memulai', 'memungkinkan',
        'menaiki', 'menambahkan', 'menandaskan', 'menanti', 'menanti-nanti', 'menantikan',
        'menanya', 'menanyai', 'menanyakan', 'mendapat', 'mendapatkan', 'mendatang', 'mendatangi',
        'mendatangkan', 'menegaskan', 'mengakhiri', 'mengapa', 'mengatakan', 'mengatakannya',
        'mengenai', 'mengerjakan', 'mengetahui', 'menggunakan', 'menghendaki', 'mengibaratkan',
        'mengibaratkannya', 'mengingat', 'mengingatkan', 'menginginkan', 'mengira', 'mengucapkan',
        'mengucapkannya', 'mengungkapkan', 'menjadi', 'menjawab', 'menjelaskan', 'menuju',
        'menunjuk', 'menunjuki', 'menunjukkan', 'menunjuknya', 'menurut', 'menuturkan',
        'menyampaikan', 'menyangkut', 'menyatakan', 'menyebutkan', 'menyeluruh', 'menyiapkan',
        'merasa', 'mereka', 'merekalah', 'merupakan', 'meski', 'meskipun', 'meyakini', 'meyakinkan',
        'minta', 'mirip', 'misal', 'misalkan', 'misalnya', 'mula', 'mulai', 'mulailah', 'mulanya',
        'mungkin', 'mungkinkah', 'nah', 'naik', 'namun', 'nanti', 'nantinya', 'nyaris', 'nyatanya',
        'oleh', 'olehnya', 'pada', 'padahal', 'padanya', 'pak', 'paling', 'panjang', 'pantas',
        'para', 'pasti', 'pastilah', 'penting', 'pentingnya', 'per', 'percuma', 'perlu', 'perlukah',
        'perlunya', 'pernah', 'persoalan', 'pertama', 'pertama-tama', 'pertanyaan', 'pertanyakan',
        'pihak', 'pihaknya', 'pukul', 'pula', 'pun', 'punya', 'rasa', 'rasanya', 'rata', 'rupanya',
        'saat', 'saatnya', 'saja', 'sajalah', 'saling', 'sama', 'sama-sama', 'sambil', 'sampai',
        'sampai-sampai', 'sampaikan', 'sana', 'sangat', 'sangatlah', 'satu', 'saya', 'sayalah',
        'se', 'sebab', 'sebabnya', 'sebagai', 'sebagaimana', 'sebagainya', 'sebagian', 'sebaik',
        'sebaik-baiknya', 'sebaiknya', 'sebaliknya', 'sebanyak', 'sebegini', 'sebegitu', 'sebelum',
        'sebelumnya', 'sebenarnya', 'seberapa', 'sebesar', 'sebetulnya', 'sebisanya', 'sebuah',
        'sebut', 'sebutlah', 'sebutnya', 'secara', 'secukupnya', 'sedang', 'sedangkan', 'sedemikian',
        'sedikit', 'sedikitnya', 'seenaknya', 'segala', 'segalanya', 'segera', 'seharusnya',
        'sehingga', 'seingat', 'sejak', 'sejauh', 'sejenak', 'sejumlah', 'sekadar', 'sekadarnya',
        'sekali', 'sekali-kali', 'sekalian', 'sekaligus', 'sekalipun', 'sekarang', 'sekarang',
        'sekecil', 'seketika', 'sekiranya', 'sekitar', 'sekitarnya', 'sekurang-kurangnya',
        'sekurangnya', 'sela', 'selain', 'selaku', 'selalu', 'selama', 'selama-lamanya',
        'selamanya', 'selanjutnya', 'seluruh', 'seluruhnya', 'semacam', 'semakin', 'semampu',
        'semampunya', 'semasa', 'semasih', 'semata', 'semata-mata', 'semaunya', 'sementara',
        'semisal', 'semisalnya', 'sempat', 'semua', 'semuanya', 'semula', 'sendiri', 'sendirian',
        'sendirinya', 'seolah', 'seolah-olah', 'seorang', 'sepanjang', 'sepantasnya', 'sepantasnyalah',
        'seperlunya', 'seperti', 'sepertinya', 'sepihak', 'sering', 'seringnya', 'serta',
        'serupa', 'sesaat', 'sesama', 'sesampai', 'sesegera', 'sesekali', 'seseorang',
        'sesuatu', 'sesuatunya', 'sesudah', 'sesudahnya', 'setelah', 'setempat', 'setengah',
        'seterusnya', 'setiap', 'setiba', 'setibanya', 'setidak-tidaknya', 'setidaknya',
        'setinggi', 'seusai', 'sewaktu', 'siap', 'siapa', 'siapakah', 'siapapun', 'sini',
        'sinilah', 'soal', 'soalnya', 'suatu', 'sudah', 'sudahkah', 'sudahlah', 'supaya',
        'tadi', 'tadinya', 'tahu', 'tahun', 'tak', 'tambah', 'tambahnya', 'tampak', 'tampaknya',
        'tandas', 'tandasnya', 'tanpa', 'tanya', 'tanyakan', 'tanyanya', 'tapi', 'tegas',
        'tegasnya', 'telah', 'tempat', 'tengah', 'tentang', 'tentu', 'tentulah', 'tentunya',
        'tepat', 'terakhir', 'terasa', 'terbanyak', 'terdahulu', 'terdapat', 'terdiri',
        'terhadap', 'terhadapnya', 'teringat', 'teringat-ingat', 'terjadi', 'terjadilah',
        'terjadinya', 'terkira', 'terlalu', 'terlebih', 'terlihat', 'termasuk', 'ternyata',
        'tersampaikan', 'tersebut', 'tersebutlah', 'tertentu', 'tertuju', 'terus', 'terutama',
        'tetap', 'tetapi', 'tiap', 'tiba', 'tiba-tiba', 'tidak', 'tidakkah', 'tidaklah',
        'tiga', 'tinggi', 'toh', 'tunjuk', 'turut', 'tutur', 'tuturnya', 'ucap', 'ucapnya',
        'ujar', 'ujarnya', 'umum', 'umumnya', 'ungkap', 'ungkapnya', 'untuk', 'usah',
        'usai', 'waduh', 'wah', 'wahai', 'waktu', 'waktunya', 'walau', 'walaupun', 'wong',
        'yaitu', 'yakin', 'yakni', 'yang', 'yg'
    ]

    # Dataset-specific additions found during exploratory analysis:
    # chat slang, abbreviations and laughter tokens common in reviews.
    additional_stopwords = ['ya', 'nya', 'yg', 'aja', 'gw', 'ga', 'udah', 'kek', 'gak', 'nya',
                            'nih', 'sih', 'kalo', 'eh', 'aku', 'kau', 'lo', 'lu', 'kan', 'kok',
                            'juga', 'udh', 'eh', 'ah', 'kk', 'wkwk', 'wkwkwk', 'haha', 'hahaha',
                            'bang', 'banget', 'emang']

    stopwords.extend(additional_stopwords)
    # Converting to a set also collapses the duplicates present in the
    # raw lists (e.g. 'sekarang', 'yg', 'eh' appear more than once).
    return set(stopwords)
|
| 182 |
+
|
| 183 |
+
# 5. Stemming
|
| 184 |
+
def get_stemmer():
    """Create an Indonesian stemmer via Sastrawi's factory."""
    # The factory wires up Sastrawi's default Indonesian dictionary.
    return StemmerFactory().create_stemmer()
|
| 190 |
+
|
| 191 |
+
# 6. Preprocessing langkah-langkah
|
| 192 |
+
def preprocess_data(df, text_column):
    """Run the full text-preparation pipeline on *df*.

    Adds four columns: 'clean_text', 'tokens', 'tokens_without_stopwords'
    and 'stemmed_tokens'. The input frame is mutated and also returned.

    Args:
        df: DataFrame holding the raw documents.
        text_column: name of the column containing the raw text.

    Returns:
        The same DataFrame with the derived columns attached.
    """
    # Clean and tokenise each document.
    df['clean_text'] = df[text_column].apply(preprocess_text)
    df['tokens'] = df['clean_text'].apply(word_tokenize)

    # Filter out stopwords and very short tokens (<= 2 chars) in one pass.
    stop_set = get_stopwords()
    df['tokens_without_stopwords'] = df['tokens'].apply(
        lambda toks: [t for t in toks if t not in stop_set and len(t) > 2]
    )

    # Reduce each surviving token to its Indonesian stem.
    stemmer = get_stemmer()
    df['stemmed_tokens'] = df['tokens_without_stopwords'].apply(
        lambda toks: [stemmer.stem(t) for t in toks]
    )

    return df
|
| 215 |
+
|
| 216 |
+
# 7. Membuat korpus untuk LDA
|
| 217 |
+
def create_lda_corpus(df, tokens_column='stemmed_tokens'):
    """Build the gensim inputs for LDA from a token column.

    Args:
        df: DataFrame with a column of token lists.
        tokens_column: name of that column (default 'stemmed_tokens').

    Returns:
        (corpus, id2word): the bag-of-words corpus and its Dictionary.
    """
    docs = df[tokens_column]
    id2word = corpora.Dictionary(docs)
    # Each document becomes a list of (token_id, count) pairs.
    bow_corpus = [id2word.doc2bow(doc) for doc in docs]
    return bow_corpus, id2word
|
| 230 |
+
|
| 231 |
+
# 8. Membuat model LDA
|
| 232 |
+
def create_lda_model(corpus, id2word, num_topics=5, passes=10):
    """Train a gensim LDA model over the bag-of-words corpus.

    Args:
        corpus: bag-of-words corpus from create_lda_corpus().
        id2word: the matching gensim Dictionary.
        num_topics: number of topics to fit (default 5).
        passes: training passes over the corpus (default 10).

    Returns:
        The fitted gensim LdaModel.
    """
    params = dict(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=42,  # fixed seed so topics are reproducible
        passes=passes,
        alpha='auto',     # learn the document-topic prior from data
        eta='auto',       # learn the topic-word prior from data
    )
    return gensim.models.ldamodel.LdaModel(**params)
|
| 247 |
+
|
| 248 |
+
# 9. Mengevaluasi model dengan coherence score
|
| 249 |
+
def compute_coherence_score(model, corpus, texts, dictionary):
    """Return the 'c_v' coherence score of an LDA model.

    Args:
        model: the fitted LDA model to evaluate.
        corpus: accepted for interface symmetry.
            NOTE(review): not passed to CoherenceModel here — the 'c_v'
            measure works from *texts*; confirm before removing.
        texts: tokenised documents used by the coherence measure.
        dictionary: the gensim Dictionary of the corpus.

    Returns:
        The coherence value as a float.
    """
    scorer = CoherenceModel(
        model=model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    return scorer.get_coherence()
|
| 261 |
+
|
| 262 |
+
# 10. Mencari jumlah topik optimal
|
| 263 |
+
def find_optimal_topics(corpus, id2word, texts, start=2, limit=10, step=1):
    """Search for the topic count with the best 'c_v' coherence.

    Trains one LDA model for each candidate count in
    range(start, limit, step), records its coherence, saves a plot of the
    coherence curve, and returns the best model.

    Args:
        corpus: bag-of-words corpus from create_lda_corpus().
        id2word: the matching gensim Dictionary.
        texts: tokenised documents for the coherence measure.
        start, limit, step: candidate topic counts, as for range().

    Returns:
        (best_model, optimal_topics, coherence_values, plot_path).
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        model = create_lda_model(corpus=corpus, id2word=id2word, num_topics=num_topics)
        model_list.append(model)
        # Consistency fix: reuse the shared helper instead of rebuilding
        # an identical CoherenceModel inline.
        coherence_values.append(
            compute_coherence_score(model, corpus, texts, id2word)
        )

    # Plot the coherence curve across candidate topic counts.
    plt.figure(figsize=(12, 6))
    plt.plot(range(start, limit, step), coherence_values)
    plt.xlabel("Jumlah Topik")
    plt.ylabel("Coherence Score")
    plt.title("Coherence Score untuk Berbagai Jumlah Topik")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('coherence_score.png')
    plt.close()  # fix: release the figure so repeated calls don't accumulate memory

    # Pick the candidate with the highest coherence.
    max_index = coherence_values.index(max(coherence_values))
    optimal_topics = range(start, limit, step)[max_index]

    return model_list[max_index], optimal_topics, coherence_values, 'coherence_score.png'
|
| 298 |
+
|
| 299 |
+
# 11. Visualisasi model LDA
|
| 300 |
+
def visualize_lda(lda_model, corpus, id2word):
    """Render an interactive pyLDAvis view of the model as an HTML file.

    Returns:
        The path of the written HTML file ('lda_visualization.html').
    """
    out_path = 'lda_visualization.html'
    prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(prepared, out_path)
    return out_path
|
| 308 |
+
|
| 309 |
+
# 12. Format hasil model
|
| 310 |
+
def format_topics_sentences(ldamodel, corpus, texts):
    """Map each document to its dominant topic.

    Args:
        ldamodel: a fitted LDA model (indexable by corpus).
        corpus: the bag-of-words corpus the model was trained on.
        texts: the original documents, appended to the result.

    Returns:
        DataFrame with columns Dominant_Topic, Perc_Contribution and
        Topic_Keywords, plus the original text as a trailing column.
    """
    rows = []
    for doc_topics in ldamodel[corpus]:
        # Rank this document's topics strongest-first and keep the top one.
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        if not ranked:
            continue
        topic_num, prop_topic = ranked[0]
        keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        rows.append((int(topic_num), round(prop_topic, 4), keywords))

    sent_topics_df = pd.DataFrame(
        rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Attach the source text alongside each topic assignment.
    return pd.concat([sent_topics_df, pd.Series(texts)], axis=1)
|
| 339 |
+
|
| 340 |
+
# 13. Fungsi untuk menyimpan dan memuat hasil processing
|
| 341 |
+
def save_results(results, output_dir='output'):
    """Persist all topic-modelling artefacts to *output_dir*.

    Saves the LDA model, dictionary and corpus in gensim's native
    formats, plus CSVs for topic keywords, coherence scores and (when
    present) the preprocessed data.

    Args:
        results: dict as produced by run_topic_modeling(); must contain
            'model', 'id2word', 'corpus', 'df_topic_keywords' and
            'coherence_values'; 'df_processed' is optional.
        output_dir: destination directory, created if missing.

    Returns:
        The output directory path.
    """
    import os

    # Idiomatic replacement for the exists()/makedirs() pair.
    os.makedirs(output_dir, exist_ok=True)

    # Model, dictionary and corpus.
    results['model'].save(os.path.join(output_dir, 'lda_model'))
    results['id2word'].save(os.path.join(output_dir, 'id2word.dictionary'))
    corpora.MmCorpus.serialize(os.path.join(output_dir, 'corpus.mm'), results['corpus'])

    # Topic keyword table.
    results['df_topic_keywords'].to_csv(os.path.join(output_dir, 'topic_keywords.csv'), index=False)

    # Coherence scores per candidate topic count.
    # NOTE(review): assumes the topic search started at 2 topics (the
    # default of find_optimal_topics) — confirm if those defaults change.
    coherence_df = pd.DataFrame({
        'num_topics': list(range(2, 2 + len(results['coherence_values']))),
        'coherence_score': results['coherence_values']
    })
    coherence_df.to_csv(os.path.join(output_dir, 'coherence_scores.csv'), index=False)

    # Preprocessed data, when the caller kept it.
    if 'df_processed' in results:
        results['df_processed'].to_csv(os.path.join(output_dir, 'preprocessed_data.csv'), index=False)

    print(f"Semua hasil telah disimpan di direktori: {output_dir}")

    return output_dir
|
| 373 |
+
|
| 374 |
+
def load_results(output_dir='output'):
    """Reload artefacts previously written by save_results()/run_topic_modeling().

    Args:
        output_dir: directory the artefacts were saved in.

    Returns:
        dict with whichever of 'model', 'id2word', 'corpus',
        'df_topic_keywords', 'coherence_values', 'optimal_topics' and
        'df_processed' could be found on disk.
    """
    import os
    import ast

    results = {}

    # Model, dictionary and corpus (each optional).
    model_path = os.path.join(output_dir, 'lda_model')
    if os.path.exists(model_path):
        results['model'] = gensim.models.ldamodel.LdaModel.load(model_path)

    dict_path = os.path.join(output_dir, 'id2word.dictionary')
    if os.path.exists(dict_path):
        results['id2word'] = corpora.Dictionary.load(dict_path)

    corpus_path = os.path.join(output_dir, 'corpus.mm')
    if os.path.exists(corpus_path):
        results['corpus'] = corpora.MmCorpus(corpus_path)

    # Topic keyword table.
    keywords_path = os.path.join(output_dir, 'topic_keywords.csv')
    if os.path.exists(keywords_path):
        results['df_topic_keywords'] = pd.read_csv(keywords_path)

    # Coherence scores; recover the optimal topic count from the maximum.
    coherence_path = os.path.join(output_dir, 'coherence_scores.csv')
    if os.path.exists(coherence_path):
        coherence_df = pd.read_csv(coherence_path)
        results['coherence_values'] = coherence_df['coherence_score'].tolist()
        results['optimal_topics'] = coherence_df.loc[
            coherence_df['coherence_score'].idxmax(), 'num_topics'
        ]

    # Preprocessed data: token columns were serialized via str(), so
    # rebuild the Python lists.  Security fix: ast.literal_eval replaces
    # the original eval() — it only parses literals, so crafted CSV
    # content can no longer execute arbitrary code.
    processed_path = os.path.join(output_dir, 'preprocessed_data.csv')
    if os.path.exists(processed_path):
        df_processed = pd.read_csv(processed_path)
        for col in ('tokens', 'tokens_without_stopwords', 'stemmed_tokens'):
            if col in df_processed.columns:
                df_processed[col] = df_processed[col].apply(
                    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
                )
        results['df_processed'] = df_processed

    print(f"Hasil berhasil dimuat dari direktori: {output_dir}")

    return results
|
| 425 |
+
def run_topic_modeling(file_path, text_column='text', save_intermediates=True, output_dir='output'):
    """End-to-end LDA topic-modelling pipeline.

    Loads a ';'-separated CSV, preprocesses *text_column*, searches for
    the optimal topic count, visualizes the best model, and maps every
    document to its dominant topic.

    Args:
        file_path: path of the input CSV file.
        text_column: name of the column holding the raw text.
        save_intermediates: when True, persist every artefact (CSVs,
            model, dictionary, corpus) under *output_dir*.
        output_dir: directory used for saved artefacts and plots.

    Returns:
        dict with the trained model, corpus, dictionary, coherence data,
        paths of the generated plots/visualization, and result DataFrames.
    """
    import os

    # Bug fix: the original only ran `import os` inside the
    # `if save_intermediates:` branches, yet used os.path.join
    # unconditionally (plot saving, results dict) — a NameError when
    # save_intermediates=False. Import once here, and make sure the
    # output directory exists since the distribution plot is always
    # written into it.
    os.makedirs(output_dir, exist_ok=True)

    print("Loading data...")
    df = load_data(file_path)

    print("Preprocessing data...")
    df_processed = preprocess_data(df, text_column)

    if save_intermediates:
        # List columns must be stringified before CSV serialization.
        df_to_save = df_processed.copy()
        for col in ['tokens', 'tokens_without_stopwords', 'stemmed_tokens']:
            if col in df_to_save.columns:
                df_to_save[col] = df_to_save[col].apply(str)

        df_to_save.to_csv(os.path.join(output_dir, 'preprocessed_data.csv'), index=False)
        print(f"Data preprocessing disimpan di {os.path.join(output_dir, 'preprocessed_data.csv')}")

    print("Creating corpus...")
    corpus, id2word = create_lda_corpus(df_processed)

    if save_intermediates:
        id2word.save(os.path.join(output_dir, 'id2word.dictionary'))
        corpora.MmCorpus.serialize(os.path.join(output_dir, 'corpus.mm'), corpus)
        print(f"Corpus dan dictionary disimpan di direktori {output_dir}")

    print("Finding optimal number of topics...")
    best_model, optimal_topics, coherence_values, coherence_plot = find_optimal_topics(
        corpus=corpus,
        id2word=id2word,
        texts=df_processed['stemmed_tokens']
    )

    print(f"Optimal number of topics: {optimal_topics}")
    print(f"Best coherence score: {max(coherence_values)}")

    if save_intermediates:
        best_model.save(os.path.join(output_dir, 'lda_model'))

        # NOTE(review): assumes find_optimal_topics used its default
        # start=2 — confirm if those defaults change.
        coherence_df = pd.DataFrame({
            'num_topics': list(range(2, 2 + len(coherence_values))),
            'coherence_score': coherence_values
        })
        coherence_df.to_csv(os.path.join(output_dir, 'coherence_scores.csv'), index=False)
        print(f"Model LDA dan coherence scores disimpan di direktori {output_dir}")

    print("Visualizing LDA model...")
    vis_path = visualize_lda(best_model, corpus, id2word)

    print("Topics and their keywords:")
    topics = best_model.print_topics()
    pprint(topics)

    if save_intermediates:
        topics_df = pd.DataFrame({
            'topic_id': [i for i, _ in topics],
            'keywords': [keywords for _, keywords in topics]
        })
        topics_df.to_csv(os.path.join(output_dir, 'topics_keywords.csv'), index=False)
        print(f"Topik dan kata kunci disimpan di {os.path.join(output_dir, 'topics_keywords.csv')}")

    # Map every document to its dominant topic.
    df_topic_sents_keywords = format_topics_sentences(
        ldamodel=best_model,
        corpus=corpus,
        texts=df_processed['clean_text']
    )

    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

    if save_intermediates:
        df_dominant_topic.to_csv(os.path.join(output_dir, 'document_topics.csv'), index=False)
        print(f"Pemetaan dokumen ke topik disimpan di {os.path.join(output_dir, 'document_topics.csv')}")

    print("\nDominant Topic Distribution:")
    topic_counts = df_dominant_topic['Dominant_Topic'].value_counts().reset_index()
    topic_counts.columns = ['Topic_Num', 'Count']
    print(topic_counts)

    if save_intermediates:
        topic_counts.to_csv(os.path.join(output_dir, 'topic_distribution.csv'), index=False)
        print(f"Distribusi topik disimpan di {os.path.join(output_dir, 'topic_distribution.csv')}")

    # Plot the dominant-topic distribution.
    plt.figure(figsize=(10, 6))
    plt.bar(topic_counts['Topic_Num'], topic_counts['Count'])
    plt.xlabel('Topic Number')
    plt.ylabel('Count')
    plt.title('Distribution of Dominant Topics')
    plt.xticks(topic_counts['Topic_Num'])
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'topic_distribution.png'))
    plt.close()  # fix: release figure memory

    results = {
        'model': best_model,
        'corpus': corpus,
        'id2word': id2word,
        'optimal_topics': optimal_topics,
        'coherence_values': coherence_values,
        'coherence_plot': coherence_plot,
        'visualization': vis_path,
        'topic_distribution': os.path.join(output_dir, 'topic_distribution.png'),
        'df_topic_keywords': df_dominant_topic,
        'df_processed': df_processed
    }

    print(f"\nSemua hasil telah disimpan di direktori: {output_dir}")
    return results
|
| 561 |
|
| 562 |
+
if __name__ == "__main__":
    # Adjust the CSV path below to match your dataset location.
    results = run_topic_modeling("./src/review.csv", output_dir='./src/lda_output')
    print("Topic modeling completed successfully!")

    # Example: reload previously saved results.
    # loaded_results = load_results('lda_output')
    # print(f"Loaded model with {loaded_results['optimal_topics']} topics")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|