# -*- coding: utf-8 -*- """Word Cloud.ipynb Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/1rwyDXgYaTJQJvXu2FPeggecHOxIYQ3l3 """ !pip install stop-words !pip install sastrawi !pip install transformers import pandas as pd import numpy as np import matplotlib.pyplot as plt import html import re import json from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import NMF from wordcloud import WordCloud from tqdm import tqdm from IPython.display import display from bs4 import BeautifulSoup from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory from stop_words import get_stop_words from collections import Counter from transformers import pipeline # =============================================== # --- Konfigurasi --- # =============================================== FILE_PATH = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/medsos (6).csv' N_TOPICS = 15 N_TOP_WORDS = 10 # top kata per topik (juga dipakai untuk wordcloud) SAMPLE_DATA_TO_SHOW = 5 # Jumlah sampel data yang ingin ditampilkan per sentimen # =============================================== # 1. Stopwords: stop_words + Sastrawi + tambahan # =============================================== stopwords_indonesia = get_stop_words('indonesian') factory = StopWordRemoverFactory() sastrawi_stopwords = factory.get_stop_words() additional_stopwords = [ 'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk', 'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi','apk', 'sllu','apknya','sngt','joos','ni','kak', # kata umum 'manfaatnya','ya','lbh','digunakan','semangat','dah','sangat','penting', 'lancar','cepat','senang','makasih','bermanfaat','keren','berguna','baik', 'indonesia','usaha','memudahkan','pokoknya','puas','mantap','dananya','luar', 'hati','ber','terimakasih','tepat','memudah','terbaik','mempermudah','praktis', 'simple','kadang','memuaskan','bagus','semoga','smoga','aplikasi','transaksi', 'kesimpulan','sip','pelayanannya','orang','manfaat','untuk','proses','membantu', 'pengiriman','muda','mantaap','kedepannya','pake','aktifitas','sejauh','untung', 'tenang','bikin','pakek','saldo','keluhan','dimanapun','cukup','menggunakan', 'sengat','banget','pakai','terpercaya','top','sukses', # hasil wordcloud 'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk','guna', 'baru','jelas','level','selengkapnya','yuk','mohon','punya','cara','hari', 'kota','news','baca','fitur','kasih','suruh', 'besar','sapa','bawa','atas','hidup','jaga','moga','kali','balas','perintah', 'masyarakat','ide','hadir','ikut','ingat','tali','alhamdulillah','sambut', 'masa','tuju','terima','ibu','silaturahmi','pasang','bangun','dukung', 'muhammad','teladan','tahun','insan','bulan','iman','erat','syukur', 'kabupaten','cirebon','langsung','cinta','kuat','tebar','hubung','ikat', 'resmi','giat','selenggara','luka','kendara','putih','fyp','reses','mulai', 'rctvcirebon','radarcirebon','temu','satu','factor','harap','wararctv', 'maksimal','salah','tiktokberita','kawasan','sangka','juang','merah','puluh', 'ribu','omo','argo','role','jati','tingkat','kata','emis','majalengka', 'madam','sebut','tawur','duga', # tambahan kata lain 'visi','saw','keras','sayang','bentuk','didik','jalin','keluarga','momen', 'program','baginda','hikmah','panjang','lingkung','wewararctv', 'magelang', 'kang', 'langkah', 'limpah', 'explore', 'tabindex', 'penuh', 'aa', 'rasa', 'tags', 'notranslate', 'desa', 'daerah', 'lengkap', 'aa', 'kunjung', 'laku', 'klik', 'berkah', 'aboutcirebon', 'jl', 'terus', 'hasil', 'instastory', 'taut', 'upaya', 'berita', 'beri', 'lanjut', 'pemkabcirebon', 'warga', 'pemkabcirebon', 'selamat', 'wujud', 'maju', 'wakil', 'ungkap', 'turut', 'pihak', 'wilayah', 'dinas', 'promo', 'pemkotcirebon', 'hadap', 'barat', 'layan', 'siap', 'milik', 'lokasi', 'ujar', 'rupa', 'gratis', 'daftar', 'jawa', 'tengah', 'kolaborasi', 'tempat', 'tegas', 'gelar', 'wib' # Bulan 'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus', 'september', 'oktober', 'november', 'desember' ] # ===== Tambahan stopwords untuk kata tidak jelas ===== noise_stopwords = [ 'by','zd','xyri','yu','uobl','ypdohk','xt','pz','lziwak','mp', 'rp','xdj','xexx','xggy','xjbqb','xstzfhl','link','class','hfl','xat', 'qhh','dhg','cr', 'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq', 'psk', 'href', 'hl', 'hd' , 'sy', 'amp', 'fbf', 'tags' ] CUSTOM_STOPWORDS = [ # HTML & atribut umum "class", "id", "span", "div", "href", "src", "style", "alt", "aria", "role", "tabindex", "button", "label", "img", "input", "placeholder", "form", "field", "hidden", "value", 'aa', # Token acak/huruf tunggal "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", # Kata noise berulang dari teks kamu "hfl", "xjbqb", "ejq", "ypdohk", "xexx", "hfr", "eyih", "dwj", "hkzxv", "yuc", "igjr", "eqks", "oq", "kjzd", "oxk", "zsgpy", "dycq", "g", "o", "wa", "wo", "ae", "ov", "vv", "uxc", # Kata teknis netral "content", "data", "video", "playlist", "source", "watch", "channel", "views", "subscribe", "update", "next", "prev", "click", "menu", "link", "button", "card", "section", # Angka & simbol sering muncul "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ] # Gabungkan semua stopwords final_stopwords = list(set(stopwords_indonesia + sastrawi_stopwords + additional_stopwords + noise_stopwords + CUSTOM_STOPWORDS)) # =============================================== # 2. Pembersihan HTML + Stemming Sastrawi # =============================================== stemmer = StemmerFactory().create_stemmer() html_noise = ['fbf','tabindex','tags','notranslate','aria-label','div','span','class'] noise_words = set(noise_stopwords + CUSTOM_STOPWORDS + html_noise) def clean_html(text): if pd.isna(text): return "" s = BeautifulSoup(str(text), "html.parser") for tag in s(["script", "style"]): tag.decompose() cleaned = s.get_text(separator=" ") cleaned = html.unescape(cleaned) cleaned = re.sub(r"\s+", " ", cleaned).strip() return cleaned def remove_single_letters(text): return re.sub(r"\b\w\b", "", text) def hapus (text): tokens = [word for word in text.split() if word not in noise_words] text = " ".join(tokens) return text def preprocess_text(text): # 1. Clean HTML text = clean_html(text) # 2. Lowercase text = text.lower() # 3. Stemming text = stemmer.stem(text) # 4. Hapus stopwords dan html noise tokens = [word for word in text.split() if word not in final_stopwords and word not in html_noise] # 5. Ambil hanya kata (huruf saja) tokens = [t for t in tokens if re.search(r"[a-zA-Z]", t)] # 6. Gabung kembali text = " ".join(tokens) # 7. Hapus huruf tunggal text = remove_single_letters(text) return text.strip() # =============================================== # 3. Load & Preprocess Dataset # =============================================== try: df = pd.read_csv(FILE_PATH) df.dropna(subset=['caption'], inplace=True) df['caption'] = df['caption'].astype(str) df['caption_clean'] = df['caption'].apply(preprocess_text) df['caption'] = df['caption'].apply(hapus) print("āœ… Dataset berhasil dimuat & dipreproses.") print(f"Jumlah data: {len(df)} baris") if 'caption_pred' in df.columns: print("\nDistribusi Sentimen (caption_pred):") print(df['caption_pred'].value_counts()) except FileNotFoundError: print(f"āŒ Error: File '{FILE_PATH}' tidak ditemukan.") raise SystemExit # =============================================== # 4. Fungsi utilitas # =============================================== def get_top_words_per_topic(model, feature_names, n_top_words): topics = {} for topic_idx, topic in enumerate(model.components_): top_features_ind = topic.argsort()[: -n_top_words - 1 : -1] top_features = [feature_names[i] for i in top_features_ind] topics[topic_idx] = top_features return topics def format_topics_sentences(topics): return {topic_idx: ", ".join(words) for topic_idx, words in topics.items()} def create_circular_wordcloud(words_list, title, n_words=10): text_data = " ".join(words_list[:n_words]) if not text_data.strip(): print(f"Tidak ada kata untuk word cloud '{title}'.") return x, y = np.ogrid[:400, :400] mask = (x - 200) ** 2 + (y - 200) ** 2 > 190 ** 2 mask = 255 * mask.astype(int) wc = WordCloud(width=800, height=800, background_color='white', colormap='viridis', mask=mask, contour_width=3, contour_color='steelblue').generate(text_data) plt.figure(figsize=(8, 8)) plt.imshow(wc, interpolation='bilinear') plt.title(title, fontsize=18, pad=15) plt.axis('off') plt.show() def get_top_words_by_doc_frequency(df_subset, n_top_words=10): word_doc_count = Counter() for text in df_subset['caption_clean'].fillna(""): tokens = [w for w in text.split() if not re.fullmatch(r"[a-z]", w)] unique_tokens = set(tokens) word_doc_count.update(unique_tokens) return word_doc_count.most_common(n_top_words) summarizer = pipeline( "summarization", model="google/mt5-small", tokenizer="google/mt5-small" ) def generate_summary(text, max_length=60, min_length=20): if not text or len(text.split()) < 10: return text try: result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False) return result[0]['summary_text'] except Exception as e: print(f"āš ļø Error summarizing: {e}") return text def summarize_text(corpus, n_topics=5, n_words=10): vectorizer = TfidfVectorizer(max_features=5000, stop_words='english') X = vectorizer.fit_transform(corpus) nmf = NMF(n_components=n_topics, random_state=42) nmf.fit(X) feature_names = vectorizer.get_feature_names_out() key_sentences = [] for topic_idx, topic in enumerate(nmf.components_): top_words = [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]] key_sentences.extend(top_words) # ubah jadi paragraf ringkas summary = " ".join(key_sentences) return summary # =============================================== # 5. GLOBAL Topic Modeling dan Pembuatan Ringkasan (PARAGRAF) # =============================================== print("\n--- 🧠 Memprediksi Topik dan Membuat Ringkasan untuk Semua Data ---") # šŸ”¹ Gabungkan caption + comment jadi satu teks df['combined_text'] = df['caption_clean'].fillna('') + " " + df['comments_pred'].fillna('') # --- TF-IDF Vectorizer --- global_vectorizer = TfidfVectorizer( max_df=0.9, min_df=10, max_features=1000, stop_words=final_stopwords, ngram_range=(1, 2) ) global_tfidf = global_vectorizer.fit_transform(df['combined_text']) global_feature_names = global_vectorizer.get_feature_names_out() # --- Bagian NMF + Summary --- if global_tfidf.shape[1] == 0: df['predicted_topic_id'] = -1 df['predicted_topic'] = "Tidak ada fitur yang cukup untuk modeling" df['summary'] = "Tidak dapat membuat ringkasan" print("āš ļø Peringatan: Kosakata terlalu sedikit setelah preprocessing. Topic modeling tidak dapat dilakukan.") else: global_nmf_model = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5) global_nmf_model.fit(global_tfidf) # Distribusi topik per dokumen topic_distribution = global_nmf_model.transform(global_tfidf) df['predicted_topic_id'] = np.argmax(topic_distribution, axis=1) # Ambil kata-kata penting tiap topik def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10): top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1] return [feature_names[i] for i in top_indices] # Mapping topik → keyword utama topic_keywords = {} for topic_idx in range(N_TOPICS): top_words = get_top_words_for_topic(global_nmf_model, global_feature_names, topic_idx, N_TOP_WORDS) topic_keywords[topic_idx] = ", ".join(top_words) df['predicted_topic'] = df['predicted_topic_id'].map(topic_keywords).fillna("Topik tidak teridentifikasi") # šŸ”¹ Update ringkasan pakai IndoBERT, berdasarkan teks gabungan df['summary'] = df['combined_text'].apply(lambda x: generate_summary(x)) print("āœ… Prediksi topik selesai, ringkasan memakai IndoBERT Summarization (gabungan caption + comment).") # Menampilkan hasil untuk verifikasi print("\n--- ✨ Contoh Hasil Prediksi Topik dan Ringkasan ---") display(df[['caption', 'comments_pred', 'predicted_topic', 'summary']].head(10)) # =============================================== # 6. Analisis per Sentimen + WordCloud + TAMPILKAN BUKTI BERDASARKAN KEYWORD # =============================================== analysis_result = {} # tempat simpan hasil JSON if 'caption_pred' in df.columns: sentiments = ['positif', 'negatif', 'netral'] # Pandas tampilkan teks penuh pd.set_option('display.max_colwidth', None) for sentiment in sentiments: print(f"\n\n=======================================================") print(f"šŸ“Š Analisis Mendalam untuk Sentimen: '{sentiment.upper()}'") print(f"=======================================================") subset_df = df[df['caption_pred'] == sentiment].copy() analysis_result[sentiment] = [] # list kosong untuk simpan hasil tiap sentimen if subset_df.empty: print(f"Tidak ada data untuk sentimen '{sentiment}'.") continue # 1. Dapatkan kata-kata teratas top_words_tuples = get_top_words_by_doc_frequency(subset_df, n_top_words=N_TOP_WORDS) if not top_words_tuples: print(f"Tidak ada kata signifikan pada sentimen '{sentiment}' untuk dianalisis.") continue # 2. Buat WordCloud words_list_for_wc = [word for word, count in top_words_tuples] create_circular_wordcloud(words_list_for_wc, f"WordCloud Sentimen {sentiment.upper()}", n_words=N_TOP_WORDS) # 3. Tampilkan bukti ringkasan print(f"\n--- šŸ“„ Bukti Ringkasan Berdasarkan Kata Kunci Populer ---") for word, doc_count in top_words_tuples: relevant_data = subset_df[ subset_df['caption_clean'].str.contains(r'\b{}\b'.format(re.escape(word)), case=False, na=False) ] summaries_list = [] if not relevant_data.empty: print(f"\nāœ… Kata Kunci: '{word}' (ditemukan dalam {len(relevant_data)} data pada sentimen ini)") for i, row in enumerate(relevant_data.itertuples(index=False), 1): caption = getattr(row, "caption_clean", "") link = getattr(row, "link", None) or getattr(row, "url", None) or "-" comment = getattr(row, "comments_pred", "") print(f" {i}. {caption} šŸ”— {link} šŸ’¬ {comment}") summaries_list.append({ "caption": caption, "link": link, "comment": comment }) else: print(f"\nāŒ Kata Kunci: '{word}' (tidak ditemukan data relevan untuk ditampilkan)") # tetap simpan ke JSON meskipun kosong analysis_result[sentiment].append({ "keyword": word, "count": int(len(relevant_data)), "summary": summaries_list }) else: print("\nKolom 'caption_pred' tidak ditemukan. Melewati analisis per sentimen.") # =============================================== # Simpan hasil JSON # =============================================== with open("sentiment_analysis_result.json", "w", encoding="utf-8") as f: json.dump(analysis_result, f, ensure_ascii=False, indent=4) print("\nšŸ“‚ Hasil analisis juga telah disimpan di 'sentiment_analysis_result.json'") # =============================================== # Prediksi Dataset Berita (judul, isi_berita, tag, link) # =============================================== FILE_BERITA = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita (6).csv' try: df_berita = pd.read_csv(FILE_BERITA) df_berita.dropna(subset=['isi_berita'], inplace=True) df_berita['isi_berita'] = df_berita['isi_berita'].astype(str) # Preprocessing isi_berita df_berita['isi_berita_clean'] = df_berita['isi_berita'].apply(preprocess_text) print("āœ… Dataset berita berhasil dimuat & dipreproses.") print(f"Jumlah data: {len(df_berita)} baris") except FileNotFoundError: print(f"āŒ Error: File '{FILE_BERITA}' tidak ditemukan.") raise SystemExit # =============================================== # Topic Modeling untuk berita # =============================================== print("\n--- 🧠 Memprediksi Topik & Ringkasan untuk Dataset Berita ---") # šŸ”¹ Gabungkan isi_berita_clean + judul + tag df_berita['combined_text'] = ( df_berita['isi_berita_clean'].fillna('') + " " + df_berita['judul'].fillna('') + " " + df_berita['tag'].fillna('') ) # --- TF-IDF Vectorizer --- vectorizer_berita = TfidfVectorizer( max_df=0.9, min_df=5, max_features=1000, stop_words=final_stopwords, ngram_range=(1, 2) ) tfidf_berita = vectorizer_berita.fit_transform(df_berita['combined_text']) feature_names_berita = vectorizer_berita.get_feature_names_out() if tfidf_berita.shape[1] == 0: df_berita['predicted_topic_id'] = -1 df_berita['predicted_topic'] = "Tidak cukup fitur untuk modeling" df_berita['summary'] = "Tidak dapat membuat ringkasan" else: nmf_berita = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5) nmf_berita.fit(tfidf_berita) topic_dist_berita = nmf_berita.transform(tfidf_berita) df_berita['predicted_topic_id'] = np.argmax(topic_dist_berita, axis=1) # Ambil kata topik def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10): top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1] return [feature_names[i] for i in top_indices] topic_keywords_berita = {} for topic_idx in range(N_TOPICS): top_words = get_top_words_for_topic(nmf_berita, feature_names_berita, topic_idx, N_TOP_WORDS) topic_keywords_berita[topic_idx] = ", ".join(top_words) df_berita['predicted_topic'] = df_berita['predicted_topic_id'].map(topic_keywords_berita).fillna("Topik tidak teridentifikasi") # šŸ”¹ Summarization IndoBERT (Google mT5) df_berita['summary'] = df_berita['isi_berita'].apply(lambda x: generate_summary(x)) print("āœ… Prediksi topik & ringkasan berita selesai.") # =============================================== # Simpan hasil JSON # =============================================== output_data = [] for row in df_berita.itertuples(index=False): output_data.append({ "judul": getattr(row, "judul", ""), "tag": getattr(row, "tag", ""), "link": getattr(row, "link", ""), "isi_berita": getattr(row, "isi_berita", ""), "isi_berita_clean": getattr(row, "isi_berita_clean", ""), "predicted_topic": getattr(row, "predicted_topic", ""), "summary": getattr(row, "summary", "") }) with open("berita_analysis_result.json", "w", encoding="utf-8") as f: json.dump(output_data, f, ensure_ascii=False, indent=4) print("\nšŸ“‚ Hasil analisis berita disimpan di 'berita_analysis_result.json'") !pip install pyngrok flask from flask import Flask, jsonify from pyngrok import ngrok import json # Masukkan token ngrok kamu ngrok.set_auth_token("31odwJIHeYFk9aOrDfXDajKjK87_7esvX4phWySwTCG3BQ1R2") # Load JSON hasil analisis sentiment with open("sentiment_analysis_result.json", "r", encoding="utf-8") as f: sentiment_result = json.load(f) # Load JSON hasil analisis berita with open("berita_analysis_result.json", "r", encoding="utf-8") as f: berita_result = json.load(f) # Inisialisasi Flask app = Flask(__name__) # Endpoint untuk sentiment @app.route("/api/sentiment", methods=["GET"]) def api_sentiment(): return jsonify(sentiment_result) # Endpoint untuk berita @app.route("/api/berita", methods=["GET"]) def api_berita(): return jsonify(berita_result) # Jalankan Flask di port 5000 port = 5000 public_url = ngrok.connect(port) print("šŸ”— Public URL:", public_url) app.run(port=port)