| |
| """Word Cloud.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1rwyDXgYaTJQJvXu2FPeggecHOxIYQ3l3 |
| """ |
|
|
| !pip install stop-words |
| !pip install sastrawi |
| !pip install transformers |
|
|
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
|
|
| import html |
| import re |
| import json |
|
|
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.decomposition import NMF |
| from wordcloud import WordCloud |
| from tqdm import tqdm |
| from IPython.display import display |
| from bs4 import BeautifulSoup |
| from Sastrawi.Stemmer.StemmerFactory import StemmerFactory |
| from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory |
| from stop_words import get_stop_words |
| from collections import Counter |
| from transformers import pipeline |
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# CSV holding the social-media posts; it is loaded with pd.read_csv below.
FILE_PATH = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/medsos (6).csv'
# Number of NMF components, and how many top words describe each component.
N_TOPICS, N_TOP_WORDS = 15, 10
# Not referenced anywhere else in the visible part of this script.
SAMPLE_DATA_TO_SHOW = 5

# ---------------------------------------------------------------------------
# Base stopword lists supplied by the two third-party packages
# ---------------------------------------------------------------------------
factory = StopWordRemoverFactory()
sastrawi_stopwords = factory.get_stop_words()
stopwords_indonesia = get_stop_words('indonesian')
|
|
# Hand-maintained stopwords that are merged into final_stopwords further down.
# The groups below follow the blank-line separation of the original list.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (previously 'wib' + 'januari' became the
# single entry 'wibjanuari', so neither word was actually filtered).
additional_stopwords = [
    # Group 1
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya', 'oke', 'ok', 'bgt', 'jg', 'utk',
    'deh', 'sih', 'kok', 'dong', 'udah', 'sdh', 'blm', 'bgmn', 'dgn', 'lgi', 'apk',
    'sllu', 'apknya', 'sngt', 'joos', 'ni', 'kak',

    # Group 2
    'manfaatnya', 'ya', 'lbh', 'digunakan', 'semangat', 'dah', 'sangat', 'penting',
    'lancar', 'cepat', 'senang', 'makasih', 'bermanfaat', 'keren', 'berguna', 'baik',
    'indonesia', 'usaha', 'memudahkan', 'pokoknya', 'puas', 'mantap', 'dananya', 'luar',
    'hati', 'ber', 'terimakasih', 'tepat', 'memudah', 'terbaik', 'mempermudah', 'praktis',
    'simple', 'kadang', 'memuaskan', 'bagus', 'semoga', 'smoga', 'aplikasi', 'transaksi',
    'kesimpulan', 'sip', 'pelayanannya', 'orang', 'manfaat', 'untuk', 'proses', 'membantu',
    'pengiriman', 'muda', 'mantaap', 'kedepannya', 'pake', 'aktifitas', 'sejauh', 'untung',
    'tenang', 'bikin', 'pakek', 'saldo', 'keluhan', 'dimanapun', 'cukup', 'menggunakan',
    'sengat', 'banget', 'pakai', 'terpercaya', 'top', 'sukses',

    # Group 3
    'hp', 'tolong', 'gimana', 'iya', 'jadi', 'ambil', 'buka', 'butuh', 'masuk', 'guna',
    'baru', 'jelas', 'level', 'selengkapnya', 'yuk', 'mohon', 'punya', 'cara', 'hari',
    'kota', 'news', 'baca', 'fitur', 'kasih', 'suruh',
    'besar', 'sapa', 'bawa', 'atas', 'hidup', 'jaga', 'moga', 'kali', 'balas', 'perintah',
    'masyarakat', 'ide', 'hadir', 'ikut', 'ingat', 'tali', 'alhamdulillah', 'sambut',
    'masa', 'tuju', 'terima', 'ibu', 'silaturahmi', 'pasang', 'bangun', 'dukung',
    'muhammad', 'teladan', 'tahun', 'insan', 'bulan', 'iman', 'erat', 'syukur',
    'kabupaten', 'cirebon', 'langsung', 'cinta', 'kuat', 'tebar', 'hubung', 'ikat',
    'resmi', 'giat', 'selenggara', 'luka', 'kendara', 'putih', 'fyp', 'reses', 'mulai',
    'rctvcirebon', 'radarcirebon', 'temu', 'satu', 'factor', 'harap', 'wararctv',
    'maksimal', 'salah', 'tiktokberita', 'kawasan', 'sangka', 'juang', 'merah', 'puluh',
    'ribu', 'omo', 'argo', 'role', 'jati', 'tingkat', 'kata', 'emis', 'majalengka',
    'madam', 'sebut', 'tawur', 'duga',

    # Group 4
    'visi', 'saw', 'keras', 'sayang', 'bentuk', 'didik', 'jalin', 'keluarga', 'momen',
    'program', 'baginda', 'hikmah', 'panjang', 'lingkung', 'wewararctv', 'magelang',
    'kang', 'langkah', 'limpah', 'explore', 'tabindex', 'penuh', 'aa', 'rasa', 'tags',
    'notranslate', 'desa', 'daerah', 'lengkap', 'aa', 'kunjung', 'laku', 'klik', 'berkah',
    'aboutcirebon', 'jl', 'terus', 'hasil', 'instastory', 'taut', 'upaya', 'berita',
    'beri', 'lanjut', 'pemkabcirebon', 'warga', 'pemkabcirebon', 'selamat', 'wujud', 'maju',
    'wakil', 'ungkap', 'turut', 'pihak', 'wilayah', 'dinas', 'promo', 'pemkotcirebon', 'hadap',
    'barat', 'layan', 'siap', 'milik', 'lokasi', 'ujar', 'rupa', 'gratis', 'daftar', 'jawa', 'tengah',
    'kolaborasi', 'tempat', 'tegas', 'gelar', 'wib',

    # Month names
    'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus', 'september',
    'oktober', 'november', 'desember',
]
|
|
| |
# Junk tokens (markup fragments and random-looking strings) to filter out.
# Written as whitespace-separated text and split into a list of words.
noise_stopwords = (
    "by zd xyri yu uobl ypdohk xt pz lziwak mp "
    "rp xdj xexx xggy xjbqb xstzfhl link class hfl xat "
    "qhh dhg cr tdsg ct etr nq oe ejq psk href "
    "hl hd sy amp fbf tags"
).split()

# HTML tag / attribute names.
CUSTOM_STOPWORDS = (
    "class id span div href src style alt "
    "aria role tabindex button label img input "
    "placeholder form field hidden value aa"
).split()
# Every single lowercase ASCII letter.
CUSTOM_STOPWORDS += list("abcdefghijklmnopqrstuvwxyz")
# Random-looking tokens ("g" and "o" are repeated here, as in the original list).
CUSTOM_STOPWORDS += (
    "hfl xjbqb ejq ypdohk xexx hfr eyih "
    "dwj hkzxv yuc igjr eqks oq kjzd oxk "
    "zsgpy dycq g o wa wo ae ov vv uxc"
).split()
# Page / player UI vocabulary.
CUSTOM_STOPWORDS += (
    "content data video playlist source watch "
    "channel views subscribe update next prev "
    "click menu link button card section"
).split()
# Single digits.
CUSTOM_STOPWORDS += list("0123456789")
|
|
| |
# De-duplicated union of every stopword source; kept as a list because it is
# passed to TfidfVectorizer(stop_words=...) later in the script.
final_stopwords = list(set([
    *stopwords_indonesia,
    *sastrawi_stopwords,
    *additional_stopwords,
    *noise_stopwords,
    *CUSTOM_STOPWORDS,
]))

# Markup leftovers checked in addition to final_stopwords by preprocess_text.
html_noise = ['fbf', 'tabindex', 'tags', 'notranslate', 'aria-label', 'div', 'span', 'class']
# Set used by hapus() to scrub the raw (unstemmed) caption.
noise_words = set([*noise_stopwords, *CUSTOM_STOPWORDS, *html_noise])

# Sastrawi stemmer shared by every preprocess_text call.
stemmer = StemmerFactory().create_stemmer()
|
|
def clean_html(text):
    """Return the visible text of an HTML fragment as one single-spaced line.

    Args:
        text: Raw value, converted with ``str`` before parsing.

    Returns:
        ``""`` when ``pd.isna(text)`` is true; otherwise the text content
        with ``<script>``/``<style>`` subtrees removed, HTML entities
        unescaped, whitespace runs collapsed to one space and both ends
        stripped.
    """
    if pd.isna(text):
        return ""

    soup = BeautifulSoup(str(text), "html.parser")
    # Remove script/style subtrees so their contents never reach get_text().
    for node in soup.find_all(["script", "style"]):
        node.decompose()

    plain = html.unescape(soup.get_text(separator=" "))
    return re.sub(r"\s+", " ", plain).strip()
|
|
def remove_single_letters(text):
    """Delete every standalone one-character word from *text*.

    A "word character" is whatever the regex class ``\\w`` matches, so lone
    digits and underscores are removed as well. Surrounding whitespace is
    left untouched, which can leave consecutive spaces behind.
    """
    lone_char = re.compile(r"\b\w\b")
    return lone_char.sub("", text)
|
|
def hapus(text):
    """Drop every whitespace-separated token of *text* found in ``noise_words``.

    The comparison is exact (case-sensitive); the surviving tokens are
    re-joined with single spaces.
    """
    kept = filter(lambda token: token not in noise_words, text.split())
    return " ".join(kept)
|
|
|
|
def preprocess_text(text):
    """Turn raw caption/article text into a cleaned, stemmed token string.

    Steps, in order:
      1. strip HTML with ``clean_html``;
      2. lower-case;
      3. stem with the module-level Sastrawi ``stemmer``;
      4. drop tokens listed in ``final_stopwords`` or ``html_noise``;
      5. drop tokens that contain no ASCII letter;
      6. drop standalone single characters via ``remove_single_letters``.

    Args:
        text: Raw text (may be HTML, may be NaN/None).

    Returns:
        The remaining tokens joined by spaces, stripped at both ends.
    """
    text = clean_html(text).lower()
    text = stemmer.stem(text)

    # Build the exclusion set once per call so each token costs one hash
    # lookup instead of a linear scan over two lists.
    blocked = set(final_stopwords)
    blocked.update(html_noise)

    tokens = [
        word for word in text.split()
        if word not in blocked and re.search(r"[a-zA-Z]", word)
    ]

    return remove_single_letters(" ".join(tokens)).strip()
|
|
| |
| |
| |
# Load the social-media dataset and derive the cleaned caption column.
# Stops the script (SystemExit) when the CSV cannot be found.
try:
    df = pd.read_csv(FILE_PATH)
    df.dropna(subset=['caption'], inplace=True)
    df['caption'] = df['caption'].astype(str)
    # caption_clean must be computed from the raw caption, i.e. before the
    # caption column itself is overwritten by hapus() on the next line.
    df['caption_clean'] = df['caption'].apply(preprocess_text)
    df['caption'] = df['caption'].apply(hapus)

    # The check-mark emoji was mis-encoded and split this literal over two lines.
    print("✅ Dataset berhasil dimuat & dipreproses.")
    print(f"Jumlah data: {len(df)} baris")
    if 'caption_pred' in df.columns:
        print("\nDistribusi Sentimen (caption_pred):")
        print(df['caption_pred'].value_counts())
except FileNotFoundError:
    # NOTE(review): the leading "β" is mojibake of an emoji that cannot be
    # identified from this file; restore it from the original notebook.
    print(f"β Error: File '{FILE_PATH}' tidak ditemukan.")
    raise SystemExit
|
|
| |
| |
| |
def get_top_words_per_topic(model, feature_names, n_top_words):
    """Map each topic index to its highest-weighted feature names.

    Args:
        model: Object exposing ``components_`` (one weight row per topic).
        feature_names: Sequence indexed by feature column.
        n_top_words: Number of features to keep per topic.

    Returns:
        ``{topic_idx: [name, ...]}`` with names ordered from the largest
        weight downwards.
    """
    return {
        idx: [feature_names[col] for col in weights.argsort()[::-1][:n_top_words]]
        for idx, weights in enumerate(model.components_)
    }
|
|
def format_topics_sentences(topics):
    """Return a copy of *topics* whose word lists are joined with ``", "``."""
    formatted = {}
    for idx in topics:
        formatted[idx] = ", ".join(topics[idx])
    return formatted
|
|
def create_circular_wordcloud(words_list, title, n_words=10):
    """Plot a circular word cloud built from the first *n_words* words.

    Prints a notice and returns without plotting when the selected words
    contain nothing but whitespace.

    Args:
        words_list: Words to draw.
        title: Figure title (also used in the "no words" notice).
        n_words: How many leading entries of *words_list* to use.
    """
    joined = " ".join(words_list[:n_words])
    if joined.strip() == "":
        print(f"Tidak ada kata untuk word cloud '{title}'.")
        return

    # 400x400 mask: 255 outside a radius-190 circle centred at (200, 200),
    # 0 inside it.
    rows, cols = np.ogrid[:400, :400]
    outside_circle = (rows - 200) ** 2 + (cols - 200) ** 2 > 190 ** 2
    circle_mask = 255 * outside_circle.astype(int)

    cloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        colormap='viridis',
        mask=circle_mask,
        contour_width=3,
        contour_color='steelblue',
    )
    cloud.generate(joined)

    plt.figure(figsize=(8, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(title, fontsize=18, pad=15)
    plt.axis('off')
    plt.show()
|
|
def get_top_words_by_doc_frequency(df_subset, n_top_words=10):
    """Rank words by the number of rows of ``caption_clean`` containing them.

    Each word is counted at most once per row; tokens that are a single
    lowercase ASCII letter are ignored and missing captions count as empty.

    Returns:
        ``[(word, document_count), ...]`` as produced by
        ``Counter.most_common(n_top_words)``.
    """
    single_letter = re.compile(r"[a-z]")
    doc_counts = Counter()
    for caption in df_subset['caption_clean'].fillna(""):
        # A set, so repeated words inside one caption are counted once.
        doc_counts.update(
            {tok for tok in caption.split() if single_letter.fullmatch(tok) is None}
        )
    return doc_counts.most_common(n_top_words)
|
|
# Hugging Face summarization pipeline shared by generate_summary(); both the
# model and the tokenizer come from the google/mt5-small checkpoint.
summarizer = pipeline(task="summarization", model="google/mt5-small", tokenizer="google/mt5-small")
|
|
def generate_summary(text, max_length=60, min_length=20):
    """Summarise *text* with the module-level ``summarizer`` pipeline.

    Falsy input and text with fewer than 10 whitespace-separated words are
    returned unchanged. Any exception raised while summarising is printed
    and the original text is returned instead (best effort).

    Args:
        text: Text to summarise.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.
    """
    too_short = not text or len(text.split()) < 10
    if too_short:
        return text

    try:
        first = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]
        return first['summary_text']
    except Exception as e:
        print(f"β οΈ Error summarizing: {e}")
        return text
|
|
def summarize_text(corpus, n_topics=5, n_words=10):
    """Build a keyword "summary" of *corpus* from NMF topics over TF-IDF.

    Args:
        corpus: Iterable of documents handed to ``TfidfVectorizer``.
        n_topics: Number of NMF components.
        n_words: Top-weighted words taken from each component.

    Returns:
        One string holding the top words of every topic, topic after topic,
        separated by single spaces.
    """
    # NOTE(review): stop_words='english' differs from the rest of this
    # script, which filters with final_stopwords - confirm this is intended.
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    doc_term = tfidf.fit_transform(corpus)
    vocabulary = tfidf.get_feature_names_out()

    topic_model = NMF(n_components=n_topics, random_state=42)
    topic_model.fit(doc_term)

    keywords = []
    for weights in topic_model.components_:
        best_columns = weights.argsort()[:-n_words - 1:-1]
        keywords += [vocabulary[col] for col in best_columns]

    return " ".join(keywords)
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Topic prediction + summary for the social-media dataset
# ---------------------------------------------------------------------------
print("\n--- 🧠 Memprediksi Topik dan Membuat Ringkasan untuk Semua Data ---")

# Text used for both topic modelling and summarisation:
# cleaned caption followed by the comments_pred column.
df['combined_text'] = df['caption_clean'].fillna('') + " " + df['comments_pred'].fillna('')

global_vectorizer = TfidfVectorizer(
    max_df=0.9,
    min_df=10,
    max_features=1000,
    stop_words=final_stopwords,
    ngram_range=(1, 2),
)

# fit_transform raises ValueError (it does not return a zero-column matrix)
# when no term survives the stopword / min_df / max_df pruning, so that case
# has to be caught for the fallback branch below to be reachable at all.
try:
    global_tfidf = global_vectorizer.fit_transform(df['combined_text'])
except ValueError as err:
    print(f"TfidfVectorizer: {err}")
    global_tfidf = None

if global_tfidf is None or global_tfidf.shape[1] == 0:
    df['predicted_topic_id'] = -1
    df['predicted_topic'] = "Tidak ada fitur yang cukup untuk modeling"
    df['summary'] = "Tidak dapat membuat ringkasan"
    print("⚠️ Peringatan: Kosakata terlalu sedikit setelah preprocessing. Topic modeling tidak dapat dilakukan.")
else:
    global_feature_names = global_vectorizer.get_feature_names_out()

    global_nmf_model = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
    global_nmf_model.fit(global_tfidf)

    # Each row is assigned the topic with the largest weight.
    topic_distribution = global_nmf_model.transform(global_tfidf)
    df['predicted_topic_id'] = np.argmax(topic_distribution, axis=1)

    # Reuse the module-level helpers instead of a nested duplicate:
    # {topic_idx: "word1, word2, ..."} with N_TOP_WORDS words per topic.
    topic_keywords = format_topics_sentences(
        get_top_words_per_topic(global_nmf_model, global_feature_names, N_TOP_WORDS)
    )
    df['predicted_topic'] = df['predicted_topic_id'].map(topic_keywords).fillna("Topik tidak teridentifikasi")

    df['summary'] = df['combined_text'].apply(generate_summary)

    # The message now names the model the summarizer pipeline actually loads.
    print("✅ Prediksi topik selesai, ringkasan memakai google/mt5-small (gabungan caption + comment).")

print("\n--- ✨ Contoh Hasil Prediksi Topik dan Ringkasan ---")
display(df[['caption', 'comments_pred', 'predicted_topic', 'summary']].head(10))
|
|
| |
| |
| |
def _missing_to(value, default):
    """Return *default* for a None/NaN cell, otherwise *value* unchanged.

    NaN cells would be written by ``json.dump`` as the bare token ``NaN``,
    which strict JSON parsers reject.
    """
    if value is None:
        return default
    if not isinstance(value, str) and pd.isna(value):
        return default
    return value


# {sentiment: [{"keyword": ..., "count": ..., "summary": [rows...]}, ...]}
analysis_result = {}

if 'caption_pred' in df.columns:
    sentiments = ['positif', 'negatif', 'netral']

    # Show full cell contents when frames are displayed.
    pd.set_option('display.max_colwidth', None)

    # NOTE(review): the lone "π" / "β" glyphs in the messages below are
    # mojibake of emoji that cannot be identified from this file; they are
    # left as found. Only unambiguous ones (✅, 💬) were restored.
    for sentiment in sentiments:
        print("\n\n=======================================================")
        print(f"π Analisis Mendalam untuk Sentimen: '{sentiment.upper()}'")
        print("=======================================================")

        subset_df = df[df['caption_pred'] == sentiment].copy()
        analysis_result[sentiment] = []

        if subset_df.empty:
            print(f"Tidak ada data untuk sentimen '{sentiment}'.")
            continue

        # Words ranked by the number of captions they occur in.
        top_words_tuples = get_top_words_by_doc_frequency(subset_df, n_top_words=N_TOP_WORDS)

        if not top_words_tuples:
            print(f"Tidak ada kata signifikan pada sentimen '{sentiment}' untuk dianalisis.")
            continue

        words_list_for_wc = [word for word, _count in top_words_tuples]
        create_circular_wordcloud(words_list_for_wc, f"WordCloud Sentimen {sentiment.upper()}", n_words=N_TOP_WORDS)

        print("\n--- π Bukti Ringkasan Berdasarkan Kata Kunci Populer ---")

        for word, _doc_count in top_words_tuples:
            # Whole-word, case-insensitive match inside the cleaned caption.
            pattern = r'\b{}\b'.format(re.escape(word))
            relevant_data = subset_df[
                subset_df['caption_clean'].str.contains(pattern, case=False, na=False)
            ]

            summaries_list = []
            if not relevant_data.empty:
                print(f"\n✅ Kata Kunci: '{word}' (ditemukan dalam {len(relevant_data)} data pada sentimen ini)")

                for i, row in enumerate(relevant_data.itertuples(index=False), 1):
                    caption = getattr(row, "caption_clean", "")
                    # Prefer "link", then "url", then "-"; NaN counts as missing.
                    link = (
                        _missing_to(getattr(row, "link", None), None)
                        or _missing_to(getattr(row, "url", None), None)
                        or "-"
                    )
                    comment = _missing_to(getattr(row, "comments_pred", ""), "")
                    print(f" {i}. {caption} π {link} 💬 {comment}")

                    summaries_list.append({
                        "caption": caption,
                        "link": link,
                        "comment": comment,
                    })
            else:
                print(f"\nβ Kata Kunci: '{word}' (tidak ditemukan data relevan untuk ditampilkan)")

            analysis_result[sentiment].append({
                "keyword": word,
                "count": int(len(relevant_data)),
                "summary": summaries_list,
            })
else:
    print("\nKolom 'caption_pred' tidak ditemukan. Melewati analisis per sentimen.")
|
|
| |
| |
| |
# Persist the per-sentiment analysis as pretty-printed UTF-8 JSON
# (non-ASCII characters are written as-is, not escaped).
serialized_result = json.dumps(analysis_result, ensure_ascii=False, indent=4)
with open("sentiment_analysis_result.json", "w", encoding="utf-8") as f:
    f.write(serialized_result)

print("\nπ Hasil analisis juga telah disimpan di 'sentiment_analysis_result.json'")
|
|
| |
| |
| |
|
|
# CSV holding the news articles.
FILE_BERITA = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita (6).csv'

# Load the news dataset and derive the cleaned article-body column.
# Stops the script (SystemExit) when the CSV cannot be found.
try:
    df_berita = pd.read_csv(FILE_BERITA)
    df_berita.dropna(subset=['isi_berita'], inplace=True)
    df_berita['isi_berita'] = df_berita['isi_berita'].astype(str)

    df_berita['isi_berita_clean'] = df_berita['isi_berita'].apply(preprocess_text)

    # The check-mark emoji was mis-encoded and split this literal over two lines.
    print("✅ Dataset berita berhasil dimuat & dipreproses.")
    print(f"Jumlah data: {len(df_berita)} baris")
except FileNotFoundError:
    # NOTE(review): the leading "β" is mojibake of an emoji that cannot be
    # identified from this file; restore it from the original notebook.
    print(f"β Error: File '{FILE_BERITA}' tidak ditemukan.")
    raise SystemExit
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Topic prediction + summary for the news dataset
# ---------------------------------------------------------------------------
print("\n--- 🧠 Memprediksi Topik & Ringkasan untuk Dataset Berita ---")

# Text used for topic modelling: cleaned body + title + tag.
df_berita['combined_text'] = (
    df_berita['isi_berita_clean'].fillna('') + " " +
    df_berita['judul'].fillna('') + " " +
    df_berita['tag'].fillna('')
)

vectorizer_berita = TfidfVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=1000,
    stop_words=final_stopwords,
    ngram_range=(1, 2),
)

# fit_transform raises ValueError (it does not return a zero-column matrix)
# when no term survives the stopword / min_df / max_df pruning, so that case
# has to be caught for the fallback branch below to be reachable at all.
try:
    tfidf_berita = vectorizer_berita.fit_transform(df_berita['combined_text'])
except ValueError as err:
    print(f"TfidfVectorizer: {err}")
    tfidf_berita = None

if tfidf_berita is None or tfidf_berita.shape[1] == 0:
    df_berita['predicted_topic_id'] = -1
    df_berita['predicted_topic'] = "Tidak cukup fitur untuk modeling"
    df_berita['summary'] = "Tidak dapat membuat ringkasan"
else:
    feature_names_berita = vectorizer_berita.get_feature_names_out()

    nmf_berita = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
    nmf_berita.fit(tfidf_berita)

    # Each article is assigned the topic with the largest weight.
    topic_dist_berita = nmf_berita.transform(tfidf_berita)
    df_berita['predicted_topic_id'] = np.argmax(topic_dist_berita, axis=1)

    # Reuse the module-level helpers instead of a nested duplicate:
    # {topic_idx: "word1, word2, ..."} with N_TOP_WORDS words per topic.
    topic_keywords_berita = format_topics_sentences(
        get_top_words_per_topic(nmf_berita, feature_names_berita, N_TOP_WORDS)
    )
    df_berita['predicted_topic'] = df_berita['predicted_topic_id'].map(topic_keywords_berita).fillna("Topik tidak teridentifikasi")

    # Summaries are generated from the raw (uncleaned) article body.
    df_berita['summary'] = df_berita['isi_berita'].apply(generate_summary)

# The check-mark emoji was mis-encoded and split this literal over two lines.
print("✅ Prediksi topik & ringkasan berita selesai.")
|
|
| |
| |
| |
def _blank_if_missing(value):
    """Return ``""`` for a None/NaN cell, otherwise *value* unchanged.

    NaN cells would be written by ``json.dump`` as the bare token ``NaN``,
    which strict JSON parsers reject.
    """
    if value is None:
        return ""
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return value


# Columns exported per article; a column absent from df_berita yields "".
_BERITA_EXPORT_FIELDS = (
    "judul",
    "tag",
    "link",
    "isi_berita",
    "isi_berita_clean",
    "predicted_topic",
    "summary",
)

output_data = [
    {field: _blank_if_missing(getattr(row, field, "")) for field in _BERITA_EXPORT_FIELDS}
    for row in df_berita.itertuples(index=False)
]

with open("berita_analysis_result.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)

print("\nπ Hasil analisis berita disimpan di 'berita_analysis_result.json'")
|
|
| !pip install pyngrok flask |
|
|
import json
import os

from flask import Flask, jsonify
from pyngrok import ngrok

# SECURITY: the ngrok auth token used to be hard-coded on this line. It is
# now read from the NGROK_AUTH_TOKEN environment variable; the token that was
# committed here should be treated as leaked and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if not ngrok_auth_token:
    print("Error: variabel lingkungan NGROK_AUTH_TOKEN belum diset.")
    raise SystemExit(1)
ngrok.set_auth_token(ngrok_auth_token)

# Both result files are read once at start-up and served from memory.
with open("sentiment_analysis_result.json", "r", encoding="utf-8") as f:
    sentiment_result = json.load(f)

with open("berita_analysis_result.json", "r", encoding="utf-8") as f:
    berita_result = json.load(f)

app = Flask(__name__)


@app.route("/api/sentiment", methods=["GET"])
def api_sentiment():
    """Return the per-sentiment keyword analysis as JSON."""
    return jsonify(sentiment_result)


@app.route("/api/berita", methods=["GET"])
def api_berita():
    """Return the per-article topic/summary records as JSON."""
    return jsonify(berita_result)


# Open the ngrok tunnel first, then start the (blocking) Flask server on the
# same local port.
port = 5000
public_url = ngrok.connect(port)
print("π Public URL:", public_url)

app.run(port=port)