# -*- coding: utf-8 -*- """SentimentAnalysis Automatically generated by Colab. Original file is located at https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/fatihramadhan/sentimentanalysis.74f160cb-74cc-4609-ba85-0081c3654a18.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20260326/auto/storage/goog4_request%26X-Goog-Date%3D20260326T141800Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2fe877a762338b5e556a035ce46a5a6bf9c51c0d33c4b062e919cfd44e0297ff787b3a23bf4290b33ca0467d04cf7ba377d77c975cd79da4f1adfec176cb7d78d1eddf1eec10e87d86e656200eaed9b0781f5f5d215ee084957aa5a30c2e9fa1731c23b333d5f742767875bd84e34b83339d834639567639d817ad1295fbc8fd552a5ae92f938b90cb8d916b4a7190e208c6d0effdc10665a9405efffc12a2d4497159428e898204e32ad2d629a58e985c020c7febef459895fd34b052c37a041102284e207ed788a6490c64656ece6150fc355120a49cf2b2fdadda53018d3dba4f8aeda15faaa1eb9c9cef82a476c38be69504e5a5f98cf61686a2b337ea77 """ # IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES, # THEN FEEL FREE TO DELETE THIS CELL. # NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON # ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR # NOTEBOOK. 
import kagglehub

# Download (or locate) the Kaggle dataset. The return value is the local
# directory that contains the dataset files.
fatihramadhan_sentimentdataset_path = kagglehub.dataset_download('fatihramadhan/sentimentdataset')
print('Data source import complete.')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import html
import torch
import evaluate
import os
import transformers
import inspect
import joblib
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import resample

# ----------------------------
# Configuration
# ----------------------------
# Prefer the CSV inside the directory kagglehub just returned, so the script
# also works outside Kaggle. Fall back to the original hard-coded Kaggle
# location when the file is not found there.
_KAGGLE_INPUT_PATH = "/kaggle/input/sentimentdataset/dataset_gabungan.csv"
_downloaded_csv = Path(fatihramadhan_sentimentdataset_path) / "dataset_gabungan.csv"
INPUT_PATH = str(_downloaded_csv) if _downloaded_csv.exists() else _KAGGLE_INPUT_PATH

# Set to False when using a cased model.
APPLY_LOWERCASE = True

# Limit repeated letters (example: "baguuuusss" -> "baguuss").
LIMIT_REPEAT_CHARS = True
MAX_REPEAT = 2

# Column names (leave as None to have them guessed automatically).
TEXT_COL = None
LABEL_COL = None

# Supported label spellings, each mapped to its canonical form.
CANON_LABELS = {
    "positif": "positif", "positive": "positif", "pos": "positif", 'positi': 'positif',
    "negatif": "negatif", "negative": "negatif", "neg": "negatif", 'negartif': 'negatif',
    "netral": "netral", "neutral": "netral", "neu": "netral", 'netr': 'netral',
}


# ----------------------------
# Utilities
# ----------------------------
def guess_column(df: pd.DataFrame, candidates):
    """Return the name of the column in *df* to use for a given role.

    The first entry of *candidates* that is a column of *df* wins. If none
    matches, the first text-like column (object or string dtype) is
    returned, and if there is none, the first column of the frame.

    Raises:
        IndexError: if *df* has no columns at all.
    """
    for name in candidates:
        if name in df.columns:
            return name

    # Fallback: first text-like column. Checking the string dtype as well as
    # ``object`` keeps this working when pandas stores text in a dedicated
    # string dtype.
    text_like = [
        col for col in df.columns
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
    ]
    return text_like[0] if text_like else df.columns[0]


url_pattern = re.compile(r"(https?://\S+|www\.\S+)")
mention_pattern = re.compile(r"@\w+")
hashtag_pattern = re.compile(r"#(\w+)")
multi_space_pattern = re.compile(r"\s+")
rt_fw_pattern = re.compile(r"\b(rt|fw|fwd)\b[:]?", flags=re.IGNORECASE)
# Optional: pattern frequently seen in scraped comment data; removes the
# whole "author ... comment" segment.
author_comment_pattern = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)

# Matches a complete HTML tag such as <b> or </div ...>.
TAG_RE = re.compile(r"<[^>]+>")
# Matches HTML attributes that survive as plain text once tags are stripped.
ATTR_RE = re.compile(r"\b(class|id|style|role|tabindex|href|src|alt)=[^\s>]+", flags=re.IGNORECASE)


def limit_repeated_chars(text: str, max_repeat: int = 2) -> str:
    """Shorten runs of one repeated letter to at most *max_repeat* letters.

    Only letters are affected. Digits are left alone so that numbers keep
    their value ("1000000" is not collapsed to "100").
    """
    # [^\W\d_] is "a word character that is neither a digit nor underscore",
    # i.e. a letter.
    return re.sub(r"([^\W\d_])\1{%d,}" % (max_repeat), r"\1" * max_repeat, text)


def remove_html_elements(text: str) -> str:
    """Strip HTML tags, stray attributes and angle brackets from *text*.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    # Unescape HTML entities (e.g. "&amp;" -> "&").
    text = html.unescape(text)
    # Remove complete tags.
    text = TAG_RE.sub(" ", text)
    # Remove HTML attributes left behind as plain text.
    text = ATTR_RE.sub(" ", text)
    # Remove any remaining "<" or ">" characters.
    text = re.sub(r"[<>]", " ", text)
    # Normalize whitespace.
    text = re.sub(r"\s+", " ", text).strip()
    return text


class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans raw text and labels.

    Parameters:
        apply_lowercase: lowercase the cleaned text.
        limit_repeat: shorten runs of repeated letters.
        max_repeat: maximum run length kept when *limit_repeat* is on.
        canon_labels: mapping from accepted label spellings to canonical
            labels; ``None`` means no label is recognised.
    """

    def __init__(self, apply_lowercase=True, limit_repeat=True, max_repeat=2, canon_labels=None):
        self.apply_lowercase = apply_lowercase
        self.limit_repeat = limit_repeat
        self.max_repeat = max_repeat
        # Stored unmodified: scikit-learn's clone() requires that __init__
        # keeps constructor arguments exactly as given. The None default is
        # resolved where the mapping is used.
        self.canon_labels = canon_labels

    def fit(self, X, y=None):
        """No-op; the transformer is stateless. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a Series with the cleaned version of every text in *X*."""
        # Coerce to a string Series and replace NaN here, so that
        # _clean_text only ever receives strings.
        texts = pd.Series(X).fillna("").astype(str)
        return texts.apply(self._clean_text)

    def transform_labels(self, y):
        """Map raw labels to canonical labels; unknown labels become None."""
        if y is None:
            return None
        labels = pd.Series(y).astype(str)
        return labels.apply(self._normalize_label)

    def _normalize_label(self, x):
        """Return the canonical label for *x*, or None when unrecognised."""
        if pd.isna(x):
            return None
        key = str(x).strip().lower()
        return (self.canon_labels or {}).get(key, None)

    def _clean_text(self, t: str) -> str:
        """Clean one text; non-string input yields an empty string."""
        if not isinstance(t, str):
            return ""
        # Remove the "author ... comment" segment.
        t = author_comment_pattern.sub("", t)
        # Remove HTML tags / attributes.
        t = remove_html_elements(t)
        # Unescape HTML entities.
        t = html.unescape(t)
        # Replace URLs and mentions with a space.
        t = url_pattern.sub(" ", t)
        t = mention_pattern.sub(" ", t)
        # Hashtag "#word" -> "word".
        t = hashtag_pattern.sub(lambda m: f"{m.group(1)}", t)
        # Remove RT/FW tokens.
        t = rt_fw_pattern.sub(" ", t)
        # Keep only ASCII letters, digits and whitespace.
        t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)
        # Normalize whitespace.
        t = multi_space_pattern.sub(" ", t).strip()
        # Lowercase if requested.
        if self.apply_lowercase:
            t = t.lower()
        # Limit repeated letters.
        if self.limit_repeat:
            t = limit_repeated_chars(t, self.max_repeat)
        return t


# ----------------------------
# Load
# ----------------------------
path = Path(INPUT_PATH)
if not path.exists():
    raise FileNotFoundError(f"File tidak ditemukan: {path.resolve()}")
df = pd.read_csv(path)

# ----------------------------
# Determine the text & label columns
# ----------------------------
if TEXT_COL is None:
    TEXT_COL = guess_column(df, ["text", "tweet", "content", "sentence", "caption", "judul", "deskripsi"])
if LABEL_COL is None:
    LABEL_COL = guess_column(df, ["label", "sentiment", "polarity", "target", "kelas"])

# The guessing fallback can pick the same column for both roles; training on
# that would be meaningless, so stop early.
if TEXT_COL == LABEL_COL:
    raise ValueError(
        f"Kolom teks dan label sama ('{TEXT_COL}'); set TEXT_COL/LABEL_COL secara manual."
    )

print(f"Kolom teks terdeteksi : {TEXT_COL}")
print(f"Kolom label terdeteksi: {LABEL_COL}")
# ----------------------------
# Build the preprocessor
# ----------------------------
preproc = TextPreprocessor(
    apply_lowercase=APPLY_LOWERCASE,
    limit_repeat=LIMIT_REPEAT_CHARS,
    max_repeat=MAX_REPEAT,
    canon_labels=CANON_LABELS,
)

# ----------------------------
# Apply the preprocessor
# ----------------------------
# Keep the raw labels: when LABEL_COL is itself "sentiment", the assignment
# below overwrites that column, and the unknown-label report needs the
# original values.
raw_labels = df[LABEL_COL].copy()
df["text"] = preproc.fit_transform(df[TEXT_COL])
df["sentiment"] = preproc.transform_labels(raw_labels)

# ----------------------------
# Drop rows whose cleaned text is empty
# ----------------------------
df = df[df["text"].str.strip().ne("")]

# ----------------------------
# Show examples of unrecognised labels
# ----------------------------
unknown = df[df["sentiment"].isna()]
print("\nContoh label tak dikenal yang akan dibuang:")
# Ten most frequent raw spellings that could not be mapped.
print(raw_labels.loc[unknown.index].value_counts(dropna=False).head(10))

# Drop rows with unrecognised labels.
before = len(df)
df = df[df["sentiment"].notna()]
dropped_unknown = before - len(df)

# ----------------------------
# Remove duplicates (based on the cleaned text)
# ----------------------------
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)

# ----------------------------
# Summary
# ----------------------------
print("\nRingkasan setelah preprocessing:")
print(f" - Baris total : {len(df)}")
print(f" - Dibuang label tak dikenal: {dropped_unknown}")
print(" - Distribusi label:")
print(df["sentiment"].value_counts(dropna=False))

# Preview.
print("\nContoh 5 baris:")
print(df[[TEXT_COL, "text", LABEL_COL, "sentiment"]].head(5))

# ----------------------------
# Save the preprocessor
# ----------------------------
joblib.dump(preproc, "preprocessor.joblib")

# ============================
# KEYWORD-BASED LABEL CORRECTION
# ============================
# Keyword dictionaries, one per label.
NEGATIVE_KEYWORDS = {
    # Profanity / slang
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug", "kik",
    "goblog", "kntl",
    # Formal words
    "buruk", "lemah", "rendah", "gagal", "hancur", "rusak", "cacat", "jahat",
    "dusta", "bohong", "fitnah", "korup", "curang", "palsu", "salah", "sesat",
    "kejam", "dendam", "malas", "lambat", "menyakitkan", "tercela",
    "merugikan", "menghina", "melecehkan", "menyesatkan"
}

POSITIVE_KEYWORDS = {
    # Common positive words
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih", "cerdas",
    "pintar", "senang", "bahagia", "memuaskan", "unggul", "sempurna",
    "berhasil", "luas", "indah"
}

NEUTRAL_KEYWORDS = {
    # Neutral / generic words
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
}


def _keyword_pattern(keywords):
    """Compile a regex matching any keyword as a whole word or phrase.

    A keyword may be followed by one of the enclitics -nya/-lah/-kah/-pun/
    -ku/-mu, but it must start and end on a word boundary. This avoids
    matches inside unrelated words (e.g. "ok" inside "lokasi", "indah"
    inside "pindah").
    """
    # Longest first, so multi-word phrases win over their own substrings.
    alternatives = "|".join(re.escape(k) for k in sorted(keywords, key=len, reverse=True))
    return re.compile(
        rf"\b(?:{alternatives})(?:nya|lah|kah|pun|ku|mu)?\b", flags=re.IGNORECASE
    )


# Rules are checked in this order; the first match decides the label.
_LABEL_RULES = (
    ("negatif", _keyword_pattern(NEGATIVE_KEYWORDS)),
    ("positif", _keyword_pattern(POSITIVE_KEYWORDS)),
    ("netral", _keyword_pattern(NEUTRAL_KEYWORDS)),
)


def correct_label(row):
    """Return the keyword-corrected sentiment label for one DataFrame row.

    Reads ``row["text"]`` and ``row["sentiment"]``. Negative keywords are
    checked first, then positive, then neutral. When no keyword occurs in
    the text, the existing label is returned unchanged.
    """
    text = row["text"]
    for label, pattern in _LABEL_RULES:
        if pattern.search(text):
            return label
    return row["sentiment"]


# Apply the correction.
df["sentiment"] = df.apply(correct_label, axis=1)

# Label distribution after the correction.
print("\nDistribusi label setelah perbaikan:")
print(df["sentiment"].value_counts())

# Split by class.
df_negatif = df[df["sentiment"] == "negatif"]
df_positif = df[df["sentiment"] == "positif"]
df_netral = df[df["sentiment"] == "netral"]

if min(len(df_negatif), len(df_positif), len(df_netral)) == 0:
    raise ValueError(
        "Setiap kelas sentimen harus memiliki minimal satu sampel sebelum balancing."
    )

# Target size per class: the size of the neutral class.
target_count = df_netral.shape[0]


def _resample_to(df_class, n_samples):
    """Resample *df_class* to exactly *n_samples* rows.

    Rows are drawn with replacement only when the class is smaller than the
    target. A larger class is down-sampled without introducing duplicates.
    """
    return resample(
        df_class,
        replace=len(df_class) < n_samples,
        n_samples=n_samples,
        random_state=42,
    )


df_negatif_over = _resample_to(df_negatif, target_count)
df_positif_over = _resample_to(df_positif, target_count)

# Recombine.
df_balanced = pd.concat([df_netral, df_negatif_over, df_positif_over])

print("Distribusi setelah balancing:")
print(df_balanced["sentiment"].value_counts())

# ============================
# LABEL DISTRIBUTION VISUALISATION
# ============================
# Label distribution of the balanced data.
label_counts = df_balanced["sentiment"].value_counts()

# Colours are tied to the label, not to its position in value_counts().
_LABEL_COLORS = {"negatif": "red", "positif": "green", "netral": "blue"}
label_colors = [_LABEL_COLORS.get(name, "gray") for name in label_counts.index]

# -------- Bar chart --------
plt.figure(figsize=(6,4))
label_counts.plot(kind="bar", color=label_colors)
plt.title("Distribusi Sentimen")
plt.xlabel("Label")
plt.ylabel("Jumlah")
plt.xticks(rotation=0)
plt.show()
print('\n')

# -------- Pie chart --------
plt.figure(figsize=(5,5))
label_counts.plot(kind="pie", autopct='%1.1f%%', startangle=90, colors=label_colors)
plt.title("Persentase Sentimen")
plt.ylabel("")  # hide the Y label
plt.show()

# ============================
# DATASET SPLIT (train/val/test)
# ============================
# Cleaned texts and labels of the whole balanced set.
X = df_balanced["text"].values
y = df_balanced["sentiment"].values

# df_balanced contains rows duplicated by oversampling. Splitting it directly
# would put copies of one text into train and validation/test at the same
# time and inflate every evaluation score. The split is therefore done on
# the unique texts, and only the training part keeps its oversampled copies.
unique_rows = df_balanced.drop_duplicates(subset=["text"])

# 1. Split the unique texts into train (80%) and a temporary hold-out (20%).
train_unique, temp_unique = train_test_split(
    unique_rows, test_size=0.2, random_state=42, stratify=unique_rows["sentiment"]
)
# 2. Split the hold-out in half: validation (10%) and test (10%).
val_rows, test_rows = train_test_split(
    temp_unique, test_size=0.5, random_state=42, stratify=temp_unique["sentiment"]
)
# 3. Training rows: every balanced row (copies included) whose text was
#    assigned to the training part.
train_rows = df_balanced[df_balanced["text"].isin(train_unique["text"])]

X_train, y_train = train_rows["text"].values, train_rows["sentiment"].values
X_temp, y_temp = temp_unique["text"].values, temp_unique["sentiment"].values
X_val, y_val = val_rows["text"].values, val_rows["sentiment"].values
X_test, y_test = test_rows["text"].values, test_rows["sentiment"].values

# Report the split sizes.
print("Ukuran dataset:")
print(f"Train: {len(X_train)}")
print(f"Validation: {len(X_val)}")
print(f"Test: {len(X_test)}")

# ============================
# FINE-TUNING IndoBERT
# ============================
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# SECURITY: the Weights & Biases API key must come from the environment
# (e.g. a Kaggle/Colab secret), never from source code. A key that was
# previously committed in this file should be treated as leaked and revoked.
# W&B logging is enabled only when a key is present; otherwise it is
# disabled so that training does not stop to ask for a login.
if os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_DISABLED"] = "false"
else:
    os.environ["WANDB_DISABLED"] = "true"
# ----------------------------
# 1. Tokenizer & label encoding
# ----------------------------
MODEL_NAME = "indobenchmark/indobert-base-p1"  # pre-trained IndoBERT model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Label <-> integer id mappings.
label2id = {"negatif": 0, "netral": 1, "positif": 2}
id2label = {v: k for k, v in label2id.items()}


def encode_labels(labels):
    """Return the integer id of every label in *labels*.

    Raises:
        KeyError: if a label is not a key of ``label2id``.
    """
    return [label2id[l] for l in labels]


y_train_enc = encode_labels(y_train)
y_val_enc = encode_labels(y_val)
y_test_enc = encode_labels(y_test)


# ----------------------------
# 2. Dataset class
# ----------------------------
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Parameters:
        texts: indexable collection of texts.
        labels: indexable collection of integer label ids, aligned with
            *texts*.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch dimension added by
        # return_tensors="pt". A bare squeeze() would also drop the sequence
        # dimension whenever max_len is 1.
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }


train_dataset = SentimentDataset(X_train, y_train_enc, tokenizer)
val_dataset = SentimentDataset(X_val, y_val_enc, tokenizer)
test_dataset = SentimentDataset(X_test, y_test_enc, tokenizer)

# ----------------------------
# 3. Model
# ----------------------------
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),  # derived from the mapping so both stay in sync
    id2label=id2label,
    label2id=label2id,
).to(device)
# ----------------------------
# 4. Training arguments
# ----------------------------
# The evaluation-schedule argument is named "eval_strategy" in newer
# transformers releases and "evaluation_strategy" in older ones. Pick the
# name that the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_EVAL_STRATEGY_KEY = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,          # upper bound; early stopping may end sooner
    learning_rate=2e-5,          # small learning rate for stable fine-tuning
    weight_decay=0.05,           # regularisation
    warmup_ratio=0.1,            # first 10% of the steps are warm-up
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    save_strategy="epoch",       # checkpoint after every epoch
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    **{_EVAL_STRATEGY_KEY: "epoch"},  # evaluate after every epoch
)

# ----------------------------
# 5. Metrics
# ----------------------------
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a ``(logits, labels)`` pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=preds, references=labels)
    f1 = metric_f1.compute(predictions=preds, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}


# ----------------------------
# 6. Trainer
# ----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# ----------------------------
# 7. Start training
# ----------------------------
trainer.train()
# =============================
# 8. Evaluate & save predictions
# =============================
# Predictions on the test set (run once and reused below).
pred_results = trainer.predict(test_dataset)

# Logits -> predicted class ids.
pred_logits = pred_results.predictions
pred_labels = np.argmax(pred_logits, axis=-1)

# Class ids -> text labels.
pred_text_labels = [id2label[int(i)] for i in pred_labels]
true_text_labels = [id2label[i] for i in y_test_enc]

# Combine with the test texts.
df_test_results = pd.DataFrame({
    "text": X_test,
    "true_label": true_text_labels,
    "predicted_label": pred_text_labels,
})

# Save to CSV.
df_test_results.to_csv("test_predictions.csv", index=False)
print("✅ Hasil prediksi test set sudah disimpan ke test_predictions.csv")

# ============================
# EVALUATE & SAVE MODEL
# ============================
# 1. Evaluate on the test set.
print("\nEvaluasi di Test Set:")
test_result = trainer.evaluate(test_dataset)
print(test_result)

# 2. Test-set predictions for further analysis. The result computed above is
#    reused instead of running a second, identical prediction pass.
predictions = pred_results

# Show up to the first 10 predictions. Slicing keeps this safe when the test
# set has fewer than 10 rows.
for text, true_id, pred_id in zip(X_test[:10], y_test_enc[:10], pred_labels[:10]):
    print(f"Teks: {text}")
    print(f"Label Asli: {id2label[true_id]} | Prediksi: {id2label[int(pred_id)]}")
    print("---")
# 3. Save model + tokenizer.
SAVE_DIR = "./indoBERT-sentiment"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"\nModel & tokenizer sudah disimpan ke: {SAVE_DIR}")

# ==========================
# LOAD MODEL & TOKENIZER
# ==========================
MODEL_DIR = "./indoBERT-sentiment"
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# Pipeline device index: first GPU when available, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Load the preprocessor saved earlier by this script.
# NOTE(review): joblib.load unpickles the file; only load files you created.
preproc = joblib.load("preprocessor.joblib")


# ==========================
# PREDICTION FUNCTIONS
# ==========================
def predict_text(text):
    """Return the predicted label for one text.

    Returns the string "EMPTY" when *text* is not a string or is blank.
    """
    if not isinstance(text, str) or text.strip() == "":
        return "EMPTY"
    result = sentiment_pipeline(text, truncation=True, max_length=512)[0]
    return result["label"]


def predict_texts(texts, batch_size=32):
    """Return the predicted label for every item of *texts*, in order.

    Blank or non-string items get "EMPTY" without being sent to the model.
    All other items go to the pipeline in one batched call, which avoids one
    pipeline invocation per row.
    """
    cleaned = [t.strip() if isinstance(t, str) else "" for t in texts]
    labels = ["EMPTY"] * len(cleaned)
    positions = [i for i, t in enumerate(cleaned) if t]
    if positions:
        results = sentiment_pipeline(
            [cleaned[i] for i in positions],
            batch_size=batch_size,
            truncation=True,
            max_length=512,
        )
        for i, res in zip(positions, results):
            labels[i] = res["label"]
    return labels


def _dataset_file(name):
    """Return the path of dataset file *name*.

    The directory returned by kagglehub is tried first, then the default
    Kaggle input directory.
    """
    candidate = Path(fatihramadhan_sentimentdataset_path) / name
    if candidate.exists():
        return candidate
    return Path("/kaggle/input/sentimentdataset") / name


def _links(frame):
    """Return the "link" column as a list, or empty strings if absent."""
    if "link" in frame.columns:
        return frame["link"].tolist()
    return [""] * len(frame)


# ==========================
# PREDICT FILE 1 (SOCIAL MEDIA)
# ==========================
file1 = pd.read_csv(_dataset_file("gabungan (1).csv"))

# Preprocess captions and comments.
file1["caption"] = preproc.transform(file1["caption"])
file1["comments"] = preproc.transform(file1["comments"])

# Safety net only: transform() already turns missing values into empty
# strings, so this normally drops nothing.
file1 = file1.dropna(subset=["caption", "comments"])

print(f"[File1] Memproses {len(file1)} baris")
captions1 = file1["caption"].astype(str).str.strip().tolist()
comments1 = file1["comments"].astype(str).str.strip().tolist()

df_out1 = pd.DataFrame({
    "link": _links(file1),                         # social-media post link
    "caption": captions1,
    "caption_pred": predict_texts(captions1),
    "comments_pred": comments1,                    # preprocessed comment text
    "comments_summary": predict_texts(comments1),  # predicted comment sentiment
})
df_out1.to_csv("medsos.csv", index=False, encoding="utf-8-sig")
print("✅ Hasil prediksi file1 sudah disimpan ke medsos.csv")

# ==========================
# PREDICT FILE 2 (NEWS)
# ==========================
file2 = pd.read_csv(_dataset_file("berita2 (1).csv"))

# Preprocess title, tag and article body (each into its own column).
file2["judul"] = preproc.transform(file2["judul"])
file2["tag"] = preproc.transform(file2["tag"])
file2["isi_berita"] = preproc.transform(file2["isi_berita"])

# Safety net only; see the note for file 1.
file2 = file2.dropna(subset=["judul", "tag", "isi_berita"])

print(f"[File2] Memproses {len(file2)} baris")
# One prediction per article, on title + tag + body joined by spaces.
combined2 = [
    f"{judul} {tag} {isi}"
    for judul, tag, isi in zip(file2["judul"], file2["tag"], file2["isi_berita"])
]

df_out2 = pd.DataFrame({
    "link": _links(file2),                         # news article link
    "judul": file2["judul"].tolist(),
    "tag": file2["tag"].tolist(),
    "isi_berita": file2["isi_berita"].tolist(),
    "prediction": predict_texts(combined2),
})
df_out2.to_csv("berita.csv", index=False, encoding="utf-8-sig")
print("✅ Hasil prediksi file2 sudah disimpan ke berita.csv")