| |
| """SentimentAnalysis |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/fatihramadhan/sentimentanalysis.74f160cb-74cc-4609-ba85-0081c3654a18.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20260326/auto/storage/goog4_request%26X-Goog-Date%3D20260326T141800Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2fe877a762338b5e556a035ce46a5a6bf9c51c0d33c4b062e919cfd44e0297ff787b3a23bf4290b33ca0467d04cf7ba377d77c975cd79da4f1adfec176cb7d78d1eddf1eec10e87d86e656200eaed9b0781f5f5d215ee084957aa5a30c2e9fa1731c23b333d5f742767875bd84e34b83339d834639567639d817ad1295fbc8fd552a5ae92f938b90cb8d916b4a7190e208c6d0effdc10665a9405efffc12a2d4497159428e898204e32ad2d629a58e985c020c7febef459895fd34b052c37a041102284e207ed788a6490c64656ece6150fc355120a49cf2b2fdadda53018d3dba4f8aeda15faaa1eb9c9cef82a476c38be69504e5a5f98cf61686a2b337ea77 |
| """ |
|
|
| |
| |
| |
| |
| |
# Fetch the Kaggle data source through kagglehub.  The value returned by
# dataset_download() is kept in a module-level variable for later use.
import kagglehub

fatihramadhan_sentimentdataset_path = kagglehub.dataset_download(
    'fatihramadhan/sentimentdataset'
)

print('Data source import complete.')
|
|
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
|
|
| import re |
| import html |
| import torch |
| import evaluate |
| import os |
| import transformers |
| import inspect |
| import joblib |
|
|
| from pathlib import Path |
| from torch.utils.data import Dataset, DataLoader |
| from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline |
|
|
| from sklearn.model_selection import train_test_split |
| from sklearn.base import BaseEstimator, TransformerMixin |
| from sklearn.metrics import accuracy_score, f1_score |
| from sklearn.utils import resample |
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Directory that holds the dataset files.  On Kaggle the dataset is mounted
# under /kaggle/input; anywhere else (e.g. Colab) that directory does not
# exist, so fall back to the location reported by kagglehub.dataset_download().
_KAGGLE_INPUT_DIR = Path("/kaggle/input/sentimentdataset")
DATASET_DIR = (
    _KAGGLE_INPUT_DIR
    if _KAGGLE_INPUT_DIR.exists()
    else Path(fatihramadhan_sentimentdataset_path)
)

# Labelled training CSV.
INPUT_PATH = str(DATASET_DIR / "dataset_gabungan.csv")

# Lower-case the cleaned text.
APPLY_LOWERCASE = True

# Collapse runs of a repeated character to at most MAX_REPEAT copies.
LIMIT_REPEAT_CHARS = True
MAX_REPEAT = 2

# Column names for text and label; None means "detect with guess_column()".
TEXT_COL = None
LABEL_COL = None

# Maps every accepted raw label spelling (including misspellings that are
# listed explicitly) to one of the three canonical labels.
CANON_LABELS = {
    "positif": "positif", "positive": "positif", "pos": "positif", 'positi': 'positif',
    "negatif": "negatif", "negative": "negatif", "neg": "negatif", 'negartif': 'negatif',
    "netral": "netral", "neutral": "netral", "neu": "netral", 'netr': 'netral',
}
|
|
| |
| |
| |
def guess_column(df: pd.DataFrame, candidates):
    """Choose a column of *df*, preferring the names listed in *candidates*.

    Resolution order:
      1. the first entry of *candidates* that is a column of *df*;
      2. otherwise the first column holding text (``object`` dtype or any
         pandas string dtype);
      3. otherwise the first column of *df*.

    Args:
        df: DataFrame to inspect.
        candidates: Iterable of preferred column names, highest priority first.

    Returns:
        The selected column label.

    Raises:
        ValueError: If *df* has no columns at all.
    """
    if len(df.columns) == 0:
        raise ValueError("Cannot guess a column: the DataFrame has no columns.")

    for name in candidates:
        if name in df.columns:
            return name

    # `dtype == object` alone misses columns stored with a dedicated pandas
    # string dtype, so both forms are accepted as "text".
    for name, dtype in df.dtypes.items():
        if dtype == object or pd.api.types.is_string_dtype(dtype):
            return name

    return df.columns[0]
|
|
# Pre-compiled patterns shared by the text-cleaning code.

# http(s):// links and bare "www." links.
url_pattern = re.compile(r"(https?://\S+|www\.\S+)")

# "@name" mentions.
mention_pattern = re.compile(r"@\w+")

# "#tag" hashtags; group 1 is the tag without the "#".
hashtag_pattern = re.compile(r"#(\w+)")

# Any run of whitespace.
multi_space_pattern = re.compile(r"\s+")

# Stand-alone "rt" / "fw" / "fwd" markers, optionally followed by a colon.
rt_fw_pattern = re.compile(r"\b(rt|fw|fwd)\b[:]?", re.IGNORECASE)

# Shortest span from the word "author" up to and including the next
# "comment"; DOTALL lets the span cross line breaks.
author_comment_pattern = re.compile(
    r"author\b.*?\bcomment",
    re.IGNORECASE | re.DOTALL,
)
|
|
def limit_repeated_chars(text: str, max_repeat: int = 2) -> str:
    """Collapse runs of one repeated character to at most *max_repeat* copies.

    Example: ``limit_repeated_chars("baaaaguuus", 2)`` gives ``"baaguus"``.
    Newlines are never collapsed because ``.`` does not match them.

    Args:
        text: Input string.
        max_repeat: Maximum number of consecutive identical characters to
            keep; must be at least 1.

    Returns:
        *text* with every longer run shortened to *max_repeat* characters.

    Raises:
        ValueError: If *max_repeat* is smaller than 1.  A value of 0 would
            otherwise delete every character, and a negative value would
            produce a pattern that is not a repetition count at all.
    """
    if max_repeat < 1:
        raise ValueError(f"max_repeat must be >= 1, got {max_repeat!r}")

    # One captured character followed by `max_repeat` or more copies of it.
    run_pattern = r"(.)\1{%d,}" % max_repeat
    return re.sub(run_pattern, r"\1" * max_repeat, text)
|
|
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn style transformer for text cleaning.

    ``transform`` cleans raw text; ``transform_labels`` maps raw label
    spellings onto canonical labels through ``canon_labels``.

    Args:
        apply_lowercase: Lower-case the cleaned text.
        limit_repeat: Shorten runs of a repeated character.
        max_repeat: Maximum run length kept when *limit_repeat* is on.
        canon_labels: Mapping ``raw label (lower-case) -> canonical label``;
            ``None`` behaves like an empty mapping.
    """

    def __init__(self,
                 apply_lowercase=True,
                 limit_repeat=True,
                 max_repeat=2,
                 canon_labels=None):
        # Constructor arguments are stored untouched: scikit-learn's
        # get_params()/clone() contract forbids altering them in __init__
        # (the former `canon_labels or {}` broke clone() for the default).
        self.apply_lowercase = apply_lowercase
        self.limit_repeat = limit_repeat
        self.max_repeat = max_repeat
        self.canon_labels = canon_labels

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return a Series with the cleaned version of every item in *X*.

        Missing values are replaced by the empty string before cleaning.
        """
        texts = pd.Series(X).fillna("").astype(str)
        return texts.apply(self._clean_text)

    def transform_labels(self, y):
        """Return a Series of canonical labels (``None`` where unknown).

        Returns ``None`` when *y* itself is ``None``.
        """
        if y is None:
            return None
        # No astype(str) here: that would turn NaN into the string "nan"
        # before _normalize_label gets a chance to detect missing values.
        return pd.Series(y).apply(self._normalize_label)

    def _normalize_label(self, x):
        """Map one raw label to its canonical form, or ``None`` if unknown."""
        if pd.api.types.is_scalar(x) and pd.isna(x):
            return None
        key = str(x).strip().lower()
        return (self.canon_labels or {}).get(key)

    def _clean_text(self, t: str) -> str:
        """Clean a single text value; non-strings become ``""``."""
        if not isinstance(t, str):
            return ""

        # Remove every "author ... comment" span.
        t = author_comment_pattern.sub("", t)

        # Strip HTML markup and leftover attribute fragments.
        t = remove_html_elements(t)

        # Decode any HTML entities still present after the step above.
        t = html.unescape(t)

        # Replace links and mentions with placeholders.  The punctuation
        # filter below strips the angle brackets, so they end up as the
        # plain tokens "url" and "user".
        t = url_pattern.sub(" <url> ", t)
        t = mention_pattern.sub(" <user> ", t)

        # Keep the hashtag word, drop the "#".
        t = hashtag_pattern.sub(lambda m: f"{m.group(1)}", t)

        # Drop retweet / forward markers.
        t = rt_fw_pattern.sub(" ", t)

        # Keep ASCII letters, digits and whitespace only.
        t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)

        # Collapse whitespace runs and trim both ends.
        t = multi_space_pattern.sub(" ", t).strip()

        if self.apply_lowercase:
            t = t.lower()

        if self.limit_repeat:
            t = limit_repeated_chars(t, self.max_repeat)

        return t
|
|
|
|
# An HTML tag, comment, doctype or processing instruction: "<" must be
# followed directly by a letter, "/", "!" or "?".  The stricter opening keeps
# ordinary text such as "<3 ... >" (common once entities are decoded) from
# being swallowed as if it were markup.
TAG_RE = re.compile(r"<[A-Za-z/!?][^>]*>")

# Stray attribute fragments (e.g. class="x") left behind by broken markup.
ATTR_RE = re.compile(r"\b(class|id|style|role|tabindex|href|src|alt)=[^\s>]+", flags=re.IGNORECASE)


def remove_html_elements(text: str) -> str:
    """Strip HTML markup from *text* and normalise its whitespace.

    Steps: decode HTML entities, remove tags, remove leftover attribute
    fragments, replace any remaining ``<`` / ``>`` with spaces, then collapse
    whitespace runs and trim.

    Args:
        text: Raw text, possibly containing HTML.

    Returns:
        The cleaned text, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Decode first so that escaped markup (&lt;b&gt;) is removed as well.
    text = html.unescape(text)

    text = TAG_RE.sub(" ", text)
    text = ATTR_RE.sub(" ", text)

    # Angle brackets that were not part of a tag.
    text = re.sub(r"[<>]", " ", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Load the labelled CSV and decide which columns hold text and label
# ---------------------------------------------------------------------------
path = Path(INPUT_PATH)
if not path.exists():
    raise FileNotFoundError(f"File tidak ditemukan: {path.resolve()}")

df = pd.read_csv(path)

# Auto-detect the columns unless they were set explicitly in the config.
if TEXT_COL is None:
    TEXT_COL = guess_column(df, ["text", "tweet", "content", "sentence", "caption", "judul", "deskripsi"])
if LABEL_COL is None:
    LABEL_COL = guess_column(df, ["label", "sentiment", "polarity", "target", "kelas"])

# Fail early on a manual override that names a non-existent column.
for _col in (TEXT_COL, LABEL_COL):
    if _col not in df.columns:
        raise KeyError(f"Kolom '{_col}' tidak ada di {path.name}")

# guess_column() falls back to the first text-like column for both lookups,
# so both can resolve to the same column; every label would then be unknown
# and all rows silently dropped further down.
if TEXT_COL == LABEL_COL:
    raise ValueError(
        f"Kolom teks dan label terdeteksi sama ('{TEXT_COL}'); "
        "set TEXT_COL / LABEL_COL secara manual."
    )

print(f"Kolom teks terdeteksi : {TEXT_COL}")
print(f"Kolom label terdeteksi: {LABEL_COL}")
|
|
| |
| |
| |
|
|
# ---------------------------------------------------------------------------
# Clean the text, normalise the labels, drop unusable rows
# ---------------------------------------------------------------------------
preproc = TextPreprocessor(
    apply_lowercase=APPLY_LOWERCASE,
    limit_repeat=LIMIT_REPEAT_CHARS,
    max_repeat=MAX_REPEAT,
    canon_labels=CANON_LABELS,
)

# Keep the raw label values: LABEL_COL may itself be "sentiment", in which
# case the assignment below would overwrite them and the unknown-label
# report would have nothing left to show.
raw_labels = df[LABEL_COL].copy()

df["text"] = preproc.fit_transform(df[TEXT_COL])
df["sentiment"] = preproc.transform_labels(raw_labels)

# Drop rows whose text is empty after cleaning.
df = df[df["text"].str.strip().ne("")]

# Report the raw label values that could not be mapped to a canonical label.
unknown_rows = df.index[df["sentiment"].isna()]
print("\nContoh label tak dikenal yang akan dibuang:")
print(raw_labels.loc[unknown_rows].to_frame(name=LABEL_COL).value_counts())

# Drop the rows with an unknown label and remember how many there were.
before = len(df)
df = df[df["sentiment"].notna()]
dropped_unknown = before - len(df)

# Keep one row per distinct cleaned text.
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)

print("\nRingkasan setelah preprocessing:")
print(f" - Baris total : {len(df)}")
print(f" - Dibuang label tak dikenal: {dropped_unknown}")
print(" - Distribusi label:")
print(df["sentiment"].value_counts(dropna=False))

# Preview; dict.fromkeys removes duplicates when a detected column is
# already named "text" or "sentiment".
preview_cols = list(dict.fromkeys([TEXT_COL, "text", LABEL_COL, "sentiment"]))
print("\nContoh 5 baris:")
print(df[preview_cols].head(5))

# Persist the preprocessor so that inference applies the same cleaning.
joblib.dump(preproc, "preprocessor.joblib")
|
|
| |
| |
| |
|
|
| |
# Keyword lists used to override the dataset label of a row.
NEGATIVE_KEYWORDS = {
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
    "kik", "goblog", "kntl",

    "buruk", "lemah", "rendah", "gagal", "hancur", "rusak", "cacat",
    "jahat", "dusta", "bohong", "fitnah", "korup", "curang", "palsu",
    "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan",
}

POSITIVE_KEYWORDS = {
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
    "sempurna", "berhasil", "luas", "indah",
}

NEUTRAL_KEYWORDS = {
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak",
}


def _compile_keyword_pattern(keywords):
    """Return one regex matching any keyword as a whole word or phrase."""
    # Longest first (ties alphabetical) gives a deterministic pattern in
    # which a phrase is tried before any shorter keyword.
    ordered = sorted(keywords, key=lambda kw: (-len(kw), kw))
    return re.compile(r"\b(?:" + "|".join(re.escape(kw) for kw in ordered) + r")\b")


# Evaluated in this order; the first category with a hit wins.
_KEYWORD_RULES = (
    ("negatif", _compile_keyword_pattern(NEGATIVE_KEYWORDS)),
    ("positif", _compile_keyword_pattern(POSITIVE_KEYWORDS)),
    ("netral", _compile_keyword_pattern(NEUTRAL_KEYWORDS)),
)


def correct_label(row):
    """Return the label for *row*, overridden by keyword rules when they hit.

    Categories are checked in the order negative, positive, neutral; the
    first one with a keyword present in ``row["text"]`` decides the label.
    With no hit the existing ``row["sentiment"]`` is returned.

    Keywords match as whole words only.  Plain substring tests fired on
    unrelated words ("ok" inside "besok" or "dokter", "luas" inside
    "meluas") and relabelled those rows.  The trade-off is that affixed
    forms of a keyword (e.g. "bodohnya") no longer trigger a rule.

    Args:
        row: Mapping / Series with the keys ``"text"`` and ``"sentiment"``.

    Returns:
        ``"negatif"``, ``"positif"``, ``"netral"`` or the original label.
    """
    text = row["text"]
    for forced_label, pattern in _KEYWORD_RULES:
        if pattern.search(text):
            return forced_label
    return row["sentiment"]
|
|
| |
# Run the keyword rules over every row and store the resulting labels.
corrected_sentiment = df.apply(correct_label, axis=1)
df["sentiment"] = corrected_sentiment

# Show the class counts after the correction.
print("\nDistribusi label setelah perbaikan:")
corrected_distribution = df["sentiment"].value_counts()
print(corrected_distribution)
|
|
| |
# ---------------------------------------------------------------------------
# Train / validation / test split, then balance the TRAINING split only
# ---------------------------------------------------------------------------
# Oversampling duplicates rows.  Doing it before the split puts copies of the
# same text into train and validation/test and inflates every metric, so the
# data is split first and only the training part is balanced.


def _oversample_to_largest_class(frame, label_col="sentiment", random_state=42):
    """Return *frame* with every class upsampled to the largest class size.

    All original rows are kept; smaller classes receive additional rows
    drawn with replacement from their own members.
    """
    target_count = int(frame[label_col].value_counts().max())
    parts = []
    for _, group in frame.groupby(label_col, sort=True):
        parts.append(group)
        missing = target_count - len(group)
        if missing > 0:
            parts.append(
                resample(group, replace=True, n_samples=missing, random_state=random_state)
            )
    return pd.concat(parts).reset_index(drop=True)


X = df["text"].values
y = df["sentiment"].values

# 80% train, 20% held out ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ... and the held-out part split evenly into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Balance the training split; validation and test keep the natural class
# distribution and contain no duplicated rows.
df_balanced = _oversample_to_largest_class(
    pd.DataFrame({"text": X_train, "sentiment": y_train})
)
X_train = df_balanced["text"].values
y_train = df_balanced["sentiment"].values

print("Distribusi setelah balancing:")
print(df_balanced["sentiment"].value_counts())

# Class distribution of the balanced training data.
label_counts = df_balanced["sentiment"].value_counts()

plt.figure(figsize=(6, 4))
label_counts.plot(kind="bar", color=["red", "green", "blue"])
plt.title("Distribusi Sentimen")
plt.xlabel("Label")
plt.ylabel("Jumlah")
plt.xticks(rotation=0)
plt.show()

print('\n')

plt.figure(figsize=(5, 5))
label_counts.plot(kind="pie", autopct='%1.1f%%', startangle=90, colors=["red", "green", "blue"])
plt.title("Persentase Sentimen")
plt.ylabel("")
plt.show()

print("Ukuran dataset:")
print(f"Train: {len(X_train)}")
print(f"Validation: {len(X_val)}")
print(f"Test: {len(X_test)}")
|
|
| |
| |
| |
|
|
| |
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# SECURITY: a Weights & Biases API key used to be hard-coded on this line.
# Secrets do not belong in source control; that key must be treated as leaked
# and revoked.  The key is now taken from the WANDB_API_KEY environment
# variable supplied by the caller, and W&B is switched off when it is absent.
if os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_DISABLED"] = "false"
else:
    os.environ["WANDB_DISABLED"] = "true"
|
|
| |
| |
| |
# Checkpoint name; it is used for the tokenizer here and referenced again
# when the classifier is created.
MODEL_NAME = "indobenchmark/indobert-base-p1"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Label name <-> integer id, in both directions.
label2id = {"negatif": 0, "netral": 1, "positif": 2}
id2label = {idx: name for name, idx in label2id.items()}


def encode_labels(labels):
    """Return the integer ids (per ``label2id``) for an iterable of labels.

    Raises:
        KeyError: If a label is not a key of ``label2id``.
    """
    return list(map(label2id.__getitem__, labels))


y_train_enc, y_val_enc, y_test_enc = (
    encode_labels(split_labels) for split_labels in (y_train, y_val, y_test)
)
|
|
| |
| |
| |
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of integer class ids, same length as
            *texts*.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors that carry a
            leading batch axis of size 1.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If *texts* and *labels* differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} != {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for *idx*."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # squeeze(0) removes only the batch axis.  An argument-less squeeze()
        # would also drop the sequence axis whenever max_len == 1.
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }
|
|
# One dataset per split, all sharing the tokenizer and the default max_len.
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (X_train, y_train_enc),
        (X_val, y_val_enc),
        (X_test, y_test_enc),
    )
)

# Three-class sequence classifier loaded from the MODEL_NAME checkpoint and
# moved to the selected device.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)
|
|
| |
| |
| |
# Training hyper-parameters, collected in one mapping and handed to
# TrainingArguments unchanged.
_training_config = dict(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Batch sizes and schedule
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    # Optimiser settings
    learning_rate=2e-5,
    weight_decay=0.05,
    warmup_ratio=0.1,
    # Logging / evaluation / checkpointing
    logging_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Best-checkpoint selection: highest "f1" wins
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

training_args = TrainingArguments(**_training_config)
|
|
|
|
| |
| |
| |
|
|
# Metric objects loaded once at module level and reused on every evaluation.
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a ``(logits, labels)`` pair.

    Returns:
        ``{"accuracy": ..., "f1": ...}``
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    shared_kwargs = {"predictions": predicted_ids, "references": labels}
    accuracy_result = metric_acc.compute(**shared_kwargs)
    f1_result = metric_f1.compute(average="weighted", **shared_kwargs)

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }
|
|
| |
| |
| |
# Early-stopping callback configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Fine-tune the model.
trainer.train()
|
|
| |
| |
| |
|
|
| |
# ---------------------------------------------------------------------------
# Predict on the test split, save the predictions, report the metrics
# ---------------------------------------------------------------------------
pred_results = trainer.predict(test_dataset)

# Highest logit -> predicted class id.
pred_logits = pred_results.predictions
pred_labels = np.argmax(pred_logits, axis=-1)

# Class ids back to label names.
pred_text_labels = [id2label[int(i)] for i in pred_labels]
true_text_labels = [id2label[i] for i in y_test_enc]

df_test_results = pd.DataFrame({
    "text": X_test,
    "true_label": true_text_labels,
    "predicted_label": pred_text_labels,
})

df_test_results.to_csv("test_predictions.csv", index=False)
print("✅ Hasil prediksi test set sudah disimpan ke test_predictions.csv")

print("\nEvaluasi di Test Set:")
test_result = trainer.evaluate(test_dataset)
print(test_result)

# The predictions computed above are reused; running trainer.predict() a
# second time would repeat the whole inference pass for the same result.
predictions = pred_results

# Show up to ten examples (fewer when the test split is smaller).
for i in range(min(10, len(X_test))):
    print(f"Teks: {X_test[i]}")
    print(f"Label Asli: {id2label[y_test_enc[i]]} | Prediksi: {id2label[int(pred_labels[i])]}")
    print("---")
|
|
| |
# Write the fine-tuned model and then the tokenizer into the same directory.
SAVE_DIR = "./indoBERT-sentiment"

for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(SAVE_DIR)

print(f"\nModel & tokenizer sudah disimpan ke: {SAVE_DIR}")
|
|
| |
| |
| |
# Reload the saved tokenizer and model and wrap them in a pipeline.
MODEL_DIR = "./indoBERT-sentiment"

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# Device index passed to the pipeline: 0 with CUDA available, otherwise -1.
# Note that this rebinds `device`, which held a torch.device earlier.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

sentiment_pipeline = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Reload the preprocessor persisted earlier by this script.
# NOTE(review): joblib.load unpickles the file - only load trusted files.
preproc = joblib.load("preprocessor.joblib")
|
|
| |
| |
| |
def predict_text(text):
    """Return the pipeline's label for *text*.

    Non-string input and strings that are empty after stripping whitespace
    are not sent to the model; ``"EMPTY"`` is returned for them instead.
    """
    is_blank = not isinstance(text, str) or text.strip() == ""
    if is_blank:
        return "EMPTY"

    predictions = sentiment_pipeline(text, truncation=True, max_length=512)
    top_prediction = predictions[0]
    return top_prediction["label"]
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Inference on the social-media file (caption + comments)
# ---------------------------------------------------------------------------
# Use the Kaggle mount when it exists, otherwise the kagglehub download path.
_kaggle_dir = Path("/kaggle/input/sentimentdataset")
_dataset_dir = _kaggle_dir if _kaggle_dir.exists() else Path(fatihramadhan_sentimentdataset_path)

file1 = pd.read_csv(_dataset_dir / "gabungan (1).csv")

# Drop incomplete rows BEFORE cleaning: transform() replaces NaN with "",
# so a dropna() placed after it can never remove anything.
file1 = file1.dropna(subset=["caption", "comments"]).copy()

file1["caption"] = preproc.transform(file1["caption"])
file1["comments"] = preproc.transform(file1["comments"])

outputs1 = []
total_rows1 = len(file1)

# enumerate() gives the real position; index labels have gaps after dropna().
for position, (_, row) in enumerate(file1.iterrows(), start=1):
    print(f"[File1] Proses baris {position}/{total_rows1}")

    caption_text = str(row["caption"]).strip()
    caption_pred = predict_text(caption_text)

    comments_text = str(row["comments"]).strip()
    comments_pred_label = predict_text(comments_text)

    # NOTE(review): "comments_pred" stores the cleaned comment text and
    # "comments_summary" the predicted label.  The names look swapped, but
    # they are kept because they define the output CSV schema - confirm.
    outputs1.append({
        "link": row.get("link", ""),
        "caption": caption_text,
        "caption_pred": caption_pred,
        "comments_pred": comments_text,
        "comments_summary": comments_pred_label,
    })

df_out1 = pd.DataFrame(outputs1)
df_out1.to_csv("medsos.csv", index=False, encoding="utf-8-sig")
print("✅ Hasil prediksi file1 sudah disimpan ke medsos.csv")
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Inference on the news file (title + tag + body)
# ---------------------------------------------------------------------------
# Use the Kaggle mount when it exists, otherwise the kagglehub download path.
_kaggle_dir = Path("/kaggle/input/sentimentdataset")
_dataset_dir = _kaggle_dir if _kaggle_dir.exists() else Path(fatihramadhan_sentimentdataset_path)

file2 = pd.read_csv(_dataset_dir / "berita2 (1).csv")

# Drop incomplete rows BEFORE cleaning: transform() replaces NaN with "",
# so a dropna() placed after it can never remove anything.
file2 = file2.dropna(subset=["judul", "tag", "isi_berita"]).copy()

for _column in ("judul", "tag", "isi_berita"):
    file2[_column] = preproc.transform(file2[_column])

outputs2 = []
total_rows2 = len(file2)

# enumerate() gives the real position; index labels have gaps after dropna().
for position, (_, row) in enumerate(file2.iterrows(), start=1):
    print(f"[File2] Proses baris {position}/{total_rows2}")

    # Title, tag and body are classified together as a single text.
    combined_text = f"{row['judul']} {row['tag']} {row['isi_berita']}"
    pred = predict_text(combined_text)

    outputs2.append({
        "link": row.get("link", ""),
        "judul": row["judul"],
        "tag": row["tag"],
        "isi_berita": row["isi_berita"],
        "prediction": pred,
    })

df_out2 = pd.DataFrame(outputs2)
df_out2.to_csv("berita.csv", index=False, encoding="utf-8-sig")
print("✅ Hasil prediksi file2 sudah disimpan ke berita.csv")