# -*- coding: utf-8 -*-
"""Preparing.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
"""
!pip -q install -U transformers accelerate torch
!pip install transformers
!pip install --upgrade transformers
!pip uninstall -y torch torchvision torchaudio transformers
!pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
!pip install transformers accelerate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
import re
from textblob import TextBlob
from transformers import AutoConfig, pipeline
# Display the datasets: print every CSV file found in the project folder.
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

try:
    # Names of all entries in the folder.
    files = os.listdir(folder_path)

    for file_name in files:
        # Only CSV files are of interest; skip everything else.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            # Load the CSV with pandas and print the resulting frame.
            df = pd.read_csv(file_path)
            print(df)
            print("\n")  # Spacer between files.
        except Exception as read_error:
            # Best effort: report the unreadable file and move on to the next.
            print(f"Tidak dapat membaca file {file_name}. Error: {read_error}\n")
except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as error:
    print(f"Terjadi error: {error}")
# Social media
# 1. Load the Instagram export and rename its columns to the common names.
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
ig_column_map = {
    'source_name': 'profile',
    'post_url': 'url',
}
df_ig = pd.read_csv(ig_path).rename(columns=ig_column_map)

# 2. Load the TikTok export, rename its columns to the common names and
#    discard 'upload_date' ('scrape_date' is what becomes 'datetime').
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
tiktok_column_map = {
    'like_count': 'likes',
    'caption_short': 'caption',
    'profile_username': 'profile',
    'scrape_date': 'datetime',
}
df_tiktok = (
    pd.read_csv(tiktok_path)
    .rename(columns=tiktok_column_map)
    .drop(columns=['upload_date'])
)
# --- Column selection and dtype conversion (done BEFORE concatenation) ---
kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram subset: parse 'datetime' (unparseable values become NaT) and tag
# every row with its source.
df1_pilihan = df_ig[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Instagram',
)

# TikTok subset: same treatment.
df2_pilihan = df_tiktok[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Tiktok',
)

# --- Concatenation ---
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)

# --- Data cleaning (preprocessing) ---
# 'datetime' is already converted; make 'likes' an integer column, counting
# anything non-numeric or missing as 0.
likes_numeric = pd.to_numeric(df_gabungan['likes'], errors='coerce')
df_gabungan['likes'] = likes_numeric.fillna(0).astype(int)
def clean_text(text):
    """Return *text* lower-cased with URLs and non-letter characters removed.

    A missing value (as reported by ``pd.isna``) yields ``""``. Any other
    input is converted with ``str`` first. Every character that is not an
    ASCII letter or whitespace is replaced by a space, runs of whitespace
    are collapsed to a single space, and the result is stripped.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),  # drop URLs
        (r'[^a-zA-Z\s]', ' '),             # non-letters -> space
        (r'\s+', ' '),                     # collapse whitespace
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def format_author(text):
    """Put ``", "`` in front of each "author" that starts at a word boundary.

    An occurrence at the very beginning of the string is left untouched.
    *text* is converted with ``str`` before matching.
    """
    author_marker = re.compile(r'(?<!^)\bauthor')
    return author_marker.sub(r', author', str(text))
# Normalise the free-text columns (lower-case, no URLs, letters only).
df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)

# Remove the standalone "br" token only. Presumably it is what is left of
# "<br>" markup once clean_text has replaced the angle brackets with spaces.
# A plain substring replace would also eat the "br" inside ordinary words
# (e.g. "februari" -> "feuari"), so match on word boundaries and then tidy up
# the spaces the removal leaves behind.
df_gabungan['caption'] = (
    df_gabungan['caption']
    .str.replace(r'\bbr\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Drop the literal "replies" from the comment text, then separate the
# individual "author ..." entries with commas.
df_gabungan['comments'] = df_gabungan['comments'].str.replace(r'replies', '', regex=True)
df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)

# Drop incomplete rows and exact duplicates last.
# NOTE(review): clean_text returns "" for missing input, so 'caption' holds no
# NaN at this point; effectively only rows with a missing/unparseable
# 'datetime' are removed here. Confirm whether empty captions should go too.
df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
df_gabungan = df_gabungan.drop_duplicates()
# --- Final result ---
print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")

# Row counts per source platform.
asal = df_gabungan['asal_dataset']
total_instagram = int((asal == 'Instagram').sum())
total_tiktok = int((asal == 'Tiktok').sum())
print(f"Total baris Instagram: {total_instagram}")
print(f"Total baris TikTok: {total_tiktok}")

df_gabungan.info()

# Save the merged frame to a new CSV file.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
df_gabungan.to_csv(save_path, index=False)
print(f"\nData berhasil disimpan di: {save_path}")
# News articles
df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')

# Lower-case the tags and strip the literal ", nan" residue from them.
df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)

# Keep only the articles whose tag mentions "cirebon" (rows without a tag are
# excluded via na=False). This filter used to be executed twice in a row.
df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()

# dropna() without a subset removes rows with a missing value in ANY column,
# so no separate dropna on 'isi_berita'/'tag' is needed afterwards.
df_berita = df_berita_filtered.dropna().drop_duplicates()

# Parse dates (unparseable values become NaT) and normalise the text columns.
df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
df_berita['tag'] = df_berita['tag'].apply(clean_text)
df_berita['judul'] = df_berita['judul'].apply(clean_text)
df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()

# Exclude articles whose tag contains one of the unwanted keywords. The
# explicit copy() makes the result an independent frame, so assigning columns
# on it later cannot trigger pandas' SettingWithCopyWarning.
df_berita = df_berita[
    ~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)
].copy()
# Regex patterns for boilerplate fragments in the article body. Each one is
# deleted from 'isi_berita', in exactly this order.
boilerplate_patterns = [
    r'(?i)scroll.*?content',
    r'(?i)h3:',
    r'(?i)tonton.*?20detik\]',
    r'(?i)editor.*?antara',
    r'(?i)pewarta.*?antara',
    r'(?i)copyright.*?(antara|com)',
    r'(?i)dilarang.*?antara',
    r'(?i)advertisement',
    r'(?i)baca (juga )?[^.]+sini\.?',
    r'(?i)\bradar\b.*?-',
    r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*',
    r'(?i)cek.*?(sumber:|reportase)',
]

isi_bersih = df_berita['isi_berita']
for pattern in boilerplate_patterns:
    isi_bersih = isi_bersih.str.replace(pattern, '', regex=True)
df_berita['isi_berita'] = isi_bersih

# Remove exact duplicates, then rows still missing body, tag or date.
df_berita = df_berita.drop_duplicates()
df_berita = df_berita.dropna(subset=['isi_berita', 'tag', 'tanggal'])
print(df_berita)

# Save the cleaned news frame.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
df_berita.to_csv(save_path, index=False)
MODEL_ID = "taufiqdp/indonesian-sentiment"  # IndoBERT fine-tuned (3 classes)

# (optional) if you need a Hugging Face token for a private repo:
# from huggingface_hub import login
# login("hf_xxx")  # your token

# The model config is kept around so numeric 'LABEL_n' outputs can be mapped
# back to label names through config.id2label.
config = AutoConfig.from_pretrained(MODEL_ID)

# Text-classification pipeline: runs on the first GPU when CUDA is available,
# otherwise on CPU (-1); inputs are truncated to 256 tokens.
# The deprecated `return_all_scores=False` argument is intentionally not
# passed: returning only the top label per input is already the default.
clf = pipeline(
    task="text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    max_length=256,
)
def normalize_label(lbl: str) -> str:
    """Map a classifier label onto "positif", "negatif" or "netral".

    Indonesian and English spellings are accepted case-insensitively. Labels
    of the form ``LABEL_<n>`` are resolved through the module-level
    ``config.id2label`` and the resolved name is normalised the same way.

    Args:
        lbl: Raw label string produced by the classifier.

    Returns:
        The canonical label; ``"netral"`` when a ``LABEL_<n>`` value cannot
        be resolved; otherwise the lower-cased input when it is not a known
        spelling.
    """
    canonical = {
        "positif": "positif",
        "positive": "positif",
        "negatif": "negatif",
        "negative": "negatif",
        "netral": "netral",
        "neutral": "netral",
    }

    key = lbl.strip().lower()
    if key in canonical:
        return canonical[key]

    # Fallback for the generic 'LABEL_0/1/2' format.
    if "label_" in key:
        try:
            idx = int(key.split("_")[-1])
            resolved = config.id2label[idx].lower()
        except (ValueError, KeyError, IndexError, TypeError, AttributeError):
            # Non-numeric suffix, unknown index or non-string label name.
            return "netral"
        # Run the resolved name through the same mapping so that e.g.
        # "Positive" ends up as "positif" rather than "positive".
        return canonical.get(resolved, resolved)

    return key
# Sentiment for the CAPTION column: missing values become "", everything is
# cast to str, and the texts are classified in batches of 64.
caption_series = df_gabungan['caption'].fillna("").astype(str)
texts_caption = caption_series.tolist()
preds_caption = clf(texts_caption, batch_size=64)
df_gabungan['sentimen_caption'] = list(
    map(normalize_label, (pred['label'] for pred in preds_caption))
)

# Sentiment for the COMMENTS column: same procedure.
comments_series = df_gabungan['comments'].fillna("").astype(str)
texts_comments = comments_series.tolist()
preds_comments = clf(texts_comments, batch_size=64)
df_gabungan['sentimen_comments'] = list(
    map(normalize_label, (pred['label'] for pred in preds_comments))
)
# (opsional) buat kolom sentimen gabungan
# kalau caption netral/empty, ambil dari comments
def combine_sentiment(row):
    """Return the caption sentiment, or the comments sentiment if it is "netral".

    *row* must support item access for 'sentimen_caption' and
    'sentimen_comments'.
    """
    caption_sentiment = row['sentimen_caption']
    return row['sentimen_comments'] if caption_sentiment == "netral" else caption_sentiment
# Combined sentiment per social-media row, then save the frame to
# 'medsos2.csv' (relative path, i.e. the current working directory).
df_gabungan['sentimen'] = df_gabungan.apply(combine_sentiment, axis=1)
df_gabungan.to_csv('medsos2.csv', index=False)

# Classify the news articles by their body text ('isi_berita'): missing values
# become "", everything is cast to str, and the texts go through the
# classifier in batches of 64.
texts_b = df_berita['isi_berita'].fillna("").astype(str).tolist()
preds_b = clf(texts_b, batch_size=64)
df_berita['sentimen'] = [normalize_label(p['label']) for p in preds_b]

# Save the labelled news frame to 'berita2.csv' (relative path as well).
df_berita.to_csv('berita2.csv', index=False)