# Sentiment / preparing.py
# NzTama's picture
# Initial clean deploy: Sentiment Analysis
# fa8ff66
# -*- coding: utf-8 -*-
"""Preparing.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
"""
# Colab shell magics (IPython `!` syntax, not plain Python) that set up the runtime.
# NOTE(review): these steps overlap -- transformers is installed or upgraded by
# several lines, and torch is uninstalled and then reinstalled from the CUDA 11.8
# wheel index. Presumably left over from interactive troubleshooting; confirm which
# lines are actually required before trimming.
!pip -q install -U transformers accelerate torch
!pip install transformers
!pip install --upgrade transformers
!pip uninstall -y torch torchvision torchaudio transformers
!pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
!pip install transformers accelerate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
import re
from textblob import TextBlob
from transformers import AutoConfig, pipeline
# Preview every CSV dataset stored in the project folder.
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

try:
    # List everything in the folder; a missing folder is reported below.
    files = os.listdir(folder_path)

    for file_name in files:
        # Only CSV files are of interest; skip anything else.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            # Load the CSV with pandas and show it, followed by a spacer line.
            df = pd.read_csv(file_path)
            print(df)
            print("\n")
        except Exception as read_error:
            # One unreadable file must not stop the preview of the others.
            print(f"Tidak dapat membaca file {file_name}. Error: {read_error}\n")

except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as unexpected_error:
    print(f"Terjadi error: {unexpected_error}")
# --- Social media ---

# 1. Instagram export: rename its columns to the shared schema.
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
df_ig = pd.read_csv(ig_path).rename(
    columns={'source_name': 'profile', 'post_url': 'url'}
)

# 2. TikTok export: rename to the shared schema, then discard 'upload_date'
#    (the scrape date is the column that becomes 'datetime').
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
df_tiktok = (
    pd.read_csv(tiktok_path)
    .rename(
        columns={
            'like_count': 'likes',
            'caption_short': 'caption',
            'profile_username': 'profile',
            'scrape_date': 'datetime',
        }
    )
    .drop(columns=['upload_date'])
)

# --- Column selection and dtype conversion (done BEFORE concatenation) ---
kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram subset: parse timestamps (unparseable values become NaT) and tag the origin.
df1_pilihan = df_ig[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Instagram',
)

# TikTok subset: same treatment.
df2_pilihan = df_tiktok[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Tiktok',
)

# --- Concatenation ---
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)

# --- Preprocessing ---
# 'datetime' is already converted; coerce 'likes' to integers, with
# non-numeric or missing values counted as 0.
likes_numeric = pd.to_numeric(df_gabungan['likes'], errors='coerce')
df_gabungan['likes'] = likes_numeric.fillna(0).astype(int)
# Patterns used by clean_text(), compiled once at import time.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Return a lower-cased, letters-only version of *text*.

    Missing values (None / NaN) become an empty string. URLs are removed,
    every character that is not an ASCII letter or whitespace is replaced
    by a space, and runs of whitespace are collapsed to a single space.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    without_urls = _URL_PATTERN.sub('', lowered)
    letters_only = _NON_LETTER_PATTERN.sub(' ', without_urls)
    return _WHITESPACE_PATTERN.sub(' ', letters_only).strip()
# Matches the word-initial token "author" anywhere except at the very start
# of the string.
_AUTHOR_PATTERN = re.compile(r'(?<!^)\bauthor')


def format_author(text):
    """Insert ", " before every non-leading word that starts with "author".

    The input is converted with str() first, so non-string values are
    formatted from their string representation.
    """
    return _AUTHOR_PATTERN.sub(r', author', str(text))
# Normalise both free-text columns (lower-case, letters only, no URLs).
df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)

# Remove the standalone token "br" (presumably what is left of "<br>" tags once
# clean_text() has replaced the angle brackets with spaces). A plain substring
# replace would also delete "br" inside ordinary words such as "brebes" or
# "februari", so match whole words only and re-collapse the whitespace.
df_gabungan['caption'] = (
    df_gabungan['caption']
    .str.replace(r'\bbr\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Strip the "replies" marker from the comment text, then separate the
# individual "author ..." entries with commas.
df_gabungan['comments'] = df_gabungan['comments'].str.replace(r'replies', '', regex=True)
df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)

# Drop rows with missing values, then exact duplicates.
# NOTE: clean_text() returns "" for missing captions, so in practice only rows
# whose datetime could not be parsed are removed here; empty captions are kept.
df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
df_gabungan = df_gabungan.drop_duplicates()

# --- Final result ---
print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")
print(f"Total baris Instagram: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Instagram'])}")
print(f"Total baris TikTok: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Tiktok'])}")
df_gabungan.info()

# Save the merged dataset to a new CSV file.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
df_gabungan.to_csv(save_path, index=False)
print(f"\nData berhasil disimpan di: {save_path}")
# --- News articles ---
df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')

# Lower-case the tags and remove the literal ", nan" residue.
df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)

# Keep only the articles tagged with "cirebon" (rows without a tag are excluded).
# The explicit copy makes the later column assignments operate on an
# independent frame instead of a slice of the raw data.
df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()

# dropna() without a subset removes every row that has a missing value in any
# column, so no separate check on 'isi_berita' / 'tag' is needed afterwards.
df_berita = df_berita_filtered.dropna().drop_duplicates().copy()

# Parse dates (unparseable values become NaT) and normalise the text columns.
df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
df_berita['tag'] = df_berita['tag'].apply(clean_text)
df_berita['judul'] = df_berita['judul'].apply(clean_text)
df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()

# Exclude articles whose tags match the promotional / off-topic keywords.
df_berita = df_berita[
    ~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)
].copy()

# Strip site boilerplate (navigation hints, bylines, copyright notices,
# "read also" links, datelines) from the article body.
df_berita['isi_berita'] = (
    df_berita['isi_berita']
    .str.replace(r'(?i)scroll.*?content', '', regex=True)
    .str.replace(r'(?i)h3:', '', regex=True)
    .str.replace(r'(?i)tonton.*?20detik\]', '', regex=True)
    .str.replace(r'(?i)editor.*?antara', '', regex=True)
    .str.replace(r'(?i)pewarta.*?antara', '', regex=True)
    .str.replace(r'(?i)copyright.*?(antara|com)', '', regex=True)
    .str.replace(r'(?i)dilarang.*?antara', '', regex=True)
    .str.replace(r'(?i)advertisement', '', regex=True)
    .str.replace(r'(?i)baca (juga )?[^.]+sini\.?', '', regex=True)
    .str.replace(r'(?i)\bradar\b.*?-', '', regex=True)
    .str.replace(r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*', '', regex=True)
    .str.replace(r'(?i)cek.*?(sumber:|reportase)', '', regex=True)
)

# Cleaning can make rows identical and date parsing can introduce NaT, so
# deduplicate again and drop rows that lack body, tag or date.
df_berita = (
    df_berita
    .drop_duplicates()
    .dropna(subset=['isi_berita', 'tag', 'tanggal'])
    .copy()
)
print(df_berita)

save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
df_berita.to_csv(save_path, index=False)
# Fine-tuned IndoBERT sentiment model (3 classes).
MODEL_ID = "taufiqdp/indonesian-sentiment"

# Optional: if a Hugging Face token is needed for a private repo:
# from huggingface_hub import login
# login("hf_xxx")  # your token

# The model config is kept around because normalize_label() reads its id2label map.
config = AutoConfig.from_pretrained(MODEL_ID)

# Run on the first CUDA device when one is available, otherwise on the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Text-classification pipeline returning a single best label per input;
# inputs are truncated to 256 tokens.
clf = pipeline(
    "text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    device=pipeline_device,
    truncation=True,
    max_length=256,
    return_all_scores=False,
)
# Canonical (Indonesian) label for every spelling the model may emit.
_LABEL_SYNONYMS = {
    "positif": "positif",
    "positive": "positif",
    "negatif": "negatif",
    "negative": "negatif",
    "netral": "netral",
    "neutral": "netral",
}


def normalize_label(lbl: str) -> str:
    """Map a raw classifier label to "positif", "negatif" or "netral".

    Args:
        lbl: Label string as returned by the pipeline, in any letter case.

    Returns:
        The canonical Indonesian label when the input (or the name that a
        generic ``LABEL_<n>`` resolves to through ``config.id2label``) is a
        known spelling. A ``LABEL_<n>`` that cannot be resolved yields
        "netral". Any other label is returned lower-cased and unchanged.
    """
    l = lbl.lower()

    canonical = _LABEL_SYNONYMS.get(l)
    if canonical is not None:
        return canonical

    # Fallback for the generic 'LABEL_0/1/2' format.
    if "label_" in l:
        try:
            idx = int(l.split("_")[-1])
            mapped = str(config.id2label[idx]).lower()
        except (ValueError, KeyError, IndexError, TypeError):
            # Non-numeric suffix or an index the model config does not know.
            return "netral"
        # The config may use English names; canonicalise those as well.
        return _LABEL_SYNONYMS.get(mapped, mapped)

    return l
# Sentiment of each post CAPTION: classify in batches of 64, then store the
# normalised label of every prediction.
caption_series = df_gabungan['caption'].fillna("").astype(str)
texts_caption = caption_series.tolist()
preds_caption = clf(texts_caption, batch_size=64)
df_gabungan['sentimen_caption'] = [
    normalize_label(prediction['label']) for prediction in preds_caption
]

# Sentiment of each post's COMMENTS, handled the same way.
comments_series = df_gabungan['comments'].fillna("").astype(str)
texts_comments = comments_series.tolist()
preds_comments = clf(texts_comments, batch_size=64)
df_gabungan['sentimen_comments'] = [
    normalize_label(prediction['label']) for prediction in preds_comments
]
# (optional) build a combined sentiment column:
# if the caption is neutral or empty, take the sentiment from the comments.
def combine_sentiment(row):
    """Return the overall sentiment for one post.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing
            'sentimen_caption' and 'sentimen_comments', and optionally the
            cleaned 'caption' text.

    Returns:
        The caption sentiment when the caption has text and is not "netral";
        otherwise the comments sentiment. When no 'caption' value is
        available the caption sentiment is trusted as-is.
    """
    caption = row.get('caption')
    # An empty caption carries no signal, whatever label the model gave it.
    caption_is_empty = caption is not None and not str(caption).strip()

    if row['sentimen_caption'] != "netral" and not caption_is_empty:
        return row['sentimen_caption']
    return row['sentimen_comments']
# Combined sentiment per social-media post, exported next to the script.
combined_labels = df_gabungan.apply(combine_sentiment, axis=1)
df_gabungan['sentimen'] = combined_labels
df_gabungan.to_csv('medsos2.csv', index=False)

# Same classification for the news dataframe, using the article body
# ('isi_berita') as input.
berita_series = df_berita['isi_berita'].fillna("").astype(str)
texts_b = berita_series.tolist()
preds_b = clf(texts_b, batch_size=64)
df_berita['sentimen'] = [
    normalize_label(prediction['label']) for prediction in preds_b
]
df_berita.to_csv('berita2.csv', index=False)