File size: 8,749 Bytes
fa8ff66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# -*- coding: utf-8 -*-
"""Preparing.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
"""

# Environment setup. These are Colab shell magics (lines starting with "!"), not plain Python.
# NOTE(review): the sequence installs, upgrades, uninstalls and then reinstalls the same
# packages — presumably left over from troubleshooting; confirm which steps are still needed.
!pip -q install -U transformers accelerate torch
!pip install transformers
!pip install --upgrade transformers
!pip uninstall -y torch torchvision torchaudio transformers
# NOTE(review): this command points pip at the PyTorch cu118 wheel index via --index-url;
# verify that every package listed here (including transformers) can be resolved from it.
!pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
!pip install transformers accelerate

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
import re

from textblob import TextBlob
from transformers import AutoConfig, pipeline

# Preview every CSV dataset stored in the project folder
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

try:
    # List everything in the folder; a missing folder is reported below.
    files = os.listdir(folder_path)

    for file_name in files:
        # Only CSV files are of interest; skip everything else.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            # Load the file with pandas and print the resulting DataFrame,
            # followed by a blank separator between files.
            df = pd.read_csv(file_path)
            print(df)
            print("\n")
        except Exception as e:
            # A single unreadable file must not stop the preview of the others.
            print(f"Tidak dapat membaca file {file_name}. Error: {e}\n")

except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as e:
    print(f"Terjadi error: {e}")

# Social media

# 1. Instagram: load the export and rename its columns to the shared names
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
df_ig = pd.read_csv(ig_path).rename(
    columns={'source_name': 'profile', 'post_url': 'url'}
)

# 2. TikTok: load the export, rename its columns to the shared names
#    (scrape_date becomes the datetime column) and drop upload_date
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
df_tiktok = (
    pd.read_csv(tiktok_path)
    .rename(columns={
        'like_count': 'likes',
        'caption_short': 'caption',
        'profile_username': 'profile',
        'scrape_date': 'datetime',
    })
    .drop(columns=['upload_date'])
)

# --- Column selection and dtype conversion (done BEFORE concatenation) ---

kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram subset: parse datetime (unparseable values become NaT) and tag the source
df1_pilihan = df_ig[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Instagram',
)

# TikTok subset: same treatment
df2_pilihan = df_tiktok[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Tiktok',
)

# --- Concatenation ---
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)

# --- Data cleaning (preprocessing) ---

# datetime is already converted; likes become integers, with non-numeric values counted as 0
df_gabungan['likes'] = (
    pd.to_numeric(df_gabungan['likes'], errors='coerce')
    .fillna(0)
    .astype(int)
)

def clean_text(text):
    """Reduce a value to lowercase ASCII letters separated by single spaces.

    Values that ``pd.isna`` reports as missing yield ``""``. Anything else is
    stringified and lowercased, URL-like tokens are removed, every character
    that is neither an ASCII letter nor whitespace is replaced by a space,
    and whitespace runs are collapsed and trimmed.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_urls)
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()

def format_author(text):
    """Put ", " in front of each word-initial "author" except one at the very start.

    The argument is converted with ``str`` before substitution, so non-string
    values are formatted from their string form.
    """
    return re.sub(r'(?<!^)\bauthor', r', author', str(text))

# Normalise the free-text columns.
df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)

# Remove the standalone "br" token (presumably what is left of "<br>" markup once
# clean_text has replaced punctuation with spaces). Only the whole word is removed:
# a plain substring replace would also delete "br" inside real words such as
# "februari". Whitespace is re-collapsed afterwards so no double spaces remain.
df_gabungan['caption'] = (
    df_gabungan['caption']
    .str.replace(r'\bbr\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df_gabungan['comments'] = df_gabungan['comments'].str.replace(r'replies', '', regex=True)
df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)

# Drop rows with a missing datetime/caption, then exact duplicate rows.
# NOTE(review): clean_text returns "" for missing text, so the caption check here
# only matters if captions can become NaN some other way — confirm the intent.
df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
df_gabungan = df_gabungan.drop_duplicates()

# --- FINAL RESULT ---
print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")
print(f"Total baris Instagram: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Instagram'])}")
print(f"Total baris TikTok: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Tiktok'])}")
df_gabungan.info()

# Save the merged data to a new CSV file
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
df_gabungan.to_csv(save_path, index=False)
print(f"\nData berhasil disimpan di: {save_path}")

# News articles

df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')

# Lowercase the tags and remove the literal ", nan" fragment
df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)

# Keep only rows whose tag contains "cirebon"; rows with a missing tag are excluded (na=False).
# (This filter used to be executed twice in a row; once is enough.)
df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()

# dropna() without a subset removes rows that have a missing value in ANY column.
df_berita = df_berita_filtered.dropna().drop_duplicates()
df_berita = df_berita.dropna(subset=['isi_berita', 'tag'])

# Parse dates (unparseable values become NaT) and normalise the text columns
df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
df_berita['tag'] = df_berita['tag'].apply(clean_text)
df_berita['judul'] = df_berita['judul'].apply(clean_text)
df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()

# Exclude rows whose tag matches any of the listed keywords. The explicit copy keeps
# the column assignment below from operating on a slice of the previous frame.
df_berita = df_berita[~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)].copy()

# Strip boilerplate fragments from the article body; the patterns are applied in order.
df_berita['isi_berita'] = (
    df_berita['isi_berita']
    .str.replace(r'(?i)scroll.*?content', '', regex=True)
    .str.replace(r'(?i)h3:', '', regex=True)
    .str.replace(r'(?i)tonton.*?20detik\]', '', regex=True)
    .str.replace(r'(?i)editor.*?antara', '', regex=True)
    .str.replace(r'(?i)pewarta.*?antara', '', regex=True)
    .str.replace(r'(?i)copyright.*?(antara|com)', '', regex=True)
    .str.replace(r'(?i)dilarang.*?antara', '', regex=True)
    .str.replace(r'(?i)advertisement', '', regex=True)
    .str.replace(r'(?i)baca (juga )?[^.]+sini\.?', '', regex=True)
    .str.replace(r'(?i)\bradar\b.*?-', '', regex=True)
    .str.replace(r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*', '', regex=True)
    .str.replace(r'(?i)cek.*?(sumber:|reportase)', '', regex=True)
)

# Final pass: remove duplicates and rows missing body, tag or a parseable date
df_berita = df_berita.drop_duplicates()
df_berita = df_berita.dropna(subset=['isi_berita', 'tag', 'tanggal'])

print(df_berita)

save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
df_berita.to_csv(save_path, index=False)

MODEL_ID = "taufiqdp/indonesian-sentiment"  # IndoBERT fine-tuned (3 classes)

# (optional) if a Hugging Face token is needed for a private repo:
# from huggingface_hub import login
# login("hf_xxx")  # your token

# The model config is loaded separately because normalize_label reads config.id2label.
config = AutoConfig.from_pretrained(MODEL_ID)

# Text-classification pipeline on the first GPU when CUDA is available, otherwise CPU (-1).
# Inputs are truncated to 256 tokens. The deprecated `return_all_scores=False` argument
# is no longer passed: returning only the top label per input is the pipeline default.
clf = pipeline(
    task="text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    max_length=256,
)

def normalize_label(lbl: str) -> str:
    """Map a classifier label onto the canonical names "positif"/"negatif"/"netral".

    Args:
        lbl: Label string as returned by the classifier (any letter case).

    Returns:
        The canonical Indonesian label when ``lbl`` is a known Indonesian or
        English alias. A generic ``LABEL_<n>`` value is resolved through
        ``config.id2label`` and normalised the same way; if the index cannot
        be parsed or looked up, "netral" is returned. Any other label is
        returned lowercased.
    """
    canonical = {
        "positif": "positif",
        "positive": "positif",
        "negatif": "negatif",
        "negative": "negatif",
        "netral": "netral",
        "neutral": "netral",
    }

    label = lbl.lower()
    if label in canonical:
        return canonical[label]

    # Fallback for the generic 'LABEL_0/1/2' format.
    if "label_" in label:
        try:
            idx = int(label.split("_")[-1])
            mapped = str(config.id2label[idx]).lower()
        except (ValueError, LookupError):
            # Unparseable index or index missing from id2label: stay neutral.
            return "netral"
        # The mapped name may itself be an English alias; normalise it as well.
        return canonical.get(mapped, mapped)

    return label

# Sentiment for CAPTION: classify every caption (missing values become "")
texts_caption = list(df_gabungan['caption'].fillna("").astype(str))
preds_caption = clf(texts_caption, batch_size=64)
df_gabungan['sentimen_caption'] = [
    normalize_label(prediction['label']) for prediction in preds_caption
]

# Sentiment for COMMENTS: same procedure on the comments column
texts_comments = list(df_gabungan['comments'].fillna("").astype(str))
preds_comments = clf(texts_comments, batch_size=64)
df_gabungan['sentimen_comments'] = [
    normalize_label(prediction['label']) for prediction in preds_comments
]

# (optional) combined sentiment column:
# the caption sentiment is used unless it is "netral", in which case the comments decide
def combine_sentiment(row):
    """Return the caption sentiment, or the comments sentiment when the caption is "netral".

    ``row`` is indexed with the keys 'sentimen_caption' and 'sentimen_comments'.
    """
    caption_is_neutral = row['sentimen_caption'] == "netral"
    return row['sentimen_comments'] if caption_is_neutral else row['sentimen_caption']
# Combined sentiment per row, then export the social-media results
df_gabungan['sentimen'] = df_gabungan.apply(combine_sentiment, axis='columns')

df_gabungan.to_csv('medsos2.csv', index=False)


# News sentiment: classify the article body (isi_berita), missing values become ""
texts_b = list(df_berita['isi_berita'].fillna("").astype(str))
preds_b = clf(texts_b, batch_size=64)
df_berita['sentimen'] = [
    normalize_label(prediction['label']) for prediction in preds_b
]

df_berita.to_csv('berita2.csv', index=False)