# -*- coding: utf-8 -*-
"""Preparing.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
"""

# NOTE: the commands below are IPython/Colab shell escapes ("!..."). They are
# not valid Python, so they are kept here as comments; run them in a notebook
# cell (or the equivalent `pip` commands in a shell) before executing this
# script. The sequence is reproduced exactly as exported, including the
# repeated installs.
# !pip -q install -U transformers accelerate torch
# !pip install transformers
# !pip install --upgrade transformers
# !pip uninstall -y torch torchvision torchaudio transformers
# !pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers accelerate

import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from textblob import TextBlob
from transformers import AutoConfig, pipeline

# Show the datasets: print every CSV file found in the project folder.
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

try:
    # List every entry in the folder.
    files = os.listdir(folder_path)

    for file_name in files:
        # Only CSV files are of interest.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            # Load the CSV with pandas and print the resulting DataFrame.
            df = pd.read_csv(file_path)
            print(df)
            print("\n")  # Blank space between files.
        except Exception as e:
            # Best effort: one unreadable file must not stop the listing.
            print(f"Tidak dapat membaca file {file_name}. Error: {e}\n")

except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as e:
    print(f"Terjadi error: {e}")

# Social media
# 1. Process the Instagram data: load it and rename its columns to the shared
#    names ('profile', 'url') used by the rest of the script.
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
df_ig = pd.read_csv(ig_path)
df_ig = df_ig.rename(columns={
    'source_name': 'profile',
    'post_url': 'url',
})
# 2. Process the TikTok data: load it, rename its columns to the shared names
#    and drop the column that is not used downstream.
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
df_tiktok = pd.read_csv(tiktok_path)
df_tiktok = df_tiktok.rename(columns={
    'like_count': 'likes',
    'caption_short': 'caption',
    'profile_username': 'profile',
    'scrape_date': 'datetime',
})
df_tiktok = df_tiktok.drop(columns=['upload_date'])

# --- Column selection and dtype conversion (done BEFORE merging) ---
kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram frame: keep the shared columns, parse datetimes, tag the source.
df1_pilihan = df_ig[kolom_yang_dipilih].copy()
# Unparseable values become NaT instead of raising.
df1_pilihan['datetime'] = pd.to_datetime(df1_pilihan['datetime'], errors='coerce')
df1_pilihan['asal_dataset'] = 'Instagram'

# TikTok frame: same treatment.
df2_pilihan = df_tiktok[kolom_yang_dipilih].copy()
df2_pilihan['datetime'] = pd.to_datetime(df2_pilihan['datetime'], errors='coerce')
df2_pilihan['asal_dataset'] = 'Tiktok'

# --- Merge ---
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)

# --- Data cleaning (preprocessing) ---
# The datetime column is already converted, so continue with the others.
# Non-numeric like counts become 0.
df_gabungan['likes'] = pd.to_numeric(df_gabungan['likes'], errors='coerce').fillna(0).astype(int)


def clean_text(text):
    """Return *text* lower-cased with URLs and non-letter characters removed.

    Missing values (anything ``pd.isna`` reports as missing) yield an empty
    string. URLs are deleted, every character that is not an ASCII letter or
    whitespace is replaced by a space, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # Drop URLs first so their letters do not survive as stray words.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Replace digits, punctuation and non-ASCII characters with spaces.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# NOTE(review): the definition of `format_author` is truncated in this copy of
# the file -- its regular expression is cut off mid-pattern and the rest of
# the function is missing. The surviving fragment is preserved verbatim below
# and must be restored from the original notebook:
#
#   def format_author(text): formatted = re.sub(r'(?
# Every spelling the classifier may emit, mapped to the canonical Indonesian
# label used in the output columns.
_LABEL_ALIASES = {
    "positif": "positif",
    "positive": "positif",
    "negatif": "negatif",
    "negative": "negatif",
    "netral": "netral",
    "neutral": "netral",
}


# NOTE(review): the signature line of this function is missing from this copy
# of the file (only the trailing "str:" of it survives). It is reconstructed
# from the body, which reads `lbl`, and from the call sites, which pass one
# label string -- confirm against the original notebook.
# `config`, `clf` and `df_berita` are used below but are not defined in the
# visible part of the file; they are expected to be created beforehand.
def normalize_label(lbl: str) -> str:
    """Map a raw classifier label to 'positif', 'negatif' or 'netral'.

    Args:
        lbl: Label string produced by the classifier, e.g. ``'Positive'`` or
            ``'LABEL_2'``.

    Returns:
        The canonical lower-case label. Labels of the form ``LABEL_<n>`` are
        resolved through ``config.id2label`` and then canonicalised the same
        way; if that lookup fails, ``'netral'`` is returned. Any other
        unknown label is returned lower-cased and otherwise unchanged.
    """
    label = lbl.lower()
    if label in _LABEL_ALIASES:
        return _LABEL_ALIASES[label]

    # Fallback for generic 'LABEL_0/1/2' names.
    if "label_" in label:
        try:
            idx = int(label.split("_")[-1])
            resolved = config.id2label[idx].lower()
        except (ValueError, KeyError, IndexError, AttributeError):
            # Unparseable index or no usable id2label entry.
            return "netral"
        # The resolved name may be English ('positive'); canonicalise it too.
        return _LABEL_ALIASES.get(resolved, resolved)

    return label


def _labels_for(texts, preds):
    """Pair each text with its prediction and return canonical labels.

    Blank texts are labelled 'netral' regardless of the model output, because
    a prediction on an empty string carries no information.
    """
    return [
        normalize_label(pred['label']) if text.strip() else "netral"
        for text, pred in zip(texts, preds)
    ]


# Sentiment for CAPTION.
# truncation=True asks the tokenizer to cut over-long inputs rather than pass
# them to the model at full length.
texts_caption = df_gabungan['caption'].fillna("").astype(str).tolist()
preds_caption = clf(texts_caption, batch_size=64, truncation=True)
df_gabungan['sentimen_caption'] = _labels_for(texts_caption, preds_caption)

# Sentiment for COMMENTS.
texts_comments = df_gabungan['comments'].fillna("").astype(str).tolist()
preds_comments = clf(texts_comments, batch_size=64, truncation=True)
df_gabungan['sentimen_comments'] = _labels_for(texts_comments, preds_comments)


# (optional) build a combined sentiment column:
# when the caption is neutral/empty, take the sentiment from the comments.
def combine_sentiment(row):
    """Return the caption sentiment unless it is 'netral', else the comments'."""
    if row['sentimen_caption'] != "netral":
        return row['sentimen_caption']
    return row['sentimen_comments']


df_gabungan['sentimen'] = df_gabungan.apply(combine_sentiment, axis=1)
df_gabungan.to_csv('medsos2.csv', index=False)

# Same procedure applied to the news dataframe (uses the 'isi_berita' column).
texts_b = df_berita['isi_berita'].fillna("").astype(str).tolist()
preds_b = clf(texts_b, batch_size=64, truncation=True)
df_berita['sentimen'] = _labels_for(texts_b, preds_b)
df_berita.to_csv('berita2.csv', index=False)