| |
| """Preparing.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg |
| """ |
|
|
# Environment setup (Colab shell magics): installs the packages that the
# inference section of this notebook imports (torch, transformers, accelerate).
# NOTE(review): the same packages are installed, upgraded, uninstalled and then
# reinstalled; only the final state matters. Presumably a single install step
# would be enough -- confirm on a fresh runtime before trimming.
# NOTE(review): the cu118 line passes --index-url (which replaces PyPI) for all
# four packages, including transformers -- verify that the PyTorch wheel index
# actually serves transformers, otherwise that command cannot resolve.
| !pip -q install -U transformers accelerate torch |
| !pip install transformers |
| !pip install --upgrade transformers |
| !pip uninstall -y torch torchvision torchaudio transformers |
| !pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118 |
| !pip install transformers accelerate |
|
|
| import pandas as pd |
| import numpy as np |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
| import os |
| import torch |
| import re |
|
|
| from textblob import TextBlob |
| from transformers import AutoConfig, pipeline |
|
|
| |
# Folder on the mounted Drive that holds the scraped CSV exports.
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

# Preview pass: print every CSV found in the folder. A file that cannot be
# parsed is reported and skipped so the remaining files are still shown.
try:
    files = os.listdir(folder_path)

    for file_name in files:
        # Anything that is not a CSV export is ignored.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            df = pd.read_csv(file_path)
            print(df)
            print("\n")
        except Exception as read_error:
            print(f"Tidak dapat membaca file {file_name}. Error: {read_error}\n")

except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as listing_error:
    print(f"Terjadi error: {listing_error}")
|
|
| |
|
|
| |
# Instagram export: align its column names with the shared schema used below.
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
nama_kolom_ig = {
    'source_name': 'profile',
    'post_url': 'url',
}
df_ig = pd.read_csv(ig_path).rename(columns=nama_kolom_ig)

# TikTok export: same alignment. 'scrape_date' becomes the 'datetime' column
# and 'upload_date' is discarded.
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
nama_kolom_tiktok = {
    'like_count': 'likes',
    'caption_short': 'caption',
    'profile_username': 'profile',
    'scrape_date': 'datetime',
}
df_tiktok = (
    pd.read_csv(tiktok_path)
    .rename(columns=nama_kolom_tiktok)
    .drop(columns=['upload_date'])
)
|
|
| |
|
|
# Columns shared by both platforms after renaming.
kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram subset: parse timestamps (unparseable values become NaT) and tag
# every row with its platform of origin.
df1_pilihan = df_ig[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Instagram',
)

# TikTok subset: identical treatment.
df2_pilihan = df_tiktok[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Tiktok',
)

# Stack both platforms into one frame with a fresh 0..n-1 index.
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)

# Likes must be integers: non-numeric values are coerced to NaN and then to 0.
likes_numerik = pd.to_numeric(df_gabungan['likes'], errors='coerce')
df_gabungan['likes'] = likes_numerik.fillna(0).astype(int)
|
|
def clean_text(text):
    """Return *text* lower-cased with links and non-letter characters removed.

    Missing values (``None``/NaN) yield an empty string. Every character that
    is neither an ASCII letter nor whitespace is turned into a space, and
    runs of whitespace are collapsed to single spaces with the ends trimmed.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # Drop links first so their letters do not survive as stray words.
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_links)
    # split()/join collapses whitespace runs and trims both ends in one step.
    return ' '.join(letters_only.split())
|
|
def format_author(text):
    """Insert ``", "`` before every word starting with "author" except at position 0.

    *text* is converted with ``str()`` first, so non-string input is accepted.
    """
    # (?<!^) rejects a match at the very start of the string; \b makes sure
    # "author" begins a word, so e.g. "coauthor" is left untouched.
    author_marker = re.compile(r'(?<!^)\bauthor')
    return author_marker.sub(r', author', str(text))
|
|
# Normalise the free-text columns (lower-case, links and non-letters removed).
df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)

# clean_text replaces punctuation with spaces, so a leftover "<br>" tag
# (presumably what this step targets) becomes a stand-alone "br" token.
# Remove only that token: a plain substring replace would also cut "br" out
# of real words such as "brebes" or "februari". Afterwards collapse the gap
# that the removed token leaves behind.
df_gabungan['caption'] = (
    df_gabungan['caption']
    .str.replace(r'\bbr\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Strip the "replies" marker from comments, then separate consecutive
# "author ..." entries with a comma.
df_gabungan['comments'] = df_gabungan['comments'].str.replace(r'replies', '', regex=True)
df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)

# Rows without a parseable datetime are dropped.
# NOTE(review): clean_text returns "" for missing values, so 'caption' is never
# NaN at this point and rows with an empty caption are kept -- confirm that is
# the intended behaviour.
df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
df_gabungan = df_gabungan.drop_duplicates()

# Summary of the cleaned, merged dataset.
print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")
print(f"Total baris Instagram: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Instagram'])}")
print(f"Total baris TikTok: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Tiktok'])}")
df_gabungan.info()

# Persist the merged dataset next to the raw exports on Drive.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
df_gabungan.to_csv(save_path, index=False)
print(f"\nData berhasil disimpan di: {save_path}")
|
|
| |
|
|
# News articles scraped into a single CSV.
df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')

# Lower-case the tags and strip the literal ", nan" residue.
df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)

# Keep only articles tagged with "cirebon" (this filter used to be computed
# twice in a row; once is enough).
df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()

# dropna() without a subset already removes rows with a missing value in ANY
# column, so a follow-up dropna on 'isi_berita'/'tag' would be a no-op.
df_berita = df_berita_filtered.dropna().drop_duplicates()

# Parse dates (invalid values become NaT) and normalise the text columns.
df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
df_berita['tag'] = df_berita['tag'].apply(clean_text)
df_berita['judul'] = df_berita['judul'].apply(clean_text)
df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()

# Exclude articles whose tags match the off-topic keyword list. The explicit
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the previous frame.
df_berita = df_berita[
    ~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)
].copy()

# Boilerplate fragments removed from the article body, applied in this order
# (a later pattern sees the output of the earlier ones).
pola_boilerplate = (
    r'(?i)scroll.*?content',
    r'(?i)h3:',
    r'(?i)tonton.*?20detik\]',
    r'(?i)editor.*?antara',
    r'(?i)pewarta.*?antara',
    r'(?i)copyright.*?(antara|com)',
    r'(?i)dilarang.*?antara',
    r'(?i)advertisement',
    r'(?i)baca (juga )?[^.]+sini\.?',
    r'(?i)\bradar\b.*?-',
    r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*',
    r'(?i)cek.*?(sumber:|reportase)',
)
isi_bersih = df_berita['isi_berita']
for pola in pola_boilerplate:
    isi_bersih = isi_bersih.str.replace(pola, '', regex=True)
df_berita['isi_berita'] = isi_bersih

# Cleaning can make rows identical; deduplicate again and require the fields
# used downstream (body, tag and a valid date).
df_berita = df_berita.drop_duplicates()
df_berita = df_berita.dropna(subset=['isi_berita', 'tag', 'tanggal'])

print(df_berita)

# Persist the cleaned news dataset on Drive.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
df_berita.to_csv(save_path, index=False)
|
|
# Hugging Face model used for every sentiment prediction in this script.
MODEL_ID = "taufiqdp/indonesian-sentiment"

# The model configuration is kept so that generic "LABEL_n" outputs can be
# translated through its id2label table.
config = AutoConfig.from_pretrained(MODEL_ID)

# Device index 0 selects the first CUDA device; -1 runs the pipeline on CPU.
perangkat = 0 if torch.cuda.is_available() else -1

# Inputs are truncated to 256 tokens and only the top label is returned.
# NOTE(review): `return_all_scores` is reported as deprecated in recent
# transformers releases in favour of `top_k` -- confirm before upgrading.
pengaturan_pipeline = {
    "task": "text-classification",
    "model": MODEL_ID,
    "tokenizer": MODEL_ID,
    "device": perangkat,
    "truncation": True,
    "max_length": 256,
    "return_all_scores": False,
}
clf = pipeline(**pengaturan_pipeline)
|
|
def normalize_label(lbl: str, id2label=None) -> str:
    """Map a classifier label to ``"positif"``, ``"negatif"`` or ``"netral"``.

    Args:
        lbl: Raw label from the pipeline, e.g. ``"Positive"`` or ``"LABEL_2"``.
        id2label: Optional index -> label-name mapping used to resolve generic
            ``LABEL_n`` outputs. When omitted, the module-level
            ``config.id2label`` is used.

    Returns:
        The normalised Indonesian label. A ``LABEL_n`` value whose index cannot
        be parsed or is missing from the mapping yields ``"netral"``. Any other
        unrecognised label is returned lower-cased.
    """
    aliases = {
        "positif": "positif",
        "positive": "positif",
        "negatif": "negatif",
        "negative": "negatif",
        "netral": "netral",
        "neutral": "netral",
    }

    lowered = lbl.lower()
    if lowered in aliases:
        return aliases[lowered]

    if "label_" in lowered:
        try:
            # Parse the index first so a malformed label never touches the
            # mapping (or the global config) at all.
            idx = int(lowered.split("_")[-1])
            mapping = config.id2label if id2label is None else id2label
            resolved = str(mapping[idx]).lower()
        except (ValueError, KeyError, IndexError):
            # Best effort: an unresolvable generic label counts as neutral.
            return "netral"
        # The resolved name may be English ("positive"/"neutral"); map it to
        # the same vocabulary as direct labels so downstream comparisons
        # against "netral" behave consistently.
        return aliases.get(resolved, resolved)

    return lowered
|
|
| |
# Caption sentiment: missing captions are sent to the model as empty strings.
kolom_caption = df_gabungan['caption'].fillna("").astype(str)
texts_caption = kolom_caption.tolist()
preds_caption = clf(texts_caption, batch_size=64)
df_gabungan['sentimen_caption'] = [
    normalize_label(hasil['label']) for hasil in preds_caption
]

# Comment sentiment: same procedure on the comments column.
kolom_comments = df_gabungan['comments'].fillna("").astype(str)
texts_comments = kolom_comments.tolist()
preds_comments = clf(texts_comments, batch_size=64)
df_gabungan['sentimen_comments'] = [
    normalize_label(hasil['label']) for hasil in preds_comments
]
|
|
| |
| |
def combine_sentiment(row):
    """Pick the overall sentiment for one row.

    The caption sentiment wins unless it is ``"netral"``; only then is the
    comment sentiment looked up and returned.
    """
    caption_label = row['sentimen_caption']
    return caption_label if caption_label != "netral" else row['sentimen_comments']
# Row-wise merge of the caption and comment sentiment into a single label.
sentimen_per_baris = df_gabungan.apply(combine_sentiment, axis=1)
df_gabungan['sentimen'] = sentimen_per_baris

# Write the labelled social-media dataset to the current working directory.
df_gabungan.to_csv('medsos2.csv', index=False)
|
|
|
|
| |
# News sentiment: classify each article body (missing bodies become "").
kolom_isi = df_berita['isi_berita'].fillna("").astype(str)
texts_b = kolom_isi.tolist()
preds_b = clf(texts_b, batch_size=64)
df_berita['sentimen'] = [normalize_label(hasil['label']) for hasil in preds_b]

# Write the labelled news dataset to the current working directory.
df_berita.to_csv('berita2.csv', index=False)