# Sentiment / medos_scraping.py
# NzTama's picture
# Initial clean deploy: Sentiment Analysis
# fa8ff66
import time
import pandas as pd
import json
import os
from datetime import datetime
from json import JSONDecodeError
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
# ==============================================================================
# SELENIUM CONFIGURATION
# ==============================================================================
def setup_driver():
    """Create a Chrome WebDriver configured for scraping.

    Returns:
        The started ``webdriver.Chrome`` instance, or ``None`` when Chrome
        could not be launched (the error is printed, not raised).
    """
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    launch_flags = (
        '--disable-gpu',
        '--log-level=3',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    )
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as err:
        print(f"Error saat memulai WebDriver: {err}")
        print("Pastikan chromedriver sudah diunduh dan berada di folder yang sama.")
        return None
# ==============================================================================
# COOKIE & CAPTCHA FUNCTIONS
# ==============================================================================
def save_cookies(driver, path):
    """Write the browser session's cookies to ``path`` as indented JSON.

    Args:
        driver: Object exposing ``get_cookies()``; its return value is
            serialized as-is.
        path: Destination file; it is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as cookie_file:
        session_cookies = driver.get_cookies()
        json.dump(session_cookies, cookie_file, indent=2)
    print(f"\nCookies berhasil disimpan ke {path}")
# [FIX] This function was made more robust against empty/corrupt files
def load_cookies(driver, path):
    """Load cookies from a JSON file into the browser session.

    Args:
        driver: Object exposing ``add_cookie(cookie)``.
        path: JSON file expected to contain a list of cookies.

    Returns:
        ``True`` when every cookie in the file was added; ``False`` when the
        file is missing, empty, not valid JSON, not a list, or when adding a
        cookie raised. Failures are printed, never raised.
    """
    usable = os.path.exists(path) and os.path.getsize(path) != 0
    if not usable:
        print(f"File cookies '{path}' tidak ditemukan atau kosong.")
        return False

    try:
        with open(path, 'r', encoding='utf-8') as cookie_file:
            stored = json.load(cookie_file)
        if isinstance(stored, list):
            for entry in stored:
                driver.add_cookie(entry)
            print(f"Cookies berhasil dimuat dari {path}")
            return True
        print(f"Format data di '{path}' tidak valid (bukan list).")
        return False
    except JSONDecodeError:
        print(f"Gagal membaca '{path}' karena file rusak (JSONDecodeError).")
        return False
    except Exception as err:
        print(f"Terjadi error saat memuat cookies dari '{path}': {err}")
        return False
def establish_and_verify_session(driver, base_cookies_path, profile_cookies_path, profile_url):
    """Build a usable TikTok session, asking the user to solve CAPTCHAs when needed.

    Stage 1 opens the homepage and restores the base cookies; if they cannot
    be loaded the user is prompted, and the resulting cookies are saved.
    Stage 2 opens ``profile_url`` and tries the profile cookies; if the post
    grid does not appear within 10 seconds the user is prompted, the cookies
    are saved, and the grid is checked once more.

    Returns:
        ``True`` when a post link is present on the profile page, else ``False``.
    """
    post_locator = (By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a')
    banner = "\n" + "="*50

    def post_grid_present():
        # True when a post link shows up within 10 seconds.
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located(post_locator))
        except TimeoutException:
            return False
        return True

    # --- STAGE 1: BUILD THE BASE SESSION (HOMEPAGE) ---
    print("\n--- Tahap 1: Membangun Sesi Dasar di tiktok.com ---")
    driver.get("https://www.tiktok.com/")
    base_loaded = load_cookies(driver, base_cookies_path)
    if not base_loaded:
        print(banner)
        print("‼️ TINDAKAN AWAL DIPERLUKAN ‼️")
        input("File cookies dasar tidak valid/tidak ada. Selesaikan CAPTCHA di tiktok.com, lalu tekan [Enter]...")
        save_cookies(driver, base_cookies_path)
    driver.refresh()
    print("Sesi dasar telah dibuat/dimuat.")

    # --- STAGE 2: VERIFY THE SESSION ON THE PROFILE PAGE ---
    print("\n--- Tahap 2: Verifikasi Sesi di Halaman Profil ---")
    driver.get(profile_url)
    if load_cookies(driver, profile_cookies_path):
        print("Mencoba memvalidasi sesi dengan cookies profil...")
        driver.refresh()
        if post_grid_present():
            print("✅ Sesi profil berhasil dipulihkan.")
            return True
        print("⚠️ Cookies profil tidak valid. Diperlukan verifikasi manual.")

    # Manual fallback: stored cookies were absent or did not unlock the page.
    print(banner)
    print("‼️ VERIFIKASI SEBELUM SCRAPING ‼️")
    input("Halaman profil telah dimuat. Jika ada CAPTCHA, selesaikan sekarang. Tekan [Enter]...")
    save_cookies(driver, profile_cookies_path)
    if post_grid_present():
        print("✅ Sesi profil berhasil dibuat/diperbarui.")
        return True
    print("❌ Gagal memverifikasi halaman profil.")
    return False
# ==============================================================================
# SCRAPING HELPER FUNCTIONS (unchanged)
# ==============================================================================
def get_video_links(driver, max_videos):
    """Collect video links from the currently open profile page.

    Scrolls to the bottom of the page repeatedly until ``max_videos`` links
    have been gathered or a pass over the DOM yields no new link.

    Args:
        driver: Selenium WebDriver already showing a profile page.
        max_videos: Maximum number of links to return.

    Returns:
        A list of at most ``max_videos`` unique hrefs, in the order they were
        first seen in the DOM. Empty when no post link appears within
        15 seconds.
    """
    print(f"\n🔎 Mulai mengumpulkan link video (target: {max_videos} video)...")
    post_locator = (By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a')
    # A dict is used as an insertion-ordered set: truncating to max_videos
    # then keeps the first links seen instead of an arbitrary subset, which
    # is what slicing a plain set would give.
    video_links = {}
    try:
        # 1. Wait for the first video element to appear.
        WebDriverWait(driver, 15).until(EC.presence_of_element_located(post_locator))
        print("✅ Halaman profil berhasil dimuat.")

        # 2. Scroll and collect links.
        while len(video_links) < max_videos:
            # Remember the size before this pass to detect the end of the page.
            links_before_scroll = len(video_links)

            for elem in driver.find_elements(*post_locator):
                href = elem.get_attribute('href')
                if href:
                    video_links.setdefault(href, None)

            if len(video_links) >= max_videos:
                print(f"🎯 Target {max_videos} video tercapai ({len(video_links)} ditemukan). Berhenti scroll.")
                break

            print(f"📜 Scrolling... Ditemukan {len(video_links)}/{max_videos} video.")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Give newly requested content time to load.
            time.sleep(3)

            # 3. If this pass added nothing, the previous scroll loaded no new
            # posts: stop instead of scrolling forever.
            if len(video_links) == links_before_scroll:
                print("🏁 Halaman sudah paling bawah atau tidak ada video baru yang dimuat.")
                break
    except TimeoutException:
        print("❌ Gagal memuat halaman profil atau tidak ada video ditemukan.")
        return []

    print(f"\n👍 Selesai mengumpulkan. Total {len(video_links)} link video unik ditemukan.")
    # Never return more than max_videos.
    return list(video_links)[:max_videos]
def check_for_captcha(driver):
    """Report whether a known CAPTCHA prompt text is present on the page.

    Every iframe is searched first, then the top-level document. The driver
    is switched back to the default content before returning from the iframe
    search.

    Returns:
        ``True`` if any element contains one of the known CAPTCHA phrases,
        otherwise ``False``.
    """
    captcha_phrases = (
        "Drag the slider to fit the puzzle",
        "Drag the puzzle piece into place",
        "Geser puzzle untuk melengkapi gambar",
        "Verify to continue",
    )
    # contains(., '...') also matches text held in descendant nodes.
    conditions = [f"contains(., '{phrase}')" for phrase in captcha_phrases]
    xpath_query = "//*[" + " or ".join(conditions) + "]"

    # 1. Look inside each iframe first.
    try:
        frames = driver.find_elements(By.TAG_NAME, 'iframe')
        if frames:
            print(f"\n Mendeteksi {len(frames)} iFrame, memeriksa satu per satu untuk CAPTCHA...")
        for frame in frames:
            try:
                driver.switch_to.frame(frame)
                driver.find_element(By.XPATH, xpath_query)
            except NoSuchElementException:
                # Nothing in this frame: restore the context and try the next.
                driver.switch_to.default_content()
            else:
                print("\n⚠️ CAPTCHA terdeteksi di dalam sebuah iFrame!")
                # Restore the main document so later lookups are not frame-scoped.
                driver.switch_to.default_content()
                return True
    except Exception as err:
        print(f"\n Error saat memeriksa iFrame: {err}")
        # Make sure an unexpected error does not leave the driver in a frame.
        driver.switch_to.default_content()

    # 2. Fall back to the top-level document.
    try:
        driver.find_element(By.XPATH, xpath_query)
    except NoSuchElementException:
        return False
    print("\n⚠️ CAPTCHA terdeteksi di halaman utama!")
    return True
def _read_video_meta(driver):
    """Return ``(upload_date, like_count)`` as page text; ``"N/A"`` when not found."""
    upload_date = "N/A"
    like_count = "N/A"
    try:
        date_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
        )
        upload_date = date_element.text
    except TimeoutException:
        print("  -> Info tanggal video tidak ditemukan.")
    try:
        like_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'strong[data-e2e="like-count"]'))
        )
        like_count = like_element.text
        print(f"  -> Jumlah 'like' ditemukan: {like_count}")
    except TimeoutException:
        print("  -> Info jumlah 'like' tidak ditemukan.")
    return upload_date, like_count


def _read_caption(driver, video_data):
    """Fill ``caption_short``/``caption_detail``; return False if the description block is absent."""
    try:
        desc_container = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
        )
    except TimeoutException:
        # Without even the description container the page is probably blocked.
        print("  -> Bagian deskripsi/caption tidak ditemukan, kemungkinan halaman terhalang.")
        return False

    try:
        video_data['caption_short'] = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]').text
    except NoSuchElementException:
        print("  -> Video ini tidak memiliki caption.")
        return True
    print(f"  -> Caption singkat ditemukan: {video_data['caption_short'][:50]}...")

    # The expanded caption is only looked for when a short caption exists.
    try:
        more_button = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
        driver.execute_script("arguments[0].click();", more_button)
        print("  -> Tombol 'more' (ikon) pada caption diklik.")
        time.sleep(2)
        detail_container = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
        )
        desc_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-desc']").text
        keywords_text = ""
        try:
            keywords_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-keywords']").text
        except NoSuchElementException:
            pass  # keywords are optional
        video_data['caption_detail'] = f"Deskripsi: {desc_text}\nKeywords: {keywords_text}".strip()
        print(f"  -> Caption detail ditemukan: {video_data['caption_detail'][:50]}...")
    except (NoSuchElementException, TimeoutException):
        print("  -> Tidak ada tombol 'more' untuk caption detail.")
    return True


def _expand_comment_threads(driver):
    """Click reply expanders and scroll until five scrolls in a row add no comment items."""
    print("  -> Memulai proses scroll dan klik balasan secara dinamis...")
    # NOTE(review): XPath `and` binds tighter than `or`, so this matches any
    # span containing 'balasan' or 'replies' (not only "View ..." buttons).
    # Confirm whether `View and (reply or replies)` was intended.
    reply_button_xpath = "//span[contains(text(), 'balasan') or (contains(text(), 'View') and contains(text(), 'reply') or contains(text(), 'replies'))]"
    item_xpath = '//div[contains(@class, "DivCommentItemWrapper")]'
    last_comment_count = 0
    stalled_attempts = 0
    max_stalled_attempts = 5
    # Ids of matched elements whose click loaded nothing. Without this, a
    # matched span that stays in the DOM (e.g. ordinary comment text
    # containing one of the keywords) would be clicked forever.
    unresponsive_ids = set()

    while stalled_attempts < max_stalled_attempts:
        try:
            view_buttons = [
                button
                for button in driver.find_elements(By.XPATH, reply_button_xpath)
                if button.id not in unresponsive_ids
            ]
            if view_buttons:
                print(f"    -> Menemukan {len(view_buttons)} tombol balasan. Mengklik satu...")
                target = view_buttons[0]
                count_before_click = len(driver.find_elements(By.XPATH, item_xpath))
                driver.execute_script("arguments[0].click();", target)
                time.sleep(2)
                if len(driver.find_elements(By.XPATH, item_xpath)) <= count_before_click:
                    unresponsive_ids.add(target.id)
                stalled_attempts = 0
                continue
        except Exception as e:
            print(f"    -> Error minor saat mengklik tombol balasan: {e}")

        print("    -> Tidak ada tombol balasan terlihat. Melakukan scroll...")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        current_comment_count = len(driver.find_elements(By.XPATH, item_xpath))
        if current_comment_count > last_comment_count:
            print(f"    -> Konten baru dimuat. Total item sekarang: {current_comment_count}")
            last_comment_count = current_comment_count
            stalled_attempts = 0
        else:
            stalled_attempts += 1
            print(f"    -> Konten tidak bertambah, percobaan ke-{stalled_attempts}/{max_stalled_attempts}.")
    print("  -> Scroll dan klik selesai. Memulai ekstraksi final...")


def _collect_comments(driver, video_data):
    """Append comments to ``video_data['comments']``, nesting each reply under the comment before it."""
    item_xpath = '//div[contains(@class, "DivCommentItemWrapper")]'
    comment_item_count = len(driver.find_elements(By.XPATH, item_xpath))
    print(f"  -> Ditemukan total {comment_item_count} item komentar. Memproses satu per satu...")
    for i in range(comment_item_count):
        try:
            # Items are looked up again on every pass and addressed by index,
            # as in the original code; the IndexError branch below covers the
            # list shrinking in between.
            item = driver.find_elements(By.XPATH, item_xpath)[i]

            # Top-level comment?
            try:
                author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
                comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]')
                video_data['comments'].append({
                    'author': author_element.text,
                    'comment': comment_element.text,
                    'replies': []
                })
                continue
            except NoSuchElementException:
                pass

            # Otherwise try it as a reply to the most recent top-level comment.
            try:
                reply_author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
                reply_comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]')
                if video_data['comments']:
                    video_data['comments'][-1]['replies'].append({
                        'author': reply_author_element.text,
                        'comment': reply_comment_element.text
                    })
            except NoSuchElementException:
                pass
        except IndexError:
            print(f"    -> Peringatan: Jumlah komentar berubah saat proses. Melewatkan indeks ke-{i}.")
            break
        except Exception as e:
            print(f"    -> Terjadi error pada item ke-{i}, melewati. Error: {e}")
    print("  -> Selesai. Berhasil memproses dan mengelompokkan komentar.")


def _wait_for_captcha_solve(driver, attempt, video_url):
    """Ask the user to solve the CAPTCHA, then refresh the page for the next attempt."""
    print("\n" + "="*50)
    print(f"⚠️ CAPTCHA terdeteksi pada percobaan ke-{attempt + 1} untuk video: {video_url}")
    input("   Silakan selesaikan CAPTCHA di browser, lalu tekan [Enter] untuk mencoba lagi...")
    driver.refresh()
    print("   Mencoba lagi...")


def scrape_video_details(driver, video_url):
    """Scrape date, like count, caption and all comments of one video.

    The page is loaded once and processed up to two times. An attempt is
    repeated, after prompting the user, when a CAPTCHA is detected either
    after a timeout or because the description block is missing.

    Args:
        driver: Selenium WebDriver.
        video_url: URL of the video page.

    Returns:
        A dict with keys ``url``, ``upload_date``, ``like_count``,
        ``caption_short``, ``caption_detail`` and ``comments`` (a list of
        ``{'author', 'comment', 'replies'}`` dicts), or ``None`` when the
        video is skipped.
    """
    print(f"\n--- Memproses video: {video_url} ---")
    driver.get(video_url)
    max_retries = 2
    for attempt in range(max_retries):
        try:
            upload_date, like_count = _read_video_meta(driver)
            video_data = {'url': video_url, 'upload_date': upload_date, 'like_count': like_count, 'caption_short': '', 'caption_detail': '', 'comments': []}

            # Every wait handles its own timeout, so the handler below almost
            # never fires on its own. A missing description block is the
            # signal that the page may be blocked: check for a CAPTCHA here
            # so the retry path is actually reachable.
            if not _read_caption(driver, video_data) and check_for_captcha(driver):
                _wait_for_captcha_solve(driver, attempt, video_url)
                continue

            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
                )
                print("  -> Bagian komentar ditemukan. Memuat seluruh komentar...")
            except TimeoutException:
                print("  -> Bagian komentar tidak ditemukan.")
                return video_data

            try:
                _expand_comment_threads(driver)
                _collect_comments(driver, video_data)
            except Exception as e:
                # Best effort: keep whatever was collected so far.
                print(f"  -> Gagal pada proses utama karena: {e}")
            return video_data
        except TimeoutException:
            print("  -> Gagal memuat elemen halaman (Timeout).")
            if check_for_captcha(driver):
                _wait_for_captcha_solve(driver, attempt, video_url)
                continue
            print("  -> Tidak ada CAPTCHA. Melewati video ini.")
            return None
    print(f"  -> Gagal memproses video setelah {max_retries} kali percobaan. Melewati video ini.")
    return None
# ==============================================================================
# MAIN EXECUTION (unchanged)
# ==============================================================================
if __name__ == "__main__":
    # Script entry point: verify the session once, scrape every listed
    # profile, then write the results to timestamped CSV and JSON files.
    PROFILE_USERNAMES = ["rctvcirebon", "cirebonkabtv", "kang_jigus", "kangimron_", "info.cirebonan"]
    MAX_VIDEOS_PER_PROFILE = 200
    BASE_COOKIES_FILE = "tiktok_base_cookies.json"
    PROFILE_COOKIES_FILE = "tiktok_profile_cookies.json"
    all_data = []
    driver = setup_driver()
    if driver:
        try:
            if not PROFILE_USERNAMES:
                print("Daftar PROFILE_USERNAMES kosong.")
            else:
                # The session is verified against the first profile only.
                first_profile_url = f"https://www.tiktok.com/@{PROFILE_USERNAMES[0]}"
                session_ok = establish_and_verify_session(driver, BASE_COOKIES_FILE, PROFILE_COOKIES_FILE, first_profile_url)
                if session_ok:
                    for username in PROFILE_USERNAMES:
                        print("\n" + "="*70)
                        print(f"MEMULAI SCRAPING UNTUK PROFIL: @{username}")
                        print("="*70)
                        profile_url = f"https://www.tiktok.com/@{username}"
                        driver.get(profile_url)
                        video_urls = get_video_links(driver, MAX_VIDEOS_PER_PROFILE)
                        for url in video_urls:
                            data = scrape_video_details(driver, url)
                            if data:
                                data['profile_username'] = username
                                data['scrape_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                all_data.append(data)
                            time.sleep(2)
        except Exception as e:
            print(f"\nTerjadi kesalahan fatal selama proses: {e}")
        finally:
            # Save in `finally` so a fatal error or Ctrl+C part-way through a
            # long run does not throw away everything collected so far.
            try:
                if all_data:
                    print("\nMenyimpan semua data yang terkumpul...")
                    df = pd.DataFrame(all_data)
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    output_filename = f"tiktok_data_multi_{timestamp}"
                    df.to_csv(f'{output_filename}.csv', index=False, encoding='utf-8-sig')
                    print(f"Data telah disimpan ke {output_filename}.csv")
                    with open(f'{output_filename}.json', 'w', encoding='utf-8') as f:
                        json.dump(all_data, f, ensure_ascii=False, indent=4)
                    print(f"Data telah disimpan ke {output_filename}.json")
                else:
                    print("\nTidak ada data yang berhasil dikumpulkan untuk disimpan.")
            except Exception as e:
                print(f"\nTerjadi kesalahan fatal selama proses: {e}")
            finally:
                # The browser is closed even when saving fails.
                print("\n--- PROSES SELESAI ---")
                driver.quit()