"""
facebook.py – Facebook group scraper using Selenium.

Exports:
    scrape_facebook(username, password, groups) -> list[dict]

Returns structured data per-post:
    group_name, group_url, post_url, author, caption, comments
"""
from __future__ import annotations

import json
import os
import time
from urllib.parse import urlsplit

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from ._driver import _create_driver

COOKIES_FILE = "fb_cookies.json"
FB_BASE = "https://www.facebook.com"
MOBILE_FB = "https://m.facebook.com"

# Upper bound on "view more comments / replies" clicks per post, so a button
# that never disappears after being clicked cannot stall the scraper forever.
_MAX_EXPAND_CLICKS = 30

# Elements whose visibility is treated as "Facebook is asking us to log in".
# NOTE(review): the second selector matches any visible text/e-mail input;
# confirm it cannot fire on pages shown to a logged-in user.
_LOGIN_PROMPT_XPATHS = (
    '//div[contains(text(),"See more on Facebook")]',
    '//input[@type="email" or @type="text"]',
)

# Author-name candidates, tried in order until one yields non-empty text.
_AUTHOR_XPATHS = (
    ".//h2//span//span",
    ".//strong//span",
    (
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs "
        "x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]"
    ),
)
_MORE_COMMENTS_XPATH = (
    ".//span[contains(text(),'Lihat komentar') "
    "or contains(text(),'View more comments')]"
)
_MORE_REPLIES_XPATH = (
    ".//span[contains(text(),'Lihat') and contains(text(),'balasan')]"
    " | .//span[contains(text(),'View') and contains(text(),'replies')]"
)
_CAPTION_XPATH = (
    ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']"
)
_COMMENT_XPATH = (
    ".//div[@aria-label='Komentar' or @aria-label='Comment']"
    "//div[@dir='auto']"
)


# ── Cookie helpers ─────────────────────────────────────────────────────────────
def _save_cookies(driver, path: str) -> None:
    """Write the driver's current cookies to *path* as JSON (best effort).

    Failures are reported on stdout and otherwise ignored.
    """
    try:
        # Fetch first: opening the file in "w" mode truncates it, so a driver
        # error must not be able to wipe a previously saved session.
        cookies = driver.get_cookies()
        with open(path, "w", encoding="utf-8") as f:
            json.dump(cookies, f)
    except Exception as e:
        print(f"[Facebook] Gagal simpan cookies: {e}")


def _load_cookies(driver, path: str) -> bool:
    """Add the cookies stored in *path* to the driver's current domain.

    Returns:
        True when at least one cookie was accepted by the driver; False when
        the file is missing, empty, unreadable, not a JSON list, or every
        cookie was rejected.
    """
    try:
        if not os.path.exists(path) or os.path.getsize(path) == 0:
            return False
        with open(path, "r", encoding="utf-8") as f:
            cookies = json.load(f)
    except (OSError, ValueError) as e:
        print(f"[Facebook] Gagal load cookies: {e}")
        return False

    if not isinstance(cookies, list):
        print("[Facebook] Gagal load cookies: format file tidak valid")
        return False

    accepted = 0
    for cookie in cookies:
        try:
            driver.add_cookie(cookie)
            accepted += 1
        except Exception:
            # A single rejected cookie must not abort the whole restore.
            continue
    return accepted > 0


# ── Login ──────────────────────────────────────────────────────────────────────
def _is_logged_in(driver) -> bool:
    """Heuristically decide whether the current page belongs to a session.

    The page counts as logged in when its URL contains neither "login" nor
    "checkpoint" and no password field (``input[name="pass"]``) is present.
    """
    url = driver.current_url or ""
    if "login" in url or "checkpoint" in url:
        return False
    # Guard against a login form being served on a URL without "login" in it.
    return not driver.find_elements(By.CSS_SELECTOR, 'input[name="pass"]')


def _fb_login(driver, username: str, password: str) -> bool:
    """Log the driver in, preferring saved cookies over the login form.

    Navigates to ``MOBILE_FB``, tries the cookies in ``COOKIES_FILE``, then
    falls back to the mobile login form and finally to
    ``FB_BASE/login.php``. On success the session cookies are saved again.

    Returns:
        True when the session looks logged in afterwards, False otherwise.
    """
    wait = WebDriverWait(driver, 20)
    driver.get(MOBILE_FB)
    time.sleep(3)

    if _load_cookies(driver, COOKIES_FILE):
        try:
            driver.refresh()
            time.sleep(4)
            if _is_logged_in(driver):
                print("[Facebook] Login via cookies berhasil.")
                return True
            # Stale session: start the manual login from a clean slate.
            driver.delete_all_cookies()
            driver.get(MOBILE_FB)
            time.sleep(2)
        except Exception as e:
            print(f"[Facebook] Login via cookies gagal: {e}")

    print("[Facebook] Login manual username/password...")
    try:
        email_input = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input[name="email"]')
            )
        )
        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')
        email_input.clear()
        email_input.send_keys(username)
        pass_input.clear()
        pass_input.send_keys(password)
        pass_input.send_keys("\n")
        time.sleep(1)
        try:
            # Second submit attempt in case Enter did not send the form; the
            # button may already be gone, which is fine.
            login_btn = driver.find_element(
                By.CSS_SELECTOR,
                'button[name="login"], [data-sigil="m_login_button"], input[type="submit"]',
            )
            driver.execute_script("arguments[0].click();", login_btn)
        except Exception:
            pass
    except Exception:
        # Mobile form not usable: fall back to the desktop login page.
        try:
            driver.get(f"{FB_BASE}/login.php")
            time.sleep(3)
            email_input = wait.until(
                EC.presence_of_element_located((By.ID, "email"))
            )
            pass_input = driver.find_element(By.ID, "pass")
            email_input.clear()
            email_input.send_keys(username)
            pass_input.clear()
            pass_input.send_keys(password)
            driver.find_element(By.NAME, "login").click()
        except Exception as e:
            print(f"[Facebook] Form login tidak dapat diisi: {e}")
            return False

    time.sleep(6)
    if not _is_logged_in(driver):
        print("[Facebook] Login gagal (masih di halaman login/checkpoint).")
        return False
    _save_cookies(driver, COOKIES_FILE)
    return True


def ensure_logged_in(driver, username, password):
    """Re-run the login flow when the current page is asking for a login.

    A login is triggered when the URL contains "login" or when one of the
    ``_LOGIN_PROMPT_XPATHS`` elements is visible.

    Returns:
        True when ``_fb_login`` was invoked. That call navigates the current
        window to the login pages, so the caller has to re-open whatever it
        was looking at. False when no login was needed or the check failed.
    """
    attempted = False
    try:
        url = driver.current_url
        needs_login = bool(url and "login" in url)
        if not needs_login:
            for xpath in _LOGIN_PROMPT_XPATHS:
                try:
                    if driver.find_element(By.XPATH, xpath).is_displayed():
                        needs_login = True
                        break
                except Exception:
                    continue
        if needs_login:
            attempted = True
            _fb_login(driver, username, password)
    except Exception as e:
        print(f"[Facebook] Gagal cek status login: {e}")
    return attempted


# ── Scraping ───────────────────────────────────────────────────────────────────
def _group_name(group_url: str) -> str:
    """Return the last path segment of *group_url*, without query/fragment."""
    parts = urlsplit(group_url)
    name = parts.path.rstrip("/").rsplit("/", 1)[-1]
    return name or parts.netloc or group_url


def _find_permalink(post, group_url: str) -> str | None:
    """Return the post's own URL (query string removed), or None."""
    for fragment in ("/posts/", "/permalink/"):
        try:
            link = post.find_element(
                By.XPATH, f".//a[contains(@href,'{fragment}')]"
            )
            href = link.get_attribute("href")
            if href:
                return href.split("?")[0]
        except Exception:
            continue

    # Last resort: rebuild the URL from the post id in the data-ft attribute.
    try:
        data_ft = post.get_attribute("data-ft")
        if data_ft and "top_level_post_id" in data_ft:
            pid = json.loads(data_ft).get("top_level_post_id")
            if pid:
                base = group_url.split("?")[0].rstrip("/")
                return f"{base}/posts/{pid}/"
    except Exception:
        pass
    return None


def _open_in_new_tab(driver, url: str) -> bool:
    """Open *url* in a new tab and switch to it; False if no tab appeared."""
    known = set(driver.window_handles)
    # The URL is passed as a script argument rather than spliced into the
    # JavaScript source, so quotes in it cannot break or inject code.
    driver.execute_script("window.open(arguments[0], '_blank');", url)
    time.sleep(1)
    fresh = [h for h in driver.window_handles if h not in known]
    if not fresh:
        return False
    driver.switch_to.window(fresh[-1])
    return True


def _close_extra_tabs(driver, main_handle) -> None:
    """Close every window except *main_handle* and focus *main_handle*."""
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


def _click_while_present(driver, context, xpath: str,
                         limit: int = _MAX_EXPAND_CLICKS) -> None:
    """Click the element matching *xpath* until it is gone or *limit* hits."""
    for _ in range(limit):
        try:
            button = context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", button)
            time.sleep(2)
        except Exception:
            break


def _first_text(context, xpaths) -> str:
    """Return the first non-empty stripped text among *xpaths*, else ""."""
    for xpath in xpaths:
        try:
            text = context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
        if text:
            return text
    return ""


def _extract_caption(context) -> str:
    """Join the post's message blocks with newlines, capped at 2000 chars."""
    try:
        blocks = context.find_elements(By.XPATH, _CAPTION_XPATH)
        texts = [t for t in (b.text.strip() for b in blocks) if t]
        return "\n".join(texts)[:2000]
    except Exception:
        return ""


def _extract_comments(context) -> list:
    """Return the distinct non-empty comment texts in document order."""
    comments: list = []
    seen: set = set()
    try:
        for block in context.find_elements(By.XPATH, _COMMENT_XPATH):
            text = block.text.strip()
            if text and text not in seen:
                seen.add(text)
                comments.append(text)
    except Exception:
        # Keep whatever was collected before the page changed under us.
        pass
    return comments


def _scrape_post(driver, username, password, post, group_url: str,
                 group_name: str, seen_urls: set, seen_inline: set):
    """Extract one feed article; return its record dict or None to skip it.

    Posts with a permalink are opened in a new tab (left open for the caller
    to close) so comments can be expanded. Posts without one are read
    directly from the feed element. ``seen_urls`` / ``seen_inline`` are
    updated in place and used to skip posts handled on an earlier scroll.
    """
    driver.execute_script("arguments[0].scrollIntoView(true);", post)
    time.sleep(1)

    permalink = _find_permalink(post, group_url)
    if permalink:
        if permalink in seen_urls:
            return None
        seen_urls.add(permalink)

        context = None
        try:
            if _open_in_new_tab(driver, permalink):
                time.sleep(3)
                if ensure_logged_in(driver, username, password):
                    # The login flow navigated this tab away from the post.
                    driver.get(permalink)
                    time.sleep(3)
                context = driver.find_element(By.XPATH, "//div[@role='article']")
        except Exception:
            context = None
        if context is None:
            return None

        _click_while_present(driver, context, _MORE_COMMENTS_XPATH)
        _click_while_present(driver, context, _MORE_REPLIES_XPATH)
    else:
        # No link to follow: read what the feed shows instead of opening the
        # group page again (which would scrape an unrelated article).
        permalink = group_url
        context = post

    author = _first_text(context, _AUTHOR_XPATHS) or "Unknown"
    caption = _extract_caption(context)
    comments = _extract_comments(context)
    if not caption and not comments:
        return None

    if context is post:
        key = (author, caption)
        if key in seen_inline:
            return None
        seen_inline.add(key)

    return {
        "group_name": group_name,
        "group_url": group_url,
        "post_url": permalink,
        "author": author,
        "caption": caption,
        "comments": comments,
    }


def _scrape_group(driver, username, password, group_url: str, max_scrolls: int = 5) -> list:
    """Scrape posts from a single FB group URL.

    Args:
        driver: Selenium driver with an active (or restorable) session.
        username: Login used if Facebook asks to log in again.
        password: Password used if Facebook asks to log in again.
        group_url: Group URL; m./web. hosts are rewritten to www.
        max_scrolls: Maximum number of scroll-to-bottom passes over the feed.

    Returns:
        List of dicts with the keys group_name, group_url, post_url, author,
        caption and comments. Posts with neither caption nor comments are
        left out, and each post is reported once.
    """
    posts: list = []
    group_url = group_url.replace("m.facebook.com", "www.facebook.com").replace(
        "web.facebook.com", "www.facebook.com"
    )
    print(f"[Facebook] Scraping grup: {group_url}")

    try:
        driver.get(group_url)
        time.sleep(6)
        if ensure_logged_in(driver, username, password):
            # Logging in left the browser on the login pages; come back.
            driver.get(group_url)
            time.sleep(6)
        main_handle = driver.current_window_handle
    except Exception as e:
        print(f"[Facebook] Gagal buka grup: {e}")
        return posts

    group_name = _group_name(group_url)
    seen_urls: set = set()
    seen_inline: set = set()

    last_height = driver.execute_script("return document.body.scrollHeight")
    for scroll_n in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        if ensure_logged_in(driver, username, password):
            driver.get(group_url)
            time.sleep(6)

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"[Facebook] Scroll {scroll_n + 1} → {len(post_elements)} artikel ditemukan")

        for post in post_elements:
            try:
                record = _scrape_post(
                    driver, username, password, post,
                    group_url, group_name, seen_urls, seen_inline,
                )
            except Exception as e:
                print(f"[Facebook] Error baca post: {e}")
                record = None

            try:
                _close_extra_tabs(driver, main_handle)
            except Exception as e:
                # Without the main window nothing more can be read; keep
                # what has been collected so far.
                print(f"[Facebook] Gagal kembali ke tab utama: {e}")
                return posts

            if record:
                posts.append(record)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    return posts


# ── Public API
# ─────────────────────────────────────────────────────────────────


def scrape_facebook(username: str, password: str, groups: list | None = None) -> list:
    """Log in to Facebook and scrape the posts of every given group.

    Args:
        username: Value typed into the login form's e-mail field.
        password: Value typed into the login form's password field.
        groups: Group URLs to scrape. A single URL string is accepted too.
            Blank or non-string entries are ignored and a URL listed more
            than once is scraped once.

    Returns:
        The records returned by ``_scrape_group`` for each group, concatenated
        in the order the groups were given. An empty list is returned when
        credentials or group URLs are missing or the login fails. If an
        unexpected error interrupts the run, the records of the groups
        completed before it are returned.
    """
    if not username or not password:
        print("[Facebook] Username/password tidak disediakan.")
        return []

    # A bare string would otherwise be iterated character by character.
    if isinstance(groups, str):
        groups = [groups]

    # Normalise before starting a browser, so nothing is launched (and no
    # login is attempted) when there is nothing to scrape.
    group_urls = list(dict.fromkeys(
        g.strip() for g in (groups or []) if isinstance(g, str) and g.strip()
    ))
    if not group_urls:
        print("[Facebook] Tidak ada URL grup yang disediakan — skip.")
        return []

    driver = _create_driver(mobile=False)
    all_data: list = []
    try:
        if not _fb_login(driver, username, password):
            print("[Facebook] Login gagal — scraping dibatalkan.")
            return []
        for group_url in group_urls:
            all_data.extend(_scrape_group(driver, username, password, group_url))
    except Exception as e:
        print(f"[Facebook] Fatal error: {e}")
    finally:
        # Always release the browser, even on login failure or a crash.
        try:
            driver.quit()
        except Exception:
            pass

    print(f"[Facebook] Total article posts dari Facebook: {len(all_data)}")
    return all_data