| """ |
| facebook.py — Facebook group scraper using Selenium. |
| Exports: scrape_facebook(username, password, groups) -> list[dict] |
| |
| Returns structured data per-post: |
| group_name, group_url, post_url, author, caption, comments |
| """ |
| from __future__ import annotations |
|
|
| import json |
| import os |
| import time |
|
|
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.support.ui import WebDriverWait |
| from selenium.webdriver.support import expected_conditions as EC |
|
|
| from ._driver import _create_driver |
|
|
| COOKIES_FILE = "fb_cookies.json" |
| FB_BASE = "https://www.facebook.com" |
| MOBILE_FB = "https://m.facebook.com" |
|
|
|
|
| |
|
|
def _save_cookies(driver, path: str) -> None:
    """Persist the driver's current cookies to *path* as JSON (best effort).

    Args:
        driver: Object exposing ``get_cookies()`` (a Selenium WebDriver).
        path: Destination file for the serialized cookie list.

    Failures are reported on stdout and never raised, so a cookie-saving
    problem cannot abort a scrape that is otherwise working.
    """
    try:
        # Fetch and serialize BEFORE opening the file: mode "w" truncates on
        # open, so a failing driver call or an unserializable cookie must not
        # wipe a previously saved, still-usable session file.
        payload = json.dumps(driver.get_cookies())
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        print(f"[Facebook] Gagal simpan cookies: {e}")
|
|
|
|
def _load_cookies(driver, path: str) -> bool:
    """Feed the cookies stored in the JSON file at *path* into *driver*.

    Returns False when the file is missing, empty, or cannot be read/parsed
    (the latter is also reported on stdout). Returns True once the file has
    been parsed and every entry has been offered to ``driver.add_cookie``;
    entries the driver rejects are skipped silently.
    """
    has_content = os.path.exists(path) and os.path.getsize(path) > 0
    if not has_content:
        return False

    try:
        with open(path, "r") as handle:
            stored = json.load(handle)
        for entry in stored:
            try:
                driver.add_cookie(entry)
            except Exception:
                # A single rejected cookie must not stop the remaining ones.
                continue
    except Exception as err:
        print(f"[Facebook] Gagal load cookies: {err}")
        return False

    return True
|
|
|
|
| |
|
|
def _is_logged_in(driver) -> bool:
    """Return True when the current page looks like an authenticated session.

    The URL must not mention "login" or "checkpoint", and the page must not
    contain the password field that the login forms used by ``_fb_login``
    expose (``name="pass"`` on mobile, ``id="pass"`` on desktop). The second
    check avoids treating a login form served at a neutral URL as success.
    """
    url = driver.current_url or ""
    if "login" in url or "checkpoint" in url:
        return False
    return not driver.find_elements(By.CSS_SELECTOR, 'input[name="pass"], #pass')


def _fb_login(driver, username: str, password: str) -> bool:
    """Log *driver* into Facebook, preferring saved cookies over credentials.

    Flow:
      1. Open the mobile site and, if ``COOKIES_FILE`` exists, load it and
         refresh; return True when the session is accepted.
      2. Otherwise fill in the mobile login form.
      3. If the mobile form cannot be driven, retry on the desktop
         ``/login.php`` page.
      4. After a fixed wait, verify the session and save cookies on success.

    Args:
        driver: Selenium WebDriver instance.
        username: Account e-mail / phone typed into the login form.
        password: Account password typed into the login form.

    Returns:
        True when the session looks authenticated, False otherwise. Failures
        are reported on stdout; no exception is raised for a failed login.
    """
    wait = WebDriverWait(driver, 20)
    driver.get(MOBILE_FB)
    time.sleep(3)

    if os.path.exists(COOKIES_FILE):
        try:
            _load_cookies(driver, COOKIES_FILE)
            driver.refresh()
            time.sleep(4)
            if _is_logged_in(driver):
                print("[Facebook] Login via cookies berhasil.")
                return True
            # Stale cookies: drop them so they cannot interfere with the
            # credential login below, then start from a clean page.
            driver.delete_all_cookies()
            driver.get(MOBILE_FB)
            time.sleep(2)
        except Exception as e:
            print(f"[Facebook] Login via cookies gagal: {e}")

    print("[Facebook] Login manual username/password...")
    try:
        email_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="email"]'))
        )
        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')
        email_input.clear()
        email_input.send_keys(username)
        pass_input.clear()
        pass_input.send_keys(password)
        pass_input.send_keys("\n")  # Enter normally submits the form
        time.sleep(1)

        try:
            # Best-effort second submit in case Enter did nothing. Once the
            # page has navigated the button is gone, so a failure here is
            # expected and intentionally ignored.
            login_btn = driver.find_element(
                By.CSS_SELECTOR,
                'button[name="login"], [data-sigil="m_login_button"], input[type="submit"]',
            )
            driver.execute_script("arguments[0].click();", login_btn)
        except Exception:
            pass
    except Exception as e:
        print(f"[Facebook] Login mobile gagal, coba halaman desktop: {e}")
        try:
            driver.get(f"{FB_BASE}/login.php")
            time.sleep(3)
            email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
            pass_input = driver.find_element(By.ID, "pass")
            email_input.clear()
            email_input.send_keys(username)
            pass_input.clear()
            pass_input.send_keys(password)
            driver.find_element(By.NAME, "login").click()
        except Exception as e2:
            print(f"[Facebook] Login desktop gagal: {e2}")
            return False

    time.sleep(6)  # give the post-submit redirects time to settle
    try:
        if not _is_logged_in(driver):
            print("[Facebook] Login gagal: masih di halaman login/checkpoint.")
            return False
    except Exception as e:
        print(f"[Facebook] Gagal cek status login: {e}")
        return False

    _save_cookies(driver, COOKIES_FILE)
    return True
|
|
|
|
def ensure_logged_in(driver, username, password):
    """Re-run the login flow if the current page shows signs of a logged-out session.

    Three signals are checked in order, stopping at the first that fires:
      1. the current URL contains "login";
      2. a visible element whose text contains "See more on Facebook";
      3. a visible ``<input>`` of type "email" or "text".

    Args:
        driver: Selenium WebDriver instance positioned on the page to inspect.
        username: Credential forwarded to ``_fb_login``.
        password: Credential forwarded to ``_fb_login``.

    Returns:
        None. The outcome of the re-login is not reported to the caller, and
        WebDriver errors are logged instead of raised.
    """
    try:
        url = driver.current_url
        if url and "login" in url:
            _fb_login(driver, username, password)
            return
    except Exception as e:
        # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still work.
        print(f"[Facebook] Gagal cek status login: {e}")
        return

    logged_out_markers = (
        '//div[contains(text(),"See more on Facebook")]',
        '//input[@type="email" or @type="text"]',
    )
    for xpath in logged_out_markers:
        try:
            # find_elements returns [] when nothing matches, so "not found"
            # is ordinary control flow rather than a swallowed exception.
            matches = driver.find_elements(By.XPATH, xpath)
            if matches and matches[0].is_displayed():
                _fb_login(driver, username, password)
                return
        except Exception as e:
            # e.g. the element went stale mid-check; try the next marker.
            print(f"[Facebook] Gagal cek status login: {e}")
|
|
|
|
| |
|
|
def _find_permalink(post, group_url: str):
    """Return the permalink of *post* with its query string removed, or None.

    Tries an ``/posts/`` link, then a ``/permalink/`` link, then the
    ``top_level_post_id`` stored in the element's ``data-ft`` attribute.
    """
    for marker in ("/posts/", "/permalink/"):
        try:
            link_el = post.find_element(By.XPATH, f".//a[contains(@href,'{marker}')]")
            return link_el.get_attribute("href").split("?")[0]
        except Exception:
            continue  # no such link (or no href): try the next strategy

    try:
        raw = post.get_attribute("data-ft")
        if raw and "top_level_post_id" in raw:
            pid = json.loads(raw).get("top_level_post_id")
            if pid:
                return f"{group_url.rstrip('/').split('?')[0]}/posts/{pid}/"
    except Exception:
        pass  # attribute missing or not valid JSON: treated as "no permalink"
    return None


def _open_post_tab(driver, username, password, permalink: str):
    """Open *permalink* in a new tab and return its first article element, or None."""
    try:
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break or inject code.
        driver.execute_script("window.open(arguments[0], '_blank');", permalink)
        time.sleep(1)
        driver.switch_to.window(driver.window_handles[-1])
        time.sleep(3)
        ensure_logged_in(driver, username, password)
        return driver.find_element(By.XPATH, "//div[@role='article']")
    except Exception:
        return None


def _close_extra_tabs(driver, main_handle) -> None:
    """Close every window except *main_handle* and switch back to it."""
    try:
        for handle in driver.window_handles:
            if handle != main_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_handle)
    except Exception as e:
        print(f"[Facebook] Gagal tutup tab: {e}")


def _extract_author(context) -> str:
    """Return the author text found under *context*, or "Unknown"."""
    if context is None:
        return "Unknown"
    candidates = (
        ".//h2//span//span",
        ".//strong//span",
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
    )
    for xpath in candidates:
        try:
            return context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
    return "Unknown"


def _click_while_present(driver, context, xpath: str, max_clicks: int = 50) -> None:
    """Click the first element matching *xpath* under *context* until it is gone.

    The loop is capped at *max_clicks* so a button that never disappears
    cannot hang the scraper.
    """
    for _ in range(max_clicks):
        try:
            btn = context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(2)
        except Exception:
            break  # nothing left to expand (or the element went stale)


def _extract_caption_and_comments(context):
    """Return ``(caption, comments)`` read from *context*; empty values on failure."""
    caption = ""
    comments: list = []
    if context is None:
        return caption, comments

    try:
        blocks = context.find_elements(
            By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']"
        )
        texts = (b.text.strip() for b in blocks)
        caption = "\n".join(t for t in texts if t)[:2000]
    except Exception as e:
        print(f"[Facebook] Gagal baca caption: {e}")

    try:
        comment_blocks = context.find_elements(
            By.XPATH, ".//div[@aria-label='Komentar' or @aria-label='Comment']//div[@dir='auto']"
        )
        seen_c = set()
        for cb in comment_blocks:
            c = cb.text.strip()
            if c and c not in seen_c:
                seen_c.add(c)
                comments.append(c)
    except Exception as e:
        # Comments collected before the failure are kept.
        print(f"[Facebook] Gagal baca komentar: {e}")

    return caption, comments


def _scrape_post(driver, username, password, post, group_url, group_name, seen_permalinks):
    """Scrape one feed article; return its record dict, or None to skip it."""
    driver.execute_script("arguments[0].scrollIntoView(true);", post)
    time.sleep(1)

    permalink = _find_permalink(post, group_url)
    if permalink:
        if permalink in seen_permalinks:
            return None  # same post reached through another article element
        seen_permalinks.add(permalink)
        post_context = _open_post_tab(driver, username, password, permalink)
    else:
        # Without a permalink there is no dedicated page to open. Read the
        # article in place rather than opening the group feed, whose first
        # article would belong to a different post.
        permalink = group_url
        post_context = post

    author = _extract_author(post_context)

    if post_context is not None:
        _click_while_present(
            driver,
            post_context,
            ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]",
        )
        _click_while_present(
            driver,
            post_context,
            ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]",
        )

    caption, comments = _extract_caption_and_comments(post_context)
    if not (caption or comments):
        return None

    return {
        "group_name": group_name,
        "group_url": group_url,
        "post_url": permalink,
        "author": author,
        "caption": caption,
        "comments": comments,
    }


def _scrape_group(driver, username, password, group_url: str, max_scrolls: int = 5) -> list:
    """Scrape posts from a single Facebook group URL.

    The group page is scrolled up to *max_scrolls* times. After each scroll,
    every ``role="article"`` element not handled before is scraped: posts
    with a permalink are opened in a temporary tab, others are read in place.
    Each post is recorded at most once per call.

    Args:
        driver: Selenium WebDriver instance.
        username: Credential forwarded to ``ensure_logged_in``.
        password: Credential forwarded to ``ensure_logged_in``.
        group_url: Group URL; ``m.`` and ``web.`` hosts are rewritten to ``www.``.
        max_scrolls: Maximum number of scroll-and-collect rounds.

    Returns:
        A list of dicts with keys ``group_name``, ``group_url``, ``post_url``,
        ``author``, ``caption`` and ``comments``. Only posts with a caption
        or at least one comment are included. The list is empty when the
        group page cannot be opened.
    """
    posts: list = []

    group_url = group_url.replace("m.facebook.com", "www.facebook.com").replace("web.facebook.com", "www.facebook.com")
    print(f"[Facebook] Scraping grup: {group_url}")

    try:
        driver.get(group_url)
        time.sleep(6)
        ensure_logged_in(driver, username, password)
        # Remember the feed window so cleanup can always return to it.
        main_handle = driver.current_window_handle
        last_height = driver.execute_script("return document.body.scrollHeight")
    except Exception as e:
        print(f"[Facebook] Gagal buka grup: {e}")
        return posts

    # Last path segment of the URL (ignoring one trailing slash); loop-invariant.
    url_parts = group_url.split("/")
    group_name = url_parts[-2] if group_url.endswith("/") else url_parts[-1]

    # Each scroll returns all articles currently in the DOM, including those
    # handled in earlier rounds. WebElements are hashable and compare equal
    # when they refer to the same DOM node, so a set filters repeats.
    seen_elements = set()
    seen_permalinks = set()

    for scroll_n in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        ensure_logged_in(driver, username, password)

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"[Facebook] Scroll {scroll_n + 1} β {len(post_elements)} artikel ditemukan")

        for post in post_elements:
            if post in seen_elements:
                continue
            seen_elements.add(post)

            record = None
            try:
                record = _scrape_post(
                    driver, username, password, post, group_url, group_name, seen_permalinks
                )
            except Exception as e:
                print(f"[Facebook] Error baca post: {e}")
            finally:
                # Runs on success and failure alike, so a stray tab never
                # leaves the driver pointed at the wrong window.
                _close_extra_tabs(driver, main_handle)

            if record is not None:
                posts.append(record)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # page did not grow: nothing more to load
        last_height = new_height

    return posts
|
|
|
|
| |
|
|
def scrape_facebook(username: str, password: str, groups: list | None = None) -> list:
    """Log in to Facebook and collect posts from every group URL in *groups*.

    Returns an empty list, without starting a browser, when credentials or
    group URLs are missing. Also returns an empty list when login fails.
    Otherwise returns the concatenated results of ``_scrape_group`` for each
    non-blank URL. The driver is always quit before returning.
    """
    if not (username and password):
        print("[Facebook] Username/password tidak disediakan.")
        return []

    if not groups:
        print("[Facebook] Tidak ada URL grup yang disediakan β skip.")
        return []

    browser = _create_driver(mobile=False)
    collected: list = []

    try:
        logged_in = _fb_login(browser, username, password)
        if not logged_in:
            return []

        # Lazy generator: URLs are cleaned one at a time, in scraping order.
        cleaned_urls = (raw.strip() for raw in groups if raw and raw.strip())
        for url in cleaned_urls:
            collected += _scrape_group(browser, username, password, url)

    except Exception as err:
        print(f"[Facebook] Fatal error: {err}")
    finally:
        try:
            browser.quit()
        except Exception:
            # Shutdown problems are deliberately ignored.
            pass

    print(f"[Facebook] Total article posts dari Facebook: {len(collected)}")
    return collected