# Sentiment/services/facebook.py
# NzTama – Initial clean deploy: Sentiment Analysis (fa8ff66)
"""
facebook.py – Facebook group scraper using Selenium.
Exports: scrape_facebook(username, password, groups) -> list[dict]
Returns structured data per-post:
group_name, group_url, post_url, author, caption, comments
"""
from __future__ import annotations
import json
import os
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ._driver import _create_driver
# Path (relative to the current working directory) of the JSON file in which
# _fb_login stores the session cookies and from which it restores them.
COOKIES_FILE = "fb_cookies.json"
# Desktop site root; _fb_login falls back to its /login.php form.
FB_BASE = "https://www.facebook.com"
# Mobile site root; _fb_login opens it first to restore cookies and log in.
MOBILE_FB = "https://m.facebook.com"
# ── Cookie helpers ─────────────────────────────────────────────────────────────
def _save_cookies(driver, path: str) -> None:
    """Persist the driver's current cookies to *path* as JSON (best effort).

    The cookies are fetched and serialised into a temporary sibling file
    (``<path>.tmp``) which is then renamed over *path*.  An existing cookie
    file is therefore only replaced once the new one has been written
    completely; if fetching or serialising fails, the previous file is left
    untouched instead of being truncated.

    Args:
        driver: Object exposing ``get_cookies()`` (a Selenium WebDriver).
        path: Destination file path.

    Failures are reported on stdout and never raised.
    """
    tmp_path = f"{path}.tmp"
    try:
        # Read the cookies before touching any file: opening the target in
        # "w" mode first would wipe it even when get_cookies() raises.
        cookies = driver.get_cookies()
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cookies, f)
        os.replace(tmp_path, path)
    except Exception as e:
        print(f"[Facebook] Gagal simpan cookies: {e}")
        # Do not leave a half-written temporary file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
def _load_cookies(driver, path: str) -> bool:
    """Load cookies stored as a JSON list in *path* into *driver*.

    Args:
        driver: Object exposing ``add_cookie(cookie)`` (a Selenium WebDriver).
        path: File previously written by ``_save_cookies``.

    Returns:
        True when the file was read and parsed as a JSON list; every entry
        is then offered to ``driver.add_cookie`` and entries the driver
        rejects are skipped.  False when the file is missing, empty,
        unreadable, not valid JSON, or does not contain a list.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw = f.read()
    except FileNotFoundError:
        # No saved session yet: not an error, so stay quiet.
        return False
    except (OSError, ValueError) as e:
        # ValueError covers undecodable bytes (UnicodeDecodeError).
        print(f"[Facebook] Gagal load cookies: {e}")
        return False

    if not raw:
        return False

    try:
        cookies = json.loads(raw)
    except ValueError as e:
        print(f"[Facebook] Gagal load cookies: {e}")
        return False

    # Iterating a dict or string would feed keys/characters to add_cookie
    # and still report success, so insist on the list shape.
    if not isinstance(cookies, list):
        print(
            "[Facebook] Gagal load cookies: expected a JSON list, "
            f"got {type(cookies).__name__}"
        )
        return False

    for cookie in cookies:
        try:
            driver.add_cookie(cookie)
        except Exception:
            # Best effort: one rejected cookie must not block the others.
            continue
    return True
# ── Login ──────────────────────────────────────────────────────────────────────
def _fb_login(driver, username: str, password: str) -> bool:
    """Log *driver* into Facebook, trying saved cookies before the form.

    Steps:
      1. Open the mobile site and, if cookies could be loaded from
         ``COOKIES_FILE``, refresh and accept the session when the URL
         contains neither "login" nor "checkpoint".
      2. Otherwise fill the mobile login form (submitting with Enter and,
         additionally, a scripted click on the login button).
      3. If the mobile form cannot be used, fall back to
         ``FB_BASE/login.php``.

    Args:
        driver: Selenium WebDriver to log in.
        username: Account e-mail / user name typed into the form.
        password: Account password typed into the form (never logged).

    Returns:
        True when, after the flow, the current URL contains neither
        "login" nor "checkpoint" (cookies are then saved); False otherwise.
    """
    wait = WebDriverWait(driver, 20)
    driver.get(MOBILE_FB)
    time.sleep(3)

    # Only judge the cookie session when cookies were actually loaded;
    # otherwise an empty or corrupt cookie file could be mistaken for a
    # successful cookie login.
    try:
        if _load_cookies(driver, COOKIES_FILE):
            driver.refresh()
            time.sleep(4)
            current = driver.current_url
            if "login" not in current and "checkpoint" not in current:
                print("[Facebook] Login via cookies berhasil.")
                return True
            # Stale session: start the form login from a clean slate.
            driver.delete_all_cookies()
            driver.get(MOBILE_FB)
            time.sleep(2)
    except Exception as e:
        print(f"[Facebook] Login via cookies gagal: {e}")

    print("[Facebook] Login manual username/password...")
    try:
        email_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="email"]'))
        )
        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')
        email_input.clear()
        email_input.send_keys(username)
        pass_input.clear()
        pass_input.send_keys(password)
        pass_input.send_keys("\n")
        time.sleep(1)
        try:
            login_btn = driver.find_element(
                By.CSS_SELECTOR,
                'button[name="login"], [data-sigil="m_login_button"], input[type="submit"]',
            )
            driver.execute_script("arguments[0].click();", login_btn)
        except Exception:
            # Best effort: presumably the Enter key above already submitted
            # the form, in which case the button is gone.
            pass
    except Exception as e:
        print(f"[Facebook] Form login mobile gagal: {e}")
        try:
            driver.get(f"{FB_BASE}/login.php")
            time.sleep(3)
            email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
            pass_input = driver.find_element(By.ID, "pass")
            email_input.clear()
            email_input.send_keys(username)
            pass_input.clear()
            pass_input.send_keys(password)
            driver.find_element(By.NAME, "login").click()
        except Exception as e2:
            print(f"[Facebook] Login gagal: {e2}")
            return False

    time.sleep(6)
    current = driver.current_url
    if "login" in current or "checkpoint" in current:
        print("[Facebook] Login gagal: masih di halaman login/checkpoint.")
        return False
    _save_cookies(driver, COOKIES_FILE)
    return True
def ensure_logged_in(driver, username, password):
    """Run ``_fb_login`` again when the current page looks like a login wall.

    A login wall is assumed when any of these holds, checked in order:
      * the current URL contains "login";
      * a visible element with the text "See more on Facebook" exists;
      * a visible ``<input>`` of type "email" or "text" exists.

    The result of ``_fb_login`` is not inspected and nothing is returned.
    Ordinary errors are reported on stdout and swallowed so that scraping
    can continue; ``KeyboardInterrupt``/``SystemExit`` propagate.

    NOTE(review): the generic text-input probe may also match inputs on a
    logged-in page (e.g. a search field) — confirm against real pages.
    """
    login_wall_xpaths = (
        '//div[contains(text(),"See more on Facebook")]',
        '//input[@type="email" or @type="text"]',
    )
    try:
        url = driver.current_url
        if url and "login" in url:
            _fb_login(driver, username, password)
            return

        for xpath in login_wall_xpaths:
            try:
                marker = driver.find_element(By.XPATH, xpath)
                visible = marker.is_displayed()
            except Exception:
                # Element absent (or stale): this probe does not apply.
                continue
            if visible:
                _fb_login(driver, username, password)
                return
    except Exception as e:
        print(f"[Facebook] Gagal cek status login: {e}")
# ── Scraping ───────────────────────────────────────────────────────────────────
# Upper bound on consecutive "view more" clicks per button type, so that a
# button which never disappears cannot stall the scraper forever.
_MAX_EXPAND_CLICKS = 30

_FEED_ARTICLE_XPATH = '//div[@role="article"]'
_TAB_ARTICLE_XPATH = "//div[@role='article']"
_MORE_COMMENTS_XPATH = (
    ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]"
)
_MORE_REPLIES_XPATH = (
    ".//span[contains(text(),'Lihat') and contains(text(),'balasan')]"
    " | .//span[contains(text(),'View') and contains(text(),'replies')]"
)
_AUTHOR_XPATHS = (
    ".//h2//span//span",
    ".//strong//span",
    ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
)
_CAPTION_XPATH = ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']"
_COMMENTS_XPATH = ".//div[@aria-label='Komentar' or @aria-label='Comment']//div[@dir='auto']"


def _group_name_from_url(group_url: str) -> str:
    """Return the last path segment of *group_url* (query string and trailing slash ignored)."""
    return group_url.split("?")[0].rstrip("/").split("/")[-1]


def _relogin_if_needed(driver, username, password, url: str, settle: float) -> None:
    """Run ensure_logged_in and reload *url* if the login flow navigated away from it."""
    before = driver.current_url
    ensure_logged_in(driver, username, password)
    if driver.current_url != before:
        driver.get(url)
        time.sleep(settle)


def _find_permalink(post, group_url: str) -> str | None:
    """Return the post's permalink without its query string, or None if none can be derived."""
    for fragment in ("/posts/", "/permalink/"):
        try:
            link_el = post.find_element(By.XPATH, f".//a[contains(@href,'{fragment}')]")
            href = link_el.get_attribute("href")
        except Exception:
            continue
        if href:
            return href.split("?")[0]
    try:
        data_ft = post.get_attribute("data-ft")
        if data_ft and "top_level_post_id" in data_ft:
            pid = json.loads(data_ft).get("top_level_post_id")
            if pid:
                return f"{group_url.split('?')[0].rstrip('/')}/posts/{pid}/"
    except Exception:
        pass
    return None


def _open_post_tab(driver, username, password, permalink: str):
    """Open *permalink* in a new tab, focus it, and return its first article element (None on failure)."""
    try:
        known = set(driver.window_handles)
        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        driver.execute_script("window.open(arguments[0], '_blank');", permalink)
        time.sleep(1)
        fresh = [h for h in driver.window_handles if h not in known]
        if not fresh:
            return None
        driver.switch_to.window(fresh[0])
        time.sleep(3)
        _relogin_if_needed(driver, username, password, permalink, settle=3)
        return driver.find_element(By.XPATH, _TAB_ARTICLE_XPATH)
    except Exception:
        return None


def _click_until_gone(driver, context, xpath: str, limit: int = _MAX_EXPAND_CLICKS) -> None:
    """Click the element matching *xpath* inside *context* until it is gone or *limit* clicks were made."""
    for _ in range(limit):
        try:
            btn = context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(2)
        except Exception:
            return


def _extract_author(context) -> str:
    """Return the text of the first author XPath that matches, or "Unknown"."""
    for xpath in _AUTHOR_XPATHS:
        try:
            return context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
    return "Unknown"


def _extract_caption(context) -> str:
    """Return the post message blocks joined by newlines, capped at 2000 characters."""
    try:
        blocks = context.find_elements(By.XPATH, _CAPTION_XPATH)
        texts = (b.text.strip() for b in blocks)
        return "\n".join(t for t in texts if t)[:2000]
    except Exception:
        return ""


def _extract_comments(context) -> list:
    """Return the distinct non-empty comment texts in document order."""
    comments: list = []
    seen: set = set()
    try:
        for block in context.find_elements(By.XPATH, _COMMENTS_XPATH):
            text = block.text.strip()
            if text and text not in seen:
                seen.add(text)
                comments.append(text)
    except Exception:
        # Keep whatever was collected before the failure.
        pass
    return comments


def _close_extra_tabs(driver, main_handle) -> None:
    """Close every window except *main_handle* and give it focus again."""
    for handle in list(driver.window_handles):
        if handle == main_handle:
            continue
        try:
            driver.switch_to.window(handle)
            driver.close()
        except Exception:
            continue
    driver.switch_to.window(main_handle)


def _scrape_post(driver, username, password, post, group_url: str,
                 group_name: str, seen_keys: set) -> dict | None:
    """Scrape one feed article; return its record, or None if empty or already collected."""
    driver.execute_script("arguments[0].scrollIntoView(true);", post)
    time.sleep(1)

    permalink = _find_permalink(post, group_url)
    if permalink:
        if permalink in seen_keys:
            return None
        context = _open_post_tab(driver, username, password, permalink)
        if context is None:
            # Not marked as seen, so a later scroll pass may retry it.
            return None
        seen_keys.add(permalink)
        # Comments are only expanded on the dedicated permalink tab.
        _click_until_gone(driver, context, _MORE_COMMENTS_XPATH)
        _click_until_gone(driver, context, _MORE_REPLIES_XPATH)
    else:
        # Without a permalink, read the article as rendered in the feed;
        # opening the group URL again would scrape an unrelated article.
        context = post

    author = _extract_author(context)
    caption = _extract_caption(context)
    comments = _extract_comments(context)
    if not (caption or comments):
        return None

    if not permalink:
        key = (author, caption) if caption else (author, tuple(comments))
        if key in seen_keys:
            return None
        seen_keys.add(key)

    return {
        "group_name": group_name,
        "group_url": group_url,
        "post_url": permalink or group_url,
        "author": author,
        "caption": caption,
        "comments": comments,
    }


def _scrape_group(driver, username, password, group_url: str, max_scrolls: int = 5) -> list:
    """Scrape posts from a single Facebook group URL.

    The group page is scrolled up to *max_scrolls* times (stopping early
    once the page height no longer grows).  After every scroll all
    ``role="article"`` elements are visited; posts with a permalink are
    opened in a separate tab so their comments can be expanded, and each
    post is collected at most once per call.

    Args:
        driver: Logged-in Selenium WebDriver.
        username: Credentials used if a login wall appears mid-scrape.
        password: See *username*.
        group_url: Group URL; ``m.``/``web.`` hosts are rewritten to ``www.``.
        max_scrolls: Maximum number of scroll passes.

    Returns:
        A list of dicts with the keys ``group_name``, ``group_url``,
        ``post_url``, ``author``, ``caption`` and ``comments``.  Posts with
        neither caption nor comments are omitted.  ``post_url`` falls back
        to the group URL when no permalink could be found.
    """
    posts: list = []
    group_url = group_url.replace("m.facebook.com", "www.facebook.com").replace(
        "web.facebook.com", "www.facebook.com"
    )
    group_name = _group_name_from_url(group_url)
    print(f"[Facebook] Scraping grup: {group_url}")
    try:
        driver.get(group_url)
        time.sleep(6)
        _relogin_if_needed(driver, username, password, group_url, settle=6)
        main_handle = driver.current_window_handle
    except Exception as e:
        print(f"[Facebook] Gagal buka grup: {e}")
        return posts

    seen_keys: set = set()
    last_height = driver.execute_script("return document.body.scrollHeight")
    for scroll_n in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        _relogin_if_needed(driver, username, password, group_url, settle=6)
        post_elements = driver.find_elements(By.XPATH, _FEED_ARTICLE_XPATH)
        print(f"[Facebook] Scroll {scroll_n + 1} → {len(post_elements)} artikel ditemukan")

        for post in post_elements:
            try:
                record = _scrape_post(
                    driver, username, password, post, group_url, group_name, seen_keys
                )
                if record is not None:
                    posts.append(record)
            except Exception as e:
                print(f"[Facebook] Error baca post: {e}")
            finally:
                # Always return to the feed tab, whatever happened above.
                _close_extra_tabs(driver, main_handle)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    return posts
# ── Public API ─────────────────────────────────────────────────────────────────
def scrape_facebook(username: str, password: str, groups: list | None = None) -> list:
    """Log into Facebook and scrape posts from the given group URLs.

    Args:
        username: Facebook login; an empty value aborts immediately.
        password: Facebook password; an empty value aborts immediately.
        groups: Group URLs to scrape.  A single URL string is accepted as
            well.  Blank and non-string entries are ignored and duplicate
            URLs are scraped once.

    Returns:
        The post dicts produced by ``_scrape_group`` for all groups, in the
        order the groups were given.  An empty list is returned when
        credentials or usable group URLs are missing or when login fails.
        A failure while scraping one group is reported on stdout and does
        not prevent the remaining groups from being scraped.
    """
    if not username or not password:
        print("[Facebook] Username/password tidak disediakan.")
        return []

    # A bare string would otherwise be iterated character by character.
    if isinstance(groups, str):
        groups = [groups]
    cleaned = (g.strip() for g in (groups or ()) if isinstance(g, str))
    group_urls = list(dict.fromkeys(url for url in cleaned if url))
    if not group_urls:
        # Decided before a browser is started, so nothing needs cleanup.
        print("[Facebook] Tidak ada URL grup yang disediakan — skip.")
        return []

    driver = _create_driver(mobile=False)
    all_data: list = []
    try:
        if not _fb_login(driver, username, password):
            print("[Facebook] Login gagal.")
            return []
        for group_url in group_urls:
            try:
                all_data.extend(_scrape_group(driver, username, password, group_url))
            except Exception as e:
                # Keep the data gathered so far and move on to the next group.
                print(f"[Facebook] Gagal scrape grup {group_url}: {e}")
    except Exception as e:
        print(f"[Facebook] Fatal error: {e}")
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print(f"[Facebook] Total article posts dari Facebook: {len(all_data)}")
    return all_data