# File size: 15,273 Bytes
# fa8ff66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
import os
import time
import json
import csv
from datetime import datetime
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ========== CONFIGURATION ==========
# Credentials are taken from the FB_USERNAME / FB_PASSWORD environment
# variables when they are set.
# NOTE(review): the literal fallbacks below are kept only so existing runs keep
# working; secrets should not live in source control — remove them and rotate
# the password once the environment variables are in place.
FB_USERNAME = os.environ.get("FB_USERNAME", "fatihr252@gmail.com")
FB_PASSWORD = os.environ.get("FB_PASSWORD", "Bambank1")
COOKIES_FILE = "fb_cookies.json"

# Groups to scrape
GROUP_INPUTS = [
    "https://web.facebook.com/groups/183039928416039?locale=id_ID",
    "https://web.facebook.com/groups/teraswarga?locale=id_ID",
    "https://web.facebook.com/groups/967901979894945?locale=id_ID"
]

# Scrape output locations. The timestamp is computed once so the CSV and JSON
# files of one run always share the same stem (two separate now() calls could
# straddle a second boundary).
_RUN_STAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
OUTPUT_CSV = f"facebook_groups_{_RUN_STAMP}.csv"
OUTPUT_JSON = f"facebook_groups_{_RUN_STAMP}.json"

# ========== SELENIUM SETUP ==========
# Build the Chrome options, then create the single browser session (`driver`)
# and the 15-second explicit wait (`wait`) that the functions below share.
options = uc.ChromeOptions()
for chrome_flag in (
    "--disable-notifications",
    "--disable-infobars",
    "--start-maximized",
):
    options.add_argument(chrome_flag)

driver = uc.Chrome(use_subprocess=True, options=options)
wait = WebDriverWait(driver, 15)


# ========== LOGIN FUNCTIONS ==========
def save_cookies(driver, path):
    """Serialize the cookies reported by ``driver.get_cookies()`` to *path* as JSON."""
    with open(path, "w") as cookie_file:
        serialized = json.dumps(driver.get_cookies())
        cookie_file.write(serialized)


def load_cookies(driver, path):
    """Read the JSON cookie list stored at *path* and add each entry to *driver*."""
    with open(path, "r") as cookie_file:
        stored_cookies = json.loads(cookie_file.read())
        # Cookies are added in the order they were stored.
        for entry in stored_cookies:
            driver.add_cookie(entry)

def fb_login(force=False):
    """Log the shared ``driver`` session in to Facebook.

    Opens the Facebook home page, then:

    1. Unless *force* is true, tries to restore the session from
       ``COOKIES_FILE``. If the URL after a refresh does not contain
       "login", the function returns (the search-bar wait is best effort).
    2. Otherwise fills in the username/password form, saves the resulting
       cookies to ``COOKIES_FILE`` and waits for the search bar.

    Args:
        force: When true, skip the cookie file and always log in with
            username/password.

    Raises:
        Exception: If no known login form could be filled in, or if the URL
            still contains "login" after submitting the credentials.
        Any exception raised by the final search-bar wait after the fallback
        redirect to the home page.
    """
    search_bar = (By.XPATH, '//input[@placeholder="Cari di Facebook"]')

    driver.get("https://www.facebook.com/")
    time.sleep(3)

    if not force and os.path.exists(COOKIES_FILE):
        try:
            load_cookies(driver, COOKIES_FILE)
            driver.refresh()
            time.sleep(5)
            if "login" not in driver.current_url:
                print("βœ… Login pakai cookies berhasil")
                # Make sure the search bar shows up before returning; a
                # missing search bar is only reported, not treated as fatal.
                try:
                    wait.until(EC.presence_of_element_located(search_bar))
                    print("πŸ” Search bar tersedia, siap mencari grup")
                except Exception:
                    print("⚠️ Search bar belum muncul, tetap lanjutkan")
                return
        except Exception as e:
            # Fall through to the username/password login below.
            print("⚠️ Cookies gagal dipakai:", e)

    print("πŸ”‘ Login manual pakai username/password...")
    _fill_and_submit_login_form()

    time.sleep(5)
    if "login" in driver.current_url:
        raise Exception("❌ Login gagal! Cek username/password")

    save_cookies(driver, COOKIES_FILE)
    print("βœ… Login sukses & cookies disimpan")

    # After a successful login, make sure the search bar is available.
    try:
        wait.until(EC.presence_of_element_located(search_bar))
        print("πŸ” Search bar tersedia, siap mencari grup")
    except Exception:
        print("⚠️ Search bar belum muncul, coba manual redirect ke beranda")
        driver.get("https://www.facebook.com/")
        wait.until(EC.presence_of_element_located(search_bar))


def _fill_and_submit_login_form():
    """Fill in and submit the first login-form variant found on the page.

    Each variant is (email locator, password locator, submit via form?).
    Variants are tried in order; any failure moves on to the next one.

    Raises:
        Exception: If none of the variants could be filled in and submitted;
            the last underlying error is chained as the cause.
    """
    form_variants = (
        # Classic form (id=email, id=pass), submitted with the login button.
        ((By.ID, "email"), (By.ID, "pass"), False),
        # Dynamic-id form, located by name/type and submitted via the form.
        (
            (By.XPATH, '//input[@name="email" and @type="text"]'),
            (By.XPATH, '//input[@name="pass" and @type="password"]'),
            True,
        ),
        # data-testid form, submitted with the login button.
        (
            (By.XPATH, '//input[@data-testid="royal-email"]'),
            (By.XPATH, '//input[@data-testid="royal-pass"]'),
            False,
        ),
    )

    last_error = None
    for email_locator, pass_locator, submit_via_form in form_variants:
        try:
            email_input = wait.until(EC.presence_of_element_located(email_locator))
            pass_input = driver.find_element(*pass_locator)
            email_input.clear()
            email_input.send_keys(FB_USERNAME)
            pass_input.clear()
            pass_input.send_keys(FB_PASSWORD)
            if submit_via_form:
                pass_input.submit()
            else:
                driver.find_element(By.NAME, "login").click()
            return
        except Exception as e:
            last_error = e

    raise Exception(f"❌ Tidak menemukan form login yang cocok: {last_error}") from last_error


def ensure_logged_in():
    """Re-login with username/password if the current page looks logged out.

    Three signals are checked in order; the first that matches triggers
    ``fb_login(force=True)`` and ends the check:

    1. the current URL contains "login";
    2. a visible "See more on Facebook" popup;
    3. a visible ``<input>`` of type "email" or "text" (assumed to be a login
       modal — NOTE(review): this locator matches any text input, confirm it
       does not fire on logged-in pages).

    This is best effort: any error, including a failed re-login, is printed
    and not raised.
    """
    try:
        # --- Case: redirected to the login page ---
        if driver.current_url and "login" in driver.current_url:
            print("⚠️ Redirect ke halaman login, mencoba login ulang...")
            fb_login(force=True)
            return

        login_prompts = (
            # --- Case: 'See more on Facebook' popup ---
            (
                '//div[contains(text(),"See more on Facebook")]',
                "⚠️ Popup login terdeteksi, login ulang...",
            ),
            # --- Case: email/password input shown in a modal ---
            (
                '//input[@type="email" or @type="text"]',
                "⚠️ Form login modal terdeteksi, login ulang...",
            ),
        )
        for xpath, message in login_prompts:
            # Only the element lookup is guarded here: a missing element just
            # means "this prompt is not shown". The re-login itself runs
            # outside this guard so its failure is reported below instead of
            # being silently discarded.
            try:
                prompt_visible = driver.find_element(By.XPATH, xpath).is_displayed()
            except Exception:
                prompt_visible = False

            if prompt_visible:
                print(message)
                fb_login(force=True)
                return

    except Exception as e:
        print("⚠️ Gagal cek login:", e)

# ========== SEARCH & OPEN GROUP ==========
def open_group(group_input):
    """Open a Facebook group given either its URL or its name.

    Args:
        group_input: A group link (anything starting with "http") or a group
            name to look up through the Facebook search box.

    Returns:
        The URL that was opened: *group_input* itself for a link, or the href
        of the first search result whose link text contains the name.
        ``None`` if the name was not found or the search failed.
    """
    # --- Case: the input is a direct link ---
    if group_input.startswith("http"):
        print(f"πŸ”— Buka langsung link grup: {group_input}")
        driver.get(group_input)
        time.sleep(5)

        ensure_logged_in()
        return group_input

    # --- Case: the input is a group name ---
    try:
        search_box = wait.until(
            EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]'))
        )
        print(f"πŸ” Mencari grup '{group_input}' via search...")
        search_box.clear()
        search_box.send_keys(group_input)
        search_box.submit()
        time.sleep(5)

        # Take the first link whose text contains the name. The name is quoted
        # as an XPath literal so that quotes inside it cannot break the
        # expression.
        name_literal = _xpath_string_literal(group_input)
        results = driver.find_elements(By.XPATH, f"//a[contains(text(),{name_literal})]")
        link = results[0].get_attribute("href") if results else None

        if link:
            print(f"βœ… Grup ditemukan: {link}")
            driver.get(link)
            time.sleep(5)
            return link

        print(f"❌ Grup '{group_input}' tidak ditemukan via search")
        return None

    except Exception as e:
        print(f"⚠️ Search gagal untuk '{group_input}':", e)
        return None


def _xpath_string_literal(text):
    """Return *text* as an XPath 1.0 string expression.

    XPath 1.0 has no escape character, so a value containing both quote
    styles is assembled with ``concat()``.
    """
    if '"' not in text:
        return f'"{text}"'
    if "'" not in text:
        return f"'{text}'"
    pieces = (f'"{piece}"' for piece in text.split('"'))
    return "concat(" + ", '\"', ".join(pieces) + ")"

def scroll_to_bottom(driver, max_scrolls=10, pause_time=2):
    """Scroll the page to its bottom repeatedly until it stops growing.

    After each scroll the function sleeps *pause_time* seconds and re-reads
    ``document.body.scrollHeight``; it stops as soon as the height is
    unchanged, or after *max_scrolls* scrolls.
    """
    read_height = "return document.body.scrollHeight"
    previous_height = driver.execute_script(read_height)
    remaining = max_scrolls
    while remaining > 0:
        remaining -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        current_height = driver.execute_script(read_height)
        if current_height == previous_height:
            return
        previous_height = current_height

# ========== SCRAPE GROUP POSTS ==========
def scrape_group(group_url, group_name, max_scrolls=3, max_posts=None):
    """Scrape author, caption and comments of the posts in a Facebook group.

    The work is done in two phases so that navigating to a post page never
    invalidates the feed elements still waiting to be processed:

    1. Scroll the group feed up to *max_scrolls* times and collect each
       post's permalink (de-duplicated across scroll rounds). A post for
       which no permalink can be determined is read directly from the feed
       and recorded with the group URL as its ``post_url``.
    2. Open every collected permalink and read the first article on it.

    Args:
        group_url: URL of the group feed to open.
        group_name: Label stored in each result's ``group_name`` field.
        max_scrolls: Maximum number of scroll rounds on the feed.
        max_posts: Optional cap on the number of posts returned; a falsy
            value means no cap.

    Returns:
        A list of dicts with the keys ``group_name``, ``group_url``,
        ``post_url``, ``author``, ``caption`` and ``comments`` (a list of
        strings), in the order the posts were first seen in the feed. A post
        whose permalink page could not be read is still returned, with author
        "Unknown", an empty caption and no comments.
    """
    print(f"πŸ“₯ Scraping grup: {group_name} ({group_url})")
    driver.get(group_url)
    time.sleep(4)
    ensure_logged_in()

    def build_record(post_url, author, caption, comments):
        return {
            "group_name": group_name,
            "group_url": group_url,
            "post_url": post_url,
            "author": author,
            "caption": caption,
            "comments": comments,
        }

    # ---- Phase 1: collect work items from the feed ----
    # Each item is either a permalink (str) still to be visited, or a finished
    # record (dict) for a post that had to be read in place.
    pending = []
    seen_permalinks = set()
    seen_inline = set()
    last_height = driver.execute_script("return document.body.scrollHeight")

    for scroll_round in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        ensure_logged_in()

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"πŸ”Ž Ditemukan {len(post_elements)} postingan pada scroll {scroll_round+1}")

        for idx, post in enumerate(post_elements):
            if max_posts and len(pending) >= max_posts:
                break

            try:
                permalink = _find_post_permalink(post, group_url)
                if permalink:
                    # The same article is found again on every scroll round.
                    if permalink not in seen_permalinks:
                        seen_permalinks.add(permalink)
                        pending.append(permalink)
                    continue

                # No permalink: read the post where it is, without navigating.
                driver.execute_script("arguments[0].scrollIntoView(true);", post)
                time.sleep(1)
                author, caption, comments = _read_post(post)
                inline_key = (author, caption)
                if inline_key in seen_inline:
                    continue
                seen_inline.add(inline_key)
                print("⚠️ Tidak ada permalink & tidak bisa generate. Tetap lanjut simpan data.")
                pending.append(build_record(group_url, author, caption, comments))

            except Exception as e:
                print(f"⚠️ Error baca postingan {idx}: {e}")
                continue

        # Stop scrolling once the cap is reached or the feed stopped growing.
        if max_posts and len(pending) >= max_posts:
            break
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # ---- Phase 2: visit the permalinks ----
    posts = []
    for item in pending:
        if isinstance(item, dict):
            data = item
        else:
            author, caption, comments = _read_permalink_page(item)
            data = build_record(item, author, caption, comments)

        print(f"βœ… Post captured: {data['author']} | {data['caption'][:60]}... | {len(data['comments'])} komentar")
        posts.append(data)

    return posts


# Upper bound on "view more" clicks per button type, so a button that never
# disappears cannot keep the scraper clicking forever.
_MAX_EXPAND_CLICKS = 50


def _find_post_permalink(post, group_url):
    """Return the permalink of a feed article, or None if none can be found."""
    # Prefer an explicit /posts/ link, then a /permalink/ link.
    for marker in ("/posts/", "/permalink/"):
        try:
            link_el = post.find_element(By.XPATH, f".//a[contains(@href,'{marker}')]")
            return link_el.get_attribute("href").split("?")[0]
        except Exception:
            continue

    # Fallback: build the link from the post id in the data-ft attribute.
    try:
        data_ft = post.get_attribute("data-ft")
        if data_ft and "top_level_post_id" in data_ft:
            post_id = json.loads(data_ft).get("top_level_post_id")
            if post_id:
                # Drop the query string first, then any trailing slash.
                base_url = group_url.split("?")[0].rstrip("/")
                return f"{base_url}/posts/{post_id}/"
    except Exception:
        pass
    return None


def _read_permalink_page(permalink):
    """Open *permalink* and return (author, caption, comments) of its first article."""
    try:
        driver.get(permalink)
        time.sleep(3)
        ensure_logged_in()
        post_context = driver.find_element(By.XPATH, "//div[@role='article']")
    except Exception as e:
        print(f"⚠️ Gagal buka permalink {permalink}: {e}")
        return "Unknown", "", []
    return _read_post(post_context)


def _read_post(post_context):
    """Return (author, caption, comments) read from an article element."""
    author = _extract_author(post_context)
    _expand_comment_threads(post_context)
    caption = _extract_caption(post_context)
    comments = _extract_comments(post_context)
    return author, caption, comments


def _extract_author(post_context):
    """Return the author text from the first matching locator, else "Unknown"."""
    author_xpaths = (
        ".//h2//span//span",
        ".//strong//span",
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
    )
    for xpath in author_xpaths:
        try:
            return post_context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
    return "Unknown"


def _expand_comment_threads(post_context):
    """Click the "view more comments" and "view replies" buttons until none is left."""
    button_xpaths = (
        ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]",
        ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]",
    )
    for xpath in button_xpaths:
        for _ in range(_MAX_EXPAND_CLICKS):
            try:
                btn = post_context.find_element(By.XPATH, xpath)
                driver.execute_script("arguments[0].click();", btn)
                time.sleep(2)
            except Exception:
                # No such button (any more): this button type is done.
                break


def _extract_caption(post_context):
    """Return the post's message text (at most 2000 characters), or ""."""
    try:
        caption_blocks = post_context.find_elements(
            By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']"
        )
        stripped = (block.text.strip() for block in caption_blocks)
        return "\n".join(text for text in stripped if text)[:2000]
    except Exception:
        return ""


def _extract_comments(post_context):
    """Return the distinct, non-empty comment texts in page order."""
    try:
        comment_blocks = post_context.find_elements(
            By.XPATH, ".//div[@aria-label='Komentar']//div[@dir='auto']"
        )
        stripped = (block.text.strip() for block in comment_blocks)
        # dict.fromkeys drops duplicates while keeping first-seen order.
        return list(dict.fromkeys(text for text in stripped if text))
    except Exception:
        return []

# ========== MAIN ==========
# Log in, scrape every configured group, then write the combined results to
# the CSV and JSON output files.
all_data = []

try:
    fb_login()

    for g in GROUP_INPUTS:
        group_url = open_group(g)
        if group_url:
            posts = scrape_group(group_url, g)
            all_data.extend(posts)
finally:
    # Close the browser even when login or scraping raised, so a failed run
    # does not leave a Chrome process behind. Errors from quit() itself are
    # deliberately ignored.
    try:
        driver.quit()
    except Exception:
        pass

# Save to CSV (the "comments" list is written as its string representation)
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["group_name", "group_url", "post_url", "author", "caption", "comments"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_data)

# Save to JSON
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

print(f"βœ… Selesai. Data disimpan ke {OUTPUT_CSV} dan {OUTPUT_JSON}")