""" services/instagram.py Scrape komentar + caption Instagram berdasarkan hashtag/keyword menggunakan Apify Instagram Scraper (apify/instagram-scraper). """ import os import time import requests # ── CONFIG ── APIFY_API_KEY = os.getenv("APIFY_API_KEY") # set di HF Spaces Secrets ACTOR_ID = "shu8hvrXbJbY3Eb9W" # apify/instagram-scraper BASE_URL = "https://api.apify.com/v2" # timeout & polling MAX_WAIT_SEC = 120 # maks tunggu actor selesai POLL_INTERVAL = 4 # cek setiap N detik MAX_POSTS = 10 # jumlah post per hashtag MAX_COMMENTS = 20 # komentar per post # ──────────────────────────────────────────── # INTERNAL HELPERS # ──────────────────────────────────────────── def _headers() -> dict: return {"Authorization": f"Bearer {APIFY_API_KEY}"} def _run_actor(keyword: str) -> str | None: """ Jalankan Apify actor dan kembalikan run_id. Input disesuaikan dengan apify/instagram-scraper schema. """ url = f"{BASE_URL}/acts/{ACTOR_ID}/runs" body = { "hashtags": [keyword.lstrip("#")], "resultsLimit": MAX_POSTS, "addParentData": False, "scrapeComments": True, "commentsLimit": MAX_COMMENTS, "scrapePostsUntilDate": "", # kosong = tidak ada batas tanggal } try: r = requests.post(url, json=body, headers=_headers(), timeout=30) r.raise_for_status() run_id = r.json()["data"]["id"] print(f"✅ Apify run started: {run_id}") return run_id except Exception as e: print(f"❌ Apify run error: {e}") return None def _wait_for_run(run_id: str) -> bool: """ Polling sampai actor selesai atau timeout. Return True jika SUCCEEDED. """ url = f"{BASE_URL}/actor-runs/{run_id}" elapsed = 0 while elapsed < MAX_WAIT_SEC: try: r = requests.get(url, headers=_headers(), timeout=15) status = r.json()["data"]["status"] if status == "SUCCEEDED": print(f"✅ Apify run SUCCEEDED ({elapsed}s)") return True if status in ("FAILED", "ABORTED", "TIMED-OUT"): print(f"❌ Apify run {status}") return False print(f"⏳ Apify status: {status} ({elapsed}s)") except Exception as e: print(f"⚠️ Polling error: {e}") time.sleep(POLL_INTERVAL) elapsed += POLL_INTERVAL print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s") return False def _fetch_dataset(run_id: str) -> list: """ Ambil hasil dataset dari run yang sudah selesai. """ url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items" try: r = requests.get(url, headers=_headers(), params={"limit": 200}, timeout=30) r.raise_for_status() return r.json() except Exception as e: print(f"❌ Fetch dataset error: {e}") return [] # ──────────────────────────────────────────── # TEXT EXTRACTION # ──────────────────────────────────────────── def _extract_texts(items: list) -> list[str]: """ Ekstrak caption + komentar dari hasil Apify. Struktur item apify/instagram-scraper: item.caption → teks caption post item.comments[] → list komentar .text → teks komentar """ texts = [] for item in items: # Caption post caption = item.get("caption") or item.get("text") or "" if caption and len(caption.strip()) > 5: texts.append(caption.strip()) # Komentar comments = item.get("comments") or item.get("latestComments") or [] for c in comments: if isinstance(c, dict): body = c.get("text") or c.get("ownerComment", {}).get("text") or "" elif isinstance(c, str): body = c else: body = "" if body and len(body.strip()) > 3: texts.append(body.strip()) return texts # ──────────────────────────────────────────── # PUBLIC INTERFACE # ──────────────────────────────────────────── def get_instagram_data(keyword: str) -> list[str]: """ Scrape Instagram berdasarkan hashtag/keyword. Return list of string (caption + komentar). Dipanggil dari services/aggregator.py. """ if not APIFY_API_KEY: print("⚠️ APIFY_API_KEY tidak diset — skip Instagram") return [] # 1. jalankan actor run_id = _run_actor(keyword) if not run_id: return [] # 2. tunggu selesai success = _wait_for_run(run_id) if not success: return [] # 3. ambil dataset items = _fetch_dataset(run_id) if not items: print("⚠️ Dataset Apify kosong") return [] # 4. ekstrak teks texts = _extract_texts(items) print(f"✅ Instagram: {len(texts)} teks dari {len(items)} post") return texts