Spaces:
Sleeping
Sleeping
| """ | |
| services/instagram.py | |
| Scrape komentar + caption Instagram berdasarkan hashtag/keyword | |
| menggunakan Apify Instagram Scraper (apify/instagram-scraper). | |
| """ | |
| import os | |
| import time | |
| import requests | |
# ── CONFIG ──────────────────────────────────
APIFY_API_KEY = os.getenv("APIFY_API_KEY")  # set in HF Spaces Secrets
ACTOR_ID = "shu8hvrXbJbY3Eb9W"              # apify/instagram-scraper
BASE_URL = "https://api.apify.com/v2"

# Timeout & polling
MAX_WAIT_SEC = 120   # max seconds to wait for the actor to finish
POLL_INTERVAL = 4    # check status every N seconds
MAX_POSTS = 10       # number of posts per hashtag
MAX_COMMENTS = 20    # comments per post

# ────────────────────────────────────────────
# INTERNAL HELPERS
# ────────────────────────────────────────────
def _headers() -> dict:
    """Build the Authorization header used on every Apify API call."""
    token = APIFY_API_KEY
    return {"Authorization": f"Bearer {token}"}
def _run_actor(keyword: str) -> str | None:
    """
    Start the Apify actor for *keyword* and return its run id.

    The request body follows the apify/instagram-scraper input schema.

    Args:
        keyword: hashtag or keyword; a leading '#' is stripped.

    Returns:
        The run id string, or None when the run could not be started.
    """
    url = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
    body = {
        "hashtags": [keyword.lstrip("#")],  # actor expects the tag without '#'
        "resultsLimit": MAX_POSTS,
        "addParentData": False,
        "scrapeComments": True,
        "commentsLimit": MAX_COMMENTS,
        "scrapePostsUntilDate": "",  # empty = no date cutoff
    }
    try:
        r = requests.post(url, json=body, headers=_headers(), timeout=30)
        r.raise_for_status()
        run_id = r.json()["data"]["id"]
    # Narrowed from bare `except Exception`: catch network/HTTP failures
    # (RequestException), non-JSON bodies (ValueError) and unexpected
    # payload shapes (KeyError/TypeError); genuine bugs now propagate.
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        print(f"β Apify run error: {e}")
        return None
    print(f"β Apify run started: {run_id}")
    return run_id
def _wait_for_run(run_id: str) -> bool:
    """
    Poll the Apify run until it finishes or MAX_WAIT_SEC elapses.

    Args:
        run_id: id returned by _run_actor.

    Returns:
        True only when the run ends with status SUCCEEDED; False on a
        terminal failure status or on timeout.
    """
    status_url = f"{BASE_URL}/actor-runs/{run_id}"
    waited = 0
    while waited < MAX_WAIT_SEC:
        try:
            resp = requests.get(status_url, headers=_headers(), timeout=15)
            status = resp.json()["data"]["status"]
        except Exception as e:
            # Transient polling failure: log it and keep waiting.
            print(f"β οΈ Polling error: {e}")
        else:
            if status == "SUCCEEDED":
                print(f"β Apify run SUCCEEDED ({waited}s)")
                return True
            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"β Apify run {status}")
                return False
            print(f"β³ Apify status: {status} ({waited}s)")
        time.sleep(POLL_INTERVAL)
        waited += POLL_INTERVAL
    print(f"β Apify timeout setelah {MAX_WAIT_SEC}s")
    return False
def _fetch_dataset(run_id: str) -> list:
    """
    Download the dataset items produced by a finished Apify run.

    Args:
        run_id: id of a run that has already completed.

    Returns:
        The parsed item list (capped at 200 by the `limit` param), or an
        empty list when the request or JSON parsing fails.
    """
    items_url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    query = {"limit": 200}
    try:
        resp = requests.get(items_url, headers=_headers(), params=query, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        print(f"β Fetch dataset error: {e}")
        return []
| # ββββββββββββββββββββββββββββββββββββββββββββ | |
| # TEXT EXTRACTION | |
| # ββββββββββββββββββββββββββββββββββββββββββββ | |
| def _extract_texts(items: list) -> list[str]: | |
| """ | |
| Ekstrak caption + komentar dari hasil Apify. | |
| Struktur item apify/instagram-scraper: | |
| item.caption β teks caption post | |
| item.comments[] β list komentar | |
| .text β teks komentar | |
| """ | |
| texts = [] | |
| for item in items: | |
| # Caption post | |
| caption = item.get("caption") or item.get("text") or "" | |
| if caption and len(caption.strip()) > 5: | |
| texts.append(caption.strip()) | |
| # Komentar | |
| comments = item.get("comments") or item.get("latestComments") or [] | |
| for c in comments: | |
| if isinstance(c, dict): | |
| body = c.get("text") or c.get("ownerComment", {}).get("text") or "" | |
| elif isinstance(c, str): | |
| body = c | |
| else: | |
| body = "" | |
| if body and len(body.strip()) > 3: | |
| texts.append(body.strip()) | |
| return texts | |
# ────────────────────────────────────────────
# PUBLIC INTERFACE
# ────────────────────────────────────────────
def get_instagram_data(keyword: str) -> list[str]:
    """
    Scrape Instagram for a hashtag/keyword via Apify.

    Pipeline: start the actor run, wait for it to finish, download the
    dataset, then extract caption + comment texts.

    Args:
        keyword: hashtag or keyword to scrape.

    Returns:
        List of strings (captions + comments); empty list on any failure
        or when APIFY_API_KEY is not configured. Called from
        services/aggregator.py.
    """
    if not APIFY_API_KEY:
        print("β οΈ APIFY_API_KEY tidak diset β skip Instagram")
        return []
    run_id = _run_actor(keyword)
    if run_id and _wait_for_run(run_id):
        items = _fetch_dataset(run_id)
        if not items:
            print("β οΈ Dataset Apify kosong")
            return []
        texts = _extract_texts(items)
        print(f"β Instagram: {len(texts)} teks dari {len(items)} post")
        return texts
    return []