"""
services/instagram.py
Scrape Instagram comments + captions by hashtag/keyword
using the Apify Instagram Scraper (apify/instagram-scraper).
"""
import os
import time
import requests
# ── CONFIG ──
# Apify credentials and endpoint.
# The API key comes from the environment (set it in HF Spaces Secrets).
APIFY_API_KEY = os.environ.get("APIFY_API_KEY")
# Actor ID of apify/instagram-scraper.
ACTOR_ID = "shu8hvrXbJbY3Eb9W"
BASE_URL = "https://api.apify.com/v2"

# Timeout & polling.
MAX_WAIT_SEC = 120  # maximum seconds to wait for the actor to finish
POLL_INTERVAL = 4  # poll the run status every N seconds

# Scrape limits.
MAX_POSTS = 10  # number of posts per hashtag
MAX_COMMENTS = 20  # comments per post
# ────────────────────────────────────────────
# INTERNAL HELPERS
# ────────────────────────────────────────────
def _headers() -> dict:
    """Build the HTTP headers carrying the Apify bearer token."""
    bearer = "Bearer {}".format(APIFY_API_KEY)
    return {"Authorization": bearer}
def _run_actor(keyword: str) -> str | None:
"""
Jalankan Apify actor dan kembalikan run_id.
Input disesuaikan dengan apify/instagram-scraper schema.
"""
url = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
body = {
"hashtags": [keyword.lstrip("#")],
"resultsLimit": MAX_POSTS,
"addParentData": False,
"scrapeComments": True,
"commentsLimit": MAX_COMMENTS,
"scrapePostsUntilDate": "", # kosong = tidak ada batas tanggal
}
try:
r = requests.post(url, json=body, headers=_headers(), timeout=30)
r.raise_for_status()
run_id = r.json()["data"]["id"]
print(f"β
Apify run started: {run_id}")
return run_id
except Exception as e:
print(f"β Apify run error: {e}")
return None
def _wait_for_run(run_id: str) -> bool:
    """
    Poll the actor run until it finishes or MAX_WAIT_SEC has passed.

    The deadline is measured with a monotonic clock, so time spent inside
    slow or failing HTTP requests counts toward the limit as well.

    Args:
        run_id: ID of the run to poll.

    Returns:
        True if the run reached status SUCCEEDED; False if it ended as
        FAILED / ABORTED / TIMED-OUT or the wait limit was hit.
    """
    url = f"{BASE_URL}/actor-runs/{run_id}"
    started = time.monotonic()
    deadline = started + MAX_WAIT_SEC

    while time.monotonic() < deadline:
        elapsed = int(time.monotonic() - started)
        try:
            r = requests.get(url, headers=_headers(), timeout=15)
            r.raise_for_status()
            status = r.json()["data"]["status"]
        except Exception as e:
            # A failed poll is not fatal; keep trying until the deadline.
            print(f"⚠️ Polling error: {e}")
        else:
            if status == "SUCCEEDED":
                print(f"✅ Apify run SUCCEEDED ({elapsed}s)")
                return True
            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"❌ Apify run {status}")
                return False
            print(f"⏳ Apify status: {status} ({elapsed}s)")

        # Never sleep past the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(POLL_INTERVAL, remaining))

    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
    return False
def _fetch_dataset(run_id: str, limit: int = 200) -> list:
    """
    Download the dataset items produced by a finished run.

    Args:
        run_id: ID of the run whose dataset is read.
        limit: Maximum number of items to request (default 200).

    Returns:
        The list of items, or an empty list when the request fails or the
        response body is not a JSON list (the error is printed, not raised).
    """
    url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    try:
        r = requests.get(
            url, headers=_headers(), params={"limit": limit}, timeout=30
        )
        r.raise_for_status()
        items = r.json()
    except Exception as e:
        print(f"❌ Fetch dataset error: {e}")
        return []

    # Callers iterate the result, so anything but a list is reported as empty.
    if not isinstance(items, list):
        print(
            "❌ Fetch dataset error: unexpected payload type "
            f"{type(items).__name__}"
        )
        return []
    return items
# ────────────────────────────────────────────
# TEXT EXTRACTION
# ────────────────────────────────────────────
def _extract_texts(items: list) -> list[str]:
    """
    Extract captions and comment texts from scraped items.

    Fields read from each item:
        item["caption"] (or item["text"])                → post caption
        item["comments"] (or item["latestComments"])     → list of comments
            comment["text"] (or comment["ownerComment"]["text"])
            a comment may also be a plain string

    Captions are kept when longer than 5 characters after stripping, comments
    when longer than 3. Items, captions and comments of an unexpected type
    are skipped instead of raising.

    Args:
        items: Dataset items; None is treated as empty.

    Returns:
        Stripped texts in encounter order: each item's caption, then its
        comments.
    """
    texts: list[str] = []
    for item in items or []:
        if not isinstance(item, dict):
            continue

        # Post caption
        caption = item.get("caption") or item.get("text")
        if isinstance(caption, str) and len(caption.strip()) > 5:
            texts.append(caption.strip())

        # Comments
        comments = item.get("comments") or item.get("latestComments") or []
        if not isinstance(comments, (list, tuple)):
            continue
        for c in comments:
            if isinstance(c, dict):
                # "ownerComment" may be present but None, so check its type
                # before reading from it.
                owner = c.get("ownerComment")
                fallback = owner.get("text") if isinstance(owner, dict) else None
                body = c.get("text") or fallback
            else:
                body = c
            if isinstance(body, str) and len(body.strip()) > 3:
                texts.append(body.strip())
    return texts
# ────────────────────────────────────────────
# PUBLIC INTERFACE
# ────────────────────────────────────────────
def get_instagram_data(keyword: str) -> list[str]:
    """
    Scrape Instagram for a hashtag/keyword and return the collected texts.

    Runs the pipeline: start the actor, wait for it to finish, fetch the
    dataset, extract captions and comments. Any failing step ends the
    pipeline with an empty result.

    Called from services/aggregator.py.

    Args:
        keyword: Hashtag or keyword to search for.

    Returns:
        List of strings (captions + comments); empty when APIFY_API_KEY is
        not set, a step fails, or the dataset is empty.
    """
    if not APIFY_API_KEY:
        print("⚠️ APIFY_API_KEY tidak diset — skip Instagram")
        return []

    # 1. start the actor
    run_id = _run_actor(keyword)
    if not run_id:
        return []

    # 2. wait for it to finish
    if not _wait_for_run(run_id):
        return []

    # 3. fetch the dataset
    items = _fetch_dataset(run_id)
    if not items:
        print("⚠️ Dataset Apify kosong")
        return []

    # 4. extract the texts
    texts = _extract_texts(items)
    print(f"✅ Instagram: {len(texts)} teks dari {len(items)} post")
    return texts