Sentimen-Analysis / services /instagram.py
noranisa's picture
Create instagram.py
4cd190b verified
"""
services/instagram.py
Scrape komentar + caption Instagram berdasarkan hashtag/keyword
menggunakan Apify Instagram Scraper (apify/instagram-scraper).
"""
import os
import time
import requests
# ── CONFIG ──
# Module-level settings for the Apify-backed Instagram scraper.
APIFY_API_KEY = os.getenv("APIFY_API_KEY") # Apify API token, read once at import time; set it in HF Spaces Secrets
ACTOR_ID = "shu8hvrXbJbY3Eb9W" # apify/instagram-scraper
BASE_URL = "https://api.apify.com/v2"
# timeout & polling
MAX_WAIT_SEC = 120 # max seconds to wait for the actor run to finish
POLL_INTERVAL = 4 # seconds between status checks
MAX_POSTS = 10 # number of posts requested per hashtag
MAX_COMMENTS = 20 # number of comments requested per post
# ────────────────────────────────────────────
# INTERNAL HELPERS
# ────────────────────────────────────────────
def _headers() -> dict:
    """Build the HTTP headers sent with every Apify API request.

    Returns:
        A dict holding a single ``Authorization`` entry that carries the
        module-level ``APIFY_API_KEY`` as a Bearer token.
    """
    bearer = f"Bearer {APIFY_API_KEY}"
    return {"Authorization": bearer}
def _run_actor(keyword: str) -> str | None:
"""
Jalankan Apify actor dan kembalikan run_id.
Input disesuaikan dengan apify/instagram-scraper schema.
"""
url = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
body = {
"hashtags": [keyword.lstrip("#")],
"resultsLimit": MAX_POSTS,
"addParentData": False,
"scrapeComments": True,
"commentsLimit": MAX_COMMENTS,
"scrapePostsUntilDate": "", # kosong = tidak ada batas tanggal
}
try:
r = requests.post(url, json=body, headers=_headers(), timeout=30)
r.raise_for_status()
run_id = r.json()["data"]["id"]
print(f"βœ… Apify run started: {run_id}")
return run_id
except Exception as e:
print(f"❌ Apify run error: {e}")
return None
def _wait_for_run(run_id: str) -> bool:
    """Poll an Apify run until it finishes or the wait budget is used up.

    The budget is measured with a monotonic clock, so time spent inside slow
    or timing-out status requests counts against ``MAX_WAIT_SEC`` as well as
    the sleeps between polls.

    Args:
        run_id: Identifier of the run to watch.

    Returns:
        ``True`` only when the run reports ``SUCCEEDED``. ``False`` when it
        reports ``FAILED``, ``ABORTED`` or ``TIMED-OUT``, or when
        ``MAX_WAIT_SEC`` elapses first.
    """
    url = f"{BASE_URL}/actor-runs/{run_id}"
    started = time.monotonic()
    while time.monotonic() - started < MAX_WAIT_SEC:
        try:
            r = requests.get(url, headers=_headers(), timeout=15)
            # Surface HTTP errors as such instead of a KeyError on "data".
            r.raise_for_status()
            status = r.json()["data"]["status"]
            elapsed = int(time.monotonic() - started)
            if status == "SUCCEEDED":
                print(f"βœ… Apify run SUCCEEDED ({elapsed}s)")
                return True
            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"❌ Apify run {status}")
                return False
            print(f"⏳ Apify status: {status} ({elapsed}s)")
        except Exception as e:
            # A failed poll is not fatal; keep trying until the deadline.
            print(f"⚠️ Polling error: {e}")
        # Never sleep past the deadline.
        remaining = MAX_WAIT_SEC - (time.monotonic() - started)
        if remaining > 0:
            time.sleep(min(POLL_INTERVAL, remaining))
    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
    return False
def _fetch_dataset(run_id: str) -> list:
    """Download the dataset items produced by a finished Apify run.

    Args:
        run_id: Identifier of the run whose dataset should be read.

    Returns:
        The decoded JSON body of the dataset-items endpoint (at most 200
        items are requested), or an empty list when the request or the
        decoding fails. Failures are printed, not raised.
    """
    endpoint = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    try:
        response = requests.get(
            endpoint,
            headers=_headers(),
            params={"limit": 200},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        print(f"❌ Fetch dataset error: {exc}")
        return []
    return payload
# ────────────────────────────────────────────
# TEXT EXTRACTION
# ────────────────────────────────────────────
def _extract_texts(items: list) -> list[str]:
    """Collect caption and comment texts from Apify dataset items.

    For each item the caption is read from ``caption`` (falling back to
    ``text``), and comments from ``comments`` (falling back to
    ``latestComments``). A comment may be a plain string or a dict with a
    ``text`` key or a nested ``ownerComment.text``.

    Malformed entries are skipped instead of raising: non-dict items,
    non-string captions or comment bodies, and a null or non-dict
    ``ownerComment``.

    Args:
        items: Dataset items as returned by the Apify dataset endpoint.

    Returns:
        Stripped texts in encounter order. Captions must be longer than
        5 characters and comments longer than 3 characters after stripping.
    """
    texts: list[str] = []
    for item in items or []:
        if not isinstance(item, dict):
            continue

        # Post caption
        caption = item.get("caption") or item.get("text") or ""
        if isinstance(caption, str):
            caption = caption.strip()
            if len(caption) > 5:
                texts.append(caption)

        # Comments
        comments = item.get("comments") or item.get("latestComments") or []
        if not isinstance(comments, (list, tuple)):
            continue
        for c in comments:
            if isinstance(c, dict):
                # "ownerComment" may be present but null, so guard the lookup.
                owner = c.get("ownerComment")
                nested = owner.get("text") if isinstance(owner, dict) else ""
                body = c.get("text") or nested or ""
            elif isinstance(c, str):
                body = c
            else:
                body = ""
            if isinstance(body, str):
                body = body.strip()
                if len(body) > 3:
                    texts.append(body)
    return texts
# ────────────────────────────────────────────
# PUBLIC INTERFACE
# ────────────────────────────────────────────
def get_instagram_data(keyword: str) -> list[str]:
    """Scrape Instagram captions and comments for a hashtag/keyword.

    Public entry point of this module; the original docstring notes it is
    called from ``services/aggregator.py``. The pipeline is: start the actor
    run, wait for it to finish, download the dataset, then extract the texts.

    Args:
        keyword: Hashtag or keyword to scrape.

    Returns:
        A list of caption and comment strings. The list is empty when the
        API key is missing or any pipeline step fails or yields nothing.
    """
    if not APIFY_API_KEY:
        print("⚠️ APIFY_API_KEY tidak diset β€” skip Instagram")
        return []

    # Steps 1 and 2: start the run, then block until it has finished.
    run_id = _run_actor(keyword)
    if not run_id or not _wait_for_run(run_id):
        return []

    # Step 3: download what the run produced.
    posts = _fetch_dataset(run_id)
    if posts:
        # Step 4: flatten captions and comments into plain strings.
        collected = _extract_texts(posts)
        print(f"βœ… Instagram: {len(collected)} teks dari {len(posts)} post")
        return collected

    print("⚠️ Dataset Apify kosong")
    return []