File size: 5,509 Bytes
4cd190b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
services/instagram.py
Scrape komentar + caption Instagram berdasarkan hashtag/keyword
menggunakan Apify Instagram Scraper (apify/instagram-scraper).
"""

import os
import time
import requests

# ── CONFIG ──
APIFY_API_KEY = os.getenv("APIFY_API_KEY")          # set in HF Spaces Secrets
ACTOR_ID      = "shu8hvrXbJbY3Eb9W"                 # apify/instagram-scraper
BASE_URL      = "https://api.apify.com/v2"

# timeout & polling
MAX_WAIT_SEC  = 120    # max seconds to wait for the actor to finish
POLL_INTERVAL = 4      # poll status every N seconds
MAX_POSTS     = 10     # posts per hashtag
MAX_COMMENTS  = 20     # comments per post


# ────────────────────────────────────────────
#  INTERNAL HELPERS
# ────────────────────────────────────────────

def _headers() -> dict:
    """Build the Authorization header carrying the Apify API token."""
    token = APIFY_API_KEY
    return {"Authorization": "Bearer " + str(token)}


def _run_actor(keyword: str) -> str | None:
    """
    Start the Apify actor for the given hashtag/keyword.

    The payload follows the apify/instagram-scraper input schema.
    Returns the run id on success, or None if the request or
    response parsing failed (best-effort: errors are only logged).
    """
    endpoint = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
    payload = {
        "hashtags": [keyword.lstrip("#")],
        "resultsLimit": MAX_POSTS,
        "addParentData": False,
        "scrapeComments": True,
        "commentsLimit": MAX_COMMENTS,
        # empty string = no date cutoff
        "scrapePostsUntilDate": "",
    }
    try:
        resp = requests.post(endpoint, json=payload, headers=_headers(), timeout=30)
        resp.raise_for_status()
        started_id = resp.json()["data"]["id"]
        print(f"βœ… Apify run started: {started_id}")
        return started_id
    except Exception as exc:
        print(f"❌ Apify run error: {exc}")
        return None


def _wait_for_run(run_id: str) -> bool:
    """
    Poll the Apify run status until it finishes or MAX_WAIT_SEC elapses.

    Returns True only when the run reaches SUCCEEDED; False on any
    terminal failure state or timeout. Transient polling errors are
    logged and retried on the next interval.
    """
    status_url = f"{BASE_URL}/actor-runs/{run_id}"

    # Same elapsed-time sequence as a `while elapsed < MAX_WAIT_SEC`
    # loop stepping by POLL_INTERVAL after each check.
    for waited in range(0, MAX_WAIT_SEC, POLL_INTERVAL):
        try:
            resp = requests.get(status_url, headers=_headers(), timeout=15)
            state = resp.json()["data"]["status"]

            if state == "SUCCEEDED":
                print(f"βœ… Apify run SUCCEEDED ({waited}s)")
                return True
            if state in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"❌ Apify run {state}")
                return False

            print(f"⏳ Apify status: {state} ({waited}s)")
        except Exception as exc:
            print(f"⚠️  Polling error: {exc}")

        time.sleep(POLL_INTERVAL)

    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
    return False


def _fetch_dataset(run_id: str) -> list:
    """
    Download the dataset items produced by a finished run.

    Returns the parsed JSON list (capped at 200 items), or []
    if the request or JSON parsing fails.
    """
    items_url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    try:
        resp = requests.get(items_url, headers=_headers(), params={"limit": 200}, timeout=30)
        resp.raise_for_status()
        data = resp.json()
    except Exception as exc:
        print(f"❌ Fetch dataset error: {exc}")
        data = []
    return data


# ────────────────────────────────────────────
#  TEXT EXTRACTION
# ────────────────────────────────────────────

def _extract_texts(items: list) -> list[str]:
    """
    Ekstrak caption + komentar dari hasil Apify.
    Struktur item apify/instagram-scraper:
      item.caption       β†’ teks caption post
      item.comments[]    β†’ list komentar
        .text            β†’ teks komentar
    """
    texts = []

    for item in items:
        # Caption post
        caption = item.get("caption") or item.get("text") or ""
        if caption and len(caption.strip()) > 5:
            texts.append(caption.strip())

        # Komentar
        comments = item.get("comments") or item.get("latestComments") or []
        for c in comments:
            if isinstance(c, dict):
                body = c.get("text") or c.get("ownerComment", {}).get("text") or ""
            elif isinstance(c, str):
                body = c
            else:
                body = ""

            if body and len(body.strip()) > 3:
                texts.append(body.strip())

    return texts


# ────────────────────────────────────────────
#  PUBLIC INTERFACE
# ────────────────────────────────────────────

def get_instagram_data(keyword: str) -> list[str]:
    """
    Scrape Instagram for a hashtag/keyword and return captions + comments.

    Pipeline: start the Apify actor, wait for it to finish, download
    the dataset, then extract the texts. Returns [] on any failure or
    when APIFY_API_KEY is not configured.

    Called from services/aggregator.py.
    """
    if not APIFY_API_KEY:
        print("⚠️  APIFY_API_KEY tidak diset β€” skip Instagram")
        return []

    # 1. start the actor
    run_id = _run_actor(keyword)
    if not run_id:
        return []

    # 2. wait until the run finishes
    if not _wait_for_run(run_id):
        return []

    # 3. download the dataset
    items = _fetch_dataset(run_id)
    if not items:
        print("⚠️  Dataset Apify kosong")
        return []

    # 4. extract the texts
    texts = _extract_texts(items)
    print(f"βœ… Instagram: {len(texts)} teks dari {len(items)} post")
    return texts