File size: 5,509 Bytes
4cd190b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
services/instagram.py
Scrape komentar + caption Instagram berdasarkan hashtag/keyword
menggunakan Apify Instagram Scraper (apify/instagram-scraper).
"""

import os
import time
import requests

# ── CONFIG ──
APIFY_API_KEY = os.getenv("APIFY_API_KEY")          # set in HF Spaces Secrets
ACTOR_ID      = "shu8hvrXbJbY3Eb9W"                 # apify/instagram-scraper
BASE_URL      = "https://api.apify.com/v2"

# timeout & polling
MAX_WAIT_SEC  = 120    # max seconds to wait for the actor to finish
POLL_INTERVAL = 4      # poll status every N seconds
MAX_POSTS     = 10     # posts per hashtag
MAX_COMMENTS  = 20     # comments per post


# ────────────────────────────────────────────
#  INTERNAL HELPERS
# ────────────────────────────────────────────

def _headers() -> dict:
    """Build the Authorization header carrying the Apify API token."""
    token = APIFY_API_KEY
    return {"Authorization": "Bearer " + str(token)}


def _run_actor(keyword: str) -> str | None:
    """
    Start the Apify actor for the given hashtag/keyword.

    The payload follows the apify/instagram-scraper input schema.
    Returns the run id on success, or None if the request or
    response parsing failed (best-effort: errors are only logged).
    """
    endpoint = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
    payload = {
        "hashtags": [keyword.lstrip("#")],
        "resultsLimit": MAX_POSTS,
        "addParentData": False,
        "scrapeComments": True,
        "commentsLimit": MAX_COMMENTS,
        # empty string = no date cutoff
        "scrapePostsUntilDate": "",
    }
    try:
        resp = requests.post(endpoint, json=payload, headers=_headers(), timeout=30)
        resp.raise_for_status()
        started_id = resp.json()["data"]["id"]
        print(f"βœ… Apify run started: {started_id}")
        return started_id
    except Exception as exc:
        print(f"❌ Apify run error: {exc}")
        return None


def _wait_for_run(run_id: str) -> bool:
    """
    Poll the Apify run status until it finishes or MAX_WAIT_SEC elapses.

    Returns True only when the run reaches SUCCEEDED; False on any
    terminal failure state or timeout. Transient polling errors are
    logged and retried on the next interval.
    """
    status_url = f"{BASE_URL}/actor-runs/{run_id}"

    # Same elapsed-time sequence as a `while elapsed < MAX_WAIT_SEC`
    # loop stepping by POLL_INTERVAL after each check.
    for waited in range(0, MAX_WAIT_SEC, POLL_INTERVAL):
        try:
            resp = requests.get(status_url, headers=_headers(), timeout=15)
            state = resp.json()["data"]["status"]

            if state == "SUCCEEDED":
                print(f"βœ… Apify run SUCCEEDED ({waited}s)")
                return True
            if state in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"❌ Apify run {state}")
                return False

            print(f"⏳ Apify status: {state} ({waited}s)")
        except Exception as exc:
            print(f"⚠️  Polling error: {exc}")

        time.sleep(POLL_INTERVAL)

    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
    return False


def _fetch_dataset(run_id: str) -> list:
    """
    Download the dataset items produced by a finished run.

    Returns the parsed JSON list (capped at 200 items), or []
    if the request or JSON parsing fails.
    """
    items_url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    try:
        resp = requests.get(items_url, headers=_headers(), params={"limit": 200}, timeout=30)
        resp.raise_for_status()
        data = resp.json()
    except Exception as exc:
        print(f"❌ Fetch dataset error: {exc}")
        data = []
    return data


# ────────────────────────────────────────────
#  TEXT EXTRACTION
# ────────────────────────────────────────────

def _extract_texts(items: list) -> list[str]:
    """
    Ekstrak caption + komentar dari hasil Apify.
    Struktur item apify/instagram-scraper:
      item.caption       β†’ teks caption post
      item.comments[]    β†’ list komentar
        .text            β†’ teks komentar
    """
    texts = []

    for item in items:
        # Caption post
        caption = item.get("caption") or item.get("text") or ""
        if caption and len(caption.strip()) > 5:
            texts.append(caption.strip())

        # Komentar
        comments = item.get("comments") or item.get("latestComments") or []
        for c in comments:
            if isinstance(c, dict):
                body = c.get("text") or c.get("ownerComment", {}).get("text") or ""
            elif isinstance(c, str):
                body = c
            else:
                body = ""

            if body and len(body.strip()) > 3:
                texts.append(body.strip())

    return texts


# ────────────────────────────────────────────
#  PUBLIC INTERFACE
# ────────────────────────────────────────────

def get_instagram_data(keyword: str) -> list[str]:
    """
    Scrape Instagram for a hashtag/keyword and return captions + comments.

    Pipeline: start the Apify actor, wait for it to finish, download
    the dataset, then extract the texts. Returns [] on any failure or
    when APIFY_API_KEY is not configured.

    Called from services/aggregator.py.
    """
    if not APIFY_API_KEY:
        print("⚠️  APIFY_API_KEY tidak diset β€” skip Instagram")
        return []

    # 1. start the actor
    run_id = _run_actor(keyword)
    if not run_id:
        return []

    # 2. wait until the run finishes
    if not _wait_for_run(run_id):
        return []

    # 3. download the dataset
    items = _fetch_dataset(run_id)
    if not items:
        print("⚠️  Dataset Apify kosong")
        return []

    # 4. extract the texts
    texts = _extract_texts(items)
    print(f"βœ… Instagram: {len(texts)} teks dari {len(items)} post")
    return texts