noranisa committed on
Commit
4cd190b
Β·
verified Β·
1 Parent(s): 496de3b

Create instagram.py

Browse files
Files changed (1) hide show
  1. services/instagram.py +171 -0
services/instagram.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ services/instagram.py
3
+ Scrape komentar + caption Instagram berdasarkan hashtag/keyword
4
+ menggunakan Apify Instagram Scraper (apify/instagram-scraper).
5
+ """
6
+
7
+ import os
8
+ import time
9
+ import requests
10
+
11
+ # ── CONFIG ──
12
+ APIFY_API_KEY = os.getenv("APIFY_API_KEY") # set di HF Spaces Secrets
13
+ ACTOR_ID = "shu8hvrXbJbY3Eb9W" # apify/instagram-scraper
14
+ BASE_URL = "https://api.apify.com/v2"
15
+
16
+ # timeout & polling
17
+ MAX_WAIT_SEC = 120 # maks tunggu actor selesai
18
+ POLL_INTERVAL = 4 # cek setiap N detik
19
+ MAX_POSTS = 10 # jumlah post per hashtag
20
+ MAX_COMMENTS = 20 # komentar per post
21
+
22
+
23
+ # ────────────────────────────────────────────
24
+ # INTERNAL HELPERS
25
+ # ────────────────────────────────────────────
26
+
27
def _headers() -> dict:
    """Return the Authorization header required by the Apify REST API."""
    bearer = "Bearer {}".format(APIFY_API_KEY)
    return {"Authorization": bearer}
29
+
30
+
31
def _run_actor(keyword: str) -> str | None:
    """
    Start an Apify actor run for the given hashtag/keyword.

    The request body follows the apify/instagram-scraper input schema.
    Returns the run id on success, or None on any failure (best-effort:
    errors are printed, never raised to the caller).
    """
    endpoint = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
    payload = {
        "hashtags": [keyword.lstrip("#")],
        "resultsLimit": MAX_POSTS,
        "addParentData": False,
        "scrapeComments": True,
        "commentsLimit": MAX_COMMENTS,
        "scrapePostsUntilDate": "",  # empty = no date cutoff
    }
    try:
        response = requests.post(endpoint, json=payload, headers=_headers(), timeout=30)
        response.raise_for_status()
        run_id = response.json()["data"]["id"]
        print(f"✅ Apify run started: {run_id}")
        return run_id
    except Exception as exc:
        print(f"❌ Apify run error: {exc}")
        return None
54
+
55
+
56
def _wait_for_run(run_id: str) -> bool:
    """
    Poll the actor run until it finishes or MAX_WAIT_SEC elapses.

    Returns True only when the run ends with status SUCCEEDED; any
    terminal failure status or a timeout yields False.
    """
    status_url = f"{BASE_URL}/actor-runs/{run_id}"
    waited = 0

    while waited < MAX_WAIT_SEC:
        try:
            resp = requests.get(status_url, headers=_headers(), timeout=15)
            status = resp.json()["data"]["status"]

            if status == "SUCCEEDED":
                print(f"✅ Apify run SUCCEEDED ({waited}s)")
                return True
            elif status in {"FAILED", "ABORTED", "TIMED-OUT"}:
                print(f"❌ Apify run {status}")
                return False
            else:
                # still RUNNING / READY — keep polling
                print(f"⏳ Apify status: {status} ({waited}s)")
        except Exception as exc:
            # transient network/parse errors are tolerated; retry next tick
            print(f"⚠️ Polling error: {exc}")

        time.sleep(POLL_INTERVAL)
        waited += POLL_INTERVAL

    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
    return False
85
+
86
+
87
def _fetch_dataset(run_id: str) -> list:
    """
    Download the dataset items produced by a finished actor run.

    Returns the parsed JSON list (capped at 200 items), or [] when the
    request or JSON decoding fails.
    """
    items_url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    try:
        resp = requests.get(items_url, headers=_headers(), params={"limit": 200}, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except Exception as exc:
        print(f"❌ Fetch dataset error: {exc}")
        return []
99
+
100
+
101
+ # ────────────────────────────────────────────
102
+ # TEXT EXTRACTION
103
+ # ────────────────────────────────────────────
104
+
105
def _extract_texts(items: list) -> list[str]:
    """
    Extract caption + comment texts from Apify scraper output.

    Expected item structure (apify/instagram-scraper):
        item["caption"]      -> post caption text
        item["comments"][i]  -> comment dict (or plain string)
            ["text"]         -> comment body

    Captions of 5 chars or fewer and comment bodies of 3 chars or fewer
    (after stripping) are discarded as noise.

    Args:
        items: raw dataset items returned by the actor.

    Returns:
        Flat list of stripped caption/comment strings.
    """
    texts: list[str] = []

    for item in items:
        # Post caption — some actor versions expose it as "text" instead.
        caption = (item.get("caption") or item.get("text") or "").strip()
        if len(caption) > 5:
            texts.append(caption)

        # Comments — key name varies by actor version.
        comments = item.get("comments") or item.get("latestComments") or []
        for c in comments:
            if isinstance(c, dict):
                # BUGFIX: c.get("ownerComment", {}) returns None (not {})
                # when the key is present with a null value, which crashed
                # the chained .get("text"). `or {}` guards both cases.
                body = c.get("text") or (c.get("ownerComment") or {}).get("text") or ""
            elif isinstance(c, str):
                body = c
            else:
                body = ""  # ignore unexpected comment shapes

            body = body.strip()
            if len(body) > 3:
                texts.append(body)

    return texts
135
+
136
+
137
+ # ────────────────────────────────────────────
138
+ # PUBLIC INTERFACE
139
+ # ────────────────────────────────────────────
140
+
141
def get_instagram_data(keyword: str) -> list[str]:
    """
    Scrape Instagram by hashtag/keyword and return captions + comments
    as a flat list of strings.

    Called from services/aggregator.py. Returns [] when the API key is
    missing or any stage of the Apify pipeline fails.
    """
    if not APIFY_API_KEY:
        print("⚠️ APIFY_API_KEY tidak diset — skip Instagram")
        return []

    # Kick off the scraper actor.
    run_id = _run_actor(keyword)
    if not run_id:
        return []

    # Block until the run reaches a terminal state.
    if not _wait_for_run(run_id):
        return []

    # Pull the scraped items from the run's dataset.
    items = _fetch_dataset(run_id)
    if not items:
        print("⚠️ Dataset Apify kosong")
        return []

    # Flatten captions + comments into plain text.
    texts = _extract_texts(items)
    print(f"✅ Instagram: {len(texts)} teks dari {len(items)} post")
    return texts