noranisa committed on
Commit
64b0b38
·
verified ·
1 Parent(s): 3e22e71

Create services/tiktok.py

Browse files
Files changed (1) hide show
  1. services/tiktok.py +201 -0
services/tiktok.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
services/tiktok.py
Scrape TikTok comments and captions by hashtag/keyword
using the Apify clockworks/tiktok-scraper actor (GdWCkxBtKWOsKjdch).

Data collected:
- item.text → video caption/description
- item.comments[] → comments (when available)
- item.authorMeta → author info (optional)
"""
11
+
12
+ import os
13
+ import time
14
+ import requests
15
+
16
+ # ── CONFIG ──
17
# Apify access: the key is read from the environment once, at import time,
# and is None when the variable is not set.
APIFY_API_KEY = os.getenv("APIFY_API_KEY")
ACTOR_ID = "GdWCkxBtKWOsKjdch"  # clockworks/tiktok-scraper
BASE_URL = "https://api.apify.com/v2"

# Polling budget, both in seconds (POLL_INTERVAL is handed to time.sleep).
MAX_WAIT_SEC = 150  # the TikTok scraper is a bit slower than the Instagram one
POLL_INTERVAL = 5
MAX_ITEMS = 20  # number of videos per hashtag
MAX_COMMENTS = 30  # comments kept per video
25
+
26
+
27
+ # ────────────────────────────────────────────
28
+ # INTERNAL HELPERS
29
+ # ────────────────────────────────────────────
30
+
31
def _headers() -> dict:
    """Build the HTTP headers sent with every Apify API request.

    Returns:
        A one-entry dict whose ``Authorization`` value is ``APIFY_API_KEY``
        formatted as a Bearer token.
    """
    bearer = f"Bearer {APIFY_API_KEY}"
    return {"Authorization": bearer}
33
+
34
+
35
+ def _run_actor(keyword: str) -> str | None:
36
+ """
37
+ Jalankan Apify TikTok actor.
38
+ Input schema clockworks/tiktok-scraper:
39
+ hashtags β†’ list hashtag (tanpa #)
40
+ resultsPerPage β†’ jumlah video per hashtag
41
+ maxRequestRetries β†’ retry jika gagal
42
+ """
43
+ url = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
44
+ hashtag = keyword.lstrip("#").strip().replace(" ", "")
45
+
46
+ body = {
47
+ "hashtags": [hashtag],
48
+ "resultsPerPage": MAX_ITEMS,
49
+ "shouldDownloadVideos": False,
50
+ "shouldDownloadCovers": False,
51
+ "shouldDownloadSubtitles": False,
52
+ "shouldDownloadSlideshowImages": False,
53
+ "maxRequestRetries": 3,
54
+ }
55
+
56
+ try:
57
+ r = requests.post(url, json=body, headers=_headers(), timeout=30)
58
+ r.raise_for_status()
59
+ run_id = r.json()["data"]["id"]
60
+ print(f"βœ… Apify TikTok run started: {run_id}")
61
+ return run_id
62
+ except Exception as e:
63
+ print(f"❌ Apify TikTok run error: {e}")
64
+ return None
65
+
66
+
67
def _wait_for_run(run_id: str) -> bool:
    """Poll an Apify actor run until it finishes or the wait budget runs out.

    Args:
        run_id: Id of the run to watch.

    Returns:
        ``True`` once the run reports ``SUCCEEDED``. ``False`` when it
        reports ``FAILED``, ``ABORTED`` or ``TIMED-OUT``, or when roughly
        ``MAX_WAIT_SEC`` seconds pass without one of those statuses.
        Polling errors are logged and retried until the budget is spent.
    """
    url = f"{BASE_URL}/actor-runs/{run_id}"
    # Measure wall-clock time. Counting only the sleeps ignored the time
    # spent inside each request (up to the 15 s timeout), so the loop could
    # run far longer than MAX_WAIT_SEC.
    started = time.monotonic()
    elapsed = 0

    while elapsed < MAX_WAIT_SEC:
        try:
            r = requests.get(url, headers=_headers(), timeout=15)
            # Report HTTP errors (401, 404, ...) as such instead of as a
            # KeyError raised while reading the error body.
            r.raise_for_status()
            status = r.json()["data"]["status"]
        except Exception as e:
            # Transient failures must not end the wait; log and poll again.
            print(f"⚠️ TikTok polling error: {e}")
        else:
            if status == "SUCCEEDED":
                print(f"✅ TikTok run SUCCEEDED ({elapsed}s)")
                return True
            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"❌ TikTok run {status}")
                return False
            print(f"⏳ TikTok status: {status} ({elapsed}s)")

        time.sleep(POLL_INTERVAL)
        elapsed = int(time.monotonic() - started)

    print(f"❌ TikTok timeout setelah {MAX_WAIT_SEC}s")
    return False
93
+
94
+
95
def _fetch_dataset(run_id: str) -> list:
    """Download the dataset items produced by a finished actor run.

    Args:
        run_id: Id of the run whose dataset should be read.

    Returns:
        The decoded JSON array of items (the request asks for at most 200),
        or an empty list when the request fails or the response body is not
        a JSON array.
    """
    url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    # requests serialises a Python bool as "True"; send the lowercase form
    # that the Apify API documents for boolean query parameters.
    params = {"limit": 200, "clean": "true"}

    try:
        r = requests.get(url, headers=_headers(), params=params, timeout=30)
        r.raise_for_status()
        items = r.json()
    except Exception as e:
        # Best-effort boundary: the caller treats [] as "nothing scraped".
        print(f"❌ TikTok fetch dataset error: {e}")
        return []

    if not isinstance(items, list):
        # Honour the declared return type even if the API answers with an
        # object (for example an error envelope) and a 2xx status.
        print(
            "❌ TikTok fetch dataset error: "
            f"unexpected payload type {type(items).__name__}"
        )
        return []
    return items
110
+
111
+
112
+ # ────────────────────────────────────────────
113
+ # TEXT EXTRACTION
114
+ # ────────────────────────────────────────────
115
+
116
+ def _extract_texts(items: list) -> list[str]:
117
+ """
118
+ Ekstrak caption + komentar dari output clockworks/tiktok-scraper.
119
+
120
+ Struktur item yang relevan:
121
+ item["text"] β†’ caption video
122
+ item["videoMeta"] β†’ metadata video (opsional)
123
+ item["authorMeta"]["name"] β†’ username author
124
+ item["hashtags"] β†’ list hashtag dalam post
125
+ item["comments"] β†’ list komentar (jika actor mendukung)
126
+ .text β†’ teks komentar
127
+ """
128
+ texts = []
129
+
130
+ for item in items:
131
+ # ── Caption / deskripsi video ──
132
+ caption = (
133
+ item.get("text") or
134
+ item.get("description") or
135
+ item.get("desc") or
136
+ ""
137
+ ).strip()
138
+
139
+ if caption and len(caption) > 4:
140
+ texts.append(caption)
141
+
142
+ # ── Komentar (beberapa versi actor menyertakan ini) ──
143
+ comments = (
144
+ item.get("comments") or
145
+ item.get("topComments") or
146
+ []
147
+ )
148
+
149
+ for c in comments[:MAX_COMMENTS]:
150
+ if isinstance(c, dict):
151
+ body = (
152
+ c.get("text") or
153
+ c.get("commentText") or
154
+ c.get("content") or
155
+ ""
156
+ ).strip()
157
+ elif isinstance(c, str):
158
+ body = c.strip()
159
+ else:
160
+ body = ""
161
+
162
+ if body and len(body) > 3:
163
+ texts.append(body)
164
+
165
+ return texts
166
+
167
+
168
+ # ────────────────────────────────────────────
169
+ # PUBLIC INTERFACE
170
+ # ────────────────────────────────────────────
171
+
172
def get_tiktok_data(keyword: str) -> list[str]:
    """Scrape TikTok for a hashtag/keyword and return the texts found.

    Runs the pipeline start actor -> wait for completion -> fetch dataset ->
    extract texts. Each failing stage short-circuits to an empty list, so
    this function never raises for scraping problems.

    Called from services/aggregator.py.

    Args:
        keyword: Hashtag or keyword, with or without a leading ``#``.

    Returns:
        A list of strings (captions and comments); empty when the keyword
        is blank, ``APIFY_API_KEY`` is not set, or any stage fails.
    """
    # Reject None / blank / "#"-only input up front: it would otherwise
    # crash in the actor start or launch a run for an empty hashtag.
    if not isinstance(keyword, str) or not keyword.strip().lstrip("#").strip():
        print("⚠️ TikTok keyword kosong — skip TikTok")
        return []

    if not APIFY_API_KEY:
        print("⚠️ APIFY_API_KEY tidak diset — skip TikTok")
        return []

    # 1. start the actor
    run_id = _run_actor(keyword)
    if not run_id:
        return []

    # 2. wait until it finishes
    if not _wait_for_run(run_id):
        return []

    # 3. fetch the dataset
    items = _fetch_dataset(run_id)
    if not items:
        print("⚠️ TikTok dataset kosong")
        return []

    # 4. extract the texts
    texts = _extract_texts(items)
    print(f"✅ TikTok: {len(texts)} teks dari {len(items)} video")
    return texts