Spaces:

noranisa
/

Sentimen-Analysis

Sleeping

App Files Files Community

noranisa committed on Mar 17

Commit

64b0b38

verified ·

1 Parent(s): 3e22e71

Create services/tiktok.py

Browse files

Files changed (1) hide show

services/tiktok.py +201 -0

services/tiktok.py ADDED Viewed

	@@ -0,0 +1,201 @@

+"""
+services/tiktok.py
+Scrape TikTok captions and comments for a hashtag/keyword
+using the Apify actor clockworks/tiktok-scraper (GdWCkxBtKWOsKjdch).
+Data of interest in each result item:
+  - item.text        → video caption/description
+  - item.comments[]  → comments (when available)
+  - item.authorMeta  → author info (optional; not currently read by this module)
+"""
+import os
+import time
+import requests
+# ── CONFIG ──
+# Apify API token, read from the environment once at import time (None when unset).
+APIFY_API_KEY = os.getenv("APIFY_API_KEY")
+ACTOR_ID      = "GdWCkxBtKWOsKjdch"       # clockworks/tiktok-scraper
+BASE_URL      = "https://api.apify.com/v2"
+MAX_WAIT_SEC  = 150    # polling budget in seconds; the TikTok scraper is a bit slower than the Instagram one
+POLL_INTERVAL = 5      # seconds slept between two status polls
+MAX_ITEMS     = 20     # number of videos per hashtag
+MAX_COMMENTS  = 30     # comments kept per video
+# ────────────────────────────────────────────
+#  INTERNAL HELPERS
+# ────────────────────────────────────────────
+def _headers() -> dict:
+    return {"Authorization": f"Bearer {APIFY_API_KEY}"}
+def _run_actor(keyword: str) -> str | None:
+    """
+    Jalankan Apify TikTok actor.
+    Input schema clockworks/tiktok-scraper:
+      hashtags        → list hashtag (tanpa #)
+      resultsPerPage  → jumlah video per hashtag
+      maxRequestRetries → retry jika gagal
+    """
+    url     = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
+    hashtag = keyword.lstrip("#").strip().replace(" ", "")
+    body = {
+        "hashtags":         [hashtag],
+        "resultsPerPage":   MAX_ITEMS,
+        "shouldDownloadVideos": False,
+        "shouldDownloadCovers": False,
+        "shouldDownloadSubtitles": False,
+        "shouldDownloadSlideshowImages": False,
+        "maxRequestRetries": 3,
+    }
+    try:
+        r = requests.post(url, json=body, headers=_headers(), timeout=30)
+        r.raise_for_status()
+        run_id = r.json()["data"]["id"]
+        print(f"✅ Apify TikTok run started: {run_id}")
+        return run_id
+    except Exception as e:
+        print(f"❌ Apify TikTok run error: {e}")
+        return None
+def _wait_for_run(run_id: str) -> bool:
+    """Polling sampai actor selesai atau timeout."""
+    url     = f"{BASE_URL}/actor-runs/{run_id}"
+    elapsed = 0
+    while elapsed < MAX_WAIT_SEC:
+        try:
+            r      = requests.get(url, headers=_headers(), timeout=15)
+            status = r.json()["data"]["status"]
+            if status == "SUCCEEDED":
+                print(f"✅ TikTok run SUCCEEDED ({elapsed}s)")
+                return True
+            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
+                print(f"❌ TikTok run {status}")
+                return False
+            print(f"⏳ TikTok status: {status} ({elapsed}s)")
+        except Exception as e:
+            print(f"⚠️  TikTok polling error: {e}")
+        time.sleep(POLL_INTERVAL)
+        elapsed += POLL_INTERVAL
+    print(f"❌ TikTok timeout setelah {MAX_WAIT_SEC}s")
+    return False
+def _fetch_dataset(run_id: str) -> list:
+    """Ambil hasil dataset dari run yang selesai."""
+    url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
+    try:
+        r = requests.get(
+            url,
+            headers=_headers(),
+            params={"limit": 200, "clean": True},
+            timeout=30
+        )
+        r.raise_for_status()
+        return r.json()
+    except Exception as e:
+        print(f"❌ TikTok fetch dataset error: {e}")
+        return []
+# ────────────────────────────────────────────
+#  TEXT EXTRACTION
+# ────────────────────────────────────────────
+def _extract_texts(items: list) -> list[str]:
+    """
+    Ekstrak caption + komentar dari output clockworks/tiktok-scraper.
+    Struktur item yang relevan:
+      item["text"]             → caption video
+      item["videoMeta"]        → metadata video (opsional)
+      item["authorMeta"]["name"] → username author
+      item["hashtags"]         → list hashtag dalam post
+      item["comments"]         → list komentar (jika actor mendukung)
+        .text                  → teks komentar
+    """
+    texts = []
+    for item in items:
+        # ── Caption / deskripsi video ──
+        caption = (
+            item.get("text") or
+            item.get("description") or
+            item.get("desc") or
+            ""
+        ).strip()
+        if caption and len(caption) > 4:
+            texts.append(caption)
+        # ── Komentar (beberapa versi actor menyertakan ini) ──
+        comments = (
+            item.get("comments") or
+            item.get("topComments") or
+            []
+        )
+        for c in comments[:MAX_COMMENTS]:
+            if isinstance(c, dict):
+                body = (
+                    c.get("text") or
+                    c.get("commentText") or
+                    c.get("content") or
+                    ""
+                ).strip()
+            elif isinstance(c, str):
+                body = c.strip()
+            else:
+                body = ""
+            if body and len(body) > 3:
+                texts.append(body)
+    return texts
+# ────────────────────────────────────────────
+#  PUBLIC INTERFACE
+# ────────────────────────────────────────────
+def get_tiktok_data(keyword: str) -> list[str]:
+    """
+    Scrape TikTok berdasarkan hashtag/keyword.
+    Return list of string (caption + komentar).
+    Dipanggil dari services/aggregator.py.
+    """
+    if not APIFY_API_KEY:
+        print("⚠️  APIFY_API_KEY tidak diset — skip TikTok")
+        return []
+    # 1. jalankan actor
+    run_id = _run_actor(keyword)
+    if not run_id:
+        return []
+    # 2. tunggu selesai
+    if not _wait_for_run(run_id):
+        return []
+    # 3. ambil dataset
+    items = _fetch_dataset(run_id)
+    if not items:
+        print("⚠️  TikTok dataset kosong")
+        return []
+    # 4. ekstrak teks
+    texts = _extract_texts(items)
+    print(f"✅ TikTok: {len(texts)} teks dari {len(items)} video")
+    return texts