Spaces:

noranisa
/

Sentimen-Analysis

Sleeping

App Files Files Community

noranisa committed 18 days ago

Commit

4cd190b

verified ·

1 Parent(s): 496de3b

Create instagram.py

Browse files

Files changed (1) hide show

services/instagram.py +171 -0

services/instagram.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""
+services/instagram.py
+Scrape komentar + caption Instagram berdasarkan hashtag/keyword
+menggunakan Apify Instagram Scraper (apify/instagram-scraper).
+"""
+import os
+import time
+import requests
# ── CONFIG ──
# Apify credentials and actor identity.
APIFY_API_KEY = os.environ.get("APIFY_API_KEY")  # configured in HF Spaces Secrets
ACTOR_ID = "shu8hvrXbJbY3Eb9W"  # apify/instagram-scraper
BASE_URL = "https://api.apify.com/v2"

# Timeout and polling.
MAX_WAIT_SEC = 120  # maximum time to wait for the actor to finish
POLL_INTERVAL = 4  # check the run status every N seconds

# Scrape limits.
MAX_POSTS = 10  # number of posts per hashtag
MAX_COMMENTS = 20  # comments per post
+# ────────────────────────────────────────────
+#  INTERNAL HELPERS
+# ────────────────────────────────────────────
def _headers() -> dict:
    """Build the HTTP headers used to authenticate against the Apify API.

    Returns:
        A dict with a single ``Authorization`` entry that carries the
        module-level ``APIFY_API_KEY`` as a bearer token.
    """
    bearer = "Bearer {}".format(APIFY_API_KEY)
    return {"Authorization": bearer}
+def _run_actor(keyword: str) -> str | None:
+    """
+    Jalankan Apify actor dan kembalikan run_id.
+    Input disesuaikan dengan apify/instagram-scraper schema.
+    """
+    url  = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
+    body = {
+        "hashtags":          [keyword.lstrip("#")],
+        "resultsLimit":      MAX_POSTS,
+        "addParentData":     False,
+        "scrapeComments":    True,
+        "commentsLimit":     MAX_COMMENTS,
+        "scrapePostsUntilDate": "",   # kosong = tidak ada batas tanggal
+    }
+    try:
+        r = requests.post(url, json=body, headers=_headers(), timeout=30)
+        r.raise_for_status()
+        run_id = r.json()["data"]["id"]
+        print(f"✅ Apify run started: {run_id}")
+        return run_id
+    except Exception as e:
+        print(f"❌ Apify run error: {e}")
+        return None
def _wait_for_run(run_id: str) -> bool:
    """Poll an Apify run until it finishes or the wait budget is used up.

    The budget is measured with a monotonic clock, so time spent inside slow
    or timed-out HTTP requests counts toward ``MAX_WAIT_SEC`` (not only the
    time spent sleeping between polls).  A single in-flight request may still
    overrun the deadline by up to its own 15 second timeout.

    Polling errors (network failure, HTTP error status, unexpected response
    body) are reported and retried until the deadline.

    Args:
        run_id: Id of the run to watch.

    Returns:
        ``True`` if the run reports status ``SUCCEEDED``; ``False`` if it
        reports ``FAILED``, ``ABORTED`` or ``TIMED-OUT``, or if the deadline
        passes first.
    """
    url = f"{BASE_URL}/actor-runs/{run_id}"
    started = time.monotonic()
    deadline = started + MAX_WAIT_SEC

    while True:
        try:
            r = requests.get(url, headers=_headers(), timeout=15)
            # Surface HTTP errors as such instead of a KeyError on "data".
            r.raise_for_status()
            status = r.json()["data"]["status"]
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print(f"⚠️  Polling error: {e}")
        else:
            elapsed = int(time.monotonic() - started)
            if status == "SUCCEEDED":
                print(f"✅ Apify run SUCCEEDED ({elapsed}s)")
                return True
            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
                print(f"❌ Apify run {status}")
                return False
            print(f"⏳ Apify status: {status} ({elapsed}s)")

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(POLL_INTERVAL, remaining))

    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
    return False
def _fetch_dataset(run_id: str, limit: int = 200) -> list:
    """Download the dataset items produced by a finished run.

    Args:
        run_id: Id of the run whose dataset should be read.
        limit: Value sent as the ``limit`` query parameter (default 200, the
            value that was previously hard-coded).

    Returns:
        The decoded JSON list of items.  An empty list is returned when the
        request fails, the body is not valid JSON, or the decoded payload is
        not a list.
    """
    url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
    try:
        r = requests.get(url, headers=_headers(), params={"limit": limit}, timeout=30)
        r.raise_for_status()
        payload = r.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Fetch dataset error: {e}")
        return []
    if not isinstance(payload, list):
        # Callers iterate the result as a list of items; reject anything else.
        print(f"❌ Fetch dataset error: unexpected payload type {type(payload).__name__}")
        return []
    return payload
+# ────────────────────────────────────────────
+#  TEXT EXTRACTION
+# ────────────────────────────────────────────
+def _extract_texts(items: list) -> list[str]:
+    """
+    Ekstrak caption + komentar dari hasil Apify.
+    Struktur item apify/instagram-scraper:
+      item.caption       → teks caption post
+      item.comments[]    → list komentar
+        .text            → teks komentar
+    """
+    texts = []
+    for item in items:
+        # Caption post
+        caption = item.get("caption") or item.get("text") or ""
+        if caption and len(caption.strip()) > 5:
+            texts.append(caption.strip())
+        # Komentar
+        comments = item.get("comments") or item.get("latestComments") or []
+        for c in comments:
+            if isinstance(c, dict):
+                body = c.get("text") or c.get("ownerComment", {}).get("text") or ""
+            elif isinstance(c, str):
+                body = c
+            else:
+                body = ""
+            if body and len(body.strip()) > 3:
+                texts.append(body.strip())
+    return texts
+# ────────────────────────────────────────────
+#  PUBLIC INTERFACE
+# ────────────────────────────────────────────
def get_instagram_data(keyword: str) -> list[str]:
    """Scrape Instagram for a hashtag/keyword and return the collected texts.

    Runs the whole pipeline: start the Apify actor, wait for the run to
    finish, download its dataset, then extract captions and comments.
    According to the original author's note this is called from
    services/aggregator.py.

    Args:
        keyword: Hashtag or keyword to search for.

    Returns:
        A list of strings (captions + comments).  The list is empty when
        ``APIFY_API_KEY`` is not set, the run could not be started, the run
        did not succeed in time, or the dataset came back empty.
    """
    if not APIFY_API_KEY:
        print("⚠️  APIFY_API_KEY tidak diset — skip Instagram")
        return []

    # Steps 1-2: start the actor, then block until it has succeeded.
    run_id = _run_actor(keyword)
    if not run_id or not _wait_for_run(run_id):
        return []

    # Step 3: download whatever the run produced.
    items = _fetch_dataset(run_id)
    if items:
        # Step 4: flatten captions and comments into plain strings.
        texts = _extract_texts(items)
        print(f"✅ Instagram: {len(texts)} teks dari {len(items)} post")
        return texts

    print("⚠️  Dataset Apify kosong")
    return []