Spaces:

noranisa
/

Sentimen-Analysis

Sleeping

App Files Files Community

Sentimen-Analysis / services /instagram.py

noranisa

Create instagram.py

4cd190b verified 18 days ago

raw

history blame contribute delete

5.51 kB

	"""
	services/instagram.py
	Scrape komentar + caption Instagram berdasarkan hashtag/keyword
	menggunakan Apify Instagram Scraper (apify/instagram-scraper).
	"""

	import os
	import time
	import requests

	# ── CONFIG ──
	APIFY_API_KEY = os.getenv("APIFY_API_KEY") # API token read from the environment (set it in HF Spaces Secrets)
	ACTOR_ID = "shu8hvrXbJbY3Eb9W" # apify/instagram-scraper
	BASE_URL = "https://api.apify.com/v2"

	# timeout & polling
	MAX_WAIT_SEC = 120 # max seconds to wait for the actor run to finish
	POLL_INTERVAL = 4 # seconds between run-status checks
	MAX_POSTS = 10 # number of posts requested per hashtag (sent as "resultsLimit")
	MAX_COMMENTS = 20 # number of comments requested per post (sent as "commentsLimit")


	# ────────────────────────────────────────────
	# INTERNAL HELPERS
	# ────────────────────────────────────────────

	def _headers() -> dict:
	    """Return the HTTP headers that carry the Apify bearer token."""
	    bearer = f"Bearer {APIFY_API_KEY}"
	    return {"Authorization": bearer}


	def _run_actor(keyword: str) -> str | None:
	    """
	    Start an Apify actor run for a hashtag and return its run id.

	    The keyword is trimmed and any leading "#" is removed before it is
	    sent as the single entry of the ``hashtags`` input field.

	    Args:
	        keyword: Hashtag or keyword to scrape, with or without a leading "#".

	    Returns:
	        The run id reported by Apify, or ``None`` when the keyword is empty
	        or the request / response parsing fails.
	    """
	    hashtag = (keyword or "").strip().lstrip("#").strip()
	    if not hashtag:
	        # Do not spend an actor run on an empty hashtag.
	        print("❌ Apify run error: empty keyword")
	        return None

	    url = f"{BASE_URL}/acts/{ACTOR_ID}/runs"
	    body = {
	        "hashtags": [hashtag],
	        "resultsLimit": MAX_POSTS,
	        "addParentData": False,
	        "scrapeComments": True,
	        "commentsLimit": MAX_COMMENTS,
	        "scrapePostsUntilDate": "",  # empty = no date limit
	    }
	    try:
	        r = requests.post(url, json=body, headers=_headers(), timeout=30)
	        r.raise_for_status()
	        run_id = r.json()["data"]["id"]
	    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
	        # Network/HTTP failure, invalid JSON, or an unexpected response shape.
	        print(f"❌ Apify run error: {e}")
	        return None

	    print(f"✅ Apify run started: {run_id}")
	    return run_id


	def _wait_for_run(run_id: str) -> bool:
	    """
	    Poll the actor run until it reaches a final status or the wait expires.

	    The wait is bounded by wall-clock time (``MAX_WAIT_SEC``), so slow or
	    timing-out status requests count against the budget as well as the
	    sleeps between polls. A failed poll is reported and retried on the next
	    interval rather than aborting the wait.

	    Args:
	        run_id: Id of the run returned when the actor was started.

	    Returns:
	        ``True`` if the run reported ``SUCCEEDED``; ``False`` if it reported
	        ``FAILED``, ``ABORTED`` or ``TIMED-OUT``, or if the wait expired.
	    """
	    url = f"{BASE_URL}/actor-runs/{run_id}"
	    started = time.monotonic()
	    deadline = started + MAX_WAIT_SEC

	    while time.monotonic() < deadline:
	        elapsed = int(time.monotonic() - started)
	        try:
	            r = requests.get(url, headers=_headers(), timeout=15)
	            r.raise_for_status()
	            status = r.json()["data"]["status"]
	        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
	            # Best effort: a single failed poll must not end the wait.
	            print(f"⚠️ Polling error: {e}")
	        else:
	            if status == "SUCCEEDED":
	                print(f"✅ Apify run SUCCEEDED ({elapsed}s)")
	                return True
	            if status in ("FAILED", "ABORTED", "TIMED-OUT"):
	                print(f"❌ Apify run {status}")
	                return False
	            print(f"⏳ Apify status: {status} ({elapsed}s)")

	        # Never sleep past the deadline.
	        remaining = deadline - time.monotonic()
	        if remaining <= 0:
	            break
	        time.sleep(min(POLL_INTERVAL, remaining))

	    print(f"❌ Apify timeout setelah {MAX_WAIT_SEC}s")
	    return False


	def _fetch_dataset(run_id: str) -> list:
	    """
	    Fetch the dataset items produced by a finished actor run.

	    Args:
	        run_id: Id of the run whose dataset should be read.

	    Returns:
	        The decoded list of items (at most 200 are requested), or an empty
	        list when the request fails, the body is not valid JSON, or the
	        decoded payload is not a list.
	    """
	    url = f"{BASE_URL}/actor-runs/{run_id}/dataset/items"
	    try:
	        r = requests.get(url, headers=_headers(), params={"limit": 200}, timeout=30)
	        r.raise_for_status()
	        payload = r.json()
	    except (requests.RequestException, ValueError) as e:
	        print(f"❌ Fetch dataset error: {e}")
	        return []

	    if not isinstance(payload, list):
	        # Honour the declared return type: callers iterate over the result.
	        print(f"❌ Fetch dataset error: unexpected payload type {type(payload).__name__}")
	        return []
	    return payload


	# ────────────────────────────────────────────
	# TEXT EXTRACTION
	# ────────────────────────────────────────────

	def _extract_texts(items: list) -> list[str]:
	"""
	Ekstrak caption + komentar dari hasil Apify.
	Struktur item apify/instagram-scraper:
	item.caption → teks caption post
	item.comments[] → list komentar
	.text → teks komentar
	"""
	texts = []

	for item in items:
	# Caption post
	caption = item.get("caption") or item.get("text") or ""
	if caption and len(caption.strip()) > 5:
	texts.append(caption.strip())

	# Komentar
	comments = item.get("comments") or item.get("latestComments") or []
	for c in comments:
	if isinstance(c, dict):
	body = c.get("text") or c.get("ownerComment", {}).get("text") or ""
	elif isinstance(c, str):
	body = c
	else:
	body = ""

	if body and len(body.strip()) > 3:
	texts.append(body.strip())

	return texts


	# ────────────────────────────────────────────
	# PUBLIC INTERFACE
	# ────────────────────────────────────────────

	def get_instagram_data(keyword: str) -> list[str]:
	    """
	    Scrape Instagram for a hashtag/keyword and return the collected texts.

	    Pipeline: start the actor run, wait for it to finish, download its
	    dataset, then extract captions and comments. Any stage that fails
	    yields an empty list.

	    Called from services/aggregator.py.

	    Args:
	        keyword: Hashtag or keyword to scrape.

	    Returns:
	        A list of strings (captions and comments); empty when the API key
	        is not set, the run does not succeed, or the dataset is empty.
	    """
	    if not APIFY_API_KEY:
	        print("⚠️ APIFY_API_KEY tidak diset — skip Instagram")
	        return []

	    # Steps 1 and 2: start the run, then wait for it only if it started.
	    apify_run = _run_actor(keyword)
	    finished = bool(apify_run) and _wait_for_run(apify_run)
	    if not finished:
	        return []

	    # Steps 3 and 4: download the dataset and pull the texts out of it.
	    posts = _fetch_dataset(apify_run)
	    if posts:
	        collected = _extract_texts(posts)
	        print(f"✅ Instagram: {len(collected)} teks dari {len(posts)} post")
	        return collected

	    print("⚠️ Dataset Apify kosong")
	    return []