# NOTE: a "Spaces: Sleeping" page banner (web-capture residue, not source code)
# was removed from the top of this file.
"""
services/aggregator.py
Collect data from YouTube, Reddit, Instagram, TikTok, and Google News.
NOTE: Preprocessing is embedded directly in this file so it does not depend
on services.preprocessing_id, which may not yet exist in the repo.
"""
import re
from services.youtube import search_videos, get_comments
from services.reddit import get_reddit_comments

# ── Optional: prefer the deep preprocessing module when it is importable ──
# clean_text() / is_valid() below route to these when _DEEP is True.
try:
    from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid
    _DEEP = True
    print("β aggregator: deep preprocessing loaded")
except Exception:
    _DEEP = False
    print("β οΈ aggregator: using built-in basic preprocessing")

# ── Optional sources: degrade gracefully when a platform module is missing.
# Each fallback stub returns an empty list so call sites stay valid, and the
# *_OK flag lets collect_data() skip the source entirely.
try:
    from services.instagram import get_instagram_data
    INSTAGRAM_OK = True
except Exception:
    INSTAGRAM_OK = False
    def get_instagram_data(kw): return []
try:
    from services.tiktok import get_tiktok_data
    TIKTOK_OK = True
except Exception:
    TIKTOK_OK = False
    def get_tiktok_data(kw): return []
try:
    from services.google_news import get_google_news
    GNEWS_OK = True
except Exception:
    GNEWS_OK = False
    def get_google_news(kw): return []

# ────────────────────────────────────────────────
# BUILT-IN PREPROCESSING (self-contained fallback)
# ────────────────────────────────────────────────
| _STOPWORDS_BASIC = { | |
| 'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah', | |
| 'ada','pada','juga','tidak','bisa','sudah','saya','kamu','kami', | |
| 'mereka','kita','ya','jadi','kalau','tapi','atau','karena', | |
| 'the','is','in','of','a','an','and','it','for','that','this', | |
| } | |
| _SLANG_BASIC = { | |
| 'gak':'tidak','ga':'tidak','nggak':'tidak','yg':'yang','dgn':'dengan', | |
| 'utk':'untuk','krn':'karena','udah':'sudah','udh':'sudah','gue':'saya', | |
| 'gw':'saya','lo':'kamu','lu':'kamu','tp':'tapi','jg':'juga', | |
| 'bs':'bisa','lg':'lagi','bgt':'banget','emg':'memang','kyk':'kayak', | |
| 'dr':'dari','msh':'masih','blm':'belum','jd':'jadi','sy':'saya', | |
| 'skrg':'sekarang','trs':'terus','ok':'oke','oke':'oke', | |
| 'wkwk':'haha','hehe':'haha','lol':'tertawa', | |
| } | |
| def _clean_basic(text: str) -> str: | |
| """Basic preprocessing β always available.""" | |
| if not text or not isinstance(text, str): | |
| return "" | |
| t = text.lower().strip() | |
| t = re.sub(r'https?://\S+|www\.\S+', '', t) # hapus URL | |
| t = re.sub(r'@\w+', '', t) # hapus mention | |
| t = re.sub(r'#(\w+)', r' \1 ', t) # hashtag β kata | |
| t = re.sub(r'(.)\1{2,}', r'\1\1', t) # reduplikasi | |
| t = re.sub(r'[^a-z0-9\s]', ' ', t) # hapus non-alfanumerik | |
| tokens = [_SLANG_BASIC.get(w, w) for w in t.split()] | |
| tokens = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS_BASIC] | |
| return ' '.join(tokens) | |
| def _valid_basic(text: str, min_words: int = 3) -> bool: | |
| """Cek validitas teks β always available.""" | |
| if not text or not isinstance(text, str): | |
| return False | |
| return len(text.split()) >= min_words | |
# Route to deep preprocessing when available, falling back to the built-in.
def clean_text(text: str) -> str:
    """Clean *text* with the deep pipeline when loaded; otherwise basic."""
    if not _DEEP:
        return _clean_basic(text)
    try:
        return _clean(text)
    except Exception:
        # Deep cleaner blew up on this input — fall back rather than crash.
        return _clean_basic(text)
def is_valid(text: str) -> bool:
    """Validate *text* with the deep pipeline when loaded; otherwise basic."""
    if not _DEEP:
        return _valid_basic(text)
    try:
        return _valid(text)
    except Exception:
        # Deep validator failed on this input — fall back rather than crash.
        return _valid_basic(text)
# ────────────────────────────────────────────────
# MAIN COLLECTOR
# ────────────────────────────────────────────────
def collect_data(keyword: str, source: str = "all") -> list:
    """Collect and clean texts about *keyword* from the selected platforms.

    Args:
        keyword: search term forwarded to every platform client.
        source: "all" (default) or a CSV combination of platform names:
            "youtube", "reddit", "instagram", "tiktok", "news" —
            e.g. "youtube,tiktok".

    Returns:
        List of (source_label, cleaned_text) tuples. Raw texts that fail
        is_valid(), or whose cleaned form is empty, are dropped.
    """
    all_data = []
    # FIX: parse the CSV selection into an exact-token set. The previous
    # `platform in src` substring test could match unintended sources.
    requested = {part.strip() for part in source.lower().split(",")}

    def wants(platform: str) -> bool:
        return "all" in requested or platform in requested

    def pull(label: str, display: str, unit: str, fetch) -> None:
        """Drain one source into all_data; one broken source must not
        abort the whole collection run."""
        before = len(all_data)
        try:
            for text in fetch():
                all_data.append((label, text))
            print(f"β {display}: {len(all_data) - before} {unit}")
        except Exception as e:
            print(f"β οΈ {display} error: {e}")

    # 1. YOUTUBE — comments from every video matching the keyword.
    if wants("youtube"):
        pull("youtube", "YouTube", "komentar",
             lambda: (c for vid in search_videos(keyword) for c in get_comments(vid)))
    # 2. REDDIT
    if wants("reddit"):
        pull("reddit", "Reddit", "komentar", lambda: get_reddit_comments(keyword))
    # 3. INSTAGRAM (only when the optional module imported successfully)
    if wants("instagram") and INSTAGRAM_OK:
        pull("instagram", "Instagram", "teks", lambda: get_instagram_data(keyword))
    # 4. TIKTOK
    if wants("tiktok") and TIKTOK_OK:
        pull("tiktok", "TikTok", "teks", lambda: get_tiktok_data(keyword))
    # 5. GOOGLE NEWS
    if wants("news") and GNEWS_OK:
        pull("news", "Google News", "teks", lambda: get_google_news(keyword))

    # FALLBACK — keep downstream consumers from seeing an empty dataset.
    if not all_data:
        print("β οΈ Tidak ada data dari semua sumber")
        all_data = [("unknown", "data tidak ditemukan")]

    # CLEAN & FILTER.
    # FIX: previously only the RAW text was validated, so entries whose
    # cleaned form was empty (URL-only / stopword-only texts) leaked into
    # the result as (label, ""). Now empty cleaned texts are dropped too.
    cleaned = []
    for src_label, text in all_data:
        if not is_valid(text):
            continue
        cleaned_text = clean_text(text)
        if cleaned_text:
            cleaned.append((src_label, cleaned_text))

    print(f"β Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
    return cleaned