"""
services/aggregator.py

Collect text data from YouTube, Reddit, Instagram, TikTok, and Google News.

NOTE: A basic preprocessing pipeline is embedded directly in this file so the
module does not depend on ``services.preprocessing_id``, which may not exist
in the repository yet.
"""

import re

from services.youtube import search_videos, get_comments
from services.reddit import get_reddit_comments

# ── Optional: deep preprocessing, used when available ──
try:
    from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid
    _DEEP = True
    print("✅ aggregator: deep preprocessing loaded")
except Exception:
    _DEEP = False
    print("⚠️ aggregator: using built-in basic preprocessing")

# ── Optional sources ──
# Each optional source falls back to a stub returning an empty list, so the
# name always exists even when the backing module cannot be imported.
try:
    from services.instagram import get_instagram_data
    INSTAGRAM_OK = True
except Exception:
    INSTAGRAM_OK = False

    def get_instagram_data(kw):
        return []

try:
    from services.tiktok import get_tiktok_data
    TIKTOK_OK = True
except Exception:
    TIKTOK_OK = False

    def get_tiktok_data(kw):
        return []

try:
    from services.google_news import get_google_news
    GNEWS_OK = True
except Exception:
    GNEWS_OK = False

    def get_google_news(kw):
        return []


# ════════════════════════════════════════════════
#  BUILT-IN PREPROCESSING (self-contained fallback)
# ════════════════════════════════════════════════

_STOPWORDS_BASIC = {
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'dengan', 'untuk', 'adalah',
    'ada', 'pada', 'juga', 'tidak', 'bisa', 'sudah', 'saya', 'kamu', 'kami',
    'mereka', 'kita', 'ya', 'jadi', 'kalau', 'tapi', 'atau', 'karena',
    'the', 'is', 'in', 'of', 'a', 'an', 'and', 'it', 'for', 'that', 'this',
}

_SLANG_BASIC = {
    'gak': 'tidak', 'ga': 'tidak', 'nggak': 'tidak', 'yg': 'yang', 'dgn': 'dengan',
    'utk': 'untuk', 'krn': 'karena', 'udah': 'sudah', 'udh': 'sudah', 'gue': 'saya',
    'gw': 'saya', 'lo': 'kamu', 'lu': 'kamu', 'tp': 'tapi', 'jg': 'juga',
    'bs': 'bisa', 'lg': 'lagi', 'bgt': 'banget', 'emg': 'memang', 'kyk': 'kayak',
    'dr': 'dari', 'msh': 'masih', 'blm': 'belum', 'jd': 'jadi', 'sy': 'saya',
    'skrg': 'sekarang', 'trs': 'terus', 'ok': 'oke', 'oke': 'oke',
    'wkwk': 'haha', 'hehe': 'haha', 'lol': 'tertawa',
}

# Compiled once at import time; _clean_basic runs once per collected text.
_RE_URL = re.compile(r'https?://\S+|www\.\S+')
_RE_MENTION = re.compile(r'@\w+')
_RE_HASHTAG = re.compile(r'#(\w+)')
_RE_REPEAT = re.compile(r'(.)\1{2,}')
_RE_NON_ALNUM = re.compile(r'[^a-z0-9\s]')


def _clean_basic(text: str) -> str:
    """Basic preprocessing that is always available.

    Lowercases the text, strips URLs and @mentions, turns ``#tag`` into the
    bare word, squeezes any character repeated three or more times down to
    two, replaces everything outside ``a-z0-9`` and whitespace with a space,
    maps slang tokens through ``_SLANG_BASIC``, and finally drops tokens that
    are two characters or shorter or that appear in ``_STOPWORDS_BASIC``.

    Returns:
        The surviving tokens joined by single spaces; ``""`` when ``text`` is
        empty, not a string, or nothing survives the filters.
    """
    if not text or not isinstance(text, str):
        return ""
    t = text.lower().strip()
    t = _RE_URL.sub('', t)             # remove URLs
    t = _RE_MENTION.sub('', t)         # remove @mentions
    t = _RE_HASHTAG.sub(r' \1 ', t)    # hashtag -> plain word
    t = _RE_REPEAT.sub(r'\1\1', t)     # squeeze runs of 3+ identical chars to 2
    t = _RE_NON_ALNUM.sub(' ', t)      # drop non-alphanumeric characters
    # Slang is normalised before the stopword filter, so slang that maps to a
    # stopword (e.g. 'gak' -> 'tidak') is removed as well.
    tokens = [_SLANG_BASIC.get(w, w) for w in t.split()]
    tokens = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS_BASIC]
    return ' '.join(tokens)


def _valid_basic(text: str, min_words: int = 3) -> bool:
    """Return True when ``text`` is a non-empty string with at least ``min_words`` words."""
    if not text or not isinstance(text, str):
        return False
    return len(text.split()) >= min_words


def clean_text(text: str) -> str:
    """Clean ``text`` with the deep preprocessor when loaded, otherwise the basic one."""
    if _DEEP:
        try:
            return _clean(text)
        except Exception:
            # Deliberate best-effort: any failure in the optional deep
            # preprocessor falls through to the built-in implementation.
            pass
    return _clean_basic(text)


def is_valid(text: str) -> bool:
    """Validate ``text`` with the deep validator when loaded, otherwise the basic one."""
    if _DEEP:
        try:
            return _valid(text)
        except Exception:
            # Same best-effort fallback as in clean_text.
            pass
    return _valid_basic(text)


# ════════════════════════════════════════════════
#  MAIN COLLECTOR
# ════════════════════════════════════════════════

def _iter_youtube_comments(keyword):
    """Yield every comment of every video that ``search_videos(keyword)`` returns."""
    for vid in search_videos(keyword):
        yield from get_comments(vid)


def _collect_from(sink, label, display, unit, fetch, keyword):
    """Append ``(label, text)`` to ``sink`` for each item of ``fetch(keyword)`` and log the count.

    Items are appended one at a time, so everything gathered before a failure
    is kept. An exception from the source is logged and swallowed so that one
    failing platform does not abort the others.
    """
    before = len(sink)
    try:
        for text in fetch(keyword):
            sink.append((label, text))
        print(f"✅ {display}: {len(sink)-before} {unit}")
    except Exception as e:
        print(f"⚠️ {display} error: {e}")


def collect_data(keyword: str, source: str = "all") -> list:
    """Collect, validate and clean texts about ``keyword`` from the selected platforms.

    Args:
        keyword: Search term passed to every selected platform.
        source: Platform selector; may be a comma-separated combination.
            ``None`` is treated as ``"all"``.

            "all"            -> all 5 platforms
            "youtube"        -> YouTube only
            "reddit"         -> Reddit only
            "instagram"      -> Instagram only
            "tiktok"         -> TikTok only
            "news"           -> Google News only
            "youtube,tiktok" -> YouTube + TikTok, and so on.

    Returns:
        A list of ``(source_label, cleaned_text)`` tuples. Raw texts that fail
        ``is_valid`` are dropped, and so are texts whose cleaned form is
        empty. The list may be empty.
    """
    all_data = []
    # Guard against an explicit None, which would otherwise fail on .lower().
    src = (source if source is not None else "all").lower()

    def wants(platform: str) -> bool:
        # Substring match: this is what makes CSV values such as
        # "youtube,tiktok" work without splitting.
        return src == "all" or platform in src

    # (selector, available, log name, log unit, fetcher). The table is built
    # on every call so the availability flags and fetchers are read from
    # module globals at call time.
    platforms = (
        ("youtube", True, "YouTube", "komentar", _iter_youtube_comments),
        ("reddit", True, "Reddit", "komentar", get_reddit_comments),
        ("instagram", INSTAGRAM_OK, "Instagram", "teks", get_instagram_data),
        ("tiktok", TIKTOK_OK, "TikTok", "teks", get_tiktok_data),
        ("news", GNEWS_OK, "Google News", "teks", get_google_news),
    )
    for label, available, display, unit, fetch in platforms:
        if wants(label) and available:
            _collect_from(all_data, label, display, unit, fetch, keyword)

    # FALLBACK: no source produced anything, so feed a placeholder through
    # the same clean/filter step.
    if not all_data:
        print("⚠️ Tidak ada data dari semua sumber")
        all_data = [("unknown", "data tidak ditemukan")]

    # CLEAN & FILTER
    cleaned = []
    for src_label, text in all_data:
        if not is_valid(text):
            continue
        cleaned_text = clean_text(text)
        # A raw text can pass validation and still clean down to nothing
        # (e.g. only stopwords or very short tokens); skip those.
        if cleaned_text:
            cleaned.append((src_label, cleaned_text))

    print(f"✅ Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
    return cleaned