# NOTE: a "Spaces: Sleeping" page banner (web-capture residue, not source code)
# was removed from the top of this file.
"""
services/aggregator.py
Collect data from YouTube, Reddit, Instagram, TikTok, and Google News.
NOTE: Preprocessing is embedded directly in this file so it does not depend
on services.preprocessing_id, which may not yet exist in the repo.
"""
import re
from services.youtube import search_videos, get_comments
from services.reddit import get_reddit_comments

# ── Optional: prefer the deep preprocessing module when it is importable ──
# clean_text() / is_valid() below route to these when _DEEP is True.
try:
    from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid
    _DEEP = True
    print("β aggregator: deep preprocessing loaded")
except Exception:
    _DEEP = False
    print("β οΈ aggregator: using built-in basic preprocessing")

# ── Optional sources: degrade gracefully when a platform module is missing.
# Each fallback stub returns an empty list so call sites stay valid, and the
# *_OK flag lets collect_data() skip the source entirely.
try:
    from services.instagram import get_instagram_data
    INSTAGRAM_OK = True
except Exception:
    INSTAGRAM_OK = False
    def get_instagram_data(kw): return []
try:
    from services.tiktok import get_tiktok_data
    TIKTOK_OK = True
except Exception:
    TIKTOK_OK = False
    def get_tiktok_data(kw): return []
try:
    from services.google_news import get_google_news
    GNEWS_OK = True
except Exception:
    GNEWS_OK = False
    def get_google_news(kw): return []

# ────────────────────────────────────────────────
# BUILT-IN PREPROCESSING (self-contained fallback)
# ────────────────────────────────────────────────
| _STOPWORDS_BASIC = { | |
| 'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah', | |
| 'ada','pada','juga','tidak','bisa','sudah','saya','kamu','kami', | |
| 'mereka','kita','ya','jadi','kalau','tapi','atau','karena', | |
| 'the','is','in','of','a','an','and','it','for','that','this', | |
| } | |
| _SLANG_BASIC = { | |
| 'gak':'tidak','ga':'tidak','nggak':'tidak','yg':'yang','dgn':'dengan', | |
| 'utk':'untuk','krn':'karena','udah':'sudah','udh':'sudah','gue':'saya', | |
| 'gw':'saya','lo':'kamu','lu':'kamu','tp':'tapi','jg':'juga', | |
| 'bs':'bisa','lg':'lagi','bgt':'banget','emg':'memang','kyk':'kayak', | |
| 'dr':'dari','msh':'masih','blm':'belum','jd':'jadi','sy':'saya', | |
| 'skrg':'sekarang','trs':'terus','ok':'oke','oke':'oke', | |
| 'wkwk':'haha','hehe':'haha','lol':'tertawa', | |
| } | |
| def _clean_basic(text: str) -> str: | |
| """Basic preprocessing β always available.""" | |
| if not text or not isinstance(text, str): | |
| return "" | |
| t = text.lower().strip() | |
| t = re.sub(r'https?://\S+|www\.\S+', '', t) # hapus URL | |
| t = re.sub(r'@\w+', '', t) # hapus mention | |
| t = re.sub(r'#(\w+)', r' \1 ', t) # hashtag β kata | |
| t = re.sub(r'(.)\1{2,}', r'\1\1', t) # reduplikasi | |
| t = re.sub(r'[^a-z0-9\s]', ' ', t) # hapus non-alfanumerik | |
| tokens = [_SLANG_BASIC.get(w, w) for w in t.split()] | |
| tokens = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS_BASIC] | |
| return ' '.join(tokens) | |
| def _valid_basic(text: str, min_words: int = 3) -> bool: | |
| """Cek validitas teks β always available.""" | |
| if not text or not isinstance(text, str): | |
| return False | |
| return len(text.split()) >= min_words | |
# Route to deep preprocessing when available, falling back to the built-in.
def clean_text(text: str) -> str:
    """Clean *text* with the deep pipeline when loaded; otherwise basic."""
    if not _DEEP:
        return _clean_basic(text)
    try:
        return _clean(text)
    except Exception:
        # Deep cleaner blew up on this input — fall back rather than crash.
        return _clean_basic(text)
def is_valid(text: str) -> bool:
    """Validate *text* with the deep pipeline when loaded; otherwise basic."""
    if not _DEEP:
        return _valid_basic(text)
    try:
        return _valid(text)
    except Exception:
        # Deep validator failed on this input — fall back rather than crash.
        return _valid_basic(text)
# ────────────────────────────────────────────────
# MAIN COLLECTOR
# ────────────────────────────────────────────────
def collect_data(keyword: str, source: str = "all") -> list:
    """Collect and clean texts about *keyword* from the selected platforms.

    Args:
        keyword: search term forwarded to every platform client.
        source: "all" (default) or a CSV combination of platform names:
            "youtube", "reddit", "instagram", "tiktok", "news" —
            e.g. "youtube,tiktok".

    Returns:
        List of (source_label, cleaned_text) tuples. Raw texts that fail
        is_valid(), or whose cleaned form is empty, are dropped.
    """
    all_data = []
    # FIX: parse the CSV selection into an exact-token set. The previous
    # `platform in src` substring test could match unintended sources.
    requested = {part.strip() for part in source.lower().split(",")}

    def wants(platform: str) -> bool:
        return "all" in requested or platform in requested

    def pull(label: str, display: str, unit: str, fetch) -> None:
        """Drain one source into all_data; one broken source must not
        abort the whole collection run."""
        before = len(all_data)
        try:
            for text in fetch():
                all_data.append((label, text))
            print(f"β {display}: {len(all_data) - before} {unit}")
        except Exception as e:
            print(f"β οΈ {display} error: {e}")

    # 1. YOUTUBE — comments from every video matching the keyword.
    if wants("youtube"):
        pull("youtube", "YouTube", "komentar",
             lambda: (c for vid in search_videos(keyword) for c in get_comments(vid)))
    # 2. REDDIT
    if wants("reddit"):
        pull("reddit", "Reddit", "komentar", lambda: get_reddit_comments(keyword))
    # 3. INSTAGRAM (only when the optional module imported successfully)
    if wants("instagram") and INSTAGRAM_OK:
        pull("instagram", "Instagram", "teks", lambda: get_instagram_data(keyword))
    # 4. TIKTOK
    if wants("tiktok") and TIKTOK_OK:
        pull("tiktok", "TikTok", "teks", lambda: get_tiktok_data(keyword))
    # 5. GOOGLE NEWS
    if wants("news") and GNEWS_OK:
        pull("news", "Google News", "teks", lambda: get_google_news(keyword))

    # FALLBACK — keep downstream consumers from seeing an empty dataset.
    if not all_data:
        print("β οΈ Tidak ada data dari semua sumber")
        all_data = [("unknown", "data tidak ditemukan")]

    # CLEAN & FILTER.
    # FIX: previously only the RAW text was validated, so entries whose
    # cleaned form was empty (URL-only / stopword-only texts) leaked into
    # the result as (label, ""). Now empty cleaned texts are dropped too.
    cleaned = []
    for src_label, text in all_data:
        if not is_valid(text):
            continue
        cleaned_text = clean_text(text)
        if cleaned_text:
            cleaned.append((src_label, cleaned_text))

    print(f"β Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
    return cleaned