"""
services/aggregator.py
Kumpulkan data dari YouTube, Reddit, Instagram, TikTok, dan Google News.
CATATAN: Preprocessing di-embed langsung di file ini agar tidak bergantung
pada services.preprocessing_id yang mungkin belum ada di repo.
"""
import re
from services.youtube import search_videos, get_comments
from services.reddit import get_reddit_comments
# ── Optional: deep preprocessing when services.preprocessing_id is present ──
try:
    from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid

    _DEEP = True
    print("✅ aggregator: deep preprocessing loaded")
except Exception:
    # Any import failure (missing module, broken transitive deps) falls back
    # to the self-contained basic preprocessing defined below.
    _DEEP = False
    print("⚠️ aggregator: using built-in basic preprocessing")
# ── Optional sources: degrade gracefully when a platform module is absent ──
try:
    from services.instagram import get_instagram_data
    INSTAGRAM_OK = True
except Exception:
    INSTAGRAM_OK = False

    def get_instagram_data(kw):
        """Stub used when the Instagram service is unavailable: no data."""
        return []
try:
    from services.tiktok import get_tiktok_data
    TIKTOK_OK = True
except Exception:
    TIKTOK_OK = False

    def get_tiktok_data(kw):
        """Stub used when the TikTok service is unavailable: no data."""
        return []
try:
    from services.google_news import get_google_news
    GNEWS_OK = True
except Exception:
    GNEWS_OK = False

    def get_google_news(kw):
        """Stub used when the Google News service is unavailable: no data."""
        return []
# ────────────────────────────────────────────────
# BUILT-IN PREPROCESSING (self-contained fallback)
# ────────────────────────────────────────────────
# Minimal Indonesian + English stopword list; _clean_basic drops these
# low-information tokens after slang normalisation.
_STOPWORDS_BASIC = {
    'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah',
    'ada','pada','juga','tidak','bisa','sudah','saya','kamu','kami',
    'mereka','kita','ya','jadi','kalau','tapi','atau','karena',
    'the','is','in','of','a','an','and','it','for','that','this',
}
# Indonesian chat slang / abbreviation → standard-form map, applied
# token-by-token by _clean_basic BEFORE stopword filtering (so expanded
# forms like 'tidak' can still be removed as stopwords).
# NOTE(review): 'oke' maps to itself — harmless identity entry.
_SLANG_BASIC = {
    'gak':'tidak','ga':'tidak','nggak':'tidak','yg':'yang','dgn':'dengan',
    'utk':'untuk','krn':'karena','udah':'sudah','udh':'sudah','gue':'saya',
    'gw':'saya','lo':'kamu','lu':'kamu','tp':'tapi','jg':'juga',
    'bs':'bisa','lg':'lagi','bgt':'banget','emg':'memang','kyk':'kayak',
    'dr':'dari','msh':'masih','blm':'belum','jd':'jadi','sy':'saya',
    'skrg':'sekarang','trs':'terus','ok':'oke','oke':'oke',
    'wkwk':'haha','hehe':'haha','lol':'tertawa',
}
def _clean_basic(text: str) -> str:
    """Normalise raw social-media text (always-available fallback).

    Strips URLs and @mentions, keeps hashtag words, collapses character
    runs, expands slang, then drops stopwords and tokens of <= 2 chars.
    Returns "" for non-string or empty input.
    """
    if not isinstance(text, str) or not text:
        return ""
    lowered = text.lower().strip()
    lowered = re.sub(r'https?://\S+|www\.\S+', '', lowered)  # remove URLs
    lowered = re.sub(r'@\w+', '', lowered)                   # remove mentions
    lowered = re.sub(r'#(\w+)', r' \1 ', lowered)            # hashtag -> word
    lowered = re.sub(r'(.)\1{2,}', r'\1\1', lowered)         # squash char runs
    lowered = re.sub(r'[^a-z0-9\s]', ' ', lowered)           # drop non-alphanumerics
    kept = []
    for word in lowered.split():
        word = _SLANG_BASIC.get(word, word)  # expand slang first
        if len(word) > 2 and word not in _STOPWORDS_BASIC:
            kept.append(word)
    return ' '.join(kept)
def _valid_basic(text: str, min_words: int = 3) -> bool:
    """Return True when *text* is a non-empty string with >= *min_words* words."""
    if isinstance(text, str) and text:
        return len(text.split()) >= min_words
    return False
# Use deep preprocessing when available; fall back to the basic built-ins.
def clean_text(text: str) -> str:
    """Clean *text*, preferring the deep pipeline with a basic fallback."""
    if not _DEEP:
        return _clean_basic(text)
    try:
        return _clean(text)
    except Exception:
        # Deep cleaner failed on this input — degrade instead of crashing.
        return _clean_basic(text)
def is_valid(text: str) -> bool:
    """Validity check for *text*, preferring the deep validator."""
    if not _DEEP:
        return _valid_basic(text)
    try:
        return _valid(text)
    except Exception:
        # Deep validator failed — fall back to the word-count heuristic.
        return _valid_basic(text)
# ────────────────────────────────────────────────
# MAIN COLLECTOR
# ────────────────────────────────────────────────
def collect_data(keyword: str, source: str = "all") -> list:
    """
    Collect raw texts about *keyword* from the requested platforms,
    then clean and filter them.

    Args:
        keyword: search term passed to every platform fetcher.
        source:  comma-separated, case-insensitive platform selector:
                 "all" (default, every platform), "youtube", "reddit",
                 "instagram", "tiktok", "news", or combinations such as
                 "youtube,tiktok".

    Returns:
        List of (source_label, cleaned_text) tuples. When no platform
        yields data, a single ("unknown", ...) placeholder is processed
        instead so callers always receive a list.
    """
    # Parse the CSV selector once instead of substring-matching the raw
    # string (substring matching could accidentally match partial names).
    requested = {part.strip() for part in source.lower().split(",") if part.strip()}

    def wants(platform: str) -> bool:
        return "all" in requested or platform in requested

    def _youtube_texts(kw):
        # YouTube is two-level: search videos, then pull each video's comments.
        for vid in search_videos(kw):
            yield from get_comments(vid)

    # (selector key, display label, module available?, fetcher, unit word for log)
    catalog = [
        ("youtube", "YouTube", True, _youtube_texts, "komentar"),
        ("reddit", "Reddit", True, get_reddit_comments, "komentar"),
        ("instagram", "Instagram", INSTAGRAM_OK, get_instagram_data, "teks"),
        ("tiktok", "TikTok", TIKTOK_OK, get_tiktok_data, "teks"),
        ("news", "Google News", GNEWS_OK, get_google_news, "teks"),
    ]

    all_data = []
    for key, label, available, fetch, unit in catalog:
        if not (wants(key) and available):
            continue
        before = len(all_data)
        try:
            for text in fetch(keyword):
                all_data.append((key, text))
            print(f"✅ {label}: {len(all_data) - before} {unit}")
        except Exception as e:
            # One failing platform must never abort the whole collection run.
            print(f"⚠️ {label} error: {e}")

    # FALLBACK: guarantee a non-empty list for downstream consumers.
    if not all_data:
        print("⚠️ Tidak ada data dari semua sumber")
        all_data = [("unknown", "data tidak ditemukan")]

    # CLEAN & FILTER: validity is judged on the RAW text; cleaning happens after.
    cleaned = [
        (label, clean_text(text))
        for label, text in all_data
        if is_valid(text)
    ]
    print(f"✅ Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
    return cleaned