File size: 6,726 Bytes
9d6d949
 
f7b57d8
 
 
 
9d6d949
 
f7b57d8
 
fa302f6
 
 
f7b57d8
 
 
 
 
 
 
 
 
 
9d6d949
 
78c8d61
f7b57d8
78c8d61
9d6d949
 
78c8d61
 
 
f7b57d8
78c8d61
 
 
 
 
 
f7b57d8
78c8d61
 
 
9d6d949
f7b57d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d6d949
 
 
f7b57d8
78c8d61
 
 
 
 
 
f7b57d8
 
9d6d949
f7b57d8
9d6d949
 
78c8d61
 
 
 
 
 
9d6d949
78c8d61
9d6d949
 
78c8d61
9d6d949
78c8d61
fa302f6
78c8d61
 
 
9d6d949
 
 
78c8d61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d6d949
78c8d61
fa302f6
78c8d61
 
 
 
 
 
 
 
 
fa302f6
78c8d61
1f3b000
9d6d949
1f3b000
466dd37
78c8d61
34c8a8e
9d6d949
 
1f3b000
34c8a8e
f7b57d8
fa302f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
services/aggregator.py
Kumpulkan data dari YouTube, Reddit, Instagram, TikTok, dan Google News.

CATATAN: Preprocessing di-embed langsung di file ini agar tidak bergantung
pada services.preprocessing_id yang mungkin belum ada di repo.
"""

import re

from services.youtube import search_videos, get_comments
from services.reddit import get_reddit_comments

# ── Optional: use deep preprocessing when the module is available ──
# Sets _DEEP so clean_text()/is_valid() below know which pipeline to use.
try:
    from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid
    _DEEP = True
    print("βœ… aggregator: deep preprocessing loaded")
except Exception:
    # Any import failure (missing module, broken deps) falls back to the
    # self-contained basic pipeline defined in this file.
    _DEEP = False
    print("⚠️  aggregator: using built-in basic preprocessing")

# ── Optional sources ──
# Each platform module may be absent from a given deploy. On import failure
# we set the *_OK flag False and bind a no-op stub returning [] so callers
# never hit a NameError. collect_data() also checks the flag before calling.
try:
    from services.instagram import get_instagram_data
    INSTAGRAM_OK = True
except Exception:
    INSTAGRAM_OK = False
    def get_instagram_data(kw): return []  # stub: no Instagram support

try:
    from services.tiktok import get_tiktok_data
    TIKTOK_OK = True
except Exception:
    TIKTOK_OK = False
    def get_tiktok_data(kw): return []  # stub: no TikTok support

try:
    from services.google_news import get_google_news
    GNEWS_OK = True
except Exception:
    GNEWS_OK = False
    def get_google_news(kw): return []  # stub: no Google News support


# ════════════════════════════════════════════════
# BUILT-IN PREPROCESSING (fallback self-contained)
# ════════════════════════════════════════════════
_STOPWORDS_BASIC = {
    'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah',
    'ada','pada','juga','tidak','bisa','sudah','saya','kamu','kami',
    'mereka','kita','ya','jadi','kalau','tapi','atau','karena',
    'the','is','in','of','a','an','and','it','for','that','this',
}

_SLANG_BASIC = {
    'gak':'tidak','ga':'tidak','nggak':'tidak','yg':'yang','dgn':'dengan',
    'utk':'untuk','krn':'karena','udah':'sudah','udh':'sudah','gue':'saya',
    'gw':'saya','lo':'kamu','lu':'kamu','tp':'tapi','jg':'juga',
    'bs':'bisa','lg':'lagi','bgt':'banget','emg':'memang','kyk':'kayak',
    'dr':'dari','msh':'masih','blm':'belum','jd':'jadi','sy':'saya',
    'skrg':'sekarang','trs':'terus','ok':'oke','oke':'oke',
    'wkwk':'haha','hehe':'haha','lol':'tertawa',
}

def _clean_basic(text: str) -> str:
    """Basic preprocessing β€” always available."""
    if not text or not isinstance(text, str):
        return ""
    t = text.lower().strip()
    t = re.sub(r'https?://\S+|www\.\S+', '', t)   # hapus URL
    t = re.sub(r'@\w+', '', t)                      # hapus mention
    t = re.sub(r'#(\w+)', r' \1 ', t)               # hashtag β†’ kata
    t = re.sub(r'(.)\1{2,}', r'\1\1', t)            # reduplikasi
    t = re.sub(r'[^a-z0-9\s]', ' ', t)              # hapus non-alfanumerik
    tokens = [_SLANG_BASIC.get(w, w) for w in t.split()]
    tokens = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS_BASIC]
    return ' '.join(tokens)

def _valid_basic(text: str, min_words: int = 3) -> bool:
    """Cek validitas teks β€” always available."""
    if not text or not isinstance(text, str):
        return False
    return len(text.split()) >= min_words


# Prefer the deep preprocessing pipeline when loaded; fall back to basic.
def clean_text(text: str) -> str:
    """Normalise *text* via the deep pipeline if available, else the basic one."""
    if not _DEEP:
        return _clean_basic(text)
    try:
        return _clean(text)
    except Exception:
        # Deep pipeline misbehaved at runtime — degrade to the basic cleaner.
        return _clean_basic(text)

def is_valid(text: str) -> bool:
    """Validate *text* via the deep pipeline if available, else the basic one."""
    if not _DEEP:
        return _valid_basic(text)
    try:
        return _valid(text)
    except Exception:
        # Deep validator misbehaved at runtime — degrade to the basic check.
        return _valid_basic(text)


# ════════════════════════════════════════════════
# MAIN COLLECTOR
# ════════════════════════════════════════════════
def collect_data(keyword: str, source: str = "all") -> list:
    """Collect raw texts from the selected platforms, then clean and filter.

    Args:
        keyword: search term passed to every platform fetcher.
        source:  platform selector; "all" or a CSV combination of
                 "youtube", "reddit", "instagram", "tiktok", "news"
                 (e.g. "youtube,tiktok"). Matching is case-insensitive.

    Returns:
        List of (source_label, cleaned_text) tuples. If every source yields
        nothing, a single ("unknown", ...) placeholder is processed instead
        so downstream code always receives a non-trivial input set.
    """
    all_data: list = []
    src = source.lower()

    def wants(platform: str) -> bool:
        # Substring test supports CSV combinations like "youtube,tiktok".
        return src == "all" or platform in src

    def _harvest(label: str, display: str, unit: str, fetch) -> None:
        """Append every text from fetch() as (label, text); one failing
        source must never abort the whole collection run."""
        before = len(all_data)
        try:
            for text in fetch():
                all_data.append((label, text))
            print(f"βœ… {display}: {len(all_data)-before} {unit}")
        except Exception as e:
            print(f"⚠️  {display} error: {e}")

    def _youtube_texts():
        # YouTube is two-level: search videos, then pull each video's comments.
        for vid in search_videos(keyword):
            yield from get_comments(vid)

    # 1. YOUTUBE
    if wants("youtube"):
        _harvest("youtube", "YouTube", "komentar", _youtube_texts)

    # 2. REDDIT
    if wants("reddit"):
        _harvest("reddit", "Reddit", "komentar",
                 lambda: get_reddit_comments(keyword))

    # 3. INSTAGRAM (only when the optional module imported successfully)
    if wants("instagram") and INSTAGRAM_OK:
        _harvest("instagram", "Instagram", "teks",
                 lambda: get_instagram_data(keyword))

    # 4. TIKTOK (optional module)
    if wants("tiktok") and TIKTOK_OK:
        _harvest("tiktok", "TikTok", "teks",
                 lambda: get_tiktok_data(keyword))

    # 5. GOOGLE NEWS (optional module)
    if wants("news") and GNEWS_OK:
        _harvest("news", "Google News", "teks",
                 lambda: get_google_news(keyword))

    # FALLBACK: guarantee at least one row for downstream consumers.
    if not all_data:
        print("⚠️  Tidak ada data dari semua sumber")
        all_data = [("unknown", "data tidak ditemukan")]

    # CLEAN & FILTER: validity is checked on the RAW text, then cleaned.
    cleaned = [
        (label, clean_text(text))
        for label, text in all_data
        if is_valid(text)
    ]
    print(f"βœ… Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
    return cleaned