noranisa committed on
Commit
f7b57d8
·
verified ·
1 Parent(s): 3890fa2

Update services/aggregator.py

Browse files
Files changed (1) hide show
  1. services/aggregator.py +87 -15
services/aggregator.py CHANGED
@@ -1,53 +1,125 @@
1
  """
2
  services/aggregator.py
3
- Kumpulkan data dari:
4
- 1. YouTube (Google API)
5
- 2. Reddit (PRAW)
6
- 3. Instagram (Apify)
7
- 4. TikTok (Apify)
8
- 5. Google News (SerpApi)
9
  """
10
 
 
 
11
  from services.youtube import search_videos, get_comments
12
  from services.reddit import get_reddit_comments
13
- from services.preprocessing_id import clean_text_deep as clean_text, is_valid
14
 
 
 
 
 
 
 
 
 
 
 
15
  try:
16
  from services.instagram import get_instagram_data
17
  INSTAGRAM_OK = True
18
- except ImportError:
19
  INSTAGRAM_OK = False
20
  def get_instagram_data(kw): return []
21
 
22
  try:
23
  from services.tiktok import get_tiktok_data
24
  TIKTOK_OK = True
25
- except ImportError:
26
  TIKTOK_OK = False
27
  def get_tiktok_data(kw): return []
28
 
29
  try:
30
  from services.google_news import get_google_news
31
  GNEWS_OK = True
32
- except ImportError:
33
  GNEWS_OK = False
34
  def get_google_news(kw): return []
35
 
36
 
37
- def collect_data(keyword: str, source: str = "all") -> list[tuple[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  """
39
  Return: list of (source_label, cleaned_text)
40
 
41
- source values:
42
  "all" β†’ semua 5 platform
43
  "youtube" β†’ YouTube saja
44
  "reddit" β†’ Reddit saja
45
  "instagram" β†’ Instagram saja
46
  "tiktok" β†’ TikTok saja
47
  "news" β†’ Google News saja
48
- kombinasi CSV β†’ "youtube,tiktok" / "tiktok,news" / dst.
 
49
  """
50
- all_data: list[tuple[str, str]] = []
51
  src = source.lower()
52
 
53
  def wants(platform: str) -> bool:
@@ -115,5 +187,5 @@ def collect_data(keyword: str, source: str = "all") -> list[tuple[str, str]]:
115
  for src_label, text in all_data
116
  if is_valid(text)
117
  ]
118
- print(f"βœ… Total: {len(cleaned)} teks dari {len(all_data)} raw")
119
  return cleaned
 
1
  """
2
  services/aggregator.py
3
+ Kumpulkan data dari YouTube, Reddit, Instagram, TikTok, dan Google News.
4
+
5
+ CATATAN: Preprocessing di-embed langsung di file ini agar tidak bergantung
6
+ pada services.preprocessing_id yang mungkin belum ada di repo.
 
 
7
  """
8
 
9
+ import re
10
+
11
  from services.youtube import search_videos, get_comments
12
  from services.reddit import get_reddit_comments
 
13
 
14
+ # ── Optional: deep preprocessing jika tersedia ──
15
+ try:
16
+ from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid
17
+ _DEEP = True
18
+ print("βœ… aggregator: deep preprocessing loaded")
19
+ except Exception:
20
+ _DEEP = False
21
+ print("⚠️ aggregator: using built-in basic preprocessing")
22
+
23
+ # ── Optional sources ──
24
  try:
25
  from services.instagram import get_instagram_data
26
  INSTAGRAM_OK = True
27
+ except Exception:
28
  INSTAGRAM_OK = False
29
  def get_instagram_data(kw): return []
30
 
31
  try:
32
  from services.tiktok import get_tiktok_data
33
  TIKTOK_OK = True
34
+ except Exception:
35
  TIKTOK_OK = False
36
  def get_tiktok_data(kw): return []
37
 
38
  try:
39
  from services.google_news import get_google_news
40
  GNEWS_OK = True
41
+ except Exception:
42
  GNEWS_OK = False
43
  def get_google_news(kw): return []
44
 
45
 
46
+ # ════════════════════════════════════════════════
47
+ # BUILT-IN PREPROCESSING (fallback self-contained)
48
+ # ════════════════════════════════════════════════
49
+ _STOPWORDS_BASIC = {
50
+ 'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah',
51
+ 'ada','pada','juga','tidak','bisa','sudah','saya','kamu','kami',
52
+ 'mereka','kita','ya','jadi','kalau','tapi','atau','karena',
53
+ 'the','is','in','of','a','an','and','it','for','that','this',
54
+ }
55
+
56
+ _SLANG_BASIC = {
57
+ 'gak':'tidak','ga':'tidak','nggak':'tidak','yg':'yang','dgn':'dengan',
58
+ 'utk':'untuk','krn':'karena','udah':'sudah','udh':'sudah','gue':'saya',
59
+ 'gw':'saya','lo':'kamu','lu':'kamu','tp':'tapi','jg':'juga',
60
+ 'bs':'bisa','lg':'lagi','bgt':'banget','emg':'memang','kyk':'kayak',
61
+ 'dr':'dari','msh':'masih','blm':'belum','jd':'jadi','sy':'saya',
62
+ 'skrg':'sekarang','trs':'terus','ok':'oke','oke':'oke',
63
+ 'wkwk':'haha','hehe':'haha','lol':'tertawa',
64
+ }
65
+
66
+ def _clean_basic(text: str) -> str:
67
+ """Basic preprocessing β€” always available."""
68
+ if not text or not isinstance(text, str):
69
+ return ""
70
+ t = text.lower().strip()
71
+ t = re.sub(r'https?://\S+|www\.\S+', '', t) # hapus URL
72
+ t = re.sub(r'@\w+', '', t) # hapus mention
73
+ t = re.sub(r'#(\w+)', r' \1 ', t) # hashtag β†’ kata
74
+ t = re.sub(r'(.)\1{2,}', r'\1\1', t) # reduplikasi
75
+ t = re.sub(r'[^a-z0-9\s]', ' ', t) # hapus non-alfanumerik
76
+ tokens = [_SLANG_BASIC.get(w, w) for w in t.split()]
77
+ tokens = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS_BASIC]
78
+ return ' '.join(tokens)
79
+
80
+ def _valid_basic(text: str, min_words: int = 3) -> bool:
81
+ """Cek validitas teks β€” always available."""
82
+ if not text or not isinstance(text, str):
83
+ return False
84
+ return len(text.split()) >= min_words
85
+
86
+
87
# Prefer deep preprocessing when it imported successfully; otherwise (or on
# any failure inside it) fall back to the built-in basic cleaner.
def clean_text(text: str) -> str:
    """Clean *text* via deep preprocessing if loaded, else the basic fallback."""
    if not _DEEP:
        return _clean_basic(text)
    try:
        return _clean(text)
    except Exception:
        return _clean_basic(text)
95
+
96
def is_valid(text: str) -> bool:
    """Validity check: deep validator if loaded, else the word-count fallback."""
    if not _DEEP:
        return _valid_basic(text)
    try:
        return _valid(text)
    except Exception:
        return _valid_basic(text)
103
+
104
+
105
+ # ════════════════════════════════════════════════
106
+ # MAIN COLLECTOR
107
+ # ════════════════════════════════════════════════
108
+ def collect_data(keyword: str, source: str = "all") -> list:
109
  """
110
  Return: list of (source_label, cleaned_text)
111
 
112
+ source options (bisa kombinasi CSV):
113
  "all" β†’ semua 5 platform
114
  "youtube" β†’ YouTube saja
115
  "reddit" β†’ Reddit saja
116
  "instagram" β†’ Instagram saja
117
  "tiktok" β†’ TikTok saja
118
  "news" β†’ Google News saja
119
+ "youtube,tiktok" β†’ YouTube + TikTok
120
+ dst.
121
  """
122
+ all_data = []
123
  src = source.lower()
124
 
125
  def wants(platform: str) -> bool:
 
187
  for src_label, text in all_data
188
  if is_valid(text)
189
  ]
190
+ print(f"βœ… Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
191
  return cleaned