noranisa committed on
Commit
f7b57d8
·
verified ·
1 Parent(s): 3890fa2

Update services/aggregator.py

Browse files
Files changed (1) hide show
  1. services/aggregator.py +87 -15
services/aggregator.py CHANGED
@@ -1,53 +1,125 @@
1
  """
2
  services/aggregator.py
3
- Kumpulkan data dari:
4
- 1. YouTube (Google API)
5
- 2. Reddit (PRAW)
6
- 3. Instagram (Apify)
7
- 4. TikTok (Apify)
8
- 5. Google News (SerpApi)
9
  """
10
 
 
 
11
  from services.youtube import search_videos, get_comments
12
  from services.reddit import get_reddit_comments
13
- from services.preprocessing_id import clean_text_deep as clean_text, is_valid
14
 
 
 
 
 
 
 
 
 
 
 
15
  try:
16
  from services.instagram import get_instagram_data
17
  INSTAGRAM_OK = True
18
- except ImportError:
19
  INSTAGRAM_OK = False
20
  def get_instagram_data(kw): return []
21
 
22
  try:
23
  from services.tiktok import get_tiktok_data
24
  TIKTOK_OK = True
25
- except ImportError:
26
  TIKTOK_OK = False
27
  def get_tiktok_data(kw): return []
28
 
29
  try:
30
  from services.google_news import get_google_news
31
  GNEWS_OK = True
32
- except ImportError:
33
  GNEWS_OK = False
34
  def get_google_news(kw): return []
35
 
36
 
37
- def collect_data(keyword: str, source: str = "all") -> list[tuple[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  """
39
  Return: list of (source_label, cleaned_text)
40
 
41
- source values:
42
  "all" β†’ semua 5 platform
43
  "youtube" β†’ YouTube saja
44
  "reddit" β†’ Reddit saja
45
  "instagram" β†’ Instagram saja
46
  "tiktok" β†’ TikTok saja
47
  "news" β†’ Google News saja
48
- kombinasi CSV β†’ "youtube,tiktok" / "tiktok,news" / dst.
 
49
  """
50
- all_data: list[tuple[str, str]] = []
51
  src = source.lower()
52
 
53
  def wants(platform: str) -> bool:
@@ -115,5 +187,5 @@ def collect_data(keyword: str, source: str = "all") -> list[tuple[str, str]]:
115
  for src_label, text in all_data
116
  if is_valid(text)
117
  ]
118
- print(f"βœ… Total: {len(cleaned)} teks dari {len(all_data)} raw")
119
  return cleaned
 
1
  """
2
  services/aggregator.py
3
+ Kumpulkan data dari YouTube, Reddit, Instagram, TikTok, dan Google News.
4
+
5
+ CATATAN: Preprocessing di-embed langsung di file ini agar tidak bergantung
6
+ pada services.preprocessing_id yang mungkin belum ada di repo.
 
 
7
  """
8
 
9
+ import re
10
+
11
  from services.youtube import search_videos, get_comments
12
  from services.reddit import get_reddit_comments
 
13
 
14
+ # ── Optional: deep preprocessing jika tersedia ──
15
+ try:
16
+ from services.preprocessing_id import clean_text_deep as _clean, is_valid as _valid
17
+ _DEEP = True
18
+ print("βœ… aggregator: deep preprocessing loaded")
19
+ except Exception:
20
+ _DEEP = False
21
+ print("⚠️ aggregator: using built-in basic preprocessing")
22
+
23
+ # ── Optional sources ──
24
  try:
25
  from services.instagram import get_instagram_data
26
  INSTAGRAM_OK = True
27
+ except Exception:
28
  INSTAGRAM_OK = False
29
  def get_instagram_data(kw): return []
30
 
31
  try:
32
  from services.tiktok import get_tiktok_data
33
  TIKTOK_OK = True
34
+ except Exception:
35
  TIKTOK_OK = False
36
  def get_tiktok_data(kw): return []
37
 
38
  try:
39
  from services.google_news import get_google_news
40
  GNEWS_OK = True
41
+ except Exception:
42
  GNEWS_OK = False
43
  def get_google_news(kw): return []
44
 
45
 
46
+ # ════════════════════════════════════════════════
47
+ # BUILT-IN PREPROCESSING (fallback self-contained)
48
+ # ════════════════════════════════════════════════
49
+ _STOPWORDS_BASIC = {
50
+ 'yang','dan','di','ke','dari','ini','itu','dengan','untuk','adalah',
51
+ 'ada','pada','juga','tidak','bisa','sudah','saya','kamu','kami',
52
+ 'mereka','kita','ya','jadi','kalau','tapi','atau','karena',
53
+ 'the','is','in','of','a','an','and','it','for','that','this',
54
+ }
55
+
56
+ _SLANG_BASIC = {
57
+ 'gak':'tidak','ga':'tidak','nggak':'tidak','yg':'yang','dgn':'dengan',
58
+ 'utk':'untuk','krn':'karena','udah':'sudah','udh':'sudah','gue':'saya',
59
+ 'gw':'saya','lo':'kamu','lu':'kamu','tp':'tapi','jg':'juga',
60
+ 'bs':'bisa','lg':'lagi','bgt':'banget','emg':'memang','kyk':'kayak',
61
+ 'dr':'dari','msh':'masih','blm':'belum','jd':'jadi','sy':'saya',
62
+ 'skrg':'sekarang','trs':'terus','ok':'oke','oke':'oke',
63
+ 'wkwk':'haha','hehe':'haha','lol':'tertawa',
64
+ }
65
+
66
+ def _clean_basic(text: str) -> str:
67
+ """Basic preprocessing β€” always available."""
68
+ if not text or not isinstance(text, str):
69
+ return ""
70
+ t = text.lower().strip()
71
+ t = re.sub(r'https?://\S+|www\.\S+', '', t) # hapus URL
72
+ t = re.sub(r'@\w+', '', t) # hapus mention
73
+ t = re.sub(r'#(\w+)', r' \1 ', t) # hashtag β†’ kata
74
+ t = re.sub(r'(.)\1{2,}', r'\1\1', t) # reduplikasi
75
+ t = re.sub(r'[^a-z0-9\s]', ' ', t) # hapus non-alfanumerik
76
+ tokens = [_SLANG_BASIC.get(w, w) for w in t.split()]
77
+ tokens = [w for w in tokens if len(w) > 2 and w not in _STOPWORDS_BASIC]
78
+ return ' '.join(tokens)
79
+
80
+ def _valid_basic(text: str, min_words: int = 3) -> bool:
81
+ """Cek validitas teks β€” always available."""
82
+ if not text or not isinstance(text, str):
83
+ return False
84
+ return len(text.split()) >= min_words
85
+
86
+
87
# Prefer deep preprocessing when it imported successfully; otherwise (or on
# any failure inside it) fall back to the built-in basic cleaner.
def clean_text(text: str) -> str:
    """Clean *text* via deep preprocessing if loaded, else the basic fallback."""
    if not _DEEP:
        return _clean_basic(text)
    try:
        return _clean(text)
    except Exception:
        return _clean_basic(text)
95
+
96
def is_valid(text: str) -> bool:
    """Validity check: deep validator if loaded, else the word-count fallback."""
    if not _DEEP:
        return _valid_basic(text)
    try:
        return _valid(text)
    except Exception:
        return _valid_basic(text)
103
+
104
+
105
+ # ════════════════════════════════════════════════
106
+ # MAIN COLLECTOR
107
+ # ════════════════════════════════════════════════
108
+ def collect_data(keyword: str, source: str = "all") -> list:
109
  """
110
  Return: list of (source_label, cleaned_text)
111
 
112
+ source options (bisa kombinasi CSV):
113
  "all" β†’ semua 5 platform
114
  "youtube" β†’ YouTube saja
115
  "reddit" β†’ Reddit saja
116
  "instagram" β†’ Instagram saja
117
  "tiktok" β†’ TikTok saja
118
  "news" β†’ Google News saja
119
+ "youtube,tiktok" β†’ YouTube + TikTok
120
+ dst.
121
  """
122
+ all_data = []
123
  src = source.lower()
124
 
125
  def wants(platform: str) -> bool:
 
187
  for src_label, text in all_data
188
  if is_valid(text)
189
  ]
190
+ print(f"βœ… Total: {len(cleaned)} teks bersih dari {len(all_data)} raw")
191
  return cleaned