noranisa committed on
Commit
78c8d61
Β·
verified Β·
1 Parent(s): 64b0b38

Update services/aggregator.py

Browse files
Files changed (1) hide show
  1. services/aggregator.py +77 -33
services/aggregator.py CHANGED
@@ -1,75 +1,119 @@
1
  """
2
  services/aggregator.py
3
- Kumpulkan data dari YouTube, Reddit, dan Instagram (Apify).
 
 
 
 
 
4
  """
5
 
6
  from services.youtube import search_videos, get_comments
7
  from services.reddit import get_reddit_comments
8
  from services.preprocessing import clean_text, is_valid
9
 
10
- # Instagram via Apify β€” opsional, skip jika API key tidak ada
11
  try:
12
  from services.instagram import get_instagram_data
13
- INSTAGRAM_AVAILABLE = True
14
  except ImportError:
15
- INSTAGRAM_AVAILABLE = False
16
  def get_instagram_data(kw): return []
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def collect_data(keyword: str, source: str = "all") -> list[tuple[str, str]]:
20
  """
21
  Return: list of (source_label, cleaned_text)
22
 
23
- source options:
24
- "all" β†’ YouTube + Reddit + Instagram
25
- "youtube" β†’ YouTube saja
26
- "reddit" β†’ Reddit saja
27
- "instagram" β†’ Instagram saja
28
- kombinasi β†’ "youtube,instagram" dst.
 
 
29
  """
30
- all_data = []
31
  src = source.lower()
32
 
33
- # ── YOUTUBE ──
34
- if src in ("all", "youtube") or "youtube" in src:
 
 
 
 
35
  try:
36
- vids = search_videos(keyword)
37
- for vid in vids:
38
  for c in get_comments(vid):
39
  all_data.append(("youtube", c))
 
40
  except Exception as e:
41
- print(f"⚠️ YouTube collect error: {e}")
42
 
43
- # ── REDDIT ──
44
- if src in ("all", "reddit") or "reddit" in src:
 
45
  try:
46
  for c in get_reddit_comments(keyword):
47
  all_data.append(("reddit", c))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  except Exception as e:
49
- print(f"⚠️ Reddit collect error: {e}")
50
 
51
- # ── INSTAGRAM ──
52
- if src in ("all", "instagram") or "instagram" in src:
53
- if INSTAGRAM_AVAILABLE:
54
- try:
55
- for text in get_instagram_data(keyword):
56
- all_data.append(("instagram", text))
57
- except Exception as e:
58
- print(f"⚠️ Instagram collect error: {e}")
59
- else:
60
- print("⚠️ Instagram scraper tidak tersedia")
61
 
62
- # ── FALLBACK jika semua kosong ──
63
  if not all_data:
64
  print("⚠️ Tidak ada data dari semua sumber")
65
  all_data = [("unknown", "data tidak ditemukan")]
66
 
67
- # ── CLEAN & FILTER ──
68
  cleaned = [
69
  (src_label, clean_text(text))
70
  for src_label, text in all_data
71
  if is_valid(text)
72
  ]
73
-
74
- print(f"βœ… Total data terkumpul: {len(cleaned)} dari {len(all_data)} raw")
75
  return cleaned
 
1
  """
2
  services/aggregator.py
3
+ Kumpulkan data dari:
4
+ 1. YouTube (Google API)
5
+ 2. Reddit (PRAW)
6
+ 3. Instagram (Apify)
7
+ 4. TikTok (Apify)
8
+ 5. Google News (SerpApi)
9
  """
10
 
11
  from services.youtube import search_videos, get_comments
12
  from services.reddit import get_reddit_comments
13
  from services.preprocessing import clean_text, is_valid
14
 
 
15
  try:
16
  from services.instagram import get_instagram_data
17
+ INSTAGRAM_OK = True
18
  except ImportError:
19
+ INSTAGRAM_OK = False
20
  def get_instagram_data(kw): return []
21
 
22
+ try:
23
+ from services.tiktok import get_tiktok_data
24
+ TIKTOK_OK = True
25
+ except ImportError:
26
+ TIKTOK_OK = False
27
+ def get_tiktok_data(kw): return []
28
+
29
+ try:
30
+ from services.google_news import get_google_news
31
+ GNEWS_OK = True
32
+ except ImportError:
33
+ GNEWS_OK = False
34
+ def get_google_news(kw): return []
35
+
36
 
37
  def collect_data(keyword: str, source: str = "all") -> list[tuple[str, str]]:
38
  """
39
  Return: list of (source_label, cleaned_text)
40
 
41
+ source values:
42
+ "all" β†’ semua 5 platform
43
+ "youtube" β†’ YouTube saja
44
+ "reddit" β†’ Reddit saja
45
+ "instagram" β†’ Instagram saja
46
+ "tiktok" β†’ TikTok saja
47
+ "news" β†’ Google News saja
48
+ kombinasi CSV β†’ "youtube,tiktok" / "tiktok,news" / dst.
49
  """
50
+ all_data: list[tuple[str, str]] = []
51
  src = source.lower()
52
 
53
+ def wants(platform: str) -> bool:
54
+ return src == "all" or platform in src
55
+
56
+ # 1. YOUTUBE
57
+ if wants("youtube"):
58
+ before = len(all_data)
59
  try:
60
+ for vid in search_videos(keyword):
 
61
  for c in get_comments(vid):
62
  all_data.append(("youtube", c))
63
+ print(f"βœ… YouTube: {len(all_data)-before} komentar")
64
  except Exception as e:
65
+ print(f"⚠️ YouTube error: {e}")
66
 
67
+ # 2. REDDIT
68
+ if wants("reddit"):
69
+ before = len(all_data)
70
  try:
71
  for c in get_reddit_comments(keyword):
72
  all_data.append(("reddit", c))
73
+ print(f"βœ… Reddit: {len(all_data)-before} komentar")
74
+ except Exception as e:
75
+ print(f"⚠️ Reddit error: {e}")
76
+
77
+ # 3. INSTAGRAM
78
+ if wants("instagram") and INSTAGRAM_OK:
79
+ before = len(all_data)
80
+ try:
81
+ for text in get_instagram_data(keyword):
82
+ all_data.append(("instagram", text))
83
+ print(f"βœ… Instagram: {len(all_data)-before} teks")
84
+ except Exception as e:
85
+ print(f"⚠️ Instagram error: {e}")
86
+
87
+ # 4. TIKTOK
88
+ if wants("tiktok") and TIKTOK_OK:
89
+ before = len(all_data)
90
+ try:
91
+ for text in get_tiktok_data(keyword):
92
+ all_data.append(("tiktok", text))
93
+ print(f"βœ… TikTok: {len(all_data)-before} teks")
94
  except Exception as e:
95
+ print(f"⚠️ TikTok error: {e}")
96
 
97
+ # 5. GOOGLE NEWS
98
+ if wants("news") and GNEWS_OK:
99
+ before = len(all_data)
100
+ try:
101
+ for text in get_google_news(keyword):
102
+ all_data.append(("news", text))
103
+ print(f"βœ… Google News: {len(all_data)-before} teks")
104
+ except Exception as e:
105
+ print(f"⚠️ Google News error: {e}")
 
106
 
107
+ # FALLBACK
108
  if not all_data:
109
  print("⚠️ Tidak ada data dari semua sumber")
110
  all_data = [("unknown", "data tidak ditemukan")]
111
 
112
+ # CLEAN & FILTER
113
  cleaned = [
114
  (src_label, clean_text(text))
115
  for src_label, text in all_data
116
  if is_valid(text)
117
  ]
118
+ print(f"βœ… Total: {len(cleaned)} teks dari {len(all_data)} raw")
 
119
  return cleaned