NzTama committed on
Commit
fa8ff66
·
0 Parent(s):

Initial clean deploy: Sentiment Analysis

Browse files
.dockerignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python bytecache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+
6
+ # Virtual environments
7
+ .venv/
8
+ venv/
9
+ env/
10
+
11
+ # IDE
12
+ .idea/
13
+ .vscode/
14
+
15
+ # Cookies (may contain sensitive data, don't bake into image)
16
+ *.json
17
+ !requirements.txt
18
+
19
+ # Output files
20
+ static/output/*.png
21
+
22
+ # Notebook files
23
+ *.ipynb
24
+
25
+ # Git
26
+ .git/
27
+ .gitignore
28
+
29
+ # Model directory — mount as volume instead
30
+ indoBERT-sentiment/
31
+
32
+ # Misc
33
+ *.csv
34
+ Procfile
35
+ runtime.txt
.gitattributes ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LFS rules from the local branch (leftover merge-conflict marker removed)
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
3
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.pkl filter=lfs diff=lfs merge=lfs -text
6
+ # LFS rules from the incoming branch (leftover merge-conflict marker removed)
7
+ *.7z filter=lfs diff=lfs merge=lfs -text
8
+ *.arrow filter=lfs diff=lfs merge=lfs -text
9
+ *.bin filter=lfs diff=lfs merge=lfs -text
10
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
11
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
12
+ *.ftz filter=lfs diff=lfs merge=lfs -text
13
+ *.gz filter=lfs diff=lfs merge=lfs -text
14
+ *.h5 filter=lfs diff=lfs merge=lfs -text
15
+ *.joblib filter=lfs diff=lfs merge=lfs -text
16
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
17
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
18
+ *.model filter=lfs diff=lfs merge=lfs -text
19
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
20
+ *.npy filter=lfs diff=lfs merge=lfs -text
21
+ *.npz filter=lfs diff=lfs merge=lfs -text
22
+ *.onnx filter=lfs diff=lfs merge=lfs -text
23
+ *.ot filter=lfs diff=lfs merge=lfs -text
24
+ *.parquet filter=lfs diff=lfs merge=lfs -text
25
+ *.pb filter=lfs diff=lfs merge=lfs -text
26
+ *.pickle filter=lfs diff=lfs merge=lfs -text
27
+ *.pkl filter=lfs diff=lfs merge=lfs -text
28
+ *.pt filter=lfs diff=lfs merge=lfs -text
29
+ *.pth filter=lfs diff=lfs merge=lfs -text
30
+ *.rar filter=lfs diff=lfs merge=lfs -text
31
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
32
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
34
+ *.tar filter=lfs diff=lfs merge=lfs -text
35
+ *.tflite filter=lfs diff=lfs merge=lfs -text
36
+ *.tgz filter=lfs diff=lfs merge=lfs -text
37
+ *.wasm filter=lfs diff=lfs merge=lfs -text
38
+ *.xz filter=lfs diff=lfs merge=lfs -text
39
+ *.zip filter=lfs diff=lfs merge=lfs -text
40
+ *.zst filter=lfs diff=lfs merge=lfs -text
41
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
42
+ # Image files tracked with LFS (leftover merge-conflict marker removed)
43
+ static/output/*.png filter=lfs diff=lfs merge=lfs -text
44
+ *.png filter=lfs diff=lfs merge=lfs -text
45
+ *.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
Binary file (130 Bytes). View file
 
Dockerfile ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ─── Base Image ─────────────────────────────────────────────────────────────
2
+ FROM python:3.11-slim
3
+
4
+ # Environment Variables
5
+ ENV DEBIAN_FRONTEND=noninteractive
6
+ ENV PYTHONUNBUFFERED=1
7
+ ENV PYTHONDONTWRITEBYTECODE=1
8
+
9
+ # ─── System Dependencies ────────────────────────────────────────────────────
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ wget curl gnupg ca-certificates unzip \
12
+ # Chromium + driver (AUTO MATCH, STABLE)
13
+ chromium chromium-driver \
14
+ # Required libs
15
+ libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 \
16
+ libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 \
17
+ libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 \
18
+ libasound2 libxshmfence1 fonts-liberation libappindicator3-1 \
19
+ xdg-utils libvulkan1 libx11-xcb1 \
20
+ # Fonts
21
+ fonts-noto fonts-noto-cjk \
22
+ # Build tools
23
+ gcc g++ build-essential \
24
+ && rm -rf /var/lib/apt/lists/*
25
+
26
+ # ─── Set Chromium Path ──────────────────────────────────────────────────────
27
+ ENV CHROME_BIN=/usr/bin/chromium
28
+ ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver
29
+
30
+ # ─── Hugging Face Spaces Rules (Non-Root User) ──────────────────────────────
31
+ # Hugging Face Spaces requires running Docker as a non-root user (UID 1000)
32
+ RUN useradd -m -u 1000 user
33
+ ENV HOME=/home/user \
34
+ PATH=/home/user/.local/bin:$PATH
35
+
36
+ WORKDIR $HOME/app
37
+
38
+ # Pre-create output directory and ensure permissions
39
+ RUN mkdir -p $HOME/app/static/output && chown -R user:user $HOME
40
+
41
+ # Switch to the non-root user
42
+ USER user
43
+
44
+ # ─── App Setup ──────────────────────────────────────────────────────────────
45
+ COPY --chown=user:user requirements.txt .
46
+
47
+ # Install dependencies into user directory
48
+ # PyTorch CPU version specified explicitly
49
+ RUN pip install --no-cache-dir --user torch --index-url https://download.pytorch.org/whl/cpu && \
50
+ pip install --no-cache-dir --user -r requirements.txt
51
+
52
+ # Copy project files
53
+ COPY --chown=user:user . .
54
+
55
+ # ─── Expose Port ────────────────────────────────────────────────────────────
56
+ # Hugging Face exposes port 7860
57
+ EXPOSE 7860
58
+
59
+ # ─── Run App ────────────────────────────────────────────────────────────────
60
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn app:app --host 0.0.0.0 --port $PORT
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Sentiment
3
+ emoji: 🐨
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py – FastAPI application for Scraping + Sentiment Analysis + WordCloud.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import base64
7
+ import io
8
+ import csv
9
+ import json
10
+ import os
11
+ import traceback
12
+ from typing import Optional
13
+
14
+ import uvicorn
15
+ from fastapi import FastAPI, File, Form, Request, UploadFile
16
+ from fastapi.responses import HTMLResponse
17
+ from fastapi.staticfiles import StaticFiles
18
+ from fastapi.templating import Jinja2Templates
19
+
20
+ from services.medos import scrape_medos
21
+ from services.tiktok import scrape_tiktok
22
+ from services.news import scrape_news
23
+ from services.preprocessing import preprocess_text
24
+ from services.sentiment import analyze_sentiment
25
+ from services.wordcloud_service import generate_wordcloud
26
+ from services.facebook import scrape_facebook
27
+
28
+ # ── App setup ─────────────────────────────────────────────────────────────────
29
# FastAPI application object; the route decorators further down register on it.
app = FastAPI(title="Sentiment Analysis Dashboard")

# Serve everything under ./static at the /static URL prefix (the pipeline
# writes its CSV exports to static/output, so they become downloadable here).
app.mount("/static", StaticFiles(directory="static"), name="static")

# Jinja2 templates are loaded from ./templates.
templates = Jinja2Templates(directory="templates")
34
+
35
+
36
+ # ── Helpers ───────────────────────────────────────────────────────────────────
37
+
38
+ def _split_targets(raw: str | None) -> list[str]:
39
+ """Split a newline/comma-separated string into a clean list of non-empty strings."""
40
+ if not raw or not raw.strip():
41
+ return []
42
+ parts = []
43
+ for line in raw.replace(",", "\n").splitlines():
44
+ s = line.strip()
45
+ if s:
46
+ parts.append(s)
47
+ return parts
48
+
49
+
50
+ def _is_enabled(flag: str | None) -> bool:
51
+ """Return True only if the enable flag is explicitly '1'."""
52
+ return (flag or "").strip() == "1"
53
+
54
+
55
+ def _flatten_for_csv(raw_texts: list) -> list[dict]:
56
+ flat = []
57
+ for item in raw_texts:
58
+ if isinstance(item, str):
59
+ flat.append({"text": item})
60
+ elif isinstance(item, dict):
61
+ base = {k: v for k, v in item.items() if k != "comments"}
62
+ comments = item.get("comments", [])
63
+ if not comments:
64
+ flat.append(base)
65
+ else:
66
+ for c in comments:
67
+ row = dict(base)
68
+ if isinstance(c, str):
69
+ row["comment_text"] = c
70
+ elif isinstance(c, dict):
71
+ row["comment_author"] = c.get("author", "")
72
+ row["comment_text"] = c.get("comment", "")
73
+ flat.append(row)
74
+ for r in c.get("replies", []):
75
+ rep_row = dict(base)
76
+ rep_row["comment_author"] = r.get("author", "")
77
+ rep_row["comment_text"] = r.get("comment", "")
78
+ flat.append(rep_row)
79
+ continue
80
+ flat.append(row)
81
+ return flat
82
+
83
+ def _extract_texts(raw_texts: list) -> list[str]:
84
+ extracted = []
85
+ for item in raw_texts:
86
+ if isinstance(item, str):
87
+ extracted.append(item)
88
+ elif isinstance(item, dict):
89
+ if "caption_short" in item: extracted.append(item["caption_short"])
90
+ if "caption_detail" in item: extracted.append(item["caption_detail"])
91
+ if "caption" in item: extracted.append(item["caption"])
92
+ if "judul" in item: extracted.append(item["judul"])
93
+ if "isi_berita" in item: extracted.append(item["isi_berita"])
94
+ if "tag" in item: extracted.append(item["tag"])
95
+ for c in item.get("comments", []):
96
+ if isinstance(c, str):
97
+ extracted.append(c)
98
+ elif isinstance(c, dict):
99
+ extracted.append(c.get("comment", ""))
100
+ for r in c.get("replies", []):
101
+ extracted.append(r.get("comment", ""))
102
+ return extracted
103
+
104
+ def _run_pipeline(raw_texts: list) -> dict:
105
+ """Shared preprocessing β†’ sentiment β†’ wordcloud pipeline."""
106
+ if not raw_texts:
107
+ return {
108
+ "error": "Tidak ada teks yang berhasil dikumpulkan.",
109
+ "result": None,
110
+ "image": None,
111
+ "total_scraped": 0,
112
+ "csv_filename": None,
113
+ }
114
+
115
+ # Save CSV
116
+ import os
117
+ import csv
118
+ from datetime import datetime
119
+ os.makedirs("static/output", exist_ok=True)
120
+ csv_fname = f"scraped_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
121
+ csv_path = os.path.join("static", "output", csv_fname)
122
+
123
+ flat_data = _flatten_for_csv(raw_texts)
124
+ if flat_data:
125
+ keys = set()
126
+ for d in flat_data: keys.update(d.keys())
127
+ with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
128
+ writer = csv.DictWriter(f, fieldnames=list(keys))
129
+ writer.writeheader()
130
+ writer.writerows(flat_data)
131
+ csv_url = f"/static/output/{csv_fname}"
132
+ else:
133
+ csv_url = None
134
+
135
+ # Extract text for ML pipeline
136
+ text_list = _extract_texts(raw_texts)
137
+
138
+ total_scraped = len(text_list)
139
+ print(f"[APP] Total item yg di-ekstrak teksnya: {total_scraped}")
140
+
141
+ # Preprocess
142
+ print("[APP] Preprocessing…")
143
+ clean_texts = preprocess_text(text_list)
144
+ clean_texts = [t for t in clean_texts if t and t.strip()]
145
+
146
+ if not clean_texts:
147
+ return {
148
+ "error": "Semua teks kosong setelah preprocessing. Coba input yang berbeda.",
149
+ "result": None,
150
+ "image": None,
151
+ "total_scraped": total_scraped,
152
+ "csv_filename": csv_url,
153
+ }
154
+
155
+ # Sentiment
156
+ print(f"[APP] Analyzing sentiment on {len(clean_texts)} texts…")
157
+ try:
158
+ sentiment = analyze_sentiment(clean_texts)
159
+ except Exception as e:
160
+ print(f"[APP] Sentiment error: {e}\n{traceback.format_exc()}")
161
+ sentiment = None
162
+
163
+ # WordCloud β€” generate into memory as base64 (no file saved)
164
+ print("[APP] Generating wordcloud…")
165
+ image_b64 = None
166
+ try:
167
+ buf = io.BytesIO()
168
+ wc_ok = generate_wordcloud(clean_texts, buf)
169
+ if wc_ok:
170
+ buf.seek(0)
171
+ image_b64 = base64.b64encode(buf.read()).decode("utf-8")
172
+ except Exception as e:
173
+ print(f"[APP] WordCloud error: {e}")
174
+
175
+ return {
176
+ "error": None,
177
+ "result": sentiment,
178
+ "image": image_b64,
179
+ "total_scraped": total_scraped,
180
+ "csv_filename": csv_url,
181
+ }
182
+
183
+
184
+ # ── Routes ────────────────────────────────────────────────────────────────────
185
+
186
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Serve the dashboard page (``index.html``) with no extra template context."""
    landing_page = templates.TemplateResponse(request=request, name="index.html")
    return landing_page
189
+
190
+
191
@app.post("/process", response_class=HTMLResponse)
def process(
    request: Request,

    # Platform enable flags (set by JS, "1" = enabled)
    enable_instagram: str = Form(""),
    enable_tiktok: str = Form(""),
    enable_facebook: str = Form(""),
    enable_news: str = Form(""),

    # Instagram (separate credentials)
    ig_username: str = Form(None),
    ig_password: str = Form(None),
    target_accounts: str = Form(None),
    mode: str = Form("all"),

    # TikTok
    tiktok_cookie: str = Form(None),
    tiktok_targets: str = Form(None),

    # Facebook (separate credentials, explicit groups only)
    fb_username: str = Form(None),
    fb_password: str = Form(None),
    facebook_groups: str = Form(None),

    # News
    news_portals: str = Form(None),  # comma-separated portal keys
    news_keyword: str = Form("kabupaten cirebon"),
    news_pages: int = Form(1),
):
    """Scrape every enabled platform, run the analysis pipeline and render the page.

    The handler is a plain ``def`` on purpose: the scraper calls and the
    pipeline are blocking, and FastAPI runs non-async path operations in its
    threadpool. As ``async def`` the blocking calls ran on the event loop and
    froze every other request for the duration of a scrape.

    Each platform is processed only when its ``enable_*`` flag is ``"1"`` and
    its required inputs are present; a scraper exception is logged and that
    target is skipped so the remaining targets still run. The collected items
    go through ``_run_pipeline`` and the outcome is rendered into
    ``index.html`` with ``active_tab`` set to ``"scraping"``.
    """
    raw_texts: list = []

    # 1. Instagram
    if _is_enabled(enable_instagram):
        ig_targets = _split_targets(target_accounts)
        if not ig_username or not ig_password:
            print("[APP] Instagram diaktifkan tapi username/password kosong β€” skip.")
        elif not ig_targets:
            print("[APP] Instagram diaktifkan tapi tidak ada target β€” skip.")
        else:
            for tgt in ig_targets:
                print(f"[APP] Scraping Instagram: {tgt}")
                try:
                    texts = scrape_medos(ig_username, ig_password, tgt, mode)
                    raw_texts.extend(texts)
                    print(f"[APP] Instagram @{tgt} β†’ {len(texts)} teks")
                except Exception as e:
                    print(f"[APP] Instagram error ({tgt}): {e}")
    else:
        print("[APP] Instagram dinonaktifkan β€” skip.")

    # 2. TikTok
    if _is_enabled(enable_tiktok):
        tt_targets = _split_targets(tiktok_targets)
        if not tt_targets:
            print("[APP] TikTok diaktifkan tapi tidak ada target β€” skip.")
        else:
            for tgt in tt_targets:
                print(f"[APP] Scraping TikTok: {tgt}")
                try:
                    texts = scrape_tiktok(tiktok_cookie or "", tgt)
                    raw_texts.extend(texts)
                    print(f"[APP] TikTok @{tgt} β†’ {len(texts)} teks")
                except Exception as e:
                    print(f"[APP] TikTok error ({tgt}): {e}")
    else:
        print("[APP] TikTok dinonaktifkan β€” skip.")

    # 3. Facebook
    # No default groups are used: group URLs and credentials must be given explicitly.
    if _is_enabled(enable_facebook):
        fb_groups = _split_targets(facebook_groups)
        if not fb_username or not fb_password:
            print("[APP] Facebook diaktifkan tapi username/password kosong β€” skip.")
        elif not fb_groups:
            print("[APP] Facebook diaktifkan tapi tidak ada URL grup β€” skip (tidak ada default).")
        else:
            print(f"[APP] Scraping Facebook {len(fb_groups)} grup…")
            try:
                texts = scrape_facebook(fb_username, fb_password, fb_groups)
                raw_texts.extend(texts)
                print(f"[APP] Facebook β†’ {len(texts)} teks")
            except Exception as e:
                print(f"[APP] Facebook error: {e}")
    else:
        print("[APP] Facebook dinonaktifkan β€” skip.")

    # 4. News
    if _is_enabled(enable_news):
        portals = _split_targets(news_portals)
        if not portals:
            print("[APP] News diaktifkan tapi tidak ada portal dipilih β€” skip.")
        else:
            for portal in portals:
                print(f"[APP] Scraping news: portal={portal}, keyword={news_keyword}, pages={news_pages}")
                try:
                    texts = scrape_news(portal, news_pages, keyword=news_keyword)
                    raw_texts.extend(texts)
                    print(f"[APP] News ({portal}) β†’ {len(texts)} teks")
                except Exception as e:
                    print(f"[APP] News error ({portal}): {e}")
    else:
        print("[APP] News dinonaktifkan β€” skip.")

    # Pipeline
    outcome = _run_pipeline(raw_texts)

    return templates.TemplateResponse(
        request=request,
        name="index.html",
        context={
            "error": outcome["error"],
            "result": outcome["result"],
            "image": outcome["image"],
            "total_scraped": outcome["total_scraped"],
            "csv_filename": outcome["csv_filename"],
            "active_tab": "scraping",
        },
    )
310
+
311
+
312
def _decode_upload(content_bytes: bytes) -> str:
    """Decode an uploaded file to text.

    Strict UTF-8 is tried first; ``utf-8-sig`` also strips a leading BOM, so
    the first CSV header is not polluted with ``\\ufeff`` (the CSV exports
    written by this app carry such a BOM). Bytes that are not valid UTF-8
    fall back to Latin-1, which can decode any byte sequence.
    """
    try:
        return content_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return content_bytes.decode("latin-1")


def _nonblank_lines(text: str) -> list[str]:
    """Return the stripped, non-empty lines of ``text``."""
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]


def _texts_from_upload(fname: str, content_str: str, text_column: str) -> list:
    """Turn decoded file content into pipeline items based on the file extension.

    ``fname`` must already be lower-cased. CSV/TSV rows and JSON objects
    contribute the value of ``text_column`` when that column/key exists and
    is non-empty; otherwise the whole row/object is passed on as a dict. Any
    other extension is treated as plain text, one document per non-empty line.
    """
    if fname.endswith((".csv", ".tsv")):
        delimiter = "\t" if fname.endswith(".tsv") else ","
        reader = csv.DictReader(io.StringIO(content_str), delimiter=delimiter)
        cols = reader.fieldnames or []
        has_column = bool(text_column) and text_column in cols
        items: list = []
        for row in reader:
            if has_column and row.get(text_column):
                items.append(str(row[text_column]))
            else:
                items.append(row)
        return items

    if fname.endswith(".json"):
        try:
            data = json.loads(content_str)
        except (ValueError, RecursionError) as e:
            print(f"[Dataset] JSON parse error: {e}")
            return []
        items = []
        # Only a top-level JSON array is understood; other shapes yield nothing.
        if isinstance(data, list):
            for item in data:
                if isinstance(item, str) and item:
                    items.append(item)
                elif isinstance(item, dict):
                    if text_column and text_column in item and item.get(text_column):
                        items.append(str(item[text_column]))
                    else:
                        items.append(item)
        return items

    # Plain text: each non-empty line is one document
    return _nonblank_lines(content_str)


@app.post("/wordcloud-dataset", response_class=HTMLResponse)
async def wordcloud_dataset(
    request: Request,
    dataset_text: str = Form(None),
    dataset_file: UploadFile = File(None),
    text_column: str = Form("text"),
):
    """
    Word cloud + sentiment from an uploaded dataset (CSV/TSV/JSON/plain text)
    or pasted text.

    An uploaded file takes priority over ``dataset_text``. When no text can
    be extracted the page is rendered with an error message; otherwise the
    items go through ``_run_pipeline``. In both cases ``active_tab`` is
    ``"dataset"``.
    """
    raw_texts: list = []

    # Priority: file upload
    if dataset_file and dataset_file.filename:
        fname = dataset_file.filename.lower()
        content_bytes = await dataset_file.read()
        raw_texts = _texts_from_upload(fname, _decode_upload(content_bytes), text_column)
    elif dataset_text and dataset_text.strip():
        raw_texts = _nonblank_lines(dataset_text)

    if not raw_texts:
        outcome = {
            "error": "Tidak ada teks ditemukan dalam dataset. Pastikan file / teks tidak kosong.",
            "result": None,
            "image": None,
            "total_scraped": 0,
            "csv_filename": None,
        }
    else:
        outcome = _run_pipeline(raw_texts)

    return templates.TemplateResponse(
        request=request,
        name="index.html",
        context={
            "error": outcome["error"],
            "result": outcome["result"],
            "image": outcome["image"],
            "total_scraped": outcome["total_scraped"],
            "csv_filename": outcome["csv_filename"],
            "active_tab": "dataset",
        },
    )
398
+
399
+
400
# Direct-run entry point: starts uvicorn on all interfaces, port 8000.
# NOTE(review): the Dockerfile CMD and the Procfile in this commit launch
# uvicorn themselves with a different port, so this branch only runs for
# `python app.py`.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
docker-compose.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "3.9"
2
+
3
+ services:
4
+ app:
5
+ build: .
6
+ container_name: sentiment_app
7
+ ports:
8
+ # Map host 8000 to container 7860 (Hugging Face default)
9
+ - "8000:7860"
10
+ # Chrome needs a larger /dev/shm to avoid crashes in headless mode
11
+ shm_size: "2gb"
12
+ environment:
13
+ - PYTHONUNBUFFERED=1
14
+ volumes:
15
+ # Persist generated output (scraped CSV exports) between runs
16
+ - ./static/output:/home/user/app/static/output
17
+ # Mount a local model folder if you have one (optional)
18
+ # Rename or create the folder 'indoBERT-sentiment' in the project root
19
+ - ./indoBERT-sentiment:/home/user/app/indoBERT-sentiment
20
+ restart: unless-stopped
fb.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import json
4
+ import csv
5
+ from datetime import datetime
6
+ import undetected_chromedriver as uc
7
+ from selenium.webdriver.common.by import By
8
+ from selenium.webdriver.support.ui import WebDriverWait
9
+ from selenium.webdriver.support import expected_conditions as EC
10
+
11
# ========== CONFIGURATION ==========
# Credentials are read from the environment so they never live in version
# control. SECURITY: an earlier revision hard-coded a real e-mail/password
# pair here; that password must be treated as compromised and rotated.
FB_USERNAME = os.environ.get("FB_USERNAME", "")
FB_PASSWORD = os.environ.get("FB_PASSWORD", "")
COOKIES_FILE = "fb_cookies.json"

# Groups to scrape
GROUP_INPUTS = [
    "https://web.facebook.com/groups/183039928416039?locale=id_ID",
    "https://web.facebook.com/groups/teraswarga?locale=id_ID",
    "https://web.facebook.com/groups/967901979894945?locale=id_ID"
]

# Output locations. The timestamp is taken once so the CSV and JSON files of
# one run always share the same suffix.
_RUN_STAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
OUTPUT_CSV = f"facebook_groups_{_RUN_STAMP}.csv"
OUTPUT_JSON = f"facebook_groups_{_RUN_STAMP}.json"
26
+
27
# ========== SELENIUM SETUP ==========
options = uc.ChromeOptions()
options.add_argument("--disable-notifications")
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")

# NOTE: the browser is launched as soon as this module runs; the login and
# scraping functions below read `driver` and `wait` as module-level globals.
driver = uc.Chrome(options=options, use_subprocess=True)
wait = WebDriverWait(driver, 15)  # shared explicit wait with a 15-second timeout
35
+
36
+
37
+ # ========== FUNGSI LOGIN ==========
38
def save_cookies(driver, path):
    """Write the driver's current cookies to ``path`` as JSON."""
    with open(path, "w") as handle:
        serialized = json.dumps(driver.get_cookies())
        handle.write(serialized)
41
+
42
+
43
def load_cookies(driver, path):
    """Read a JSON cookie list from ``path`` and add each cookie to the driver."""
    with open(path, "r") as handle:
        stored = json.loads(handle.read())
    for entry in stored:
        driver.add_cookie(entry)
48
+
49
def _wait_for_search_bar():
    """Block until the Facebook search input is present (raises on timeout)."""
    wait.until(EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]')))


def _submit_login_form():
    """Fill in and submit the login form, trying each known page layout in turn.

    Raises:
        Exception: when none of the layouts could be filled in and submitted;
            the message carries the error of the last attempt.
    """
    # (email locator, password locator, submit by clicking the "login" button?)
    layouts = (
        # classic layout (id=email, id=pass)
        ((By.ID, "email"), (By.ID, "pass"), True),
        # dynamic ids: locate by name/type and submit the form directly
        (
            (By.XPATH, '//input[@name="email" and @type="text"]'),
            (By.XPATH, '//input[@name="pass" and @type="password"]'),
            False,
        ),
        # data-testid variant
        (
            (By.XPATH, '//input[@data-testid="royal-email"]'),
            (By.XPATH, '//input[@data-testid="royal-pass"]'),
            True,
        ),
    )
    last_error = None
    for email_locator, pass_locator, click_login in layouts:
        try:
            email_input = wait.until(EC.presence_of_element_located(email_locator))
            pass_input = driver.find_element(*pass_locator)
            email_input.clear()
            email_input.send_keys(FB_USERNAME)
            pass_input.clear()
            pass_input.send_keys(FB_PASSWORD)
            if click_login:
                driver.find_element(By.NAME, "login").click()
            else:
                pass_input.submit()
            return
        except Exception as e:
            last_error = e
    raise Exception(f"❌ Tidak menemukan form login yang cocok: {last_error}") from last_error


def fb_login(force=False):
    """Log in to Facebook, preferring saved cookies over the login form.

    Args:
        force: when True, skip the cookie file and always log in with
            username/password.

    Raises:
        Exception: when no login form could be submitted, or the browser is
            still on a login URL after submitting.
    """
    driver.get("https://www.facebook.com/")
    time.sleep(3)

    if not force and os.path.exists(COOKIES_FILE):
        try:
            load_cookies(driver, COOKIES_FILE)
            driver.refresh()
            time.sleep(5)
            if "login" not in driver.current_url:
                print("βœ… Login pakai cookies berhasil")
                # The search bar is only a readiness hint; a timeout here is
                # not fatal.
                try:
                    _wait_for_search_bar()
                    print("πŸ” Search bar tersedia, siap mencari grup")
                except Exception:
                    print("⚠️ Search bar belum muncul, tetap lanjutkan")
                return
        except Exception as e:
            print("⚠️ Cookies gagal dipakai:", e)

    print("πŸ”‘ Login manual pakai username/password...")
    _submit_login_form()

    time.sleep(5)
    if "login" in driver.current_url:
        raise Exception("❌ Login gagal! Cek username/password")

    save_cookies(driver, COOKIES_FILE)
    print("βœ… Login sukses & cookies disimpan")

    # After a successful login make sure the search bar is there; if not,
    # reload the home page once and wait again (this second wait may raise).
    try:
        _wait_for_search_bar()
        print("πŸ” Search bar tersedia, siap mencari grup")
    except Exception:
        print("⚠️ Search bar belum muncul, coba manual redirect ke beranda")
        driver.get("https://www.facebook.com/")
        _wait_for_search_bar()
124
+
125
+
126
def _element_visible(xpath):
    """Return True when an element matching ``xpath`` exists and is displayed."""
    try:
        return driver.find_element(By.XPATH, xpath).is_displayed()
    except Exception:
        # Missing or stale element: treat as "not shown".
        return False


def ensure_logged_in():
    """Re-login when the current page shows any sign of a logged-out session.

    Three signals are checked in order: the URL contains "login", the
    "See more on Facebook" popup is visible, or a visible e-mail/text input
    is present. Detection and re-login are kept separate so a failing
    ``fb_login`` is reported through the handler below instead of being
    silently swallowed. This function itself never raises for ordinary
    errors.
    """
    try:
        # URL switched to the login page
        if driver.current_url and "login" in driver.current_url:
            print("⚠️ Redirect ke halaman login, mencoba login ulang...")
            fb_login(force=True)
            return

        # 'See more on Facebook' popup
        if _element_visible('//div[contains(text(),"See more on Facebook")]'):
            print("⚠️ Popup login terdeteksi, login ulang...")
            fb_login(force=True)
            return

        # E-mail/password input shown in a modal
        if _element_visible('//input[@type="email" or @type="text"]'):
            print("⚠️ Form login modal terdeteksi, login ulang...")
            fb_login(force=True)
            return

    except Exception as e:
        print("⚠️ Gagal cek login:", e)
157
+
158
+ # ========== SEARCH & BUKA GRUP ==========
159
def open_group(group_input):
    """Open a group given either its URL or its display name.

    A value starting with "http" is opened directly and returned unchanged.
    Any other value is typed into the Facebook search box and the first link
    whose text contains it is opened. Returns the opened URL, or None when
    the search finds nothing or fails.
    """
    is_direct_link = group_input.startswith("http")
    if is_direct_link:
        print(f"πŸ”— Buka langsung link grup: {group_input}")
        driver.get(group_input)
        time.sleep(5)

        ensure_logged_in()
        return group_input

    # Name given: go through the search box.
    try:
        search_locator = (By.XPATH, '//input[@placeholder="Cari di Facebook"]')
        search_box = wait.until(EC.presence_of_element_located(search_locator))
        print(f"πŸ” Mencari grup '{group_input}' via search...")
        search_box.clear()
        search_box.send_keys(group_input)
        search_box.submit()
        time.sleep(5)

        # First anchor whose text contains the requested name
        matches = driver.find_elements(By.XPATH, f'//a[contains(text(),"{group_input}")]')
        href = matches[0].get_attribute("href") if matches else None

        if not href:
            print(f"❌ Grup '{group_input}' tidak ditemukan via search")
            return None

        print(f"βœ… Grup ditemukan: {href}")
        driver.get(href)
        time.sleep(5)
        return href

    except Exception as e:
        print(f"⚠️ Search gagal untuk '{group_input}':", e)
        return None
201
+
202
def scroll_to_bottom(driver, max_scrolls=10, pause_time=2):
    """Scroll to the page bottom repeatedly until the page stops growing.

    Performs at most ``max_scrolls`` scrolls, sleeping ``pause_time`` seconds
    after each one, and stops early as soon as the document height is
    unchanged by a scroll.
    """
    height_script = "return document.body.scrollHeight"
    previous = driver.execute_script(height_script)
    remaining = max_scrolls
    while remaining > 0:
        remaining -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        current = driver.execute_script(height_script)
        if current == previous:
            return
        previous = current
211
+
212
+ # ========== SCRAPING POSTINGAN GRUP ==========
213
def _resolve_permalink(post, group_url):
    """Return the post's own URL derived from its feed element, or None.

    Tries an anchor containing '/posts/', then one containing '/permalink/',
    then the ``top_level_post_id`` inside the element's ``data-ft`` attribute.
    """
    for anchor_xpath in (
        ".//a[contains(@href,'/posts/')]",
        ".//a[contains(@href,'/permalink/')]",
    ):
        try:
            href = post.find_element(By.XPATH, anchor_xpath).get_attribute("href")
        except Exception:
            continue
        if href:
            return href.split("?")[0]

    try:
        raw = post.get_attribute("data-ft")
        if raw and "top_level_post_id" in raw:
            pid = json.loads(raw).get("top_level_post_id")
            if pid:
                return f"{group_url.rstrip('/').split('?')[0]}/posts/{pid}/"
    except Exception:
        # Best effort only: a missing or malformed attribute means "no permalink".
        pass
    return None


def _click_while_present(context, xpath, max_clicks=50):
    """Click the first element matching ``xpath`` until it disappears.

    ``max_clicks`` bounds the loop so a button that never goes away cannot
    hang the scraper.
    """
    for _ in range(max_clicks):
        try:
            btn = context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", btn)
        except Exception:
            return
        time.sleep(2)


def _read_post(context):
    """Return ``(author, caption, comments)`` read from a post element."""
    author = "Unknown"
    for author_xpath in (
        ".//h2//span//span",
        ".//strong//span",
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
    ):
        try:
            author = context.find_element(By.XPATH, author_xpath).text.strip()
            break
        except Exception:
            continue

    # Expand collapsed comments, then collapsed replies.
    _click_while_present(
        context,
        ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]",
    )
    _click_while_present(
        context,
        ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]",
    )

    try:
        caption_blocks = context.find_elements(By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']")
        caption_texts = [t for t in (cb.text.strip() for cb in caption_blocks) if t]
        caption = "\n".join(caption_texts)[:2000]
    except Exception:
        caption = ""

    comments = []
    try:
        seen = set()
        for cb in context.find_elements(By.XPATH, ".//div[@aria-label='Komentar']//div[@dir='auto']"):
            text = cb.text.strip()
            if text and text not in seen:
                seen.add(text)
                comments.append(text)
    except Exception:
        comments = []

    return author, caption, comments


def scrape_group(group_url, group_name, max_scrolls=3, max_posts=None):
    """Scrape posts (author, caption, comments) from one Facebook group.

    Works in two phases so no element is used after the browser navigated
    away from the page it came from:

    1. Scroll the group feed up to ``max_scrolls`` times and collect the
       de-duplicated permalinks of the posts found. Posts without a
       resolvable permalink are read directly from the feed and stored with
       ``post_url`` set to ``group_url``.
    2. Visit every collected permalink and read the post there.

    Args:
        group_url: URL of the group feed.
        group_name: label stored in every record.
        max_scrolls: maximum number of feed scrolls.
        max_posts: optional cap on the number of returned records.

    Returns:
        A list of dicts with the keys ``group_name``, ``group_url``,
        ``post_url``, ``author``, ``caption`` and ``comments``.
    """
    print(f"πŸ“₯ Scraping grup: {group_name} ({group_url})")
    driver.get(group_url)
    time.sleep(4)
    ensure_logged_in()

    posts = []

    def _record(post_url, author, caption, comments):
        posts.append({
            "group_name": group_name,
            "group_url": group_url,
            "post_url": post_url,
            "author": author,
            "caption": caption,
            "comments": comments,
        })
        print(f"βœ… Post captured: {author} | {caption[:60]}... | {len(comments)} komentar")

    # ---- Phase 1: collect permalinks from the feed ----
    permalinks = []
    seen_links = set()
    seen_elements = set()
    last_height = driver.execute_script("return document.body.scrollHeight")

    for scroll_round in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        ensure_logged_in()

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"πŸ”Ž Ditemukan {len(post_elements)} postingan pada scroll {scroll_round+1}")

        for idx, post in enumerate(post_elements):
            if max_posts and len(posts) + len(permalinks) >= max_posts:
                break

            try:
                # Every round re-finds the articles of earlier rounds.
                if post.id in seen_elements:
                    continue
                seen_elements.add(post.id)

                permalink = _resolve_permalink(post, group_url)
                if permalink:
                    if permalink not in seen_links:
                        seen_links.add(permalink)
                        permalinks.append(permalink)
                    continue

                print("⚠️ Tidak ada permalink & tidak bisa generate. Tetap lanjut simpan data.")
                author, caption, comments = _read_post(post)
                _record(group_url, author, caption, comments)

            except Exception as e:
                print(f"⚠️ Error baca postingan {idx}: {e}")
                continue

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # ---- Phase 2: visit each permalink ----
    for idx, permalink in enumerate(permalinks):
        try:
            driver.get(permalink)
            time.sleep(3)
            ensure_logged_in()
            post_context = driver.find_element(By.XPATH, "//div[@role='article']")
        except Exception as e:
            print(f"⚠️ Gagal buka permalink {permalink}: {e}")
            post_context = None

        try:
            if post_context is None:
                # Keep a record of the post URL with empty content.
                author, caption, comments = "Unknown", "", []
            else:
                author, caption, comments = _read_post(post_context)
            _record(permalink, author, caption, comments)
        except Exception as e:
            print(f"⚠️ Error baca postingan {idx}: {e}")
            continue

    return posts
362
+
363
+ # ========== MAIN ==========
364
+ all_data = []
365
+
366
+ fb_login()
367
+
368
+ for g in GROUP_INPUTS:
369
+ group_url = open_group(g)
370
+ if group_url:
371
+ posts = scrape_group(group_url, g)
372
+ all_data.extend(posts)
373
+
374
+ # simpan ke CSV
375
+ with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
376
+ fieldnames = ["group_name", "group_url", "post_url", "author", "caption", "comments"]
377
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
378
+ writer.writeheader()
379
+ for row in all_data:
380
+ writer.writerow(row)
381
+
382
+ # simpan ke JSON
383
+ with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
384
+ json.dump(all_data, f, ensure_ascii=False, indent=2)
385
+
386
+ print(f"βœ… Selesai. Data disimpan ke {OUTPUT_CSV} dan {OUTPUT_JSON}")
387
+ try:
388
+ driver.quit()
389
+ except:
390
+ pass
medos_scraping.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+
6
+ from datetime import datetime
7
+ from json import JSONDecodeError
8
+ from selenium import webdriver
9
+ from selenium.webdriver.common.by import By
10
+ from selenium.webdriver.support.ui import WebDriverWait
11
+ from selenium.webdriver.support import expected_conditions as EC
12
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
13
+ from selenium.webdriver.common.keys import Keys
14
+
15
+ # ==============================================================================
16
+ # KONFIGURASI SELENIUM
17
+ # ==============================================================================
18
+
19
def setup_driver():
    """Create the Chrome WebDriver used by the scraper.

    Returns:
        The started ``webdriver.Chrome`` instance, or ``None`` when Chrome
        could not be launched (the error is printed instead of raised).
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode stays disabled: the session flow in this file asks the
    # operator to solve CAPTCHAs in the visible browser window.
    # chrome_options.add_argument('--headless')
    for argument in (
        '--disable-gpu',
        '--log-level=3',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    ):
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as exc:
        print(f"Error saat memulai WebDriver: {exc}")
        print("Pastikan chromedriver sudah diunduh dan berada di folder yang sama.")
        return None
35
+
36
+ # ==============================================================================
37
+ # FUNGSI COOKIES & CAPTCHA
38
+ # ==============================================================================
39
+
40
def save_cookies(driver, path):
    """Write the cookies of the current browser session to *path* as JSON.

    Args:
        driver: object exposing ``get_cookies()`` (a Selenium WebDriver).
        path: destination file; it is created or overwritten.
    """
    with open(path, 'w', encoding='utf-8') as cookie_file:
        cookie_file.write(json.dumps(driver.get_cookies(), indent=2))
    print(f"\nCookies berhasil disimpan ke {path}")
45
+
46
+ # [PERBAIKAN] Fungsi ini dibuat lebih tangguh terhadap file kosong/rusak
47
def load_cookies(driver, path):
    """Add the cookies stored in a JSON file to the browser session.

    Args:
        driver: object exposing ``add_cookie(cookie)`` (a Selenium WebDriver).
        path: JSON file expected to contain a list of cookie dicts.

    Returns:
        True when the file was read and every cookie was added; False when
        the file is missing, empty, not valid JSON, not a list, or when any
        other error occurs while reading or adding (the reason is printed).
    """
    missing_or_empty = not os.path.exists(path) or os.path.getsize(path) == 0
    if missing_or_empty:
        print(f"File cookies '{path}' tidak ditemukan atau kosong.")
        return False

    try:
        with open(path, 'r', encoding='utf-8') as cookie_file:
            stored = json.load(cookie_file)

        if isinstance(stored, list):
            for entry in stored:
                driver.add_cookie(entry)
            print(f"Cookies berhasil dimuat dari {path}")
            return True

        print(f"Format data di '{path}' tidak valid (bukan list).")
        return False
    except JSONDecodeError:
        print(f"Gagal membaca '{path}' karena file rusak (JSONDecodeError).")
        return False
    except Exception as exc:
        print(f"Terjadi error saat memuat cookies dari '{path}': {exc}")
        return False
71
+
72
def establish_and_verify_session(driver, base_cookies_path, profile_cookies_path, profile_url):
    """Build a TikTok session in two stages and confirm a profile page loads.

    Stage 1 opens the homepage and loads the base cookies; when that fails
    the operator is asked to solve the CAPTCHA and the cookies are saved.
    Stage 2 opens *profile_url*, tries the profile cookies and, if the post
    grid does not appear, asks for manual verification and saves the cookies
    before checking once more.

    Returns:
        True when the profile's post grid was found, otherwise False.
    """
    separator = "\n" + "="*50
    post_link_locator = (By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a')

    def post_grid_appears():
        # True when a post link shows up within 10 seconds.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(post_link_locator)
            )
        except TimeoutException:
            return False
        return True

    # --- STAGE 1: base session on the homepage ---
    print("\n--- Tahap 1: Membangun Sesi Dasar di tiktok.com ---")
    driver.get("https://www.tiktok.com/")

    base_loaded = load_cookies(driver, base_cookies_path)
    if not base_loaded:
        print(separator)
        print("‼️ TINDAKAN AWAL DIPERLUKAN ‼️")
        input("File cookies dasar tidak valid/tidak ada. Selesaikan CAPTCHA di tiktok.com, lalu tekan [Enter]...")
        save_cookies(driver, base_cookies_path)

    driver.refresh()
    print("Sesi dasar telah dibuat/dimuat.")

    # --- STAGE 2: verify the session on the profile page ---
    print("\n--- Tahap 2: Verifikasi Sesi di Halaman Profil ---")
    driver.get(profile_url)

    if load_cookies(driver, profile_cookies_path):
        print("Mencoba memvalidasi sesi dengan cookies profil...")
        driver.refresh()
        if post_grid_appears():
            print("βœ… Sesi profil berhasil dipulihkan.")
            return True
        print("⚠️ Cookies profil tidak valid. Diperlukan verifikasi manual.")

    # Manual fallback: let the operator clear any CAPTCHA, then re-check.
    print(separator)
    print("‼️ VERIFIKASI SEBELUM SCRAPING ‼️")
    input("Halaman profil telah dimuat. Jika ada CAPTCHA, selesaikan sekarang. Tekan [Enter]...")
    save_cookies(driver, profile_cookies_path)

    if post_grid_appears():
        print("βœ… Sesi profil berhasil dibuat/diperbarui.")
        return True
    print("❌ Gagal memverifikasi halaman profil.")
    return False
121
+
122
+ # ==============================================================================
123
+ # FUNGSI-FUNGSI BANTUAN SCRAPING (Tidak Berubah)
124
+ # ==============================================================================
125
+
126
def get_video_links(driver, max_videos):
    """Collect video links from the currently open profile page.

    The page is scrolled to the bottom repeatedly until *max_videos* unique
    links were seen or a scroll round yields no new link.

    Args:
        driver: Selenium WebDriver already showing the profile page.
        max_videos: maximum number of links to return.

    Returns:
        Up to *max_videos* unique URLs in the order they appear on the page,
        or an empty list when no post link shows up within 15 seconds.
    """
    print(f"\nπŸ”Ž Mulai mengumpulkan link video (target: {max_videos} video)...")
    post_link_selector = 'div[data-e2e="user-post-item"] a'
    # A dict is used as an ordered set: a plain set would make the final
    # truncation keep an arbitrary subset instead of the first links found.
    video_links = {}

    try:
        # 1. Wait until the first video element shows up.
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, post_link_selector))
        )
        print("βœ… Halaman profil berhasil dimuat.")

        # 2. Scroll and collect until the target is reached.
        while len(video_links) < max_videos:
            # Remember the count before this round for end-of-page detection.
            links_before_scroll = len(video_links)

            for elem in driver.find_elements(By.CSS_SELECTOR, post_link_selector):
                href = elem.get_attribute('href')
                if href:
                    video_links.setdefault(href, None)

            if len(video_links) >= max_videos:
                print(f"🎯 Target {max_videos} video tercapai ({len(video_links)} ditemukan). Berhenti scroll.")
                break

            print(f"πŸ“œ Scrolling... Ditemukan {len(video_links)}/{max_videos} video.")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Give newly requested content time to load.
            time.sleep(3)

            # 3. No growth during this round means the previous scroll loaded
            #    nothing new; stop to avoid an endless loop.
            if len(video_links) == links_before_scroll:
                print("🏁 Halaman sudah paling bawah atau tidak ada video baru yang dimuat.")
                break

    except TimeoutException:
        print("❌ Gagal memuat halaman profil atau tidak ada video ditemukan.")
        return []

    print(f"\nπŸ‘ Selesai mengumpulkan. Total {len(video_links)} link video unik ditemukan.")

    # Never return more than max_videos links.
    return list(video_links)[:max_videos]
179
+
180
def check_for_captcha(driver):
    """Report whether a known CAPTCHA prompt is present on the page.

    Every iframe is inspected first, then the top-level document. The driver
    is switched back to the top-level document before returning.

    Returns:
        True when an element containing one of the known prompt texts is
        found, otherwise False.
    """
    known_prompts = (
        "Drag the slider to fit the puzzle",
        "Drag the puzzle piece into place",
        "Geser puzzle untuk melengkapi gambar",
        "Verify to continue",
    )
    # contains(., '...') matches on the element's full string value.
    conditions = " or ".join(f"contains(., '{prompt}')" for prompt in known_prompts)
    xpath_query = f"//*[{conditions}]"

    # 1. Look inside the iframes first.
    try:
        iframes = driver.find_elements(By.TAG_NAME, 'iframe')
        if iframes:
            print(f"\n Mendeteksi {len(iframes)} iFrame, memeriksa satu per satu untuk CAPTCHA...")
        for frame in iframes:
            try:
                driver.switch_to.frame(frame)
                driver.find_element(By.XPATH, xpath_query)
            except NoSuchElementException:
                # Not in this frame: go back and try the next one.
                driver.switch_to.default_content()
                continue
            print("\n⚠️ CAPTCHA terdeteksi di dalam sebuah iFrame!")
            # Return to the main document so later lookups keep working.
            driver.switch_to.default_content()
            return True
    except Exception as exc:
        print(f"\n Error saat memeriksa iFrame: {exc}")
        # Make sure an unexpected error does not leave us inside a frame.
        driver.switch_to.default_content()

    # 2. Fall back to the top-level document.
    try:
        driver.find_element(By.XPATH, xpath_query)
    except NoSuchElementException:
        return False
    print("\n⚠️ CAPTCHA terdeteksi di halaman utama!")
    return True
224
+
225
def scrape_video_details(driver, video_url):
    """Scrape metadata, caption and the comment tree of one TikTok video.

    Args:
        driver: Selenium WebDriver with an established session.
        video_url: URL of the video page to open.

    Returns:
        A dict with the keys ``url``, ``upload_date``, ``like_count``,
        ``caption_short``, ``caption_detail`` and ``comments`` (a list of
        ``{'author', 'comment', 'replies'}`` dicts; each reply is an
        ``{'author', 'comment'}`` dict), or ``None`` when the page could not
        be processed.
    """
    print(f"\n--- Memproses video: {video_url} ---")
    driver.get(video_url)

    comment_item_xpath = '//div[contains(@class, "DivCommentItemWrapper")]'
    # XPath's `and` binds tighter than `or`, so the English alternative is
    # grouped explicitly as: "View" together with ("reply" or "replies").
    reply_button_xpath = (
        "//span[contains(text(), 'balasan') or "
        "(contains(text(), 'View') and "
        "(contains(text(), 'reply') or contains(text(), 'replies')))]"
    )
    max_stalled_attempts = 5
    # A matched element whose click adds no comment item this many times in a
    # row is ignored afterwards; without this cap an element that never
    # disappears would be clicked forever.
    max_click_failures = 2

    max_retries = 2
    for attempt in range(max_retries):
        try:
            upload_date = "N/A"
            like_count = "N/A"

            try:
                date_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
                )
                upload_date = date_element.text
            except TimeoutException:
                print(" Â -> Info tanggal video tidak ditemukan.")

            try:
                like_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'strong[data-e2e="like-count"]'))
                )
                like_count = like_element.text
                print(f" Â -> Jumlah 'like' ditemukan: {like_count}")
            except TimeoutException:
                print(" Â -> Info jumlah 'like' tidak ditemukan.")

            video_data = {'url': video_url, 'upload_date': upload_date, 'like_count': like_count, 'caption_short': '', 'caption_detail': '', 'comments': []}

            # --- Caption ---
            try:
                # 1. Wait for the description container.
                desc_container = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
                )

                # 2. The caption itself is optional.
                try:
                    video_data['caption_short'] = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]').text
                    print(f" Â -> Caption singkat ditemukan: {video_data['caption_short'][:50]}...")

                    # 3. Expand the detailed caption only when a caption exists.
                    try:
                        more_button = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
                        driver.execute_script("arguments[0].click();", more_button)
                        print(" Â -> Tombol 'more' (ikon) pada caption diklik.")
                        time.sleep(2)
                        detail_container = WebDriverWait(driver, 5).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
                        )
                        desc_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-desc']").text
                        keywords_text = ""
                        try:
                            keywords_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-keywords']").text
                        except NoSuchElementException:
                            pass
                        video_data['caption_detail'] = f"Deskripsi: {desc_text}\nKeywords: {keywords_text}".strip()
                        print(f" Â -> Caption detail ditemukan: {video_data['caption_detail'][:50]}...")
                    except (NoSuchElementException, TimeoutException):
                        print(" Â -> Tidak ada tombol 'more' untuk caption detail.")

                except NoSuchElementException:
                    print(" Â -> Video ini tidak memiliki caption.")

            except TimeoutException:
                print(" Â -> Bagian deskripsi/caption tidak ditemukan, kemungkinan halaman terhalang.")

            # --- Comments ---
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
                )
                print(" Â -> Bagian komentar ditemukan. Memuat seluruh komentar...")
            except TimeoutException:
                print(" Â -> Bagian komentar tidak ditemukan.")
                return video_data

            try:
                print(" Â -> Memulai proses scroll dan klik balasan secara dinamis...")

                last_comment_count = 0
                stalled_attempts = 0
                click_failures = {}  # element id -> consecutive clicks without effect

                while stalled_attempts < max_stalled_attempts:
                    try:
                        view_buttons = [
                            button
                            for button in driver.find_elements(By.XPATH, reply_button_xpath)
                            if click_failures.get(button.id, 0) < max_click_failures
                        ]
                        if view_buttons:
                            print(f" Â  Â -> Menemukan {len(view_buttons)} tombol balasan. Mengklik satu...")
                            target = view_buttons[0]
                            items_before = len(driver.find_elements(By.XPATH, comment_item_xpath))
                            driver.execute_script("arguments[0].click();", target)
                            time.sleep(2)
                            items_after = len(driver.find_elements(By.XPATH, comment_item_xpath))
                            if items_after > items_before:
                                click_failures.pop(target.id, None)
                            else:
                                click_failures[target.id] = click_failures.get(target.id, 0) + 1
                            stalled_attempts = 0
                            continue
                    except Exception as e:
                        print(f" Â  Â -> Error minor saat mengklik tombol balasan: {e}")

                    print(" Â  Â -> Tidak ada tombol balasan terlihat. Melakukan scroll...")
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(3)

                    current_comment_count = len(driver.find_elements(By.XPATH, comment_item_xpath))
                    if current_comment_count > last_comment_count:
                        print(f" Â  Â -> Konten baru dimuat. Total item sekarang: {current_comment_count}")
                        last_comment_count = current_comment_count
                        stalled_attempts = 0
                    else:
                        stalled_attempts += 1
                        print(f" Â  Â -> Konten tidak bertambah, percobaan ke-{stalled_attempts}/{max_stalled_attempts}.")

                print(" Â -> Scroll dan klik selesai. Memulai ekstraksi final...")

                comment_item_count = len(driver.find_elements(By.XPATH, comment_item_xpath))
                print(f" Â -> Ditemukan total {comment_item_count} item komentar. Memproses satu per satu...")

                for i in range(comment_item_count):
                    try:
                        # The list is looked up again for every index, as in
                        # the original code; the IndexError branch below
                        # handles a list that shrank in the meantime.
                        all_comment_items = driver.find_elements(By.XPATH, comment_item_xpath)
                        item = all_comment_items[i]

                        # Top-level comment: start a new thread.
                        try:
                            author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
                            comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]')
                            new_comment = {
                                'author': author_element.text,
                                'comment': comment_element.text,
                                'replies': []
                            }
                            video_data['comments'].append(new_comment)
                            continue
                        except NoSuchElementException:
                            pass

                        # Reply: attach it to the most recent top-level comment.
                        try:
                            reply_author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
                            reply_comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]')
                            if video_data['comments']:
                                new_reply = {
                                    'author': reply_author_element.text,
                                    'comment': reply_comment_element.text
                                }
                                video_data['comments'][-1]['replies'].append(new_reply)
                        except NoSuchElementException:
                            pass
                    except IndexError:
                        print(f" Â  Â -> Peringatan: Jumlah komentar berubah saat proses. Melewatkan indeks ke-{i}.")
                        break
                    except Exception as e:
                        print(f" Â  Â -> Terjadi error pada item ke-{i}, melewati. Error: {e}")

                print(" Â -> Selesai. Berhasil memproses dan mengelompokkan komentar.")

            except Exception as e:
                print(f" Â -> Gagal pada proses utama karena: {e}")

            return video_data

        except TimeoutException:
            # NOTE(review): every explicit wait above handles its own
            # TimeoutException, so this retry path only runs for a timeout
            # raised by some other driver call - confirm this is intended.
            print(" Â -> Gagal memuat elemen halaman (Timeout).")
            if check_for_captcha(driver):
                print("\n" + "="*50)
                print(f"⚠️ CAPTCHA terdeteksi pada percobaan ke-{attempt + 1} untuk video: {video_url}")
                input(" Â  Silakan selesaikan CAPTCHA di browser, lalu tekan [Enter] untuk mencoba lagi...")
                driver.refresh()
                print(" Â  Mencoba lagi...")
                continue
            else:
                print(" Â -> Tidak ada CAPTCHA. Melewati video ini.")
                return None

    print(f" Â -> Gagal memproses video setelah {max_retries} kali percobaan. Melewati video ini.")
    return None
401
+ # ==============================================================================
402
+ # EKSEKUSI UTAMA (Tidak Berubah)
403
+ # ==============================================================================
404
if __name__ == "__main__":
    # Profiles to scrape and the per-profile video limit.
    PROFILE_USERNAMES = ["rctvcirebon", "cirebonkabtv", "kang_jigus", "kangimron_", "info.cirebonan"]
    MAX_VIDEOS_PER_PROFILE = 200

    BASE_COOKIES_FILE = "tiktok_base_cookies.json"
    PROFILE_COOKIES_FILE = "tiktok_profile_cookies.json"

    all_data = []
    driver = setup_driver()

    if driver:
        try:
            if not PROFILE_USERNAMES:
                print("Daftar PROFILE_USERNAMES kosong.")
            else:
                # The session is verified once, against the first profile.
                first_profile_url = f"https://www.tiktok.com/@{PROFILE_USERNAMES[0]}"
                session_ok = establish_and_verify_session(driver, BASE_COOKIES_FILE, PROFILE_COOKIES_FILE, first_profile_url)

                if session_ok:
                    for username in PROFILE_USERNAMES:
                        print("\n" + "="*70)
                        print(f"MEMULAI SCRAPING UNTUK PROFIL: @{username}")
                        print("="*70)

                        profile_url = f"https://www.tiktok.com/@{username}"
                        driver.get(profile_url)

                        video_urls = get_video_links(driver, MAX_VIDEOS_PER_PROFILE)

                        for url in video_urls:
                            data = scrape_video_details(driver, url)
                            if data:
                                data['profile_username'] = username
                                data['scrape_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                all_data.append(data)
                            time.sleep(2)

        except Exception as e:
            print(f"\nTerjadi kesalahan fatal selama proses: {e}")
        finally:
            # Saving happens here so that everything collected before a fatal
            # error (or an interrupt) is still written to disk.
            try:
                if all_data:
                    print("\nMenyimpan semua data yang terkumpul...")
                    df = pd.DataFrame(all_data)
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    output_filename = f"tiktok_data_multi_{timestamp}"
                    df.to_csv(f'{output_filename}.csv', index=False, encoding='utf-8-sig')
                    print(f"Data telah disimpan ke {output_filename}.csv")
                    with open(f'{output_filename}.json', 'w', encoding='utf-8') as f:
                        json.dump(all_data, f, ensure_ascii=False, indent=4)
                    print(f"Data telah disimpan ke {output_filename}.json")
                else:
                    print("\nTidak ada data yang berhasil dikumpulkan untuk disimpan.")
            except Exception as e:
                print(f"\nTerjadi kesalahan fatal selama proses: {e}")
            finally:
                print("\n--- PROSES SELESAI ---")
                driver.quit()
preparing.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Preparing.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
8
+ """
9
+
10
# NOTE: the Colab shell magics below are not valid Python inside a .py module
# (they are a SyntaxError outside IPython), so they are kept as comments. Run
# them in a notebook cell, or in a terminal without the leading "!", before
# executing this script.
# !pip -q install -U transformers accelerate torch
# !pip install transformers
# !pip install --upgrade transformers
# !pip uninstall -y torch torchvision torchaudio transformers
# !pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers accelerate
16
+
17
+ import pandas as pd
18
+ import numpy as np
19
+ import matplotlib.pyplot as plt
20
+ import seaborn as sns
21
+ import os
22
+ import torch
23
+ import re
24
+
25
+ from textblob import TextBlob
26
+ from transformers import AutoConfig, pipeline
27
+
28
# Print every CSV dataset found in the project folder.
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

try:
    # All entries of the folder; only the CSV files are read.
    files = os.listdir(folder_path)

    for file_name in files:
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            df = pd.read_csv(file_path)
            # Prints the DataFrame (pandas abbreviates long frames itself).
            print(df)
            print("\n")  # blank space between files
        except Exception as e:
            print(f"Tidak dapat membaca file {file_name}. Error: {e}\n")

except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as e:
    print(f"Terjadi error: {e}")
58
+
59
# Social media ("Medsos")

# 1. Instagram export: align column names with the shared schema.
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
df_ig = pd.read_csv(ig_path).rename(columns={
    'source_name': 'profile',
    'post_url': 'url',
})

# 2. TikTok export: align column names and drop the unused upload date.
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
df_tiktok = (
    pd.read_csv(tiktok_path)
    .rename(columns={
        'like_count': 'likes',
        'caption_short': 'caption',
        'profile_username': 'profile',
        'scrape_date': 'datetime',
    })
    .drop(columns=['upload_date'])
)

# --- Column selection and type conversion (done BEFORE merging) ---

kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram subset, with parsed timestamps and a source marker.
df1_pilihan = df_ig[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Instagram',
)

# TikTok subset, with parsed timestamps and a source marker.
df2_pilihan = df_tiktok[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Tiktok',
)

# --- Merge ---
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)
96
+
97
+ # --- Pembersihan Data (Preprocessing) ---
98
+
99
# The datetime column is already converted; convert the like counters next.
# Scraped counters can be abbreviated ("12.3K", "1.2M"); a plain
# pd.to_numeric(..., errors='coerce') turns those into 0, so the suffix is
# expanded first. The Indonesian suffixes "rb"/"jt" are accepted as well
# (assumption: they may occur in the Instagram export - verify on the data).
_LIKE_SUFFIX_MULTIPLIERS = {
    'k': 1_000,
    'rb': 1_000,
    'm': 1_000_000,
    'jt': 1_000_000,
    'b': 1_000_000_000,
}


def _parse_like_count(value):
    """Return *value* as an int; abbreviated counters are expanded, junk is 0."""
    if pd.isna(value):
        return 0
    number = pd.to_numeric(value, errors='coerce')
    if pd.notna(number):
        # Same result as the previous astype(int): fractions are truncated.
        return int(number)
    match = re.fullmatch(
        r'\s*(\d+(?:[.,]\d+)?)\s*(k|rb|m|jt|b)\s*',
        str(value),
        flags=re.IGNORECASE,
    )
    if match is None:
        return 0
    magnitude = float(match.group(1).replace(',', '.'))
    return int(round(magnitude * _LIKE_SUFFIX_MULTIPLIERS[match.group(2).lower()]))


df_gabungan['likes'] = df_gabungan['likes'].apply(_parse_like_count).astype(int)
101
+
102
def clean_text(text):
    """Normalise free text to lowercase ASCII letters separated by single spaces.

    Missing values become "". URLs (tokens starting with http or www) are
    removed, every character that is not a-z or whitespace is replaced by a
    space, and runs of whitespace are collapsed.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_urls)
    return re.sub(r'\s+', ' ', letters_only).strip()
109
+
110
def format_author(text):
    """Insert ", " before each word-initial "author" except one at position 0.

    The argument is converted with ``str()`` first.
    """
    separator_pattern = re.compile(r'(?<!^)\bauthor')
    return separator_pattern.sub(', author', str(text))
113
+
114
df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)
# clean_text turns an HTML "<br>" into the standalone token "br". Remove only
# that whole word: deleting every "br" substring would also damage ordinary
# words such as "brebes" or "sumber".
df_gabungan['caption'] = (
    df_gabungan['caption']
    .str.replace(r'\bbr\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
# Strip the "replies" key left over from the stringified comment structure,
# then separate the individual comments with commas.
df_gabungan['comments'] = df_gabungan['comments'].str.replace(r'replies', '', regex=True)
df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)

# Drop rows without a parsable timestamp or caption, then exact duplicates.
df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
df_gabungan = df_gabungan.drop_duplicates()

# --- FINAL RESULT ---
print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")
print(f"Total baris Instagram: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Instagram'])}")
print(f"Total baris TikTok: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Tiktok'])}")
df_gabungan.info()

# Save the merged data to a new CSV file.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
df_gabungan.to_csv(save_path, index=False)
print(f"\nData berhasil disimpan di: {save_path}")
134
+
135
# News articles ("Berita")

df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')

# Lowercase the tags and remove literal ", nan" fragments.
df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)

# Keep only articles whose tag mentions "cirebon" (rows without a tag are out).
df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()

# Drop rows with any missing value, then exact duplicates.
df_berita = df_berita_filtered.dropna().drop_duplicates()

df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
df_berita['tag'] = df_berita['tag'].apply(clean_text)
df_berita['judul'] = df_berita['judul'].apply(clean_text)
df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()

# Exclude promotional / off-topic articles by tag. `.copy()` makes the result
# an independent frame so the column assignment below does not write into a
# slice of the previous one.
df_berita = df_berita[~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)].copy()

# Remove publisher boilerplate (scroll hints, credits, copyright notices,
# "baca juga" links, datelines) from the article body.
df_berita['isi_berita'] = (
    df_berita['isi_berita']
    .str.replace(r'(?i)scroll.*?content', '', regex=True)
    .str.replace(r'(?i)h3:', '', regex=True)
    .str.replace(r'(?i)tonton.*?20detik\]', '', regex=True)
    .str.replace(r'(?i)editor.*?antara', '', regex=True)
    .str.replace(r'(?i)pewarta.*?antara', '', regex=True)
    .str.replace(r'(?i)copyright.*?(antara|com)', '', regex=True)
    .str.replace(r'(?i)dilarang.*?antara', '', regex=True)
    .str.replace(r'(?i)advertisement', '', regex=True)
    .str.replace(r'(?i)baca (juga )?[^.]+sini\.?', '', regex=True)
    .str.replace(r'(?i)\bradar\b.*?-', '', regex=True)
    .str.replace(r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*', '', regex=True)
    .str.replace(r'(?i)cek.*?(sumber:|reportase)', '', regex=True)
)

# Cleaning can create new duplicates; rows whose date failed to parse go too.
df_berita = df_berita.drop_duplicates()
df_berita = df_berita.dropna(subset=['isi_berita', 'tag', 'tanggal'])

print(df_berita)

save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
df_berita.to_csv(save_path, index=False)
178
+
179
# Hugging Face model used for inference: IndoBERT fine-tuned (3 classes).
MODEL_ID = "taufiqdp/indonesian-sentiment"

# (optional) a Hugging Face token is only needed for a private repository:
# from huggingface_hub import login
# login("hf_xxx")  # your token

# The model configuration provides id2label for the LABEL_n fallback.
config = AutoConfig.from_pretrained(MODEL_ID)

# Text-classification pipeline on the first GPU when available, else on CPU.
# Inputs are truncated to 256 tokens and only the top label is returned.
clf = pipeline(
    task="text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    device=-1 if not torch.cuda.is_available() else 0,
    max_length=256,
    truncation=True,
    return_all_scores=False,
)
195
+
196
def normalize_label(lbl: str, id2label=None) -> str:
    """Map a classifier label to ``"positif"``, ``"negatif"`` or ``"netral"``.

    Args:
        lbl: label as returned by the pipeline, e.g. "Positive" or "LABEL_2".
        id2label: optional index-to-name mapping used for "LABEL_n" labels;
            defaults to the module-level ``config.id2label``.

    Returns:
        The canonical Indonesian label. Labels of the form "LABEL_n" are
        resolved through the mapping and then normalised as well; when that
        lookup fails the result is "netral". Any other unknown label is
        returned lowercased.
    """
    canonical = {
        "positif": "positif", "positive": "positif",
        "negatif": "negatif", "negative": "negatif",
        "netral": "netral", "neutral": "netral",
    }
    l = lbl.lower()
    if l in canonical:
        return canonical[l]

    # Fallback for the generic 'LABEL_0/1/2' format.
    if "label_" in l:
        try:
            idx = int(l.split("_")[-1])
            mapping = config.id2label if id2label is None else id2label
            resolved = mapping[idx].lower()
        except Exception:
            # Best effort: a label that cannot be resolved counts as neutral.
            return "netral"
        # The resolved name may be English ("positive"), so normalise it too.
        return canonical.get(resolved, resolved)
    return l
209
+
210
# Sentiment of the CAPTION column (missing values are classified as "").
texts_caption = df_gabungan['caption'].fillna("").astype(str).tolist()
preds_caption = clf(texts_caption, batch_size=64)
df_gabungan['sentimen_caption'] = list(
    map(normalize_label, (pred['label'] for pred in preds_caption))
)

# Sentiment of the COMMENTS column, handled the same way.
texts_comments = df_gabungan['comments'].fillna("").astype(str).tolist()
preds_comments = clf(texts_comments, batch_size=64)
df_gabungan['sentimen_comments'] = list(
    map(normalize_label, (pred['label'] for pred in preds_comments))
)
219
+
220
+ # (opsional) buat kolom sentimen gabungan
221
+ # kalau caption netral/empty, ambil dari comments
222
def combine_sentiment(row):
    """Return the caption sentiment unless it is "netral", else the comments'.

    *row* is any mapping with the keys ``sentimen_caption`` and
    ``sentimen_comments``; the latter is only read for a neutral caption.
    """
    caption_sentiment = row['sentimen_caption']
    return row['sentimen_comments'] if caption_sentiment == "netral" else caption_sentiment
226
# Overall sentiment per row, then persist the social-media result.
df_gabungan['sentimen'] = df_gabungan.apply(combine_sentiment, axis='columns')

df_gabungan.to_csv('medsos2.csv', index=False)


# Same classification for the news frame, using the article body.
texts_b = df_berita['isi_berita'].fillna("").astype(str).tolist()
preds_b = clf(texts_b, batch_size=64)
df_berita['sentimen'] = list(map(normalize_label, (pred['label'] for pred in preds_b)))

df_berita.to_csv('berita2.csv', index=False)
requirements.txt ADDED
Binary file (3.95 kB). View file
 
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.10
sentimentanalysis.py ADDED
@@ -0,0 +1,675 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """SentimentAnalysis
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/fatihramadhan/sentimentanalysis.74f160cb-74cc-4609-ba85-0081c3654a18.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20260326/auto/storage/goog4_request%26X-Goog-Date%3D20260326T141800Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2fe877a762338b5e556a035ce46a5a6bf9c51c0d33c4b062e919cfd44e0297ff787b3a23bf4290b33ca0467d04cf7ba377d77c975cd79da4f1adfec176cb7d78d1eddf1eec10e87d86e656200eaed9b0781f5f5d215ee084957aa5a30c2e9fa1731c23b333d5f742767875bd84e34b83339d834639567639d817ad1295fbc8fd552a5ae92f938b90cb8d916b4a7190e208c6d0effdc10665a9405efffc12a2d4497159428e898204e32ad2d629a58e985c020c7febef459895fd34b052c37a041102284e207ed788a6490c64656ece6150fc355120a49cf2b2fdadda53018d3dba4f8aeda15faaa1eb9c9cef82a476c38be69504e5a5f98cf61686a2b337ea77
8
+ """
9
+
10
+ # IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
11
+ # THEN FEEL FREE TO DELETE THIS CELL.
12
+ # NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
13
+ # ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
14
+ # NOTEBOOK.
15
+ import kagglehub
16
+ fatihramadhan_sentimentdataset_path = kagglehub.dataset_download('fatihramadhan/sentimentdataset')
17
+
18
+ print('Data source import complete.')
19
+
20
+ import pandas as pd
21
+ import numpy as np
22
+ import matplotlib.pyplot as plt
23
+
24
+ import re
25
+ import html
26
+ import torch
27
+ import evaluate
28
+ import os
29
+ import transformers
30
+ import inspect
31
+ import joblib
32
+
33
+ from pathlib import Path
34
+ from torch.utils.data import Dataset, DataLoader
35
+ from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline
36
+
37
+ from sklearn.model_selection import train_test_split
38
+ from sklearn.base import BaseEstimator, TransformerMixin
39
+ from sklearn.metrics import accuracy_score, f1_score
40
+ from sklearn.utils import resample
41
+
42
+ # ----------------------------
43
+ # Konfigurasi
44
+ # ----------------------------
45
+ INPUT_PATH = "/kaggle/input/sentimentdataset/dataset_gabungan.csv"
46
+
47
+ # Jika kamu pakai model cased (mis. indobenchmark/indobert-base-p2), set ke False
48
+ APPLY_LOWERCASE = True
49
+
50
+ # Batasi huruf berulang (contoh: "baguuuusss" -> "baguus")
51
+ LIMIT_REPEAT_CHARS = True
52
+ MAX_REPEAT = 2
53
+
54
+ # Nama kolom (biarkan None agar ditebak otomatis)
55
+ TEXT_COL = None
56
+ LABEL_COL = None
57
+
58
+ # Label yang didukung (akan dinormalisasi ke bentuk ini)
59
+ CANON_LABELS = {"positif": "positif", "positive": "positif", "pos": "positif", 'positi': 'positif',
60
+ "negatif": "negatif", "negative": "negatif", "neg": "negatif", 'negartif': 'negatif',
61
+ "netral": "netral", "neutral": "netral", "neu": "netral", 'netr' : 'netral'}
62
+
63
+ # ----------------------------
64
+ # Utilitas
65
+ # ----------------------------
66
def guess_column(df: pd.DataFrame, candidates):
    """Pick the column of *df* that most likely holds the wanted field.

    The first name in *candidates* that exists in ``df.columns`` wins. When
    none of them is present, fall back to the first text-typed column (in
    column order) and, failing that, to the very first column of the frame.

    Args:
        df: Frame whose columns are inspected.
        candidates: Column names to try, in priority order.

    Returns:
        The label of the chosen column.
    """
    for name in candidates:
        if name in df.columns:
            return name

    def _is_text_dtype(dtype) -> bool:
        # Text columns may be plain ``object`` or pandas' dedicated string
        # dtype (the default for strings in newer pandas versions).
        return dtype == "object" or isinstance(dtype, pd.StringDtype)

    # Fallback: the FIRST text-typed column (not the longest one).
    text_cols = [c for c in df.columns if _is_text_dtype(df[c].dtype)]
    return text_cols[0] if text_cols else df.columns[0]
73
+
74
+ url_pattern = re.compile(r"(https?://\S+|www\.\S+)")
75
+ mention_pattern = re.compile(r"@\w+")
76
+ hashtag_pattern = re.compile(r"#(\w+)")
77
+ multi_space_pattern = re.compile(r"\s+")
78
+ rt_fw_pattern = re.compile(r"\b(rt|fw|fwd)\b[:]?", flags=re.IGNORECASE)
79
+
80
+ # Optional: pola khusus yang sering ada di data komentar (hapus segmen "author ... comment")
81
+ author_comment_pattern = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)
82
+
83
def limit_repeated_chars(text: str, max_repeat: int = 2) -> str:
    """Shorten runs of the same letter to at most *max_repeat* occurrences.

    Only letters are affected, so elongated words are normalised
    ("baguuuusss" -> "baguuss") while numbers such as "1000000" keep their
    value.

    Args:
        text: Input string.
        max_repeat: Maximum allowed length of a run of one letter (>= 1).

    Returns:
        The text with over-long letter runs shortened.

    Raises:
        ValueError: If *max_repeat* is smaller than 1.
    """
    if max_repeat < 1:
        raise ValueError("max_repeat must be >= 1")
    # [^\W\d_] == "word character that is neither a digit nor underscore",
    # i.e. a letter.  \1{n,} then requires n more copies of that same letter.
    pattern = r"([^\W\d_])\1{%d,}" % max_repeat
    return re.sub(pattern, lambda m: m.group(1) * max_repeat, text)
85
+
86
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that cleans raw text and normalises labels.

    Text cleaning relies on the module-level regex patterns and on
    ``remove_html_elements`` / ``limit_repeated_chars``.

    Args:
        apply_lowercase: Lower-case the cleaned text when true.
        limit_repeat: Shorten repeated characters when true.
        max_repeat: Run length passed to ``limit_repeated_chars``.
        canon_labels: Mapping from raw label spelling (lower-cased, stripped)
            to canonical label; an empty mapping is used when omitted.
    """

    def __init__(self,
                 apply_lowercase=True,
                 limit_repeat=True,
                 max_repeat=2,
                 canon_labels=None):
        self.apply_lowercase = apply_lowercase
        self.limit_repeat = limit_repeat
        self.max_repeat = max_repeat
        self.canon_labels = canon_labels or {}

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the sklearn API."""
        return self

    def transform(self, X, y=None):
        """Return a Series with every element of *X* cleaned."""
        # Missing values are turned into "" here so that _clean_text only
        # ever has to deal with strings.
        series = pd.Series(X)
        series = series.fillna("").astype(str)
        return series.apply(self._clean_text)

    def transform_labels(self, y):
        """Map raw labels to canonical ones; ``None`` is passed through."""
        if y is None:
            return None
        return pd.Series(y).astype(str).apply(self._normalize_label)

    def _normalize_label(self, x):
        """Look up one label in ``canon_labels``; unknown labels give None."""
        if pd.isna(x):
            return None
        key = str(x).strip().lower()
        return self.canon_labels.get(key, None)

    def _clean_text(self, t: str) -> str:
        """Clean a single text; non-string input yields an empty string."""
        if not isinstance(t, str):
            return ""

        # Drop "author ... comment" segments, then HTML tags/attributes,
        # then decode any remaining HTML entities.
        cleaned = author_comment_pattern.sub("", t)
        cleaned = remove_html_elements(cleaned)
        cleaned = html.unescape(cleaned)

        # Ordered substitutions: URLs and mentions become placeholders,
        # "#word" becomes "word", RT/FW markers are blanked out.
        substitutions = (
            (url_pattern, " <url> "),
            (mention_pattern, " <user> "),
            (hashtag_pattern, r"\1"),
            (rt_fw_pattern, " "),
        )
        for pattern, replacement in substitutions:
            cleaned = pattern.sub(replacement, cleaned)

        # Keep ASCII letters, digits and whitespace only.  This also strips
        # the angle brackets of the placeholders inserted above.
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", cleaned)

        # Collapse whitespace runs and trim the ends.
        cleaned = multi_space_pattern.sub(" ", cleaned).strip()

        if self.apply_lowercase:
            cleaned = cleaned.lower()

        if self.limit_repeat:
            cleaned = limit_repeated_chars(cleaned, self.max_repeat)

        return cleaned
155
+
156
+
157
def remove_html_elements(text: str) -> str:
    """Strip HTML markup from *text* and normalise its whitespace.

    Steps, in order: decode HTML entities, blank out complete ``<tag>``
    spans (``TAG_RE``), blank out stray ``name=value`` attributes
    (``ATTR_RE``), blank out leftover ``<`` / ``>`` characters, and finally
    collapse whitespace.

    Args:
        text: Raw text; anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned, stripped text.
    """
    if not isinstance(text, str):
        return ""

    decoded = html.unescape(text)                       # &amp; -> & etc.
    without_tags = TAG_RE.sub(" ", decoded)              # complete <tag>s
    without_attrs = ATTR_RE.sub(" ", without_tags)       # stray attributes
    without_brackets = re.sub(r"[<>]", " ", without_attrs)
    return re.sub(r"\s+", " ", without_brackets).strip()
177
+
178
+ # regex: hapus <tag> beserta isinya
179
+ TAG_RE = re.compile(r"<[^>]+>")
180
+
181
+ # regex: hapus atribut-atribut html yang sering nyangkut
182
+ ATTR_RE = re.compile(r"\b(class|id|style|role|tabindex|href|src|alt)=[^\s>]+", flags=re.IGNORECASE)
183
+
184
+ # ----------------------------
185
+ # Load
186
+ # ----------------------------
187
+ path = Path(INPUT_PATH)
188
+ if not path.exists():
189
+ raise FileNotFoundError(f"File tidak ditemukan: {path.resolve()}")
190
+
191
+ df = pd.read_csv(path)
192
+
193
+ # ----------------------------
194
+ # Tentukan kolom teks & label
195
+ # ----------------------------
196
+ if TEXT_COL is None:
197
+ TEXT_COL = guess_column(df, ["text", "tweet", "content", "sentence", "caption", "judul", "deskripsi"])
198
+ if LABEL_COL is None:
199
+ LABEL_COL = guess_column(df, ["label", "sentiment", "polarity", "target", "kelas"])
200
+
201
+ print(f"Kolom teks terdeteksi : {TEXT_COL}")
202
+ print(f"Kolom label terdeteksi: {LABEL_COL}")
203
+
204
+ # ----------------------------
205
+ # Load Preproc
206
+ # ----------------------------
207
+
208
+ preproc = TextPreprocessor(
209
+ apply_lowercase=APPLY_LOWERCASE,
210
+ limit_repeat=LIMIT_REPEAT_CHARS,
211
+ max_repeat=MAX_REPEAT,
212
+ canon_labels=CANON_LABELS
213
+ )
214
+
215
+ # ----------------------------
216
+ # Penggunaan Preproc
217
+ # ----------------------------
218
+ # fit_transform teks
219
+ df["text"] = preproc.fit_transform(df[TEXT_COL])
220
+ df["sentiment"] = preproc.transform_labels(df[LABEL_COL])
221
+
222
+ # ----------------------------
223
+ # Drop Data jika Text Kosong
224
+ # ----------------------------
225
+ df = df[df["text"].str.strip().ne("")]
226
+
227
+ # ----------------------------
228
+ # Tampilkan contoh label tak dikenal
229
+ # ----------------------------
230
+ unknown = df[df["sentiment"].isna()]
231
+ print("\nContoh label tak dikenal yang akan dibuang:")
232
+ print(unknown[[LABEL_COL]].value_counts()) # tampilkan 10 teratas
233
+
234
+ # Buang label tak dikenal
235
+ before = len(df)
236
+ df = df[df["sentiment"].notna()]
237
+ dropped_unknown = before - len(df)
238
+
239
+ # ----------------------------
240
+ # Hapus duplikasi (berdasarkan teks bersih)
241
+ # ----------------------------
242
+ df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
243
+
244
+ # ----------------------------
245
+ # Ringkasan
246
+ # ----------------------------
247
+ print("\nRingkasan setelah preprocessing:")
248
+ print(f" - Baris total : {len(df)}")
249
+ print(f" - Dibuang label tak dikenal: {dropped_unknown}")
250
+ print(" - Distribusi label:")
251
+ print(df["sentiment"].value_counts(dropna=False))
252
+
253
+ # Contoh pratinjau
254
+ print("\nContoh 5 baris:")
255
+ print(df[[TEXT_COL, "text", LABEL_COL, "sentiment"]].head(5))
256
+
257
+ # df.to_csv('/content/drive/MyDrive/Machine Learning/Latih Model/bersihhh.csv')
258
+
259
+ # ----------------------------
260
+ # Save Preproc
261
+ # ----------------------------
262
+
263
+ joblib.dump(preproc, "preprocessor.joblib")
264
+
265
+ # ============================
266
+ # PERBAIKAN LABEL BERDASARKAN KATA KUNCI
267
+ # ============================
268
+
269
+ # Definisikan kamus kata kunci untuk tiap label
270
NEGATIVE_KEYWORDS = {
    # Profanity / slang
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
    "kik", "goblog", "kntl",

    # Formal vocabulary
    "buruk", "lemah", "rendah", "gagal", "hancur", "rusak", "cacat",
    "jahat", "dusta", "bohong", "fitnah", "korup", "curang", "palsu",
    "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
}

POSITIVE_KEYWORDS = {
    # General positive words
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
    "sempurna", "berhasil", "luas", "indah"
}

NEUTRAL_KEYWORDS = {
    # Neutral / generic words
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
}


def _compile_keyword_pattern(keywords):
    """Build one regex matching any keyword as a whole word or phrase."""
    # Longest first so multi-word phrases win over their own sub-words;
    # the secondary sort key keeps the pattern deterministic.
    ordered = sorted(keywords, key=lambda k: (-len(k), k))
    alternation = "|".join(re.escape(k) for k in ordered)
    # Look-arounds instead of plain substring tests: "salah" must not fire
    # inside "masalah", nor "ok" inside "tokoh".
    return re.compile(r"(?<!\w)(?:%s)(?!\w)" % alternation)


# Rules are evaluated in this order: negative, then positive, then neutral.
# NOTE: patterns are compiled once here; later edits to the keyword sets
# are not picked up automatically.
_KEYWORD_RULES = (
    ("negatif", _compile_keyword_pattern(NEGATIVE_KEYWORDS)),
    ("positif", _compile_keyword_pattern(POSITIVE_KEYWORDS)),
    ("netral", _compile_keyword_pattern(NEUTRAL_KEYWORDS)),
)


def correct_label(row):
    """Override a row's sentiment label when its text contains a keyword.

    Keywords are matched as whole words/phrases (case-sensitively, the text
    is expected to be pre-cleaned).  Affixed forms of a keyword are therefore
    not matched.  The first rule that matches decides the label.

    Args:
        row: Mapping-like object with ``"text"`` and ``"sentiment"`` entries
            (e.g. a DataFrame row).

    Returns:
        ``"negatif"``, ``"positif"`` or ``"netral"`` when a keyword matches,
        otherwise the row's original ``"sentiment"`` value.
    """
    text = row["text"]
    label = row["sentiment"]

    if isinstance(text, str):
        for forced_label, pattern in _KEYWORD_RULES:
            if pattern.search(text):
                return forced_label

    # No rule matched (or the text is not a string): keep the original label.
    return label
311
+
312
+ # Terapkan perbaikan
313
+ df["sentiment"] = df.apply(correct_label, axis=1)
314
+
315
+ # Ringkasan distribusi setelah perbaikan
316
+ print("\nDistribusi label setelah perbaikan:")
317
+ print(df["sentiment"].value_counts())
318
+
319
# ============================
# SPLIT DATASET (train/val/test)
# ============================
# The split is done BEFORE oversampling.  Oversampling duplicates rows; if it
# ran first, copies of one text could land in both the training and the
# validation/test parts and inflate the reported scores.

# 1. 80% train, 20% held out
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["sentiment"]
)

# 2. Held-out 20% -> validation (10%) + test (10%)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["sentiment"]
)

# ============================
# BALANCE THE TRAINING PART ONLY
# ============================
# Separate the training rows per class
df_negatif = train_df[train_df["sentiment"] == "negatif"]
df_positif = train_df[train_df["sentiment"] == "positif"]
df_netral = train_df[train_df["sentiment"] == "netral"]

# Target size per class: match the neutral class
target_count = df_netral.shape[0]

# Resample the positive & negative classes to the target size
df_negatif_over = resample(df_negatif,
                           replace=True,
                           n_samples=target_count,
                           random_state=42)

df_positif_over = resample(df_positif,
                           replace=True,
                           n_samples=target_count,
                           random_state=42)

# Recombine into the balanced training frame
df_balanced = pd.concat([df_netral, df_negatif_over, df_positif_over])

print("Distribusi setelah balancing:")
print(df_balanced["sentiment"].value_counts())

# ============================
# LABEL DISTRIBUTION PLOTS (balanced training data)
# ============================
label_counts = df_balanced["sentiment"].value_counts()

# -------- Bar chart --------
plt.figure(figsize=(6,4))
label_counts.plot(kind="bar", color=["red","green","blue"])
plt.title("Distribusi Sentimen")
plt.xlabel("Label")
plt.ylabel("Jumlah")
plt.xticks(rotation=0)
plt.show()

print('\n')

# -------- Pie chart --------
plt.figure(figsize=(5,5))
label_counts.plot(kind="pie", autopct='%1.1f%%', startangle=90, colors=["red","green","blue"])
plt.title("Persentase Sentimen")
plt.ylabel("")  # hide the Y label
plt.show()

# ============================
# ARRAYS USED BY THE TRAINING CODE
# ============================
X_train = df_balanced["text"].values
y_train = df_balanced["sentiment"].values
X_val = val_df["text"].values
y_val = val_df["sentiment"].values
X_test = test_df["text"].values
y_test = test_df["sentiment"].values

# Report the split sizes
print("Ukuran dataset:")
print(f"Train: {len(X_train)}")
print(f"Validation: {len(X_val)}")
print(f"Test: {len(X_test)}")
392
+
393
+ # ============================
394
+ # FINE-TUNING IndoBERT
395
+ # ============================
396
+
397
+ # pastikan pakai GPU kalau tersedia
398
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
399
+ print("Device:", device)
400
+
401
# SECURITY: the W&B API key must never be committed to source control.  It is
# expected to be provided through the WANDB_API_KEY environment variable
# (e.g. a Kaggle/Colab secret).  The key that used to be hard-coded here has
# been published and should be revoked.
# W&B logging is enabled only when a key is actually available.
os.environ["WANDB_DISABLED"] = "false" if os.environ.get("WANDB_API_KEY") else "true"
403
+
404
+ # ----------------------------
405
+ # 1. Tokenizer & Label Encoding
406
+ # ----------------------------
407
+ MODEL_NAME = "indobenchmark/indobert-base-p1" # model IndoBERT pre-trained
408
+
409
+ tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
410
+
411
+ # mapping label ke angka
412
label2id = {"negatif": 0, "netral": 1, "positif": 2}
# Reverse lookup: class index -> label name
id2label = {index: name for name, index in label2id.items()}


def encode_labels(labels):
    """Translate label names into their integer class ids.

    Args:
        labels: Iterable of label names present in ``label2id``.

    Returns:
        List of integer ids, in the same order as *labels*.

    Raises:
        KeyError: If a label is not a key of ``label2id``.
    """
    return list(map(label2id.__getitem__, labels))
417
+
418
+ y_train_enc = encode_labels(y_train)
419
+ y_val_enc = encode_labels(y_val)
420
+ y_test_enc = encode_labels(y_test)
421
+
422
+ # ----------------------------
423
+ # 2. Dataset class
424
+ # ----------------------------
425
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of integer class ids, aligned with
            *texts*.
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors="pt"; a bare squeeze() would also drop the
            # sequence axis when it has length 1.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }
452
+
453
+ train_dataset = SentimentDataset(X_train, y_train_enc, tokenizer)
454
+ val_dataset = SentimentDataset(X_val, y_val_enc, tokenizer)
455
+ test_dataset = SentimentDataset(X_test, y_test_enc, tokenizer)
456
+
457
+ # ----------------------------
458
+ # 3. Model
459
+ # ----------------------------
460
+ model = BertForSequenceClassification.from_pretrained(
461
+ MODEL_NAME,
462
+ num_labels=3,
463
+ id2label=id2label,
464
+ label2id=label2id
465
+ ).to(device)
466
+
467
+ # ----------------------------
468
+ # 4. Training Arguments
469
+ # ----------------------------
470
+ training_args = TrainingArguments(
471
+ output_dir="./results",
472
+ per_device_train_batch_size=32,
473
+ per_device_eval_batch_size=32,
474
+ num_train_epochs=5, # cukup 10–15, early stopping yang handle
475
+ learning_rate=2e-5, # lebih kecil β†’ stabil
476
+ weight_decay=0.05, # lebih besar β†’ regularisasi
477
+ warmup_ratio=0.1, # 10% step awal dipakai warmup
478
+ logging_dir="./logs",
479
+ logging_steps=500,
480
+ save_total_limit=2,
481
+ eval_strategy="epoch", # evaluasi setiap epoch
482
+ save_strategy="epoch", # simpan juga setiap epoch
483
+ load_best_model_at_end=True,
484
+ metric_for_best_model="f1",
485
+ greater_is_better=True
486
+ )
487
+
488
+
489
+ # ----------------------------
490
+ # 5. Metrics
491
+ # ----------------------------
492
+
493
+ metric_acc = evaluate.load("accuracy")
494
+ metric_f1 = evaluate.load("f1")
495
+
496
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element iterable of logits and reference labels.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)
    accuracy = metric_acc.compute(predictions=predicted, references=labels)["accuracy"]
    weighted_f1 = metric_f1.compute(
        predictions=predicted, references=labels, average="weighted"
    )["f1"]
    return {"accuracy": accuracy, "f1": weighted_f1}
502
+
503
+ # ----------------------------
504
+ # 6. Trainer
505
+ # ----------------------------
506
+ trainer = Trainer(
507
+ model=model,
508
+ args=training_args,
509
+ train_dataset=train_dataset,
510
+ eval_dataset=val_dataset,
511
+ compute_metrics=compute_metrics,
512
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] # stop kalau 2 epoch tidak membaik
513
+ )
514
+
515
+ # ----------------------------
516
+ # 7. Mulai Training
517
+ # ----------------------------
518
+ trainer.train()
519
+
520
+ # =============================
521
+ # 8. Evaluasi & Simpan Prediksi
522
+ # =============================
523
+
524
+ # hasil prediksi di test set
525
+ pred_results = trainer.predict(test_dataset)
526
+
527
+ # ambil logits β†’ konversi ke label prediksi
528
+ pred_logits = pred_results.predictions
529
+ pred_labels = np.argmax(pred_logits, axis=1)
530
+
531
+ # konversi angka ke label teks
532
+ pred_text_labels = [id2label[i] for i in pred_labels]
533
+ true_text_labels = [id2label[i] for i in y_test_enc]
534
+
535
+ # gabungkan dengan teks asli
536
+ df_test_results = pd.DataFrame({
537
+ "text": X_test,
538
+ "true_label": true_text_labels,
539
+ "predicted_label": pred_text_labels
540
+ })
541
+
542
+ # simpan ke CSV
543
+ df_test_results.to_csv("test_predictions.csv", index=False)
544
+ print("βœ… Hasil prediksi test set sudah disimpan ke test_predictions.csv")
545
+
546
+ # ============================
547
+ # EVALUASI & SIMPAN MODEL
548
+ # ============================
549
+
550
+ # 1. Evaluasi di test set
551
+ print("\nEvaluasi di Test Set:")
552
+ test_result = trainer.evaluate(test_dataset)
553
+ print(test_result)
554
+
555
+ # 2. Prediksi label test set (opsional, untuk analisis lebih lanjut)
556
+ predictions = trainer.predict(test_dataset)
557
+ pred_labels = np.argmax(predictions.predictions, axis=-1)
558
+
559
+ # contoh lihat 10 prediksi pertama
560
+ for i in range(10):
561
+ print(f"Teks: {X_test[i]}")
562
+ print(f"Label Asli: {id2label[y_test_enc[i]]} | Prediksi: {id2label[pred_labels[i]]}")
563
+ print("---")
564
+
565
+ # 3. Simpan model + tokenizer
566
+ SAVE_DIR = "./indoBERT-sentiment"
567
+
568
+ trainer.save_model(SAVE_DIR)
569
+ tokenizer.save_pretrained(SAVE_DIR)
570
+
571
+ print(f"\nModel & tokenizer sudah disimpan ke: {SAVE_DIR}")
572
+
573
+ # ==========================
574
+ # LOAD MODEL & TOKENIZER
575
+ # ==========================
576
+ MODEL_DIR = "./indoBERT-sentiment"
577
+
578
+ tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
579
+ model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
580
+
581
+ device = 0 if torch.cuda.is_available() else -1
582
+ sentiment_pipeline = pipeline(
583
+ "text-classification",
584
+ model=model,
585
+ tokenizer=tokenizer,
586
+ device=device
587
+ )
588
+
589
+ # load preprocessor yang sudah disimpan
590
+ preproc = joblib.load("preprocessor.joblib")
591
+
592
+ # ==========================
593
+ # FUNGSI PREDIKSI
594
+ # ==========================
595
def predict_text(text):
    """Classify one text with ``sentiment_pipeline``.

    Args:
        text: Text to classify.

    Returns:
        The ``"label"`` of the pipeline's first result, or ``"EMPTY"`` when
        *text* is not a string or contains only whitespace.
    """
    is_blank = not isinstance(text, str) or text.strip() == ""
    if is_blank:
        return "EMPTY"
    outputs = sentiment_pipeline(text, truncation=True, max_length=512)
    return outputs[0]["label"]
600
+
601
+ # ==========================
602
+ # PREDIKSI FILE 1 (MEDIA SOSIAL)
603
+ # ==========================
604
+ file1 = pd.read_csv("/kaggle/input/sentimentdataset/gabungan (1).csv")
605
+
606
+ # Preprocessing caption
607
+ file1["caption"] = preproc.transform(file1["caption"])
608
+
609
+ # Preprocessing comment
610
+ file1["comments"] = preproc.transform(file1["comments"])
611
+
612
+ # drop NaN biar aman
613
+ file1 = file1.dropna(subset=["caption", "comments"])
614
+
615
+ outputs1 = []
616
+
617
+ for idx, row in file1.iterrows():
618
+ print(f"[File1] Proses baris {idx+1}/{len(file1)}")
619
+
620
+ # caption
621
+ caption_text = str(row["caption"]).strip()
622
+ caption_pred = predict_text(caption_text)
623
+
624
+ # comments
625
+ comments_text = str(row["comments"]).strip()
626
+ comments_pred_label = predict_text(comments_text)
627
+
628
+ outputs1.append({
629
+ "link": row.get("link", ""), # simpan link medsos
630
+ "caption": caption_text,
631
+ "caption_pred": caption_pred,
632
+ "comments_pred": comments_text, # simpan teks asli komentar
633
+ "comments_summary": comments_pred_label # hasil prediksi sentimen komentar
634
+ })
635
+
636
+ df_out1 = pd.DataFrame(outputs1)
637
+ df_out1.to_csv("medsos.csv", index=False, encoding="utf-8-sig")
638
+ print("βœ… Hasil prediksi file1 sudah disimpan ke medsos.csv")
639
+
640
+ # ==========================
641
+ # PREDIKSI FILE 2 (BERITA)
642
+ # ==========================
643
+ file2 = pd.read_csv("/kaggle/input/sentimentdataset/berita2 (1).csv")
644
+
645
+ # Preprocessing judul
646
+ file2["judul"] = preproc.transform(file2["judul"])
647
+
648
+ # Preprocessing tag (βœ… perbaikan: tidak menimpa judul)
649
+ file2["tag"] = preproc.transform(file2["tag"])
650
+
651
+ # Preprocessing isi_berita
652
+ file2["isi_berita"] = preproc.transform(file2["isi_berita"])
653
+
654
+ # drop NaN biar aman
655
+ file2 = file2.dropna(subset=["judul", "tag", "isi_berita"])
656
+
657
+ outputs2 = []
658
+
659
+ for idx, row in file2.iterrows():
660
+ print(f"[File2] Proses baris {idx+1}/{len(file2)}")
661
+
662
+ combined_text = f"{row['judul']} {row['tag']} {row['isi_berita']}"
663
+ pred = predict_text(combined_text)
664
+
665
+ outputs2.append({
666
+ "link": row.get("link", ""), # simpan link berita
667
+ "judul": row["judul"],
668
+ "tag": row["tag"],
669
+ "isi_berita": row["isi_berita"],
670
+ "prediction": pred
671
+ })
672
+
673
+ df_out2 = pd.DataFrame(outputs2)
674
+ df_out2.to_csv("berita.csv", index=False, encoding="utf-8-sig")
675
+ print("βœ… Hasil prediksi file2 sudah disimpan ke berita.csv")
services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Empty init file to make 'services' a proper Python package
services/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (158 Bytes). View file
 
services/__pycache__/facebook.cpython-311.pyc ADDED
Binary file (11.7 kB). View file
 
services/__pycache__/medos.cpython-311.pyc ADDED
Binary file (15.3 kB). View file
 
services/__pycache__/news.cpython-311.pyc ADDED
Binary file (22.8 kB). View file
 
services/__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (5.7 kB). View file
 
services/__pycache__/sentiment.cpython-311.pyc ADDED
Binary file (5.12 kB). View file
 
services/__pycache__/tiktok.cpython-311.pyc ADDED
Binary file (11.1 kB). View file
 
services/__pycache__/wordcloud_service.cpython-311.pyc ADDED
Binary file (5.59 kB). View file
 
services/_driver.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ _driver.py – Shared Selenium Chrome driver factory.
3
+ All scrapers import _create_driver() from here so that Docker env-vars
4
+ (CHROME_BIN, CHROMEDRIVER_PATH) are respected in one place.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+
10
+ from selenium import webdriver
11
+ from selenium.webdriver.chrome.service import Service
12
+
13
+
14
def _create_driver(mobile: bool = False) -> webdriver.Chrome:
    """
    Return a headless Chrome/Chromium instance tuned for Docker.

    Picks up:
        CHROME_BIN        - path to chromium binary (default: /usr/bin/chromium)
        CHROMEDRIVER_PATH - path to chromedriver    (default: /usr/bin/chromedriver)

    Args:
        mobile: Use an iPhone Safari user agent instead of desktop Chrome.

    Returns:
        A started ``webdriver.Chrome``; the caller is responsible for
        calling ``quit()`` on it.

    Raises:
        Exception: Whatever Selenium raises while starting or configuring
            the browser.  If configuration fails after the browser has
            started, the browser is shut down before re-raising.
    """
    chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/chromium")
    driver_bin = os.environ.get("CHROMEDRIVER_PATH", "/usr/bin/chromedriver")

    options = webdriver.ChromeOptions()
    options.binary_location = chrome_bin

    # Headless & sandbox flags
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-software-rasterizer",
        "--disable-extensions",
        "--disable-infobars",
        "--disable-notifications",
        "--disable-popup-blocking",
        "--disable-blink-features=AutomationControlled",
        "--ignore-certificate-errors",
        "--window-size=1920,1080",
        "--remote-debugging-port=0",  # avoid port conflicts
    ):
        options.add_argument(flag)

    # User-Agent
    if mobile:
        options.add_argument(
            "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
        )
    else:
        options.add_argument(
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        )

    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    options.add_experimental_option("useAutomationExtension", False)

    service = Service(executable_path=driver_bin)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Hide webdriver fingerprint
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"},
        )
    except Exception:
        # The browser process is already running; do not leak it when the
        # post-launch configuration fails.
        driver.quit()
        raise
    return driver
services/facebook.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ facebook.py – Facebook group scraper using Selenium.
3
+ Exports: scrape_facebook(username, password, groups) -> list[dict]
4
+
5
+ Returns structured data per-post:
6
+ group_name, group_url, post_url, author, caption, comments
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import time
13
+
14
+ from selenium.webdriver.common.by import By
15
+ from selenium.webdriver.support.ui import WebDriverWait
16
+ from selenium.webdriver.support import expected_conditions as EC
17
+
18
+ from ._driver import _create_driver
19
+
20
+ COOKIES_FILE = "fb_cookies.json"
21
+ FB_BASE = "https://www.facebook.com"
22
+ MOBILE_FB = "https://m.facebook.com"
23
+
24
+
25
+ # ── Cookie helpers ─────────────────────────────────────────────────────────────
26
+
27
+ def _save_cookies(driver, path: str) -> None:
28
+ try:
29
+ with open(path, "w") as f:
30
+ json.dump(driver.get_cookies(), f)
31
+ except Exception as e:
32
+ print(f"[Facebook] Gagal simpan cookies: {e}")
33
+
34
+
35
+ def _load_cookies(driver, path: str) -> bool:
36
+ if not os.path.exists(path) or os.path.getsize(path) == 0:
37
+ return False
38
+ try:
39
+ with open(path, "r") as f:
40
+ cookies = json.load(f)
41
+ for cookie in cookies:
42
+ try:
43
+ driver.add_cookie(cookie)
44
+ except Exception:
45
+ pass
46
+ return True
47
+ except Exception as e:
48
+ print(f"[Facebook] Gagal load cookies: {e}")
49
+ return False
50
+
51
+
52
+ # ── Login ──────────────────────────────────────────────────────────────────────
53
+
54
def _fb_login(driver, username: str, password: str) -> bool:
    """Log the driver's session into Facebook.

    Order of attempts:
      1. Stored cookies from ``COOKIES_FILE`` (if that file exists).
      2. The login form on the mobile site.
      3. The login form at ``{FB_BASE}/login.php`` if step 2 raised.

    After a form login the session cookies are saved to ``COOKIES_FILE``.

    Args:
        driver: Selenium WebDriver instance.
        username: Account e-mail / user name typed into the form.
        password: Account password typed into the form.

    Returns:
        ``True`` when the final URL contains neither "login" nor
        "checkpoint"; ``False`` otherwise or when both form attempts raised.
    """
    wait = WebDriverWait(driver, 20)
    driver.get(MOBILE_FB)
    time.sleep(3)

    # 1) Try to resume a previous session from saved cookies.
    if os.path.exists(COOKIES_FILE):
        try:
            _load_cookies(driver, COOKIES_FILE)
            driver.refresh()
            time.sleep(4)
            if "login" not in driver.current_url and "checkpoint" not in driver.current_url:
                print("[Facebook] Login via cookies berhasil.")
                return True
            # Cookies were not accepted: start again with a clean session.
            driver.delete_all_cookies()
            driver.get(MOBILE_FB)
            time.sleep(2)
        except Exception as e:
            # Cookie login is optional; report and fall through to the form.
            print(f"[Facebook] Login via cookies gagal: {e}")

    # 2) Mobile login form.
    print("[Facebook] Login manual username/password...")
    try:
        email_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="email"]')))
        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')
        email_input.clear()
        email_input.send_keys(username)
        pass_input.clear()
        pass_input.send_keys(password)
        pass_input.send_keys("\n")
        time.sleep(1)

        try:
            login_btn = driver.find_element(By.CSS_SELECTOR, 'button[name="login"], [data-sigil="m_login_button"], input[type="submit"]')
            driver.execute_script("arguments[0].click();", login_btn)
        except Exception:
            # Best effort: the newline sent above may already have submitted
            # the form, in which case the button is gone.
            pass
    except Exception as e:
        print(f"[Facebook] Form login mobile gagal: {e}")
        # 3) Desktop login page as a fallback.
        try:
            driver.get(f"{FB_BASE}/login.php")
            time.sleep(3)
            email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
            pass_input = driver.find_element(By.ID, "pass")
            email_input.clear()
            email_input.send_keys(username)
            pass_input.clear()
            pass_input.send_keys(password)
            driver.find_element(By.NAME, "login").click()
        except Exception as e2:
            print(f"[Facebook] Form login desktop gagal: {e2}")
            return False

    time.sleep(6)
    if "login" in driver.current_url or "checkpoint" in driver.current_url:
        return False

    _save_cookies(driver, COOKIES_FILE)
    return True
109
+
110
+
111
def ensure_logged_in(driver, username, password):
    """Re-run the login when the current page shows a login prompt.

    A login is triggered when the current URL contains "login", or when one
    of the known login-prompt elements is present and displayed.  Errors
    are swallowed on purpose (best effort), but only ``Exception``
    subclasses, so Ctrl-C and interpreter shutdown still propagate.

    Args:
        driver: Selenium WebDriver instance.
        username: Passed through to ``_fb_login``.
        password: Passed through to ``_fb_login``.
    """
    # Elements whose visible presence indicates a login prompt.
    login_prompt_xpaths = (
        '//div[contains(text(),"See more on Facebook")]',
        '//input[@type="email" or @type="text"]',
    )
    try:
        url = driver.current_url
        if url and "login" in url:
            _fb_login(driver, username, password)
            return

        for xpath in login_prompt_xpaths:
            try:
                element = driver.find_element(By.XPATH, xpath)
                if element.is_displayed():
                    _fb_login(driver, username, password)
                    return
            except Exception:
                # Element missing (or login failed): try the next probe.
                pass
    except Exception:
        pass
132
+
133
+
134
+ # ── Scraping ───────────────────────────────────────────────────────────────────
135
+
136
# Upper bound on "load more" clicks per expander, so a button that never
# disappears cannot hang the scraper (each click is followed by a 2 s sleep).
_MAX_EXPAND_CLICKS = 30


def _find_post_permalink(post, group_url):
    """Return the permalink of *post* (query string stripped), or None.

    Tries a ``/posts/`` link, then a ``/permalink/`` link, then the
    ``top_level_post_id`` stored in the element's ``data-ft`` attribute.
    """
    for fragment in ("/posts/", "/permalink/"):
        try:
            link_el = post.find_element(By.XPATH, f".//a[contains(@href,'{fragment}')]")
            return link_el.get_attribute("href").split("?")[0]
        except Exception:
            continue
    try:
        raw = post.get_attribute("data-ft")
        if raw and "top_level_post_id" in raw:
            pid = json.loads(raw).get("top_level_post_id")
            if pid:
                return f"{group_url.rstrip('/').split('?')[0]}/posts/{pid}/"
    except Exception:
        pass
    return None


def _read_post_author(post_context):
    """Return the author name found in *post_context*, or "Unknown"."""
    author_xpaths = (
        ".//h2//span//span",
        ".//strong//span",
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
    )
    for xpath in author_xpaths:
        try:
            return post_context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
    return "Unknown"


def _click_while_present(driver, context, xpath, max_clicks=_MAX_EXPAND_CLICKS):
    """Click the element matching *xpath* until it disappears or the cap is hit."""
    for _ in range(max_clicks):
        try:
            btn = context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(2)
        except Exception:
            break


def _close_extra_tab(driver):
    """Close the current tab and return to the first one if more than one is open."""
    if len(driver.window_handles) > 1:
        driver.close()
        driver.switch_to.window(driver.window_handles[0])


def _scrape_group(driver, username, password, group_url: str, max_scrolls: int = 5) -> list:
    """Scrape posts from a single Facebook group URL.

    The group feed is scrolled up to *max_scrolls* times. Every
    ``div[role="article"]`` found is opened in a new tab (via its permalink
    when one can be determined, otherwise via the group URL itself), its
    comments are expanded, and caption/comments are read from that tab.

    Args:
        driver: Selenium WebDriver for the active session.
        username: Facebook username, used to re-login when a login wall appears.
        password: Facebook password, used to re-login when a login wall appears.
        group_url: URL of the group; ``m.`` and ``web.`` hosts are rewritten
            to ``www.facebook.com``.
        max_scrolls: Maximum number of scroll passes over the feed.

    Returns:
        A list of dicts with keys ``group_name``, ``group_url``, ``post_url``,
        ``author``, ``caption`` and ``comments``. Posts with neither caption
        nor comments are omitted. Empty when the group cannot be opened.
    """
    posts: list = []
    # Every scroll pass re-queries all articles on the page, so remember which
    # permalinks were already opened to avoid re-scraping and duplicating them.
    seen_permalinks: set = set()

    group_url = group_url.replace("m.facebook.com", "www.facebook.com").replace("web.facebook.com", "www.facebook.com")
    print(f"[Facebook] Scraping grup: {group_url}")

    try:
        driver.get(group_url)
        time.sleep(6)
        ensure_logged_in(driver, username, password)
    except Exception as e:
        print(f"[Facebook] Gagal buka grup: {e}")
        return posts

    group_name = group_url.split("/")[-1] if not group_url.endswith("/") else group_url.split("/")[-2]
    last_height = driver.execute_script("return document.body.scrollHeight")

    for scroll_n in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        ensure_logged_in(driver, username, password)

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"[Facebook] Scroll {scroll_n + 1} → {len(post_elements)} artikel ditemukan")

        for post in post_elements:
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", post)
                time.sleep(1)

                permalink = _find_post_permalink(post, group_url) or group_url
                if permalink != group_url:
                    if permalink in seen_permalinks:
                        continue
                    seen_permalinks.add(permalink)

                try:
                    # Pass the URL as a script argument instead of splicing it
                    # into the JS source, so quotes in the URL cannot break it.
                    driver.execute_script("window.open(arguments[0], '_blank');", permalink)
                    time.sleep(1)
                    driver.switch_to.window(driver.window_handles[-1])
                    time.sleep(3)
                    ensure_logged_in(driver, username, password)
                    post_context = driver.find_element(By.XPATH, "//div[@role='article']")
                except Exception:
                    post_context = None

                author = "Unknown"
                caption = ""
                comments = []
                if post_context:
                    author = _read_post_author(post_context)

                    # Expand the comment list, then the reply threads.
                    _click_while_present(
                        driver, post_context,
                        ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]",
                    )
                    _click_while_present(
                        driver, post_context,
                        ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]",
                    )

                    try:
                        blocks = post_context.find_elements(By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']")
                        caption = "\n".join([b.text.strip() for b in blocks if b.text.strip()])[:2000]
                    except Exception:
                        pass
                    try:
                        comment_blocks = post_context.find_elements(By.XPATH, ".//div[@aria-label='Komentar' or @aria-label='Comment']//div[@dir='auto']")
                        seen_c = set()
                        for cb in comment_blocks:
                            c = cb.text.strip()
                            if c and c not in seen_c:
                                seen_c.add(c)
                                comments.append(c)
                    except Exception:
                        pass

                _close_extra_tab(driver)

                if caption or comments:
                    posts.append({
                        "group_name": group_name,
                        "group_url": group_url,
                        "post_url": permalink,
                        "author": author,
                        "caption": caption,
                        "comments": comments,
                    })
            except Exception as e:
                print(f"[Facebook] Error baca post: {e}")
                _close_extra_tab(driver)
                continue

        # Stop early once scrolling no longer loads additional content.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    return posts
269
+
270
+
271
+ # ── Public API ─────────────────────────────────────────────────────────────────
272
+
273
def scrape_facebook(username: str, password: str, groups: list | None = None) -> list:
    """Log in to Facebook and scrape posts from each group URL in *groups*.

    Args:
        username: Facebook username; required.
        password: Facebook password; required.
        groups: Group URLs to scrape. Blank entries are skipped.

    Returns:
        The concatenated results of ``_scrape_group`` for every group, or an
        empty list when credentials or groups are missing or login fails.
        A failure in one group does not discard the other groups' results.
    """
    if not username or not password:
        print("[Facebook] Username/password tidak disediakan.")
        return []

    if not groups:
        print("[Facebook] Tidak ada URL grup yang disediakan — skip.")
        return []

    driver = _create_driver(mobile=False)
    all_data: list = []

    try:
        if not _fb_login(driver, username, password):
            print("[Facebook] Login gagal, scraping dibatalkan.")
            return []

        for group_url in groups:
            if not group_url or not group_url.strip():
                continue
            try:
                all_data.extend(_scrape_group(driver, username, password, group_url.strip()))
            except Exception as e:
                # Keep going: one broken group must not abort the others.
                print(f"[Facebook] Gagal scraping grup {group_url}: {e}")

    except Exception as e:
        print(f"[Facebook] Fatal error: {e}")
    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception:
            pass

    print(f"[Facebook] Total article posts dari Facebook: {len(all_data)}")
    return all_data
services/medos.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ medos.py – Instagram scraper using Selenium.
3
+ Exports: scrape_medos(username, password, target_account, mode) -> list[str]
4
+
5
+ Strategy:
6
+ 1. Try saved cookies first (faster, avoids login throttling).
7
+ 2. Fall back to username/password login (the driver is created with mobile=False, i.e. not the mobile IG version).
8
+ 3. Collect post links from profile / hashtag page.
9
+ 4. Scrape caption + visible comments from each post.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import os
15
+ import time
16
+ from datetime import datetime, timedelta
17
+
18
+ from selenium.webdriver.common.by import By
19
+ from selenium.webdriver.support.ui import WebDriverWait
20
+ from selenium.webdriver.support import expected_conditions as EC
21
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
22
+
23
+ from ._driver import _create_driver
24
+
25
+ IG_BASE = "https://www.instagram.com/"
26
+
27
+
28
+ # ── Cookie helpers ─────────────────────────────────────────────────────────────
29
+
30
def _save_cookies(driver, path: str) -> None:
    """Write the driver's current cookies to *path* as indented UTF-8 JSON.

    Any failure is reported with a log line and otherwise ignored.
    """
    try:
        with open(path, "w", encoding="utf-8") as handle:
            cookie_jar = driver.get_cookies()
            json.dump(cookie_jar, handle, ensure_ascii=False, indent=2)
    except Exception as err:
        print(f"[Medos] Gagal simpan cookies: {err}")
36
+
37
+
38
def _load_cookies(driver, path: str) -> bool:
    """Replace the driver's Instagram cookies with those stored at *path*.

    Returns False when the file is missing, empty, or cannot be processed;
    True once the stored cookies have been offered to the driver (individual
    cookies the driver rejects are skipped).
    """
    file_has_content = os.path.exists(path) and os.path.getsize(path) != 0
    if not file_has_content:
        return False

    # Only these keys are forwarded to the driver; any other key is dropped.
    accepted_keys = {"name", "value", "domain", "path", "secure", "httpOnly", "expiry"}

    try:
        with open(path, "r", encoding="utf-8") as handle:
            stored = json.load(handle)

        # Cookies can only be set for the domain currently loaded.
        driver.get(IG_BASE)
        time.sleep(2)
        driver.delete_all_cookies()

        for entry in stored:
            cookie = {key: entry[key] for key in entry.keys() & accepted_keys}
            expiry = cookie.get("expiry")
            if "expiry" in cookie and isinstance(expiry, float):
                cookie["expiry"] = int(expiry)
            try:
                driver.add_cookie(cookie)
                continue
            except Exception:
                # Retry once without the domain restriction.
                cookie.pop("domain", None)
            try:
                driver.add_cookie(cookie)
            except Exception:
                pass
        return True
    except Exception as e:
        print(f"[Medos] Gagal load cookies: {e}")
        return False
63
+
64
+
65
def _is_logged_in(driver) -> bool:
    """Report whether the driver currently holds a cookie named ``sessionid``."""
    for cookie in driver.get_cookies():
        if cookie.get("name") == "sessionid":
            return True
    return False
68
+
69
+
70
+ # ── Login ──────────────────────────────────────────────────────────────────────
71
+
72
def _find_first_field(driver, selectors):
    """Return the first element matching any CSS selector in *selectors*, or None."""
    for sel in selectors:
        try:
            return driver.find_element(By.CSS_SELECTOR, sel)
        except NoSuchElementException:
            continue
    return None


def _login(driver, username: str, password: str, cookies_file: str) -> bool:
    """Log the driver in to Instagram, preferring saved cookies.

    Steps:
        1. Load cookies from *cookies_file*; if a ``sessionid`` cookie is
           present afterwards the session is reused.
        2. Otherwise open the login page, fill in the credentials and submit.
        3. Dismiss up to two "Not Now" pop-ups and persist the new cookies.

    Args:
        driver: Selenium WebDriver to authenticate.
        username: Instagram username.
        password: Instagram password.
        cookies_file: Path of the JSON cookie jar to read and write.

    Returns:
        True when the cookie session is valid or the browser left the login
        page after submitting; False when the login page did not load, the
        form fields were not found, or the redirect timed out.
    """
    # 1. Try saved cookies
    if _load_cookies(driver, cookies_file):
        driver.get(IG_BASE)
        time.sleep(3)
        if _is_logged_in(driver):
            print("[Medos] Login via cookies OK.")
            return True
        print("[Medos] Cookies kadaluarsa, coba login manual.")

    # 2. Username/password login
    login_url = f"{IG_BASE}accounts/login/"
    driver.get(login_url)
    print("[Medos] Membuka halaman login Instagram…")

    try:
        # Wait for username OR email field
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='username'], input[name='email']"))
        )
    except TimeoutException:
        print("[Medos] Halaman login tidak termuat.")
        # Keep the page source and a screenshot for debugging. The paths are
        # fixed under /app/static/output/; a write failure is only logged.
        try:
            with open("/app/static/output/ig_login_error.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot("/app/static/output/ig_login_error.png")
            print("[Medos] Log error HTML dan screenshot disimpan ke /app/static/output/")
        except Exception as e:
            print(f"[Medos] Gagal menyimpan log error: {e}")
        return False

    try:
        # The form uses either username/password or email/pass field names.
        user_field = _find_first_field(driver, ["input[name='username']", "input[name='email']"])
        pass_field = _find_first_field(driver, ["input[name='password']", "input[name='pass']"])

        if user_field is None or pass_field is None:
            print("[Medos] Field login (username/password) tidak ditemukan.")
            return False

        user_field.clear()
        user_field.send_keys(username)
        time.sleep(0.8)
        pass_field.clear()
        pass_field.send_keys(password)
        time.sleep(0.5)

        # Submit form: press ENTER inside the password field
        pass_field.send_keys("\n")
        time.sleep(1)

        # Fallback: click a submit button, but only while still on the login
        # page, so that a page reached after a successful ENTER submit is not
        # clicked on by accident.
        if "login" in driver.current_url.lower():
            try:
                submit_btn = driver.find_element(By.CSS_SELECTOR, "button[type='submit'], input[type='submit'], div[role='button']")
                driver.execute_script("arguments[0].click();", submit_btn)
            except Exception:
                pass

        # Wait for redirect away from the login page. Any URL containing
        # "/accounts/login/" also contains "login", so one check suffices.
        WebDriverWait(driver, 20).until(
            lambda d: "login" not in d.current_url.lower()
        )
        print("[Medos] Login sukses.")
    except TimeoutException:
        print("[Medos] Login timeout — cek credentials atau akun ter-throttle.")
        return False
    except Exception as e:
        print(f"[Medos] Login gagal: {e}")
        return False

    # 3. Dismiss save-info / notification popups
    for _ in range(2):
        try:
            WebDriverWait(driver, 6).until(
                EC.element_to_be_clickable((
                    By.XPATH,
                    "//button[contains(text(),'Not Now') or "
                    "contains(text(),'Bukan Sekarang') or "
                    "contains(text(),'Not now')]"
                ))
            ).click()
            time.sleep(1.5)
        except Exception:
            pass

    # Only persist a cookie jar that would pass the step-1 check next time;
    # one without a sessionid cookie would just be rejected as expired.
    if _is_logged_in(driver):
        _save_cookies(driver, cookies_file)
    else:
        print("[Medos] Cookie sessionid tidak ditemukan, cookies tidak disimpan.")
    return True
172
+
173
+
174
+ # ── Scraping helpers ───────────────────────────────────────────────────────────
175
+
176
def _collect_post_links(driver, target_url: str, max_scrolls: int = 5) -> list:
    """Open *target_url* and collect links to posts and reels while scrolling.

    Args:
        driver: Selenium WebDriver with an active Instagram session.
        target_url: Profile or hashtag page to open.
        max_scrolls: Maximum number of scroll passes.

    Returns:
        De-duplicated post/reel URLs (query string stripped) in the order
        they were first seen on the page, so a caller that caps the list
        keeps the top-most posts rather than an arbitrary subset.
    """
    print(f"[Medos] Membuka: {target_url}")
    driver.get(target_url)
    time.sleep(6)

    # A dict is used as an insertion-ordered set.
    links: dict = {}

    def _harvest() -> None:
        """Add every post/reel link currently in the DOM to ``links``."""
        for el in driver.find_elements(By.CSS_SELECTOR, "a[href*='/p/'], a[href*='/reel/']"):
            try:
                href = el.get_attribute("href")
            except Exception:
                # The element may have been removed from the DOM since it
                # was found; skip it instead of aborting the whole scrape.
                continue
            if href:
                links.setdefault(href.split("?")[0], None)

    stall = 0
    for i in range(max_scrolls):
        prev_count = len(links)
        _harvest()
        print(f"[Medos] Scroll {i+1}: {len(links)} link ditemukan.")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3.5)
        if len(links) == prev_count:
            stall += 1
            if stall >= 3:
                break
        else:
            stall = 0

    # Pick up whatever the final scroll loaded; without this the last scroll
    # and its wait would be wasted.
    _harvest()

    return list(links)
201
+
202
+
203
def _scrape_post(driver, link: str) -> list:
    """Open one post and return its caption followed by the visible comments.

    Each text appears once; texts of three characters or fewer are ignored.
    """
    driver.get(link)
    time.sleep(4)

    texts = []

    # Caption — the first locator that yields usable text wins.
    caption_locators = (
        (By.XPATH, "//div[@data-testid='post-caption']"),
        (By.XPATH, "//h1"),
        (By.XPATH, "//span[contains(@class, 'x126k92a')]"),
        (By.CSS_SELECTOR, "article span[dir='auto']"),
    )
    for locator in caption_locators:
        try:
            node = WebDriverWait(driver, 3).until(EC.presence_of_element_located(locator))
            caption = node.text.strip()
            if not caption:
                # .text can be empty because of formatting; read innerText via JS.
                caption = driver.execute_script("return arguments[0].innerText;", node)
            if caption and len(caption) > 3:
                texts.append(caption.strip())
                break
        except Exception:
            continue

    def _click_load_more(by, selector) -> bool:
        """Click the first element matching the locator; False if that failed."""
        try:
            button = driver.find_element(by, selector)
            driver.execute_script("arguments[0].click();", button)
            time.sleep(2)
        except Exception:
            return False
        return True

    # Load more comments: at most five rounds, icon button first, then the
    # text button; stop as soon as neither can be clicked.
    rounds_left = 5
    while rounds_left > 0:
        rounds_left -= 1
        if _click_load_more(
            By.CSS_SELECTOR,
            "svg[aria-label='Load more comments'], svg[aria-label='Muat komentar lainnya']",
        ):
            continue
        if _click_load_more(
            By.XPATH,
            "//div[@role='button']//span[contains(text(),'Load') or contains(text(),'Muat')]",
        ):
            continue
        break

    # Collect visible comments from several candidate locators.
    try:
        comment_xpaths = (
            "//div[contains(@class, 'x1cy8zhl')]/span",
            "//ul//li//span[@dir='auto']",
            "//div[@role='button']//span[@dir='auto']",
            "//div[contains(@class, 'x1xegmmw')]//span[@dir='auto']",
        )
        already_seen = set(texts)

        for comment_xpath in comment_xpaths:
            for node in driver.find_elements(By.XPATH, comment_xpath):
                try:
                    comment = node.text.strip()
                    if comment and len(comment) > 3 and comment not in already_seen:
                        already_seen.add(comment)
                        texts.append(comment)
                except Exception:
                    pass
    except Exception as e:
        print(f"[Medos] Gagal ambil komentar: {e}")

    return texts
281
+
282
+
283
+ # ── Public API ─────────────────────────────────────────────────────────────────
284
+
285
def _cookies_path_for(username: str) -> str:
    """Return the cookie-jar path for *username* with unsafe characters replaced.

    Anything other than letters, digits, ``.``, ``_`` and ``-`` becomes ``_``
    so the username cannot introduce path separators into the file name.
    """
    safe_name = "".join(ch if (ch.isalnum() or ch in "._-") else "_" for ch in username)
    return f"/app/ig_cookies_{safe_name}.json"


def scrape_medos(username: str, password: str, target_account: str, mode: str = "all") -> list:
    """Scrape an Instagram profile or hashtag and return the collected texts.

    Args:
        username: Instagram username used to log in.
        password: Instagram password used to log in.
        target_account: A profile name (optional leading ``@``) or a hashtag
            (leading ``#``).
        mode: Accepted for interface compatibility; this function does not
            read it, so no date filtering happens here.

    Returns:
        Captions and comments (strings) from at most 30 posts. Empty when a
        parameter is missing or login fails.
    """
    if not username or not password or not target_account:
        print("[Medos] Parameter tidak lengkap.")
        return []

    cookies_file = _cookies_path_for(username)
    driver = _create_driver(mobile=False)
    texts_out: list = []

    try:
        if not _login(driver, username, password, cookies_file):
            print("[Medos] Login gagal, scraping dibatalkan.")
            return []

        # Determine target URL
        account = target_account.strip()
        if account.startswith("#"):
            tag = account.lstrip("#")
            target_url = f"{IG_BASE}explore/tags/{tag}/"
        else:
            target_url = f"{IG_BASE}{account.lstrip('@')}/"

        post_links = _collect_post_links(driver, target_url, max_scrolls=5)
        print(f"[Medos] {len(post_links)} link postingan ditemukan untuk '{account}'.")

        for link in post_links[:30]:  # cap 30 posts
            try:
                result = _scrape_post(driver, link)
                texts_out.extend(result)
                print(f"[Medos] {link} → {len(result)} teks")
            except Exception as e:
                print(f"[Medos] Error pada {link}: {e}")

    except Exception as e:
        print(f"[Medos] Fatal error: {e}")
    finally:
        # Always release the browser, even on early return or error.
        try:
            driver.quit()
        except Exception:
            pass

    print(f"[Medos] Total teks dari Instagram: {len(texts_out)}")
    return texts_out
services/news.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ news.py – News scraper dispatcher.
3
+ Exports: scrape_news(portal, pages, keyword) -> list[dict]
4
+
5
+ portal: 'detik', 'radar', 'antara', 'cnn', 'radarcirebon'
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import random
10
+ import re
11
+ import time
12
+ from urllib.parse import quote, quote_plus, urlparse, urlunparse
13
+
14
+ import requests
15
+ from bs4 import BeautifulSoup
16
+
17
+
18
+ # ── Shared HTTP session helpers ────────────────────────────────────────────────
19
+
20
# Default request headers applied to every news-scraper session.
_HEADERS = {
    "User-Agent": " ".join([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/124.0.0.0 Safari/537.36",
    ]),
    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}
28
+
29
def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0):
    """GET *url* through *sess*, retrying on any error.

    Up to *retries* attempts are made, sleeping *delay* seconds between
    attempts. Returns the response once a request succeeds with a
    non-error status, or None when every attempt failed.
    """
    attempts_made = 0
    while attempts_made < retries:
        attempts_made += 1
        try:
            response = sess.get(url, timeout=20, allow_redirects=True)
            response.raise_for_status()
        except Exception:
            # No point sleeping after the final attempt.
            if attempts_made < retries:
                time.sleep(delay)
        else:
            return response
    return None
39
+
40
def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
    """Return the text of the ``<p>`` elements that look like article body text.

    The search is limited to the first ``<div>`` whose class matches an entry
    of *container_classes* (tried in order); if none is found the whole
    document is searched. Paragraphs shorter than *min_len* characters, or
    starting with "baca juga", "lihat juga" or "advertisement"
    (case-insensitive), are dropped.
    """
    skipped_prefixes = ("baca juga", "lihat juga", "advertisement")

    scope = soup
    for css_class in container_classes:
        candidate = soup.find("div", class_=css_class)
        if candidate:
            scope = candidate
            break

    kept = []
    for paragraph in scope.find_all("p"):
        text = paragraph.get_text(" ", strip=True)
        if not text or len(text) < min_len:
            continue
        if text.lower().startswith(skipped_prefixes):
            continue
        kept.append(text)
    return kept
53
+
54
+
55
+ # ── Detik.com ──────────────────────────────────────────────────────────────────
56
+
57
def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
    """Search detik.com for *keyword* and scrape every article in the results.

    Args:
        keyword: Search phrase; it is URL-encoded before being put in the query.
        max_pages: Number of result pages to walk (stops early on an empty
            or failed page).

    Returns:
        A list of dicts with keys ``judul``, ``tanggal`` (``%Y-%m-%d %H:%M``
        or ``''``), ``tag``, ``isi_berita`` and ``link``.
    """
    import datetime

    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []
    # Encode once so spaces, '&', '#', etc. cannot corrupt the query string.
    query = quote_plus(keyword)

    for page in range(1, max_pages + 1):
        r = _get(sess, f"https://www.detik.com/search/searchall?query={query}&sortby=time&page={page}&siteid=2")
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")
        news_list = soup.find_all('div', class_='media')
        if not news_list:
            break

        for news in news_list:
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The publication time is read from the span's 'd-time'
                # attribute and parsed as an integer timestamp.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                if date_tag:
                    span_tag = date_tag.find('span')
                    if span_tag and span_tag.has_attr('d-time'):
                        news_date = datetime.datetime.fromtimestamp(int(span_tag['d-time']))

                news_resp = _get(sess, link)
                if not news_resp:
                    continue
                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = news_soup.find('div', class_='detail__body-text') or news_soup.find('div', class_='detail_text')
                content = ""
                if content_div:
                    parts = []
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if text:
                            # Headings are prefixed with their level, e.g. "H2: ...".
                            prefix = tag.name.upper() if tag.name.startswith('h') else ''
                            parts.append(f"{prefix}: {text}" if prefix else text)
                    content = '\n'.join(parts)

                nav_div = news_soup.find('div', class_='detail_tag') or news_soup.find('div', class_='tag__list') or news_soup.find('div', class_='nav')
                tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else []

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })
            except Exception as e:
                # Skip the broken article but leave a trace of why.
                print(f"[News] Detik: artikel dilewati: {e}")
        # Pause between result pages.
        time.sleep(2)
    return results
115
+
116
+
117
+ # ── Radar ──────────────────────────────────────────────────────────────────────
118
+
119
def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
    """Search radarcirebon.disway.id for *keyword* and scrape the listed articles.

    Returns a list of dicts with keys ``judul``, ``tanggal``, ``tag``
    (``"-"`` when no tags are found), ``isi_berita`` and ``link``.
    """
    BASE_HOST = "https://radarcirebon.disway.id"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []

    def _abs(href):
        """Turn a possibly relative href into an absolute URL (None if empty)."""
        if not href:
            return None
        cleaned = href.strip()
        if cleaned.startswith("http"):
            return cleaned
        return BASE_HOST + "/" + cleaned.lstrip("/")

    def _is_tag_href(href):
        return href and '/listtag/' in href

    for page in range(1, max_pages + 1):
        q = quote_plus(keyword)
        offset = (page - 1) * 30
        if page > 1:
            url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num="
        else:
            url = f"{BASE_HOST}/search/kata/?c={q}&num="

        listing = _get(sess, url)
        if not listing:
            break
        soup = BeautifulSoup(listing.text, "html.parser")

        entries = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media')
        for entry in entries:
            try:
                anchor = entry.find('a', href=True)
                if not anchor:
                    continue
                link = _abs(anchor.get('href'))
                listing_title = anchor.get_text(strip=True)

                article = _get(sess, link)
                if not article:
                    continue
                page_soup = BeautifulSoup(article.text, "html.parser")

                # Prefer the article page's own headline over the listing title.
                headline = page_soup.find('h1', class_='text-black') or page_soup.find('h1')
                judul = headline.get_text(strip=True) if headline else listing_title

                date_node = page_soup.find('span', class_='date') or page_soup.find(class_='date')
                tanggal = date_node.get_text(strip=True) if date_node else ""

                body = page_soup.find('div', class_='entry-content') or page_soup.find('div', class_='post-content')
                isi = ""
                if body:
                    paragraphs = []
                    for p in body.find_all('p'):
                        if 'Baca Juga:' not in p.get_text(strip=True):
                            paragraphs.append(p.get_text(strip=True))
                    isi = "\n".join(paragraphs)

                tags = []
                for tag_anchor in page_soup.find_all('a', href=_is_tag_href):
                    if tag_anchor.get('title'):
                        tags.append(tag_anchor.get('title', '').strip())

                results.append({
                    "judul": judul,
                    "tanggal": tanggal,
                    "tag": ", ".join(tags) if tags else "-",
                    "isi_berita": isi,
                    "link": link
                })
            except Exception:
                pass
        time.sleep(2)
    return results
176
+
177
+ # ── Antara ─────────────────────────────────────────────────────────────────────
178
+
179
def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
    """Search antaranews.com for *keyword* and scrape every ``/berita/`` article.

    Articles are visited in the order their links appear in the search
    results, and a link that shows up again (on the same or a later page)
    is fetched only once.

    Args:
        keyword: Search phrase (URL-encoded before use).
        max_pages: Number of result pages to walk (stops on a failed page).

    Returns:
        A list of dicts with keys ``judul``, ``tanggal``, ``tag`` (``"-"``
        when no tags are found), ``isi_berita`` and ``link``.
    """
    BASE_HOST = "https://www.antaranews.com"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []
    seen_links: set = set()

    def _norm(href):
        """Return an absolute URL without query/fragment/trailing slash, or None."""
        if not href:
            return None
        href = href.strip()
        if href.startswith("/"):
            href = BASE_HOST + href
        elif not href.startswith("http"):
            return None
        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")

    q = quote_plus(keyword)
    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")

        # dict.fromkeys de-duplicates while keeping the order of appearance.
        page_links = dict.fromkeys(
            _norm(a.get('href')) for a in soup.select('a[href*="/berita/"]') if a.get('href')
        )

        for link in page_links:
            if not link or link in seen_links:
                continue
            seen_links.add(link)
            try:
                detail_r = _get(sess, link)
                if not detail_r:
                    continue
                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else ""

                # The date is the text of the <li> that holds the calendar icon.
                date_detail = ""
                cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
                if cal_icon and cal_icon.find_parent('li'):
                    date_detail = cal_icon.find_parent('li').get_text(" ", strip=True)

                content_parts = _extract_paragraphs(detail_soup, ["wrap__article-detail-content", "detail__body-text"])

                tags = []
                for a in detail_soup.select('a[href*="/tag/"]'):
                    tag_text = a.get('title') or a.get_text(strip=True)
                    if tag_text:
                        tags.append(tag_text)

                results.append({
                    "judul": title_detail,
                    "tanggal": date_detail,
                    "tag": ", ".join(list(dict.fromkeys(tags))) if tags else "-",
                    "isi_berita": "\n".join(content_parts),
                    "link": link
                })
            except Exception as e:
                # One malformed article must not discard everything collected.
                print(f"[News] Antara: artikel dilewati ({link}): {e}")
    return results
231
+
232
+ # ── CNN ────────────────────────────────────────────────────────────────────────
233
+
234
def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
    """Search cnnindonesia.com for *keyword* and scrape the listed articles.

    The search result page is loaded through Selenium; each article found
    there is then fetched through a plain ``requests`` session.

    Args:
        keyword: Search phrase (URL-encoded before use).
        max_pages: Number of result pages to walk; a page whose result list
            does not appear within 15 s is skipped.

    Returns:
        A list of dicts with keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; missing values are ``"-"``.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from ._driver import _create_driver

    BASE_HOST = "https://www.cnnindonesia.com"
    results = []

    # Article URLs are recognised by this numeric pattern in their path.
    article_path_re = re.compile(r'/\d{14}-\d{2,3}-\d{6,}')
    related_topics_re = re.compile(r'\s*TOPIK TERKAIT\s*')

    sess = requests.Session()
    sess.headers.update(_HEADERS)
    q = quote(keyword)

    driver = _create_driver(mobile=False)
    try:
        for page in range(1, max_pages + 1):
            url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (f"&page={page}" if page > 1 else "")
            driver.get(url)

            if page == 1:
                # Click the "AGREE" button if one shows up; it is optional.
                try:
                    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))).click()
                except Exception:
                    pass

            try:
                WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a")))
            except Exception:
                continue

            soup = BeautifulSoup(driver.page_source, "html.parser")
            # De-duplicate while keeping the order shown on the result page.
            links = list(dict.fromkeys(
                a['href'] for a in soup.select('div.nhl-list article a[href]')
                if article_path_re.search(urlparse(a['href']).path)
            ))

            for link in links:
                try:
                    html = _get(sess, link)
                    if not html:
                        continue
                    ds = BeautifulSoup(html.text, "html.parser")

                    title_el = ds.select_one('h1')
                    title = title_el.get_text(strip=True) if title_el else "-"

                    date_el = ds.select_one('div.text-cnn_grey.text-sm')
                    date_text = date_el.get_text(strip=True) if date_el else "-"

                    # Tags are the links in the <div> following the
                    # "TOPIK TERKAIT" title box.
                    tags_list = []
                    tk_header = ds.find('div', class_='title-box', string=related_topics_re)
                    if tk_header and tk_header.find_next_sibling('div'):
                        tags_list = [t.get_text(strip=True) for t in tk_header.find_next_sibling('div').select('a')]

                    content_container = ds.select_one("div.detail-text")
                    if content_container:
                        paragraphs = [p.get_text(" ", strip=True) for p in content_container.find_all('p')]
                        content = "\n".join(t for t in paragraphs if not t.lower().startswith("lihat juga"))
                    else:
                        content = "-"

                    results.append({
                        "judul": title,
                        "tanggal": date_text,
                        "tag": ", ".join(tags_list) if tags_list else "-",
                        "isi_berita": content,
                        "link": link
                    })
                except Exception as e:
                    # One malformed article must not discard everything collected.
                    print(f"[News] CNN: artikel dilewati ({link}): {e}")
    finally:
        # Always release the browser, even when scraping raised.
        try:
            driver.quit()
        except Exception:
            pass
    return results
291
+
292
+ # ── RadarCirebonID ─────────────────────────────────────────────────────────────
293
+
294
def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
    """Search radarcirebon.id for *keyword* and scrape the dated article links.

    Returns a list of dicts with keys ``judul``, ``tanggal``, ``tag``,
    ``isi_berita`` and ``link``; missing values are ``"-"``.
    """
    BASE_HOST = "https://radarcirebon.id"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []

    for page in range(1, max_pages + 1):
        # The search path uses '+' for spaces.
        q = quote(keyword).replace('%20', '+')
        url = f"{BASE_HOST}/search/{q}/"
        if page > 1:
            url += f"page/{page}/"
        listing = _get(sess, url)
        if not listing:
            break

        soup = BeautifulSoup(listing.text, "html.parser")
        # Keep only links whose URL contains a /YYYY/MM/DD/ segment.
        links = set()
        for anchor in soup.select('article .wp-block-latest-posts__post-title a'):
            if re.search(r'/\d{4}/\d{2}/\d{2}/', anchor['href']):
                links.add(anchor['href'])

        for link in links:
            article = _get(sess, link)
            if not article:
                continue
            ds = BeautifulSoup(article.text, "html.parser")

            title_el = ds.select_one('h1.entry-title')
            date_el = ds.select_one('time.entry-date')

            body_texts = []
            body = ds.select_one('div.entry-content')
            if body:
                for p in body.select('p'):
                    # Skip paragraphs inside a 'read-also' box.
                    if p.find_parent(class_='read-also'):
                        continue
                    text = p.get_text(" ", strip=True)
                    if text:
                        body_texts.append(text)

            tag_cloud = ds.select_one('div.wp-block-tag-cloud')
            tags = []
            if tag_cloud:
                tags = [a.get_text(strip=True) for a in tag_cloud.select('a')]

            results.append({
                "judul": title_el.get_text(strip=True) if title_el else "-",
                "tanggal": date_el.get_text(strip=True) if date_el else "-",
                "tag": ", ".join(list(dict.fromkeys(tags))) if tags else "-",
                "isi_berita": "\n".join(body_texts) if body_texts else "-",
                "link": link
            })

    return results
337
+
338
+
339
+ # ── Public API ─────────────────────────────────────────────────────────────────
340
+
341
# Portal alias -> scraper function. Aliases are grouped per scraper; the
# resulting dict keeps this insertion order.
_PORTAL_MAP = {
    alias: scraper
    for scraper, aliases in (
        (_scrape_detik, ("detik", "detik.com")),
        (_scrape_radar, ("radar", "radardisway", "radarcirebon.disway.id")),
        (_scrape_antara, ("antara", "antaranews", "antaranews.com")),
        (_scrape_cnn, ("cnn", "cnnindonesia", "cnnindonesia.com")),
        (_scrape_radarcirebon, ("radarcirebon", "radarcirebon.id")),
    )
    for alias in aliases
}
356
+
357
+
358
def _match_portal(portal_key: str, portal_map: dict):
    """Resolve a normalised portal name or URL to an entry of *portal_map*.

    Resolution order:
        1. exact alias match;
        2. the longest alias contained in *portal_key*, so that
           ``https://radarcirebon.id`` resolves via ``radarcirebon.id``
           rather than the shorter alias ``radar``;
        3. the first alias that contains *portal_key* (abbreviated input).

    Returns None for an empty key or when nothing matches.
    """
    if not portal_key:
        # An empty string is a substring of every alias; never match on it.
        return None

    exact = portal_map.get(portal_key)
    if exact is not None:
        return exact

    contained = [alias for alias in portal_map if alias in portal_key]
    if contained:
        return portal_map[max(contained, key=len)]

    for alias, fn in portal_map.items():
        if portal_key in alias:
            return fn
    return None


def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
    """Scrape news from *portal* for *keyword*.

    Args:
        portal: Portal alias, domain or URL; matched case-insensitively
            against ``_PORTAL_MAP`` after trimming whitespace and trailing
            slashes.
        pages: Number of result pages passed to the scraper as ``max_pages``.
        keyword: Search phrase passed to the scraper.

    Returns:
        The scraper's result list, or an empty list when the portal is
        blank or unknown, or when the scraper raises.
    """
    if not portal:
        return []
    portal_key = portal.strip().lower().rstrip("/")

    scraper = _match_portal(portal_key, _PORTAL_MAP) if portal_key else None
    if scraper is None:
        print(f"[News] Portal '{portal}' tidak dikenali.")
        return []

    print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
    try:
        return scraper(keyword, max_pages=pages)
    except Exception as e:
        print(f"[News] Error saat scraping: {e}")
        return []
services/preprocessing.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ preprocessing.py – Clean & preprocess text for sentiment analysis.
3
+ Only contains utility functions; no Colab/notebook code.
4
+ """
5
+ import re
6
+ import html as html_lib
7
+
8
+ from bs4 import BeautifulSoup
9
+
10
+ try:
11
+ from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
12
+ from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
13
+ _sastrawi_available = True
14
+ except ImportError:
15
+ _sastrawi_available = False
16
+
17
+ try:
18
+ from stop_words import get_stop_words
19
+ _stopwords_id = get_stop_words('indonesian')
20
+ except Exception:
21
+ _stopwords_id = []
22
+
23
+ # ── Stopwords ──────────────────────────────────────────────────────────────────
24
# Sastrawi is optional: without it there is no stemmer and its stopword list
# is empty.
_stemmer = StemmerFactory().create_stemmer() if _sastrawi_available else None
_sastrawi_stopwords: list = (
    StopWordRemoverFactory().get_stop_words() if _sastrawi_available else []
)

# Hand-picked extra tokens to drop, followed by HTML tag/attribute names.
_ADDITIONAL_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya', 'oke', 'ok', 'bgt',
    'jg', 'utk', 'deh', 'sih', 'kok', 'dong', 'udah', 'sdh', 'blm', 'bgmn',
    'dgn', 'lgi', 'ya', 'lbh', 'digunakan', 'semangat', 'dah', 'sangat',
    'penting', 'lancar', 'cepat', 'senang', 'makasih', 'bermanfaat', 'keren',
    'baik', 'terimakasih', 'bagus', 'semoga', 'aplikasi', 'transaksi',
    'banget', 'pakai', 'hp', 'tolong', 'gimana', 'iya', 'jadi', 'ambil',
    'buka', 'butuh', 'masuk', 'baru', 'jelas', 'yuk', 'mohon', 'punya',
    'cara', 'hari', 'kota', 'berita',
    # HTML attributes
    'class', 'id', 'span', 'div', 'href', 'src', 'style', 'alt', 'aria',
    'role', 'tabindex', 'button', 'label', 'img', 'input', 'placeholder',
    'form', 'field', 'hidden', 'value', 'by', 'link', 'tags',
]

# Meaningless letter clusters that are filtered out as noise.
_NOISE_STOPWORDS = [
    'xd', 'xyri', 'yu', 'uobl', 'ypdohk', 'xt', 'pz', 'lziwak', 'rp', 'xdj',
    'xggy', 'xjbqb', 'xstzfhl', 'hfl', 'xat', 'qhh', 'dhg', 'cr', 'tdsg',
    'ct', 'etr', 'nq', 'oe', 'ejq', 'psk', 'hl', 'hd', 'sy', 'amp', 'fbf',
]

_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')

# Union of every stopword source above plus all single letters.
FINAL_STOPWORDS: set = {
    *_stopwords_id,
    *_sastrawi_stopwords,
    *_ADDITIONAL_STOPWORDS,
    *_NOISE_STOPWORDS,
} | _SINGLE_LETTERS
57
+
58
+
59
+ # ── Individual text cleaners ───────────────────────────────────────────────────
60
+
61
+ _AUTHOR_COMMENT_PATTERN = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)
62
+
63
def clean_html(text: str) -> str:
    """Return *text* without markup, with entities decoded and whitespace collapsed.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents. If HTML parsing fails for any reason, the raw string is used
    instead. Falsy input yields an empty string.
    """
    if not text:
        return ""

    raw = str(text)
    try:
        document = BeautifulSoup(raw, "html.parser")
        for node in document(["script", "style"]):
            node.decompose()
        plain = document.get_text(separator=" ")
    except Exception:
        # Parsing is best-effort; keep the unparsed text rather than fail.
        plain = raw

    decoded = html_lib.unescape(plain)
    return re.sub(r"\s+", " ", decoded).strip()
77
+
78
+
79
def clean_text(text: str) -> str:
    """Normalise one string for analysis.

    Steps, in order: lower-case; remove spans matched by
    ``_AUTHOR_COMMENT_PATTERN``; remove URL-like tokens; replace every
    character that is not an ASCII letter, digit or whitespace with a space;
    collapse runs of whitespace. Falsy input yields an empty string.
    """
    if not text:
        return ""

    lowered = str(text).lower()
    without_bylines = _AUTHOR_COMMENT_PATTERN.sub("", lowered)
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', without_bylines)
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_urls)
    return re.sub(r'\s+', ' ', alnum_only).strip()
89
+
90
+
91
def _preprocess_single(text: str) -> str:
    """Run the whole cleaning pipeline on a single string.

    HTML stripping, then text normalisation, then stemming (only when a
    stemmer is available), then token filtering. A token is kept when it is
    longer than one character, is not in ``FINAL_STOPWORDS`` and contains at
    least one lowercase ASCII letter.
    """
    normalized = clean_text(clean_html(text))
    if _stemmer:
        normalized = _stemmer.stem(normalized)

    kept = []
    for word in normalized.split():
        if word in FINAL_STOPWORDS or len(word) <= 1:
            continue
        # Drops purely numeric tokens.
        if not re.search(r'[a-z]', word):
            continue
        kept.append(word)
    return " ".join(kept).strip()
108
+
109
+
110
+ # ── Public API ─────────────────────────────────────────────────────────────────
111
+
112
def preprocess_text(texts) -> list:
    """Clean one string or an iterable of strings.

    Args:
        texts: a single string, an iterable of strings, or ``None``.

    Returns:
        A list of cleaned strings. Items that are not ``str`` are dropped,
        so the result can be shorter than the input. ``None`` yields an
        empty list.
    """
    if texts is None:
        # Previously this fell through to the iteration and raised TypeError.
        return []
    if isinstance(texts, str):
        texts = [texts]
    return [_preprocess_single(t) for t in texts if isinstance(t, str)]
services/sentiment.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ sentiment.py – Sentiment analysis using IndoBERT / HuggingFace pipeline.
3
+ Model is loaded lazily (first call) to avoid crashing at import time.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from typing import Optional
9
+
10
+ # ── Model configuration ────────────────────────────────────────────────────────
11
+ # If you have a local fine-tuned model, place it in ./indoBERT-sentiment
12
+ # and set MODEL_DIR. Otherwise it downloads from HuggingFace.
13
# Directory next to the package that may hold a local copy of the model.
_LOCAL_MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "indoBERT-sentiment")
# Hub model used when the local directory is missing or empty.
_HF_MODEL_ID = "taufiqdp/indonesian-sentiment"

# Created on first use by _load_pipeline() and reused afterwards.
_pipeline: Optional[object] = None


def _load_pipeline():
    """Return the shared text-classification pipeline, building it on first call.

    torch and transformers are imported here rather than at module import
    time. The local model directory is used when it exists and is non-empty;
    otherwise the hub model id is used. The pipeline runs on CUDA device 0
    when available, else on CPU.
    """
    global _pipeline
    if _pipeline is None:
        import torch
        from transformers import pipeline as hf_pipeline

        # Prefer local model if it exists (avoids repeated downloads in Docker)
        has_local_model = os.path.isdir(_LOCAL_MODEL_DIR) and os.listdir(_LOCAL_MODEL_DIR)
        if has_local_model:
            model_source = _LOCAL_MODEL_DIR
            print(f"[Sentiment] Loading model from local dir: {model_source}")
        else:
            model_source = _HF_MODEL_ID
            print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {model_source}")

        _pipeline = hf_pipeline(
            "text-classification",
            model=model_source,
            tokenizer=model_source,
            device=0 if torch.cuda.is_available() else -1,
            truncation=True,
            max_length=256,
            return_all_scores=False,
        )
        print("[Sentiment] Model loaded successfully.")
    return _pipeline
49
+
50
+
51
+ # ── Helpers ────────────────────────────────────────────────────────────────────
52
+
53
+ def _normalize_label(lbl: str) -> str:
54
+ """Normalise raw model label to 'positif', 'negatif', or 'netral'."""
55
+ l = lbl.lower()
56
+ if l in ("positif", "positive", "pos"):
57
+ return "positif"
58
+ if l in ("negatif", "negative", "neg"):
59
+ return "negatif"
60
+ if l in ("netral", "neutral", "neu"):
61
+ return "netral"
62
+ if "label_" in l:
63
+ try:
64
+ from transformers import AutoConfig
65
+ cfg = AutoConfig.from_pretrained(_HF_MODEL_ID)
66
+ idx = int(l.split("_")[-1])
67
+ return _normalize_label(cfg.id2label[idx])
68
+ except Exception:
69
+ return "netral"
70
+ return "netral"
71
+
72
+
73
+ # ── Keywords Override ──────────────────────────────────────────────────────────
74
+
75
+ _NEGATIVE_KEYWORDS = {
76
+ "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
77
+ "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
78
+ "kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur",
79
+ "rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang",
80
+ "palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
81
+ "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
82
+ }
83
+
84
+ _POSITIVE_KEYWORDS = {
85
+ "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
86
+ "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
87
+ "sempurna", "berhasil", "luas", "indah"
88
+ }
89
+
90
+ _NEUTRAL_KEYWORDS = {
91
+ "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
92
+ }
93
+
94
+ def _override_label(text: str, model_label: str) -> str:
95
+ text_lower = text.lower()
96
+
97
+ if any(w in text_lower for w in _NEGATIVE_KEYWORDS):
98
+ return "negatif"
99
+ if any(w in text_lower for w in _POSITIVE_KEYWORDS):
100
+ return "positif"
101
+ if any(w in text_lower for w in _NEUTRAL_KEYWORDS):
102
+ return "netral"
103
+
104
+ return model_label
105
+
106
+
107
+ # ── Public API ─────────────────────────────────────────────────────────────────
108
+
109
def analyze_sentiment(texts: list) -> dict:
    """
    Run sentiment analysis on a list of text strings.

    Args:
        texts: list of pre-processed strings. Items that are not strings, and
            strings that are empty or whitespace-only, are ignored.

    Returns:
        dict with keys: positif, negatif, netral, total, detail
        Example:
            {
              "positif": 12, "negatif": 4, "netral": 6, "total": 22,
              "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
            }
        ``total`` counts the texts that were analysed. When prediction fails,
        the counts are 0 and ``detail`` is empty while ``total`` still
        reports how many texts were submitted to the model.
    """
    # Non-string items used to raise AttributeError on .strip(); they are now
    # skipped along with blank strings.
    texts = [t for t in (texts or []) if isinstance(t, str) and t.strip()]
    if not texts:
        return {"positif": 0, "negatif": 0, "netral": 0, "total": 0, "detail": []}

    clf = _load_pipeline()

    try:
        preds = clf(texts, batch_size=16, truncation=True)
    except Exception as e:
        print(f"[Sentiment] Prediction error: {e}")
        return {"positif": 0, "negatif": 0, "netral": 0, "total": len(texts), "detail": []}

    counts = {"positif": 0, "negatif": 0, "netral": 0}
    detail = []
    for text, pred in zip(texts, preds):
        # The keyword override has the final say over the model's label.
        final_label = _override_label(text, _normalize_label(pred["label"]))
        counts[final_label] += 1
        detail.append({
            "text": text[:200],
            "label": final_label,
            "score": round(float(pred["score"]), 4),
        })

    return {**counts, "total": len(texts), "detail": detail}
services/tiktok.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ tiktok.py – TikTok scraper using Selenium.
3
+ Exports: scrape_tiktok(cookie_str, target_username) -> list[dict]
4
+
5
+ Returns structured data per-video:
6
+ url, profile_username, upload_date, like_count,
7
+ caption_short, caption_detail, comments, scrape_date
8
+
9
+ cookie_str accepts:
10
+ 1. Raw string: "sessionid=xxx; tt_webid=yyy; ..."
11
+ 2. JSON array: [{"name":"sessionid","value":"xxx",...}, ...]
12
+ 3. JSON object: {"sessionid": "xxx", "tt_webid": "yyy"}
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import time
18
+ from datetime import datetime
19
+
20
+ from selenium.webdriver.common.by import By
21
+ from selenium.webdriver.support.ui import WebDriverWait
22
+ from selenium.webdriver.support import expected_conditions as EC
23
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
24
+
25
+ from ._driver import _create_driver
26
+
27
+
28
+ # ── Cookie injection ───────────────────────────────────────────────────────────
29
+
30
def _inject_cookies(driver, cookie_str: str) -> bool:
    """Open tiktok.com and add the cookies described by *cookie_str*.

    The input is tried as a JSON array of cookie dicts, then as a JSON object
    of name/value pairs, then as a raw ``name=value; ...`` string. When a JSON
    form fails to process, the error is printed and the raw-string form is
    attempted. After adding cookies the page is refreshed.

    Returns:
        True when at least one cookie was accepted by the driver, else False.
    """
    driver.get("https://www.tiktok.com/")
    time.sleep(3)

    raw = cookie_str.strip() if cookie_str else ""
    if not raw:
        print("[TikTok] Tidak ada cookie yang diberikan.")
        return False

    def _try_add(cookie: dict) -> bool:
        # A cookie the driver rejects is skipped, not treated as fatal.
        try:
            driver.add_cookie(cookie)
        except Exception:
            return False
        return True

    def _reload(added: int) -> bool:
        driver.refresh()
        time.sleep(3)
        return added > 0

    if raw.startswith("["):
        try:
            added = 0
            for entry in json.loads(raw):
                if not isinstance(entry, dict) or "name" not in entry:
                    continue
                cookie = {
                    field: entry[field]
                    for field in ("name", "value", "domain", "path", "secure", "httpOnly", "expiry")
                    if field in entry
                }
                cookie.setdefault("domain", ".tiktok.com")
                if _try_add(cookie):
                    added += 1
                    continue
                # Second attempt without an explicit domain.
                cookie.pop("domain", None)
                if _try_add(cookie):
                    added += 1
            return _reload(added)
        except Exception as e:
            print(f"[TikTok] JSON array error: {e}")

    if raw.startswith("{"):
        try:
            added = 0
            for name, value in json.loads(raw).items():
                if _try_add({"name": str(name), "value": str(value), "domain": ".tiktok.com"}):
                    added += 1
            return _reload(added)
        except Exception as e:
            print(f"[TikTok] JSON object error: {e}")

    try:
        added = 0
        for pair in raw.split(";"):
            name, separator, value = pair.strip().partition("=")
            if not separator:
                continue
            if _try_add({"name": name.strip(), "value": value.strip(), "domain": ".tiktok.com"}):
                added += 1
        return _reload(added)
    except Exception as e:
        print(f"[TikTok] String cookie error: {e}")
        return False
99
+
100
+
101
+ # ── Scraping helpers ───────────────────────────────────────────────────────────
102
+
103
# CSS selectors tried, in order, to locate video links on a profile page.
_VIDEO_LINK_SELECTORS = [
    'div[data-e2e="user-post-item"] a',
    'div[data-e2e="user-post-item-list"] a',
    'a[href*="/video/"]',
    'div[class*="DivItemContainerV2"] a',
    'div[class*="DivWrapper"] a[href*="/video/"]',
]


def _get_video_links(driver, profile_url: str, max_videos: int = 30) -> list:
    """Collect up to *max_videos* video URLs from a profile page.

    The page is scrolled to the bottom repeatedly; collection stops once
    enough links were found or three consecutive scrolls yield nothing new.

    Returns:
        De-duplicated URLs (query string removed) in the order they were
        first encountered.
    """
    print(f"[TikTok] Membuka profil: {profile_url}")
    driver.get(profile_url)

    loaded = False
    for sel in _VIDEO_LINK_SELECTORS:
        try:
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, sel)))
            loaded = True
            break
        except TimeoutException:
            continue

    if not loaded:
        # No selector matched in time; allow a little extra loading time.
        time.sleep(5)

    # A dict keeps first-seen order while de-duplicating. The previous set
    # made both the returned order and the subset kept by the max_videos cut
    # arbitrary.
    links: dict = {}
    stall = 0

    while len(links) < max_videos:
        prev = len(links)
        for sel in _VIDEO_LINK_SELECTORS:
            for el in driver.find_elements(By.CSS_SELECTOR, sel):
                href = el.get_attribute("href")
                if href and "/video/" in href:
                    links.setdefault(href.split("?")[0], None)
            if len(links) >= max_videos:
                break
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        if len(links) == prev:
            stall += 1
            if stall >= 3:
                break
        else:
            stall = 0

    return list(links)[:max_videos]
150
+
151
+
152
_COMMENT_ITEM_XPATH = '//div[contains(@class, "DivCommentItemWrapper")]'


def _child_text(parent, css: str) -> str:
    """Return the text of the first descendant matching *css*, or '' if absent."""
    try:
        return parent.find_element(By.CSS_SELECTOR, css).text
    except NoSuchElementException:
        return ""


def _read_caption(driver, video_data: dict) -> None:
    """Fill ``caption_short`` and, when expandable, ``caption_detail``."""
    try:
        desc_container = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
        )
        try:
            caption_el = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]')
            video_data["caption_short"] = caption_el.text.strip()

            try:
                expand_icon = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
                driver.execute_script("arguments[0].click();", expand_icon)
                time.sleep(2)
                detail_container = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
                )
                desc_text = _child_text(detail_container, "div[data-e2e='v2t-desc']")
                kw_text = _child_text(detail_container, "div[data-e2e='v2t-keywords']")
                video_data["caption_detail"] = f"Deskripsi: {desc_text}\nKeywords: {kw_text}".strip()
            except Exception:
                # The expanded caption is optional; any failure leaves it empty.
                pass
        except NoSuchElementException:
            pass
    except TimeoutException:
        pass


def _expand_comment_threads(driver) -> None:
    """Open reply threads and scroll so that more comments get loaded.

    Runs at most 15 rounds. A round either clicks the first reply toggle
    found, or scrolls to the bottom and counts comment items; four rounds in
    a row without a higher count end the loop.
    """
    reply_xpath = "//span[contains(text(), 'balasan') or (contains(text(), 'View') and contains(text(), 'repl'))]"
    stalled_rounds = 0
    seen = 0

    for _ in range(15):
        try:
            reply_buttons = driver.find_elements(By.XPATH, reply_xpath)
            if reply_buttons:
                driver.execute_script("arguments[0].click();", reply_buttons[0])
                time.sleep(2)
                stalled_rounds = 0
                continue
        except Exception:
            pass

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        visible = len(driver.find_elements(By.XPATH, _COMMENT_ITEM_XPATH))
        if visible > seen:
            seen = visible
            stalled_rounds = 0
            continue
        stalled_rounds += 1
        if stalled_rounds >= 4:
            break


def _collect_comments(driver, comments: list) -> None:
    """Append top-level comments to *comments*, nesting replies under the latest one."""
    for item in driver.find_elements(By.XPATH, _COMMENT_ITEM_XPATH):
        try:
            top_author = item.find_elements(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
            if top_author:
                body = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]').text.strip()
                if body:
                    comments.append({
                        "author": top_author[0].text.strip(),
                        "comment": body,
                        "replies": []
                    })
                continue

            # Check for replies (level 2)
            reply_author = item.find_elements(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
            if not (reply_author and comments):
                continue
            reply_body = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]').text.strip()
            if reply_body:
                comments[-1]["replies"].append({
                    "author": reply_author[0].text.strip(),
                    "comment": reply_body
                })
        except Exception:
            # A comment item that cannot be read is skipped.
            pass


def _scrape_video(driver, video_url: str, profile_username: str) -> dict | None:
    """Open one video page and return its metadata, caption and comments.

    Fields that cannot be located keep their defaults ("N/A", "" or []).
    The returned dict has the keys url, profile_username, upload_date,
    like_count, caption_short, caption_detail, comments and scrape_date.
    """
    print(f"[TikTok] Memproses: {video_url}")
    driver.get(video_url)
    time.sleep(5)

    video_data = {
        "url": video_url,
        "profile_username": profile_username,
        "upload_date": "N/A",
        "like_count": "N/A",
        "caption_short": "",
        "caption_detail": "",
        "comments": [],
        "scrape_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    try:
        date_el = WebDriverWait(driver, 8).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
        )
        video_data["upload_date"] = date_el.text.strip()
    except TimeoutException:
        pass

    try:
        like_el = driver.find_element(By.CSS_SELECTOR, 'strong[data-e2e="like-count"]')
        video_data["like_count"] = like_el.text.strip()
    except NoSuchElementException:
        pass

    _read_caption(driver, video_data)

    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
        )
        _expand_comment_threads(driver)
        _collect_comments(driver, video_data["comments"])
    except TimeoutException:
        pass

    return video_data
275
+
276
+
277
+ # ── Public API ─────────────────────────────────────────────────────────────────
278
+
279
def scrape_tiktok(cookie_str: str, target_username: str, max_videos: int = 20) -> list:
    """
    Scrape captions & comments from a TikTok profile.

    Args:
        cookie_str: optional session cookies in any form accepted by
            ``_inject_cookies``; blank means no cookies are injected.
        target_username: profile handle, with or without a leading "@".
            Surrounding whitespace is ignored.
        max_videos: maximum number of videos to process.

    Returns:
        list of dicts with: url, profile_username, upload_date, like_count,
        caption_short, caption_detail, comments, scrape_date.
        Empty when the handle is blank or a fatal error occurs before any
        video was scraped.
    """
    # Normalise before validating, so that a handle consisting only of "@"
    # and/or whitespace is rejected here instead of starting a browser for
    # the URL "https://www.tiktok.com/@".
    username = (target_username or "").strip().lstrip("@").strip()
    if not username:
        print("[TikTok] target_username tidak ada.")
        return []

    profile_url = f"https://www.tiktok.com/@{username}"

    driver = _create_driver(mobile=False)
    all_data: list = []

    try:
        if cookie_str and cookie_str.strip():
            _inject_cookies(driver, cookie_str)

        links = _get_video_links(driver, profile_url, max_videos)

        for url in links:
            try:
                data = _scrape_video(driver, url, username)
                if data:
                    all_data.append(data)
            except Exception as e:
                # One failing video must not abort the remaining ones.
                print(f"[TikTok] Error {url}: {e}")
            time.sleep(1.5)

    except Exception as e:
        print(f"[TikTok] Fatal error: {e}")
    finally:
        # Always release the browser, even after a fatal error.
        try:
            driver.quit()
        except Exception:
            pass

    return all_data
services/wordcloud_service.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ wordcloud_service.py – Generate a word-cloud image from a list of texts.
3
+ Stripped from the original Colab notebook; only the generation function remains.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import io
8
+ import os
9
+ import re
10
+ import numpy as np
11
+ import matplotlib
12
+ matplotlib.use("Agg") # Must be before pyplot import β€” headless/no-display
13
+ import matplotlib.pyplot as plt
14
+ from wordcloud import WordCloud
15
+
16
+ # ── Stopwords (a subset of the list in preprocessing.py) ──────────────────────
17
+ try:
18
+ from stop_words import get_stop_words
19
+ _stopwords_id = get_stop_words('indonesian')
20
+ except Exception:
21
+ _stopwords_id = []
22
+
23
+ try:
24
+ from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
25
+ _sastrawi_sw = StopWordRemoverFactory().get_stop_words()
26
+ except Exception:
27
+ _sastrawi_sw = []
28
+
29
# Informal tokens excluded from the cloud in addition to the library lists.
_EXTRA_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya',
    'oke', 'ok', 'bgt', 'jg', 'deh', 'sih', 'kok',
    'dong', 'udah', 'ya', 'banget', 'pakai', 'jadi', 'baru',
]

_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
_BLOCKLIST = {*_stopwords_id, *_sastrawi_sw, *_EXTRA_STOPWORDS}
# Everything that must never appear in the rendered cloud.
WORDCLOUD_STOPWORDS = _BLOCKLIST.union(_SINGLE_LETTERS)
37
+
38
+
39
+ # ── Internal helpers ───────────────────────────────────────────────────────────
40
+
41
+ def _merge_texts(texts: list) -> str:
42
+ """Join a list of strings, keeping only alphabetic tokens."""
43
+ joined = " ".join(str(t) for t in texts if t)
44
+ tokens = joined.lower().split()
45
+ tokens = [
46
+ w for w in tokens
47
+ if re.match(r'^[a-z]+$', w) and w not in WORDCLOUD_STOPWORDS and len(w) > 2
48
+ ]
49
+ return " ".join(tokens)
50
+
51
+
52
+ def _circular_mask(size: int = 400) -> np.ndarray:
53
+ x, y = np.ogrid[:size, :size]
54
+ center = size // 2
55
+ radius = center - 10
56
+ mask = (x - center) ** 2 + (y - center) ** 2 > radius ** 2
57
+ return (255 * mask).astype(np.uint8)
58
+
59
+
60
+ # ── Public API ─────────────────────────────────────────────────────────────────
61
+
62
def generate_wordcloud(texts: list, output_dest) -> bool:
    """
    Generate a circular wordcloud from a list of text strings.

    Args:
        texts: list of strings (raw or pre-processed)
        output_dest: file path (``str`` or ``os.PathLike``) OR a writable
                     binary buffer such as BytesIO.
                     If a path is given, its parent directory is created if
                     needed and the PNG is saved to disk.
                     If a buffer is given, the PNG is written there
                     (no file is created on disk).

    Returns:
        True on success, False on failure.
    """
    if not texts:
        print("[WordCloud] No texts provided.")
        return False

    text_data = _merge_texts(texts)
    if not text_data.strip():
        print("[WordCloud] All text was filtered out by stopwords; nothing to plot.")
        return False

    # If saving to a file path, ensure the directory exists
    is_path = isinstance(output_dest, (str, os.PathLike))
    if is_path:
        output_dir = os.path.dirname(os.fspath(output_dest))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    fig = None
    try:
        mask = _circular_mask(400)
        wc = WordCloud(
            width=800,
            height=800,
            background_color="white",
            colormap="viridis",
            mask=mask,
            contour_width=2,
            contour_color="steelblue",
            stopwords=WORDCLOUD_STOPWORDS,
            max_words=100,
        ).generate(text_data)

        fig, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        # Work on this figure explicitly rather than pyplot's global
        # "current figure".
        fig.tight_layout(pad=0)
        fig.savefig(output_dest, dpi=150, bbox_inches="tight", format="png")
    except Exception as e:
        print(f"[WordCloud] Error generating wordcloud: {e}")
        return False
    finally:
        # Close the figure on every path; previously a failing savefig left
        # it open.
        if fig is not None:
            plt.close(fig)

    if is_path:
        print(f"[WordCloud] Saved to {output_dest}")
    else:
        print("[WordCloud] Written to in-memory buffer (temporal).")
    return True
templates/index.html ADDED
@@ -0,0 +1,1009 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="id">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>SentiScope — Sentiment Analysis Dashboard</title>
7
+ <meta name="description" content="Dashboard analisis sentimen media sosial dengan scraping otomatis, word cloud, dan indoBERT.">
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Space+Grotesk:wght@400;500;600;700&display=swap" rel="stylesheet">
11
+ <style>
12
+ /* ── Reset & Base ──────────────────────────────────────────────────── */
13
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
14
+
15
+ :root {
16
+ --bg: #07071a;
17
+ --surface: #0e0e28;
18
+ --surface-2: #14143a;
19
+ --border: rgba(130, 100, 255, 0.18);
20
+ --border-hover: rgba(130, 100, 255, 0.42);
21
+ --purple: #7c3aed;
22
+ --purple-light: #a855f7;
23
+ --cyan: #06b6d4;
24
+ --text: #e2e8f0;
25
+ --text-muted: #8892a4;
26
+ --text-dim: #4b5563;
27
+ --radius: 14px;
28
+ --radius-sm: 8px;
29
+ --transition: 0.22s cubic-bezier(0.4, 0, 0.2, 1);
30
+ }
31
+
32
+ html { scroll-behavior: smooth; }
33
+
34
+ body {
35
+ font-family: 'Inter', system-ui, sans-serif;
36
+ background: var(--bg);
37
+ color: var(--text);
38
+ min-height: 100vh;
39
+ overflow-x: hidden;
40
+ }
41
+
42
+ body::before {
43
+ content: '';
44
+ position: fixed;
45
+ inset: 0;
46
+ background:
47
+ radial-gradient(ellipse 70% 50% at 15% 20%, rgba(124,58,237,0.12) 0%, transparent 60%),
48
+ radial-gradient(ellipse 50% 40% at 85% 75%, rgba(6,182,212,0.10) 0%, transparent 60%),
49
+ radial-gradient(ellipse 40% 35% at 50% 5%, rgba(168,85,247,0.08) 0%, transparent 55%);
50
+ pointer-events: none;
51
+ z-index: 0;
52
+ }
53
+
54
+ /* ── Layout ────────────────────────────────────────────────────────── */
55
+ .wrapper {
56
+ position: relative;
57
+ z-index: 1;
58
+ max-width: 920px;
59
+ margin: 0 auto;
60
+ padding: 2.5rem 1.25rem 4rem;
61
+ }
62
+
63
+ /* ── Hero ───────────────────────────────────────────────────────────── */
64
+ .hero { text-align: center; margin-bottom: 2.5rem; }
65
+
66
+ .hero-badge {
67
+ display: inline-flex;
68
+ align-items: center;
69
+ gap: 0.45rem;
70
+ background: rgba(124,58,237,0.15);
71
+ border: 1px solid rgba(124,58,237,0.35);
72
+ border-radius: 100px;
73
+ padding: 0.28rem 0.9rem;
74
+ font-size: 0.75rem;
75
+ font-weight: 600;
76
+ color: var(--purple-light);
77
+ letter-spacing: 0.04em;
78
+ text-transform: uppercase;
79
+ margin-bottom: 1rem;
80
+ }
81
+
82
+ .hero h1 {
83
+ font-family: 'Space Grotesk', sans-serif;
84
+ font-size: clamp(2rem, 5vw, 3.2rem);
85
+ font-weight: 700;
86
+ line-height: 1.15;
87
+ background: linear-gradient(135deg, #c084fc 0%, #818cf8 40%, #38bdf8 100%);
88
+ -webkit-background-clip: text;
89
+ -webkit-text-fill-color: transparent;
90
+ background-clip: text;
91
+ margin-bottom: 0.7rem;
92
+ }
93
+
94
+ .hero p {
95
+ color: var(--text-muted);
96
+ font-size: 0.95rem;
97
+ max-width: 520px;
98
+ margin: 0 auto;
99
+ line-height: 1.6;
100
+ }
101
+
102
+ /* ── Tab navigation ────────────────────────────────────────────────── */
103
+ .tab-nav {
104
+ display: flex;
105
+ gap: 0.5rem;
106
+ background: var(--surface);
107
+ border: 1px solid var(--border);
108
+ border-radius: var(--radius);
109
+ padding: 0.4rem;
110
+ margin-bottom: 2rem;
111
+ }
112
+
113
+ .tab-btn {
114
+ flex: 1;
115
+ display: flex;
116
+ align-items: center;
117
+ justify-content: center;
118
+ gap: 0.5rem;
119
+ padding: 0.7rem 1.2rem;
120
+ border: none;
121
+ border-radius: var(--radius-sm);
122
+ background: transparent;
123
+ color: var(--text-muted);
124
+ font-family: 'Inter', sans-serif;
125
+ font-size: 0.88rem;
126
+ font-weight: 500;
127
+ cursor: pointer;
128
+ transition: var(--transition);
129
+ }
130
+
131
+ .tab-btn:hover { color: var(--text); background: rgba(255,255,255,0.05); }
132
+
133
+ .tab-btn.active {
134
+ background: linear-gradient(135deg, rgba(124,58,237,0.35), rgba(6,182,212,0.2));
135
+ color: #fff;
136
+ font-weight: 600;
137
+ box-shadow: 0 0 0 1px rgba(124,58,237,0.5) inset;
138
+ }
139
+
140
+ /* ── Tab panels ─────────────────────────────────────────────────────── */
141
+ .tab-panel { display: none; }
142
+ .tab-panel.active { display: block; }
143
+
144
+ /* ── Glass card ─────────────────────────────────────────────────────── */
145
+ .card {
146
+ background: linear-gradient(135deg, rgba(14,14,40,0.9) 0%, rgba(20,20,58,0.75) 100%);
147
+ border: 1px solid var(--border);
148
+ border-radius: var(--radius);
149
+ padding: 1.6rem;
150
+ margin-bottom: 1.25rem;
151
+ backdrop-filter: blur(12px);
152
+ transition: border-color var(--transition), box-shadow var(--transition);
153
+ }
154
+
155
+ .card:hover { border-color: var(--border-hover); }
156
+
157
+ /* ── Platform header ────────────────────────────────────────────────── */
158
+ .platform-header {
159
+ display: flex;
160
+ align-items: center;
161
+ justify-content: space-between;
162
+ margin-bottom: 1.1rem;
163
+ }
164
+
165
+ .platform-title {
166
+ display: flex;
167
+ align-items: center;
168
+ gap: 0.6rem;
169
+ font-family: 'Space Grotesk', sans-serif;
170
+ font-size: 1rem;
171
+ font-weight: 600;
172
+ color: #c4b5fd;
173
+ }
174
+
175
+ .platform-icon {
176
+ width: 32px;
177
+ height: 32px;
178
+ border-radius: 8px;
179
+ display: flex;
180
+ align-items: center;
181
+ justify-content: center;
182
+ font-size: 1rem;
183
+ }
184
+
185
+ .pi-instagram { background: linear-gradient(135deg, #f09433, #e6683c, #dc2743, #cc2366, #bc1888); }
186
+ .pi-tiktok { background: #161823; border: 1px solid #333; }
187
+ .pi-facebook { background: #1877f2; }
188
+ .pi-news { background: linear-gradient(135deg, #0ea5e9, #6366f1); }
189
+ .pi-dataset { background: linear-gradient(135deg, #059669, #0891b2); }
190
+
191
+ /* ── Toggle switch ──────────────────────────────────────────────────── */
192
+ .toggle-wrap { display: flex; align-items: center; gap: 0.6rem; }
193
+
194
+ .toggle-label { font-size: 0.78rem; color: var(--text-dim); font-weight: 500; }
195
+
196
+ .toggle { position: relative; width: 42px; height: 24px; }
197
+
198
+ .toggle input { opacity: 0; width: 0; height: 0; }
199
+
200
+ .slider {
201
+ position: absolute;
202
+ inset: 0;
203
+ background: rgba(255,255,255,0.1);
204
+ border-radius: 100px;
205
+ cursor: pointer;
206
+ transition: var(--transition);
207
+ }
208
+
209
+ .slider::before {
210
+ content: '';
211
+ position: absolute;
212
+ width: 18px;
213
+ height: 18px;
214
+ left: 3px;
215
+ top: 3px;
216
+ background: white;
217
+ border-radius: 50%;
218
+ transition: var(--transition);
219
+ }
220
+
221
+ .toggle input:checked + .slider { background: linear-gradient(135deg, var(--purple), var(--cyan)); }
222
+ .toggle input:checked + .slider::before { transform: translateX(18px); }
223
+
224
+ .platform-fields {
225
+ overflow: hidden;
226
+ transition: max-height 0.35s ease, opacity 0.3s ease;
227
+ }
228
+
229
+ .platform-fields.collapsed {
230
+ max-height: 0 !important;
231
+ opacity: 0;
232
+ pointer-events: none;
233
+ }
234
+
235
+ /* ── Form elements ──────────────────────────────────────────────────── */
236
+ .form-row { display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; }
237
+ .form-row.cols-3 { grid-template-columns: 1fr 1fr 1fr; }
238
+ .form-group { display: flex; flex-direction: column; gap: 0.3rem; }
239
+ .form-group.full { grid-column: 1 / -1; }
240
+
241
+ label { font-size: 0.78rem; color: var(--text-muted); font-weight: 500; letter-spacing: 0.01em; }
242
+
243
+ input[type="text"],
244
+ input[type="password"],
245
+ input[type="number"],
246
+ textarea,
247
+ select {
248
+ background: rgba(7,7,26,0.7);
249
+ border: 1px solid rgba(130,100,255,0.2);
250
+ border-radius: var(--radius-sm);
251
+ color: var(--text);
252
+ padding: 0.65rem 0.9rem;
253
+ font-family: 'Inter', sans-serif;
254
+ font-size: 0.88rem;
255
+ width: 100%;
256
+ transition: border-color var(--transition), box-shadow var(--transition);
257
+ outline: none;
258
+ }
259
+
260
+ input::placeholder, textarea::placeholder { color: var(--text-dim); }
261
+
262
+ input:focus, textarea:focus, select:focus {
263
+ border-color: var(--purple);
264
+ box-shadow: 0 0 0 3px rgba(124,58,237,0.2);
265
+ }
266
+
267
+ select option { background: var(--surface-2); }
268
+ textarea { resize: vertical; min-height: 88px; line-height: 1.5; }
269
+
270
+ .field-hint { font-size: 0.72rem; color: var(--text-dim); line-height: 1.4; margin-top: 0.2rem; }
271
+
272
+ /* ── Cookie tabs ────────────────────────────────────────────────────── */
273
+ .cookie-tabs { display: flex; gap: 0.3rem; margin-bottom: 0.5rem; }
274
+
275
+ .cookie-tab-btn {
276
+ padding: 0.25rem 0.7rem;
277
+ font-size: 0.72rem;
278
+ font-weight: 600;
279
+ border: 1px solid rgba(130,100,255,0.25);
280
+ border-radius: 6px;
281
+ background: transparent;
282
+ color: var(--text-muted);
283
+ cursor: pointer;
284
+ transition: var(--transition);
285
+ }
286
+
287
+ .cookie-tab-btn.active {
288
+ background: rgba(124,58,237,0.25);
289
+ color: #c4b5fd;
290
+ border-color: rgba(124,58,237,0.5);
291
+ }
292
+
293
+ /* ── Tag hint ───────────────────────────────────────────────────────── */
294
+ .tag-hint {
295
+ display: inline-flex;
296
+ align-items: center;
297
+ gap: 0.3rem;
298
+ font-size: 0.72rem;
299
+ color: var(--cyan);
300
+ background: rgba(6,182,212,0.1);
301
+ border: 1px solid rgba(6,182,212,0.25);
302
+ border-radius: 6px;
303
+ padding: 0.15rem 0.55rem;
304
+ margin-top: 0.3rem;
305
+ }
306
+
307
+ /* ── Portal chips ───────────────────────────────────────────────────── */
308
+ .portal-grid {
309
+ display: grid;
310
+ grid-template-columns: repeat(auto-fill, minmax(160px, 1fr));
311
+ gap: 0.5rem;
312
+ }
313
+
314
+ .portal-chip {
315
+ display: flex;
316
+ align-items: center;
317
+ gap: 0.5rem;
318
+ padding: 0.55rem 0.75rem;
319
+ border: 1px solid rgba(130,100,255,0.2);
320
+ border-radius: var(--radius-sm);
321
+ cursor: pointer;
322
+ background: rgba(7,7,26,0.5);
323
+ transition: var(--transition);
324
+ user-select: none;
325
+ }
326
+
327
+ .portal-chip:hover { border-color: rgba(130,100,255,0.45); background: rgba(124,58,237,0.1); }
328
+ .portal-chip input[type="checkbox"] { display: none; }
329
+ .portal-chip.checked { border-color: var(--purple); background: rgba(124,58,237,0.2); }
330
+
331
+ .chip-label { font-size: 0.82rem; font-weight: 500; color: var(--text-muted); }
332
+ .portal-chip.checked .chip-label { color: var(--text); }
333
+
334
+ .chip-dot {
335
+ width: 8px;
336
+ height: 8px;
337
+ border-radius: 50%;
338
+ background: var(--text-dim);
339
+ flex-shrink: 0;
340
+ transition: var(--transition);
341
+ }
342
+
343
+ .portal-chip.checked .chip-dot { background: var(--purple-light); }
344
+
345
+ /* ── Submit button ──────────────────────────────────────────────────── */
346
+ .btn-submit {
347
+ display: flex;
348
+ align-items: center;
349
+ justify-content: center;
350
+ gap: 0.6rem;
351
+ width: 100%;
352
+ padding: 1rem;
353
+ background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 50%, #0891b2 100%);
354
+ border: none;
355
+ border-radius: var(--radius);
356
+ color: #fff;
357
+ font-family: 'Space Grotesk', sans-serif;
358
+ font-size: 1rem;
359
+ font-weight: 600;
360
+ cursor: pointer;
361
+ transition: opacity var(--transition), transform var(--transition), box-shadow var(--transition);
362
+ letter-spacing: 0.02em;
363
+ margin-top: 0.5rem;
364
+ position: relative;
365
+ overflow: hidden;
366
+ }
367
+
368
+ .btn-submit::before {
369
+ content: '';
370
+ position: absolute;
371
+ inset: 0;
372
+ background: linear-gradient(135deg, rgba(255,255,255,0.12), transparent);
373
+ opacity: 0;
374
+ transition: opacity var(--transition);
375
+ }
376
+
377
+ .btn-submit:hover::before { opacity: 1; }
378
+ .btn-submit:hover { transform: translateY(-2px); box-shadow: 0 8px 32px rgba(124,58,237,0.45); }
379
+ .btn-submit:active { transform: translateY(0); }
380
+ .btn-submit:disabled { opacity: 0.65; pointer-events: none; cursor: not-allowed; transform: none; }
381
+
382
+ /* ── Spinner ────────────────────────────────────────────────────────── */
383
+ .spinner {
384
+ display: none;
385
+ width: 18px;
386
+ height: 18px;
387
+ border: 2.5px solid rgba(255,255,255,0.3);
388
+ border-top-color: #fff;
389
+ border-radius: 50%;
390
+ animation: spin 0.7s linear infinite;
391
+ flex-shrink: 0;
392
+ }
393
+
394
+ @keyframes spin { to { transform: rotate(360deg); } }
395
+
396
+ /* ── Alert ──────────────────────────────────────────────────────────── */
397
+ .alert {
398
+ border-radius: var(--radius);
399
+ padding: 1rem 1.25rem;
400
+ margin-bottom: 1.5rem;
401
+ font-size: 0.88rem;
402
+ border: 1px solid;
403
+ display: flex;
404
+ gap: 0.6rem;
405
+ align-items: flex-start;
406
+ }
407
+
408
+ .alert-error {
409
+ background: rgba(239,68,68,0.08);
410
+ border-color: rgba(239,68,68,0.3);
411
+ color: #fca5a5;
412
+ }
413
+
414
+ /* ── Results section ────────────────────────────────────────────────── */
415
+ .results-section { margin-top: 2.5rem; }
416
+
417
+ .results-header {
418
+ display: flex;
419
+ align-items: center;
420
+ gap: 0.6rem;
421
+ margin-bottom: 1.5rem;
422
+ }
423
+
424
+ .results-header h2 {
425
+ font-family: 'Space Grotesk', sans-serif;
426
+ font-size: 1.3rem;
427
+ font-weight: 700;
428
+ background: linear-gradient(135deg, var(--cyan), var(--purple-light));
429
+ -webkit-background-clip: text;
430
+ -webkit-text-fill-color: transparent;
431
+ background-clip: text;
432
+ }
433
+
434
+ .stats-strip {
435
+ font-size: 0.8rem;
436
+ color: var(--text-dim);
437
+ background: rgba(255,255,255,0.04);
438
+ border: 1px solid var(--border);
439
+ border-radius: 8px;
440
+ padding: 0.4rem 0.9rem;
441
+ margin-left: auto;
442
+ }
443
+
444
+ /* ── Sentiment cards ───────────────────────────────────────────────── */
445
+ .sentiment-grid {
446
+ display: grid;
447
+ grid-template-columns: repeat(3, 1fr);
448
+ gap: 1rem;
449
+ margin-bottom: 1.5rem;
450
+ }
451
+
452
+ .s-card {
453
+ border-radius: var(--radius);
454
+ padding: 1.4rem 1rem;
455
+ text-align: center;
456
+ border: 1px solid;
457
+ position: relative;
458
+ overflow: hidden;
459
+ }
460
+
461
+ .s-card::before { content: ''; position: absolute; inset: 0; opacity: 0.06; border-radius: inherit; }
462
+
463
+ .s-card.positif { background: rgba(34,197,94,0.08); border-color: rgba(34,197,94,0.3); }
464
+ .s-card.positif::before { background: #22c55e; }
465
+ .s-card.negatif { background: rgba(239,68,68,0.08); border-color: rgba(239,68,68,0.3); }
466
+ .s-card.negatif::before { background: #ef4444; }
467
+ .s-card.netral { background: rgba(148,163,184,0.06); border-color: rgba(148,163,184,0.2); }
468
+ .s-card.netral::before { background: #94a3b8; }
469
+
470
+ .s-count { font-family: 'Space Grotesk', sans-serif; font-size: 2.8rem; font-weight: 700; line-height: 1; margin-bottom: 0.3rem; }
471
+ .s-card.positif .s-count { color: #4ade80; }
472
+ .s-card.negatif .s-count { color: #f87171; }
473
+ .s-card.netral .s-count { color: #94a3b8; }
474
+
475
+ .s-label { font-size: 0.82rem; color: var(--text-muted); font-weight: 500; }
476
+
477
+ .s-bar-wrap { margin-top: 0.8rem; height: 4px; background: rgba(255,255,255,0.08); border-radius: 100px; overflow: hidden; }
478
+ .s-bar { height: 100%; border-radius: 100px; transition: width 1.2s cubic-bezier(0.4,0,0.2,1); }
479
+ .s-card.positif .s-bar { background: linear-gradient(90deg, #16a34a, #4ade80); }
480
+ .s-card.negatif .s-bar { background: linear-gradient(90deg, #b91c1c, #f87171); }
481
+ .s-card.netral .s-bar { background: linear-gradient(90deg, #475569, #94a3b8); }
482
+
483
+ /* ── Word cloud ─────────────────────────────────────────────────────── */
484
+ .wordcloud-card {
485
+ background: var(--surface);
486
+ border: 1px solid var(--border);
487
+ border-radius: var(--radius);
488
+ padding: 1.5rem;
489
+ text-align: center;
490
+ }
491
+
492
+ .wordcloud-card h3 {
493
+ font-family: 'Space Grotesk', sans-serif;
494
+ font-size: 1rem;
495
+ color: var(--purple-light);
496
+ margin-bottom: 1rem;
497
+ }
498
+
499
+ .wordcloud-img { max-width: 100%; border-radius: 10px; border: 1px solid var(--border); }
500
+
501
+ /* ── Divider ────────────────────────────────────────────────────────── */
502
+ .divider {
503
+ display: flex;
504
+ align-items: center;
505
+ gap: 0.75rem;
506
+ color: var(--text-dim);
507
+ font-size: 0.75rem;
508
+ margin: 0.75rem 0;
509
+ }
510
+
511
+ .divider::before, .divider::after { content: ''; flex: 1; height: 1px; background: var(--border); }
512
+
513
+ /* ── Section label ──────────────────────────────────────────────────── */
514
+ .section-label {
515
+ font-size: 0.7rem;
516
+ font-weight: 700;
517
+ text-transform: uppercase;
518
+ letter-spacing: 0.08em;
519
+ color: var(--text-dim);
520
+ margin-bottom: 0.6rem;
521
+ }
522
+
523
+ /* ── File upload ────────────────────────────────────────────────────── */
524
+ .upload-zone {
525
+ border: 2px dashed rgba(130,100,255,0.28);
526
+ border-radius: var(--radius);
527
+ padding: 2.5rem 1.5rem;
528
+ text-align: center;
529
+ transition: var(--transition);
530
+ cursor: pointer;
531
+ background: rgba(124,58,237,0.04);
532
+ position: relative;
533
+ }
534
+
535
+ .upload-zone:hover, .upload-zone.drag-over { border-color: var(--purple); background: rgba(124,58,237,0.1); }
536
+
537
+ .upload-zone input[type="file"] { position: absolute; inset: 0; opacity: 0; cursor: pointer; width: 100%; height: 100%; }
538
+
539
+ .upload-icon { font-size: 2rem; margin-bottom: 0.5rem; }
540
+ .upload-text { font-size: 0.9rem; color: var(--text-muted); }
541
+ .upload-sub { font-size: 0.78rem; color: var(--text-dim); margin-top: 0.3rem; }
542
+ .upload-filename { display: none; margin-top: 0.6rem; font-size: 0.82rem; color: var(--cyan); font-weight: 500; }
543
+
544
+ /* ── Responsive ─────────────────────────────────────────────────────── */
545
+ @media (max-width: 640px) {
546
+ .form-row { grid-template-columns: 1fr; }
547
+ .form-row.cols-3 { grid-template-columns: 1fr 1fr; }
548
+ .sentiment-grid { grid-template-columns: 1fr; }
549
+ .tab-btn span.tab-text { display: none; }
550
+ .hero h1 { font-size: 1.8rem; }
551
+ }
552
+
553
+ /* ── Animations ─────────────────────────────────────────────────────── */
554
+ @keyframes fadeUp {
555
+ from { opacity: 0; transform: translateY(20px); }
556
+ to { opacity: 1; transform: translateY(0); }
557
+ }
558
+
559
+ .animate-in { animation: fadeUp 0.5s ease both; }
560
+ .delay-1 { animation-delay: 0.05s; }
561
+ .delay-2 { animation-delay: 0.10s; }
562
+ .delay-3 { animation-delay: 0.15s; }
563
+ .delay-4 { animation-delay: 0.20s; }
564
+ .delay-5 { animation-delay: 0.25s; }
565
+ </style>
566
+ </head>
567
+ <body>
568
+ <div class="wrapper">
569
+
570
+ <!-- Hero -->
571
+ <header class="hero animate-in">
572
+ <div class="hero-badge">🔬 AI-Powered</div>
573
+ <h1>SentiScope</h1>
574
+ <p>Analisis sentimen media sosial otomatis dengan IndoBERT — Instagram, TikTok, Facebook &amp; Berita Online.</p>
575
+ </header>
576
+
577
+ <!-- Error alert -->
578
+ {% if error %}
579
+ <div class="alert alert-error animate-in" role="alert">
580
+ <span>⚠️</span>
581
+ <span>{{ error }}</span>
582
+ </div>
583
+ {% endif %}
584
+
585
+ <!-- Tab navigation -->
586
+ <nav class="tab-nav animate-in delay-1" role="tablist">
587
+ <button class="tab-btn {% if active_tab != 'dataset' %}active{% endif %}"
588
+ id="tab-scraping" role="tab" onclick="switchTab('scraping')">
589
+ <span class="tab-icon">🕷️</span>
590
+ <span class="tab-text">Scraping Otomatis</span>
591
+ </button>
592
+ <button class="tab-btn {% if active_tab == 'dataset' %}active{% endif %}"
593
+ id="tab-dataset" role="tab" onclick="switchTab('dataset')">
594
+ <span class="tab-icon">📂</span>
595
+ <span class="tab-text">Upload Dataset</span>
596
+ </button>
597
+ </nav>
598
+
599
+ <!-- ═══════════════════════ TAB 1: Scraping ═══════════════════════════ -->
600
+ <div class="tab-panel {% if active_tab != 'dataset' %}active{% endif %}" id="panel-scraping">
601
+ <form id="scraping-form" action="/process" method="post">
602
+
603
+ <!-- Hidden enable flags β€” managed by JS toggles -->
604
+ <input type="hidden" id="enable_instagram" name="enable_instagram" value="">
605
+ <input type="hidden" id="enable_tiktok" name="enable_tiktok" value="">
606
+ <input type="hidden" id="enable_facebook" name="enable_facebook" value="">
607
+ <input type="hidden" id="enable_news" name="enable_news" value="">
608
+
609
+ <!-- ── Instagram ──────────────────────────────────────────────── -->
610
+ <div class="card animate-in delay-2">
611
+ <div class="platform-header">
612
+ <div class="platform-title">
613
+ <div class="platform-icon pi-instagram">📸</div>
614
+ Instagram
615
+ </div>
616
+ <div class="toggle-wrap">
617
+ <span class="toggle-label" id="ig-toggle-label">Nonaktif</span>
618
+ <label class="toggle">
619
+ <input type="checkbox" id="ig-toggle" onchange="togglePlatform('ig')">
620
+ <span class="slider"></span>
621
+ </label>
622
+ </div>
623
+ </div>
624
+ <div class="platform-fields collapsed" id="ig-fields" style="max-height:600px;">
625
+ <div class="form-row" style="margin-bottom:0.9rem;">
626
+ <div class="form-group">
627
+ <label for="ig_username">Username Instagram</label>
628
+ <input id="ig_username" type="text" name="ig_username" placeholder="akun_instagram" autocomplete="username">
629
+ </div>
630
+ <div class="form-group">
631
+ <label for="ig_password">Password Instagram</label>
632
+ <input id="ig_password" type="password" name="ig_password" placeholder="••••••••" autocomplete="current-password">
633
+ </div>
634
+ </div>
635
+ <div class="form-row">
636
+ <div class="form-group full">
637
+ <label for="target_accounts">Target Akun / #Hashtag (satu per baris)</label>
638
+ <textarea id="target_accounts" name="target_accounts"
639
+ placeholder="cirebonkab&#10;@rctvcirebon&#10;#jalanrusak"></textarea>
640
+ <span class="tag-hint">↡ Satu target per baris, @ dan # opsional</span>
641
+ </div>
642
+ <div class="form-group">
643
+ <label for="mode">Mode Waktu</label>
644
+ <select id="mode" name="mode">
645
+ <option value="all">Semua Postingan</option>
646
+ <option value="date">7 Bulan Terakhir</option>
647
+ </select>
648
+ </div>
649
+ </div>
650
+ </div>
651
+ </div>
652
+
653
+ <!-- ── TikTok ──────────────────────────────────────────────────── -->
654
+ <div class="card animate-in delay-3">
655
+ <div class="platform-header">
656
+ <div class="platform-title">
657
+ <div class="platform-icon pi-tiktok">🎡</div>
658
+ TikTok
659
+ </div>
660
+ <div class="toggle-wrap">
661
+ <span class="toggle-label" id="tt-toggle-label">Nonaktif</span>
662
+ <label class="toggle">
663
+ <input type="checkbox" id="tt-toggle" onchange="togglePlatform('tt')">
664
+ <span class="slider"></span>
665
+ </label>
666
+ </div>
667
+ </div>
668
+ <div class="platform-fields collapsed" id="tt-fields" style="max-height:500px;">
669
+ <div class="form-group" style="margin-bottom:0.9rem;">
670
+ <label>Format Cookie TikTok</label>
671
+ <div class="cookie-tabs">
672
+ <button type="button" class="cookie-tab-btn active" onclick="setCookieHint('raw',this)">String Mentah</button>
673
+ <button type="button" class="cookie-tab-btn" onclick="setCookieHint('json_arr',this)">JSON Array</button>
674
+ <button type="button" class="cookie-tab-btn" onclick="setCookieHint('json_obj',this)">JSON Object</button>
675
+ </div>
676
+ <textarea id="tiktok_cookie" name="tiktok_cookie"
677
+ placeholder="sessionid=xxx; tt_webid=yyy; ..."
678
+ style="min-height:70px;font-family:monospace;font-size:0.8rem;"></textarea>
679
+ <p class="field-hint" id="cookie-hint">
680
+ Format: <code>sessionid=ABC; tt_webid=123</code> — ambil dari DevTools → Application → Cookies → tiktok.com
681
+ </p>
682
+ </div>
683
+ <div class="form-group">
684
+ <label for="tiktok_targets">Target Username TikTok (satu per baris)</label>
685
+ <textarea id="tiktok_targets" name="tiktok_targets"
686
+ placeholder="@rctvcirebon&#10;@cirebonnews&#10;kuningan_update"></textarea>
687
+ <span class="tag-hint">↡ Satu username per baris, @ opsional</span>
688
+ </div>
689
+ </div>
690
+ </div>
691
+
692
+ <!-- ── Facebook ────────────────────────────────────────────────── -->
693
+ <div class="card animate-in delay-3">
694
+ <div class="platform-header">
695
+ <div class="platform-title">
696
+ <div class="platform-icon pi-facebook">📘</div>
697
+ Facebook
698
+ </div>
699
+ <div class="toggle-wrap">
700
+ <span class="toggle-label" id="fb-toggle-label">Nonaktif</span>
701
+ <label class="toggle">
702
+ <input type="checkbox" id="fb-toggle" onchange="togglePlatform('fb')">
703
+ <span class="slider"></span>
704
+ </label>
705
+ </div>
706
+ </div>
707
+ <div class="platform-fields collapsed" id="fb-fields" style="max-height:500px;">
708
+ <div class="form-row" style="margin-bottom:0.9rem;">
709
+ <div class="form-group">
710
+ <label for="fb_username">Email / No. HP Facebook</label>
711
+ <input id="fb_username" type="text" name="fb_username" placeholder="email@contoh.com" autocomplete="username">
712
+ </div>
713
+ <div class="form-group">
714
+ <label for="fb_password">Password Facebook</label>
715
+ <input id="fb_password" type="password" name="fb_password" placeholder="••••••••" autocomplete="current-password">
716
+ </div>
717
+ </div>
718
+ <div class="form-group">
719
+ <label for="facebook_groups">URL Grup Facebook (satu per baris, wajib diisi)</label>
720
+ <textarea id="facebook_groups" name="facebook_groups"
721
+ placeholder="https://web.facebook.com/groups/123456&#10;https://web.facebook.com/groups/teraswarga"></textarea>
722
+ <p class="field-hint">⚠️ Harus diisi — tidak ada grup default. Jika kosong, Facebook tidak akan di-scrape.</p>
723
+ </div>
724
+ </div>
725
+ </div>
726
+
727
+ <!-- ── Berita Online ───────────────────────────────────────────── -->
728
+ <div class="card animate-in delay-4">
729
+ <div class="platform-header">
730
+ <div class="platform-title">
731
+ <div class="platform-icon pi-news">📰</div>
732
+ Berita Online
733
+ </div>
734
+ <div class="toggle-wrap">
735
+ <span class="toggle-label" id="news-toggle-label">Nonaktif</span>
736
+ <label class="toggle">
737
+ <input type="checkbox" id="news-toggle" onchange="togglePlatform('news')">
738
+ <span class="slider"></span>
739
+ </label>
740
+ </div>
741
+ </div>
742
+ <div class="platform-fields collapsed" id="news-fields" style="max-height:500px;">
743
+ <div class="section-label">Pilih Portal (bisa lebih dari satu)</div>
744
+ <div class="portal-grid" id="portal-grid">
745
+ <label class="portal-chip" onclick="toggleChip(this)">
746
+ <input type="checkbox" name="_portal_detik" value="detik">
747
+ <span class="chip-dot"></span><span class="chip-label">Detik.com</span>
748
+ </label>
749
+ <label class="portal-chip" onclick="toggleChip(this)">
750
+ <input type="checkbox" name="_portal_antara" value="antara">
751
+ <span class="chip-dot"></span><span class="chip-label">Antara News</span>
752
+ </label>
753
+ <label class="portal-chip" onclick="toggleChip(this)">
754
+ <input type="checkbox" name="_portal_radar" value="radar">
755
+ <span class="chip-dot"></span><span class="chip-label">Radar (Disway)</span>
756
+ </label>
757
+ <label class="portal-chip" onclick="toggleChip(this)">
758
+ <input type="checkbox" name="_portal_radarcirebon" value="radarcirebon">
759
+ <span class="chip-dot"></span><span class="chip-label">Radar Cirebon ID</span>
760
+ </label>
761
+ <label class="portal-chip" onclick="toggleChip(this)">
762
+ <input type="checkbox" name="_portal_cnn" value="cnn">
763
+ <span class="chip-dot"></span><span class="chip-label">CNN Indonesia</span>
764
+ </label>
765
+ </div>
766
+ <!-- Hidden field filled by JS -->
767
+ <input type="hidden" id="news_portals" name="news_portals" value="">
768
+ <div class="form-row" style="margin-top:1rem;">
769
+ <div class="form-group">
770
+ <label for="news_keyword">Keyword Pencarian</label>
771
+ <input id="news_keyword" type="text" name="news_keyword" value="kabupaten cirebon" placeholder="kabupaten cirebon">
772
+ </div>
773
+ <div class="form-group">
774
+ <label for="news_pages">Jumlah Halaman per Portal</label>
775
+ <input id="news_pages" type="number" name="news_pages" value="1" min="1" max="20">
776
+ </div>
777
+ </div>
778
+ </div>
779
+ </div>
780
+
781
+ <button class="btn-submit animate-in delay-5" type="submit" id="scraping-submit">
782
+ <span class="spinner" id="scraping-spinner"></span>
783
+ <span id="scraping-btn-text">⚑ Mulai Scraping &amp; Analisis</span>
784
+ </button>
785
+ </form>
786
+ </div>
787
+
788
+ <!-- ═══════════════════════ TAB 2: Dataset ════════════════════════════ -->
789
+ <div class="tab-panel {% if active_tab == 'dataset' %}active{% endif %}" id="panel-dataset">
790
+ <form id="dataset-form" action="/wordcloud-dataset" method="post" enctype="multipart/form-data">
791
+ <div class="card animate-in">
792
+ <div class="platform-header">
793
+ <div class="platform-title">
794
+ <div class="platform-icon pi-dataset">📂</div>
795
+ Upload Dataset
796
+ </div>
797
+ </div>
798
+
799
+ <div class="form-group" style="margin-bottom:1.25rem;">
800
+ <label>File Dataset (CSV, JSON, atau TXT)</label>
801
+ <div class="upload-zone" id="upload-zone">
802
+ <input type="file" name="dataset_file" id="dataset_file"
803
+ accept=".csv,.json,.txt,.tsv"
804
+ onchange="showFilename(this)">
805
+ <div class="upload-icon">📁</div>
806
+ <div class="upload-text">Klik atau seret file ke sini</div>
807
+ <div class="upload-sub">Mendukung .csv, .json, .txt — maks 50 MB</div>
808
+ <div class="upload-filename" id="upload-filename">✓ <span></span></div>
809
+ </div>
810
+ </div>
811
+
812
+ <div class="form-group" style="margin-bottom:1.25rem;">
813
+ <label for="text_column">Nama Kolom Teks (untuk CSV/JSON)</label>
814
+ <input id="text_column" type="text" name="text_column" value="text" placeholder="text / content / komentar">
815
+ <p class="field-hint">Kolom yang berisi teks yang akan dianalisis. Kosongkan untuk pakai kolom pertama.</p>
816
+ </div>
817
+
818
+ <div class="divider">atau paste teks langsung</div>
819
+
820
+ <div class="form-group">
821
+ <label for="dataset_text">Teks Dataset (satu dokumen/kalimat per baris)</label>
822
+ <textarea id="dataset_text" name="dataset_text" style="min-height:140px;"
823
+ placeholder="Masukkan teks di sini, satu kalimat per baris...&#10;Cirebon semakin maju dengan infrastruktur yang baik&#10;Jalan di daerah X masih rusak parah"></textarea>
824
+ </div>
825
+ </div>
826
+
827
+ <button class="btn-submit" type="submit" id="dataset-submit">
828
+ <span class="spinner" id="dataset-spinner"></span>
829
+ <span id="dataset-btn-text">☁️ Buat Word Cloud &amp; Analisis Sentimen</span>
830
+ </button>
831
+ </form>
832
+ </div>
833
+
834
+ <!-- ═══════════════════════ Hasil Analisis ════════════════════════════ -->
835
+ {% if result %}
836
+ <section class="results-section animate-in">
837
+ <div class="results-header">
838
+ <h2>📊 Hasil Analisis Sentimen</h2>
839
+ <span class="stats-strip">{{ total_scraped }} teks dikumpulkan · {{ result.total }} dianalisis</span>
840
+ </div>
841
+
842
+ {% if csv_filename %}
843
+ <div style="margin-bottom: 1.5rem;">
844
+ <a href="{{ csv_filename }}" download class="btn-submit" style="display:inline-flex; width:auto; padding:0.7rem 1.25rem; background:linear-gradient(135deg, #059669, #10b981); text-decoration:none; font-size:0.9rem;">
845
+ 📥 Download Data Scraping (CSV)
846
+ </a>
847
+ </div>
848
+ {% endif %}
849
+
850
+ <div class="sentiment-grid">
851
+ {% set total = result.total if result.total > 0 else 1 %}
852
+ <div class="s-card positif">
853
+ <div class="s-count" id="count-pos">0</div>
854
+ <div class="s-label">😊 Positif</div>
855
+ <div class="s-bar-wrap"><div class="s-bar" id="bar-pos" style="width:0%"></div></div>
856
+ </div>
857
+ <div class="s-card negatif">
858
+ <div class="s-count" id="count-neg">0</div>
859
+ <div class="s-label">😠 Negatif</div>
860
+ <div class="s-bar-wrap"><div class="s-bar" id="bar-neg" style="width:0%"></div></div>
861
+ </div>
862
+ <div class="s-card netral">
863
+ <div class="s-count" id="count-neu">0</div>
864
+ <div class="s-label">😐 Netral</div>
865
+ <div class="s-bar-wrap"><div class="s-bar" id="bar-neu" style="width:0%"></div></div>
866
+ </div>
867
+ </div>
868
+
869
+ {% if image %}
870
+ <div class="wordcloud-card">
871
+ <h3>☁️ Word Cloud</h3>
872
+ <img class="wordcloud-img" src="data:image/png;base64,{{ image }}" alt="Word Cloud">
873
+ </div>
874
+ {% endif %}
875
+ </section>
876
+
877
+ <script>
878
+ (function () {
879
+ var pos = {{ result.positif }};
880
+ var neg = {{ result.negatif }};
881
+ var neu = {{ result.netral }};
882
+ var total = {{ result.total if result.total > 0 else 1 }};
883
+
884
+ function animCount(el, target) {
885
+ var start = 0;
886
+ var step = Math.max(1, Math.ceil(target / 40));
887
+ var timer = setInterval(function () {
888
+ start = Math.min(start + step, target);
889
+ el.textContent = start;
890
+ if (start >= target) clearInterval(timer);
891
+ }, 25);
892
+ }
893
+
894
+ setTimeout(function () {
895
+ animCount(document.getElementById('count-pos'), pos);
896
+ animCount(document.getElementById('count-neg'), neg);
897
+ animCount(document.getElementById('count-neu'), neu);
898
+ document.getElementById('bar-pos').style.width = (pos / total * 100).toFixed(1) + '%';
899
+ document.getElementById('bar-neg').style.width = (neg / total * 100).toFixed(1) + '%';
900
+ document.getElementById('bar-neu').style.width = (neu / total * 100).toFixed(1) + '%';
901
+ }, 300);
902
+ })();
903
+ </script>
904
+ {% endif %}
905
+
906
+ </div><!-- /wrapper -->
907
+
908
+ <script>
909
+ // ── Tab switching ─────────────────────────────────────────────────────────
910
+ function switchTab(name) {
911
+ document.querySelectorAll('.tab-btn').forEach(function (b) { b.classList.remove('active'); });
912
+ document.querySelectorAll('.tab-panel').forEach(function (p) { p.classList.remove('active'); });
913
+ document.getElementById('tab-' + name).classList.add('active');
914
+ document.getElementById('panel-' + name).classList.add('active');
915
+ }
916
+
917
+ // ── Platform toggle ───────────────────────────────────────────────────────
918
+ function togglePlatform(id) {
919
+ var fields = document.getElementById(id + '-fields');
920
+ var toggle = document.getElementById(id + '-toggle');
921
+ var label = document.getElementById(id + '-toggle-label');
922
+ var flagMap = { ig: 'enable_instagram', tt: 'enable_tiktok', fb: 'enable_facebook', news: 'enable_news' };
923
+
924
+ if (toggle.checked) {
925
+ fields.classList.remove('collapsed');
926
+ if (label) label.textContent = 'Aktif';
927
+ document.getElementById(flagMap[id]).value = '1';
928
+ } else {
929
+ fields.classList.add('collapsed');
930
+ if (label) label.textContent = 'Nonaktif';
931
+ document.getElementById(flagMap[id]).value = '';
932
+ }
933
+ }
934
+
935
+ // ── Portal chip multi-select ──────────────────────────────────────────────
936
+ function toggleChip(label) {
937
+ var cb = label.querySelector('input[type="checkbox"]');
938
+ cb.checked = !cb.checked;
939
+ label.classList.toggle('checked', cb.checked);
940
+ updatePortalField();
941
+ }
942
+
943
+ function updatePortalField() {
944
+ var vals = [];
945
+ document.querySelectorAll('#portal-grid .portal-chip.checked input').forEach(function (cb) {
946
+ vals.push(cb.value);
947
+ });
948
+ document.getElementById('news_portals').value = vals.join(',');
949
+ }
950
+
951
+ // ── Cookie format hints ───────────────────────────────────────────────────
952
+ var cookieHints = {
953
+ raw: 'Format: <code>sessionid=ABC; tt_webid=123</code> οΏ½οΏ½ ambil dari DevTools β†’ Application β†’ Cookies β†’ tiktok.com',
954
+ json_arr: 'Format JSON Array: <code>[{"name":"sessionid","value":"ABC","domain":".tiktok.com"}]</code>',
955
+ json_obj: 'Format JSON Object: <code>{"sessionid": "ABC", "tt_webid": "123"}</code>',
956
+ };
957
+
958
+ var cookiePlaceholders = {
959
+ raw: 'sessionid=xxx; tt_webid=yyy; ...',
960
+ json_arr: '[{"name":"sessionid","value":"xxx","domain":".tiktok.com"},...]',
961
+ json_obj: '{"sessionid": "xxx", "tt_webid": "yyy"}',
962
+ };
963
+
964
+ function setCookieHint(fmt, btn) {
965
+ document.querySelectorAll('.cookie-tab-btn').forEach(function (b) { b.classList.remove('active'); });
966
+ btn.classList.add('active');
967
+ document.getElementById('cookie-hint').innerHTML = cookieHints[fmt];
968
+ document.getElementById('tiktok_cookie').placeholder = cookiePlaceholders[fmt];
969
+ }
970
+
971
+ // ── File upload label ─────────────────────────────────────────────────────
972
+ function showFilename(input) {
973
+ var wrap = document.getElementById('upload-filename');
974
+ if (input.files && input.files[0]) {
975
+ wrap.style.display = 'block';
976
+ wrap.querySelector('span').textContent = input.files[0].name;
977
+ } else {
978
+ wrap.style.display = 'none';
979
+ }
980
+ }
981
+
982
+ // Drag-over styling
983
+ var zone = document.getElementById('upload-zone');
984
+ if (zone) {
985
+ zone.addEventListener('dragover', function (e) { e.preventDefault(); zone.classList.add('drag-over'); });
986
+ zone.addEventListener('dragleave', function () { zone.classList.remove('drag-over'); });
987
+ zone.addEventListener('drop', function () { zone.classList.remove('drag-over'); });
988
+ }
989
+
990
+ // ── Form submit spinners ──────────────────────────────────────────────────
991
+ function bindSubmit(formId, spinnerId, btnTextId, btnId, loadingText) {
992
+ var form = document.getElementById(formId);
993
+ if (!form) return;
994
+ form.addEventListener('submit', function () {
995
+ document.getElementById(btnId).disabled = true;
996
+ document.getElementById(spinnerId).style.display = 'inline-block';
997
+ document.getElementById(btnTextId).innerHTML = loadingText + '<span class="dots"><span></span><span></span><span></span></span>';
998
+ });
999
+ }
1000
+
1001
+ bindSubmit('scraping-form', 'scraping-spinner', 'scraping-btn-text', 'scraping-submit', 'Memproses (mungkin beberapa menit)');
1002
+ bindSubmit('dataset-form', 'dataset-spinner', 'dataset-btn-text', 'dataset-submit', 'Memproses dataset');
1003
+
1004
+ // Build news_portals on submit (capture phase)
1005
+ var sf = document.getElementById('scraping-form');
1006
+ if (sf) sf.addEventListener('submit', updatePortalField, true);
1007
+ </script>
1008
+ </body>
1009
+ </html>
web_scrapping.py ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Web Scrapping.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1OLoBK18jpB685Ivi8Zi3SzuVYiXJ9jRa
8
+ """
9
+
10
# Notebook-only setup. In the Colab notebook these were IPython shell magics
# ("!pip ..."), which are a SyntaxError in a plain Python module and made this
# file impossible to import. Install the dependencies from a shell instead:
#   pip install selenium
#   pip install webdriver-manager
12
+
13
+ # Detik.com
14
+
15
+ import requests
16
+ from bs4 import BeautifulSoup
17
+ import pandas as pd
18
+ import time
19
+ from datetime import datetime
20
+
21
def scrape_detik_search(keyword, max_pages=1):
    """Scrape Detik.com search results for ``keyword`` into a DataFrame.

    Requests up to ``max_pages`` search pages (``sortby=time``), opens every
    article linked from them and collects its title, date, navigation tags,
    body text and link.

    Args:
        keyword: Search phrase sent as the ``query`` parameter.
        max_pages: Maximum number of search-result pages to request.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; empty when nothing could be scraped.
    """
    base_search_url = "https://www.detik.com/search/searchall"
    # Without a timeout a stalled connection would block the run forever.
    request_timeout = 15
    results = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }

    for page in range(1, max_pages + 1):
        params = {
            'query': keyword,
            'siteid': '2',
            'sortby': 'time',
            'page': page
        }
        print(f"Scraping page {page}...")
        try:
            r = requests.get(base_search_url, params=params, headers=headers,
                             timeout=request_timeout)
        except requests.RequestException as e:
            # Stop paging but keep everything collected so far.
            print(f"Gagal request halaman search: {e}, hentikan scraping.")
            break
        if r.status_code != 200:
            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, 'html.parser')

        news_list = soup.find_all('div', class_='media')

        if not news_list:
            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
            break

        for news in news_list:
            # Best effort per article: any failure skips just this item.
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The publication time is read from the span's "d-time"
                # attribute and parsed as an integer Unix timestamp.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                if date_tag:
                    span_tag = date_tag.find('span')
                    if span_tag and span_tag.has_attr('d-time'):
                        news_date = datetime.fromtimestamp(int(span_tag['d-time']))

                # Fetch the article page itself (same headers, bounded wait).
                news_resp = requests.get(link, headers=headers, timeout=request_timeout)
                if news_resp.status_code != 200:
                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
                    continue

                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = news_soup.find('div', class_='detail__body-text') or \
                    news_soup.find('div', class_='detail_text')

                content_parts = []
                if content_div:
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are kept, prefixed with their level ("H2: ...").
                        if tag.name.startswith('h'):
                            content_parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            content_parts.append(text)
                content = '\n'.join(content_parts)

                # Tags come from the <a class="nav__item"> links inside div.nav.
                nav_div = news_soup.find('div', class_='nav')
                tags = []
                if nav_div:
                    tags = [a.text.strip() for a in nav_div.find_all('a', class_='nav__item')]

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })

                print(f"Berhasil scrape berita: {title}")

                time.sleep(1)

            except Exception as e:
                print(f"Error saat memproses berita: {e}")
                continue

        time.sleep(2)

    return pd.DataFrame(results)
127
+
128
if __name__ == "__main__":
    # Script entry point: scrape Detik for the keyword and save the rows as CSV.
    keyword = "Kabupaten Cirebon"
    output_path = "detik_berita_cirebonnn.csv"
    df = scrape_detik_search(keyword)
    if not df.empty:
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
        # Report the path actually written; the old message named another file.
        print(f"Selesai menyimpan data berita ke {output_path}")
    else:
        print("Tidak ada data yang berhasil di-scrape.")
136
+
137
+ # Radar Cirebon KW
138
+
139
+ import requests
140
+ from bs4 import BeautifulSoup
141
+ import pandas as pd
142
+ import time
143
+ from urllib.parse import quote_plus
144
+
145
+ BASE_HOST = "https://radarcirebon.disway.id"
146
+ BASE_SEARCH = BASE_HOST + "/search/kata/"
147
+
148
def make_search_url(keyword, page, per_page=30):
    """Build the radarcirebon.disway.id search URL for ``keyword`` and ``page``.

    Page 1 uses the bare search path; later pages insert the result offset
    ``(page - 1) * per_page`` twice into the path.
    """
    query_suffix = f"?c={quote_plus(keyword)}&num="
    if page == 1:
        return f"{BASE_SEARCH}{query_suffix}"
    skipped = (page - 1) * per_page
    return f"{BASE_SEARCH}{skipped}/{skipped}/{query_suffix}"
155
+
156
def absolute_url(href):
    """Return ``href`` as an absolute URL, resolving site-relative links
    against ``BASE_HOST``.

    Returns None for empty input. Protocol-relative links ("//host/path")
    are completed with ``https:`` rather than being glued onto ``BASE_HOST``.
    """
    if not href:
        return None
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Must be tested before the single-slash case below.
        return "https:" + href
    if href.startswith("/"):
        return BASE_HOST + href
    return BASE_HOST + "/" + href
165
+
166
def scrape_radar_cirebon(keyword, max_pages=100, per_page=30, delay_between_items=1.0, delay_between_pages=2.0):
    """Scrape radarcirebon.disway.id search results for ``keyword``.

    Args:
        keyword: Search phrase.
        max_pages: Maximum number of search-result pages to request.
        per_page: Page size used to compute the paging offset in the URL.
        delay_between_items: Seconds to sleep after each scraped article.
        delay_between_pages: Seconds to sleep after each processed page.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; empty when nothing could be scraped.
    """
    # Imported locally because this section of the file never imports `re`.
    # Previously the regex date fallback raised NameError, which the per-item
    # handler swallowed, silently dropping the article.
    import re

    # Matches e.g. "Rabu 22-08-2024" or "Selasa, 21 Agustus 2024".
    # Compiled once here instead of once per article.
    date_pattern = re.compile(r'\w+,\s*\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{2}-\d{2}-\d{4}')

    sess = requests.Session()
    sess.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    })

    results = []
    seen_links = set()

    for page in range(1, max_pages + 1):
        url = make_search_url(keyword, page, per_page)
        print(f"\nScraping page {page} -> {url}")
        try:
            r = sess.get(url, timeout=15)
        except Exception as e:
            print(f" ERROR: Gagal request halaman search: {e}")
            break

        if r.status_code != 200:
            print(f" ERROR: status code {r.status_code}, hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, "html.parser")

        # Try progressively more generic selectors for the result items.
        news_list = (
            soup.find_all(class_='media-heading')
            or soup.find_all('div', class_='media')
            or soup.find_all('article')
            or soup.select('ul.search-results li')
            or soup.select('div.search-result')
            or []
        )

        if not news_list:
            print(" Tidak ada berita ditemukan di halaman ini.")
            continue

        print(f" Ketemu {len(news_list)} item.")

        for item in news_list:
            # Best effort per article: any failure skips just this item.
            try:
                a = item.find('a', href=True) or item.select_one('a[href]')
                if not a:
                    continue

                link = absolute_url(a.get('href'))
                if not link or link in seen_links:
                    continue
                seen_links.add(link)

                title = a.get_text(strip=True)

                # Fetch the article page.
                try:
                    detail_r = sess.get(link, timeout=15)
                except Exception as e:
                    print(f" ERROR request detail {link}: {e}")
                    continue
                if detail_r.status_code != 200:
                    print(f" ERROR status {detail_r.status_code} for {link}")
                    continue

                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                # Title: prefer the article's <h1>, else the search-result text.
                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else title

                # Date, option 1: any element with class "date".
                date_text = None
                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
                if date_detail_tag:
                    print("Ditemukan dengan Target Langsung")
                    date_text = date_detail_tag.get_text(strip=True)

                # Date, option 2: span.date inside the "post-info" container.
                if not date_text:
                    post_info_div = detail_soup.find('div', class_='post-info')
                    if post_info_div:
                        tag_tanggal = post_info_div.find('span', class_='date')
                        if tag_tanggal:
                            print("Ditemukan dengan Target Kontainer")
                            date_text = tag_tanggal.get_text(strip=True)

                # Date, option 3: first text node that looks like a date.
                if not date_text:
                    found_text = detail_soup.find(string=date_pattern)
                    if found_text:
                        print("Ditemukan dengan Target Pola Teks (Regex)")
                        date_text = found_text.strip()

                # Body: first matching known container, else <article>,
                # else the whole document.
                content_container = None
                for cls in ('entry-content', 'post-content', 'article-body', 'detail__body-text', 'detail_text', 'content', 'article__content'):
                    content_container = detail_soup.find('div', class_=cls)
                    if content_container:
                        break
                if not content_container:
                    content_container = detail_soup.find('article')

                search_scope = content_container if content_container else detail_soup
                content_parts = []
                for p in search_scope.find_all('p'):
                    text = p.get_text(strip=True)
                    # Skip "read also" cross-links embedded in the body.
                    if text and 'Baca Juga:' not in text:
                        content_parts.append(text)
                content = "\n".join(content_parts)

                # Tags: links whose href contains '/listtag/'; the tag name
                # is read from the link's `title` attribute.
                tags = []
                try:
                    tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
                    for a_tag in tag_links:
                        tag_text = a_tag.get('title', '').strip()
                        if tag_text:
                            tags.append(tag_text)
                except Exception as e:
                    # A tag-extraction problem must not discard the article.
                    print(f" Terjadi error saat mencari tag: {e}")

                final_tags = ", ".join(tags) if tags else "-"

                results.append({
                    "judul": title_detail,
                    "tanggal": date_text,
                    "tag": final_tags,
                    "isi_berita": content,
                    "link": link
                })

                print(f" Berhasil: {title_detail} | Tags: {final_tags}")

                time.sleep(delay_between_items)

            except Exception as e:
                print(f" Error saat memproses item: {e}")
                continue

        time.sleep(delay_between_pages)

    df = pd.DataFrame(results)
    return df
323
+
324
if __name__ == "__main__":
    # Script entry point: scrape Radar Cirebon (disway) and save the rows as CSV.
    keyword = "kabupaten cirebon"
    output_path = "/content/drive/MyDrive/Machine Learning/Sentiment Analysis/radarcirebondisway_berita.csv"
    df = scrape_radar_cirebon(keyword, max_pages=100)
    if not df.empty:
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
        # Report the path actually written; the old message named another file.
        print(f"\nSelesai menyimpan data berita ke {output_path}")
    else:
        print("\nTidak ada data yang berhasil di-scrape.")
332
+
333
+ # Antara News
334
+
335
+ import requests
336
+ from bs4 import BeautifulSoup
337
+ import pandas as pd
338
+ import time
339
+ import re
340
+ import random
341
+ from urllib.parse import quote_plus, urlparse, urlunparse
342
+
343
+ BASE_HOST = "https://www.antaranews.com"
344
+ BASE_SEARCH = BASE_HOST + "/search"
345
+
346
def make_search_url(keyword, page):
    """Build the Antara News search URL for ``keyword``; pages other than the
    first carry an explicit ``page`` query parameter."""
    url = f"{BASE_SEARCH}?q={quote_plus(keyword)}"
    if page != 1:
        url = f"{url}&page={page}"
    return url
352
+
353
def absolute_url(href):
    """Return ``href`` as an absolute URL, resolving site-relative links
    against ``BASE_HOST``.

    Returns None for empty input. Protocol-relative links ("//host/path")
    are completed with ``https:`` rather than being glued onto ``BASE_HOST``.
    """
    if not href:
        return None
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Must be tested before the single-slash case below.
        return "https:" + href
    if href.startswith("/"):
        return BASE_HOST + href
    return BASE_HOST + "/" + href
362
+
363
def normalize_url(href):
    """Return a canonical form of ``href``: absolute, without query string or
    fragment, and without trailing slashes. Returns None for empty input."""
    if not href:
        return None
    pieces = urlparse(absolute_url(href))
    without_extras = pieces._replace(query="", fragment="")
    return urlunparse(without_extras).rstrip("/")
371
+
372
def get_with_retry(sess, url, max_retries=3, delay_range=(2, 5)):
    """GET ``url`` through ``sess``, retrying on any failure.

    Each attempt uses a 15 second timeout and counts as failed when the
    request raises or ``raise_for_status()`` raises. Between attempts (but
    not after the last one) the function sleeps a random duration drawn from
    ``delay_range``.

    Returns:
        The response of the first successful attempt, or None when all
        ``max_retries`` attempts failed.
    """
    attempt_no = 0
    while attempt_no < max_retries:
        attempt_no += 1
        try:
            response = sess.get(url, timeout=15)
            response.raise_for_status()
        except Exception as err:
            print(f" Percobaan {attempt_no} gagal: {err}")
            if attempt_no < max_retries:
                time.sleep(random.uniform(*delay_range))
        else:
            return response
    return None
384
+
385
def scrape_antaranews(keyword, max_pages=5, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
    """Scrape Antara News search results for ``keyword``.

    Args:
        keyword: Search phrase.
        max_pages: Maximum number of search-result pages to request.
        delay_between_items: (min, max) seconds slept after each article.
        delay_between_pages: (min, max) seconds slept after each processed page.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0 Safari/537.36'
    })

    rows = []
    visited = set()

    for page_no in range(1, max_pages + 1):
        search_url = make_search_url(keyword, page_no)
        print(f"\nScraping page {page_no} -> {search_url}")

        listing = get_with_retry(session, search_url)
        if not listing:
            print(" ERROR: Gagal request halaman search setelah retry.")
            continue

        listing_soup = BeautifulSoup(listing.text, "html.parser")

        # Collect every anchor whose href contains "/berita/", normalised.
        page_links = set()
        for anchor in listing_soup.select('a[href*="/berita/"]'):
            raw_href = anchor.get('href')
            if not raw_href:
                continue
            cleaned = normalize_url(raw_href)
            if cleaned:
                page_links.add(cleaned)

        fresh_links = page_links - visited
        print(f" Ketemu {len(page_links)} link artikel di halaman ini, {len(fresh_links)} link baru.")

        visited |= page_links

        for link in sorted(fresh_links):
            article = get_with_retry(session, link)
            if not article:
                print(f" ERROR: Gagal request detail {link}")
                continue

            page_soup = BeautifulSoup(article.text, "html.parser")

            # Title
            heading = page_soup.select_one('div.wrap__article-detail-title h1') or page_soup.find('h1')
            title_detail = heading.get_text(strip=True) if heading else ""

            # Date: text of the element enclosing the calendar icon, else a
            # regex search over the whole page text.
            date_detail = ""
            calendar = page_soup.select_one('i.fa-calendar') or page_soup.select_one('i.fas.fa-calendar')
            if calendar:
                holder = calendar.find_parent('li') or calendar.find_parent()
                if holder:
                    date_detail = holder.get_text(" ", strip=True)
            if not date_detail:
                whole_text = page_soup.get_text(" ", strip=True)
                hit = re.search(r'\b(?:[A-Za-z]+,\s*\d{1,2}\s+[A-Za-z]+ \d{4}\s*\d{1,2}:\d{2}\s*WIB|\d+\s+jam lalu|\bWIB\b)', whole_text)
                if hit:
                    date_detail = hit.group(0)

            # Body paragraphs, without "baca juga" cross-links.
            body = page_soup.find('div', class_='wrap__article-detail-content') \
                or page_soup.find('div', class_='detail__body-text') \
                or page_soup.find('article')
            scope = body if body else page_soup
            paragraphs = []
            for para in scope.find_all('p'):
                para_text = para.get_text(strip=True)
                if para_text and not para_text.lower().startswith("baca juga"):
                    paragraphs.append(para_text)
            content = "\n".join(paragraphs)

            # Tags: first look in a ul.list-inline carrying a tags icon ...
            tags = []
            for tag_list in page_soup.find_all('ul', class_='list-inline'):
                if not (tag_list.find('i', class_='fa-tags') or tag_list.find('i', class_='fas fa-tags')):
                    continue
                for tag_anchor in tag_list.find_all('a', href=True):
                    if '/tag/' in tag_anchor['href']:
                        label = tag_anchor.get('title') or tag_anchor.get_text(strip=True)
                        if label:
                            tags.append(label)
                if tags:
                    break
            # ... otherwise fall back to any "/tag/" link on the page.
            if not tags:
                for tag_anchor in page_soup.select('a[href*="/tag/"]'):
                    label = tag_anchor.get('title') or tag_anchor.get_text(strip=True)
                    if label:
                        tags.append(label)
            tags = list(dict.fromkeys(tags))  # de-duplicate, keep order
            tag_field = ", ".join(tags) if tags else "-"

            rows.append({
                "judul": title_detail,
                "tanggal": date_detail,
                "tag": tag_field,
                "isi_berita": content,
                "link": link
            })

            print(f" Berhasil: {title_detail} | Tanggal: {date_detail if date_detail else '-'} | Tags: {tag_field}")

            time.sleep(random.uniform(*delay_between_items))

        time.sleep(random.uniform(*delay_between_pages))

    return pd.DataFrame(rows)
489
+
490
if __name__ == "__main__":
    # Script entry point: scrape Antara News and save the rows as CSV.
    keyword = "kabupaten cirebon"
    df = scrape_antaranews(keyword, max_pages=100)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        df.to_csv("antaranews_berita.csv", index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke antaranews_berita.csv")
498
+
499
+ # Jalanin di IDE lokal karena butuh chrome (CNN)
500
+
501
+ import requests
502
+ from bs4 import BeautifulSoup
503
+ import pandas as pd
504
+ import time
505
+ import random
506
+ from urllib.parse import quote, urlparse, urlunparse
507
+ import re
508
+
509
+ from selenium import webdriver
510
+ from selenium.webdriver.chrome.service import Service
511
+ from selenium.webdriver.common.by import By
512
+ from selenium.webdriver.support.ui import WebDriverWait
513
+ from selenium.webdriver.support import expected_conditions as EC
514
+ from webdriver_manager.chrome import ChromeDriverManager
515
+ from selenium.common.exceptions import TimeoutException
516
+
517
+ BASE_HOST = "https://www.cnnindonesia.com"
518
+
519
+ # <<< DIUBAH: Fungsi ini dimodifikasi untuk menangani nomor halaman >>>
520
def make_search_url(keyword, page):
    """Build the CNN Indonesia search URL (latest results) for ``keyword``;
    pages other than the first carry an explicit ``page`` parameter."""
    first_page = f"{BASE_HOST}/search?query={quote(keyword)}&result_type=latest"
    return first_page if page == 1 else f"{first_page}&page={page}"
530
+
531
+ # --- Fungsi-fungsi pembantu lainnya tidak ada perubahan ---
532
def absolute_url(href):
    """Return ``href`` as an absolute URL, resolving site-relative links
    against ``BASE_HOST``.

    Returns None for empty input. Protocol-relative links ("//host/path")
    are completed with ``https:`` rather than being glued onto ``BASE_HOST``.
    """
    if not href:
        return None
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Must be tested before the single-slash case below.
        return "https:" + href
    if href.startswith("/"):
        return BASE_HOST + href
    return BASE_HOST + "/" + href
538
+
539
def normalize_url(href):
    """Return a canonical form of ``href``: absolute, without query string or
    fragment, and without trailing slashes. Returns None for empty input."""
    if not href:
        return None
    pieces = urlparse(absolute_url(href))
    without_extras = pieces._replace(query="", fragment="")
    return urlunparse(without_extras).rstrip("/")
545
+
546
def parse_cnn_date(raw_date):
    """Convert a CNN Indonesia timestamp to ``YYYY-MM-DD HH:MM``.

    Expects text shaped like ``"Selasa, 12 Agu 2025 11:01 WIB"``. If the text
    contains ``|``, only the part after the first ``|`` is used.

    The month is resolved through an explicit table rather than
    ``locale.setlocale`` + ``strptime``: changing the locale is a process-wide
    side effect, and it only worked where an Indonesian locale is installed.

    Returns:
        The formatted date; ``"-"`` for empty input; the stripped input text
        when it does not have the expected shape.
    """
    from datetime import datetime

    if not raw_date:
        return "-"
    if '|' in raw_date:
        raw_date = raw_date.split('|')[1]
    raw = raw_date.replace(" WIB", "").strip()

    # Keyed by the first three letters, lower-cased; covers Indonesian and
    # English names, abbreviated or written out.
    month_numbers = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'mei': 5, 'may': 5,
        'jun': 6, 'jul': 7, 'agu': 8, 'agt': 8, 'aug': 8, 'sep': 9,
        'okt': 10, 'oct': 10, 'nov': 11, 'des': 12, 'dec': 12,
    }
    try:
        # Drop the weekday before the comma, keep "12 Agu 2025 11:01".
        _, _, remainder = raw.partition(",")
        day, month_name, year, clock = remainder.split()
        hour, minute = clock.split(":")
        month = month_numbers[month_name[:3].lower()]
        # datetime() validates the ranges (e.g. rejects day 32).
        dt = datetime(int(year), month, int(day), int(hour), int(minute))
    except (KeyError, ValueError):
        return raw_date.strip()
    return dt.strftime("%Y-%m-%d %H:%M")
558
+
559
def looks_like_article_href(href):
    """Return True when ``href`` looks like a CNN Indonesia article link.

    The URL path must not contain any of the excluded section markers and
    must contain an id of the form ``/<14 digits>-<2-3 digits>-<6+ digits>``.
    """
    if not href:
        return False
    path = urlparse(href.strip()).path
    excluded_markers = ('/search', '/tag', '/kategori', '/author', '/channel', '/indeks', '/video', '/foto')
    for marker in excluded_markers:
        if marker in path:
            return False
    return re.search(r'/\d{14}-\d{2,3}-\d{6,}', path) is not None
566
+
567
+ HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"}
568
+
569
def fetch_article_detail(url, retries=3, delay=3):
    """Download ``url`` and return its body text, retrying on failure.

    An attempt fails when the request raises or the status is not 200.

    Args:
        url: Address to fetch (sent with the module-level ``HEADERS``).
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The response text of the first successful attempt, or None when
        every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                return resp.text
            print(f" WARNING: HTTP {resp.status_code} saat akses {url}")
        except Exception as e:
            print(f" WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
        # Wait only when another attempt follows; sleeping after the final
        # failure merely delayed returning None.
        if attempt < retries:
            time.sleep(delay)
    return None
578
+
579
def scrape_cnn_with_selenium(keyword, max_pages=3, delay_between_items=(1,2)):
    """Scrape CNN Indonesia search results for ``keyword``.

    The search pages are loaded in headless Chrome via Selenium; the article
    pages themselves are fetched with ``fetch_article_detail``.

    Args:
        keyword: Search phrase.
        max_pages: Maximum number of search-result pages to open.
        delay_between_items: (min, max) seconds slept after each article.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    results = []
    seen_links = set()

    print("Menginisialisasi browser Chrome...")
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=service, options=options)
    # try/finally guarantees the browser process is shut down even when a
    # page or an article raises unexpectedly.
    try:
        driver.set_page_load_timeout(30)

        for page in range(1, max_pages + 1):
            url = make_search_url(keyword, page)
            print(f"\nMembuka halaman {page} -> {url}")
            try:
                driver.get(url)
            except TimeoutException:
                # Page-load timeout: skip this page instead of aborting the run.
                print(f" WARNING: Waktu habis memuat halaman {page}, lanjut ke halaman berikutnya.")
                continue

            # The cookie pop-up is only looked for on the first page.
            if page == 1:
                try:
                    print("Mencari pop-up cookie...")
                    cookie_agree_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                    )
                    cookie_agree_button.click()
                    print(" Pop-up cookie ditemukan dan ditutup.")
                    time.sleep(2)
                except TimeoutException:
                    print(" Pop-up cookie tidak ditemukan, melanjutkan proses.")

            print(f"Mengambil data dari halaman {page}...")

            try:
                # Wait until the result list has rendered.
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                )
            except TimeoutException:
                print(f" WARNING: Waktu habis menunggu konten di halaman {page}. Mungkin halaman ini kosong.")
                continue  # a later page may still have content

            soup = BeautifulSoup(driver.page_source, "html.parser")

            link_elements = soup.select('div.nhl-list article a[href]')

            all_links_in_page = {normalize_url(a['href']) for a in link_elements if looks_like_article_href(a['href'])}
            new_links = all_links_in_page - seen_links

            if not new_links:
                # Not a stop condition: the next page may have new links.
                print(" Tidak ada link baru yang ditemukan di halaman ini.")

            print(f" Ditemukan {len(new_links)} link baru.")
            seen_links.update(new_links)

            for link in sorted(new_links):
                print(f" -> Memproses: {link}")
                html_detail = fetch_article_detail(link)
                if not html_detail:
                    continue
                detail_soup = BeautifulSoup(html_detail, "html.parser")

                title_el = detail_soup.select_one('h1')
                title_text = title_el.get_text(strip=True) if title_el else "-"

                date_el = detail_soup.select_one('div.text-cnn_grey.text-sm')
                date_text = parse_cnn_date(date_el.get_text(strip=True)) if date_el else "-"

                # Tags are the links in the sibling <div> following the
                # "TOPIK TERKAIT" title box. `string=` is the current name of
                # BeautifulSoup's deprecated `text=` argument.
                tags_list = []
                topik_terkait_header = detail_soup.find('div', class_='title-box', string=re.compile(r'\s*TOPIK TERKAIT\s*'))
                if topik_terkait_header:
                    tags_container = topik_terkait_header.find_next_sibling('div')
                    if tags_container:
                        tags_list = [tag.get_text(strip=True) for tag in tags_container.select('a')]

                content_parts = []
                content_container = detail_soup.select_one("div.detail-text")
                if content_container:
                    for p in content_container.find_all('p'):
                        text = p.get_text(" ", strip=True)
                        # Skip cross-links and the ad scroll hint.
                        if text and not text.lower().startswith(("lihat juga", "scroll to continue")):
                            content_parts.append(text)

                results.append({
                    "judul": title_text, "tanggal": date_text,
                    "tag": ", ".join(tags_list) if tags_list else "-",
                    "isi_berita": "\n".join(content_parts) if content_parts else "-", "link": link
                })
                print(f" Berhasil: {title_text} | Tanggal: {date_text}")
                time.sleep(random.uniform(*delay_between_items))
    finally:
        print("\nMenutup browser...")
        driver.quit()

    return pd.DataFrame(results)
680
+
681
if __name__ == "__main__":
    # Script entry point: scrape CNN Indonesia and save the rows as CSV.
    keyword = "kabupaten cirebon"
    df = scrape_cnn_with_selenium(keyword, max_pages=100)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        df.to_csv("cnnindonesia_berita_final.csv", index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke cnnindonesia_berita_final.csv")
689
+
690
+ # Radar Cirebon ID
691
+
692
+ import requests
693
+ from bs4 import BeautifulSoup
694
+ import pandas as pd
695
+ import time
696
+ import random
697
+ from urllib.parse import quote, urlparse, urlunparse
698
+ import re
699
+
700
+ # Mengganti BASE_HOST ke situs target yang baru
701
+ BASE_HOST = "https://radarcirebon.id"
702
+
703
def make_search_url(keyword, page):
    """Build the radarcirebon.id search URL for ``keyword`` and ``page``.

    Example: https://radarcirebon.id/search/kabupaten+cirebon/page/2/
    """
    # The site separates words with '+', so encoded spaces are converted.
    slug = quote(keyword).replace('%20', '+')
    page_suffix = "" if page == 1 else f"page/{page}/"
    return f"{BASE_HOST}/search/{slug}/{page_suffix}"
714
+
715
def normalize_url(href):
    """Return ``href`` as an absolute URL without query string, fragment or
    trailing slashes.

    Returns None for empty input and for links that are neither
    protocol-relative, site-relative, nor start with "http".
    """
    if not href:
        return None
    candidate = href.strip()
    if candidate.startswith("//"):
        candidate = f"https:{candidate}"
    elif candidate.startswith("/"):
        candidate = f"{BASE_HOST}{candidate}"
    elif not candidate.startswith("http"):
        return None  # not a usable link

    bare = urlparse(candidate)._replace(query="", fragment="")
    return urlunparse(bare).rstrip("/")
733
+
734
def parse_radarcirebon_date(raw_date):
    """Convert a date like ``'Selasa, 12 Agu 2025 - 11:01'`` to
    ``'YYYY-MM-DD HH:MM'``.

    The month is matched on its first three letters, so both abbreviations
    ("Agu") and full names ("Agustus") are accepted.

    Returns:
        The reformatted date; ``"-"`` for empty input; the stripped input
        text when it does not have the expected shape or names an unknown
        month (previously an unknown month produced month ``00``).
    """
    if not raw_date:
        return "-"
    # Indonesian three-letter month abbreviations ("Agt" is an alternative
    # spelling of August).
    month_map = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'Mei': '05', 'Jun': '06',
        'Jul': '07', 'Agu': '08', 'Agt': '08', 'Sep': '09', 'Okt': '10', 'Nov': '11',
        'Des': '12'
    }
    try:
        # Drop the weekday: "Selasa, 12 Agu 2025 - 11:01" -> "12 Agu 2025 - 11:01"
        date_part = raw_date.split(', ')[1]
        # -> ['12', 'Agu', '2025', '11:01']
        parts = date_part.replace(' - ', ' ').split()

        day = parts[0].zfill(2)  # pad single-digit days to two digits
        month = month_map[parts[1][:3].capitalize()]
        year = parts[2]
        time_str = parts[3]
    except (IndexError, KeyError):
        return raw_date.strip()
    return f"{year}-{month}-{day} {time_str}"
762
+
763
def looks_like_article_href(href):
    """Return True when ``href`` contains a ``/YYYY/MM/DD/`` path segment,
    e.g. ``/2025/08/12/nama-artikel/``; False otherwise or for empty input."""
    if href:
        return re.search(r'/\d{4}/\d{2}/\d{2}/', href) is not None
    return False
772
+
773
+ HEADERS = {
774
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
775
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
776
+ "Chrome/126.0.0.0 Safari/537.36"
777
+ }
778
+
779
def fetch_url(url, retries=3, delay=3):
    """Download ``url`` and return its body text, retrying on failure.

    An attempt fails when the request raises or the status is not 200.

    Args:
        url: Address to fetch (sent with the module-level ``HEADERS``).
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The response text of the first successful attempt, or None when
        every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                return resp.text
            print(f" WARNING: HTTP {resp.status_code} saat akses {url}")
        except Exception as e:
            print(f" WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
        # Wait only when another attempt follows; sleeping after the final
        # failure merely delayed returning None.
        if attempt < retries:
            time.sleep(delay)
    return None
794
+
795
def scrape_radarcirebon(keyword, max_pages=3, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
    """Scrape radarcirebon.id search results for ``keyword``.

    Args:
        keyword: Search phrase.
        max_pages: Maximum number of search-result pages to request.
        delay_between_items: (min, max) seconds slept after each article.
        delay_between_pages: (min, max) seconds slept after each processed page.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    rows = []
    visited = set()

    for page_no in range(1, max_pages + 1):
        search_url = make_search_url(keyword, page_no)
        print(f"\nScraping halaman {page_no} -> {search_url}")

        listing_html = fetch_url(search_url)
        if not listing_html:
            print(f" ERROR: Gagal mengambil halaman pencarian {page_no}")
            continue

        listing_soup = BeautifulSoup(listing_html, "html.parser")

        # Result titles are links inside the latest-posts title element.
        anchors = listing_soup.select('article .wp-block-latest-posts__post-title a')
        print(f" DEBUG: Ditemukan {len(anchors)} elemen link di halaman {page_no}")

        page_links = set()
        for anchor in anchors:
            raw_href = anchor.get('href')
            if not raw_href or not looks_like_article_href(raw_href):
                continue
            cleaned = normalize_url(raw_href)
            if cleaned:
                page_links.add(cleaned)

        fresh_links = page_links - visited
        print(f" Menemukan {len(page_links)} link artikel di halaman ini, {len(fresh_links)} link baru.")
        visited |= page_links

        for link in sorted(fresh_links):
            article_html = fetch_url(link)
            if not article_html:
                print(f" ERROR: Gagal mengambil artikel {link}")
                continue

            article_soup = BeautifulSoup(article_html, "html.parser")

            # Title
            heading = article_soup.select_one('h1.entry-title')
            title_detail = heading.get_text(strip=True) if heading else "-"

            # Date
            stamp = article_soup.select_one('time.entry-date')
            date_detail = parse_radarcirebon_date(stamp.get_text(strip=True)) if stamp else "-"

            # Body paragraphs, skipping those inside a "read-also" element.
            paragraphs = []
            body = article_soup.select_one('div.entry-content')
            if body:
                for para in body.select('p'):
                    if para.find_parent(class_='read-also'):
                        continue
                    para_text = para.get_text(" ", strip=True)
                    if para_text:
                        paragraphs.append(para_text)
            content = "\n".join(paragraphs)

            # Tags, de-duplicated with order preserved.
            cloud = article_soup.select_one('div.wp-block-tag-cloud')
            tags = []
            if cloud:
                tags = list(dict.fromkeys(a.get_text(strip=True) for a in cloud.select('a')))

            rows.append({
                "judul": title_detail,
                "tanggal": date_detail,
                "tag": ", ".join(tags) if tags else "-",
                "isi_berita": content if content else "-",
                "link": link
            })
            print(f" Berhasil: {title_detail} | Tanggal: {date_detail}")
            time.sleep(random.uniform(*delay_between_items))

        # Pause between pages to avoid hammering the server.
        time.sleep(random.uniform(*delay_between_pages))

    return pd.DataFrame(rows)
876
+
877
if __name__ == "__main__":
    # Script entry point: scrape a fixed keyword and dump the rows to CSV.
    keyword = "kabupaten cirebon"
    # Adjust max_pages to taste; three search pages by default.
    df = scrape_radarcirebon(keyword, max_pages=3)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        # utf-8-sig writes a BOM at the start of the file.
        output_filename = "radarcirebon_berita.csv"
        df.to_csv(output_filename, index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke {output_filename}")
888
+
889
+ # Download html
890
+
891
+ import requests
892
+
893
# Download a single article page and store its raw HTML on disk so the
# selectors used by the scraper can be inspected offline.
url = "https://radarcirebon.id/2025/08/12/warga-resah-dprd-cirebon-panggil-dpkpp-untuk-tuntaskan-masalah-psu-di-dua-perumahan/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}
# A timeout keeps the script from hanging forever on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=15)
# Single source of truth for the file name, so the message below cannot
# drift from the file that is actually written.
output_path = "detail.html"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(resp.text)
print(f"HTML halaman disimpan ke {output_path}")
901
+
902
+ # Detik.com memiliki batas waktu
903
+
904
+ import requests
905
+ from bs4 import BeautifulSoup
906
+ import pandas as pd
907
+ import time
908
+ from datetime import datetime
909
+
910
def scrape_detik_search(keyword, max_years=3, max_pages=100):
    """Scrape detik.com search results for *keyword* into a DataFrame.

    Walks the search result pages (requested with ``sortby=time``), follows
    each hit to its article page, and collects title, publication time, tags,
    body text and link. Scraping stops when an article older than
    ``max_years`` years is seen, when a search page cannot be fetched or has
    no hits, or after ``max_pages`` pages.

    Args:
        keyword: Search phrase sent as the ``query`` parameter.
        max_years: Maximum article age in years; older hits end the scrape.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A pandas DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link`` (empty when nothing was scraped).
    """
    base_search_url = "https://www.detik.com/search/searchall"
    request_timeout = 15  # seconds; prevents hanging on a stalled connection
    results = []

    now = datetime.now()
    try:
        cutoff_date = now.replace(year=now.year - max_years)
    except ValueError:
        # Today is 29 Feb and the target year is not a leap year.
        cutoff_date = now.replace(year=now.year - max_years, day=28)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }

    for page in range(1, max_pages + 1):
        params = {
            'query': keyword,
            'siteid': '2',
            'sortby': 'time',
            'page': page
        }
        print(f"Scraping page {page}...")
        try:
            r = requests.get(base_search_url, params=params, headers=headers,
                             timeout=request_timeout)
        except requests.RequestException as e:
            # Keep what was collected so far instead of losing it to a crash.
            print(f"Gagal akses halaman pencarian ({e}), hentikan scraping.")
            break
        if r.status_code != 200:
            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, 'html.parser')

        news_list = soup.find_all('div', class_='media')

        if not news_list:
            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
            break

        for news in news_list:
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The publication time is read from the span's "d-time"
                # attribute, parsed below as an integer Unix timestamp.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                if date_tag:
                    span_tag = date_tag.find('span')
                    if span_tag and span_tag.has_attr('d-time'):
                        news_date = datetime.fromtimestamp(int(span_tag['d-time']))

                if news_date and news_date < cutoff_date:
                    # Stops the whole scrape on the first too-old hit; this
                    # relies on the results arriving newest first.
                    print(f"Berita sudah melewati batas waktu {max_years} tahun, hentikan scraping.")
                    return pd.DataFrame(results)

                # Fetch the article detail page with the same headers.
                news_resp = requests.get(link, headers=headers, timeout=request_timeout)
                if news_resp.status_code != 200:
                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
                    continue

                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = news_soup.find('div', class_='detail__body-text') or \
                    news_soup.find('div', class_='detail_text')

                if content_div:
                    content_parts = []
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are prefixed with their level, e.g. "H2: ...".
                        if tag.name.startswith('h'):
                            content_parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            content_parts.append(text)
                    content = '\n'.join(content_parts)
                else:
                    content = ''

                tag_list_div = news_soup.find('div', class_='tag__list') or \
                    news_soup.find('div', class_='detail_tag')

                tags = []
                if tag_list_div:
                    tags = [t.text.strip() for t in tag_list_div.find_all('a')]

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })

                print(f"Berhasil scrape berita: {title}")

                time.sleep(1)

            except Exception as e:
                # Best-effort per item: a malformed or unreachable article is
                # reported and skipped so the rest of the page still counts.
                print(f"Error saat memproses berita: {e}")
                continue

        time.sleep(2)

    return pd.DataFrame(results)
1018
+
1019
if __name__ == "__main__":
    # Script entry point: scrape a fixed keyword and dump the rows to CSV.
    keyword = "Kabupaten Cirebon"
    df = scrape_detik_search(keyword)
    if not df.empty:
        # One variable for the path so the message always names the file
        # that was actually written.
        output_filename = "detik_berita_cirebonnn.csv"
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"Selesai menyimpan data berita ke {output_filename}")
    else:
        print("Tidak ada data yang berhasil di-scrape.")
word_cloud.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Word Cloud.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1rwyDXgYaTJQJvXu2FPeggecHOxIYQ3l3
8
+ """
9
+
10
+ !pip install stop-words
11
+ !pip install sastrawi
12
+ !pip install transformers
13
+
14
+ import pandas as pd
15
+ import numpy as np
16
+ import matplotlib.pyplot as plt
17
+
18
+ import html
19
+ import re
20
+ import json
21
+
22
+ from sklearn.feature_extraction.text import TfidfVectorizer
23
+ from sklearn.decomposition import NMF
24
+ from wordcloud import WordCloud
25
+ from tqdm import tqdm
26
+ from IPython.display import display
27
+ from bs4 import BeautifulSoup
28
+ from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
29
+ from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
30
+ from stop_words import get_stop_words
31
+ from collections import Counter
32
+ from transformers import pipeline
33
+
34
+ # ===============================================
35
+ # --- Konfigurasi ---
36
+ # ===============================================
37
+ FILE_PATH = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/medsos (6).csv'
38
+ N_TOPICS = 15
39
+ N_TOP_WORDS = 10 # top kata per topik (juga dipakai untuk wordcloud)
40
+ SAMPLE_DATA_TO_SHOW = 5 # Jumlah sampel data yang ingin ditampilkan per sentimen
41
+
42
+ # ===============================================
43
+ # 1. Stopwords: stop_words + Sastrawi + tambahan
44
+ # ===============================================
45
+ stopwords_indonesia = get_stop_words('indonesian')
46
+ factory = StopWordRemoverFactory()
47
+ sastrawi_stopwords = factory.get_stop_words()
48
+
49
# Hand-curated extra stopwords (slang, filler, domain-generic terms) merged
# with the library stopword lists further down. Duplicates are harmless
# because the merged list is de-duplicated through a set.
additional_stopwords = [
    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk',
    'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi','apk',
    'sllu','apknya','sngt','joos','ni','kak',
    # common words
    'manfaatnya','ya','lbh','digunakan','semangat','dah','sangat','penting',
    'lancar','cepat','senang','makasih','bermanfaat','keren','berguna','baik',
    'indonesia','usaha','memudahkan','pokoknya','puas','mantap','dananya','luar',
    'hati','ber','terimakasih','tepat','memudah','terbaik','mempermudah','praktis',
    'simple','kadang','memuaskan','bagus','semoga','smoga','aplikasi','transaksi',
    'kesimpulan','sip','pelayanannya','orang','manfaat','untuk','proses','membantu',
    'pengiriman','muda','mantaap','kedepannya','pake','aktifitas','sejauh','untung',
    'tenang','bikin','pakek','saldo','keluhan','dimanapun','cukup','menggunakan',
    'sengat','banget','pakai','terpercaya','top','sukses',
    # surfaced while inspecting the word clouds
    'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk','guna',
    'baru','jelas','level','selengkapnya','yuk','mohon','punya','cara','hari',
    'kota','news','baca','fitur','kasih','suruh',
    'besar','sapa','bawa','atas','hidup','jaga','moga','kali','balas','perintah',
    'masyarakat','ide','hadir','ikut','ingat','tali','alhamdulillah','sambut',
    'masa','tuju','terima','ibu','silaturahmi','pasang','bangun','dukung',
    'muhammad','teladan','tahun','insan','bulan','iman','erat','syukur',
    'kabupaten','cirebon','langsung','cinta','kuat','tebar','hubung','ikat',
    'resmi','giat','selenggara','luka','kendara','putih','fyp','reses','mulai',
    'rctvcirebon','radarcirebon','temu','satu','factor','harap','wararctv',
    'maksimal','salah','tiktokberita','kawasan','sangka','juang','merah','puluh',
    'ribu','omo','argo','role','jati','tingkat','kata','emis','majalengka',
    'madam','sebut','tawur','duga',
    # other additions
    'visi','saw','keras','sayang','bentuk','didik','jalin','keluarga','momen',
    'program','baginda','hikmah','panjang','lingkung','wewararctv', 'magelang',
    'kang', 'langkah', 'limpah', 'explore', 'tabindex', 'penuh', 'aa', 'rasa', 'tags',
    'notranslate', 'desa', 'daerah', 'lengkap', 'aa', 'kunjung', 'laku', 'klik', 'berkah',
    'aboutcirebon', 'jl', 'terus', 'hasil', 'instastory', 'taut', 'upaya', 'berita',
    'beri', 'lanjut', 'pemkabcirebon', 'warga', 'pemkabcirebon', 'selamat', 'wujud', 'maju',
    'wakil', 'ungkap', 'turut', 'pihak', 'wilayah', 'dinas', 'promo', 'pemkotcirebon', 'hadap',
    'barat', 'layan', 'siap', 'milik', 'lokasi', 'ujar', 'rupa', 'gratis', 'daftar', 'jawa', 'tengah',
    # The comma after 'wib' matters: without it Python concatenates the
    # adjacent literals into 'wibjanuari' and both stopwords are lost.
    'kolaborasi', 'tempat', 'tegas', 'gelar', 'wib',
    # month names
    'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus', 'september',
    'oktober', 'november', 'desember',
]
91
+
92
+ # ===== Tambahan stopwords untuk kata tidak jelas =====
93
+ noise_stopwords = [
94
+ 'by','zd','xyri','yu','uobl','ypdohk','xt','pz','lziwak','mp',
95
+ 'rp','xdj','xexx','xggy','xjbqb','xstzfhl','link','class','hfl','xat',
96
+ 'qhh','dhg','cr', 'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq', 'psk', 'href',
97
+ 'hl', 'hd' , 'sy', 'amp', 'fbf', 'tags'
98
+ ]
99
+
100
+ CUSTOM_STOPWORDS = [
101
+ # HTML & atribut umum
102
+ "class", "id", "span", "div", "href", "src", "style", "alt",
103
+ "aria", "role", "tabindex", "button", "label", "img", "input",
104
+ "placeholder", "form", "field", "hidden", "value", 'aa',
105
+
106
+ # Token acak/huruf tunggal
107
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
108
+ "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
109
+ "w", "x", "y", "z",
110
+
111
+ # Kata noise berulang dari teks kamu
112
+ "hfl", "xjbqb", "ejq", "ypdohk", "xexx", "hfr", "eyih",
113
+ "dwj", "hkzxv", "yuc", "igjr", "eqks", "oq", "kjzd", "oxk",
114
+ "zsgpy", "dycq", "g", "o", "wa", "wo", "ae", "ov", "vv", "uxc",
115
+
116
+ # Kata teknis netral
117
+ "content", "data", "video", "playlist", "source", "watch",
118
+ "channel", "views", "subscribe", "update", "next", "prev",
119
+ "click", "menu", "link", "button", "card", "section",
120
+
121
+ # Angka & simbol sering muncul
122
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
123
+ ]
124
+
125
+ # Gabungkan semua stopwords
126
+ final_stopwords = list(set(stopwords_indonesia + sastrawi_stopwords + additional_stopwords + noise_stopwords + CUSTOM_STOPWORDS))
127
+
128
+ # ===============================================
129
+ # 2. Pembersihan HTML + Stemming Sastrawi
130
+ # ===============================================
131
+ stemmer = StemmerFactory().create_stemmer()
132
+ html_noise = ['fbf','tabindex','tags','notranslate','aria-label','div','span','class']
133
+ noise_words = set(noise_stopwords + CUSTOM_STOPWORDS + html_noise)
134
+
135
def clean_html(text):
    """Return the visible text of an HTML fragment as one normalized line.

    Missing values (anything ``pd.isna`` reports as NA) yield ``""``.
    ``<script>`` and ``<style>`` elements are dropped, HTML entities are
    unescaped, and every run of whitespace is collapsed to a single space.
    """
    if pd.isna(text):
        return ""

    soup = BeautifulSoup(str(text), "html.parser")
    for node in soup.find_all(["script", "style"]):
        node.decompose()

    flattened = html.unescape(soup.get_text(separator=" "))
    return re.sub(r"\s+", " ", flattened).strip()
145
+
146
def remove_single_letters(text):
    """Delete every stand-alone single word character from *text*.

    Only the character itself is removed; the whitespace around it is kept.
    """
    lone_char = re.compile(r"\b\w\b")
    return lone_char.sub("", text)
148
+
149
def hapus(text):
    """Drop every whitespace-separated token that appears in ``noise_words``.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for word in text.split():
        if word in noise_words:
            continue
        kept.append(word)
    return " ".join(kept)
153
+
154
+
155
def preprocess_text(text):
    """Run the full cleaning pipeline on one raw caption or article.

    Steps, in order: strip HTML, lowercase, stem with the Sastrawi stemmer,
    drop stopwords and HTML noise tokens, drop tokens that contain no ASCII
    letter, then remove stand-alone single characters.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    lowered = clean_html(text).lower()
    stemmed = stemmer.stem(lowered)

    kept = []
    for word in stemmed.split():
        # Stopword / HTML-noise filter.
        if word in final_stopwords or word in html_noise:
            continue
        # Keep only tokens that contain at least one ASCII letter.
        if re.search(r"[a-zA-Z]", word) is None:
            continue
        kept.append(word)

    without_singles = remove_single_letters(" ".join(kept))
    return without_singles.strip()
179
+
180
+ # ===============================================
181
+ # 3. Load & Preprocess Dataset
182
+ # ===============================================
183
+ try:
184
+ df = pd.read_csv(FILE_PATH)
185
+ df.dropna(subset=['caption'], inplace=True)
186
+ df['caption'] = df['caption'].astype(str)
187
+ df['caption_clean'] = df['caption'].apply(preprocess_text)
188
+ df['caption'] = df['caption'].apply(hapus)
189
+
190
+ print("βœ… Dataset berhasil dimuat & dipreproses.")
191
+ print(f"Jumlah data: {len(df)} baris")
192
+ if 'caption_pred' in df.columns:
193
+ print("\nDistribusi Sentimen (caption_pred):")
194
+ print(df['caption_pred'].value_counts())
195
+ except FileNotFoundError:
196
+ print(f"❌ Error: File '{FILE_PATH}' tidak ditemukan.")
197
+ raise SystemExit
198
+
199
+ # ===============================================
200
+ # 4. Fungsi utilitas
201
+ # ===============================================
202
def get_top_words_per_topic(model, feature_names, n_top_words):
    """Map each topic index to its highest-weighted feature names.

    Args:
        model: Object exposing ``components_``, iterated as one weight
            vector per topic (each vector must support ``argsort``).
        feature_names: Indexable collection mapping column index to word.
        n_top_words: How many words to keep per topic.

    Returns:
        ``{topic_idx: [word, ...]}`` with words ordered by descending weight.
    """
    # Walk the ascending argsort backwards and keep the first n entries.
    descending_top = slice(None, -n_top_words - 1, -1)
    return {
        topic_idx: [feature_names[i] for i in weights.argsort()[descending_top]]
        for topic_idx, weights in enumerate(model.components_)
    }
209
+
210
def format_topics_sentences(topics):
    """Turn ``{topic_idx: [words]}`` into ``{topic_idx: "w1, w2, ..."}``."""
    formatted = {}
    for topic_idx in topics:
        formatted[topic_idx] = ", ".join(topics[topic_idx])
    return formatted
212
+
213
def create_circular_wordcloud(words_list, title, n_words=10):
    """Render the first *n_words* words as a circular word cloud.

    Prints a notice and returns without plotting when there is nothing to
    draw. Otherwise shows the figure via matplotlib; returns ``None``.
    """
    selected = words_list[:n_words]
    text_data = " ".join(selected)
    if text_data.strip() == "":
        print(f"Tidak ada kata untuk word cloud '{title}'.")
        return

    # Circular mask on a 400x400 grid: cells outside the radius-190 circle
    # centred at (200, 200) get the value 255, cells inside get 0.
    rows, cols = np.ogrid[:400, :400]
    outside_circle = (rows - 200) ** 2 + (cols - 200) ** 2 > 190 ** 2
    mask = 255 * outside_circle.astype(int)

    cloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        colormap='viridis',
        mask=mask,
        contour_width=3,
        contour_color='steelblue',
    )
    wc = cloud.generate(text_data)

    plt.figure(figsize=(8, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.title(title, fontsize=18, pad=15)
    plt.axis('off')
    plt.show()
229
+
230
def get_top_words_by_doc_frequency(df_subset, n_top_words=10):
    """Rank words by the number of rows of ``caption_clean`` containing them.

    Each word counts at most once per row, missing captions count as empty,
    and tokens that are a single lowercase ASCII letter are ignored.

    Returns:
        A list of ``(word, row_count)`` tuples, most frequent first, with at
        most *n_top_words* entries.
    """
    single_letter = re.compile(r"[a-z]")
    doc_counts = Counter()
    for caption in df_subset['caption_clean'].fillna(""):
        # A set so that repeats inside one caption are counted once.
        doc_counts.update(
            {tok for tok in caption.split() if single_letter.fullmatch(tok) is None}
        )
    return doc_counts.most_common(n_top_words)
237
+
238
+ summarizer = pipeline(
239
+ "summarization",
240
+ model="google/mt5-small",
241
+ tokenizer="google/mt5-small"
242
+ )
243
+
244
def generate_summary(text, max_length=60, min_length=20):
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    Falsy input and text shorter than ten whitespace-separated words are
    returned unchanged. If the summarizer raises, the error is printed and
    the original text is returned instead.
    """
    too_short = not text or len(text.split()) < 10
    if too_short:
        return text

    try:
        outputs = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
        summary = outputs[0]['summary_text']
    except Exception as err:
        # Best-effort: fall back to the unsummarized text.
        print(f"⚠️ Error summarizing: {err}")
        return text
    return summary
253
+
254
def summarize_text(corpus, n_topics=5, n_words=10):
    """Build a keyword "summary" of *corpus* from NMF topics over TF-IDF.

    Fits a TF-IDF vectorizer (at most 5000 features, English stopwords) and
    an NMF model with *n_topics* components, then concatenates the top
    *n_words* words of every topic, topic by topic.

    Returns:
        One space-separated string of keywords.
    """
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    matrix = vectorizer.fit_transform(corpus)

    nmf = NMF(n_components=n_topics, random_state=42)
    nmf.fit(matrix)

    vocabulary = vectorizer.get_feature_names_out()
    # Per topic: walk the ascending argsort backwards, keep n_words entries.
    descending_top = slice(None, -n_words - 1, -1)
    keywords = [
        vocabulary[i]
        for weights in nmf.components_
        for i in weights.argsort()[descending_top]
    ]

    # Flatten into a single compact string.
    return " ".join(keywords)
271
+
272
+ # ===============================================
273
+ # 5. GLOBAL Topic Modeling dan Pembuatan Ringkasan (PARAGRAF)
274
+ # ===============================================
275
+ print("\n--- 🧠 Memprediksi Topik dan Membuat Ringkasan untuk Semua Data ---")
276
+
277
+ # πŸ”Ή Gabungkan caption + comment jadi satu teks
278
+ df['combined_text'] = df['caption_clean'].fillna('') + " " + df['comments_pred'].fillna('')
279
+
280
+ # --- TF-IDF Vectorizer ---
281
+ global_vectorizer = TfidfVectorizer(
282
+ max_df=0.9,
283
+ min_df=10,
284
+ max_features=1000,
285
+ stop_words=final_stopwords,
286
+ ngram_range=(1, 2)
287
+ )
288
+
289
+ global_tfidf = global_vectorizer.fit_transform(df['combined_text'])
290
+ global_feature_names = global_vectorizer.get_feature_names_out()
291
+
292
+ # --- Bagian NMF + Summary ---
293
+ if global_tfidf.shape[1] == 0:
294
+ df['predicted_topic_id'] = -1
295
+ df['predicted_topic'] = "Tidak ada fitur yang cukup untuk modeling"
296
+ df['summary'] = "Tidak dapat membuat ringkasan"
297
+ print("⚠️ Peringatan: Kosakata terlalu sedikit setelah preprocessing. Topic modeling tidak dapat dilakukan.")
298
+ else:
299
+ global_nmf_model = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
300
+ global_nmf_model.fit(global_tfidf)
301
+
302
+ # Distribusi topik per dokumen
303
+ topic_distribution = global_nmf_model.transform(global_tfidf)
304
+ df['predicted_topic_id'] = np.argmax(topic_distribution, axis=1)
305
+
306
+ # Ambil kata-kata penting tiap topik
307
+ def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10):
308
+ top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1]
309
+ return [feature_names[i] for i in top_indices]
310
+
311
+ # Mapping topik β†’ keyword utama
312
+ topic_keywords = {}
313
+ for topic_idx in range(N_TOPICS):
314
+ top_words = get_top_words_for_topic(global_nmf_model, global_feature_names, topic_idx, N_TOP_WORDS)
315
+ topic_keywords[topic_idx] = ", ".join(top_words)
316
+
317
+ df['predicted_topic'] = df['predicted_topic_id'].map(topic_keywords).fillna("Topik tidak teridentifikasi")
318
+
319
+ # 🔹 Generate summaries from the combined text with the google/mt5-small summarization pipeline (not IndoBERT)
320
+ df['summary'] = df['combined_text'].apply(lambda x: generate_summary(x))
321
+
322
+ print("βœ… Prediksi topik selesai, ringkasan memakai IndoBERT Summarization (gabungan caption + comment).")
323
+
324
+ # Menampilkan hasil untuk verifikasi
325
+ print("\n--- ✨ Contoh Hasil Prediksi Topik dan Ringkasan ---")
326
+ display(df[['caption', 'comments_pred', 'predicted_topic', 'summary']].head(10))
327
+
328
+ # ===============================================
329
+ # 6. Analisis per Sentimen + WordCloud + TAMPILKAN BUKTI BERDASARKAN KEYWORD
330
+ # ===============================================
331
+ analysis_result = {} # tempat simpan hasil JSON
332
+
333
+ if 'caption_pred' in df.columns:
334
+ sentiments = ['positif', 'negatif', 'netral']
335
+
336
+ # Pandas tampilkan teks penuh
337
+ pd.set_option('display.max_colwidth', None)
338
+
339
+ for sentiment in sentiments:
340
+ print(f"\n\n=======================================================")
341
+ print(f"πŸ“Š Analisis Mendalam untuk Sentimen: '{sentiment.upper()}'")
342
+ print(f"=======================================================")
343
+
344
+ subset_df = df[df['caption_pred'] == sentiment].copy()
345
+ analysis_result[sentiment] = [] # list kosong untuk simpan hasil tiap sentimen
346
+
347
+ if subset_df.empty:
348
+ print(f"Tidak ada data untuk sentimen '{sentiment}'.")
349
+ continue
350
+
351
+ # 1. Dapatkan kata-kata teratas
352
+ top_words_tuples = get_top_words_by_doc_frequency(subset_df, n_top_words=N_TOP_WORDS)
353
+
354
+ if not top_words_tuples:
355
+ print(f"Tidak ada kata signifikan pada sentimen '{sentiment}' untuk dianalisis.")
356
+ continue
357
+
358
+ # 2. Buat WordCloud
359
+ words_list_for_wc = [word for word, count in top_words_tuples]
360
+ create_circular_wordcloud(words_list_for_wc, f"WordCloud Sentimen {sentiment.upper()}", n_words=N_TOP_WORDS)
361
+
362
+ # 3. Tampilkan bukti ringkasan
363
+ print(f"\n--- πŸ“„ Bukti Ringkasan Berdasarkan Kata Kunci Populer ---")
364
+
365
+ for word, doc_count in top_words_tuples:
366
+ relevant_data = subset_df[
367
+ subset_df['caption_clean'].str.contains(r'\b{}\b'.format(re.escape(word)), case=False, na=False)
368
+ ]
369
+
370
+ summaries_list = []
371
+ if not relevant_data.empty:
372
+ print(f"\nβœ… Kata Kunci: '{word}' (ditemukan dalam {len(relevant_data)} data pada sentimen ini)")
373
+
374
+ for i, row in enumerate(relevant_data.itertuples(index=False), 1):
375
+ caption = getattr(row, "caption_clean", "")
376
+ link = getattr(row, "link", None) or getattr(row, "url", None) or "-"
377
+ comment = getattr(row, "comments_pred", "")
378
+ print(f" {i}. {caption} πŸ”— {link} πŸ’¬ {comment}")
379
+
380
+ summaries_list.append({
381
+ "caption": caption,
382
+ "link": link,
383
+ "comment": comment
384
+ })
385
+
386
+ else:
387
+ print(f"\n❌ Kata Kunci: '{word}' (tidak ditemukan data relevan untuk ditampilkan)")
388
+
389
+ # tetap simpan ke JSON meskipun kosong
390
+ analysis_result[sentiment].append({
391
+ "keyword": word,
392
+ "count": int(len(relevant_data)),
393
+ "summary": summaries_list
394
+ })
395
+
396
+ else:
397
+ print("\nKolom 'caption_pred' tidak ditemukan. Melewati analisis per sentimen.")
398
+
399
+ # ===============================================
400
+ # Simpan hasil JSON
401
+ # ===============================================
402
+ with open("sentiment_analysis_result.json", "w", encoding="utf-8") as f:
403
+ json.dump(analysis_result, f, ensure_ascii=False, indent=4)
404
+
405
+ print("\nπŸ“‚ Hasil analisis juga telah disimpan di 'sentiment_analysis_result.json'")
406
+
407
+ # ===============================================
408
+ # Prediksi Dataset Berita (judul, isi_berita, tag, link)
409
+ # ===============================================
410
+
411
+ FILE_BERITA = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita (6).csv'
412
+
413
+ try:
414
+ df_berita = pd.read_csv(FILE_BERITA)
415
+ df_berita.dropna(subset=['isi_berita'], inplace=True)
416
+ df_berita['isi_berita'] = df_berita['isi_berita'].astype(str)
417
+
418
+ # Preprocessing isi_berita
419
+ df_berita['isi_berita_clean'] = df_berita['isi_berita'].apply(preprocess_text)
420
+
421
+ print("βœ… Dataset berita berhasil dimuat & dipreproses.")
422
+ print(f"Jumlah data: {len(df_berita)} baris")
423
+
424
+ except FileNotFoundError:
425
+ print(f"❌ Error: File '{FILE_BERITA}' tidak ditemukan.")
426
+ raise SystemExit
427
+
428
+ # ===============================================
429
+ # Topic Modeling untuk berita
430
+ # ===============================================
431
+ print("\n--- 🧠 Memprediksi Topik & Ringkasan untuk Dataset Berita ---")
432
+
433
+ # πŸ”Ή Gabungkan isi_berita_clean + judul + tag
434
+ df_berita['combined_text'] = (
435
+ df_berita['isi_berita_clean'].fillna('') + " " +
436
+ df_berita['judul'].fillna('') + " " +
437
+ df_berita['tag'].fillna('')
438
+ )
439
+
440
+ # --- TF-IDF Vectorizer ---
441
+ vectorizer_berita = TfidfVectorizer(
442
+ max_df=0.9,
443
+ min_df=5,
444
+ max_features=1000,
445
+ stop_words=final_stopwords,
446
+ ngram_range=(1, 2)
447
+ )
448
+
449
+ tfidf_berita = vectorizer_berita.fit_transform(df_berita['combined_text'])
450
+ feature_names_berita = vectorizer_berita.get_feature_names_out()
451
+
452
+ if tfidf_berita.shape[1] == 0:
453
+ df_berita['predicted_topic_id'] = -1
454
+ df_berita['predicted_topic'] = "Tidak cukup fitur untuk modeling"
455
+ df_berita['summary'] = "Tidak dapat membuat ringkasan"
456
+ else:
457
+ nmf_berita = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
458
+ nmf_berita.fit(tfidf_berita)
459
+
460
+ topic_dist_berita = nmf_berita.transform(tfidf_berita)
461
+ df_berita['predicted_topic_id'] = np.argmax(topic_dist_berita, axis=1)
462
+
463
+ # Ambil kata topik
464
+ def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10):
465
+ top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1]
466
+ return [feature_names[i] for i in top_indices]
467
+
468
+ topic_keywords_berita = {}
469
+ for topic_idx in range(N_TOPICS):
470
+ top_words = get_top_words_for_topic(nmf_berita, feature_names_berita, topic_idx, N_TOP_WORDS)
471
+ topic_keywords_berita[topic_idx] = ", ".join(top_words)
472
+
473
+ df_berita['predicted_topic'] = df_berita['predicted_topic_id'].map(topic_keywords_berita).fillna("Topik tidak teridentifikasi")
474
+
475
+ # 🔹 Summarization with the google/mt5-small pipeline (not IndoBERT)
476
+ df_berita['summary'] = df_berita['isi_berita'].apply(lambda x: generate_summary(x))
477
+
478
+ print("βœ… Prediksi topik & ringkasan berita selesai.")
479
+
480
+ # ===============================================
481
+ # Simpan hasil JSON
482
+ # ===============================================
483
+ output_data = []
484
+ for row in df_berita.itertuples(index=False):
485
+ output_data.append({
486
+ "judul": getattr(row, "judul", ""),
487
+ "tag": getattr(row, "tag", ""),
488
+ "link": getattr(row, "link", ""),
489
+ "isi_berita": getattr(row, "isi_berita", ""),
490
+ "isi_berita_clean": getattr(row, "isi_berita_clean", ""),
491
+ "predicted_topic": getattr(row, "predicted_topic", ""),
492
+ "summary": getattr(row, "summary", "")
493
+ })
494
+
495
+ with open("berita_analysis_result.json", "w", encoding="utf-8") as f:
496
+ json.dump(output_data, f, ensure_ascii=False, indent=4)
497
+
498
+ print("\nπŸ“‚ Hasil analisis berita disimpan di 'berita_analysis_result.json'")
499
+
500
+ !pip install pyngrok flask
501
+
502
import os

from flask import Flask, jsonify
from pyngrok import ngrok
import json

# SECURITY: the ngrok auth token must never be committed to source control.
# It is read from the NGROK_AUTHTOKEN environment variable instead. Any token
# that was previously hard-coded here should be treated as leaked and revoked
# in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)
else:
    # No token supplied: fall back to whatever ngrok is already configured with.
    print("PERINGATAN: variabel lingkungan NGROK_AUTHTOKEN belum diatur; "
          "memakai konfigurasi ngrok yang sudah ada.")

# Load the sentiment analysis result written earlier in this script.
with open("sentiment_analysis_result.json", "r", encoding="utf-8") as f:
    sentiment_result = json.load(f)

# Load the news analysis result written earlier in this script.
with open("berita_analysis_result.json", "r", encoding="utf-8") as f:
    berita_result = json.load(f)

# Flask application serving both results as read-only JSON.
app = Flask(__name__)


# Endpoint for the sentiment analysis result.
@app.route("/api/sentiment", methods=["GET"])
def api_sentiment():
    """Return the sentiment analysis result as JSON."""
    return jsonify(sentiment_result)


# Endpoint for the news analysis result.
@app.route("/api/berita", methods=["GET"])
def api_berita():
    """Return the news analysis result as JSON."""
    return jsonify(berita_result)


# Open the ngrok tunnel first so the public URL is printed before
# app.run() blocks.
port = 5000
public_url = ngrok.connect(port)
print("πŸ”— Public URL:", public_url)

app.run(port=port)