Spaces:

NzTama
/

Sentiment

Runtime error

App Files Files Community

NzTama committed on Mar 29

Commit

fa8ff66

0 Parent(s):

Initial clean deploy: Sentiment Analysis

Browse files

Files changed (34) hide show

.dockerignore +35 -0
.gitattributes +45 -0
.gitignore +0 -0
Dockerfile +60 -0
Procfile +1 -0
README.md +11 -0
app.py +401 -0
docker-compose.yml +20 -0
fb.py +390 -0
medos_scraping.py +461 -0
preparing.py +236 -0
requirements.txt +0 -0
runtime.txt +1 -0
sentimentanalysis.py +675 -0
services/__init__.py +1 -0
services/__pycache__/__init__.cpython-311.pyc +0 -0
services/__pycache__/facebook.cpython-311.pyc +0 -0
services/__pycache__/medos.cpython-311.pyc +0 -0
services/__pycache__/news.cpython-311.pyc +0 -0
services/__pycache__/preprocessing.cpython-311.pyc +0 -0
services/__pycache__/sentiment.cpython-311.pyc +0 -0
services/__pycache__/tiktok.cpython-311.pyc +0 -0
services/__pycache__/wordcloud_service.cpython-311.pyc +0 -0
services/_driver.py +66 -0
services/facebook.py +304 -0
services/medos.py +331 -0
services/news.py +387 -0
services/preprocessing.py +119 -0
services/sentiment.py +159 -0
services/tiktok.py +320 -0
services/wordcloud_service.py +120 -0
templates/index.html +1009 -0
web_scrapping.py +1026 -0
word_cloud.py +535 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,35 @@

+# Python bytecode caches.
+# .dockerignore patterns are matched from the build-context root, so the
+# leading **/ is required to also cover nested packages such as services/.
+**/__pycache__/
+**/*.py[cod]
+# Virtual environments
+.venv/
+venv/
+env/
+# IDE
+.idea/
+.vscode/
+# Cookies (may contain sensitive data, don't bake into image)
+*.json
+# Always keep the dependency manifest that the Dockerfile copies
+!requirements.txt
+# Output files (generated word clouds and scraped CSV exports)
+static/output/*.png
+static/output/*.csv
+# Notebook files
+*.ipynb
+# Git
+.git/
+.gitignore
+# Model directory — mount as volume instead
+indoBERT-sentiment/
+# Misc
+*.csv
+Procfile
+runtime.txt

.gitattributes ADDED Viewed

	@@ -0,0 +1,45 @@

+# Large binary artifacts are stored through Git LFS.
+# (Merge conflict resolved: the HEAD side was a strict subset of this list.)
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Images
+static/output/*.png filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

Binary file (130 Bytes). View file

Dockerfile ADDED Viewed

	@@ -0,0 +1,60 @@

+# ─── Base Image ─────────────────────────────────────────────────────────────
+# Pinned to a specific Debian release so the apt package names below stay resolvable.
+FROM python:3.11-slim-bookworm
+# Build-time only: keeps apt non-interactive without leaking into the runtime environment
+ARG DEBIAN_FRONTEND=noninteractive
+# Runtime environment: unbuffered logs, no .pyc files written inside the container
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+# ─── System Dependencies ─────────────────────────────────────────────────────
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget curl gnupg ca-certificates unzip \
+    # Chromium + driver (installed from the same repo so their versions match)
+    chromium chromium-driver \
+    # Required libs
+    libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 \
+    libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 \
+    libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 \
+    # libayatana-appindicator3-1 replaces libappindicator3-1, which current Debian releases no longer ship
+    libasound2 libxshmfence1 fonts-liberation libayatana-appindicator3-1 \
+    xdg-utils libvulkan1 libx11-xcb1 \
+    # Fonts
+    fonts-noto fonts-noto-cjk \
+    # Build tools (build-essential already depends on gcc and g++)
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# ─── Set Chromium Path ───────────────────────────────────────────────────────
+ENV CHROME_BIN=/usr/bin/chromium \
+    CHROMEDRIVER_PATH=/usr/bin/chromedriver
+# ─── Hugging Face Spaces Rules (Non-Root User) ───────────────────────────────
+# Hugging Face Spaces requires running Docker as a non-root user (UID 1000)
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Pre-create output directory and ensure permissions
+RUN mkdir -p $HOME/app/static/output && chown -R user:user $HOME
+# Switch to the non-root user
+USER user
+# ─── App Setup ───────────────────────────────────────────────────────────────
+# Copy the manifest first so the dependency layer is cached until it changes
+COPY --chown=user:user requirements.txt .
+# Install dependencies into user directory
+# PyTorch CPU version specified explicitly
+RUN pip install --no-cache-dir --user torch --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir --user -r requirements.txt
+# Copy project files
+COPY --chown=user:user . .
+# ─── Expose Port ─────────────────────────────────────────────────────────────
+# Hugging Face exposes port 7860
+EXPOSE 7860
+# ─── Run App ────────────────────────────────────────────────────────────────
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

Procfile ADDED Viewed

	@@ -0,0 +1 @@


+web: uvicorn app:app --host 0.0.0.0 --port $PORT

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+title: Sentiment
+emoji: 🐨
+colorFrom: indigo
+colorTo: yellow
+sdk: docker
+pinned: false
+license: mit
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,401 @@

+"""
+app.py  –  FastAPI application for Scraping + Sentiment Analysis + WordCloud.
+"""
+from __future__ import annotations
+import base64
+import io
+import csv
+import json
+import os
+import traceback
+from typing import Optional
+import uvicorn
+from fastapi import FastAPI, File, Form, Request, UploadFile
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from services.medos import scrape_medos
+from services.tiktok import scrape_tiktok
+from services.news import scrape_news
+from services.preprocessing import preprocess_text
+from services.sentiment import analyze_sentiment
+from services.wordcloud_service import generate_wordcloud
+from services.facebook import scrape_facebook
+# ── App setup ──────────────────────────────────────────────────────────────────
+# ASGI application object; the Dockerfile and Procfile start it as "app:app".
+app = FastAPI(title="Sentiment Analysis Dashboard")
+# Serve ./static under /static — the pipeline links its CSV exports from /static/output/.
+app.mount("/static", StaticFiles(directory="static"), name="static")
+# Jinja2 templates are loaded from ./templates (the routes below render index.html).
+templates = Jinja2Templates(directory="templates")
+# ── Helpers ────────────────────────────────────────────────────────────────────
+def _split_targets(raw: str | None) -> list[str]:
+    """Split a newline/comma-separated string into a clean list of non-empty strings."""
+    if not raw or not raw.strip():
+        return []
+    parts = []
+    for line in raw.replace(",", "\n").splitlines():
+        s = line.strip()
+        if s:
+            parts.append(s)
+    return parts
+def _is_enabled(flag: str | None) -> bool:
+    """Return True only if the enable flag is explicitly '1'."""
+    return (flag or "").strip() == "1"
+def _flatten_for_csv(raw_texts: list) -> list[dict]:
+    flat = []
+    for item in raw_texts:
+        if isinstance(item, str):
+            flat.append({"text": item})
+        elif isinstance(item, dict):
+            base = {k: v for k, v in item.items() if k != "comments"}
+            comments = item.get("comments", [])
+            if not comments:
+                flat.append(base)
+            else:
+                for c in comments:
+                    row = dict(base)
+                    if isinstance(c, str):
+                        row["comment_text"] = c
+                    elif isinstance(c, dict):
+                        row["comment_author"] = c.get("author", "")
+                        row["comment_text"] = c.get("comment", "")
+                        flat.append(row)
+                        for r in c.get("replies", []):
+                            rep_row = dict(base)
+                            rep_row["comment_author"] = r.get("author", "")
+                            rep_row["comment_text"] = r.get("comment", "")
+                            flat.append(rep_row)
+                        continue
+                    flat.append(row)
+    return flat
+def _extract_texts(raw_texts: list) -> list[str]:
+    extracted = []
+    for item in raw_texts:
+        if isinstance(item, str):
+            extracted.append(item)
+        elif isinstance(item, dict):
+            if "caption_short" in item: extracted.append(item["caption_short"])
+            if "caption_detail" in item: extracted.append(item["caption_detail"])
+            if "caption" in item: extracted.append(item["caption"])
+            if "judul" in item: extracted.append(item["judul"])
+            if "isi_berita" in item: extracted.append(item["isi_berita"])
+            if "tag" in item: extracted.append(item["tag"])
+            for c in item.get("comments", []):
+                if isinstance(c, str):
+                    extracted.append(c)
+                elif isinstance(c, dict):
+                    extracted.append(c.get("comment", ""))
+                    for r in c.get("replies", []):
+                        extracted.append(r.get("comment", ""))
+    return extracted
+def _run_pipeline(raw_texts: list) -> dict:
+    """Shared preprocessing → sentiment → wordcloud pipeline."""
+    if not raw_texts:
+        return {
+            "error": "Tidak ada teks yang berhasil dikumpulkan.",
+            "result": None,
+            "image": None,
+            "total_scraped": 0,
+            "csv_filename": None,
+        }
+    # Save CSV
+    import os
+    import csv
+    from datetime import datetime
+    os.makedirs("static/output", exist_ok=True)
+    csv_fname = f"scraped_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+    csv_path = os.path.join("static", "output", csv_fname)
+    flat_data = _flatten_for_csv(raw_texts)
+    if flat_data:
+        keys = set()
+        for d in flat_data: keys.update(d.keys())
+        with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
+            writer = csv.DictWriter(f, fieldnames=list(keys))
+            writer.writeheader()
+            writer.writerows(flat_data)
+        csv_url = f"/static/output/{csv_fname}"
+    else:
+        csv_url = None
+    # Extract text for ML pipeline
+    text_list = _extract_texts(raw_texts)
+    total_scraped = len(text_list)
+    print(f"[APP] Total item yg di-ekstrak teksnya: {total_scraped}")
+    # Preprocess
+    print("[APP] Preprocessing…")
+    clean_texts = preprocess_text(text_list)
+    clean_texts = [t for t in clean_texts if t and t.strip()]
+    if not clean_texts:
+        return {
+            "error": "Semua teks kosong setelah preprocessing. Coba input yang berbeda.",
+            "result": None,
+            "image": None,
+            "total_scraped": total_scraped,
+            "csv_filename": csv_url,
+        }
+    # Sentiment
+    print(f"[APP] Analyzing sentiment on {len(clean_texts)} texts…")
+    try:
+        sentiment = analyze_sentiment(clean_texts)
+    except Exception as e:
+        print(f"[APP] Sentiment error: {e}\n{traceback.format_exc()}")
+        sentiment = None
+    # WordCloud — generate into memory as base64 (no file saved)
+    print("[APP] Generating wordcloud…")
+    image_b64 = None
+    try:
+        buf = io.BytesIO()
+        wc_ok = generate_wordcloud(clean_texts, buf)
+        if wc_ok:
+            buf.seek(0)
+            image_b64 = base64.b64encode(buf.read()).decode("utf-8")
+    except Exception as e:
+        print(f"[APP] WordCloud error: {e}")
+    return {
+        "error": None,
+        "result": sentiment,
+        "image": image_b64,
+        "total_scraped": total_scraped,
+        "csv_filename": csv_url,
+    }
+# ── Routes ─────────────────────────────────────────────────────────────────────
+@app.get("/", response_class=HTMLResponse)
+async def home(request: Request):
+    return templates.TemplateResponse(request=request, name="index.html")
+@app.post("/process", response_class=HTMLResponse)
+async def process(
+    request: Request,
+    # ── Platform enable flags (set by JS, "1" = enabled) ──────────────────
+    enable_instagram: str = Form(""),
+    enable_tiktok:    str = Form(""),
+    enable_facebook:  str = Form(""),
+    enable_news:      str = Form(""),
+    # ── Instagram (separate credentials) ─────────────────────────────────
+    ig_username:     str = Form(None),
+    ig_password:     str = Form(None),
+    target_accounts: str = Form(None),
+    mode:            str = Form("all"),
+    # ── TikTok ────────────────────────────────────────────────────────────
+    tiktok_cookie:  str = Form(None),
+    tiktok_targets: str = Form(None),
+    # ── Facebook (separate credentials, explicit groups only) ─────────────
+    fb_username:     str = Form(None),
+    fb_password:     str = Form(None),
+    facebook_groups: str = Form(None),
+    # ── News ──────────────────────────────────────────────────────────────
+    news_portals: str = Form(None),   # comma-separated portal keys
+    news_keyword: str = Form("kabupaten cirebon"),
+    news_pages:   int = Form(1),
+):
+    """Scrape every enabled platform, run the shared pipeline, and render index.html.
+    Each platform is processed only when its enable flag is "1" and its required
+    inputs are present; otherwise it is skipped with a log line. Scraper
+    exceptions are caught per target (per call for Facebook) and logged, so one
+    failing source does not abort the others. All collected items are then
+    passed to ``_run_pipeline`` and the outcome is rendered on the "scraping" tab.
+    """
+    # Accumulates whatever the scrapers return, across all platforms
+    raw_texts: list = []
+    # ── 1. Instagram ────────────────────────────────────────────────────────
+    if _is_enabled(enable_instagram):
+        ig_targets = _split_targets(target_accounts)
+        if not ig_username or not ig_password:
+            print("[APP] Instagram diaktifkan tapi username/password kosong — skip.")
+        elif not ig_targets:
+            print("[APP] Instagram diaktifkan tapi tidak ada target — skip.")
+        else:
+            for tgt in ig_targets:
+                print(f"[APP] Scraping Instagram: {tgt}")
+                try:
+                    texts = scrape_medos(ig_username, ig_password, tgt, mode)
+                    raw_texts.extend(texts)
+                    print(f"[APP] Instagram @{tgt} → {len(texts)} teks")
+                except Exception as e:
+                    print(f"[APP] Instagram error ({tgt}): {e}")
+    else:
+        print("[APP] Instagram dinonaktifkan — skip.")
+    # ── 2. TikTok ───────────────────────────────────────────────────────────
+    if _is_enabled(enable_tiktok):
+        tt_targets = _split_targets(tiktok_targets)
+        if not tt_targets:
+            print("[APP] TikTok diaktifkan tapi tidak ada target — skip.")
+        else:
+            for tgt in tt_targets:
+                print(f"[APP] Scraping TikTok: {tgt}")
+                try:
+                    # The cookie is optional; an empty string is passed when none was submitted
+                    texts = scrape_tiktok(tiktok_cookie or "", tgt)
+                    raw_texts.extend(texts)
+                    print(f"[APP] TikTok @{tgt} → {len(texts)} teks")
+                except Exception as e:
+                    print(f"[APP] TikTok error ({tgt}): {e}")
+    else:
+        print("[APP] TikTok dinonaktifkan — skip.")
+    # ── 3. Facebook ─────────────────────────────────────────────────────────
+    # No default groups are used — group URLs and credentials must be given explicitly
+    if _is_enabled(enable_facebook):
+        fb_groups = _split_targets(facebook_groups)
+        if not fb_username or not fb_password:
+            print("[APP] Facebook diaktifkan tapi username/password kosong — skip.")
+        elif not fb_groups:
+            print("[APP] Facebook diaktifkan tapi tidak ada URL grup — skip (tidak ada default).")
+        else:
+            print(f"[APP] Scraping Facebook {len(fb_groups)} grup…")
+            try:
+                # All groups are handed to the scraper in a single call
+                texts = scrape_facebook(fb_username, fb_password, fb_groups)
+                raw_texts.extend(texts)
+                print(f"[APP] Facebook → {len(texts)} teks")
+            except Exception as e:
+                print(f"[APP] Facebook error: {e}")
+    else:
+        print("[APP] Facebook dinonaktifkan — skip.")
+    # ── 4. News ─────────────────────────────────────────────────────────────
+    if _is_enabled(enable_news):
+        portals = _split_targets(news_portals)
+        if not portals:
+            print("[APP] News diaktifkan tapi tidak ada portal dipilih — skip.")
+        else:
+            for portal in portals:
+                print(f"[APP] Scraping news: portal={portal}, keyword={news_keyword}, pages={news_pages}")
+                try:
+                    texts = scrape_news(portal, news_pages, keyword=news_keyword)
+                    raw_texts.extend(texts)
+                    print(f"[APP] News ({portal}) → {len(texts)} teks")
+                except Exception as e:
+                    print(f"[APP] News error ({portal}): {e}")
+    else:
+        print("[APP] News dinonaktifkan — skip.")
+    # ── Pipeline ────────────────────────────────────────────────────────────
+    outcome = _run_pipeline(raw_texts)
+    return templates.TemplateResponse(
+        request=request,
+        name="index.html",
+        context={
+            "error": outcome["error"],
+            "result": outcome["result"],
+            "image": outcome["image"],
+            "total_scraped": outcome["total_scraped"],
+            "csv_filename": outcome["csv_filename"],
+            "active_tab": "scraping",
+        },
+    )
+@app.post("/wordcloud-dataset", response_class=HTMLResponse)
+async def wordcloud_dataset(
+    request: Request,
+    dataset_text:  str        = Form(None),
+    dataset_file:  UploadFile = File(None),
+    text_column:   str        = Form("text"),
+):
+    """
+    Word cloud + sentiment from an uploaded dataset (CSV/TXT/JSON) or pasted text.
+    """
+    raw_texts: list = []
+    # Priority: file upload
+    if dataset_file and dataset_file.filename:
+        fname = dataset_file.filename.lower()
+        content_bytes = await dataset_file.read()
+        try:
+            content_str = content_bytes.decode("utf-8", errors="replace")
+        except Exception:
+            content_str = content_bytes.decode("latin-1", errors="replace")
+        if fname.endswith(".csv") or fname.endswith(".tsv"):
+            delimiter = "\t" if fname.endswith(".tsv") else ","
+            reader = csv.DictReader(io.StringIO(content_str), delimiter=delimiter)
+            cols = reader.fieldnames or []
+            for row in reader:
+                if text_column and text_column in cols and row.get(text_column):
+                    raw_texts.append(str(row[text_column]))
+                else:
+                    raw_texts.append(row)
+        elif fname.endswith(".json"):
+            try:
+                data = json.loads(content_str)
+                if isinstance(data, list):
+                    for item in data:
+                        if isinstance(item, str) and item:
+                            raw_texts.append(item)
+                        elif isinstance(item, dict):
+                            if text_column and text_column in item and item.get(text_column):
+                                raw_texts.append(str(item[text_column]))
+                            else:
+                                raw_texts.append(item)
+            except Exception as e:
+                print(f"[Dataset] JSON parse error: {e}")
+        else:
+            # Plain text ��� each non-empty line is one document
+            for line in content_str.splitlines():
+                line = line.strip()
+                if line:
+                    raw_texts.append(line)
+    elif dataset_text and dataset_text.strip():
+        for line in dataset_text.splitlines():
+            line = line.strip()
+            if line:
+                raw_texts.append(line)
+    if not raw_texts:
+        return templates.TemplateResponse(
+            request=request,
+            name="index.html",
+            context={
+                "error": "Tidak ada teks ditemukan dalam dataset. Pastikan file / teks tidak kosong.",
+                "result": None,
+                "image": None,
+                "total_scraped": 0,
+                "csv_filename": None,
+                "active_tab": "dataset",
+            },
+        )
+    outcome = _run_pipeline(raw_texts)
+    return templates.TemplateResponse(
+        request=request,
+        name="index.html",
+        context={
+            "error": outcome["error"],
+            "result": outcome["result"],
+            "image": outcome["image"],
+            "total_scraped": outcome["total_scraped"],
+            "csv_filename": outcome["csv_filename"],
+            "active_tab": "dataset",
+        },
+    )
+# Direct-run entry point ("python app.py"). Note it listens on port 8000,
+# whereas the Dockerfile CMD starts uvicorn on port 7860.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+# The top-level "version" key is obsolete in the Compose Specification (current
+# Docker Compose ignores it and prints a warning), so it is omitted.
+services:
+  app:
+    build: .
+    container_name: sentiment_app
+    ports:
+      # Map host 8000 to container 7860 (Hugging Face default)
+      - "8000:7860"
+    # Chrome needs a larger /dev/shm to avoid crashes in headless mode
+    shm_size: "2gb"
+    environment:
+      - PYTHONUNBUFFERED=1
+    volumes:
+      # Persist generated output (CSV exports) between runs.
+      # NOTE(review): if ./static/output does not exist, Docker creates it owned by
+      # root and the container's UID 1000 user may be unable to write — create it first.
+      - ./static/output:/home/user/app/static/output
+      # Mount a local model folder if you have one (optional)
+      # Rename or create the folder 'indoBERT-sentiment' in the project root
+      - ./indoBERT-sentiment:/home/user/app/indoBERT-sentiment
+    restart: unless-stopped

fb.py ADDED Viewed

	@@ -0,0 +1,390 @@

+import os
+import time
+import json
+import csv
+from datetime import datetime
+import undetected_chromedriver as uc
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+# ========== KONFIGURASI ==========
+FB_USERNAME = "fatihr252@gmail.com"
+FB_PASSWORD = "Bambank1"
+COOKIES_FILE = "fb_cookies.json"
+# daftar grup yang ingin di-scrape
+GROUP_INPUTS = [
+    "https://web.facebook.com/groups/183039928416039?locale=id_ID",
+    "https://web.facebook.com/groups/teraswarga?locale=id_ID",
+    "https://web.facebook.com/groups/967901979894945?locale=id_ID"
+]
+# lokasi hasil scraping
+OUTPUT_CSV = f"facebook_groups_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+OUTPUT_JSON = f"facebook_groups_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+# ========== SETUP SELENIUM ==========
+options = uc.ChromeOptions()
+options.add_argument("--disable-notifications")
+options.add_argument("--disable-infobars")
+options.add_argument("--start-maximized")
+driver = uc.Chrome(options=options, use_subprocess=True)
+wait = WebDriverWait(driver, 15)
+# ========== FUNGSI LOGIN ==========
+def save_cookies(driver, path):
+    with open(path, "w") as file:
+        json.dump(driver.get_cookies(), file)
+def load_cookies(driver, path):
+    with open(path, "r") as file:
+        cookies = json.load(file)
+        for cookie in cookies:
+            driver.add_cookie(cookie)
+def fb_login(force=False):
+    """
+    force=True akan memaksa login pakai username/password
+    walaupun ada cookies.
+    """
+    driver.get("https://www.facebook.com/")
+    time.sleep(3)
+    if not force and os.path.exists(COOKIES_FILE):
+        try:
+            load_cookies(driver, COOKIES_FILE)
+            driver.refresh()
+            time.sleep(5)
+            if "login" not in driver.current_url:
+                print("✅ Login pakai cookies berhasil")
+                # pastikan search bar muncul sebelum keluar
+                try:
+                    wait.until(EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]')))
+                    print("🔍 Search bar tersedia, siap mencari grup")
+                except:
+                    print("⚠️ Search bar belum muncul, tetap lanjutkan")
+                return
+        except Exception as e:
+            print("⚠️ Cookies gagal dipakai:", e)
+    print("🔑 Login manual pakai username/password...")
+    # --- Login form handling ---
+    try:
+        # versi klasik (id=email, id=pass)
+        email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
+        pass_input = driver.find_element(By.ID, "pass")
+        email_input.clear()
+        email_input.send_keys(FB_USERNAME)
+        pass_input.clear()
+        pass_input.send_keys(FB_PASSWORD)
+        driver.find_element(By.NAME, "login").click()
+    except Exception:
+        try:
+            # versi dinamis (_r_s_, _r_17_)
+            email_input = wait.until(EC.presence_of_element_located((By.XPATH, '//input[@name="email" and @type="text"]')))
+            pass_input = driver.find_element(By.XPATH, '//input[@name="pass" and @type="password"]')
+            email_input.clear()
+            email_input.send_keys(FB_USERNAME)
+            pass_input.clear()
+            pass_input.send_keys(FB_PASSWORD)
+            pass_input.submit()
+        except Exception:
+            try:
+                # versi lain (data-testid)
+                email_input = wait.until(EC.presence_of_element_located((By.XPATH, '//input[@data-testid="royal-email"]')))
+                pass_input = driver.find_element(By.XPATH, '//input[@data-testid="royal-pass"]')
+                email_input.clear()
+                email_input.send_keys(FB_USERNAME)
+                pass_input.clear()
+                pass_input.send_keys(FB_PASSWORD)
+                driver.find_element(By.NAME, "login").click()
+            except Exception as e:
+                raise Exception(f"❌ Tidak menemukan form login yang cocok: {e}")
+    time.sleep(5)
+    if "login" in driver.current_url:
+        raise Exception("❌ Login gagal! Cek username/password")
+    save_cookies(driver, COOKIES_FILE)
+    print("✅ Login sukses & cookies disimpan")
+    # setelah login sukses, pastikan search bar ada
+    try:
+        wait.until(EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]')))
+        print("🔍 Search bar tersedia, siap mencari grup")
+    except:
+        print("⚠️ Search bar belum muncul, coba manual redirect ke beranda")
+        driver.get("https://www.facebook.com/")
+        wait.until(EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]')))
+def ensure_logged_in():
+    """Cek apakah user masih login, kalau muncul halaman login atau popup, login ulang."""
+    try:
+        # --- Kasus URL berubah ke login page ---
+        if driver.current_url and "login" in driver.current_url:
+            print("⚠️ Redirect ke halaman login, mencoba login ulang...")
+            fb_login(force=True)
+            return
+        # --- Kasus popup 'See more on Facebook' muncul ---
+        try:
+            popup = driver.find_element(By.XPATH, '//div[contains(text(),"See more on Facebook")]')
+            if popup.is_displayed():
+                print("⚠️ Popup login terdeteksi, login ulang...")
+                fb_login(force=True)
+                return
+        except:
+            pass
+        # --- Kasus ada input email/password nongol di modal ---
+        try:
+            login_modal = driver.find_element(By.XPATH, '//input[@type="email" or @type="text"]')
+            if login_modal.is_displayed():
+                print("⚠️ Form login modal terdeteksi, login ulang...")
+                fb_login(force=True)
+                return
+        except:
+            pass
+    except Exception as e:
+        print("⚠️ Gagal cek login:", e)
+# ========== SEARCH & BUKA GRUP ==========
+def open_group(group_input):
+    """
+    Open a Facebook group given either its name OR a direct link.
+    Returns the URL that was opened, or None when the group could not be found
+    through search or the search itself failed.
+    """
+    # --- Case: the input is a direct link ---
+    if group_input.startswith("http"):
+        print(f"🔗 Buka langsung link grup: {group_input}")
+        driver.get(group_input)
+        time.sleep(5)
+        ensure_logged_in()
+        return group_input
+    # --- Case: the input is a group name ---
+    try:
+        search_box = wait.until(
+            EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]'))
+        )
+        print(f"🔍 Mencari grup '{group_input}' via search...")
+        search_box.clear()
+        search_box.send_keys(group_input)
+        search_box.submit()
+        time.sleep(5)
+        # look for a result link whose text contains the group name
+        # NOTE(review): the name is interpolated into the XPath unescaped, so a
+        # name containing a double quote would produce an invalid expression.
+        link = None
+        results = driver.find_elements(By.XPATH, f'//a[contains(text(),"{group_input}")]')
+        if results:
+            # only the first match is used
+            link = results[0].get_attribute("href")
+        if link:
+            print(f"✅ Grup ditemukan: {link}")
+            driver.get(link)
+            time.sleep(5)
+            return link
+        else:
+            print(f"❌ Grup '{group_input}' tidak ditemukan via search")
+            return None
+    except Exception as e:
+        print(f"⚠️ Search gagal untuk '{group_input}':", e)
+        return None
+def scroll_to_bottom(driver, max_scrolls=10, pause_time=2):
+    last_height = driver.execute_script("return document.body.scrollHeight")
+    for i in range(max_scrolls):
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(pause_time)
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        if new_height == last_height:
+            break
+        last_height = new_height
+# ========== SCRAPING POSTINGAN GRUP ==========
+def scrape_group(group_url, group_name, max_scrolls=3, max_posts=None):
+    print(f"📥 Scraping grup: {group_name} ({group_url})")
+    driver.get(group_url)
+    time.sleep(4)
+    ensure_logged_in()
+    posts = []
+    last_height = driver.execute_script("return document.body.scrollHeight")
+    for scroll_round in range(max_scrolls):
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(4)
+        ensure_logged_in()
+        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
+        print(f"🔎 Ditemukan {len(post_elements)} postingan pada scroll {scroll_round+1}")
+        for idx, post in enumerate(post_elements):
+            if max_posts and len(posts) >= max_posts:
+                break
+            try:
+                driver.execute_script("arguments[0].scrollIntoView(true);", post)
+                time.sleep(1)
+                # --- article_ctx: konteks utama artikel/post ---
+                article_ctx = None
+                try:
+                    # Biasanya post itu sendiri sudah konteks utama
+                    article_ctx = post
+                except:
+                    article_ctx = None
+                # --- permalink & buka halaman post ---
+                permalink = None
+                post_context = post  # default fallback ke post list
+                try:
+                    # coba ambil link /posts/
+                    link_el = post.find_element(By.XPATH, ".//a[contains(@href,'/posts/')]")
+                    permalink = link_el.get_attribute("href").split("?")[0]
+                except:
+                    try:
+                        # coba ambil link /permalink/
+                        link_el = post.find_element(By.XPATH, ".//a[contains(@href,'/permalink/')]")
+                        permalink = link_el.get_attribute("href").split("?")[0]
+                    except:
+                        try:
+                            # fallback ambil ID dari data-ft
+                            post_id = post.get_attribute("data-ft")
+                            if post_id and "top_level_post_id" in post_id:
+                                import json
+                                d = json.loads(post_id)
+                                pid = d.get("top_level_post_id")
+                                if pid:
+                                    permalink = f"{group_url.rstrip('/').split('?')[0]}/posts/{pid}/"
+                        except:
+                            pass
+                if not permalink:
+                    print("⚠️ Tidak ada permalink & tidak bisa generate. Tetap lanjut simpan data.")
+                    permalink = group_url  # fallback isi dengan URL grup
+                # --- buka halaman permalink ---
+                try:
+                    driver.get(permalink)
+                    time.sleep(3)
+                    ensure_logged_in()
+                    # ambil elemen post baru dari halaman permalink
+                    post_context = driver.find_element(By.XPATH, "//div[@role='article']")
+                except Exception as e:
+                    print(f"⚠️ Gagal buka permalink {permalink}: {e}")
+                    post_context = None  # jangan pakai lagi elemen lama
+                # --- ambil author ---
+                author = "Unknown"
+                try:
+                    if post_context:
+                        try:
+                            author = post_context.find_element(By.XPATH, ".//h2//span//span").text.strip()
+                        except:
+                            try:
+                                author = post_context.find_element(By.XPATH, ".//strong//span").text.strip()
+                            except:
+                                author = post_context.find_element(By.XPATH, ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]").text.strip()
+                except:
+                    pass
+                # --- expand komentar ---
+                while True:
+                    try:
+                        btn = post.find_element(By.XPATH, ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]")
+                        driver.execute_script("arguments[0].click();", btn)
+                        time.sleep(2)
+                    except:
+                        break
+                while True:
+                    try:
+                        btn = post.find_element(By.XPATH, ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]")
+                        driver.execute_script("arguments[0].click();", btn)
+                        time.sleep(2)
+                    except:
+                        break
+                # --- ambil caption & komentar dari post_context ---
+                if post_context:
+                    try:
+                        caption_blocks = post_context.find_elements(By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']")
+                        caption_texts = [cb.text.strip() for cb in caption_blocks if cb.text.strip()]
+                        caption = "\n".join(caption_texts)[:2000] if caption_texts else ""
+                    except:
+                        caption = ""
+                    # ambil komentar
+                    comments = []
+                    try:
+                        comment_blocks = post_context.find_elements(By.XPATH, ".//div[@aria-label='Komentar']//div[@dir='auto']")
+                        seen = set()
+                        for cb in comment_blocks:
+                            text = cb.text.strip()
+                            if text and text not in seen:
+                                seen.add(text)
+                                comments.append(text)
+                    except:
+                        comments = []
+                data = {
+                    "group_name": group_name,
+                    "group_url": group_url,
+                    "post_url": permalink,
+                    "author": author,
+                    "caption": caption,
+                    "comments": comments,
+                }
+                print(f"✅ Post captured: {author} | {caption[:60]}... | {len(comments)} komentar")
+                posts.append(data)
+            except Exception as e:
+                print(f"⚠️ Error baca postingan {idx}: {e}")
+                continue
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        if new_height == last_height:
+            break
+        last_height = new_height
+    return posts
+# ========== MAIN ==========
+all_data = []
+fb_login()
+for g in GROUP_INPUTS:
+    group_url = open_group(g)
+    if group_url:
+        posts = scrape_group(group_url, g)
+        all_data.extend(posts)
+# simpan ke CSV
+with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
+    fieldnames = ["group_name", "group_url", "post_url", "author", "caption", "comments"]
+    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+    writer.writeheader()
+    for row in all_data:
+        writer.writerow(row)
+# simpan ke JSON
+with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+    json.dump(all_data, f, ensure_ascii=False, indent=2)
+print(f"✅ Selesai. Data disimpan ke {OUTPUT_CSV} dan {OUTPUT_JSON}")
+try:
+    driver.quit()
+except:
+    pass

medos_scraping.py ADDED Viewed

	@@ -0,0 +1,461 @@

+import time
+import pandas as pd
+import json
+import os
+from datetime import datetime
+from json import JSONDecodeError
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from selenium.webdriver.common.keys import Keys
+# ==============================================================================
+# KONFIGURASI SELENIUM
+# ==============================================================================
+def setup_driver():
+    """Menyiapkan instance Selenium WebDriver."""
+    options = webdriver.ChromeOptions()
+    # options.add_argument('--headless')
+    options.add_argument('--disable-gpu')
+    options.add_argument('--log-level=3')
+    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36')
+    options.add_experimental_option('excludeSwitches', ['enable-logging'])
+    try:
+        driver = webdriver.Chrome(options=options)
+        return driver
+    except Exception as e:
+        print(f"Error saat memulai WebDriver: {e}")
+        print("Pastikan chromedriver sudah diunduh dan berada di folder yang sama.")
+        return None
+# ==============================================================================
+# FUNGSI COOKIES & CAPTCHA
+# ==============================================================================
+def save_cookies(driver, path):
+    """Menyimpan cookies dari sesi browser ke file JSON."""
+    with open(path, 'w', encoding='utf-8') as f:
+        json.dump(driver.get_cookies(), f, indent=2)
+    print(f"\nCookies berhasil disimpan ke {path}")
+# [PERBAIKAN] Fungsi ini dibuat lebih tangguh terhadap file kosong/rusak
+def load_cookies(driver, path):
+    """Memuat cookies dari file JSON. Mengembalikan True jika berhasil, False jika gagal."""
+    if not os.path.exists(path) or os.path.getsize(path) == 0:
+        print(f"File cookies '{path}' tidak ditemukan atau kosong.")
+        return False
+    try:
+        with open(path, 'r', encoding='utf-8') as f:
+            cookies = json.load(f)
+        if not isinstance(cookies, list):
+            print(f"Format data di '{path}' tidak valid (bukan list).")
+            return False
+        for cookie in cookies:
+            driver.add_cookie(cookie)
+        print(f"Cookies berhasil dimuat dari {path}")
+        return True
+    except JSONDecodeError:
+        print(f"Gagal membaca '{path}' karena file rusak (JSONDecodeError).")
+        return False
+    except Exception as e:
+        print(f"Terjadi error saat memuat cookies dari '{path}': {e}")
+        return False
+def establish_and_verify_session(driver, base_cookies_path, profile_cookies_path, profile_url):
+    """
+    Menangani alur CAPTCHA dengan membangun sesi dasar terlebih dahulu.
+    """
+    # --- TAHAP 1: MEMBANGUN SESI DASAR (HOMEPAGE) ---
+    print("\n--- Tahap 1: Membangun Sesi Dasar di tiktok.com ---")
+    driver.get("https://www.tiktok.com/")
+    # [PERBAIKAN] Cek hasil dari load_cookies
+    if not load_cookies(driver, base_cookies_path):
+        print("\n" + "="*50)
+        print("‼️ TINDAKAN AWAL DIPERLUKAN ‼️")
+        input("File cookies dasar tidak valid/tidak ada. Selesaikan CAPTCHA di tiktok.com, lalu tekan [Enter]...")
+        save_cookies(driver, base_cookies_path)
+    driver.refresh()
+    print("Sesi dasar telah dibuat/dimuat.")
+    # --- TAHAP 2: VERIFIKASI SESI PROFIL ---
+    print(f"\n--- Tahap 2: Verifikasi Sesi di Halaman Profil ---")
+    driver.get(profile_url)
+    # [PERBAIKAN] Cek hasil dari load_cookies
+    if load_cookies(driver, profile_cookies_path):
+        print("Mencoba memvalidasi sesi dengan cookies profil...")
+        driver.refresh()
+        try:
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a'))
+            )
+            print("✅ Sesi profil berhasil dipulihkan.")
+            return True
+        except TimeoutException:
+            print("⚠️ Cookies profil tidak valid. Diperlukan verifikasi manual.")
+    print("\n" + "="*50)
+    print("‼️ VERIFIKASI SEBELUM SCRAPING ‼️")
+    input("Halaman profil telah dimuat. Jika ada CAPTCHA, selesaikan sekarang. Tekan [Enter]...")
+    save_cookies(driver, profile_cookies_path)
+    try:
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a'))
+        )
+        print("✅ Sesi profil berhasil dibuat/diperbarui.")
+        return True
+    except TimeoutException:
+        print("❌ Gagal memverifikasi halaman profil.")
+        return False
+# ==============================================================================
+# FUNGSI-FUNGSI BANTUAN SCRAPING (Tidak Berubah)
+# ==============================================================================
+def get_video_links(driver, max_videos):
+    """
+    Mengambil link video dari halaman profil dengan melakukan scroll
+    hingga batas maksimal tercapai atau halaman paling bawah.
+    """
+    print(f"\n🔎 Mulai mengumpulkan link video (target: {max_videos} video)...")
+    video_links = set()
+    try:
+        # 1. Tunggu hingga elemen video pertama kali muncul
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a'))
+        )
+        print("✅ Halaman profil berhasil dimuat.")
+        # 2. Loop untuk scroll dan kumpulkan link
+        while len(video_links) < max_videos:
+            # Simpan jumlah link sebelum scroll untuk deteksi akhir halaman
+            links_before_scroll = len(video_links)
+            # Kumpulkan semua link yang ada di DOM saat ini
+            video_elements = driver.find_elements(By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a')
+            for elem in video_elements:
+                href = elem.get_attribute('href')
+                if href:
+                    video_links.add(href)
+            # Cek apakah target sudah tercapai setelah pengumpulan
+            if len(video_links) >= max_videos:
+                print(f"🎯 Target {max_videos} video tercapai ({len(video_links)} ditemukan). Berhenti scroll.")
+                break
+            # Lakukan scroll ke paling bawah halaman
+            print(f"📜 Scrolling... Ditemukan {len(video_links)}/{max_videos} video.")
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            # Beri waktu agar konten baru sempat dimuat
+            time.sleep(3)
+            # 3. Deteksi jika sudah tidak ada video baru yang dimuat (paling bawah)
+            #   Untuk menghindari infinite loop, kita cek apakah jumlah link bertambah.
+            if len(video_links) == links_before_scroll:
+                print("🏁 Halaman sudah paling bawah atau tidak ada video baru yang dimuat.")
+                break
+    except TimeoutException:
+        print("❌ Gagal memuat halaman profil atau tidak ada video ditemukan.")
+        return []
+    print(f"\n👍 Selesai mengumpulkan. Total {len(video_links)} link video unik ditemukan.")
+    # Pastikan hasil akhir tidak melebihi max_videos
+    return list(video_links)[:max_videos]
+def check_for_captcha(driver):
+    """
+    [PERBAIKAN V2] Memeriksa CAPTCHA, termasuk di dalam iFrame.
+    """
+    captcha_texts = [
+        "Drag the slider to fit the puzzle",
+        "Drag the puzzle piece into place",
+        "Geser puzzle untuk melengkapi gambar",
+        "Verify to continue"
+    ]
+    # Menggunakan contains(., '...') agar lebih kuat dalam mencari teks
+    xpath_query = "//*[" + " or ".join([f"contains(., '{text}')" for text in captcha_texts]) + "]"
+    # 1. Cek di dalam iFrame terlebih dahulu (penyebab paling umum)
+    try:
+        iframes = driver.find_elements(By.TAG_NAME, 'iframe')
+        if iframes:
+            print(f"\n    Mendeteksi {len(iframes)} iFrame, memeriksa satu per satu untuk CAPTCHA...")
+            for frame in iframes:
+                try:
+                    # Pindah fokus ke dalam iFrame
+                    driver.switch_to.frame(frame)
+                    # Cari elemen CAPTCHA di dalam iFrame
+                    driver.find_element(By.XPATH, xpath_query)
+                    print("\n⚠️  CAPTCHA terdeteksi di dalam sebuah iFrame!")
+                    # PENTING: Kembali ke konteks halaman utama agar sisa skrip tidak error
+                    driver.switch_to.default_content()
+                    return True
+                except NoSuchElementException:
+                    # Jika tidak ditemukan di iFrame ini, kembali dan lanjut ke iFrame berikutnya
+                    driver.switch_to.default_content()
+                    continue
+    except Exception as e:
+        print(f"\n    Error saat memeriksa iFrame: {e}")
+        # Pastikan kembali ke konteks utama jika ada error tak terduga
+        driver.switch_to.default_content()
+    # 2. Jika tidak ada di iFrame, cek di halaman utama (sebagai cadangan)
+    try:
+        driver.find_element(By.XPATH, xpath_query)
+        print("\n⚠️  CAPTCHA terdeteksi di halaman utama!")
+        return True
+    except NoSuchElementException:
+        return False
+def scrape_video_details(driver, video_url):
+    """Scrape metadata, caption and all comments of one video page.
+
+    Opens ``video_url``, reads the upload date and like count, the short
+    caption and (if an expand icon exists) the detailed caption, then scrolls
+    the page and expands reply threads before extracting every comment.
+
+    Args:
+        driver: Selenium WebDriver used for all page interaction.
+        video_url: URL of the video page to open.
+
+    Returns:
+        dict with keys ``url``, ``upload_date``, ``like_count``,
+        ``caption_short``, ``caption_detail`` and ``comments`` (a list of
+        ``{'author', 'comment', 'replies'}`` dicts, each reply being
+        ``{'author', 'comment'}``). The dict is returned with an empty
+        ``comments`` list when the comment container is not found.
+        ``None`` is returned when a TimeoutException reaches the outer handler
+        and no CAPTCHA is detected, or when all retries are used up.
+    """
+    print(f"\n--- Memproses video: {video_url} ---")
+    driver.get(video_url)
+    max_retries = 2
+    for attempt in range(max_retries):
+        try:
+            # "N/A" is kept when the corresponding element does not appear in time.
+            upload_date = "N/A"
+            like_count = "N/A"
+            try:
+                date_element = WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
+                )
+                upload_date = date_element.text
+            except TimeoutException:
+                print("  -> Info tanggal video tidak ditemukan.")
+            try:
+                like_element = WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, 'strong[data-e2e="like-count"]'))
+                )
+                # Stored as the raw displayed text, not converted to a number.
+                like_count = like_element.text
+                print(f"  -> Jumlah 'like' ditemukan: {like_count}")
+            except TimeoutException:
+                print("  -> Info jumlah 'like' tidak ditemukan.")
+            video_data = {'url': video_url, 'upload_date': upload_date, 'like_count': like_count, 'caption_short': '', 'caption_detail': '', 'comments': []}
+            # --- [FIX STARTS HERE] ---
+            try:
+                # 1. Still wait for the main description container
+                desc_container = WebDriverWait(driver, 5).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
+                )
+                # 2. Look for the caption inside its own try...except
+                try:
+                    video_data['caption_short'] = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]').text
+                    print(f"  -> Caption singkat ditemukan: {video_data['caption_short'][:50]}...")
+                    # 3. The 'more' button logic only runs when a caption was found
+                    try:
+                        more_button = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
+                        driver.execute_script("arguments[0].click();", more_button)
+                        print("  -> Tombol 'more' (ikon) pada caption diklik.")
+                        time.sleep(2)
+                        detail_container = WebDriverWait(driver, 5).until(
+                            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
+                        )
+                        desc_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-desc']").text
+                        # Keywords are optional; an empty string is used when absent.
+                        keywords_text = ""
+                        try:
+                            keywords_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-keywords']").text
+                        except NoSuchElementException: pass
+                        video_data['caption_detail'] = f"Deskripsi: {desc_text}\nKeywords: {keywords_text}".strip()
+                        print(f"  -> Caption detail ditemukan: {video_data['caption_detail'][:50]}...")
+                    except (NoSuchElementException, TimeoutException):
+                        print("  -> Tidak ada tombol 'more' untuk caption detail.")
+                except NoSuchElementException:
+                    # No caption element: print a message and carry on
+                    print("  -> Video ini tidak memiliki caption.")
+            except TimeoutException:
+                # Even the description container is missing; only a message is printed and processing continues
+                print("  -> Bagian deskripsi/caption tidak ditemukan, kemungkinan halaman terhalang.")
+            # --- [FIX ENDS HERE] ---
+            # ... (the remaining comment-collection code did not need changes) ...
+            try:
+                # comment_container and body are assigned but not used later in this function.
+                comment_container = WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
+                )
+                print("  -> Bagian komentar ditemukan. Memuat seluruh komentar...")
+                body = driver.find_element(By.TAG_NAME, 'body')
+            except TimeoutException:
+                print("  -> Bagian komentar tidak ditemukan.")
+                return video_data
+            try:
+                print("  -> Memulai proses scroll dan klik balasan secara dinamis...")
+                # In XPath `and` binds tighter than `or`, so this matches any span whose text
+                # contains 'balasan' or 'replies', or both 'View' and 'reply'.
+                # NOTE(review): if an expanded thread keeps a matching span (e.g. a "hide replies"
+                # control), the click loop below would never stall — confirm against the live page.
+                reply_button_xpath = "//span[contains(text(), 'balasan') or (contains(text(), 'View') and contains(text(), 'reply') or contains(text(), 'replies'))]"
+                last_comment_count = 0
+                stalled_attempts = 0
+                max_stalled_attempts = 5
+                # Loop ends after max_stalled_attempts consecutive scrolls that add no comment item.
+                while stalled_attempts < max_stalled_attempts:
+                    try:
+                        view_buttons = driver.find_elements(By.XPATH, reply_button_xpath)
+                        if view_buttons:
+                            print(f"    -> Menemukan {len(view_buttons)} tombol balasan. Mengklik satu...")
+                            # Click only the first button, then re-query on the next pass.
+                            driver.execute_script("arguments[0].click();", view_buttons[0])
+                            time.sleep(2)
+                            stalled_attempts = 0
+                            continue
+                    except Exception as e:
+                        print(f"    -> Error minor saat mengklik tombol balasan: {e}")
+                    print("    -> Tidak ada tombol balasan terlihat. Melakukan scroll...")
+                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(3)
+                    current_comment_count = len(driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]'))
+                    if current_comment_count > last_comment_count:
+                        print(f"    -> Konten baru dimuat. Total item sekarang: {current_comment_count}")
+                        last_comment_count = current_comment_count
+                        stalled_attempts = 0
+                    else:
+                        stalled_attempts += 1
+                        print(f"    -> Konten tidak bertambah, percobaan ke-{stalled_attempts}/{max_stalled_attempts}.")
+                print("  -> Scroll dan klik selesai. Memulai ekstraksi final...")
+                comment_item_count = len(driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]'))
+                print(f"  -> Ditemukan total {comment_item_count} item komentar. Memproses satu per satu...")
+                for i in range(comment_item_count):
+                    try:
+                        # The list is re-queried on every iteration and indexed by position.
+                        all_comment_items = driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]')
+                        item = all_comment_items[i]
+                        # First try to read the item as a top-level comment.
+                        try:
+                            author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
+                            comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]')
+                            new_comment = {
+                                'author': author_element.text,
+                                'comment': comment_element.text,
+                                'replies': []
+                            }
+                            video_data['comments'].append(new_comment)
+                            continue
+                        except NoSuchElementException:
+                            pass
+                        # Otherwise try it as a reply; it is attached to the most recently added comment.
+                        try:
+                            reply_author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
+                            reply_comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]')
+                            if video_data['comments']:
+                                new_reply = {
+                                    'author': reply_author_element.text,
+                                    'comment': reply_comment_element.text
+                                }
+                                video_data['comments'][-1]['replies'].append(new_reply)
+                        except NoSuchElementException:
+                            pass
+                    except IndexError:
+                        print(f"    -> Peringatan: Jumlah komentar berubah saat proses. Melewatkan indeks ke-{i}.")
+                        break
+                    except Exception as e:
+                        print(f"    -> Terjadi error pada item ke-{i}, melewati. Error: {e}")
+                print("  -> Selesai. Berhasil memproses dan mengelompokkan komentar.")
+            except Exception as e:
+                print(f"  -> Gagal pada proses utama karena: {e}")
+            return video_data
+        # NOTE(review): every WebDriverWait inside the try body has its own TimeoutException
+        # handler, so this branch looks reachable only when another call raises it — confirm.
+        except TimeoutException:
+            print("  -> Gagal memuat elemen halaman (Timeout).")
+            if check_for_captcha(driver):
+                print("\n" + "="*50)
+                print(f"⚠️ CAPTCHA terdeteksi pada percobaan ke-{attempt + 1} untuk video: {video_url}")
+                input("   Silakan selesaikan CAPTCHA di browser, lalu tekan [Enter] untuk mencoba lagi...")
+                driver.refresh()
+                print("   Mencoba lagi...")
+                continue
+            else:
+                print("  -> Tidak ada CAPTCHA. Melewati video ini.")
+                return None
+    print(f"  -> Gagal memproses video setelah {max_retries} kali percobaan. Melewati video ini.")
+    return None
+# ==============================================================================
+# EKSEKUSI UTAMA (Tidak Berubah)
+# ==============================================================================
+if __name__ == "__main__":
+    PROFILE_USERNAMES = ["rctvcirebon", "cirebonkabtv", "kang_jigus", "kangimron_", "info.cirebonan"]
+#
+    MAX_VIDEOS_PER_PROFILE = 200
+    BASE_COOKIES_FILE = "tiktok_base_cookies.json"
+    PROFILE_COOKIES_FILE = "tiktok_profile_cookies.json"
+    all_data = []
+    driver = setup_driver()
+    if driver:
+        try:
+            if not PROFILE_USERNAMES:
+                print("Daftar PROFILE_USERNAMES kosong.")
+            else:
+                first_profile_url = f"https://www.tiktok.com/@{PROFILE_USERNAMES[0]}"
+                session_ok = establish_and_verify_session(driver, BASE_COOKIES_FILE, PROFILE_COOKIES_FILE, first_profile_url)
+                if session_ok:
+                    for username in PROFILE_USERNAMES:
+                        print("\n" + "="*70)
+                        print(f"MEMULAI SCRAPING UNTUK PROFIL: @{username}")
+                        print("="*70)
+                        profile_url = f"https://www.tiktok.com/@{username}"
+                        driver.get(profile_url)
+                        # [PERUBAHAN] Panggilan fungsi disederhanakan
+                        video_urls = get_video_links(driver, MAX_VIDEOS_PER_PROFILE)
+                        for url in video_urls:
+                            data = scrape_video_details(driver, url)
+                            if data:
+                                data['profile_username'] = username
+                                data['scrape_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+                                all_data.append(data)
+                            time.sleep(2)
+            # ... sisa kode untuk menyimpan file tidak perlu diubah ...
+            if all_data:
+                print("\nMenyimpan semua data yang terkumpul...")
+                df = pd.DataFrame(all_data)
+                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                output_filename = f"tiktok_data_multi_{timestamp}"
+                df.to_csv(f'{output_filename}.csv', index=False, encoding='utf-8-sig')
+                print(f"Data telah disimpan ke {output_filename}.csv")
+                with open(f'{output_filename}.json', 'w', encoding='utf-8') as f:
+                    json.dump(all_data, f, ensure_ascii=False, indent=4)
+                print(f"Data telah disimpan ke {output_filename}.json")
+            else:
+                print("\nTidak ada data yang berhasil dikumpulkan untuk disimpan.")
+        except Exception as e:
+            print(f"\nTerjadi kesalahan fatal selama proses: {e}")
+        finally:
+            print("\n--- PROSES SELESAI ---")
+            driver.quit()

preparing.py ADDED Viewed

	@@ -0,0 +1,236 @@

+# -*- coding: utf-8 -*-
+"""Preparing.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
+"""
+!pip -q install -U transformers accelerate torch
+!pip install transformers
+!pip install --upgrade transformers
+!pip uninstall -y torch torchvision torchaudio transformers
+!pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
+!pip install transformers accelerate
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+import torch
+import re
+from textblob import TextBlob
+from transformers import AutoConfig, pipeline
+# Menampilkan Dataset
+folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'
+try:
+    # Dapatkan daftar semua file dalam folder
+    files = os.listdir(folder_path)
+    # Loop melalui setiap file
+    for file_name in files:
+        # Periksa apakah file tersebut adalah file CSV
+        if file_name.endswith('.csv'):
+            file_path = os.path.join(folder_path, file_name)
+            print(f"Membaca file: {file_name}")
+            try:
+                # Baca file CSV menggunakan Pandas
+                df = pd.read_csv(file_path)
+                # Tampilkan beberapa baris pertama dari dataset
+                print(df)
+                print("\n") # Beri jarak antar file
+            except Exception as e:
+                print(f"Tidak dapat membaca file {file_name}. Error: {e}\n")
+except FileNotFoundError:
+    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
+except Exception as e:
+    print(f"Terjadi error: {e}")
+# Medsos
+# 1. Memproses data Instagram
+ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
+df_ig = pd.read_csv(ig_path)
+df_ig = df_ig.rename(columns={
+    'source_name': 'profile',
+    'post_url': 'url'
+})
+# 2. Memproses data TikTok
+tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
+df_tiktok = pd.read_csv(tiktok_path)
+df_tiktok = df_tiktok.rename(columns={
+    'like_count': 'likes',
+    'caption_short': 'caption',
+    'profile_username': 'profile',
+    'scrape_date': 'datetime'
+})
+df_tiktok = df_tiktok.drop(columns=['upload_date'])
+# --- Seleksi dan Konversi Tipe Data (Dilakukan SEBELUM Penggabungan) ---
+kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']
+# Proses DataFrame Instagram
+df1_pilihan = df_ig[kolom_yang_dipilih].copy()
+df1_pilihan['datetime'] = pd.to_datetime(df1_pilihan['datetime'], errors='coerce') # Konversi di sini
+df1_pilihan['asal_dataset'] = 'Instagram'
+# Proses DataFrame TikTok
+df2_pilihan = df_tiktok[kolom_yang_dipilih].copy()
+df2_pilihan['datetime'] = pd.to_datetime(df2_pilihan['datetime'], errors='coerce') # Konversi di sini
+df2_pilihan['asal_dataset'] = 'Tiktok'
+# --- Penggabungan ---
+df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)
+# --- Pembersihan Data (Preprocessing) ---
+# Kolom datetime sudah dikonversi, jadi kita lanjutkan dengan yang lain
+df_gabungan['likes'] = pd.to_numeric(df_gabungan['likes'], errors='coerce').fillna(0).astype(int)
+def clean_text(text):
+    if pd.isna(text): return ""
+    text = str(text).lower()
+    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+def format_author(text):
+    formatted = re.sub(r'(?<!^)\bauthor', r', author', str(text))
+    return formatted
+df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
+df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)
+df_gabungan['caption'] = df_gabungan['caption'].str.replace('br', '', regex=False)
+df_gabungan['comments'] = df_gabungan['comments'].str.replace(r'replies', '', regex=True)
+df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)
+# Hapus baris kosong dan duplikat di akhir
+df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
+df_gabungan = df_gabungan.drop_duplicates()
+# --- HASIL AKHIR ---
+print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")
+print(f"Total baris Instagram: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Instagram'])}")
+print(f"Total baris TikTok: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Tiktok'])}")
+df_gabungan.info()
+# Simpan ke file CSV baru
+save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
+df_gabungan.to_csv(save_path, index=False)
+print(f"\nData berhasil disimpan di: {save_path}")
+# Berita
+df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')
+# Apply string operations to the 'tag' column
+df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)
+# Filter the DataFrame based on the 'tag' column
+df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()
+df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()
+df_berita = df_berita_filtered.dropna().drop_duplicates()
+df_berita = df_berita.dropna(subset=['isi_berita', 'tag'])
+df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
+df_berita['tag'] = df_berita['tag'].apply(clean_text)
+df_berita['judul'] = df_berita['judul'].apply(clean_text)
+df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()
+df_berita = df_berita[~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)]
+# Strip site boilerplate from the article body, one pattern per step, in order.
+# Every pattern carries (?i); the column was already lowercased just above.
+# The ".*?" spans are non-greedy and, without DOTALL, do not cross newlines.
+df_berita['isi_berita'] = (
+    df_berita['isi_berita']
+    .str.replace(r'(?i)scroll.*?content', '', regex=True)
+    .str.replace(r'(?i)h3:', '', regex=True)
+    .str.replace(r'(?i)tonton.*?20detik\]', '', regex=True)
+    .str.replace(r'(?i)editor.*?antara', '', regex=True)
+    .str.replace(r'(?i)pewarta.*?antara', '', regex=True)
+    .str.replace(r'(?i)copyright.*?(antara|com)', '', regex=True)
+    .str.replace(r'(?i)dilarang.*?antara', '', regex=True)
+    .str.replace(r'(?i)advertisement', '', regex=True)
+    .str.replace(r'(?i)baca (juga )?[^.]+sini\.?', '', regex=True)
+    .str.replace(r'(?i)\bradar\b.*?-', '', regex=True)
+    .str.replace(r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*', '', regex=True)
+    .str.replace(r'(?i)cek.*?(sumber:|reportase)', '', regex=True)
+)
+# Cleaning can make rows identical, so de-duplicate again and require the key columns.
+df_berita = df_berita.drop_duplicates()
+df_berita = df_berita.dropna(subset=['isi_berita', 'tag', 'tanggal'])
+print(df_berita)
+# Persist the cleaned news data to Google Drive.
+save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
+df_berita.to_csv(save_path, index=False)
+MODEL_ID = "taufiqdp/indonesian-sentiment"  # IndoBERT fine-tuned (3 classes)
+# (optional) if you need a Hugging Face token for a private repo:
+# from huggingface_hub import login
+# login("hf_xxx")  # your token
+# The config is loaded separately so normalize_label() can read id2label from it.
+config = AutoConfig.from_pretrained(MODEL_ID)
+# Text-classification pipeline; uses GPU 0 when CUDA is available, otherwise CPU (-1).
+# Inputs are truncated to 256 tokens.
+# NOTE(review): return_all_scores may be deprecated in newer transformers
+# releases in favour of top_k — confirm against the installed version.
+clf = pipeline(
+    task="text-classification",
+    model=MODEL_ID,
+    tokenizer=MODEL_ID,
+    device=0 if torch.cuda.is_available() else -1,
+    truncation=True,
+    max_length=256,
+    return_all_scores=False,
+)
+def normalize_label(lbl: str) -> str:
+    l = lbl.lower()
+    if l in ("positif","positive"): return "positif"
+    if l in ("negatif","negative"): return "negatif"
+    if l in ("netral","neutral"):   return "netral"
+    # fallback jika format 'LABEL_0/1/2'
+    if "label_" in l:
+        try:
+            idx = int(l.split("_")[-1])
+            return config.id2label[idx].lower()
+        except:
+            return "netral"
+    return l
+# Sentimen untuk CAPTION
+texts_caption = df_gabungan['caption'].fillna("").astype(str).tolist()
+preds_caption = clf(texts_caption, batch_size=64)
+df_gabungan['sentimen_caption'] = [normalize_label(p['label']) for p in preds_caption]
+# Sentimen untuk COMMENTS
+texts_comments = df_gabungan['comments'].fillna("").astype(str).tolist()
+preds_comments = clf(texts_comments, batch_size=64)
+df_gabungan['sentimen_comments'] = [normalize_label(p['label']) for p in preds_comments]
+# (opsional) buat kolom sentimen gabungan
+# kalau caption netral/empty, ambil dari comments
+def combine_sentiment(row):
+    if row['sentimen_caption'] != "netral":
+        return row['sentimen_caption']
+    return row['sentimen_comments']
+df_gabungan['sentimen'] = df_gabungan.apply(combine_sentiment, axis=1)
+df_gabungan.to_csv('medsos2.csv', index=False)
+# Sentimen untuk dataframe berita (berdasarkan kolom isi_berita)
+texts_b = df_berita['isi_berita'].fillna("").astype(str).tolist()
+preds_b = clf(texts_b, batch_size=64)
+df_berita['sentimen'] = [normalize_label(p['label']) for p in preds_b]
+df_berita.to_csv('berita2.csv', index=False)

requirements.txt ADDED Viewed

Binary file (3.95 kB). View file

runtime.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python-3.10

sentimentanalysis.py ADDED Viewed

	@@ -0,0 +1,675 @@

+# -*- coding: utf-8 -*-
+"""SentimentAnalysis
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/fatihramadhan/sentimentanalysis.74f160cb-74cc-4609-ba85-0081c3654a18.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20260326/auto/storage/goog4_request%26X-Goog-Date%3D20260326T141800Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2fe877a762338b5e556a035ce46a5a6bf9c51c0d33c4b062e919cfd44e0297ff787b3a23bf4290b33ca0467d04cf7ba377d77c975cd79da4f1adfec176cb7d78d1eddf1eec10e87d86e656200eaed9b0781f5f5d215ee084957aa5a30c2e9fa1731c23b333d5f742767875bd84e34b83339d834639567639d817ad1295fbc8fd552a5ae92f938b90cb8d916b4a7190e208c6d0effdc10665a9405efffc12a2d4497159428e898204e32ad2d629a58e985c020c7febef459895fd34b052c37a041102284e207ed788a6490c64656ece6150fc355120a49cf2b2fdadda53018d3dba4f8aeda15faaa1eb9c9cef82a476c38be69504e5a5f98cf61686a2b337ea77
+"""
+# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
+# THEN FEEL FREE TO DELETE THIS CELL.
+# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
+# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
+# NOTEBOOK.
+import kagglehub
+fatihramadhan_sentimentdataset_path = kagglehub.dataset_download('fatihramadhan/sentimentdataset')
+print('Data source import complete.')
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import re
+import html
+import torch
+import evaluate
+import os
+import transformers
+import inspect
+import joblib
+from pathlib import Path
+from torch.utils.data import Dataset, DataLoader
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.utils import resample
+# ----------------------------
+# Konfigurasi
+# ----------------------------
+INPUT_PATH = "/kaggle/input/sentimentdataset/dataset_gabungan.csv"
+# Jika kamu pakai model cased (mis. indobenchmark/indobert-base-p2), set ke False
+APPLY_LOWERCASE = True
+# Batasi karakter berulang (contoh: "baguuuusss" -> "baguuss")
+LIMIT_REPEAT_CHARS = True
+MAX_REPEAT = 2
+# Nama kolom (biarkan None agar ditebak otomatis)
+TEXT_COL = None
+LABEL_COL = None
+# Label yang didukung (akan dinormalisasi ke bentuk ini)
+CANON_LABELS = {"positif": "positif", "positive": "positif", "pos": "positif", 'positi': 'positif',
+                "negatif": "negatif", "negative": "negatif", "neg": "negatif", 'negartif': 'negatif',
+                "netral": "netral", "neutral": "netral", "neu": "netral", 'netr' : 'netral'}
+# ----------------------------
+# Utilitas
+# ----------------------------
+def guess_column(df: pd.DataFrame, candidates):
+    for c in candidates:
+        if c in df.columns:
+            return c
+    # fallback: pilih kolom bertipe object terpanjang
+    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
+    return obj_cols[0] if obj_cols else df.columns[0]
+url_pattern = re.compile(r"(https?://\S+|www\.\S+)")
+mention_pattern = re.compile(r"@\w+")
+hashtag_pattern = re.compile(r"#(\w+)")
+multi_space_pattern = re.compile(r"\s+")
+rt_fw_pattern = re.compile(r"\b(rt|fw|fwd)\b[:]?", flags=re.IGNORECASE)
+# Optional: pola khusus yang sering ada di data komentar (hapus segmen "author ... comment")
+author_comment_pattern = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)
+def limit_repeated_chars(text: str, max_repeat: int = 2) -> str:
+    return re.sub(r"(.)\1{%d,}" % (max_repeat), r"\1" * max_repeat, text)
+class TextPreprocessor(BaseEstimator, TransformerMixin):
+    def __init__(self,
+                 apply_lowercase=True,
+                 limit_repeat=True,
+                 max_repeat=2,
+                 canon_labels=None):
+        self.apply_lowercase = apply_lowercase
+        self.limit_repeat = limit_repeat
+        self.max_repeat = max_repeat
+        self.canon_labels = canon_labels or {}
+    def fit(self, X, y=None):
+        return self
+    def transform(self, X, y=None):
+        # pastikan Series + atasi NaN di sini, JANGAN di _clean_text
+        texts = pd.Series(X).fillna("").astype(str)
+        return texts.apply(self._clean_text)
+    def transform_labels(self, y):
+        if y is None:
+            return None
+        labels = pd.Series(y).astype(str)
+        return labels.apply(self._normalize_label)
+    def _normalize_label(self, x):
+        if pd.isna(x):
+            return None
+        s = str(x).strip().lower()
+        return self.canon_labels.get(s, None)
+    def _clean_text(self, t: str) -> str:
+        if not isinstance(t, str):
+            return ""
+        # Hapus pola "author ... comment"
+        t = author_comment_pattern.sub("", t)
+        # Hapus tag HTML / atribut
+        t = remove_html_elements(t)
+        # Unescape HTML entities
+        t = html.unescape(t)
+        # Ganti URL dan mention
+        t = url_pattern.sub(" <url> ", t)
+        t = mention_pattern.sub(" <user> ", t)
+        # Hashtag "#kata" -> "kata"
+        t = hashtag_pattern.sub(lambda m: f"{m.group(1)}", t)
+        # Hapus token RT/FW
+        t = rt_fw_pattern.sub(" ", t)
+        # Hanya simpan huruf, angka, dan spasi
+        t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)
+        # Normalisasi whitespace
+        t = multi_space_pattern.sub(" ", t).strip()
+        # Lowercase jika diinginkan
+        if self.apply_lowercase:
+            t = t.lower()
+        # Batasi huruf berulang
+        if self.limit_repeat:
+            t = limit_repeated_chars(t, self.max_repeat)
+        return t
+def remove_html_elements(text: str) -> str:
+    if not isinstance(text, str):
+        return ""
+    # Unescape HTML entities (&amp; -> &, dll)
+    text = html.unescape(text)
+    # Hapus semua <tag> lengkap
+    text = TAG_RE.sub(" ", text)
+    # Hapus atribut HTML yang nyangkut sebagai plain text
+    text = ATTR_RE.sub(" ", text)
+    # Hapus simbol "<" atau ">" sisa
+    text = re.sub(r"[<>]", " ", text)
+    # Normalkan spasi
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+# regex: hapus <tag> beserta isinya
+TAG_RE = re.compile(r"<[^>]+>")
+# regex: hapus atribut-atribut html yang sering nyangkut
+ATTR_RE = re.compile(r"\b(class|id|style|role|tabindex|href|src|alt)=[^\s>]+", flags=re.IGNORECASE)
+# ----------------------------
+# Load
+# ----------------------------
+path = Path(INPUT_PATH)
+if not path.exists():
+    raise FileNotFoundError(f"File tidak ditemukan: {path.resolve()}")
+df = pd.read_csv(path)
+# ----------------------------
+# Tentukan kolom teks & label
+# ----------------------------
+if TEXT_COL is None:
+    TEXT_COL = guess_column(df, ["text", "tweet", "content", "sentence", "caption", "judul", "deskripsi"])
+if LABEL_COL is None:
+    LABEL_COL = guess_column(df, ["label", "sentiment", "polarity", "target", "kelas"])
+print(f"Kolom teks terdeteksi : {TEXT_COL}")
+print(f"Kolom label terdeteksi: {LABEL_COL}")
+# ----------------------------
+# Load Preproc
+# ----------------------------
+preproc = TextPreprocessor(
+    apply_lowercase=APPLY_LOWERCASE,
+    limit_repeat=LIMIT_REPEAT_CHARS,
+    max_repeat=MAX_REPEAT,
+    canon_labels=CANON_LABELS
+)
+# ----------------------------
+# Penggunaan Preproc
+# ----------------------------
+# fit_transform teks
+# Cleaned text goes to "text" and canonical labels to "sentiment".
+# NOTE: if the detected TEXT_COL / LABEL_COL is itself named "text" / "sentiment",
+# the raw column is overwritten by these assignments.
+df["text"] = preproc.fit_transform(df[TEXT_COL])
+df["sentiment"] = preproc.transform_labels(df[LABEL_COL])
+# ----------------------------
+# Drop rows whose cleaned text is empty
+# ----------------------------
+df = df[df["text"].str.strip().ne("")]
+# ----------------------------
+# Show the labels that could not be recognised
+# ----------------------------
+unknown = df[df["sentiment"].isna()]
+print("\nContoh label tak dikenal yang akan dibuang:")
+print(unknown[[LABEL_COL]].value_counts())  # prints every distinct unknown label (no top-10 limit is applied)
+# Discard rows with an unknown label
+before = len(df)
+df = df[df["sentiment"].notna()]
+dropped_unknown = before - len(df)
+# ----------------------------
+# Remove duplicates (based on the cleaned text)
+# ----------------------------
+df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
+# ----------------------------
+# Ringkasan
+# ----------------------------
+print("\nRingkasan setelah preprocessing:")
+print(f" - Baris total        : {len(df)}")
+print(f" - Dibuang label tak dikenal: {dropped_unknown}")
+print(" - Distribusi label:")
+print(df["sentiment"].value_counts(dropna=False))
+# Contoh pratinjau
+print("\nContoh 5 baris:")
+print(df[[TEXT_COL, "text", LABEL_COL, "sentiment"]].head(5))
+# df.to_csv('/content/drive/MyDrive/Machine Learning/Latih Model/bersihhh.csv')
+# ----------------------------
+# Save Preproc
+# ----------------------------
+joblib.dump(preproc, "preprocessor.joblib")
+# ============================
+# PERBAIKAN LABEL BERDASARKAN KATA KUNCI
+# ============================
+# Definisikan kamus kata kunci untuk tiap label
+NEGATIVE_KEYWORDS = {
+    # Kata kasar / slang
+    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
+    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
+    "kik", "goblog", "kntl",
+    # Kata resmi / formal
+    "buruk", "lemah", "rendah", "gagal", "hancur", "rusak", "cacat",
+    "jahat", "dusta", "bohong", "fitnah", "korup", "curang", "palsu",
+    "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
+    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
+}
+POSITIVE_KEYWORDS = {
+    # Kata umum positif
+    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
+    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
+    "sempurna", "berhasil", "luas", "indah"
+}
+NEUTRAL_KEYWORDS = {
+    # Kata netral / umum
+    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
+}
+def correct_label(row):
+    text = row["text"]
+    label = row["sentiment"]
+    # cek kata negatif
+    if any(word in text for word in NEGATIVE_KEYWORDS):
+        return "negatif"
+    # cek kata positif
+    if any(word in text for word in POSITIVE_KEYWORDS):
+        return "positif"
+    # cek kata netral
+    if any(word in text for word in NEUTRAL_KEYWORDS):
+        return "netral"
+    # kalau tidak ada aturan yang kena, pakai label asli
+    return label
+# Terapkan perbaikan
+df["sentiment"] = df.apply(correct_label, axis=1)
+# Ringkasan distribusi setelah perbaikan
+print("\nDistribusi label setelah perbaikan:")
+print(df["sentiment"].value_counts())
+# Split the data by class
+df_negatif = df[df["sentiment"] == "negatif"]
+df_positif = df[df["sentiment"] == "positif"]
+df_netral  = df[df["sentiment"] == "netral"]
+# Target size per class: the number of "netral" rows
+target_count = df_netral.shape[0]
+# Resample "negatif" and "positif" WITH replacement to exactly target_count rows.
+# NOTE(review): a class that already has more than target_count rows ends up with
+# fewer distinct rows, since n_samples is fixed at target_count.
+# NOTE(review): this runs before the train/val/test split that consumes
+# df_balanced, so copies of one text can land in several splits — confirm
+# whether resampling should be applied to the training split only.
+df_negatif_over = resample(df_negatif,
+                           replace=True,
+                           n_samples=target_count,
+                           random_state=42)
+df_positif_over = resample(df_positif,
+                           replace=True,
+                           n_samples=target_count,
+                           random_state=42)
+# Recombine the three classes
+df_balanced = pd.concat([df_netral, df_negatif_over, df_positif_over])
+print("Distribusi setelah balancing:")
+print(df_balanced["sentiment"].value_counts())
+# ============================
+# VISUALISASI DISTRIBUSI LABEL
+# ============================
+# ambil distribusi label_clean
+label_counts = df_balanced["sentiment"].value_counts()
+# -------- Diagram Batang --------
+plt.figure(figsize=(6,4))
+label_counts.plot(kind="bar", color=["red","green","blue"])
+plt.title("Distribusi Sentimen")
+plt.xlabel("Label")
+plt.ylabel("Jumlah")
+plt.xticks(rotation=0)
+plt.show()
+print('\n')
+# -------- Diagram Lingkaran (Pie) --------
+plt.figure(figsize=(5,5))
+label_counts.plot(kind="pie", autopct='%1.1f%%', startangle=90, colors=["red","green","blue"])
+plt.title("Persentase Sentimen")
+plt.ylabel("")  # hilangkan label Y
+plt.show()
+# ============================
+# SPLIT DATASET (train/val/test)
+# ============================
+# ambil teks & label hasil bersih
+X = df_balanced["text"].values
+y = df_balanced["sentiment"].values
+# 1. Bagi train + temp (80%) dan test (20%)
+X_train, X_temp, y_train, y_temp = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+# 2. Dari temp (20%), bagi lagi jadi val (10%) + test (10%)
+X_val, X_test, y_val, y_test = train_test_split(
+    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
+)
+# Cek ukuran hasil split
+print("Ukuran dataset:")
+print(f"Train: {len(X_train)}")
+print(f"Validation: {len(X_val)}")
+print(f"Test: {len(X_test)}")
+# ============================
+# FINE-TUNING IndoBERT
+# ============================
+# pastikan pakai GPU kalau tersedia
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print("Device:", device)
+os.environ["WANDB_API_KEY"] = "009f08e71506e55bdfd282b691a4abee4ac85ff9"
+os.environ["WANDB_DISABLED"] = "false"
+# ----------------------------
+# 1. Tokenizer & Label Encoding
+# ----------------------------
+MODEL_NAME = "indobenchmark/indobert-base-p1"  # model IndoBERT pre-trained
+tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
+# mapping label ke angka
+label2id = {"negatif": 0, "netral": 1, "positif": 2}
+id2label = {v: k for k, v in label2id.items()}
+def encode_labels(labels):
+    return [label2id[l] for l in labels]
+y_train_enc = encode_labels(y_train)
+y_val_enc   = encode_labels(y_val)
+y_test_enc  = encode_labels(y_test)
+# ----------------------------
+# 2. Dataset class
+# ----------------------------
+class SentimentDataset(Dataset):
+    def __init__(self, texts, labels, tokenizer, max_len=128):
+        self.texts = texts
+        self.labels = labels
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+    def __len__(self):
+        return len(self.texts)
+    def __getitem__(self, idx):
+        text = str(self.texts[idx])
+        label = self.labels[idx]
+        enc = self.tokenizer(
+            text,
+            truncation=True,
+            padding="max_length",
+            max_length=self.max_len,
+            return_tensors="pt"
+        )
+        return {
+            "input_ids": enc["input_ids"].squeeze(),
+            "attention_mask": enc["attention_mask"].squeeze(),
+            "labels": torch.tensor(label, dtype=torch.long)
+        }
+train_dataset = SentimentDataset(X_train, y_train_enc, tokenizer)
+val_dataset   = SentimentDataset(X_val, y_val_enc, tokenizer)
+test_dataset  = SentimentDataset(X_test, y_test_enc, tokenizer)
+# ----------------------------
+# 3. Model
+# ----------------------------
+model = BertForSequenceClassification.from_pretrained(
+    MODEL_NAME,
+    num_labels=3,
+    id2label=id2label,
+    label2id=label2id
+).to(device)
+# ----------------------------
+# 4. Training Arguments
+# ----------------------------
+training_args = TrainingArguments(
+    output_dir="./results",
+    per_device_train_batch_size=32,
+    per_device_eval_batch_size=32,
+    num_train_epochs=5,                   # cukup 10–15, early stopping yang handle
+    learning_rate=2e-5,                    # lebih kecil → stabil
+    weight_decay=0.05,                     # lebih besar → regularisasi
+    warmup_ratio=0.1,                      # 10% step awal dipakai warmup
+    logging_dir="./logs",
+    logging_steps=500,
+    save_total_limit=2,
+    eval_strategy="epoch",         # evaluasi setiap epoch
+    save_strategy="epoch",         # simpan juga setiap epoch
+    load_best_model_at_end=True,
+    metric_for_best_model="f1",
+    greater_is_better=True
+)
+# ----------------------------
+# 5. Metrics
+# ----------------------------
+metric_acc = evaluate.load("accuracy")
+metric_f1  = evaluate.load("f1")
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    preds = np.argmax(logits, axis=-1)
+    acc = metric_acc.compute(predictions=preds, references=labels)
+    f1  = metric_f1.compute(predictions=preds, references=labels, average="weighted")
+    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}
+# ----------------------------
+# 6. Trainer
+# ----------------------------
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    compute_metrics=compute_metrics,
+    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # stop kalau 2 epoch tidak membaik
+)
+# ----------------------------
+# 7. Mulai Training
+# ----------------------------
+trainer.train()
+# =============================
+# 8. Evaluasi & Simpan Prediksi
+# =============================
+# hasil prediksi di test set
+pred_results = trainer.predict(test_dataset)
+# ambil logits → konversi ke label prediksi
+pred_logits = pred_results.predictions
+pred_labels = np.argmax(pred_logits, axis=1)
+# konversi angka ke label teks
+pred_text_labels = [id2label[i] for i in pred_labels]
+true_text_labels = [id2label[i] for i in y_test_enc]
+# gabungkan dengan teks asli
+df_test_results = pd.DataFrame({
+    "text": X_test,
+    "true_label": true_text_labels,
+    "predicted_label": pred_text_labels
+})
+# simpan ke CSV
+df_test_results.to_csv("test_predictions.csv", index=False)
+print("✅ Hasil prediksi test set sudah disimpan ke test_predictions.csv")
+# ============================
+# EVALUASI & SIMPAN MODEL
+# ============================
+# 1. Evaluasi di test set
+print("\nEvaluasi di Test Set:")
+test_result = trainer.evaluate(test_dataset)
+print(test_result)
+# 2. Prediksi label test set (opsional, untuk analisis lebih lanjut)
+predictions = trainer.predict(test_dataset)
+pred_labels = np.argmax(predictions.predictions, axis=-1)
+# contoh lihat 10 prediksi pertama
+for i in range(10):
+    print(f"Teks: {X_test[i]}")
+    print(f"Label Asli: {id2label[y_test_enc[i]]} | Prediksi: {id2label[pred_labels[i]]}")
+    print("---")
+# 3. Simpan model + tokenizer
+SAVE_DIR = "./indoBERT-sentiment"
+trainer.save_model(SAVE_DIR)
+tokenizer.save_pretrained(SAVE_DIR)
+print(f"\nModel & tokenizer sudah disimpan ke: {SAVE_DIR}")
+# ==========================
+# LOAD MODEL & TOKENIZER
+# ==========================
+MODEL_DIR = "./indoBERT-sentiment"
+tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
+model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
+device = 0 if torch.cuda.is_available() else -1
+sentiment_pipeline = pipeline(
+    "text-classification",
+    model=model,
+    tokenizer=tokenizer,
+    device=device
+)
+# load preprocessor yang sudah disimpan
+preproc = joblib.load("preprocessor.joblib")
+# ==========================
+# FUNGSI PREDIKSI
+# ==========================
+def predict_text(text):
+    if not isinstance(text, str) or text.strip() == "":
+        return "EMPTY"
+    result = sentiment_pipeline(text, truncation=True, max_length=512)[0]
+    return result["label"]
+# ==========================
+# PREDIKSI FILE 1 (MEDIA SOSIAL)
+# ==========================
+file1 = pd.read_csv("/kaggle/input/sentimentdataset/gabungan (1).csv")
+# Preprocessing caption
+file1["caption"] = preproc.transform(file1["caption"])
+# Preprocessing comment
+file1["comments"] = preproc.transform(file1["comments"])
+# drop NaN biar aman
+file1 = file1.dropna(subset=["caption", "comments"])
+outputs1 = []
+for idx, row in file1.iterrows():
+    print(f"[File1] Proses baris {idx+1}/{len(file1)}")
+    # caption
+    caption_text = str(row["caption"]).strip()
+    caption_pred = predict_text(caption_text)
+    # comments
+    comments_text = str(row["comments"]).strip()
+    comments_pred_label = predict_text(comments_text)
+    outputs1.append({
+        "link": row.get("link", ""),             # simpan link medsos
+        "caption": caption_text,
+        "caption_pred": caption_pred,
+        "comments_pred": comments_text,          # simpan teks asli komentar
+        "comments_summary": comments_pred_label  # hasil prediksi sentimen komentar
+    })
+df_out1 = pd.DataFrame(outputs1)
+df_out1.to_csv("medsos.csv", index=False, encoding="utf-8-sig")
+print("✅ Hasil prediksi file1 sudah disimpan ke medsos.csv")
+# ==========================
+# PREDIKSI FILE 2 (BERITA)
+# ==========================
+file2 = pd.read_csv("/kaggle/input/sentimentdataset/berita2 (1).csv")
+# Preprocessing judul
+file2["judul"] = preproc.transform(file2["judul"])
+# Preprocessing tag (✅ perbaikan: tidak menimpa judul)
+file2["tag"] = preproc.transform(file2["tag"])
+# Preprocessing isi_berita
+file2["isi_berita"] = preproc.transform(file2["isi_berita"])
+# drop NaN biar aman
+file2 = file2.dropna(subset=["judul", "tag", "isi_berita"])
+outputs2 = []
+for idx, row in file2.iterrows():
+    print(f"[File2] Proses baris {idx+1}/{len(file2)}")
+    combined_text = f"{row['judul']} {row['tag']} {row['isi_berita']}"
+    pred = predict_text(combined_text)
+    outputs2.append({
+        "link": row.get("link", ""),    # simpan link berita
+        "judul": row["judul"],
+        "tag": row["tag"],
+        "isi_berita": row["isi_berita"],
+        "prediction": pred
+    })
+df_out2 = pd.DataFrame(outputs2)
+df_out2.to_csv("berita.csv", index=False, encoding="utf-8-sig")
+print("✅ Hasil prediksi file2 sudah disimpan ke berita.csv")

services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Empty init file to make 'services' a proper Python package

services/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (158 Bytes). View file

services/__pycache__/facebook.cpython-311.pyc ADDED Viewed

Binary file (11.7 kB). View file

services/__pycache__/medos.cpython-311.pyc ADDED Viewed

Binary file (15.3 kB). View file

services/__pycache__/news.cpython-311.pyc ADDED Viewed

Binary file (22.8 kB). View file

services/__pycache__/preprocessing.cpython-311.pyc ADDED Viewed

Binary file (5.7 kB). View file

services/__pycache__/sentiment.cpython-311.pyc ADDED Viewed

Binary file (5.12 kB). View file

services/__pycache__/tiktok.cpython-311.pyc ADDED Viewed

Binary file (11.1 kB). View file

services/__pycache__/wordcloud_service.cpython-311.pyc ADDED Viewed

Binary file (5.59 kB). View file

services/_driver.py ADDED Viewed

	@@ -0,0 +1,66 @@

+"""
+_driver.py  –  Shared Selenium Chrome driver factory.
+All scrapers import _create_driver() from here so that Docker env-vars
+(CHROME_BIN, CHROMEDRIVER_PATH) are respected in one place.
+"""
+from __future__ import annotations
+import os
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+def _create_driver(mobile: bool = False) -> webdriver.Chrome:
+    """
+    Return a headless Chrome/Chromium instance tuned for Docker.
+    Picks up:
+      CHROME_BIN        – path to chromium binary   (default: /usr/bin/chromium)
+      CHROMEDRIVER_PATH – path to chromedriver       (default: /usr/bin/chromedriver)
+    """
+    chrome_bin = os.environ.get("CHROME_BIN", "/usr/bin/chromium")
+    driver_bin = os.environ.get("CHROMEDRIVER_PATH", "/usr/bin/chromedriver")
+    options = webdriver.ChromeOptions()
+    options.binary_location = chrome_bin
+    # ── Headless & sandbox flags ──────────────────────────────────────────────
+    options.add_argument("--headless=new")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")
+    options.add_argument("--disable-gpu")
+    options.add_argument("--disable-software-rasterizer")
+    options.add_argument("--disable-extensions")
+    options.add_argument("--disable-infobars")
+    options.add_argument("--disable-notifications")
+    options.add_argument("--disable-popup-blocking")
+    options.add_argument("--disable-blink-features=AutomationControlled")
+    options.add_argument("--ignore-certificate-errors")
+    options.add_argument("--window-size=1920,1080")
+    options.add_argument("--remote-debugging-port=0")  # avoid port conflicts
+    # ── User-Agent ────────────────────────────────────────────────────────────
+    if mobile:
+        options.add_argument(
+            "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
+            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
+        )
+    else:
+        options.add_argument(
+            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+        )
+    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
+    options.add_experimental_option("useAutomationExtension", False)
+    service = Service(executable_path=driver_bin)
+    driver = webdriver.Chrome(service=service, options=options)
+    # Hide webdriver fingerprint
+    driver.execute_cdp_cmd(
+        "Page.addScriptToEvaluateOnNewDocument",
+        {"source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"},
+    )
+    return driver

services/facebook.py ADDED Viewed

	@@ -0,0 +1,304 @@

+"""
+facebook.py  –  Facebook group scraper using Selenium.
+Exports: scrape_facebook(username, password, groups) -> list[dict]
+Returns structured data per-post:
+  group_name, group_url, post_url, author, caption, comments
+"""
+from __future__ import annotations
+import json
+import os
+import time
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from ._driver import _create_driver
+COOKIES_FILE = "fb_cookies.json"
+FB_BASE = "https://www.facebook.com"
+MOBILE_FB = "https://m.facebook.com"
+# ── Cookie helpers ─────────────────────────────────────────────────────────────
+def _save_cookies(driver, path: str) -> None:
+    try:
+        with open(path, "w") as f:
+            json.dump(driver.get_cookies(), f)
+    except Exception as e:
+        print(f"[Facebook] Gagal simpan cookies: {e}")
+def _load_cookies(driver, path: str) -> bool:
+    if not os.path.exists(path) or os.path.getsize(path) == 0:
+        return False
+    try:
+        with open(path, "r") as f:
+            cookies = json.load(f)
+        for cookie in cookies:
+            try:
+                driver.add_cookie(cookie)
+            except Exception:
+                pass
+        return True
+    except Exception as e:
+        print(f"[Facebook] Gagal load cookies: {e}")
+        return False
+# ── Login ──────────────────────────────────────────────────────────────────────
+def _fb_login(driver, username: str, password: str) -> bool:
+    wait = WebDriverWait(driver, 20)
+    driver.get(MOBILE_FB)
+    time.sleep(3)
+    if os.path.exists(COOKIES_FILE):
+        try:
+            _load_cookies(driver, COOKIES_FILE)
+            driver.refresh()
+            time.sleep(4)
+            if "login" not in driver.current_url and "checkpoint" not in driver.current_url:
+                print("[Facebook] Login via cookies berhasil.")
+                return True
+            driver.delete_all_cookies()
+            driver.get(MOBILE_FB)
+            time.sleep(2)
+        except Exception as e:
+            pass
+    print("[Facebook] Login manual username/password...")
+    try:
+        email_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="email"]')))
+        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')
+        email_input.clear()
+        email_input.send_keys(username)
+        pass_input.clear()
+        pass_input.send_keys(password)
+        pass_input.send_keys("\n")
+        time.sleep(1)
+        try:
+            login_btn = driver.find_element(By.CSS_SELECTOR, 'button[name="login"], [data-sigil="m_login_button"], input[type="submit"]')
+            driver.execute_script("arguments[0].click();", login_btn)
+        except Exception:
+            pass
+    except Exception:
+        try:
+            driver.get(f"{FB_BASE}/login.php")
+            time.sleep(3)
+            email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
+            pass_input = driver.find_element(By.ID, "pass")
+            email_input.clear()
+            email_input.send_keys(username)
+            pass_input.clear()
+            pass_input.send_keys(password)
+            driver.find_element(By.NAME, "login").click()
+        except Exception as e2:
+            return False
+    time.sleep(6)
+    if "login" in driver.current_url or "checkpoint" in driver.current_url:
+        return False
+    _save_cookies(driver, COOKIES_FILE)
+    return True
+def ensure_logged_in(driver, username, password):
+    try:
+        url = driver.current_url
+        if url and "login" in url:
+            _fb_login(driver, username, password)
+            return
+        try:
+            popup = driver.find_element(By.XPATH, '//div[contains(text(),"See more on Facebook")]')
+            if popup.is_displayed():
+                _fb_login(driver, username, password)
+                return
+        except: pass
+        try:
+            login_modal = driver.find_element(By.XPATH, '//input[@type="email" or @type="text"]')
+            if login_modal.is_displayed():
+                _fb_login(driver, username, password)
+                return
+        except: pass
+    except: pass
+# ── Scraping ───────────────────────────────────────────────────────────────────
+def _scrape_group(driver, username, password, group_url: str, max_scrolls: int = 5) -> list:
+    """Scrape posts from a single FB group URL. Returns list of dict strings."""
+    posts: list = []
+    group_url = group_url.replace("m.facebook.com", "www.facebook.com").replace("web.facebook.com", "www.facebook.com")
+    print(f"[Facebook] Scraping grup: {group_url}")
+    try:
+        driver.get(group_url)
+        time.sleep(6)
+        ensure_logged_in(driver, username, password)
+    except Exception as e:
+        print(f"[Facebook] Gagal buka grup: {e}")
+        return posts
+    last_height = driver.execute_script("return document.body.scrollHeight")
+    for scroll_n in range(max_scrolls):
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(4)
+        ensure_logged_in(driver, username, password)
+        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
+        print(f"[Facebook] Scroll {scroll_n + 1} → {len(post_elements)} artikel ditemukan")
+        for idx, post in enumerate(post_elements):
+            try:
+                driver.execute_script("arguments[0].scrollIntoView(true);", post)
+                time.sleep(1)
+                permalink = None
+                post_context = post
+                try:
+                    link_el = post.find_element(By.XPATH, ".//a[contains(@href,'/posts/')]")
+                    permalink = link_el.get_attribute("href").split("?")[0]
+                except:
+                    try:
+                        link_el = post.find_element(By.XPATH, ".//a[contains(@href,'/permalink/')]")
+                        permalink = link_el.get_attribute("href").split("?")[0]
+                    except:
+                        try:
+                            post_id = post.get_attribute("data-ft")
+                            if post_id and "top_level_post_id" in post_id:
+                                d = json.loads(post_id)
+                                pid = d.get("top_level_post_id")
+                                if pid:
+                                    permalink = f"{group_url.rstrip('/').split('?')[0]}/posts/{pid}/"
+                        except:
+                            pass
+                if not permalink:
+                    permalink = group_url
+                try:
+                    driver.execute_script(f"window.open('{permalink}', '_blank');")
+                    time.sleep(1)
+                    driver.switch_to.window(driver.window_handles[-1])
+                    time.sleep(3)
+                    ensure_logged_in(driver, username, password)
+                    post_context = driver.find_element(By.XPATH, "//div[@role='article']")
+                except:
+                    post_context = None
+                author = "Unknown"
+                try:
+                    if post_context:
+                        try:
+                            author = post_context.find_element(By.XPATH, ".//h2//span//span").text.strip()
+                        except:
+                            try:
+                                author = post_context.find_element(By.XPATH, ".//strong//span").text.strip()
+                            except:
+                                author = post_context.find_element(By.XPATH, ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]").text.strip()
+                except: pass
+                # Expand comments if permalink tab is open
+                if post_context:
+                    while True:
+                        try:
+                            btn = post_context.find_element(By.XPATH, ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]")
+                            driver.execute_script("arguments[0].click();", btn)
+                            time.sleep(2)
+                        except: break
+                    while True:
+                        try:
+                            btn = post_context.find_element(By.XPATH, ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]")
+                            driver.execute_script("arguments[0].click();", btn)
+                            time.sleep(2)
+                        except: break
+                caption = ""
+                comments = []
+                if post_context:
+                    try:
+                        blocks = post_context.find_elements(By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']")
+                        caption = "\n".join([b.text.strip() for b in blocks if b.text.strip()])[:2000]
+                    except: pass
+                    try:
+                        comment_blocks = post_context.find_elements(By.XPATH, ".//div[@aria-label='Komentar' or @aria-label='Comment']//div[@dir='auto']")
+                        seen_c = set()
+                        for cb in comment_blocks:
+                            c = cb.text.strip()
+                            if c and c not in seen_c:
+                                seen_c.add(c)
+                                comments.append(c)
+                    except: pass
+                if len(driver.window_handles) > 1:
+                    driver.close()
+                    driver.switch_to.window(driver.window_handles[0])
+                if caption or comments:
+                    posts.append({
+                        "group_name": group_url.split("/")[-1] if not group_url.endswith("/") else group_url.split("/")[-2],
+                        "group_url": group_url,
+                        "post_url": permalink,
+                        "author": author,
+                        "caption": caption,
+                        "comments": comments
+                    })
+            except Exception as e:
+                print(f"[Facebook] Error baca post: {e}")
+                if len(driver.window_handles) > 1:
+                    driver.close()
+                    driver.switch_to.window(driver.window_handles[0])
+                continue
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        if new_height == last_height:
+            break
+        last_height = new_height
+    return posts
+# ── Public API ─────────────────────────────────────────────────────────────────
+def scrape_facebook(username: str, password: str, groups: list | None = None) -> list:
+    if not username or not password:
+        print("[Facebook] Username/password tidak disediakan.")
+        return []
+    if not groups:
+        print("[Facebook] Tidak ada URL grup yang disediakan — skip.")
+        return []
+    driver = _create_driver(mobile=False)
+    all_data: list = []
+    try:
+        if not _fb_login(driver, username, password):
+            return []
+        for group_url in groups:
+            if not group_url or not group_url.strip():
+                continue
+            data = _scrape_group(driver, username, password, group_url.strip())
+            all_data.extend(data)
+    except Exception as e:
+        print(f"[Facebook] Fatal error: {e}")
+    finally:
+        try:
+            driver.quit()
+        except Exception:
+            pass
+    print(f"[Facebook] Total article posts dari Facebook: {len(all_data)}")
+    return all_data

services/medos.py ADDED Viewed

	@@ -0,0 +1,331 @@

+"""
+medos.py  –  Instagram scraper using Selenium.
+Exports: scrape_medos(username, password, target_account, mode) -> list[str]
+Strategy:
+  1. Try saved cookies first (faster, avoids login throttling).
+  2. Fall back to username/password login on the Instagram login page.
+  3. Collect post links from profile / hashtag page.
+  4. Scrape caption + visible comments from each post.
+"""
+from __future__ import annotations
+import json
+import os
+import time
+from datetime import datetime, timedelta
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from ._driver import _create_driver
+IG_BASE = "https://www.instagram.com/"  # root URL with trailing slash; paths are appended to it directly
+# ── Cookie helpers ─────────────────────────────────────────────────────────────
+def _save_cookies(driver, path: str) -> None:
+    try:
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(driver.get_cookies(), f, ensure_ascii=False, indent=2)
+    except Exception as e:
+        print(f"[Medos] Gagal simpan cookies: {e}")
+def _load_cookies(driver, path: str) -> bool:
+    if not os.path.exists(path) or os.path.getsize(path) == 0:
+        return False
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            cookies = json.load(f)
+        driver.get(IG_BASE)
+        time.sleep(2)
+        driver.delete_all_cookies()
+        for c in cookies:
+            allowed = {k: c[k] for k in c.keys() & {"name", "value", "domain", "path", "secure", "httpOnly", "expiry"}}
+            if "expiry" in allowed and isinstance(allowed["expiry"], float):
+                allowed["expiry"] = int(allowed["expiry"])
+            try:
+                driver.add_cookie(allowed)
+            except Exception:
+                allowed.pop("domain", None)
+                try:
+                    driver.add_cookie(allowed)
+                except Exception:
+                    pass
+        return True
+    except Exception as e:
+        print(f"[Medos] Gagal load cookies: {e}")
+        return False
+def _is_logged_in(driver) -> bool:
+    """Check if the session has a valid sessionid cookie on instagram."""
+    return any(c.get("name") == "sessionid" for c in driver.get_cookies())
+# ── Login ──────────────────────────────────────────────────────────────────────
+def _login(driver, username: str, password: str, cookies_file: str) -> bool:
+    # 1. Try saved cookies
+    if _load_cookies(driver, cookies_file):
+        driver.get(IG_BASE)
+        time.sleep(3)
+        if _is_logged_in(driver):
+            print("[Medos] Login via cookies OK.")
+            return True
+        print("[Medos] Cookies kadaluarsa, coba login manual.")
+    # 2. Username/password login
+    login_url = f"{IG_BASE}accounts/login/"
+    driver.get(login_url)
+    print("[Medos] Membuka halaman login Instagram…")
+    try:
+        # Wait for username OR email field
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='username'], input[name='email']"))
+        )
+    except TimeoutException:
+        print("[Medos] Halaman login tidak termuat.")
+        try:
+            with open("/app/static/output/ig_login_error.html", "w", encoding="utf-8") as f:
+                f.write(driver.page_source)
+            driver.save_screenshot("/app/static/output/ig_login_error.png")
+            print("[Medos] Log error HTML dan screenshot disimpan ke /app/static/output/")
+        except Exception as e:
+            print(f"[Medos] Gagal menyimpan log error: {e}")
+        return False
+    try:
+        # Try both username/email and password/pass
+        user_field = None
+        for sel in ["input[name='username']", "input[name='email']"]:
+            try:
+                user_field = driver.find_element(By.CSS_SELECTOR, sel)
+                break
+            except NoSuchElementException:
+                pass
+        pass_field = None
+        for sel in ["input[name='password']", "input[name='pass']"]:
+            try:
+                pass_field = driver.find_element(By.CSS_SELECTOR, sel)
+                break
+            except NoSuchElementException:
+                pass
+        if not user_field or not pass_field:
+            print("[Medos] Field login (username/password) tidak ditemukan.")
+            return False
+        user_field.clear()
+        user_field.send_keys(username)
+        time.sleep(0.8)
+        pass_field.clear()
+        pass_field.send_keys(password)
+        time.sleep(0.5)
+        # Submit form: Press ENTER inside password field
+        pass_field.send_keys("\n")
+        time.sleep(1)
+        # Fallback: Try clicking the submit button if it exists
+        try:
+            submit_btn = driver.find_element(By.CSS_SELECTOR, "button[type='submit'], input[type='submit'], div[role='button']")
+            driver.execute_script("arguments[0].click();", submit_btn)
+        except Exception:
+            pass
+        # Wait for redirect away from login page
+        WebDriverWait(driver, 20).until(
+            lambda d: "/accounts/login/" not in d.current_url and "login" not in d.current_url.lower()
+        )
+        print("[Medos] Login sukses.")
+    except TimeoutException:
+        print("[Medos] Login timeout — cek credentials atau akun ter-throttle.")
+        return False
+    except Exception as e:
+        print(f"[Medos] Login gagal: {e}")
+        return False
+    # 3. Dismiss save-info / notification popups
+    for _ in range(2):
+        try:
+            WebDriverWait(driver, 6).until(
+                EC.element_to_be_clickable((
+                    By.XPATH,
+                    "//button[contains(text(),'Not Now') or "
+                    "contains(text(),'Bukan Sekarang') or "
+                    "contains(text(),'Not now')]"
+                ))
+            ).click()
+            time.sleep(1.5)
+        except Exception:
+            pass
+    _save_cookies(driver, cookies_file)
+    return True
+# ── Scraping helpers ───────────────────────────────────────────────────────────
+def _collect_post_links(driver, target_url: str, max_scrolls: int = 5) -> list:
+    print(f"[Medos] Membuka: {target_url}")
+    driver.get(target_url)
+    time.sleep(6)
+    links: set = set()
+    stall = 0
+    for i in range(max_scrolls):
+        prev_count = len(links)
+        for el in driver.find_elements(By.CSS_SELECTOR, "a[href*='/p/'], a[href*='/reel/']"):
+            href = el.get_attribute("href")
+            if href:
+                links.add(href.split("?")[0])
+        print(f"[Medos] Scroll {i+1}: {len(links)} link ditemukan.")
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(3.5)
+        if len(links) == prev_count:
+            stall += 1
+            if stall >= 3:
+                break
+        else:
+            stall = 0
+    return list(links)
+def _scrape_post(driver, link: str) -> list:
+    """Return list of text strings (caption + comments) from one post."""
+    driver.get(link)
+    time.sleep(4)
+    texts = []
+    # Caption — based on medos_scraping.py
+    caption_selectors = [
+        (By.XPATH, "//div[@data-testid='post-caption']"),
+        (By.XPATH, "//h1"),
+        (By.XPATH, "//span[contains(@class, 'x126k92a')]"),
+        (By.CSS_SELECTOR, "article span[dir='auto']"),
+    ]
+    for by, sel in caption_selectors:
+        try:
+            el = WebDriverWait(driver, 3).until(EC.presence_of_element_located((by, sel)))
+            # Try to get text, if empty, we might need innerHTML but text is cleaner
+            t = el.text.strip()
+            if not t:
+                # If text is empty due to formatting, try extracting via JS
+                t = driver.execute_script("return arguments[0].innerText;", el)
+            if t and len(t) > 3:
+                texts.append(t.strip())
+                break
+        except Exception:
+            continue
+    # Load more comments (Tahap 1 Ekspansi dari medos_scraping.py)
+    for _ in range(5):
+        try:
+            # First try the default svg
+            btn = driver.find_element(
+                By.CSS_SELECTOR,
+                "svg[aria-label='Load more comments'], svg[aria-label='Muat komentar lainnya']"
+            )
+            driver.execute_script("arguments[0].click();", btn)
+            time.sleep(2)
+        except Exception:
+            try:
+                # Fallback to load more text
+                btn2 = driver.find_element(
+                    By.XPATH,
+                    "//div[@role='button']//span[contains(text(),'Load') or contains(text(),'Muat')]"
+                )
+                driver.execute_script("arguments[0].click();", btn2)
+                time.sleep(2)
+            except Exception:
+                break
+    # Collect visible comments (Ekstraksi dari medos_scraping.py)
+    try:
+        # Locators from working script + fallbacks
+        xpaths = [
+            "//div[contains(@class, 'x1cy8zhl')]/span",  # From user's working macro
+            "//ul//li//span[@dir='auto']",
+            "//div[@role='button']//span[@dir='auto']",
+            "//div[contains(@class, 'x1xegmmw')]//span[@dir='auto']"
+        ]
+        seen_texts = set()
+        for t in texts:
+            seen_texts.add(t)
+        for xpath in xpaths:
+            spans = driver.find_elements(By.XPATH, xpath)
+            for span in spans:
+                try:
+                    t = span.text.strip()
+                    if t and len(t) > 3 and t not in seen_texts:
+                        seen_texts.add(t)
+                        texts.append(t)
+                except Exception:
+                    pass
+    except Exception as e:
+        print(f"[Medos] Gagal ambil komentar: {e}")
+    return texts
+# ── Public API ─────────────────────────────────────────────────────────────────
+def scrape_medos(username: str, password: str, target_account: str, mode: str = "all") -> list:
+    """
+    Scrape Instagram profile/hashtag posts and return list of text strings.
+    mode: 'all' | 'date' (last 7 months)
+    """
+    if not username or not password or not target_account:
+        print("[Medos] Parameter tidak lengkap.")
+        return []
+    cookies_file = f"/app/ig_cookies_{username}.json"
+    driver = _create_driver(mobile=False)
+    texts_out: list = []
+    try:
+        if not _login(driver, username, password, cookies_file):
+            print("[Medos] Login gagal, scraping dibatalkan.")
+            return []
+        # Determine target URL
+        account = target_account.strip()
+        if account.startswith("#"):
+            tag = account.lstrip("#")
+            target_url = f"{IG_BASE}explore/tags/{tag}/"
+        else:
+            target_url = f"{IG_BASE}{account.lstrip('@')}/"
+        post_links = _collect_post_links(driver, target_url, max_scrolls=5)
+        print(f"[Medos] {len(post_links)} link postingan ditemukan untuk '{account}'.")
+        for link in post_links[:30]:  # cap 30 posts
+            try:
+                result = _scrape_post(driver, link)
+                texts_out.extend(result)
+                print(f"[Medos] {link} → {len(result)} teks")
+            except Exception as e:
+                print(f"[Medos] Error pada {link}: {e}")
+    except Exception as e:
+        print(f"[Medos] Fatal error: {e}")
+    finally:
+        try:
+            driver.quit()
+        except Exception:
+            pass
+    print(f"[Medos] Total teks dari Instagram: {len(texts_out)}")
+    return texts_out

services/news.py ADDED Viewed

	@@ -0,0 +1,387 @@

+"""
+news.py  –  News scraper dispatcher.
+Exports: scrape_news(portal, pages, keyword) -> list[dict]
+portal: 'detik', 'radar', 'antara', 'cnn', 'radarcirebon'
+"""
+from __future__ import annotations
+import random
+import re
+import time
+from urllib.parse import quote, quote_plus, urlparse, urlunparse
+import requests
+from bs4 import BeautifulSoup
+# ── Shared HTTP session helpers ────────────────────────────────────────────────
+_HEADERS = {  # default request headers; each scraper copies them into its requests.Session
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",  # Indonesian first, English as fallback
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+}
+def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0):
+    for attempt in range(retries):
+        try:
+            r = sess.get(url, timeout=20, allow_redirects=True)
+            r.raise_for_status()
+            return r
+        except Exception as e:
+            if attempt < retries - 1:
+                time.sleep(delay)
+    return None
+def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
+    container = None
+    for cls in container_classes:
+        container = soup.find("div", class_=cls)
+        if container:
+            break
+    scope = container if container else soup
+    texts = []
+    for p in scope.find_all("p"):
+        t = p.get_text(" ", strip=True)
+        if t and len(t) >= min_len and not t.lower().startswith(("baca juga", "lihat juga", "advertisement")):
+            texts.append(t)
+    return texts
+# ── Detik.com ──────────────────────────────────────────────────────────────────
+def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
+    import datetime
+    sess = requests.Session()
+    sess.headers.update(_HEADERS)
+    results = []
+    for page in range(1, max_pages + 1):
+        r = _get(sess, f"https://www.detik.com/search/searchall?query={keyword}&sortby=time&page={page}&siteid=2")
+        if not r: break
+        soup = BeautifulSoup(r.text, "html.parser")
+        news_list = soup.find_all('div', class_='media')
+        if not news_list: break
+        for news in news_list:
+            try:
+                title_tag = news.find('h3', class_='media__title')
+                if not title_tag: continue
+                link_tag = title_tag.find('a', class_='media__link')
+                if not link_tag or not link_tag.has_attr('href'): continue
+                link = link_tag['href']
+                title = link_tag.text.strip()
+                news_date = None
+                date_tag = news.find('div', class_='media__date')
+                if date_tag:
+                    span_tag = date_tag.find('span')
+                    if span_tag and span_tag.has_attr('d-time'):
+                        timestamp = span_tag['d-time']
+                        news_date = datetime.datetime.fromtimestamp(int(timestamp))
+                news_resp = _get(sess, link)
+                if not news_resp: continue
+                news_soup = BeautifulSoup(news_resp.text, 'html.parser')
+                content_div = news_soup.find('div', class_='detail__body-text') or news_soup.find('div', class_='detail_text')
+                content = ""
+                if content_div:
+                    parts = []
+                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+                        text = tag.get_text(strip=True)
+                        if text:
+                            prefix = tag.name.upper() if tag.name.startswith('h') else ''
+                            parts.append(f"{prefix}: {text}" if prefix else text)
+                    content = '\n'.join(parts)
+                nav_div = news_soup.find('div', class_='detail_tag') or news_soup.find('div', class_='tag__list') or news_soup.find('div', class_='nav')
+                tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else []
+                results.append({
+                    'judul': title,
+                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
+                    'tag': ', '.join(tags),
+                    'isi_berita': content,
+                    'link': link
+                })
+            except Exception: pass
+        time.sleep(2)
+    return results
+# ── Radar ──────────────────────────────────────────────────────────────────────
+def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
+    BASE_HOST = "https://radarcirebon.disway.id"
+    sess = requests.Session()
+    sess.headers.update(_HEADERS)
+    results = []
+    def _abs(href):
+        if not href: return None
+        href = href.strip()
+        return href if href.startswith("http") else BASE_HOST + "/" + href.lstrip("/")
+    for page in range(1, max_pages + 1):
+        q = quote_plus(keyword)
+        offset = (page - 1) * 30
+        url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num=" if page > 1 else f"{BASE_HOST}/search/kata/?c={q}&num="
+        r = _get(sess, url)
+        if not r: break
+        soup = BeautifulSoup(r.text, "html.parser")
+        news_list = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media')
+        for item in news_list:
+            try:
+                a = item.find('a', href=True)
+                if not a: continue
+                link = _abs(a.get('href'))
+                title = a.get_text(strip=True)
+                detail_r = _get(sess, link)
+                if not detail_r: continue
+                detail_soup = BeautifulSoup(detail_r.text, "html.parser")
+                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
+                title_detail = h1.get_text(strip=True) if h1 else title
+                date_text = ""
+                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
+                if date_detail_tag: date_text = date_detail_tag.get_text(strip=True)
+                content_container = detail_soup.find('div', class_='entry-content') or detail_soup.find('div', class_='post-content')
+                content = ""
+                if content_container:
+                    content = "\n".join([p.get_text(strip=True) for p in content_container.find_all('p') if 'Baca Juga:' not in p.get_text(strip=True)])
+                tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
+                tags = [a_tag.get('title', '').strip() for a_tag in tag_links if a_tag.get('title')]
+                results.append({
+                    "judul": title_detail,
+                    "tanggal": date_text,
+                    "tag": ", ".join(tags) if tags else "-",
+                    "isi_berita": content,
+                    "link": link
+                })
+            except Exception: pass
+        time.sleep(2)
+    return results
+# ── Antara ─────────────────────────────────────────────────────────────────────
+def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
+    BASE_HOST = "https://www.antaranews.com"
+    sess = requests.Session()
+    sess.headers.update(_HEADERS)
+    results = []
+    def _norm(href):
+        if not href: return None
+        href = href.strip()
+        if href.startswith("/"): href = BASE_HOST + href
+        elif not href.startswith("http"): return None
+        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")
+    for page in range(1, max_pages + 1):
+        q = quote_plus(keyword)
+        url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "")
+        r = _get(sess, url)
+        if not r: break
+        soup = BeautifulSoup(r.text, "html.parser")
+        anchors = soup.select('a[href*="/berita/"]')
+        links = {_norm(a.get('href')) for a in anchors if a.get('href')}
+        for link in links:
+            if not link: continue
+            detail_r = _get(sess, link)
+            if not detail_r: continue
+            detail_soup = BeautifulSoup(detail_r.text, "html.parser")
+            h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
+            title_detail = h1.get_text(strip=True) if h1 else ""
+            date_detail = ""
+            cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
+            if cal_icon and cal_icon.find_parent('li'):
+                date_detail = cal_icon.find_parent('li').get_text(" ", strip=True)
+            content_parts = _extract_paragraphs(detail_soup, ["wrap__article-detail-content", "detail__body-text"])
+            tags = []
+            for a in detail_soup.select('a[href*="/tag/"]'):
+                tag_text = a.get('title') or a.get_text(strip=True)
+                if tag_text: tags.append(tag_text)
+            results.append({
+                "judul": title_detail,
+                "tanggal": date_detail,
+                "tag": ", ".join(list(dict.fromkeys(tags))) if tags else "-",
+                "isi_berita": "\n".join(content_parts),
+                "link": link
+            })
+    return results
+# ── CNN ────────────────────────────────────────────────────────────────────────
+def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
+    from selenium.webdriver.common.by import By
+    from selenium.webdriver.support.ui import WebDriverWait
+    from selenium.webdriver.support import expected_conditions as EC
+    from ._driver import _create_driver
+    BASE_HOST = "https://www.cnnindonesia.com"
+    results = []
+    driver = _create_driver(mobile=False)
+    for page in range(1, max_pages + 1):
+        q = quote(keyword)
+        url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (f"&page={page}" if page > 1 else "")
+        driver.get(url)
+        if page == 1:
+            try:
+                WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))).click()
+            except: pass
+        try:
+            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a")))
+        except: continue
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+        links = {a['href'] for a in soup.select('div.nhl-list article a[href]') if re.search(r'/\d{14}-\d{2,3}-\d{6,}', urlparse(a['href']).path)}
+        sess = requests.Session()
+        sess.headers.update(_HEADERS)
+        for link in links:
+            html = _get(sess, link)
+            if not html: continue
+            ds = BeautifulSoup(html.text, "html.parser")
+            title_el = ds.select_one('h1')
+            title = title_el.get_text(strip=True) if title_el else "-"
+            date_el = ds.select_one('div.text-cnn_grey.text-sm')
+            date_text = date_el.get_text(strip=True) if date_el else "-"
+            tags_list = []
+            tk_header = ds.find('div', class_='title-box', text=re.compile(r'\s*TOPIK TERKAIT\s*'))
+            if tk_header and tk_header.find_next_sibling('div'):
+                tags_list = [t.get_text(strip=True) for t in tk_header.find_next_sibling('div').select('a')]
+            content_container = ds.select_one("div.detail-text")
+            content = "\n".join([p.get_text(" ", strip=True) for p in content_container.find_all('p') if not p.get_text(" ", strip=True).lower().startswith("lihat juga")]) if content_container else "-"
+            results.append({
+                "judul": title,
+                "tanggal": date_text,
+                "tag": ", ".join(tags_list) if tags_list else "-",
+                "isi_berita": content,
+                "link": link
+            })
+    driver.quit()
+    return results
+# ── RadarCirebonID ─────────────────────────────────────────────────────────────
+def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
+    BASE_HOST = "https://radarcirebon.id"
+    sess = requests.Session()
+    sess.headers.update(_HEADERS)
+    results = []
+    for page in range(1, max_pages + 1):
+        q = quote(keyword).replace('%20', '+')
+        url = f"{BASE_HOST}/search/{q}/" + (f"page/{page}/" if page > 1 else "")
+        r = _get(sess, url)
+        if not r: break
+        soup = BeautifulSoup(r.text, "html.parser")
+        links = {a['href'] for a in soup.select('article .wp-block-latest-posts__post-title a') if re.search(r'/\d{4}/\d{2}/\d{2}/', a['href'])}
+        for link in links:
+            detail_r = _get(sess, link)
+            if not detail_r: continue
+            ds = BeautifulSoup(detail_r.text, "html.parser")
+            title_el = ds.select_one('h1.entry-title')
+            date_el = ds.select_one('time.entry-date')
+            c_parts = []
+            cc = ds.select_one('div.entry-content')
+            if cc:
+                for p in cc.select('p'):
+                    if not p.find_parent(class_='read-also'):
+                        t = p.get_text(" ", strip=True)
+                        if t: c_parts.append(t)
+            tc = ds.select_one('div.wp-block-tag-cloud')
+            tags = [a.get_text(strip=True) for a in tc.select('a')] if tc else []
+            results.append({
+                "judul": title_el.get_text(strip=True) if title_el else "-",
+                "tanggal": date_el.get_text(strip=True) if date_el else "-",
+                "tag": ", ".join(list(dict.fromkeys(tags))) if tags else "-",
+                "isi_berita": "\n".join(c_parts) if c_parts else "-",
+                "link": link
+            })
+    return results
+# ── Public API ─────────────────────────────────────────────────────────────────
+_PORTAL_MAP = {  # lowercase portal alias -> scraper function, looked up by scrape_news()
+    "detik":                 _scrape_detik,
+    "detik.com":             _scrape_detik,
+    "radar":                 _scrape_radar,  # short alias; shares _scrape_radar with the disway.id keys below
+    "radardisway":           _scrape_radar,
+    "radarcirebon.disway.id": _scrape_radar,
+    "antara":                _scrape_antara,
+    "antaranews":            _scrape_antara,
+    "antaranews.com":        _scrape_antara,
+    "cnn":                   _scrape_cnn,
+    "cnnindonesia":          _scrape_cnn,
+    "cnnindonesia.com":      _scrape_cnn,
+    "radarcirebon":          _scrape_radarcirebon,  # radarcirebon.id, a different scraper from "radar"
+    "radarcirebon.id":       _scrape_radarcirebon,
+}
+def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
+    if not portal: return []
+    portal_key = portal.strip().lower().rstrip("/")
+    scraper = _PORTAL_MAP.get(portal_key)
+    if scraper is None:
+        for key, fn in _PORTAL_MAP.items():
+            if key in portal_key or portal_key in key:
+                scraper = fn
+                break
+    if scraper is None:
+        try:
+            domain = urlparse(portal).netloc or portal_key
+            for key, fn in _PORTAL_MAP.items():
+                if key in domain:
+                    scraper = fn
+                    break
+        except Exception: pass
+    if scraper is None:
+        print(f"[News] Portal '{portal}' tidak dikenali.")
+        return []
+    print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
+    try:
+        return scraper(keyword, max_pages=pages)
+    except Exception as e:
+        print(f"[News] Error saat scraping: {e}")
+        return []

services/preprocessing.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""
+preprocessing.py  –  Clean & preprocess text for sentiment analysis.
+Only contains utility functions; no Colab/notebook code.
+"""
+import re
+import html as html_lib
+from bs4 import BeautifulSoup
+try:
+    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
+    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
+    _sastrawi_available = True
+except ImportError:
+    _sastrawi_available = False
+try:
+    from stop_words import get_stop_words
+    _stopwords_id = get_stop_words('indonesian')
+except Exception:
+    _stopwords_id = []
+# ── Stopwords ──────────────────────────────────────────────────────────────────
+_sastrawi_stopwords: list = []  # filled below only when Sastrawi imported successfully
+_stemmer = None  # stays None when Sastrawi is unavailable
+if _sastrawi_available:
+    _stemmer = StemmerFactory().create_stemmer()
+    _sastrawi_stopwords = StopWordRemoverFactory().get_stop_words()
+_ADDITIONAL_STOPWORDS = [  # project-specific additions merged into FINAL_STOPWORDS
+    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk',
+    'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi',
+    'ya','lbh','digunakan','semangat','dah','sangat','penting',
+    'lancar','cepat','senang','makasih','bermanfaat','keren','baik',
+    'terimakasih','bagus','semoga','aplikasi','transaksi','banget','pakai',
+    'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk',
+    'baru','jelas','yuk','mohon','punya','cara','hari','kota','berita',
+    # HTML attributes
+    'class','id','span','div','href','src','style','alt','aria','role',
+    'tabindex','button','label','img','input','placeholder','form',
+    'field','hidden','value','by','link','tags',
+]
+_NOISE_STOPWORDS = [  # NOTE(review): looks like residue from scraped markup — confirm origin
+    'xd','xyri','yu','uobl','ypdohk','xt','pz','lziwak',
+    'rp','xdj','xggy','xjbqb','xstzfhl','hfl','xat',
+    'qhh','dhg','cr','tdsg','ct','etr','nq','oe','ejq','psk',
+    'hl','hd','sy','amp','fbf',
+]
+_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
+FINAL_STOPWORDS: set = set(  # union of all four stop-word lists
+    _stopwords_id + _sastrawi_stopwords + _ADDITIONAL_STOPWORDS + _NOISE_STOPWORDS
+) | _SINGLE_LETTERS  # plus every single ASCII lowercase letter
+# ── Individual text cleaners ───────────────────────────────────────────────────
+_AUTHOR_COMMENT_PATTERN = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)  # "author" through the nearest following "comment", newlines included
+def clean_html(text: str) -> str:
+    """Strip HTML tags and unescape HTML entities."""
+    if not text:
+        return ""
+    try:
+        soup = BeautifulSoup(str(text), "html.parser")
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+        cleaned = soup.get_text(separator=" ")
+    except Exception:
+        cleaned = str(text)
+    cleaned = html_lib.unescape(cleaned)
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    return cleaned
+def clean_text(text: str) -> str:
+    """Basic single-string cleaner: lowercase, remove URLs, non-alpha chars."""
+    if not text:
+        return ""
+    text = str(text).lower()
+    text = _AUTHOR_COMMENT_PATTERN.sub("", text)
+    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    return text
+def _preprocess_single(text: str) -> str:
+    """Full pipeline for one text string."""
+    # 1. Strip HTML
+    text = clean_html(text)
+    # 2. Lowercase + remove URLs/non-alpha
+    text = clean_text(text)
+    # 3. Stem (Sastrawi)
+    if _stemmer:
+        text = _stemmer.stem(text)
+    # 4. Remove stopwords & noise
+    tokens = [
+        w for w in text.split()
+        if w not in FINAL_STOPWORDS and len(w) > 1
+    ]
+    # 5. Keep only tokens with at least one letter
+    tokens = [t for t in tokens if re.search(r'[a-z]', t)]
+    return " ".join(tokens).strip()
+# ── Public API ─────────────────────────────────────────────────────────────────
+def preprocess_text(texts) -> list:
+    """
+    Accept either a single string or a list of strings.
+    Returns a list of cleaned strings.
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+    return [_preprocess_single(t) for t in texts if isinstance(t, str)]

services/sentiment.py ADDED Viewed

	@@ -0,0 +1,159 @@

+"""
+sentiment.py  –  Sentiment analysis using IndoBERT / HuggingFace pipeline.
+Model is loaded lazily (first call) to avoid crashing at import time.
+"""
+from __future__ import annotations
+import os
+from typing import Optional
+# ── Model configuration ────────────────────────────────────────────────────────
+# If you have a local fine-tuned model, place it in the indoBERT-sentiment
+# directory one level above this file's directory (see _LOCAL_MODEL_DIR).
+# When that directory is missing or empty, _HF_MODEL_ID is loaded instead.
+_LOCAL_MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "indoBERT-sentiment")
+_HF_MODEL_ID = "taufiqdp/indonesian-sentiment"
+# ── Lazy-loaded globals ────────────────────────────────────────────────────────
+_pipeline: Optional[object] = None  # set on first use by _load_pipeline()
+def _load_pipeline():
+    global _pipeline
+    if _pipeline is not None:
+        return _pipeline
+    import torch
+    from transformers import pipeline as hf_pipeline
+    # Prefer local model if it exists (avoids repeated downloads in Docker)
+    if os.path.isdir(_LOCAL_MODEL_DIR) and os.listdir(_LOCAL_MODEL_DIR):
+        model_source = _LOCAL_MODEL_DIR
+        print(f"[Sentiment] Loading model from local dir: {model_source}")
+    else:
+        model_source = _HF_MODEL_ID
+        print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {model_source}")
+    device = 0 if torch.cuda.is_available() else -1
+    _pipeline = hf_pipeline(
+        "text-classification",
+        model=model_source,
+        tokenizer=model_source,
+        device=device,
+        truncation=True,
+        max_length=256,
+        return_all_scores=False,
+    )
+    print("[Sentiment] Model loaded successfully.")
+    return _pipeline
+# ── Helpers ────────────────────────────────────────────────────────────────────
+def _normalize_label(lbl: str) -> str:
+    """Normalise raw model label to 'positif', 'negatif', or 'netral'."""
+    l = lbl.lower()
+    if l in ("positif", "positive", "pos"):
+        return "positif"
+    if l in ("negatif", "negative", "neg"):
+        return "negatif"
+    if l in ("netral", "neutral", "neu"):
+        return "netral"
+    if "label_" in l:
+        try:
+            from transformers import AutoConfig
+            cfg = AutoConfig.from_pretrained(_HF_MODEL_ID)
+            idx = int(l.split("_")[-1])
+            return _normalize_label(cfg.id2label[idx])
+        except Exception:
+            return "netral"
+    return "netral"
+# ── Keywords Override ──────────────────────────────────────────────────────────
+_NEGATIVE_KEYWORDS = {
+    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
+    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
+    "kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur",
+    "rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang",
+    "palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
+    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
+}
+_POSITIVE_KEYWORDS = {
+    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
+    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
+    "sempurna", "berhasil", "luas", "indah"
+}
+_NEUTRAL_KEYWORDS = {
+    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
+}
+def _override_label(text: str, model_label: str) -> str:
+    text_lower = text.lower()
+    if any(w in text_lower for w in _NEGATIVE_KEYWORDS):
+        return "negatif"
+    if any(w in text_lower for w in _POSITIVE_KEYWORDS):
+        return "positif"
+    if any(w in text_lower for w in _NEUTRAL_KEYWORDS):
+        return "netral"
+    return model_label
+# ── Public API ─────────────────────────────────────────────────────────────────
+def analyze_sentiment(texts: list) -> dict:
+    """
+    Run sentiment analysis on a list of text strings.
+    Args:
+        texts: list of pre-processed strings
+    Returns:
+        dict with keys: positif, negatif, netral, total, detail
+        Example:
+          {
+            "positif": 12, "negatif": 4, "netral": 6, "total": 22,
+            "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
+          }
+    """
+    if not texts:
+        return {"positif": 0, "negatif": 0, "netral": 0, "total": 0, "detail": []}
+    # Filter out empty strings
+    texts = [t for t in texts if t and t.strip()]
+    if not texts:
+        return {"positif": 0, "negatif": 0, "netral": 0, "total": 0, "detail": []}
+    clf = _load_pipeline()
+    try:
+        preds = clf(texts, batch_size=16, truncation=True)
+    except Exception as e:
+        print(f"[Sentiment] Prediction error: {e}")
+        return {"positif": 0, "negatif": 0, "netral": 0, "total": len(texts), "detail": []}
+    counts = {"positif": 0, "negatif": 0, "netral": 0}
+    detail = []
+    for text, pred in zip(texts, preds):
+        model_label = _normalize_label(pred["label"])
+        final_label = _override_label(text, model_label)
+        counts[final_label] += 1
+        detail.append({
+            "text": text[:200],
+            "label": final_label,
+            "score": round(float(pred["score"]), 4),
+        })
+    return {
+        "positif": counts["positif"],
+        "negatif": counts["negatif"],
+        "netral": counts["netral"],
+        "total": len(texts),
+        "detail": detail,
+    }

services/tiktok.py ADDED Viewed

	@@ -0,0 +1,320 @@

+"""
+tiktok.py  –  TikTok scraper using Selenium.
+Exports: scrape_tiktok(cookie_str, target_username) -> list[dict]
+Returns structured data per-video:
+  url, profile_username, upload_date, like_count,
+  caption_short, caption_detail, comments, scrape_date
+cookie_str accepts:
+  1. Raw string: "sessionid=xxx; tt_webid=yyy; ..."
+  2. JSON array:  [{"name":"sessionid","value":"xxx",...}, ...]
+  3. JSON object: {"sessionid": "xxx", "tt_webid": "yyy"}
+"""
+from __future__ import annotations
+import json
+import time
+from datetime import datetime
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from ._driver import _create_driver
+# ── Cookie injection ───────────────────────────────────────────────────────────
+def _inject_cookies(driver, cookie_str: str) -> bool:
+    driver.get("https://www.tiktok.com/")
+    time.sleep(3)
+    if not cookie_str or not cookie_str.strip():
+        print("[TikTok] Tidak ada cookie yang diberikan.")
+        return False
+    stripped = cookie_str.strip()
+    if stripped.startswith("["):
+        try:
+            cookies = json.loads(stripped)
+            count = 0
+            for c in cookies:
+                if not isinstance(c, dict) or "name" not in c:
+                    continue
+                safe = {k: c[k] for k in ("name", "value", "domain", "path", "secure", "httpOnly", "expiry") if k in c}
+                safe.setdefault("domain", ".tiktok.com")
+                try:
+                    driver.add_cookie(safe)
+                    count += 1
+                except Exception:
+                    safe.pop("domain", None)
+                    try:
+                        driver.add_cookie(safe)
+                        count += 1
+                    except Exception:
+                        pass
+            driver.refresh()
+            time.sleep(3)
+            return count > 0
+        except Exception as e:
+            print(f"[TikTok] JSON array error: {e}")
+    if stripped.startswith("{"):
+        try:
+            obj = json.loads(stripped)
+            count = 0
+            for name, value in obj.items():
+                try:
+                    driver.add_cookie({"name": str(name), "value": str(value), "domain": ".tiktok.com"})
+                    count += 1
+                except Exception:
+                    pass
+            driver.refresh()
+            time.sleep(3)
+            return count > 0
+        except Exception as e:
+            print(f"[TikTok] JSON object error: {e}")
+    try:
+        count = 0
+        for item in stripped.split(";"):
+            item = item.strip()
+            if "=" not in item:
+                continue
+            name, _, value = item.partition("=")
+            try:
+                driver.add_cookie({"name": name.strip(), "value": value.strip(), "domain": ".tiktok.com"})
+                count += 1
+            except Exception:
+                pass
+        driver.refresh()
+        time.sleep(3)
+        return count > 0
+    except Exception as e:
+        print(f"[TikTok] String cookie error: {e}")
+        return False
+# ── Scraping helpers ───────────────────────────────────────────────────────────
+_VIDEO_LINK_SELECTORS = [  # CSS selectors tried in this order by _get_video_links()
+    'div[data-e2e="user-post-item"] a',
+    'div[data-e2e="user-post-item-list"] a',
+    'a[href*="/video/"]',
+    'div[class*="DivItemContainerV2"] a',
+    'div[class*="DivWrapper"] a[href*="/video/"]',
+]
+def _get_video_links(driver, profile_url: str, max_videos: int = 30) -> list:
+    print(f"[TikTok] Membuka profil: {profile_url}")
+    driver.get(profile_url)
+    loaded = False
+    for sel in _VIDEO_LINK_SELECTORS:
+        try:
+            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, sel)))
+            loaded = True
+            break
+        except TimeoutException:
+            continue
+    if not loaded:
+        time.sleep(5)
+    links: set = set()
+    stall = 0
+    while len(links) < max_videos:
+        prev = len(links)
+        for sel in _VIDEO_LINK_SELECTORS:
+            for el in driver.find_elements(By.CSS_SELECTOR, sel):
+                href = el.get_attribute("href")
+                if href and "/video/" in href:
+                    links.add(href.split("?")[0])
+        if len(links) >= max_videos:
+            break
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(3)
+        if len(links) == prev:
+            stall += 1
+            if stall >= 3:
+                break
+        else:
+            stall = 0
+    return list(links)[:max_videos]
+def _scrape_video(driver, video_url: str, profile_username: str) -> dict | None:
+    """Open one video page and collect its metadata, caption and comments.
+    Every section is best effort: when an element is missing or a wait times
+    out, the matching field keeps its default value.
+    Args:
+        driver:           Selenium driver used to load and query the page.
+        video_url:        URL of the video page.
+        profile_username: stored unchanged in the result.
+    Returns:
+        dict with keys url, profile_username, upload_date, like_count,
+        caption_short, caption_detail, comments, scrape_date. Each comment is
+        ``{"author", "comment", "replies"}`` and each reply is
+        ``{"author", "comment"}``.
+    """
+    print(f"[TikTok] Memproses: {video_url}")
+    driver.get(video_url)
+    time.sleep(5)
+    video_data = {
+        "url": video_url,
+        "profile_username": profile_username,
+        "upload_date": "N/A",
+        "like_count": "N/A",
+        "caption_short": "",
+        "caption_detail": "",
+        "comments": [],
+        "scrape_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    }
+    # Upload date.
+    try:
+        date_el = WebDriverWait(driver, 8).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
+        )
+        video_data["upload_date"] = date_el.text.strip()
+    except TimeoutException:
+        pass
+    # Like count.
+    try:
+        like_el = driver.find_element(By.CSS_SELECTOR, 'strong[data-e2e="like-count"]')
+        video_data["like_count"] = like_el.text.strip()
+    except NoSuchElementException:
+        pass
+    # Caption: the short text first, then the expanded description/keywords if available.
+    try:
+        desc_container = WebDriverWait(driver, 5).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
+        )
+        try:
+            cap_el = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]')
+            video_data["caption_short"] = cap_el.text.strip()
+            try:
+                more_btn = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
+                driver.execute_script("arguments[0].click();", more_btn)
+                time.sleep(2)
+                detail_container = WebDriverWait(driver, 5).until(
+                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
+                )
+                desc_text = ""
+                try:
+                    desc_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-desc']").text
+                except NoSuchElementException:
+                    pass
+                kw_text = ""
+                try:
+                    kw_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-keywords']").text
+                except NoSuchElementException:
+                    pass
+                video_data["caption_detail"] = f"Deskripsi: {desc_text}\nKeywords: {kw_text}".strip()
+            except Exception:
+                pass
+        except NoSuchElementException:
+            pass
+    except TimeoutException:
+        pass
+    # Comments: expand reply threads and scroll, then read the loaded items.
+    try:
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
+        )
+        reply_xpath = "//span[contains(text(), 'balasan') or (contains(text(), 'View') and contains(text(), 'repl'))]"
+        stall = 0
+        last_count = 0
+        # At most 15 rounds; each round either clicks one reply toggle or scrolls once.
+        for _ in range(15):
+            try:
+                btns = driver.find_elements(By.XPATH, reply_xpath)
+                if btns:
+                    driver.execute_script("arguments[0].click();", btns[0])
+                    time.sleep(2)
+                    stall = 0
+                    continue
+            except Exception:
+                pass
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(3)
+            cur = len(driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]'))
+            if cur > last_count:
+                last_count = cur
+                stall = 0
+            else:
+                # Four scrolls in a row without new items end the loop.
+                stall += 1
+                if stall >= 4:
+                    break
+        items = driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]')
+        for item in items:
+            try:
+                author_el = item.find_elements(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
+                if author_el:
+                    cat_text = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]').text.strip()
+                    if cat_text:
+                        video_data["comments"].append({
+                            "author": author_el[0].text.strip(),
+                            "comment": cat_text,
+                            "replies": []
+                        })
+                    continue
+                # Check for replies (level 2)
+                r_author_el = item.find_elements(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
+                if r_author_el and video_data["comments"]:
+                    r_text = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]').text.strip()
+                    if r_text:
+                        # A reply is attached to the most recently collected top-level comment.
+                        video_data["comments"][-1]["replies"].append({
+                            "author": r_author_el[0].text.strip(),
+                            "comment": r_text
+                        })
+            except Exception:
+                pass
+    except TimeoutException:
+        pass
+    return video_data
+# ── Public API ─────────────────────────────────────────────────────────────────
+def scrape_tiktok(cookie_str: str, target_username: str, max_videos: int = 20) -> list:
+    """
+    Scrape captions & comments from a TikTok profile.
+    Returns:
+        list of dicts with: url, profile_username, upload_date, like_count,
+                            caption_short, caption_detail, comments, scrape_date
+    """
+    if not target_username:
+        print("[TikTok] target_username tidak ada.")
+        return []
+    username = target_username.lstrip("@")
+    profile_url = f"https://www.tiktok.com/@{username}"
+    driver = _create_driver(mobile=False)
+    all_data: list = []
+    try:
+        if cookie_str and cookie_str.strip():
+            _inject_cookies(driver, cookie_str)
+        links = _get_video_links(driver, profile_url, max_videos)
+        for url in links:
+            try:
+                data = _scrape_video(driver, url, username)
+                if data:
+                    all_data.append(data)
+            except Exception as e:
+                print(f"[TikTok] Error {url}: {e}")
+            time.sleep(1.5)
+    except Exception as e:
+        print(f"[TikTok] Fatal error: {e}")
+    finally:
+        try:
+            driver.quit()
+        except Exception:
+            pass
+    return all_data

services/wordcloud_service.py ADDED Viewed

	@@ -0,0 +1,120 @@

+"""
+wordcloud_service.py  –  Generate a word-cloud image from a list of texts.
+Stripped from the original Colab notebook; only the generation function remains.
+"""
+from __future__ import annotations
+import io
+import os
+import re
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")  # Must be before pyplot import — headless/no-display
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+# ── Stopwords (library lists plus a short slang list) ──────────────────────────
+try:
+    from stop_words import get_stop_words
+    _stopwords_id = get_stop_words('indonesian')
+except Exception:
+    _stopwords_id = []  # stop_words missing or failed: continue without its list
+try:
+    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
+    _sastrawi_sw = StopWordRemoverFactory().get_stop_words()
+except Exception:
+    _sastrawi_sw = []  # Sastrawi missing or failed: continue without its list
+_EXTRA_STOPWORDS = [  # slang/filler words added on top of the library lists
+    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg',
+    'deh','sih','kok','dong','udah','ya','banget','pakai','jadi','baru',
+]
+_BLOCKLIST = set(_stopwords_id + _sastrawi_sw + _EXTRA_STOPWORDS)
+_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
+WORDCLOUD_STOPWORDS = _BLOCKLIST | _SINGLE_LETTERS  # every word excluded from the cloud
+# ── Internal helpers ───────────────────────────────────────────────────────────
+def _merge_texts(texts: list) -> str:
+    """Join a list of strings, keeping only alphabetic tokens."""
+    joined = " ".join(str(t) for t in texts if t)
+    tokens = joined.lower().split()
+    tokens = [
+        w for w in tokens
+        if re.match(r'^[a-z]+$', w) and w not in WORDCLOUD_STOPWORDS and len(w) > 2
+    ]
+    return " ".join(tokens)
+def _circular_mask(size: int = 400) -> np.ndarray:
+    x, y = np.ogrid[:size, :size]
+    center = size // 2
+    radius = center - 10
+    mask = (x - center) ** 2 + (y - center) ** 2 > radius ** 2
+    return (255 * mask).astype(np.uint8)
+# ── Public API ─────────────────────────────────────────────────────────────────
+def generate_wordcloud(texts: list, output_dest) -> bool:
+    """
+    Generate a circular wordcloud from a list of text strings.
+    Args:
+        texts:       list of strings (raw or pre-processed)
+        output_dest: file path string OR a BytesIO buffer.
+                     If a string path is given, the PNG is saved to disk.
+                     If a BytesIO buffer is given, the PNG is written there
+                     (no file is created on disk).
+    Returns:
+        True on success, False on failure.
+    """
+    if not texts:
+        print("[WordCloud] No texts provided.")
+        return False
+    text_data = _merge_texts(texts)
+    if not text_data.strip():
+        print("[WordCloud] All text was filtered out by stopwords; nothing to plot.")
+        return False
+    # If saving to a file path, ensure the directory exists
+    if isinstance(output_dest, str):
+        output_dir = os.path.dirname(output_dest)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+    try:
+        mask = _circular_mask(400)
+        wc = WordCloud(
+            width=800,
+            height=800,
+            background_color="white",
+            colormap="viridis",
+            mask=mask,
+            contour_width=2,
+            contour_color="steelblue",
+            stopwords=WORDCLOUD_STOPWORDS,
+            max_words=100,
+        ).generate(text_data)
+        fig, ax = plt.subplots(figsize=(8, 8))
+        ax.imshow(wc, interpolation="bilinear")
+        ax.axis("off")
+        plt.tight_layout(pad=0)
+        plt.savefig(output_dest, dpi=150, bbox_inches="tight", format="png")
+        plt.close(fig)
+        if isinstance(output_dest, str):
+            print(f"[WordCloud] Saved to {output_dest}")
+        else:
+            print("[WordCloud] Written to in-memory buffer (temporal).")
+        return True
+    except Exception as e:
+        print(f"[WordCloud] Error generating wordcloud: {e}")
+        return False

templates/index.html ADDED Viewed

	@@ -0,0 +1,1009 @@

+<!DOCTYPE html>
+<html lang="id">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>SentiScope — Sentiment Analysis Dashboard</title>
+  <meta name="description" content="Dashboard analisis sentimen media sosial dengan scraping otomatis, word cloud, dan indoBERT.">
+  <link rel="preconnect" href="https://fonts.googleapis.com">
+  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Space+Grotesk:wght@400;500;600;700&display=swap" rel="stylesheet">
+  <style>
+    /* ── Reset & Base ──────────────────────────────────────────────────── */
+    *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
+    :root {
+      --bg: #07071a;
+      --surface: #0e0e28;
+      --surface-2: #14143a;
+      --border: rgba(130, 100, 255, 0.18);
+      --border-hover: rgba(130, 100, 255, 0.42);
+      --purple: #7c3aed;
+      --purple-light: #a855f7;
+      --cyan: #06b6d4;
+      --text: #e2e8f0;
+      --text-muted: #8892a4;
+      --text-dim: #4b5563;
+      --radius: 14px;
+      --radius-sm: 8px;
+      --transition: 0.22s cubic-bezier(0.4, 0, 0.2, 1);
+    }
+    html { scroll-behavior: smooth; }
+    body {
+      font-family: 'Inter', system-ui, sans-serif;
+      background: var(--bg);
+      color: var(--text);
+      min-height: 100vh;
+      overflow-x: hidden;
+    }
+    body::before {
+      content: '';
+      position: fixed;
+      inset: 0;
+      background:
+        radial-gradient(ellipse 70% 50% at 15% 20%, rgba(124,58,237,0.12) 0%, transparent 60%),
+        radial-gradient(ellipse 50% 40% at 85% 75%, rgba(6,182,212,0.10) 0%, transparent 60%),
+        radial-gradient(ellipse 40% 35% at 50% 5%, rgba(168,85,247,0.08) 0%, transparent 55%);
+      pointer-events: none;
+      z-index: 0;
+    }
+    /* ── Layout ────────────────────────────────────────────────────────── */
+    .wrapper {
+      position: relative;
+      z-index: 1;
+      max-width: 920px;
+      margin: 0 auto;
+      padding: 2.5rem 1.25rem 4rem;
+    }
+    /* ── Hero ───────────────────────────────────────────────────────────── */
+    .hero { text-align: center; margin-bottom: 2.5rem; }
+    .hero-badge {
+      display: inline-flex;
+      align-items: center;
+      gap: 0.45rem;
+      background: rgba(124,58,237,0.15);
+      border: 1px solid rgba(124,58,237,0.35);
+      border-radius: 100px;
+      padding: 0.28rem 0.9rem;
+      font-size: 0.75rem;
+      font-weight: 600;
+      color: var(--purple-light);
+      letter-spacing: 0.04em;
+      text-transform: uppercase;
+      margin-bottom: 1rem;
+    }
+    .hero h1 {
+      font-family: 'Space Grotesk', sans-serif;
+      font-size: clamp(2rem, 5vw, 3.2rem);
+      font-weight: 700;
+      line-height: 1.15;
+      background: linear-gradient(135deg, #c084fc 0%, #818cf8 40%, #38bdf8 100%);
+      -webkit-background-clip: text;
+      -webkit-text-fill-color: transparent;
+      background-clip: text;
+      margin-bottom: 0.7rem;
+    }
+    .hero p {
+      color: var(--text-muted);
+      font-size: 0.95rem;
+      max-width: 520px;
+      margin: 0 auto;
+      line-height: 1.6;
+    }
+    /* ── Tab navigation ────────────────────────────────────────────────── */
+    .tab-nav {
+      display: flex;
+      gap: 0.5rem;
+      background: var(--surface);
+      border: 1px solid var(--border);
+      border-radius: var(--radius);
+      padding: 0.4rem;
+      margin-bottom: 2rem;
+    }
+    .tab-btn {
+      flex: 1;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      gap: 0.5rem;
+      padding: 0.7rem 1.2rem;
+      border: none;
+      border-radius: var(--radius-sm);
+      background: transparent;
+      color: var(--text-muted);
+      font-family: 'Inter', sans-serif;
+      font-size: 0.88rem;
+      font-weight: 500;
+      cursor: pointer;
+      transition: var(--transition);
+    }
+    .tab-btn:hover { color: var(--text); background: rgba(255,255,255,0.05); }
+    .tab-btn.active {
+      background: linear-gradient(135deg, rgba(124,58,237,0.35), rgba(6,182,212,0.2));
+      color: #fff;
+      font-weight: 600;
+      box-shadow: 0 0 0 1px rgba(124,58,237,0.5) inset;
+    }
+    /* ── Tab panels ─────────────────────────────────────────────────────── */
+    .tab-panel { display: none; }
+    .tab-panel.active { display: block; }
+    /* ── Glass card ─────────────────────────────────────────────────────── */
+    .card {
+      background: linear-gradient(135deg, rgba(14,14,40,0.9) 0%, rgba(20,20,58,0.75) 100%);
+      border: 1px solid var(--border);
+      border-radius: var(--radius);
+      padding: 1.6rem;
+      margin-bottom: 1.25rem;
+      /* Prefixed fallback first: older Safari only understands -webkit-backdrop-filter. */
+      -webkit-backdrop-filter: blur(12px);
+      backdrop-filter: blur(12px);
+      transition: border-color var(--transition), box-shadow var(--transition);
+    }
+    .card:hover { border-color: var(--border-hover); }
+    /* ── Platform header ────────────────────────────────────────────────── */
+    .platform-header {
+      display: flex;
+      align-items: center;
+      justify-content: space-between;
+      margin-bottom: 1.1rem;
+    }
+    .platform-title {
+      display: flex;
+      align-items: center;
+      gap: 0.6rem;
+      font-family: 'Space Grotesk', sans-serif;
+      font-size: 1rem;
+      font-weight: 600;
+      color: #c4b5fd;
+    }
+    .platform-icon {
+      width: 32px;
+      height: 32px;
+      border-radius: 8px;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      font-size: 1rem;
+    }
+    .pi-instagram { background: linear-gradient(135deg, #f09433, #e6683c, #dc2743, #cc2366, #bc1888); }
+    .pi-tiktok { background: #161823; border: 1px solid #333; }
+    .pi-facebook { background: #1877f2; }
+    .pi-news { background: linear-gradient(135deg, #0ea5e9, #6366f1); }
+    .pi-dataset { background: linear-gradient(135deg, #059669, #0891b2); }
+    /* ── Toggle switch ──────────────────────────────────────────────────── */
+    .toggle-wrap { display: flex; align-items: center; gap: 0.6rem; }
+    .toggle-label { font-size: 0.78rem; color: var(--text-dim); font-weight: 500; }
+    .toggle { position: relative; width: 42px; height: 24px; }
+    .toggle input { opacity: 0; width: 0; height: 0; }
+    .slider {
+      position: absolute;
+      inset: 0;
+      background: rgba(255,255,255,0.1);
+      border-radius: 100px;
+      cursor: pointer;
+      transition: var(--transition);
+    }
+    .slider::before {
+      content: '';
+      position: absolute;
+      width: 18px;
+      height: 18px;
+      left: 3px;
+      top: 3px;
+      background: white;
+      border-radius: 50%;
+      transition: var(--transition);
+    }
+    .toggle input:checked + .slider { background: linear-gradient(135deg, var(--purple), var(--cyan)); }
+    .toggle input:checked + .slider::before { transform: translateX(18px); }
+    .platform-fields {
+      overflow: hidden;
+      transition: max-height 0.35s ease, opacity 0.3s ease;
+    }
+    .platform-fields.collapsed {
+      max-height: 0 !important;
+      opacity: 0;
+      pointer-events: none;
+    }
+    /* ── Form elements ──────────────────────────────────────────────────── */
+    .form-row { display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; }
+    .form-row.cols-3 { grid-template-columns: 1fr 1fr 1fr; }
+    .form-group { display: flex; flex-direction: column; gap: 0.3rem; }
+    .form-group.full { grid-column: 1 / -1; }
+    label { font-size: 0.78rem; color: var(--text-muted); font-weight: 500; letter-spacing: 0.01em; }
+    input[type="text"],
+    input[type="password"],
+    input[type="number"],
+    textarea,
+    select {
+      background: rgba(7,7,26,0.7);
+      border: 1px solid rgba(130,100,255,0.2);
+      border-radius: var(--radius-sm);
+      color: var(--text);
+      padding: 0.65rem 0.9rem;
+      font-family: 'Inter', sans-serif;
+      font-size: 0.88rem;
+      width: 100%;
+      transition: border-color var(--transition), box-shadow var(--transition);
+      outline: none;
+    }
+    input::placeholder, textarea::placeholder { color: var(--text-dim); }
+    input:focus, textarea:focus, select:focus {
+      border-color: var(--purple);
+      box-shadow: 0 0 0 3px rgba(124,58,237,0.2);
+    }
+    select option { background: var(--surface-2); }
+    textarea { resize: vertical; min-height: 88px; line-height: 1.5; }
+    .field-hint { font-size: 0.72rem; color: var(--text-dim); line-height: 1.4; margin-top: 0.2rem; }
+    /* ── Cookie tabs ────────────────────────────────────────────────────── */
+    .cookie-tabs { display: flex; gap: 0.3rem; margin-bottom: 0.5rem; }
+    .cookie-tab-btn {
+      padding: 0.25rem 0.7rem;
+      font-size: 0.72rem;
+      font-weight: 600;
+      border: 1px solid rgba(130,100,255,0.25);
+      border-radius: 6px;
+      background: transparent;
+      color: var(--text-muted);
+      cursor: pointer;
+      transition: var(--transition);
+    }
+    .cookie-tab-btn.active {
+      background: rgba(124,58,237,0.25);
+      color: #c4b5fd;
+      border-color: rgba(124,58,237,0.5);
+    }
+    /* ── Tag hint ───────────────────────────────────────────────────────── */
+    .tag-hint {
+      display: inline-flex;
+      align-items: center;
+      gap: 0.3rem;
+      font-size: 0.72rem;
+      color: var(--cyan);
+      background: rgba(6,182,212,0.1);
+      border: 1px solid rgba(6,182,212,0.25);
+      border-radius: 6px;
+      padding: 0.15rem 0.55rem;
+      margin-top: 0.3rem;
+    }
+    /* ── Portal chips ───────────────────────────────────────────────────── */
+    .portal-grid {
+      display: grid;
+      grid-template-columns: repeat(auto-fill, minmax(160px, 1fr));
+      gap: 0.5rem;
+    }
+    .portal-chip {
+      display: flex;
+      align-items: center;
+      gap: 0.5rem;
+      padding: 0.55rem 0.75rem;
+      border: 1px solid rgba(130,100,255,0.2);
+      border-radius: var(--radius-sm);
+      cursor: pointer;
+      background: rgba(7,7,26,0.5);
+      transition: var(--transition);
+      user-select: none;
+    }
+    .portal-chip:hover { border-color: rgba(130,100,255,0.45); background: rgba(124,58,237,0.1); }
+    .portal-chip input[type="checkbox"] { display: none; }
+    .portal-chip.checked { border-color: var(--purple); background: rgba(124,58,237,0.2); }
+    .chip-label { font-size: 0.82rem; font-weight: 500; color: var(--text-muted); }
+    .portal-chip.checked .chip-label { color: var(--text); }
+    .chip-dot {
+      width: 8px;
+      height: 8px;
+      border-radius: 50%;
+      background: var(--text-dim);
+      flex-shrink: 0;
+      transition: var(--transition);
+    }
+    .portal-chip.checked .chip-dot { background: var(--purple-light); }
+    /* ── Submit button ──────────────────────────────────────────────────── */
+    .btn-submit {
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      gap: 0.6rem;
+      width: 100%;
+      padding: 1rem;
+      background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 50%, #0891b2 100%);
+      border: none;
+      border-radius: var(--radius);
+      color: #fff;
+      font-family: 'Space Grotesk', sans-serif;
+      font-size: 1rem;
+      font-weight: 600;
+      cursor: pointer;
+      transition: opacity var(--transition), transform var(--transition), box-shadow var(--transition);
+      letter-spacing: 0.02em;
+      margin-top: 0.5rem;
+      position: relative;
+      overflow: hidden;
+    }
+    .btn-submit::before {
+      content: '';
+      position: absolute;
+      inset: 0;
+      background: linear-gradient(135deg, rgba(255,255,255,0.12), transparent);
+      opacity: 0;
+      transition: opacity var(--transition);
+    }
+    .btn-submit:hover::before { opacity: 1; }
+    .btn-submit:hover { transform: translateY(-2px); box-shadow: 0 8px 32px rgba(124,58,237,0.45); }
+    .btn-submit:active { transform: translateY(0); }
+    .btn-submit:disabled { opacity: 0.65; pointer-events: none; cursor: not-allowed; transform: none; }
+    /* ── Spinner ────────────────────────────────────────────────────────── */
+    .spinner {
+      display: none;
+      width: 18px;
+      height: 18px;
+      border: 2.5px solid rgba(255,255,255,0.3);
+      border-top-color: #fff;
+      border-radius: 50%;
+      animation: spin 0.7s linear infinite;
+      flex-shrink: 0;
+    }
+    @keyframes spin { to { transform: rotate(360deg); } }
+    /* ── Alert ──────────────────────────────────────────────────────────── */
+    .alert {
+      border-radius: var(--radius);
+      padding: 1rem 1.25rem;
+      margin-bottom: 1.5rem;
+      font-size: 0.88rem;
+      border: 1px solid;
+      display: flex;
+      gap: 0.6rem;
+      align-items: flex-start;
+    }
+    .alert-error {
+      background: rgba(239,68,68,0.08);
+      border-color: rgba(239,68,68,0.3);
+      color: #fca5a5;
+    }
+    /* ── Results section ────────────────────────────────────────────────── */
+    .results-section { margin-top: 2.5rem; }
+    .results-header {
+      display: flex;
+      align-items: center;
+      gap: 0.6rem;
+      margin-bottom: 1.5rem;
+    }
+    .results-header h2 {
+      font-family: 'Space Grotesk', sans-serif;
+      font-size: 1.3rem;
+      font-weight: 700;
+      background: linear-gradient(135deg, var(--cyan), var(--purple-light));
+      -webkit-background-clip: text;
+      -webkit-text-fill-color: transparent;
+      background-clip: text;
+    }
+    .stats-strip {
+      font-size: 0.8rem;
+      color: var(--text-dim);
+      background: rgba(255,255,255,0.04);
+      border: 1px solid var(--border);
+      border-radius: 8px;
+      padding: 0.4rem 0.9rem;
+      margin-left: auto;
+    }
+    /* ── Sentiment cards ───────────────────────────────────────────────── */
+    .sentiment-grid {
+      display: grid;
+      grid-template-columns: repeat(3, 1fr);
+      gap: 1rem;
+      margin-bottom: 1.5rem;
+    }
+    .s-card {
+      border-radius: var(--radius);
+      padding: 1.4rem 1rem;
+      text-align: center;
+      border: 1px solid;
+      position: relative;
+      overflow: hidden;
+    }
+    .s-card::before { content: ''; position: absolute; inset: 0; opacity: 0.06; border-radius: inherit; }
+    .s-card.positif { background: rgba(34,197,94,0.08); border-color: rgba(34,197,94,0.3); }
+    .s-card.positif::before { background: #22c55e; }
+    .s-card.negatif { background: rgba(239,68,68,0.08); border-color: rgba(239,68,68,0.3); }
+    .s-card.negatif::before { background: #ef4444; }
+    .s-card.netral { background: rgba(148,163,184,0.06); border-color: rgba(148,163,184,0.2); }
+    .s-card.netral::before { background: #94a3b8; }
+    .s-count { font-family: 'Space Grotesk', sans-serif; font-size: 2.8rem; font-weight: 700; line-height: 1; margin-bottom: 0.3rem; }
+    .s-card.positif .s-count { color: #4ade80; }
+    .s-card.negatif .s-count { color: #f87171; }
+    .s-card.netral .s-count { color: #94a3b8; }
+    .s-label { font-size: 0.82rem; color: var(--text-muted); font-weight: 500; }
+    .s-bar-wrap { margin-top: 0.8rem; height: 4px; background: rgba(255,255,255,0.08); border-radius: 100px; overflow: hidden; }
+    .s-bar { height: 100%; border-radius: 100px; transition: width 1.2s cubic-bezier(0.4,0,0.2,1); }
+    .s-card.positif .s-bar { background: linear-gradient(90deg, #16a34a, #4ade80); }
+    .s-card.negatif .s-bar { background: linear-gradient(90deg, #b91c1c, #f87171); }
+    .s-card.netral .s-bar { background: linear-gradient(90deg, #475569, #94a3b8); }
+    /* ── Word cloud ─────────────────────────────────────────────────────── */
+    .wordcloud-card {
+      background: var(--surface);
+      border: 1px solid var(--border);
+      border-radius: var(--radius);
+      padding: 1.5rem;
+      text-align: center;
+    }
+    .wordcloud-card h3 {
+      font-family: 'Space Grotesk', sans-serif;
+      font-size: 1rem;
+      color: var(--purple-light);
+      margin-bottom: 1rem;
+    }
+    .wordcloud-img { max-width: 100%; border-radius: 10px; border: 1px solid var(--border); }
+    /* ── Divider ────────────────────────────────────────────────────────── */
+    .divider {
+      display: flex;
+      align-items: center;
+      gap: 0.75rem;
+      color: var(--text-dim);
+      font-size: 0.75rem;
+      margin: 0.75rem 0;
+    }
+    .divider::before, .divider::after { content: ''; flex: 1; height: 1px; background: var(--border); }
+    /* ── Section label ──────────────────────────────────────────────────── */
+    .section-label {
+      font-size: 0.7rem;
+      font-weight: 700;
+      text-transform: uppercase;
+      letter-spacing: 0.08em;
+      color: var(--text-dim);
+      margin-bottom: 0.6rem;
+    }
+    /* ── File upload ────────────────────────────────────────────────────── */
+    .upload-zone {
+      border: 2px dashed rgba(130,100,255,0.28);
+      border-radius: var(--radius);
+      padding: 2.5rem 1.5rem;
+      text-align: center;
+      transition: var(--transition);
+      cursor: pointer;
+      background: rgba(124,58,237,0.04);
+      position: relative;
+    }
+    .upload-zone:hover, .upload-zone.drag-over { border-color: var(--purple); background: rgba(124,58,237,0.1); }
+    .upload-zone input[type="file"] { position: absolute; inset: 0; opacity: 0; cursor: pointer; width: 100%; height: 100%; }
+    .upload-icon { font-size: 2rem; margin-bottom: 0.5rem; }
+    .upload-text { font-size: 0.9rem; color: var(--text-muted); }
+    .upload-sub { font-size: 0.78rem; color: var(--text-dim); margin-top: 0.3rem; }
+    .upload-filename { display: none; margin-top: 0.6rem; font-size: 0.82rem; color: var(--cyan); font-weight: 500; }
+    /* ── Responsive ─────────────────────────────────────────────────────── */
+    @media (max-width: 640px) {
+      .form-row { grid-template-columns: 1fr; }
+      .form-row.cols-3 { grid-template-columns: 1fr 1fr; }
+      .sentiment-grid { grid-template-columns: 1fr; }
+      .tab-btn span.tab-text { display: none; }
+      .hero h1 { font-size: 1.8rem; }
+    }
+    /* ── Animations ─────────────────────────────────────────────────────── */
+    @keyframes fadeUp {
+      from { opacity: 0; transform: translateY(20px); }
+      to   { opacity: 1; transform: translateY(0); }
+    }
+    .animate-in { animation: fadeUp 0.5s ease both; }
+    .delay-1 { animation-delay: 0.05s; }
+    .delay-2 { animation-delay: 0.10s; }
+    .delay-3 { animation-delay: 0.15s; }
+    .delay-4 { animation-delay: 0.20s; }
+    .delay-5 { animation-delay: 0.25s; }
+  </style>
+</head>
+<body>
+<div class="wrapper">
+  <!-- Hero -->
+  <header class="hero animate-in">
+    <div class="hero-badge">🔬 AI-Powered</div>
+    <h1>SentiScope</h1>
+    <p>Analisis sentimen media sosial otomatis dengan IndoBERT — Instagram, TikTok, Facebook &amp; Berita Online.</p>
+  </header>
+  <!-- Error alert -->
+  {% if error %}
+  <div class="alert alert-error animate-in" role="alert">
+    <span>⚠️</span>
+    <span>{{ error }}</span>
+  </div>
+  {% endif %}
+  <!-- Tab navigation -->
+  <nav class="tab-nav animate-in delay-1" role="tablist">
+    <button class="tab-btn {% if active_tab != 'dataset' %}active{% endif %}"
+            id="tab-scraping" role="tab" onclick="switchTab('scraping')">
+      <span class="tab-icon">🕷️</span>
+      <span class="tab-text">Scraping Otomatis</span>
+    </button>
+    <button class="tab-btn {% if active_tab == 'dataset' %}active{% endif %}"
+            id="tab-dataset" role="tab" onclick="switchTab('dataset')">
+      <span class="tab-icon">📂</span>
+      <span class="tab-text">Upload Dataset</span>
+    </button>
+  </nav>
+  <!-- ═══════════════════════ TAB 1: Scraping ═══════════════════════════ -->
+  <div class="tab-panel {% if active_tab != 'dataset' %}active{% endif %}" id="panel-scraping">
+    <form id="scraping-form" action="/process" method="post">
+      <!-- Hidden enable flags — managed by JS toggles -->
+      <input type="hidden" id="enable_instagram" name="enable_instagram" value="">
+      <input type="hidden" id="enable_tiktok"    name="enable_tiktok"    value="">
+      <input type="hidden" id="enable_facebook"  name="enable_facebook"  value="">
+      <input type="hidden" id="enable_news"      name="enable_news"      value="">
+      <!-- ── Instagram ──────────────────────────────────────────────── -->
+      <div class="card animate-in delay-2">
+        <div class="platform-header">
+          <div class="platform-title">
+            <div class="platform-icon pi-instagram">📸</div>
+            Instagram
+          </div>
+          <div class="toggle-wrap">
+            <span class="toggle-label" id="ig-toggle-label">Nonaktif</span>
+            <label class="toggle">
+              <input type="checkbox" id="ig-toggle" onchange="togglePlatform('ig')">
+              <span class="slider"></span>
+            </label>
+          </div>
+        </div>
+        <div class="platform-fields collapsed" id="ig-fields" style="max-height:600px;">
+          <div class="form-row" style="margin-bottom:0.9rem;">
+            <div class="form-group">
+              <label for="ig_username">Username Instagram</label>
+              <input id="ig_username" type="text" name="ig_username" placeholder="akun_instagram" autocomplete="username">
+            </div>
+            <div class="form-group">
+              <label for="ig_password">Password Instagram</label>
+              <input id="ig_password" type="password" name="ig_password" placeholder="••••••••" autocomplete="current-password">
+            </div>
+          </div>
+          <div class="form-row">
+            <div class="form-group full">
+              <label for="target_accounts">Target Akun / #Hashtag (satu per baris)</label>
+              <textarea id="target_accounts" name="target_accounts"
+                        placeholder="cirebonkab&#10;@rctvcirebon&#10;#jalanrusak"></textarea>
+              <span class="tag-hint">↵ Satu target per baris, @ dan # opsional</span>
+            </div>
+            <div class="form-group">
+              <label for="mode">Mode Waktu</label>
+              <select id="mode" name="mode">
+                <option value="all">Semua Postingan</option>
+                <option value="date">7 Bulan Terakhir</option>
+              </select>
+            </div>
+          </div>
+        </div>
+      </div>
+      <!-- ── TikTok ──────────────────────────────────────────────────── -->
+      <div class="card animate-in delay-3">
+        <div class="platform-header">
+          <div class="platform-title">
+            <div class="platform-icon pi-tiktok">🎵</div>
+            TikTok
+          </div>
+          <div class="toggle-wrap">
+            <span class="toggle-label" id="tt-toggle-label">Nonaktif</span>
+            <label class="toggle">
+              <input type="checkbox" id="tt-toggle" onchange="togglePlatform('tt')">
+              <span class="slider"></span>
+            </label>
+          </div>
+        </div>
+        <div class="platform-fields collapsed" id="tt-fields" style="max-height:500px;">
+          <!-- TikTok cookie input: the label is bound to the textarea via `for`,
+               and the format hint is exposed to assistive tech via aria-describedby. -->
+          <div class="form-group" style="margin-bottom:0.9rem;">
+            <label for="tiktok_cookie">Format Cookie TikTok</label>
+            <div class="cookie-tabs">
+              <button type="button" class="cookie-tab-btn active" onclick="setCookieHint('raw',this)">String Mentah</button>
+              <button type="button" class="cookie-tab-btn" onclick="setCookieHint('json_arr',this)">JSON Array</button>
+              <button type="button" class="cookie-tab-btn" onclick="setCookieHint('json_obj',this)">JSON Object</button>
+            </div>
+            <textarea id="tiktok_cookie" name="tiktok_cookie"
+                      placeholder="sessionid=xxx; tt_webid=yyy; ..."
+                      aria-describedby="cookie-hint"
+                      style="min-height:70px;font-family:monospace;font-size:0.8rem;"></textarea>
+            <p class="field-hint" id="cookie-hint">
+              Format: <code>sessionid=ABC; tt_webid=123</code> — ambil dari DevTools → Application → Cookies → tiktok.com
+            </p>
+          </div>
+          <div class="form-group">
+            <label for="tiktok_targets">Target Username TikTok (satu per baris)</label>
+            <textarea id="tiktok_targets" name="tiktok_targets"
+                      placeholder="@rctvcirebon&#10;@cirebonnews&#10;kuningan_update"></textarea>
+            <span class="tag-hint">↵ Satu username per baris, @ opsional</span>
+          </div>
+        </div>
+      </div>
+      <!-- ── Facebook ────────────────────────────────────────────────── -->
+      <div class="card animate-in delay-3">
+        <div class="platform-header">
+          <div class="platform-title">
+            <div class="platform-icon pi-facebook">📘</div>
+            Facebook
+          </div>
+          <div class="toggle-wrap">
+            <span class="toggle-label" id="fb-toggle-label">Nonaktif</span>
+            <label class="toggle">
+              <input type="checkbox" id="fb-toggle" onchange="togglePlatform('fb')">
+              <span class="slider"></span>
+            </label>
+          </div>
+        </div>
+        <div class="platform-fields collapsed" id="fb-fields" style="max-height:500px;">
+          <div class="form-row" style="margin-bottom:0.9rem;">
+            <div class="form-group">
+              <label for="fb_username">Email / No. HP Facebook</label>
+              <input id="fb_username" type="text" name="fb_username" placeholder="email@contoh.com" autocomplete="username">
+            </div>
+            <div class="form-group">
+              <label for="fb_password">Password Facebook</label>
+              <input id="fb_password" type="password" name="fb_password" placeholder="••••••••" autocomplete="current-password">
+            </div>
+          </div>
+          <div class="form-group">
+            <label for="facebook_groups">URL Grup Facebook (satu per baris, wajib diisi)</label>
+            <textarea id="facebook_groups" name="facebook_groups"
+                      placeholder="https://web.facebook.com/groups/123456&#10;https://web.facebook.com/groups/teraswarga"></textarea>
+            <p class="field-hint">⚠️ Harus diisi — tidak ada grup default. Jika kosong, Facebook tidak akan di-scrape.</p>
+          </div>
+        </div>
+      </div>
+      <!-- ── Berita Online ───────────────────────────────────────────── -->
+      <div class="card animate-in delay-4">
+        <div class="platform-header">
+          <div class="platform-title">
+            <div class="platform-icon pi-news">📰</div>
+            Berita Online
+          </div>
+          <div class="toggle-wrap">
+            <span class="toggle-label" id="news-toggle-label">Nonaktif</span>
+            <label class="toggle">
+              <input type="checkbox" id="news-toggle" onchange="togglePlatform('news')">
+              <span class="slider"></span>
+            </label>
+          </div>
+        </div>
+        <div class="platform-fields collapsed" id="news-fields" style="max-height:500px;">
+          <div class="section-label">Pilih Portal (bisa lebih dari satu)</div>
+          <div class="portal-grid" id="portal-grid">
+            <label class="portal-chip" onclick="toggleChip(this)">
+              <input type="checkbox" name="_portal_detik" value="detik">
+              <span class="chip-dot"></span><span class="chip-label">Detik.com</span>
+            </label>
+            <label class="portal-chip" onclick="toggleChip(this)">
+              <input type="checkbox" name="_portal_antara" value="antara">
+              <span class="chip-dot"></span><span class="chip-label">Antara News</span>
+            </label>
+            <label class="portal-chip" onclick="toggleChip(this)">
+              <input type="checkbox" name="_portal_radar" value="radar">
+              <span class="chip-dot"></span><span class="chip-label">Radar (Disway)</span>
+            </label>
+            <label class="portal-chip" onclick="toggleChip(this)">
+              <input type="checkbox" name="_portal_radarcirebon" value="radarcirebon">
+              <span class="chip-dot"></span><span class="chip-label">Radar Cirebon ID</span>
+            </label>
+            <label class="portal-chip" onclick="toggleChip(this)">
+              <input type="checkbox" name="_portal_cnn" value="cnn">
+              <span class="chip-dot"></span><span class="chip-label">CNN Indonesia</span>
+            </label>
+          </div>
+          <!-- Hidden field filled by JS -->
+          <input type="hidden" id="news_portals" name="news_portals" value="">
+          <div class="form-row" style="margin-top:1rem;">
+            <div class="form-group">
+              <label for="news_keyword">Keyword Pencarian</label>
+              <input id="news_keyword" type="text" name="news_keyword" value="kabupaten cirebon" placeholder="kabupaten cirebon">
+            </div>
+            <div class="form-group">
+              <label for="news_pages">Jumlah Halaman per Portal</label>
+              <input id="news_pages" type="number" name="news_pages" value="1" min="1" max="20">
+            </div>
+          </div>
+        </div>
+      </div>
+      <button class="btn-submit animate-in delay-5" type="submit" id="scraping-submit">
+        <span class="spinner" id="scraping-spinner"></span>
+        <span id="scraping-btn-text">⚡ Mulai Scraping &amp; Analisis</span>
+      </button>
+    </form>
+  </div>
+  <!-- ═══════════════════════ TAB 2: Dataset ════════════════════════════ -->
+  <div class="tab-panel {% if active_tab == 'dataset' %}active{% endif %}" id="panel-dataset">
+    <form id="dataset-form" action="/wordcloud-dataset" method="post" enctype="multipart/form-data">
+      <div class="card animate-in">
+        <div class="platform-header">
+          <div class="platform-title">
+            <div class="platform-icon pi-dataset">📂</div>
+            Upload Dataset
+          </div>
+        </div>
+        <!-- Dataset upload: the label is bound to the file input via `for` so the
+             control has an accessible name and the label activates it. -->
+        <div class="form-group" style="margin-bottom:1.25rem;">
+          <label for="dataset_file">File Dataset (CSV, JSON, atau TXT)</label>
+          <div class="upload-zone" id="upload-zone">
+            <input type="file" name="dataset_file" id="dataset_file"
+                   accept=".csv,.json,.txt,.tsv"
+                   onchange="showFilename(this)">
+            <div class="upload-icon">📁</div>
+            <div class="upload-text">Klik atau seret file ke sini</div>
+            <div class="upload-sub">Mendukung .csv, .json, .txt — maks 50 MB</div>
+            <div class="upload-filename" id="upload-filename">✓ <span></span></div>
+          </div>
+        </div>
+        <div class="form-group" style="margin-bottom:1.25rem;">
+          <label for="text_column">Nama Kolom Teks (untuk CSV/JSON)</label>
+          <input id="text_column" type="text" name="text_column" value="text" placeholder="text / content / komentar">
+          <p class="field-hint">Kolom yang berisi teks yang akan dianalisis. Kosongkan untuk pakai kolom pertama.</p>
+        </div>
+        <div class="divider">atau paste teks langsung</div>
+        <div class="form-group">
+          <label for="dataset_text">Teks Dataset (satu dokumen/kalimat per baris)</label>
+          <textarea id="dataset_text" name="dataset_text" style="min-height:140px;"
+                    placeholder="Masukkan teks di sini, satu kalimat per baris...&#10;Cirebon semakin maju dengan infrastruktur yang baik&#10;Jalan di daerah X masih rusak parah"></textarea>
+        </div>
+      </div>
+      <button class="btn-submit" type="submit" id="dataset-submit">
+        <span class="spinner" id="dataset-spinner"></span>
+        <span id="dataset-btn-text">☁️ Buat Word Cloud &amp; Analisis Sentimen</span>
+      </button>
+    </form>
+  </div>
+  <!-- ═══════════════════════ Hasil Analisis ════════════════════════════ -->
+  {% if result %}
+  {# Rendered only when the view passes `result`: summary counts, optional CSV download link, optional word cloud. #}
+  <section class="results-section animate-in">
+    <div class="results-header">
+      <h2>📊 Hasil Analisis Sentimen</h2>
+      <span class="stats-strip">{{ total_scraped }} teks dikumpulkan · {{ result.total }} dianalisis</span>
+    </div>
+    {% if csv_filename %}
+    <div style="margin-bottom: 1.5rem;">
+      <a href="{{ csv_filename }}" download class="btn-submit" style="display:inline-flex; width:auto; padding:0.7rem 1.25rem; background:linear-gradient(135deg, #059669, #10b981); text-decoration:none; font-size:0.9rem;">
+        📥 Download Data Scraping (CSV)
+      </a>
+    </div>
+    {% endif %}
+    {# Counters and bars start at zero; the inline script below animates them to their final values. #}
+    <div class="sentiment-grid">
+      <div class="s-card positif">
+        <div class="s-count" id="count-pos">0</div>
+        <div class="s-label">😊 Positif</div>
+        <div class="s-bar-wrap"><div class="s-bar" id="bar-pos" style="width:0%"></div></div>
+      </div>
+      <div class="s-card negatif">
+        <div class="s-count" id="count-neg">0</div>
+        <div class="s-label">😠 Negatif</div>
+        <div class="s-bar-wrap"><div class="s-bar" id="bar-neg" style="width:0%"></div></div>
+      </div>
+      <div class="s-card netral">
+        <div class="s-count" id="count-neu">0</div>
+        <div class="s-label">😐 Netral</div>
+        <div class="s-bar-wrap"><div class="s-bar" id="bar-neu" style="width:0%"></div></div>
+      </div>
+    </div>
+    {% if image %}
+    <div class="wordcloud-card">
+      <h3>☁️ Word Cloud</h3>
+      <img class="wordcloud-img" src="data:image/png;base64,{{ image }}" alt="Word Cloud">
+    </div>
+    {% endif %}
+  </section>
+  <script>
+    (function () {
+      // `| int` always renders a numeric literal (0 for a missing value), so a
+      // missing count can no longer turn this script into invalid JavaScript.
+      var pos   = {{ result.positif | int }};
+      var neg   = {{ result.negatif | int }};
+      var neu   = {{ result.netral | int }};
+      // Denominator is clamped to 1 so the percentage maths never divides by zero.
+      var total = Math.max(1, {{ result.total | int }});
+      // Counts `el` up from 0 to `target` in roughly 40 ticks of 25 ms.
+      function animCount(el, target) {
+        var start = 0;
+        var step  = Math.max(1, Math.ceil(target / 40));
+        var timer = setInterval(function () {
+          start = Math.min(start + step, target);
+          el.textContent = start;
+          if (start >= target) clearInterval(timer);
+        }, 25);
+      }
+      // Short delay before starting the counters and setting the bar widths.
+      setTimeout(function () {
+        animCount(document.getElementById('count-pos'), pos);
+        animCount(document.getElementById('count-neg'), neg);
+        animCount(document.getElementById('count-neu'), neu);
+        document.getElementById('bar-pos').style.width = (pos / total * 100).toFixed(1) + '%';
+        document.getElementById('bar-neg').style.width = (neg / total * 100).toFixed(1) + '%';
+        document.getElementById('bar-neu').style.width = (neu / total * 100).toFixed(1) + '%';
+      }, 300);
+    })();
+  </script>
+  {% endif %}
+</div><!-- /wrapper -->
+<script>
+  // ── Tab switching ─────────────────────────────────────────────────────────
+  function switchTab(name) {
+    document.querySelectorAll('.tab-btn').forEach(function (b) { b.classList.remove('active'); });
+    document.querySelectorAll('.tab-panel').forEach(function (p) { p.classList.remove('active'); });
+    document.getElementById('tab-' + name).classList.add('active');
+    document.getElementById('panel-' + name).classList.add('active');
+  }
+  // ── Platform toggle ───────────────────────────────────────────────────────
+  function togglePlatform(id) {
+    var fields  = document.getElementById(id + '-fields');
+    var toggle  = document.getElementById(id + '-toggle');
+    var label   = document.getElementById(id + '-toggle-label');
+    var flagMap = { ig: 'enable_instagram', tt: 'enable_tiktok', fb: 'enable_facebook', news: 'enable_news' };
+    if (toggle.checked) {
+      fields.classList.remove('collapsed');
+      if (label) label.textContent = 'Aktif';
+      document.getElementById(flagMap[id]).value = '1';
+    } else {
+      fields.classList.add('collapsed');
+      if (label) label.textContent = 'Nonaktif';
+      document.getElementById(flagMap[id]).value = '';
+    }
+  }
+  // ── Portal chip multi-select ──────────────────────────────────────────────
+  function toggleChip(label) {
+    var cb = label.querySelector('input[type="checkbox"]');
+    cb.checked = !cb.checked;
+    label.classList.toggle('checked', cb.checked);
+    updatePortalField();
+  }
+  function updatePortalField() {
+    var vals = [];
+    document.querySelectorAll('#portal-grid .portal-chip.checked input').forEach(function (cb) {
+      vals.push(cb.value);
+    });
+    document.getElementById('news_portals').value = vals.join(',');
+  }
+  // ── Cookie format hints ───────────────────────────────────────────────────
+  var cookieHints = {
+    raw:      'Format: <code>sessionid=ABC; tt_webid=123</code> �� ambil dari DevTools → Application → Cookies → tiktok.com',
+    json_arr: 'Format JSON Array: <code>[{"name":"sessionid","value":"ABC","domain":".tiktok.com"}]</code>',
+    json_obj: 'Format JSON Object: <code>{"sessionid": "ABC", "tt_webid": "123"}</code>',
+  };
+  var cookiePlaceholders = {
+    raw:      'sessionid=xxx; tt_webid=yyy; ...',
+    json_arr: '[{"name":"sessionid","value":"xxx","domain":".tiktok.com"},...]',
+    json_obj: '{"sessionid": "xxx", "tt_webid": "yyy"}',
+  };
+  function setCookieHint(fmt, btn) {
+    document.querySelectorAll('.cookie-tab-btn').forEach(function (b) { b.classList.remove('active'); });
+    btn.classList.add('active');
+    document.getElementById('cookie-hint').innerHTML    = cookieHints[fmt];
+    document.getElementById('tiktok_cookie').placeholder = cookiePlaceholders[fmt];
+  }
+  // ── File upload label ─────────────────────────────────────────────────────
+  function showFilename(input) {
+    var wrap = document.getElementById('upload-filename');
+    if (input.files && input.files[0]) {
+      wrap.style.display = 'block';
+      wrap.querySelector('span').textContent = input.files[0].name;
+    } else {
+      wrap.style.display = 'none';
+    }
+  }
+  // Drag-over styling
+  var zone = document.getElementById('upload-zone');
+  if (zone) {
+    zone.addEventListener('dragover',  function (e) { e.preventDefault(); zone.classList.add('drag-over'); });
+    zone.addEventListener('dragleave', function ()  { zone.classList.remove('drag-over'); });
+    zone.addEventListener('drop',      function ()  { zone.classList.remove('drag-over'); });
+  }
+  // ── Form submit spinners ──────────────────────────────────────────────────
+  function bindSubmit(formId, spinnerId, btnTextId, btnId, loadingText) {
+    var form = document.getElementById(formId);
+    if (!form) return;
+    form.addEventListener('submit', function () {
+      document.getElementById(btnId).disabled        = true;
+      document.getElementById(spinnerId).style.display = 'inline-block';
+      document.getElementById(btnTextId).innerHTML    = loadingText + '<span class="dots"><span></span><span></span><span></span></span>';
+    });
+  }
+  // Attach the loading state to both forms; bindSubmit returns early when a form id is not on the page.
+  bindSubmit('scraping-form', 'scraping-spinner', 'scraping-btn-text', 'scraping-submit', 'Memproses (mungkin beberapa menit)');
+  bindSubmit('dataset-form',  'dataset-spinner',  'dataset-btn-text',  'dataset-submit',  'Memproses dataset');
+  // Build news_portals on submit (capture phase)
+  var sf = document.getElementById('scraping-form');
+  // Third argument `true` registers updatePortalField as a capture-phase listener.
+  if (sf) sf.addEventListener('submit', updatePortalField, true);
+</script>
+</body>
+</html>

web_scrapping.py ADDED Viewed

	@@ -0,0 +1,1026 @@

+# -*- coding: utf-8 -*-
+"""Web Scrapping.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1OLoBK18jpB685Ivi8Zi3SzuVYiXJ9jRa
+"""
+!pip install selenium
+!pip install webdriver-manager
+# Detik.com
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+from datetime import datetime
+def scrape_detik_search(keyword, max_pages=1):
+    base_search_url = "https://www.detik.com/search/searchall"
+    results = []
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
+    }
+    for page in range(1, max_pages + 1):
+        params = {
+            'query': keyword,
+            'siteid': '2',
+            'sortby': 'time',
+            'page': page
+        }
+        print(f"Scraping page {page}...")
+        r = requests.get(base_search_url, params=params, headers=headers)
+        if r.status_code != 200:
+            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
+            break
+        soup = BeautifulSoup(r.text, 'html.parser')
+        news_list = soup.find_all('div', class_='media')
+        if not news_list:
+            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
+            break
+        for news in news_list:
+            try:
+                title_tag = news.find('h3', class_='media__title')
+                if not title_tag:
+                    continue
+                link_tag = title_tag.find('a', class_='media__link')
+                if not link_tag or not link_tag.has_attr('href'):
+                    continue
+                link = link_tag['href']
+                title = link_tag.text.strip()
+                date_tag = news.find('div', class_='media__date')
+                if date_tag:
+                    span_tag = date_tag.find('span')
+                    if span_tag and span_tag.has_attr('d-time'):
+                        timestamp = span_tag['d-time']
+                        news_date = datetime.fromtimestamp(int(timestamp))
+                    else:
+                        news_date = None
+                else:
+                    news_date = None
+                # if news_date and news_date < cutoff_date:
+                #     print("Berita sudah melewati batas waktu 3 tahun, hentikan scraping.")
+                #     return pd.DataFrame(results)
+                # Ambil halaman detail berita dengan header
+                news_resp = requests.get(link, headers=headers)
+                if news_resp.status_code != 200:
+                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
+                    continue
+                news_soup = BeautifulSoup(news_resp.text, 'html.parser')
+                content_div = news_soup.find('div', class_='detail__body-text') or \
+                              news_soup.find('div', class_='detail_text')
+                if content_div:
+                    content_parts = []
+                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+                        text = tag.get_text(strip=True)
+                        if text:
+                            prefix = tag.name.upper() if tag.name.startswith('h') else ''
+                            if prefix:
+                                content_parts.append(f"{prefix}: {text}")
+                            else:
+                                content_parts.append(text)
+                    content = '\n'.join(content_parts)
+                else:
+                    content = ''
+                # Ambil tag dari elemen nav > a.nav__item
+                nav_div = news_soup.find('div', class_='nav')
+                tags = []
+                if nav_div:
+                    tags = [a.text.strip() for a in nav_div.find_all('a', class_='nav__item')]
+                results.append({
+                    'judul': title,
+                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
+                    'tag': ', '.join(tags),
+                    'isi_berita': content,
+                    'link': link
+                })
+                print(f"Berhasil scrape berita: {title}")
+                time.sleep(1)
+            except Exception as e:
+                print(f"Error saat memproses berita: {e}")
+                continue
+        time.sleep(2)
+    return pd.DataFrame(results)
+if __name__ == "__main__":
+    keyword = "Kabupaten Cirebon"
+    df = scrape_detik_search(keyword)
+    if not df.empty:
+        df.to_csv("detik_berita_cirebonnn.csv", index=False, encoding='utf-8-sig')
+        print("Selesai menyimpan data berita ke detik_berita_cirebon.csv")
+    else:
+        print("Tidak ada data yang berhasil di-scrape.")
+# Radar Cirebon KW
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+from urllib.parse import quote_plus
+BASE_HOST = "https://radarcirebon.disway.id"
+BASE_SEARCH = BASE_HOST + "/search/kata/"
+def make_search_url(keyword, page, per_page=30):
+    q = quote_plus(keyword)
+    if page == 1:
+        return f"{BASE_SEARCH}?c={q}&num="
+    else:
+        offset = (page - 1) * per_page
+        return f"{BASE_SEARCH}{offset}/{offset}/?c={q}&num="
+def absolute_url(href):
+    if not href:
+        return None
+    href = href.strip()
+    if href.startswith("http://") or href.startswith("https://"):
+        return href
+    if href.startswith("/"):
+        return BASE_HOST + href
+    return BASE_HOST + "/" + href
+def scrape_radar_cirebon(keyword, max_pages=100, per_page=30, delay_between_items=1.0, delay_between_pages=2.0):
+    sess = requests.Session()
+    sess.headers.update({
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
+    })
+    results = []
+    seen_links = set()
+    for page in range(1, max_pages + 1):
+        url = make_search_url(keyword, page, per_page)
+        print(f"\nScraping page {page} -> {url}")
+        try:
+            r = sess.get(url, timeout=15)
+        except Exception as e:
+            print(f"  ERROR: Gagal request halaman search: {e}")
+            break
+        if r.status_code != 200:
+            print(f"  ERROR: status code {r.status_code}, hentikan scraping.")
+            break
+        soup = BeautifulSoup(r.text, "html.parser")
+        # Ambil daftar berita
+        news_list = soup.find_all(class_='media-heading')
+        if not news_list:
+            news_list = soup.find_all('div', class_='media')
+        if not news_list:
+            news_list = soup.find_all('article')
+        if not news_list:
+            news_list = soup.select('ul.search-results li') or soup.select('div.search-result') or []
+        if not news_list:
+            print("  Tidak ada berita ditemukan di halaman ini.")
+            continue
+        print(f"  Ketemu {len(news_list)} item.")
+        for item in news_list:
+            try:
+                a = item.find('a', href=True) or item.select_one('a[href]')
+                if not a:
+                    continue
+                link = absolute_url(a.get('href'))
+                if not link or link in seen_links:
+                    continue
+                seen_links.add(link)
+                title = a.get_text(strip=True)
+                # Ambil halaman detail
+                try:
+                    detail_r = sess.get(link, timeout=15)
+                except Exception as e:
+                    print(f"    ERROR request detail {link}: {e}")
+                    continue
+                if detail_r.status_code != 200:
+                    print(f"    ERROR status {detail_r.status_code} for {link}")
+                    continue
+                detail_soup = BeautifulSoup(detail_r.text, "html.parser")
+                # Judul detail
+                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
+                title_detail = h1.get_text(strip=True) if h1 else title
+                # Tanggal detail
+                date_text = None  # Inisialisasi variabel
+                # Opsi 1: Cari tag dengan class 'date' secara langsung
+                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
+                if date_detail_tag:
+                    print("Ditemukan dengan Target Langsung")
+                    # PERBAIKAN: Gunakan variabel 'date_detail_tag', bukan 'tag'
+                    date_text = date_detail_tag.get_text(strip=True)
+                # Opsi 2: Jika Opsi 1 gagal, cari di dalam kontainer 'post-info'
+                if not date_text:
+                    post_info_div = detail_soup.find('div', class_='post-info')
+                    if post_info_div:
+                        tag_tanggal = post_info_div.find('span', class_='date')
+                        if tag_tanggal:
+                            print("Ditemukan dengan Target Kontainer")
+                            date_text = tag_tanggal.get_text(strip=True)
+                # Opsi 3: Jika masih gagal, gunakan Regex sebagai usaha terakhir
+                if not date_text:
+                    # Pola Regex untuk format seperti "Rabu 22-08-2024" atau "Selasa, 21 Agustus 2024"
+                    date_pattern = re.compile(r'\w+,\s*\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{2}-\d{2}-\d{4}')
+                    found_text = detail_soup.find(string=date_pattern)
+                    if found_text:
+                        print("Ditemukan dengan Target Pola Teks (Regex)")
+                        date_text = found_text.strip()
+                # Isi berita
+                content_container = None
+                for cls in ('entry-content', 'post-content', 'article-body', 'detail__body-text', 'detail_text', 'content', 'article__content'):
+                    content_container = detail_soup.find('div', class_=cls)
+                    if content_container:
+                        break
+                if not content_container:
+                    content_container = detail_soup.find('article')
+                content_parts = []
+                search_scope = content_container if content_container else detail_soup
+                for p in search_scope.find_all('p'):
+                    text = p.get_text(strip=True)
+                    if text and 'Baca Juga:' not in text:
+                        content_parts.append(text)
+                content = "\n".join(content_parts)
+                tags = []
+                try:
+                    # 1. Cari SEMUA tag <a> yang tautannya (href) mengandung '/listtag/'
+                    #    Ini adalah pola unik untuk tag di situs tersebut.
+                    tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
+                    # 2. Loop melalui setiap tautan tag yang ditemukan
+                    for a_tag in tag_links:
+                        # 3. Ambil teks dari atribut 'title', karena itu berisi nama tag yang bersih
+                        tag_text = a_tag.get('title', '').strip()
+                        # 4. Pastikan teks tidak kosong sebelum menambahkannya ke list
+                        if tag_text:
+                            tags.append(tag_text)
+                    # Jika tidak ada tag yang ditemukan, list akan tetap kosong, yang mana sudah benar.
+                except Exception as e:
+                    # Menjaga agar program tidak berhenti jika ada error tak terduga
+                    print(f"    Terjadi error saat mencari tag: {e}")
+                # Gabungkan hasil tag menjadi satu string untuk disimpan
+                final_tags = ", ".join(tags) if tags else "-"
+                results.append({
+                    "judul": title_detail,
+                    "tanggal": date_text,
+                    "tag": final_tags,  # INI BAGIAN YANG DIPERBAIKI
+                    "isi_berita": content,
+                    "link": link
+                })
+                print(f"    Berhasil: {title_detail} | Tags: {', '.join(tags) if tags else '-'}")
+                time.sleep(delay_between_items)
+            except Exception as e:
+                print(f"    Error saat memproses item: {e}")
+                continue
+        time.sleep(delay_between_pages)
+    df = pd.DataFrame(results)
+    return df
+if __name__ == "__main__":
+    keyword = "kabupaten cirebon"
+    df = scrape_radar_cirebon(keyword, max_pages=100)
+    if not df.empty:
+        df.to_csv("/content/drive/MyDrive/Machine Learning/Sentiment Analysis/radarcirebondisway_berita.csv", index=False, encoding="utf-8-sig")
+        print("\nSelesai menyimpan data berita ke radarcirebon_berita.csv")
+    else:
+        print("\nTidak ada data yang berhasil di-scrape.")
+# Antara News
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+import re
+import random
+from urllib.parse import quote_plus, urlparse, urlunparse
+BASE_HOST = "https://www.antaranews.com"
+BASE_SEARCH = BASE_HOST + "/search"
+def make_search_url(keyword, page):
+    q = quote_plus(keyword)
+    if page == 1:
+        return f"{BASE_SEARCH}?q={q}"
+    else:
+        return f"{BASE_SEARCH}?q={q}&page={page}"
+def absolute_url(href):
+    if not href:
+        return None
+    href = href.strip()
+    if href.startswith("http://") or href.startswith("https://"):
+        return href
+    if href.startswith("/"):
+        return BASE_HOST + href
+    return BASE_HOST + "/" + href
+def normalize_url(href):
+    """Buat URL konsisten: absolut + buang query/fragment + hapus trailing slash."""
+    if not href:
+        return None
+    href = absolute_url(href)
+    parsed = urlparse(href)
+    clean = parsed._replace(query="", fragment="")
+    return urlunparse(clean).rstrip("/")
+def get_with_retry(sess, url, max_retries=3, delay_range=(2, 5)):
+    """Request dengan retry & delay acak."""
+    for attempt in range(max_retries):
+        try:
+            r = sess.get(url, timeout=15)
+            r.raise_for_status()
+            return r
+        except Exception as e:
+            print(f"    Percobaan {attempt+1} gagal: {e}")
+            if attempt < max_retries - 1:
+                time.sleep(random.uniform(*delay_range))
+    return None
+def scrape_antaranews(keyword, max_pages=5, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
+    """Scrape antaranews.com search results for ``keyword`` plus each article's detail page.
+    Args:
+        keyword: Search phrase.
+        max_pages: Number of result pages to request, starting at page 1.
+        delay_between_items: (min, max) seconds for the random sleep after each article.
+        delay_between_pages: (min, max) seconds for the random sleep after each result page.
+    Returns:
+        pandas.DataFrame with columns ``judul``, ``tanggal``, ``tag``, ``isi_berita`` and ``link``.
+    """
+    sess = requests.Session()
+    sess.headers.update({
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                      'AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/115.0 Safari/537.36'
+    })
+    results = []
+    # Normalised article links already seen on earlier pages; used to skip duplicates.
+    seen_links = set()
+    for page in range(1, max_pages + 1):
+        url = make_search_url(keyword, page)
+        print(f"\nScraping page {page} -> {url}")
+        r = get_with_retry(sess, url)
+        if not r:
+            print(f"  ERROR: Gagal request halaman search setelah retry.")
+            # A failed page is skipped; the remaining pages are still attempted.
+            continue
+        soup = BeautifulSoup(r.text, "html.parser")
+        # Collect every anchor pointing at a news article (usually /berita/...)
+        anchors = soup.select('a[href*="/berita/"]')
+        all_links_in_page = {normalize_url(a.get('href')) for a in anchors if a.get('href')}
+        all_links_in_page = {l for l in all_links_in_page if l}
+        new_links = all_links_in_page - seen_links
+        print(f"  Ketemu {len(all_links_in_page)} link artikel di halaman ini, {len(new_links)} link baru.")
+        seen_links.update(all_links_in_page)
+        # Sorted so that the processing order within a page is deterministic.
+        for link in sorted(new_links):
+            detail_r = get_with_retry(sess, link)
+            if not detail_r:
+                print(f"    ERROR: Gagal request detail {link}")
+                continue
+            detail_soup = BeautifulSoup(detail_r.text, "html.parser")
+            # Title
+            h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
+            title_detail = h1.get_text(strip=True) if h1 else ""
+            # Time / date: first the text next to the calendar icon, then a regex over the page text.
+            date_detail = ""
+            cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
+            if cal_icon:
+                parent_li = cal_icon.find_parent('li') or cal_icon.find_parent()
+                if parent_li:
+                    date_detail = parent_li.get_text(" ", strip=True)
+            if not date_detail:
+                text_all = detail_soup.get_text(" ", strip=True)
+                # Alternatives: a full "Day, DD Month YYYY HH:MM WIB" stamp, "<n> jam lalu", or a bare "WIB".
+                m = re.search(r'\b(?:[A-Za-z]+,\s*\d{1,2}\s+[A-Za-z]+ \d{4}\s*\d{1,2}:\d{2}\s*WIB|\d+\s+jam lalu|\bWIB\b)', text_all)
+                if m:
+                    date_detail = m.group(0)
+            # Article body: paragraphs from the first matching container, else from the whole page.
+            content_parts = []
+            article_body = detail_soup.find('div', class_='wrap__article-detail-content') \
+                           or detail_soup.find('div', class_='detail__body-text') \
+                           or detail_soup.find('article')
+            search_scope = article_body if article_body else detail_soup
+            for p in search_scope.find_all('p'):
+                text = p.get_text(strip=True)
+                # Skip "baca juga" (read also) cross-link paragraphs.
+                if text and not text.lower().startswith("baca juga"):
+                    content_parts.append(text)
+            content = "\n".join(content_parts)
+            # Collect tags: prefer the list that contains the tags icon, else any /tag/ link on the page.
+            tags = []
+            found = False
+            for ul in detail_soup.find_all('ul', class_='list-inline'):
+                if ul.find('i', class_='fa-tags') or ul.find('i', class_='fas fa-tags'):
+                    for a in ul.find_all('a', href=True):
+                        if '/tag/' in a['href']:
+                            tag_text = a.get('title') if a.get('title') else a.get_text(strip=True)
+                            if tag_text:
+                                tags.append(tag_text)
+                    if tags:
+                        found = True
+                        break
+            if not found:
+                for a in detail_soup.select('a[href*="/tag/"]'):
+                    tag_text = a.get('title') if a.get('title') else a.get_text(strip=True)
+                    if tag_text:
+                        tags.append(tag_text)
+            # De-duplicate while preserving first-seen order.
+            tags = list(dict.fromkeys(tags))
+            results.append({
+                "judul": title_detail,
+                "tanggal": date_detail,
+                "tag": ", ".join(tags) if tags else "-",
+                "isi_berita": content,
+                "link": link
+            })
+            print(f"    Berhasil: {title_detail} | Tanggal: {date_detail if date_detail else '-'} | Tags: {', '.join(tags) if tags else '-'}")
+            time.sleep(random.uniform(*delay_between_items))
+        time.sleep(random.uniform(*delay_between_pages))
+    df = pd.DataFrame(results)
+    return df
+if __name__ == "__main__":
+    keyword = "kabupaten cirebon"
+    df = scrape_antaranews(keyword, max_pages=100)
+    if not df.empty:
+        df.to_csv("antaranews_berita.csv", index=False, encoding="utf-8-sig")
+        print(f"\nSelesai menyimpan {len(df)} data berita ke antaranews_berita.csv")
+    else:
+        print("\nTidak ada data yang berhasil di-scrape.")
+# Jalanin di IDE lokal karena butuh chrome (CNN)
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+import random
+from urllib.parse import quote, urlparse, urlunparse
+import re
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.common.exceptions import TimeoutException
+BASE_HOST = "https://www.cnnindonesia.com"
+# <<< DIUBAH: Fungsi ini dimodifikasi untuk menangani nomor halaman >>>
+def make_search_url(keyword, page):
+    """
+    Membuat URL pencarian yang benar untuk setiap halaman.
+    """
+    q = quote(keyword)
+    base_url = f"{BASE_HOST}/search?query={q}&result_type=latest"
+    if page == 1:
+        return base_url
+    else:
+        return f"{base_url}&page={page}"
+# --- Fungsi-fungsi pembantu lainnya tidak ada perubahan ---
+def absolute_url(href):
+    if not href: return None
+    href = href.strip()
+    if href.startswith("http://") or href.startswith("https://"): return href
+    if href.startswith("/"): return BASE_HOST + href
+    return BASE_HOST + "/" + href
+def normalize_url(href):
+    if not href: return None
+    href = absolute_url(href)
+    parsed = urlparse(href)
+    clean = parsed._replace(query="", fragment="")
+    return urlunparse(clean).rstrip("/")
+def parse_cnn_date(raw_date):
+    if not raw_date: return "-"
+    if '|' in raw_date: raw_date = raw_date.split('|')[1]
+    raw = raw_date.replace(" WIB", "").strip()
+    try:
+        from datetime import datetime
+        import locale
+        try: locale.setlocale(locale.LC_TIME, 'id_ID.UTF-8')
+        except locale.Error: locale.setlocale(locale.LC_TIME, '')
+        dt = datetime.strptime(raw, "%A, %d %b %Y %H:%M")
+        return dt.strftime("%Y-%m-%d %H:%M")
+    except Exception: return raw_date.strip()
+def looks_like_article_href(href):
+    if not href: return False
+    parsed = urlparse(href.strip())
+    path = parsed.path
+    if any(skip in path for skip in ['/search', '/tag', '/kategori', '/author', '/channel', '/indeks', '/video', '/foto']): return False
+    if re.search(r'/\d{14}-\d{2,3}-\d{6,}', path): return True
+    return False
+HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"}
+def fetch_article_detail(url, retries=3, delay=3):
+    for attempt in range(1, retries + 1):
+        try:
+            resp = requests.get(url, headers=HEADERS, timeout=15)
+            if resp.status_code == 200: return resp.text
+            else: print(f"  WARNING: HTTP {resp.status_code} saat akses {url}")
+        except Exception as e: print(f"  WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
+        time.sleep(delay)
+    return None
+def scrape_cnn_with_selenium(keyword, max_pages=3, delay_between_items=(1,2)):
+    results = []
+    seen_links = set()
+    print("Menginisialisasi browser Chrome...")
+    service = Service(ChromeDriverManager().install())
+    options = webdriver.ChromeOptions()
+    options.add_argument("--headless")
+    options.add_argument("--disable-blink-features=AutomationControlled")
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    options.add_experimental_option('useAutomationExtension', False)
+    driver = webdriver.Chrome(service=service, options=options)
+    driver.set_page_load_timeout(30)
+    # <<< DIUBAH: Logika perulangan kembali menggunakan nomor halaman (bukan scroll) >>>
+    for page in range(1, max_pages + 1):
+        # Membuat URL untuk halaman yang dituju
+        url = make_search_url(keyword, page)
+        print(f"\nMembuka halaman {page} -> {url}")
+        driver.get(url)
+        # Penanganan cookie hanya perlu saat pertama kali halaman dimuat (page 1)
+        if page == 1:
+            try:
+                print("Mencari pop-up cookie...")
+                cookie_agree_button = WebDriverWait(driver, 10).until(
+                    EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
+                )
+                cookie_agree_button.click()
+                print("  Pop-up cookie ditemukan dan ditutup.")
+                time.sleep(2)
+            except TimeoutException:
+                print("  Pop-up cookie tidak ditemukan, melanjutkan proses.")
+        print(f"Mengambil data dari halaman {page}...")
+        try:
+            # Menunggu konten dimuat di setiap halaman baru
+            WebDriverWait(driver, 15).until(
+                EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
+            )
+        except TimeoutException:
+            print(f"  WARNING: Waktu habis menunggu konten di halaman {page}. Mungkin halaman ini kosong.")
+            continue # Lanjut ke halaman berikutnya jika ada
+        page_html = driver.page_source
+        soup = BeautifulSoup(page_html, "html.parser")
+        link_elements = soup.select('div.nhl-list article a[href]')
+        all_links_in_page = {normalize_url(a['href']) for a in link_elements if looks_like_article_href(a['href'])}
+        new_links = all_links_in_page - seen_links
+        if not new_links:
+            print("  Tidak ada link baru yang ditemukan di halaman ini.")
+            # Tidak perlu berhenti, karena halaman berikutnya mungkin punya link baru
+        print(f"  Ditemukan {len(new_links)} link baru.")
+        seen_links.update(new_links)
+        # Proses scrape detail artikel tidak ada perubahan
+        for link in sorted(new_links):
+            print(f"   -> Memproses: {link}")
+            html_detail = fetch_article_detail(link)
+            if not html_detail: continue
+            detail_soup = BeautifulSoup(html_detail, "html.parser")
+            title_el = detail_soup.select_one('h1')
+            title_text = title_el.get_text(strip=True) if title_el else "-"
+            date_el = detail_soup.select_one('div.text-cnn_grey.text-sm')
+            date_text = parse_cnn_date(date_el.get_text(strip=True)) if date_el else "-"
+            tags_list = []
+            topik_terkait_header = detail_soup.find('div', class_='title-box', text=re.compile(r'\s*TOPIK TERKAIT\s*'))
+            if topik_terkait_header:
+                tags_container = topik_terkait_header.find_next_sibling('div')
+                if tags_container:
+                    tags_elements = tags_container.select('a')
+                    tags_list = [tag.get_text(strip=True) for tag in tags_elements]
+            content_parts = []
+            content_container = detail_soup.select_one("div.detail-text")
+            if content_container:
+                for p in content_container.find_all('p'):
+                    text = p.get_text(" ", strip=True)
+                    if text and not text.lower().startswith("lihat juga") and not text.lower().startswith("scroll to continue"):
+                        content_parts.append(text)
+            results.append({
+                "judul": title_text, "tanggal": date_text,
+                "tag": ", ".join(tags_list) if tags_list else "-",
+                "isi_berita": "\n".join(content_parts) if content_parts else "-", "link": link
+            })
+            print(f"    Berhasil: {title_text} | Tanggal: {date_text}")
+            time.sleep(random.uniform(*delay_between_items))
+    print("\nMenutup browser...")
+    driver.quit()
+    return pd.DataFrame(results)
+if __name__ == "__main__":
+    keyword = "kabupaten cirebon"
+    df = scrape_cnn_with_selenium(keyword, max_pages=100)
+    if not df.empty:
+        df.to_csv("cnnindonesia_berita_final.csv", index=False, encoding="utf-8-sig")
+        print(f"\nSelesai menyimpan {len(df)} data berita ke cnnindonesia_berita_final.csv")
+    else:
+        print("\nTidak ada data yang berhasil di-scrape.")
+# Radar Cirebon ID
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+import random
+from urllib.parse import quote, urlparse, urlunparse
+import re
+# BASE_HOST points at the new target site (radarcirebon.id)
+BASE_HOST = "https://radarcirebon.id"
+def make_search_url(keyword, page):
+    """
+    Membuat URL pencarian sesuai format radarcirebon.id.
+    Contoh: https://radarcirebon.id/search/kabupaten+cirebon/page/2/
+    """
+    # Mengganti spasi dengan '+' sesuai format URL situs
+    q = quote(keyword).replace('%20', '+')
+    if page == 1:
+        return f"{BASE_HOST}/search/{q}/"
+    else:
+        return f"{BASE_HOST}/search/{q}/page/{page}/"
+def normalize_url(href):
+    """
+    Memastikan URL dalam format absolut dan bersih (tanpa parameter).
+    """
+    if not href:
+        return None
+    href = href.strip()
+    # Membuat URL absolut jika hanya berupa path
+    if href.startswith("//"):
+        href = "https:" + href
+    elif href.startswith("/"):
+        href = BASE_HOST + href
+    elif not href.startswith("http"):
+        return None # Mengabaikan link yang tidak valid
+    parsed = urlparse(href)
+    clean = parsed._replace(query="", fragment="")
+    return urlunparse(clean).rstrip("/")
+def parse_radarcirebon_date(raw_date):
+    """
+    Mengubah format tanggal dari 'Selasa, 12 Agu 2025 - 11:01'
+    menjadi format standar 'YYYY-MM-DD HH:MM'.
+    """
+    if not raw_date:
+        return "-"
+    try:
+        # Pemetaan manual untuk nama bulan 3 huruf dalam Bahasa Indonesia
+        month_map = {
+            'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'Mei': '05', 'Jun': '06',
+            'Jul': '07', 'Agu': '08', 'Sep': '09', 'Okt': '10', 'Nov': '11', 'Des': '12'
+        }
+        # Membersihkan hari dan memisahkan bagian-bagian tanggal
+        date_part = raw_date.split(', ')[1] # -> "12 Agu 2025 - 11:01"
+        parts = date_part.replace(' - ', ' ').split() # -> ['12', 'Agu', '2025', '11:01']
+        day = parts[0].zfill(2) # zfill(2) untuk memastikan format '01', '02', dst.
+        month_abbr = parts[1]
+        year = parts[2]
+        time_str = parts[3]
+        # Mengambil angka bulan dari pemetaan
+        month = month_map.get(month_abbr, '00')
+        return f"{year}-{month}-{day} {time_str}"
+    except Exception:
+        return raw_date.strip()
+def looks_like_article_href(href):
+    """
+    Memfilter URL agar hanya mengambil link artikel yang valid.
+    Contoh URL artikel: /2025/08/12/nama-artikel/
+    """
+    if not href:
+        return False
+    # Pola URL artikel di radarcirebon.id selalu mengandung /YYYY/MM/DD/
+    return bool(re.search(r'/\d{4}/\d{2}/\d{2}/', href))
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                  "AppleWebKit/537.36 (KHTML, like Gecko) "
+                  "Chrome/126.0.0.0 Safari/537.36"
+}
+def fetch_url(url, retries=3, delay=3):
+    """
+    Fungsi untuk mengambil konten dari sebuah URL dengan mekanisme coba lagi (retry).
+    """
+    for attempt in range(1, retries + 1):
+        try:
+            resp = requests.get(url, headers=HEADERS, timeout=15)
+            if resp.status_code == 200:
+                return resp.text
+            else:
+                print(f"   WARNING: HTTP {resp.status_code} saat akses {url}")
+        except Exception as e:
+            print(f"   WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
+        time.sleep(delay)
+    return None
+def scrape_radarcirebon(keyword, max_pages=3, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
+    """
+    Fungsi utama untuk melakukan scraping dari situs radarcirebon.id.
+    """
+    results = []
+    seen_links = set()
+    for page in range(1, max_pages + 1):
+        url = make_search_url(keyword, page)
+        print(f"\nScraping halaman {page} -> {url}")
+        html = fetch_url(url)
+        if not html:
+            print(f"   ERROR: Gagal mengambil halaman pencarian {page}")
+            continue
+        soup = BeautifulSoup(html, "html.parser")
+        # Selektor CSS baru untuk menemukan link artikel di halaman pencarian
+        link_elements = soup.select('article .wp-block-latest-posts__post-title a')
+        print(f"   DEBUG: Ditemukan {len(link_elements)} elemen link di halaman {page}")
+        all_links_in_page = set()
+        for a in link_elements:
+            href_raw = a.get('href')
+            if href_raw and looks_like_article_href(href_raw):
+                norm = normalize_url(href_raw)
+                if norm:
+                    all_links_in_page.add(norm)
+        new_links = all_links_in_page - seen_links
+        print(f"   Menemukan {len(all_links_in_page)} link artikel di halaman ini, {len(new_links)} link baru.")
+        seen_links.update(all_links_in_page)
+        for link in sorted(list(new_links)):
+            html_detail = fetch_url(link)
+            if not html_detail:
+                print(f"     ERROR: Gagal mengambil artikel {link}")
+                continue
+            detail_soup = BeautifulSoup(html_detail, "html.parser")
+            # Selektor baru untuk judul artikel
+            title_el = detail_soup.select_one('h1.entry-title')
+            title_detail = title_el.get_text(strip=True) if title_el else "-"
+            # Selektor baru untuk tanggal
+            date_el = detail_soup.select_one('time.entry-date')
+            date_detail = parse_radarcirebon_date(date_el.get_text(strip=True)) if date_el else "-"
+            # Selektor baru untuk isi berita
+            content_parts = []
+            content_container = detail_soup.select_one('div.entry-content')
+            if content_container:
+                for p in content_container.select('p'):
+                    # Mengabaikan paragraf yang berisi link "Baca Juga"
+                    if not p.find_parent(class_='read-also'):
+                        text = p.get_text(" ", strip=True)
+                        if text:
+                            content_parts.append(text)
+            content = "\n".join(content_parts)
+            # Selektor baru untuk tag
+            tags_container = detail_soup.select_one('div.wp-block-tag-cloud')
+            tags = [a.get_text(strip=True) for a in tags_container.select('a')] if tags_container else []
+            tags = list(dict.fromkeys(tags)) # Menghapus duplikat
+            results.append({
+                "judul": title_detail,
+                "tanggal": date_detail,
+                "tag": ", ".join(tags) if tags else "-",
+                "isi_berita": content if content else "-",
+                "link": link
+            })
+            print(f"     Berhasil: {title_detail} | Tanggal: {date_detail}")
+            time.sleep(random.uniform(*delay_between_items))
+        # Beri jeda antar halaman untuk tidak membebani server
+        time.sleep(random.uniform(*delay_between_pages))
+    return pd.DataFrame(results)
+if __name__ == "__main__":
+    keyword = "kabupaten cirebon"
+    # Batasi max_pages sesuai kebutuhan Anda, misalnya 3 halaman
+    df = scrape_radarcirebon(keyword, max_pages=3)
+    if not df.empty:
+        # Menyimpan ke file CSV baru
+        output_filename = "radarcirebon_berita.csv"
+        df.to_csv(output_filename, index=False, encoding="utf-8-sig")
+        print(f"\nSelesai menyimpan {len(df)} data berita ke {output_filename}")
+    else:
+        print("\nTidak ada data yang berhasil di-scrape.")
+# Download html
+import requests
+url = "https://radarcirebon.id/2025/08/12/warga-resah-dprd-cirebon-panggil-dpkpp-untuk-tuntaskan-masalah-psu-di-dua-perumahan/"
+headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
+}
+resp = requests.get(url, headers=headers)
+with open("detail.html", "w", encoding="utf-8") as f:
+    f.write(resp.text)
+print("HTML halaman disimpan ke page.html")
+# Detik.com scraper: stops at articles older than a cutoff age
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import time
+from datetime import datetime
+def scrape_detik_search(keyword, max_years=3, max_pages=100):
+    base_search_url = "https://www.detik.com/search/searchall"
+    results = []
+    cutoff_date = datetime.now().replace(year=datetime.now().year - max_years)
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
+    }
+    for page in range(1, max_pages + 1):
+        params = {
+            'query': keyword,
+            'siteid': '2',
+            'sortby': 'time',
+            'page': page
+        }
+        print(f"Scraping page {page}...")
+        r = requests.get(base_search_url, params=params, headers=headers)
+        if r.status_code != 200:
+            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
+            break
+        soup = BeautifulSoup(r.text, 'html.parser')
+        news_list = soup.find_all('div', class_='media')
+        if not news_list:
+            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
+            break
+        for news in news_list:
+            try:
+                title_tag = news.find('h3', class_='media__title')
+                if not title_tag:
+                    continue
+                link_tag = title_tag.find('a', class_='media__link')
+                if not link_tag or not link_tag.has_attr('href'):
+                    continue
+                link = link_tag['href']
+                title = link_tag.text.strip()
+                date_tag = news.find('div', class_='media__date')
+                if date_tag:
+                    span_tag = date_tag.find('span')
+                    if span_tag and span_tag.has_attr('d-time'):
+                        timestamp = span_tag['d-time']
+                        news_date = datetime.fromtimestamp(int(timestamp))
+                    else:
+                        news_date = None
+                else:
+                    news_date = None
+                if news_date and news_date < cutoff_date:
+                    print("Berita sudah melewati batas waktu 3 tahun, hentikan scraping.")
+                    return pd.DataFrame(results)
+                # Ambil halaman detail berita dengan header
+                news_resp = requests.get(link, headers=headers)
+                if news_resp.status_code != 200:
+                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
+                    continue
+                news_soup = BeautifulSoup(news_resp.text, 'html.parser')
+                content_div = news_soup.find('div', class_='detail__body-text') or \
+                              news_soup.find('div', class_='detail_text')
+                if content_div:
+                    content_parts = []
+                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
+                        text = tag.get_text(strip=True)
+                        if text:
+                            prefix = tag.name.upper() if tag.name.startswith('h') else ''
+                            if prefix:
+                                content_parts.append(f"{prefix}: {text}")
+                            else:
+                                content_parts.append(text)
+                    content = '\n'.join(content_parts)
+                else:
+                    content = ''
+                tag_list_div = news_soup.find('div', class_='tag__list') or \
+                               news_soup.find('div', class_='detail_tag')
+                tags = []
+                if tag_list_div:
+                    tags = [t.text.strip() for t in tag_list_div.find_all('a')]
+                results.append({
+                    'judul': title,
+                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
+                    'tag': ', '.join(tags),
+                    'isi_berita': content,
+                    'link': link
+                })
+                print(f"Berhasil scrape berita: {title}")
+                time.sleep(1)
+            except Exception as e:
+                print(f"Error saat memproses berita: {e}")
+                continue
+        time.sleep(2)
+    return pd.DataFrame(results)
+if __name__ == "__main__":
+    keyword = "Kabupaten Cirebon"
+    df = scrape_detik_search(keyword)
+    if not df.empty:
+        df.to_csv("detik_berita_cirebonnn.csv", index=False, encoding='utf-8-sig')
+        print("Selesai menyimpan data berita ke detik_berita_cirebon.csv")
+    else:
+        print("Tidak ada data yang berhasil di-scrape.")

word_cloud.py ADDED Viewed

	@@ -0,0 +1,535 @@

+# -*- coding: utf-8 -*-
+"""Word Cloud.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1rwyDXgYaTJQJvXu2FPeggecHOxIYQ3l3
+"""
+!pip install stop-words
+!pip install sastrawi
+!pip install transformers
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import html
+import re
+import json
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import NMF
+from wordcloud import WordCloud
+from tqdm import tqdm
+from IPython.display import display
+from bs4 import BeautifulSoup
+from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
+from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
+from stop_words import get_stop_words
+from collections import Counter
+from transformers import pipeline
+# ===============================================
+# --- Configuration ---
+# ===============================================
+FILE_PATH = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/medsos (6).csv'  # input CSV; the loader below reads its 'caption' column
+N_TOPICS = 15  # number of NMF components used for topic modeling
+N_TOP_WORDS = 10  # top words per topic (also used for the word clouds)
+SAMPLE_DATA_TO_SHOW = 5 # number of sample rows to show per sentiment
+# ===============================================
+#  1. Stopwords: stop_words package + Sastrawi + custom additions
+# ===============================================
+stopwords_indonesia = get_stop_words('indonesian')
+factory = StopWordRemoverFactory()
+sastrawi_stopwords = factory.get_stop_words()
+additional_stopwords = [
+    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk',
+    'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi','apk',
+    'sllu','apknya','sngt','joos','ni','kak',
+    # kata umum
+    'manfaatnya','ya','lbh','digunakan','semangat','dah','sangat','penting',
+    'lancar','cepat','senang','makasih','bermanfaat','keren','berguna','baik',
+    'indonesia','usaha','memudahkan','pokoknya','puas','mantap','dananya','luar',
+    'hati','ber','terimakasih','tepat','memudah','terbaik','mempermudah','praktis',
+    'simple','kadang','memuaskan','bagus','semoga','smoga','aplikasi','transaksi',
+    'kesimpulan','sip','pelayanannya','orang','manfaat','untuk','proses','membantu',
+    'pengiriman','muda','mantaap','kedepannya','pake','aktifitas','sejauh','untung',
+    'tenang','bikin','pakek','saldo','keluhan','dimanapun','cukup','menggunakan',
+    'sengat','banget','pakai','terpercaya','top','sukses',
+    # hasil wordcloud
+    'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk','guna',
+    'baru','jelas','level','selengkapnya','yuk','mohon','punya','cara','hari',
+    'kota','news','baca','fitur','kasih','suruh',
+    'besar','sapa','bawa','atas','hidup','jaga','moga','kali','balas','perintah',
+    'masyarakat','ide','hadir','ikut','ingat','tali','alhamdulillah','sambut',
+    'masa','tuju','terima','ibu','silaturahmi','pasang','bangun','dukung',
+    'muhammad','teladan','tahun','insan','bulan','iman','erat','syukur',
+    'kabupaten','cirebon','langsung','cinta','kuat','tebar','hubung','ikat',
+    'resmi','giat','selenggara','luka','kendara','putih','fyp','reses','mulai',
+    'rctvcirebon','radarcirebon','temu','satu','factor','harap','wararctv',
+    'maksimal','salah','tiktokberita','kawasan','sangka','juang','merah','puluh',
+    'ribu','omo','argo','role','jati','tingkat','kata','emis','majalengka',
+    'madam','sebut','tawur','duga',
+    # tambahan kata lain
+    'visi','saw','keras','sayang','bentuk','didik','jalin','keluarga','momen',
+    'program','baginda','hikmah','panjang','lingkung','wewararctv', 'magelang',
+    'kang', 'langkah', 'limpah', 'explore', 'tabindex', 'penuh', 'aa', 'rasa', 'tags',
+    'notranslate', 'desa', 'daerah', 'lengkap', 'aa', 'kunjung', 'laku', 'klik', 'berkah',
+    'aboutcirebon', 'jl', 'terus', 'hasil', 'instastory', 'taut', 'upaya', 'berita',
+    'beri', 'lanjut', 'pemkabcirebon', 'warga', 'pemkabcirebon', 'selamat', 'wujud', 'maju',
+    'wakil', 'ungkap', 'turut', 'pihak', 'wilayah', 'dinas', 'promo', 'pemkotcirebon', 'hadap',
+    'barat', 'layan', 'siap', 'milik', 'lokasi', 'ujar', 'rupa', 'gratis', 'daftar', 'jawa', 'tengah',
+    'kolaborasi', 'tempat', 'tegas', 'gelar', 'wib'
+    # Bulan
+    'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus', 'september',
+    'oktober', 'november', 'desember'
+]
+# ===== Extra stopwords for unintelligible tokens =====
+noise_stopwords = [
+    'by','zd','xyri','yu','uobl','ypdohk','xt','pz','lziwak','mp',
+    'rp','xdj','xexx','xggy','xjbqb','xstzfhl','link','class','hfl','xat',
+    'qhh','dhg','cr', 'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq', 'psk', 'href',
+    'hl', 'hd' , 'sy', 'amp', 'fbf', 'tags'
+]
+CUSTOM_STOPWORDS = [
+    # HTML tags & common attributes
+    "class", "id", "span", "div", "href", "src", "style", "alt",
+    "aria", "role", "tabindex", "button", "label", "img", "input",
+    "placeholder", "form", "field", "hidden", "value", 'aa',
+    # Random tokens / single letters
+    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
+    "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
+    "w", "x", "y", "z",
+    # Recurring noise words seen in the scraped text
+    "hfl", "xjbqb", "ejq", "ypdohk", "xexx", "hfr", "eyih",
+    "dwj", "hkzxv", "yuc", "igjr", "eqks", "oq", "kjzd", "oxk",
+    "zsgpy", "dycq", "g", "o", "wa", "wo", "ae", "ov", "vv", "uxc",
+    # Neutral technical words
+    "content", "data", "video", "playlist", "source", "watch",
+    "channel", "views", "subscribe", "update", "next", "prev",
+    "click", "menu", "link", "button", "card", "section",
+    # Digits that show up frequently
+    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
+]
+# Merge every stopword source; set() removes duplicates, so the list order is arbitrary
+final_stopwords = list(set(stopwords_indonesia + sastrawi_stopwords + additional_stopwords + noise_stopwords + CUSTOM_STOPWORDS))
+# ===============================================
+#  2. HTML cleaning + Sastrawi stemming
+# ===============================================
+stemmer = StemmerFactory().create_stemmer()
+html_noise = ['fbf','tabindex','tags','notranslate','aria-label','div','span','class']  # leftover markup tokens filtered out by preprocess_text
+noise_words = set(noise_stopwords + CUSTOM_STOPWORDS + html_noise)  # lookup set used by hapus()
+def clean_html(text):
+    if pd.isna(text):
+        return ""
+    s = BeautifulSoup(str(text), "html.parser")
+    for tag in s(["script", "style"]):
+        tag.decompose()
+    cleaned = s.get_text(separator=" ")
+    cleaned = html.unescape(cleaned)
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    return cleaned
+def remove_single_letters(text):
+    return re.sub(r"\b\w\b", "", text)
+def hapus (text):
+    tokens = [word for word in text.split() if word not in noise_words]
+    text = " ".join(tokens)
+    return text
+def preprocess_text(text):
+    # 1. Clean HTML
+    text = clean_html(text)
+    # 2. Lowercase
+    text = text.lower()
+    # 3. Stemming
+    text = stemmer.stem(text)
+    # 4. Hapus stopwords dan html noise
+    tokens = [word for word in text.split()
+              if word not in final_stopwords and word not in html_noise]
+    # 5. Ambil hanya kata (huruf saja)
+    tokens = [t for t in tokens if re.search(r"[a-zA-Z]", t)]
+    # 6. Gabung kembali
+    text = " ".join(tokens)
+    # 7. Hapus huruf tunggal
+    text = remove_single_letters(text)
+    return text.strip()
+# ===============================================
+#  3. Load & preprocess the social media dataset
+# ===============================================
+try:
+    df = pd.read_csv(FILE_PATH)
+    df.dropna(subset=['caption'], inplace=True)  # rows without a caption are discarded
+    df['caption'] = df['caption'].astype(str)
+    df['caption_clean'] = df['caption'].apply(preprocess_text)  # cleaned and stemmed text used by the analysis below
+    df['caption'] = df['caption'].apply(hapus)  # original caption with noise tokens removed
+    print("✅ Dataset berhasil dimuat & dipreproses.")
+    print(f"Jumlah data: {len(df)} baris")
+    if 'caption_pred' in df.columns:  # sentiment label column; only reported when present
+        print("\nDistribusi Sentimen (caption_pred):")
+        print(df['caption_pred'].value_counts())
+except FileNotFoundError:
+    print(f"❌ Error: File '{FILE_PATH}' tidak ditemukan.")
+    raise SystemExit  # every later step needs df, so the script stops here
+# ===============================================
+#  4. Fungsi utilitas
+# ===============================================
+def get_top_words_per_topic(model, feature_names, n_top_words):
+    topics = {}
+    for topic_idx, topic in enumerate(model.components_):
+        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
+        top_features = [feature_names[i] for i in top_features_ind]
+        topics[topic_idx] = top_features
+    return topics
+def format_topics_sentences(topics):
+    return {topic_idx: ", ".join(words) for topic_idx, words in topics.items()}
+def create_circular_wordcloud(words_list, title, n_words=10):
+    text_data = " ".join(words_list[:n_words])
+    if not text_data.strip():
+        print(f"Tidak ada kata untuk word cloud '{title}'.")
+        return
+    x, y = np.ogrid[:400, :400]
+    mask = (x - 200) ** 2 + (y - 200) ** 2 > 190 ** 2
+    mask = 255 * mask.astype(int)
+    wc = WordCloud(width=800, height=800, background_color='white',
+                   colormap='viridis', mask=mask,
+                   contour_width=3, contour_color='steelblue').generate(text_data)
+    plt.figure(figsize=(8, 8))
+    plt.imshow(wc, interpolation='bilinear')
+    plt.title(title, fontsize=18, pad=15)
+    plt.axis('off')
+    plt.show()
+def get_top_words_by_doc_frequency(df_subset, n_top_words=10):
+    word_doc_count = Counter()
+    for text in df_subset['caption_clean'].fillna(""):
+        tokens = [w for w in text.split() if not re.fullmatch(r"[a-z]", w)]
+        unique_tokens = set(tokens)
+        word_doc_count.update(unique_tokens)
+    return word_doc_count.most_common(n_top_words)
+summarizer = pipeline(
+    "summarization",
+    model="google/mt5-small",
+    tokenizer="google/mt5-small"
+)
+def generate_summary(text, max_length=60, min_length=20):
+    if not text or len(text.split()) < 10:
+        return text
+    try:
+        result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
+        return result[0]['summary_text']
+    except Exception as e:
+        print(f"⚠️ Error summarizing: {e}")
+        return text
+def summarize_text(corpus, n_topics=5, n_words=10):
+    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
+    X = vectorizer.fit_transform(corpus)
+    nmf = NMF(n_components=n_topics, random_state=42)
+    nmf.fit(X)
+    feature_names = vectorizer.get_feature_names_out()
+    key_sentences = []
+    for topic_idx, topic in enumerate(nmf.components_):
+        top_words = [feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
+        key_sentences.extend(top_words)
+    # ubah jadi paragraf ringkas
+    summary = " ".join(key_sentences)
+    return summary
+# ===============================================
+#  5. GLOBAL topic modeling and summary generation (paragraph form)
+# ===============================================
+print("\n--- 🧠 Memprediksi Topik dan Membuat Ringkasan untuk Semua Data ---")
+# Combine the cleaned caption and the 'comments_pred' column into a single text per row
+df['combined_text'] = df['caption_clean'].fillna('') + " " + df['comments_pred'].fillna('')
+# --- TF-IDF vectorizer ---
+global_vectorizer = TfidfVectorizer(
+    max_df=0.9,
+    min_df=10,
+    max_features=1000,
+    stop_words=final_stopwords,
+    ngram_range=(1, 2)
+)
+global_tfidf = global_vectorizer.fit_transform(df['combined_text'])
+global_feature_names = global_vectorizer.get_feature_names_out()
+# --- NMF + summary section ---
+if global_tfidf.shape[1] == 0:  # no features: fill placeholder values instead of modeling
+    df['predicted_topic_id'] = -1
+    df['predicted_topic'] = "Tidak ada fitur yang cukup untuk modeling"
+    df['summary'] = "Tidak dapat membuat ringkasan"
+    print("⚠️ Peringatan: Kosakata terlalu sedikit setelah preprocessing. Topic modeling tidak dapat dilakukan.")
+else:
+    global_nmf_model = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
+    global_nmf_model.fit(global_tfidf)
+    # Topic distribution per document; the strongest topic becomes the prediction
+    topic_distribution = global_nmf_model.transform(global_tfidf)
+    df['predicted_topic_id'] = np.argmax(topic_distribution, axis=1)
+    # Helper: names of the n_words highest-weighted features of one topic
+    def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10):
+        top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1]
+        return [feature_names[i] for i in top_indices]
+    # Map topic id -> comma-separated top keywords
+    topic_keywords = {}
+    for topic_idx in range(N_TOPICS):
+        top_words = get_top_words_for_topic(global_nmf_model, global_feature_names, topic_idx, N_TOP_WORDS)
+        topic_keywords[topic_idx] = ", ".join(top_words)
+    df['predicted_topic'] = df['predicted_topic_id'].map(topic_keywords).fillna("Topik tidak teridentifikasi")
+    # Summarize each combined text. NOTE(review): the log line below says IndoBERT, but generate_summary's pipeline is configured with google/mt5-small
+    df['summary'] = df['combined_text'].apply(lambda x: generate_summary(x))
+    print("✅ Prediksi topik selesai, ringkasan memakai IndoBERT Summarization (gabungan caption + comment).")
+# Show a few rows for manual verification
+print("\n--- ✨ Contoh Hasil Prediksi Topik dan Ringkasan ---")
+display(df[['caption', 'comments_pred', 'predicted_topic', 'summary']].head(10))
+# ===============================================
+# 6. Per-sentiment analysis + word cloud + supporting rows for each keyword
+# ===============================================
+analysis_result = {}  # collects the per-sentiment results that are dumped to JSON below
+if 'caption_pred' in df.columns:
+    sentiments = ['positif', 'negatif', 'netral']
+    # Make pandas print full cell text instead of truncating it
+    pd.set_option('display.max_colwidth', None)
+    for sentiment in sentiments:
+        print(f"\n\n=======================================================")
+        print(f"📊 Analisis Mendalam untuk Sentimen: '{sentiment.upper()}'")
+        print(f"=======================================================")
+        subset_df = df[df['caption_pred'] == sentiment].copy()
+        analysis_result[sentiment] = []  # one entry per keyword is appended below
+        if subset_df.empty:
+            print(f"Tidak ada data untuk sentimen '{sentiment}'.")
+            continue
+        # 1. Get the words that occur in the most rows of this sentiment
+        top_words_tuples = get_top_words_by_doc_frequency(subset_df, n_top_words=N_TOP_WORDS)
+        if not top_words_tuples:
+            print(f"Tidak ada kata signifikan pada sentimen '{sentiment}' untuk dianalisis.")
+            continue
+        # 2. Draw the word cloud
+        words_list_for_wc = [word for word, count in top_words_tuples]
+        create_circular_wordcloud(words_list_for_wc, f"WordCloud Sentimen {sentiment.upper()}", n_words=N_TOP_WORDS)
+        # 3. Print the rows that back each keyword
+        print(f"\n--- 📄 Bukti Ringkasan Berdasarkan Kata Kunci Populer ---")
+        for word, doc_count in top_words_tuples:
+            relevant_data = subset_df[
+                subset_df['caption_clean'].str.contains(r'\b{}\b'.format(re.escape(word)), case=False, na=False)
+            ]
+            summaries_list = []
+            if not relevant_data.empty:
+                print(f"\n✅ Kata Kunci: '{word}' (ditemukan dalam {len(relevant_data)} data pada sentimen ini)")
+                for i, row in enumerate(relevant_data.itertuples(index=False), 1):
+                    caption = getattr(row, "caption_clean", "")
+                    link = getattr(row, "link", None) or getattr(row, "url", None) or "-"  # first available of 'link' / 'url', else "-"
+                    comment = getattr(row, "comments_pred", "")
+                    print(f"   {i}. {caption}  🔗 {link}  💬 {comment}")
+                    summaries_list.append({
+                        "caption": caption,
+                        "link": link,
+                        "comment": comment
+                    })
+            else:
+                print(f"\n❌ Kata Kunci: '{word}' (tidak ditemukan data relevan untuk ditampilkan)")
+            # The keyword is stored in the JSON result even when no row matched
+            analysis_result[sentiment].append({
+                "keyword": word,
+                "count": int(len(relevant_data)),
+                "summary": summaries_list
+            })
+else:
+    print("\nKolom 'caption_pred' tidak ditemukan. Melewati analisis per sentimen.")
+# ===============================================
+# Save the results as JSON
+# ===============================================
+with open("sentiment_analysis_result.json", "w", encoding="utf-8") as f:
+    json.dump(analysis_result, f, ensure_ascii=False, indent=4)
+print("\n📂 Hasil analisis juga telah disimpan di 'sentiment_analysis_result.json'")
+# ===============================================
+# Prediksi Dataset Berita (judul, isi_berita, tag, link)
+# ===============================================
+FILE_BERITA = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita (6).csv'
+try:
+    df_berita = pd.read_csv(FILE_BERITA)
+    df_berita.dropna(subset=['isi_berita'], inplace=True)
+    df_berita['isi_berita'] = df_berita['isi_berita'].astype(str)
+    # Preprocessing isi_berita
+    df_berita['isi_berita_clean'] = df_berita['isi_berita'].apply(preprocess_text)
+    print("✅ Dataset berita berhasil dimuat & dipreproses.")
+    print(f"Jumlah data: {len(df_berita)} baris")
+except FileNotFoundError:
+    print(f"❌ Error: File '{FILE_BERITA}' tidak ditemukan.")
+    raise SystemExit
+# ===============================================
+# Topic Modeling untuk berita
+# ===============================================
+print("\n--- 🧠 Memprediksi Topik & Ringkasan untuk Dataset Berita ---")
+# 🔹 Gabungkan isi_berita_clean + judul + tag
+df_berita['combined_text'] = (
+    df_berita['isi_berita_clean'].fillna('') + " " +
+    df_berita['judul'].fillna('') + " " +
+    df_berita['tag'].fillna('')
+)
+# --- TF-IDF Vectorizer ---
+vectorizer_berita = TfidfVectorizer(
+    max_df=0.9,
+    min_df=5,
+    max_features=1000,
+    stop_words=final_stopwords,
+    ngram_range=(1, 2)
+)
+tfidf_berita = vectorizer_berita.fit_transform(df_berita['combined_text'])
+feature_names_berita = vectorizer_berita.get_feature_names_out()
+if tfidf_berita.shape[1] == 0:
+    df_berita['predicted_topic_id'] = -1
+    df_berita['predicted_topic'] = "Tidak cukup fitur untuk modeling"
+    df_berita['summary'] = "Tidak dapat membuat ringkasan"
+else:
+    nmf_berita = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
+    nmf_berita.fit(tfidf_berita)
+    topic_dist_berita = nmf_berita.transform(tfidf_berita)
+    df_berita['predicted_topic_id'] = np.argmax(topic_dist_berita, axis=1)
+    # Ambil kata topik
+    def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10):
+        top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1]
+        return [feature_names[i] for i in top_indices]
+    topic_keywords_berita = {}
+    for topic_idx in range(N_TOPICS):
+        top_words = get_top_words_for_topic(nmf_berita, feature_names_berita, topic_idx, N_TOP_WORDS)
+        topic_keywords_berita[topic_idx] = ", ".join(top_words)
+    df_berita['predicted_topic'] = df_berita['predicted_topic_id'].map(topic_keywords_berita).fillna("Topik tidak teridentifikasi")
+    # 🔹 Summarization IndoBERT (Google mT5)
+    df_berita['summary'] = df_berita['isi_berita'].apply(lambda x: generate_summary(x))
+print("✅ Prediksi topik & ringkasan berita selesai.")
+# ===============================================
+# Simpan hasil JSON
+# ===============================================
+output_data = []
+for row in df_berita.itertuples(index=False):
+    output_data.append({
+        "judul": getattr(row, "judul", ""),
+        "tag": getattr(row, "tag", ""),
+        "link": getattr(row, "link", ""),
+        "isi_berita": getattr(row, "isi_berita", ""),
+        "isi_berita_clean": getattr(row, "isi_berita_clean", ""),
+        "predicted_topic": getattr(row, "predicted_topic", ""),
+        "summary": getattr(row, "summary", "")
+    })
+with open("berita_analysis_result.json", "w", encoding="utf-8") as f:
+    json.dump(output_data, f, ensure_ascii=False, indent=4)
+print("\n📂 Hasil analisis berita disimpan di 'berita_analysis_result.json'")
+!pip install pyngrok flask
+from flask import Flask, jsonify
+from pyngrok import ngrok
+import json
+# Masukkan token ngrok kamu
+ngrok.set_auth_token("31odwJIHeYFk9aOrDfXDajKjK87_7esvX4phWySwTCG3BQ1R2")
+# Load JSON hasil analisis sentiment
+with open("sentiment_analysis_result.json", "r", encoding="utf-8") as f:
+    sentiment_result = json.load(f)
+# Load JSON hasil analisis berita
+with open("berita_analysis_result.json", "r", encoding="utf-8") as f:
+    berita_result = json.load(f)
+# Inisialisasi Flask
+app = Flask(__name__)
+# Endpoint untuk sentiment
+@app.route("/api/sentiment", methods=["GET"])
+def api_sentiment():
+    return jsonify(sentiment_result)
+# Endpoint untuk berita
+@app.route("/api/berita", methods=["GET"])
+def api_berita():
+    return jsonify(berita_result)
+# Jalankan Flask di port 5000
+port = 5000
+public_url = ngrok.connect(port)
+print("🔗 Public URL:", public_url)
+app.run(port=port)