Commit Β·
fa8ff66
0
Parent(s):
Initial clean deploy: Sentiment Analysis
Browse files- .dockerignore +35 -0
- .gitattributes +45 -0
- .gitignore +0 -0
- Dockerfile +60 -0
- Procfile +1 -0
- README.md +11 -0
- app.py +401 -0
- docker-compose.yml +20 -0
- fb.py +390 -0
- medos_scraping.py +461 -0
- preparing.py +236 -0
- requirements.txt +0 -0
- runtime.txt +1 -0
- sentimentanalysis.py +675 -0
- services/__init__.py +1 -0
- services/__pycache__/__init__.cpython-311.pyc +0 -0
- services/__pycache__/facebook.cpython-311.pyc +0 -0
- services/__pycache__/medos.cpython-311.pyc +0 -0
- services/__pycache__/news.cpython-311.pyc +0 -0
- services/__pycache__/preprocessing.cpython-311.pyc +0 -0
- services/__pycache__/sentiment.cpython-311.pyc +0 -0
- services/__pycache__/tiktok.cpython-311.pyc +0 -0
- services/__pycache__/wordcloud_service.cpython-311.pyc +0 -0
- services/_driver.py +66 -0
- services/facebook.py +304 -0
- services/medos.py +331 -0
- services/news.py +387 -0
- services/preprocessing.py +119 -0
- services/sentiment.py +159 -0
- services/tiktok.py +320 -0
- services/wordcloud_service.py +120 -0
- templates/index.html +1009 -0
- web_scrapping.py +1026 -0
- word_cloud.py +535 -0
.dockerignore
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python bytecache
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.pyo
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
env/
|
| 10 |
+
|
| 11 |
+
# IDE
|
| 12 |
+
.idea/
|
| 13 |
+
.vscode/
|
| 14 |
+
|
| 15 |
+
# Cookies (may contain sensitive data, don't bake into image)
|
| 16 |
+
*.json
|
| 17 |
+
!requirements.txt
|
| 18 |
+
|
| 19 |
+
# Output files
|
| 20 |
+
static/output/*.png
|
| 21 |
+
|
| 22 |
+
# Notebook files
|
| 23 |
+
*.ipynb
|
| 24 |
+
|
| 25 |
+
# Git
|
| 26 |
+
.git/
|
| 27 |
+
.gitignore
|
| 28 |
+
|
| 29 |
+
# Model directory — mount as volume instead
|
| 30 |
+
indoBERT-sentiment/
|
| 31 |
+
|
| 32 |
+
# Misc
|
| 33 |
+
*.csv
|
| 34 |
+
Procfile
|
| 35 |
+
runtime.txt
|
.gitattributes
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
static/output/*.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
Binary file (130 Bytes). View file
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─── Base Image ───────────────────────────────────────────────────────────────
FROM python:3.11-slim

# Environment Variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# ─── System Dependencies ──────────────────────────────────────────────────────
# NOTE: libappindicator3-1 is not packaged for the Debian releases that
# python:3.11-slim is built on, which makes apt-get fail; its maintained
# replacement libayatana-appindicator3-1 is installed instead.
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget curl gnupg ca-certificates unzip \
    # Chromium + driver (AUTO MATCH, STABLE)
    chromium chromium-driver \
    # Required libs
    libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 \
    libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 \
    libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 \
    libasound2 libxshmfence1 fonts-liberation libayatana-appindicator3-1 \
    xdg-utils libvulkan1 libx11-xcb1 \
    # Fonts
    fonts-noto fonts-noto-cjk \
    # Build tools
    gcc g++ build-essential \
    && rm -rf /var/lib/apt/lists/*

# ─── Set Chromium Path ────────────────────────────────────────────────────────
ENV CHROME_BIN=/usr/bin/chromium
ENV CHROMEDRIVER_PATH=/usr/bin/chromedriver

# ─── Hugging Face Spaces Rules (Non-Root User) ────────────────────────────────
# Hugging Face Spaces requires running Docker as a non-root user (UID 1000)
RUN useradd -m -u 1000 user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Pre-create output directory and ensure permissions
RUN mkdir -p $HOME/app/static/output && chown -R user:user $HOME

# Switch to the non-root user
USER user

# ─── App Setup ────────────────────────────────────────────────────────────────
# requirements.txt is copied on its own first so the dependency layer stays
# cached when only application code changes.
COPY --chown=user:user requirements.txt .

# Install dependencies into user directory
# PyTorch CPU version specified explicitly
RUN pip install --no-cache-dir --user torch --index-url https://download.pytorch.org/whl/cpu && \
    pip install --no-cache-dir --user -r requirements.txt

# Copy project files
COPY --chown=user:user . .

# ─── Expose Port ──────────────────────────────────────────────────────────────
# Hugging Face exposes port 7860
EXPOSE 7860

# ─── Run App ──────────────────────────────────────────────────────────────────
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
Procfile
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
web: uvicorn app:app --host 0.0.0.0 --port $PORT
|
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Sentiment
|
| 3 |
+
emoji: 🎨
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py — FastAPI application for Scraping + Sentiment Analysis + WordCloud.
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import base64
|
| 7 |
+
import io
|
| 8 |
+
import csv
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import traceback
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
import uvicorn
|
| 15 |
+
from fastapi import FastAPI, File, Form, Request, UploadFile
|
| 16 |
+
from fastapi.responses import HTMLResponse
|
| 17 |
+
from fastapi.staticfiles import StaticFiles
|
| 18 |
+
from fastapi.templating import Jinja2Templates
|
| 19 |
+
|
| 20 |
+
from services.medos import scrape_medos
|
| 21 |
+
from services.tiktok import scrape_tiktok
|
| 22 |
+
from services.news import scrape_news
|
| 23 |
+
from services.preprocessing import preprocess_text
|
| 24 |
+
from services.sentiment import analyze_sentiment
|
| 25 |
+
from services.wordcloud_service import generate_wordcloud
|
| 26 |
+
from services.facebook import scrape_facebook
|
| 27 |
+
|
| 28 |
+
# ── App setup ─────────────────────────────────────────────────────────────────
app = FastAPI(title="Sentiment Analysis Dashboard")

# StaticFiles validates its directory when it is constructed and raises if the
# directory is missing, so make sure it exists before mounting. The pipeline
# writes its CSV exports into static/output, so that is created as well.
os.makedirs(os.path.join("static", "output"), exist_ok=True)
app.mount("/static", StaticFiles(directory="static"), name="static")

# Jinja2 templates are loaded from the local "templates" directory.
templates = Jinja2Templates(directory="templates")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ── Helpers ───────────────────────────────────────────────────────────────────
|
| 37 |
+
|
| 38 |
+
def _split_targets(raw: str | None) -> list[str]:
|
| 39 |
+
"""Split a newline/comma-separated string into a clean list of non-empty strings."""
|
| 40 |
+
if not raw or not raw.strip():
|
| 41 |
+
return []
|
| 42 |
+
parts = []
|
| 43 |
+
for line in raw.replace(",", "\n").splitlines():
|
| 44 |
+
s = line.strip()
|
| 45 |
+
if s:
|
| 46 |
+
parts.append(s)
|
| 47 |
+
return parts
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _is_enabled(flag: str | None) -> bool:
|
| 51 |
+
"""Return True only if the enable flag is explicitly '1'."""
|
| 52 |
+
return (flag or "").strip() == "1"
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _flatten_for_csv(raw_texts: list) -> list[dict]:
|
| 56 |
+
flat = []
|
| 57 |
+
for item in raw_texts:
|
| 58 |
+
if isinstance(item, str):
|
| 59 |
+
flat.append({"text": item})
|
| 60 |
+
elif isinstance(item, dict):
|
| 61 |
+
base = {k: v for k, v in item.items() if k != "comments"}
|
| 62 |
+
comments = item.get("comments", [])
|
| 63 |
+
if not comments:
|
| 64 |
+
flat.append(base)
|
| 65 |
+
else:
|
| 66 |
+
for c in comments:
|
| 67 |
+
row = dict(base)
|
| 68 |
+
if isinstance(c, str):
|
| 69 |
+
row["comment_text"] = c
|
| 70 |
+
elif isinstance(c, dict):
|
| 71 |
+
row["comment_author"] = c.get("author", "")
|
| 72 |
+
row["comment_text"] = c.get("comment", "")
|
| 73 |
+
flat.append(row)
|
| 74 |
+
for r in c.get("replies", []):
|
| 75 |
+
rep_row = dict(base)
|
| 76 |
+
rep_row["comment_author"] = r.get("author", "")
|
| 77 |
+
rep_row["comment_text"] = r.get("comment", "")
|
| 78 |
+
flat.append(rep_row)
|
| 79 |
+
continue
|
| 80 |
+
flat.append(row)
|
| 81 |
+
return flat
|
| 82 |
+
|
| 83 |
+
def _extract_texts(raw_texts: list) -> list[str]:
|
| 84 |
+
extracted = []
|
| 85 |
+
for item in raw_texts:
|
| 86 |
+
if isinstance(item, str):
|
| 87 |
+
extracted.append(item)
|
| 88 |
+
elif isinstance(item, dict):
|
| 89 |
+
if "caption_short" in item: extracted.append(item["caption_short"])
|
| 90 |
+
if "caption_detail" in item: extracted.append(item["caption_detail"])
|
| 91 |
+
if "caption" in item: extracted.append(item["caption"])
|
| 92 |
+
if "judul" in item: extracted.append(item["judul"])
|
| 93 |
+
if "isi_berita" in item: extracted.append(item["isi_berita"])
|
| 94 |
+
if "tag" in item: extracted.append(item["tag"])
|
| 95 |
+
for c in item.get("comments", []):
|
| 96 |
+
if isinstance(c, str):
|
| 97 |
+
extracted.append(c)
|
| 98 |
+
elif isinstance(c, dict):
|
| 99 |
+
extracted.append(c.get("comment", ""))
|
| 100 |
+
for r in c.get("replies", []):
|
| 101 |
+
extracted.append(r.get("comment", ""))
|
| 102 |
+
return extracted
|
| 103 |
+
|
| 104 |
+
def _run_pipeline(raw_texts: list) -> dict:
    """Run the shared CSV export -> preprocessing -> sentiment -> wordcloud pipeline.

    Args:
        raw_texts: Scraped or uploaded items (strings and/or dicts).

    Returns:
        A dict with the keys ``error`` (message or ``None``), ``result``
        (sentiment output, or ``None`` if analysis failed), ``image``
        (base64-encoded wordcloud, or ``None``), ``total_scraped`` (number of
        extracted texts) and ``csv_filename`` (URL path of the exported CSV,
        or ``None`` when nothing was written).
    """
    if not raw_texts:
        return {
            "error": "Tidak ada teks yang berhasil dikumpulkan.",
            "result": None,
            "image": None,
            "total_scraped": 0,
            "csv_filename": None,
        }

    # datetime is not imported at module level, so it is imported here.
    from datetime import datetime

    # Save CSV. Microseconds are part of the name so two requests handled in
    # the same second do not overwrite each other's export.
    os.makedirs("static/output", exist_ok=True)
    csv_fname = f"scraped_data_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}.csv"
    csv_path = os.path.join("static", "output", csv_fname)

    flat_data = _flatten_for_csv(raw_texts)
    if flat_data:
        # dict.fromkeys keeps first-seen order, giving a stable column layout
        # (a set would order the columns arbitrarily from run to run).
        fieldnames = list(dict.fromkeys(key for row in flat_data for key in row))
        with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(flat_data)
        csv_url = f"/static/output/{csv_fname}"
    else:
        csv_url = None

    # Extract text for ML pipeline
    text_list = _extract_texts(raw_texts)

    total_scraped = len(text_list)
    print(f"[APP] Total item yg di-ekstrak teksnya: {total_scraped}")

    # Preprocess, then drop entries that ended up empty
    print("[APP] Preprocessingβ¦")
    clean_texts = preprocess_text(text_list)
    clean_texts = [t for t in clean_texts if t and t.strip()]

    if not clean_texts:
        return {
            "error": "Semua teks kosong setelah preprocessing. Coba input yang berbeda.",
            "result": None,
            "image": None,
            "total_scraped": total_scraped,
            "csv_filename": csv_url,
        }

    # Sentiment: a failure is logged and reported as result=None rather than
    # aborting the whole request.
    print(f"[APP] Analyzing sentiment on {len(clean_texts)} textsβ¦")
    try:
        sentiment = analyze_sentiment(clean_texts)
    except Exception as e:
        print(f"[APP] Sentiment error: {e}\n{traceback.format_exc()}")
        sentiment = None

    # WordCloud — generate into memory as base64 (no file saved)
    print("[APP] Generating wordcloudβ¦")
    image_b64 = None
    try:
        buf = io.BytesIO()
        wc_ok = generate_wordcloud(clean_texts, buf)
        if wc_ok:
            buf.seek(0)
            image_b64 = base64.b64encode(buf.read()).decode("utf-8")
    except Exception as e:
        print(f"[APP] WordCloud error: {e}")

    return {
        "error": None,
        "result": sentiment,
        "image": image_b64,
        "total_scraped": total_scraped,
        "csv_filename": csv_url,
    }
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ── Routes ────────────────────────────────────────────────────────────────────
|
| 185 |
+
|
| 186 |
+
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """Render ``index.html`` for the landing page with no extra template context."""
    return templates.TemplateResponse(request=request, name="index.html")
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
@app.post("/process", response_class=HTMLResponse)
async def process(
    request: Request,

    # ── Platform enable flags (set by JS, "1" = enabled) ──────────────────
    enable_instagram: str = Form(""),
    enable_tiktok: str = Form(""),
    enable_facebook: str = Form(""),
    enable_news: str = Form(""),

    # ── Instagram (separate credentials) ──────────────────────────────────
    ig_username: str = Form(None),
    ig_password: str = Form(None),
    target_accounts: str = Form(None),
    mode: str = Form("all"),

    # ── TikTok ────────────────────────────────────────────────────────────
    tiktok_cookie: str = Form(None),
    tiktok_targets: str = Form(None),

    # ── Facebook (separate credentials, explicit groups only) ─────────────
    fb_username: str = Form(None),
    fb_password: str = Form(None),
    facebook_groups: str = Form(None),

    # ── News ──────────────────────────────────────────────────────────────
    news_portals: str = Form(None),  # comma-separated portal keys
    news_keyword: str = Form("kabupaten cirebon"),
    news_pages: int = Form(1),
):
    """Scrape the enabled platforms, run the analysis pipeline, and render the result.

    Each platform section runs only when its ``enable_*`` flag is ``"1"``.
    Target lists are parsed with ``_split_targets`` (comma or newline
    separated). A section with missing credentials or targets is skipped with
    a log line, and any exception raised by a scraper is caught and printed
    so the remaining platforms still run. Everything collected is passed to
    ``_run_pipeline`` and its outcome is rendered into ``index.html`` with
    ``active_tab`` set to ``"scraping"``.

    NOTE(review): the scraper functions are called without ``await`` inside
    this ``async`` route — presumably they are blocking calls that stall the
    event loop while they run; confirm and consider a worker thread.
    """
    raw_texts: list = []

    # ── 1. Instagram ──────────────────────────────────────────────────────
    if _is_enabled(enable_instagram):
        ig_targets = _split_targets(target_accounts)
        if not ig_username or not ig_password:
            print("[APP] Instagram diaktifkan tapi username/password kosong β skip.")
        elif not ig_targets:
            print("[APP] Instagram diaktifkan tapi tidak ada target β skip.")
        else:
            # One scrape call per target so a failure only loses that target.
            for tgt in ig_targets:
                print(f"[APP] Scraping Instagram: {tgt}")
                try:
                    texts = scrape_medos(ig_username, ig_password, tgt, mode)
                    raw_texts.extend(texts)
                    print(f"[APP] Instagram @{tgt} β {len(texts)} teks")
                except Exception as e:
                    print(f"[APP] Instagram error ({tgt}): {e}")
    else:
        print("[APP] Instagram dinonaktifkan β skip.")

    # ── 2. TikTok ─────────────────────────────────────────────────────────
    if _is_enabled(enable_tiktok):
        tt_targets = _split_targets(tiktok_targets)
        if not tt_targets:
            print("[APP] TikTok diaktifkan tapi tidak ada target β skip.")
        else:
            for tgt in tt_targets:
                print(f"[APP] Scraping TikTok: {tgt}")
                try:
                    # A missing cookie is passed on as an empty string.
                    texts = scrape_tiktok(tiktok_cookie or "", tgt)
                    raw_texts.extend(texts)
                    print(f"[APP] TikTok @{tgt} β {len(texts)} teks")
                except Exception as e:
                    print(f"[APP] TikTok error ({tgt}): {e}")
    else:
        print("[APP] TikTok dinonaktifkan β skip.")

    # ── 3. Facebook ───────────────────────────────────────────────────────
    # Does NOT fall back to default groups — explicit URLs and credentials are required
    if _is_enabled(enable_facebook):
        fb_groups = _split_targets(facebook_groups)
        if not fb_username or not fb_password:
            print("[APP] Facebook diaktifkan tapi username/password kosong β skip.")
        elif not fb_groups:
            print("[APP] Facebook diaktifkan tapi tidak ada URL grup β skip (tidak ada default).")
        else:
            print(f"[APP] Scraping Facebook {len(fb_groups)} grupβ¦")
            try:
                # All groups are handed to the scraper in a single call.
                texts = scrape_facebook(fb_username, fb_password, fb_groups)
                raw_texts.extend(texts)
                print(f"[APP] Facebook β {len(texts)} teks")
            except Exception as e:
                print(f"[APP] Facebook error: {e}")
    else:
        print("[APP] Facebook dinonaktifkan β skip.")

    # ── 4. News ───────────────────────────────────────────────────────────
    if _is_enabled(enable_news):
        portals = _split_targets(news_portals)
        if not portals:
            print("[APP] News diaktifkan tapi tidak ada portal dipilih β skip.")
        else:
            for portal in portals:
                print(f"[APP] Scraping news: portal={portal}, keyword={news_keyword}, pages={news_pages}")
                try:
                    texts = scrape_news(portal, news_pages, keyword=news_keyword)
                    raw_texts.extend(texts)
                    print(f"[APP] News ({portal}) β {len(texts)} teks")
                except Exception as e:
                    print(f"[APP] News error ({portal}): {e}")
    else:
        print("[APP] News dinonaktifkan β skip.")

    # ── Pipeline ──────────────────────────────────────────────────────────
    outcome = _run_pipeline(raw_texts)

    return templates.TemplateResponse(
        request=request,
        name="index.html",
        context={
            "error": outcome["error"],
            "result": outcome["result"],
            "image": outcome["image"],
            "total_scraped": outcome["total_scraped"],
            "csv_filename": outcome["csv_filename"],
            "active_tab": "scraping",
        },
    )
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _decode_upload(content_bytes: bytes) -> str:
    """Decode uploaded bytes as UTF-8, falling back to Latin-1.

    ``utf-8-sig`` is used so a leading byte-order mark (which this app's own
    CSV export writes) does not end up glued to the first column name. Strict
    decoding is required for the Latin-1 fallback to ever be reached.
    """
    try:
        return content_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        return content_bytes.decode("latin-1", errors="replace")


def _non_empty_lines(text: str) -> list[str]:
    """Return the stripped, non-empty lines of *text*."""
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]


@app.post("/wordcloud-dataset", response_class=HTMLResponse)
async def wordcloud_dataset(
    request: Request,
    dataset_text: str = Form(None),
    dataset_file: UploadFile = File(None),
    text_column: str = Form("text"),
):
    """
    Word cloud + sentiment from an uploaded dataset (CSV/TXT/JSON) or pasted text.

    An uploaded file takes priority over pasted text. For ``.csv``/``.tsv``
    files and JSON arrays of objects, the value of ``text_column`` is used
    when it is present and non-empty; otherwise the whole row/object is
    passed on to the pipeline. Any other file type, and pasted text, is
    treated as one document per non-empty line. The outcome is rendered into
    ``index.html`` with ``active_tab`` set to ``"dataset"``.
    """
    raw_texts: list = []

    # Priority: file upload
    if dataset_file and dataset_file.filename:
        fname = dataset_file.filename.lower()
        content_str = _decode_upload(await dataset_file.read())

        if fname.endswith((".csv", ".tsv")):
            delimiter = "\t" if fname.endswith(".tsv") else ","
            reader = csv.DictReader(io.StringIO(content_str), delimiter=delimiter)
            cols = reader.fieldnames or []
            for row in reader:
                if text_column and text_column in cols and row.get(text_column):
                    raw_texts.append(str(row[text_column]))
                else:
                    raw_texts.append(row)

        elif fname.endswith(".json"):
            # Best effort: a malformed file is logged and yields no texts.
            try:
                data = json.loads(content_str)
                if isinstance(data, list):
                    for item in data:
                        if isinstance(item, str) and item:
                            raw_texts.append(item)
                        elif isinstance(item, dict):
                            if text_column and text_column in item and item.get(text_column):
                                raw_texts.append(str(item[text_column]))
                            else:
                                raw_texts.append(item)
            except Exception as e:
                print(f"[Dataset] JSON parse error: {e}")
        else:
            # Plain text — each non-empty line is one document
            raw_texts.extend(_non_empty_lines(content_str))

    elif dataset_text and dataset_text.strip():
        raw_texts.extend(_non_empty_lines(dataset_text))

    if not raw_texts:
        return templates.TemplateResponse(
            request=request,
            name="index.html",
            context={
                "error": "Tidak ada teks ditemukan dalam dataset. Pastikan file / teks tidak kosong.",
                "result": None,
                "image": None,
                "total_scraped": 0,
                "csv_filename": None,
                "active_tab": "dataset",
            },
        )

    outcome = _run_pipeline(raw_texts)

    return templates.TemplateResponse(
        request=request,
        name="index.html",
        context={
            "error": outcome["error"],
            "result": outcome["result"],
            "image": outcome["image"],
            "total_scraped": outcome["total_scraped"],
            "csv_filename": outcome["csv_filename"],
            "active_tab": "dataset",
        },
    )
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
if __name__ == "__main__":
    # Honour the PORT environment variable (the Procfile launches uvicorn
    # with $PORT) and keep 8000 as the default for plain `python app.py`.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")))
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.9"
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
app:
|
| 5 |
+
build: .
|
| 6 |
+
container_name: sentiment_app
|
| 7 |
+
ports:
|
| 8 |
+
# Map host 8000 to container 7860 (Hugging Face default)
|
| 9 |
+
- "8000:7860"
|
| 10 |
+
# Chrome needs a larger /dev/shm to avoid crashes in headless mode
|
| 11 |
+
shm_size: "2gb"
|
| 12 |
+
environment:
|
| 13 |
+
- PYTHONUNBUFFERED=1
|
| 14 |
+
volumes:
|
| 15 |
+
# Persist wordcloud output between runs
|
| 16 |
+
- ./static/output:/home/user/app/static/output
|
| 17 |
+
# Mount a local model folder if you have one (optional)
|
| 18 |
+
# Rename or create the folder 'indoBERT-sentiment' in the project root
|
| 19 |
+
- ./indoBERT-sentiment:/home/user/app/indoBERT-sentiment
|
| 20 |
+
restart: unless-stopped
|
fb.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import json
|
| 4 |
+
import csv
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
import undetected_chromedriver as uc
|
| 7 |
+
from selenium.webdriver.common.by import By
|
| 8 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 9 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 10 |
+
|
| 11 |
+
# ========== CONFIGURATION ==========
# Credentials are read from the environment so no secret lives in source
# control. SECURITY: earlier revisions of this file committed a real
# e-mail/password pair — that password must be treated as leaked and rotated.
FB_USERNAME = os.environ.get("FB_USERNAME", "")
FB_PASSWORD = os.environ.get("FB_PASSWORD", "")
COOKIES_FILE = "fb_cookies.json"

# list of groups to scrape
GROUP_INPUTS = [
    "https://web.facebook.com/groups/183039928416039?locale=id_ID",
    "https://web.facebook.com/groups/teraswarga?locale=id_ID",
    "https://web.facebook.com/groups/967901979894945?locale=id_ID"
]

# scraping output locations; the timestamp is computed once so the CSV and
# JSON file names always share the same stamp
_RUN_STAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
OUTPUT_CSV = f"facebook_groups_{_RUN_STAMP}.csv"
OUTPUT_JSON = f"facebook_groups_{_RUN_STAMP}.json"
|
| 26 |
+
|
| 27 |
+
# ========== SELENIUM SETUP ==========
# NOTE: everything below runs at module level, so a Chrome instance is
# launched as soon as this file is executed or imported.
options = uc.ChromeOptions()
options.add_argument("--disable-notifications")
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")

# Module-wide browser session used by the functions below.
driver = uc.Chrome(options=options, use_subprocess=True)
# Shared explicit-wait helper bound to the driver with a timeout of 15.
wait = WebDriverWait(driver, 15)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ========== FUNGSI LOGIN ==========
|
| 38 |
+
def save_cookies(driver, path):
    """Write the cookies reported by ``driver.get_cookies()`` to *path* as JSON."""
    with open(path, "w") as handle:
        handle.write(json.dumps(driver.get_cookies()))
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_cookies(driver, path):
    """Read a JSON list of cookies from *path* and add each one to *driver*."""
    with open(path, "r") as handle:
        stored = json.loads(handle.read())
    for entry in stored:
        driver.add_cookie(entry)
|
| 48 |
+
|
| 49 |
+
def _submit_login_form(email_locator, pass_locator, use_form_submit=False):
    """Fill one variant of the Facebook login form and submit it.

    Args:
        email_locator: ``(By, value)`` locator of the e-mail input; it is
            awaited with the shared ``wait`` helper.
        pass_locator: ``(By, value)`` locator of the password input.
        use_form_submit: When true, submit through the password field's form;
            otherwise click the element named ``login``.

    Raises:
        Exception: Whatever Selenium raises when an element of this variant
            is missing or cannot be interacted with.
    """
    email_input = wait.until(EC.presence_of_element_located(email_locator))
    pass_input = driver.find_element(*pass_locator)
    email_input.clear()
    email_input.send_keys(FB_USERNAME)
    pass_input.clear()
    pass_input.send_keys(FB_PASSWORD)
    if use_form_submit:
        pass_input.submit()
    else:
        driver.find_element(By.NAME, "login").click()


def fb_login(force=False):
    """Log the shared browser session in to Facebook.

    Cookies saved in ``COOKIES_FILE`` are tried first. When they are absent,
    fail to load, or still leave the browser on a login URL, the
    username/password form is filled in instead and fresh cookies are saved.

    Args:
        force: When true, skip the cookie attempt and always log in with
            username/password, even if a cookie file exists.

    Raises:
        Exception: If none of the known login-form variants could be
            submitted, or if the browser is still on a login URL afterwards.
            An error from the final search-bar wait also propagates.
    """
    search_bar = (By.XPATH, '//input[@placeholder="Cari di Facebook"]')

    driver.get("https://www.facebook.com/")
    time.sleep(3)

    if not force and os.path.exists(COOKIES_FILE):
        try:
            load_cookies(driver, COOKIES_FILE)
            driver.refresh()
            time.sleep(5)
            if "login" not in driver.current_url:
                print("✅ Login pakai cookies berhasil")
                # Make sure the search bar is present before returning; its
                # absence is only reported, not treated as a failure.
                try:
                    wait.until(EC.presence_of_element_located(search_bar))
                    print("π Search bar tersedia, siap mencari grup")
                except Exception:
                    print("β οΈ Search bar belum muncul, tetap lanjutkan")
                return
        except Exception as e:
            print("β οΈ Cookies gagal dipakai:", e)

    print("π Login manual pakai username/password...")

    # Known login-form variants, tried in order until one can be submitted:
    # classic ids, name/type attributes, then data-testid attributes.
    form_variants = (
        ((By.ID, "email"), (By.ID, "pass"), False),
        (
            (By.XPATH, '//input[@name="email" and @type="text"]'),
            (By.XPATH, '//input[@name="pass" and @type="password"]'),
            True,
        ),
        (
            (By.XPATH, '//input[@data-testid="royal-email"]'),
            (By.XPATH, '//input[@data-testid="royal-pass"]'),
            False,
        ),
    )
    last_error = None
    for email_locator, pass_locator, use_form_submit in form_variants:
        try:
            _submit_login_form(email_locator, pass_locator, use_form_submit)
            break
        except Exception as e:
            last_error = e
    else:
        # Every variant failed; report the error of the last one tried.
        raise Exception(
            f"β Tidak menemukan form login yang cocok: {last_error}"
        ) from last_error

    time.sleep(5)
    if "login" in driver.current_url:
        raise Exception("β Login gagal! Cek username/password")

    save_cookies(driver, COOKIES_FILE)
    print("✅ Login sukses & cookies disimpan")

    # After a successful login, make sure the search bar exists; if it does
    # not, reload the home page once and wait for it again.
    try:
        wait.until(EC.presence_of_element_located(search_bar))
        print("π Search bar tersedia, siap mencari grup")
    except Exception:
        print("β οΈ Search bar belum muncul, coba manual redirect ke beranda")
        driver.get("https://www.facebook.com/")
        wait.until(EC.presence_of_element_located(search_bar))
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def _element_displayed(xpath):
    """Return True when an element matching *xpath* exists and is displayed.

    Any lookup error (element missing, stale, ...) is treated as "not
    displayed" so that probing the page never raises.
    """
    try:
        return driver.find_element(By.XPATH, xpath).is_displayed()
    except Exception:
        return False


def ensure_logged_in():
    """Log in again when the current page shows signs of a lost session.

    Three signs are checked in order: the URL contains ``login``, a
    "See more on Facebook" popup is displayed, or an e-mail/text input is
    displayed. The first one that matches triggers ``fb_login(force=True)``.

    The check is best-effort: any error, including a failed re-login, is
    printed and not re-raised.
    """
    try:
        # Case 1: the browser was redirected to a login page.
        current_url = driver.current_url
        if current_url and "login" in current_url:
            print("β οΈ Redirect ke halaman login, mencoba login ulang...")
            fb_login(force=True)
            return

        # Case 2: the 'See more on Facebook' popup is showing.
        if _element_displayed('//div[contains(text(),"See more on Facebook")]'):
            print("β οΈ Popup login terdeteksi, login ulang...")
            fb_login(force=True)
            return

        # Case 3: an e-mail/password input is showing in a modal.
        if _element_displayed('//input[@type="email" or @type="text"]'):
            print("β οΈ Form login modal terdeteksi, login ulang...")
            fb_login(force=True)
            return

    except Exception as e:
        print("β οΈ Gagal cek login:", e)
|
| 157 |
+
|
| 158 |
+
# ========== SEARCH & BUKA GRUP ==========
|
| 159 |
+
def _xpath_literal(text):
    """Return *text* as an XPath 1.0 string literal.

    XPath 1.0 has no escape character, so text containing both quote kinds
    is assembled with ``concat()``.
    """
    if '"' not in text:
        return f'"{text}"'
    if "'" not in text:
        return f"'{text}'"
    pieces = ", '\"', ".join(f'"{part}"' for part in text.split('"'))
    return f"concat({pieces})"


def open_group(group_input):
    """Open a Facebook group given either its URL or its name.

    Args:
        group_input: A group URL (anything starting with ``http``) or a group
            name to look up through the Facebook search bar.

    Returns:
        The URL that was opened, or ``None`` when the search found no link
        whose text contains the name or the search itself failed.
    """
    # Case: the input is a direct link.
    if group_input.startswith("http"):
        print(f"π Buka langsung link grup: {group_input}")
        driver.get(group_input)
        time.sleep(5)

        ensure_logged_in()
        return group_input

    # Case: the input is a group name.
    try:
        search_box = wait.until(
            EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Cari di Facebook"]'))
        )
        print(f"π Mencari grup '{group_input}' via search...")
        search_box.clear()
        search_box.send_keys(group_input)
        search_box.submit()
        time.sleep(5)

        # Take the first link whose text contains the name. The name is
        # escaped so that quotes in it cannot break the XPath expression.
        results = driver.find_elements(
            By.XPATH, f"//a[contains(text(),{_xpath_literal(group_input)})]"
        )
        link = results[0].get_attribute("href") if results else None

        if not link:
            print(f"β Grup '{group_input}' tidak ditemukan via search")
            return None

        print(f"✅ Grup ditemukan: {link}")
        driver.get(link)
        time.sleep(5)
        return link

    except Exception as e:
        print(f"β οΈ Search gagal untuk '{group_input}':", e)
        return None
|
| 201 |
+
|
| 202 |
+
def scroll_to_bottom(driver, max_scrolls=10, pause_time=2):
    """Scroll the page down until its height stops growing.

    Args:
        driver: Object whose ``execute_script`` runs JavaScript in the page.
        max_scrolls: Upper bound on the number of scroll steps.
        pause_time: Seconds to sleep after each scroll step.
    """
    height_query = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_query)
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        current_height = driver.execute_script(height_query)
        if current_height == previous_height:
            # The page did not grow after this scroll step; stop.
            return
        previous_height = current_height
|
| 211 |
+
|
| 212 |
+
# ========== SCRAPING POSTINGAN GRUP ==========
|
| 213 |
+
def _find_permalink(post, group_url):
    """Return the permalink of a feed *post*, or None when none can be found.

    Links containing ``/posts/`` and then ``/permalink/`` are tried first; as
    a last resort the URL is built from ``top_level_post_id`` in the
    element's ``data-ft`` attribute.
    """
    for fragment in ("/posts/", "/permalink/"):
        try:
            link_el = post.find_element(By.XPATH, f".//a[contains(@href,'{fragment}')]")
            return link_el.get_attribute("href").split("?")[0]
        except Exception:
            continue

    try:
        data_ft = post.get_attribute("data-ft")
        if data_ft and "top_level_post_id" in data_ft:
            pid = json.loads(data_ft).get("top_level_post_id")
            if pid:
                return f"{group_url.rstrip('/').split('?')[0]}/posts/{pid}/"
    except Exception:
        # Best effort only: an unreadable attribute means "no permalink".
        pass
    return None


def _read_author(post_context):
    """Return the author text of a post, or "Unknown" when no selector matches."""
    author_xpaths = (
        ".//h2//span//span",
        ".//strong//span",
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
    )
    for xpath in author_xpaths:
        try:
            return post_context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
    return "Unknown"


def _click_while_present(post_context, xpath, max_clicks=50):
    """Click the element matching *xpath* until it is gone or *max_clicks* is hit.

    The cap keeps a button that never leaves the DOM from looping forever.
    """
    for _ in range(max_clicks):
        try:
            btn = post_context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", btn)
            time.sleep(2)
        except Exception:
            break


def _read_caption(post_context):
    """Return the post caption (at most 2000 characters), or "" on failure."""
    try:
        blocks = post_context.find_elements(
            By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']"
        )
        texts = [text for text in (block.text.strip() for block in blocks) if text]
        return "\n".join(texts)[:2000]
    except Exception:
        return ""


def _read_comments(post_context):
    """Return the distinct non-empty comment texts in page order, or [] on failure."""
    try:
        blocks = post_context.find_elements(
            By.XPATH, ".//div[@aria-label='Komentar']//div[@dir='auto']"
        )
        texts = (block.text.strip() for block in blocks)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(text for text in texts if text))
    except Exception:
        return []


def _capture_post(post_context, group_name, group_url, post_url):
    """Build the result record for one post from its element *post_context*."""
    author = _read_author(post_context)

    # Expand collapsed comments, then collapsed replies, before reading them.
    _click_while_present(
        post_context,
        ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]",
    )
    _click_while_present(
        post_context,
        ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | .//span[contains(text(),'View') and contains(text(),'replies')]",
    )

    return {
        "group_name": group_name,
        "group_url": group_url,
        "post_url": post_url,
        "author": author,
        "caption": _read_caption(post_context),
        "comments": _read_comments(post_context),
    }


def scrape_group(group_url, group_name, max_scrolls=3, max_posts=None):
    """Scrape posts (author, caption, comments) from one Facebook group.

    The work is done in two phases so that feed elements are never used
    after the browser has navigated away from the feed:

    1. Scroll the group feed and collect each post's permalink. A post with
       no permalink is read straight from the feed and stored with the group
       URL as its ``post_url``.
    2. Open every collected permalink and read the post from its own page.

    Args:
        group_url: URL of the group feed.
        group_name: Label stored in each record's ``group_name`` field.
        max_scrolls: Maximum number of feed scroll rounds.
        max_posts: Optional cap on the number of posts gathered; ``None`` or
            0 means no cap.

    Returns:
        A list of dicts with the keys ``group_name``, ``group_url``,
        ``post_url``, ``author``, ``caption`` and ``comments``.
    """
    print(f"π₯ Scraping grup: {group_name} ({group_url})")
    driver.get(group_url)
    time.sleep(4)
    ensure_logged_in()

    posts = []
    permalinks = []           # ordered queue for phase 2
    seen_permalinks = set()   # avoids re-queueing posts seen in earlier rounds
    seen_inline = set()       # (author, caption) of posts read from the feed
    last_height = driver.execute_script("return document.body.scrollHeight")

    def limit_reached():
        return bool(max_posts) and len(posts) + len(permalinks) >= max_posts

    # ---- Phase 1: walk the feed, never leaving the group page ----
    for scroll_round in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        ensure_logged_in()

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"π Ditemukan {len(post_elements)} postingan pada scroll {scroll_round+1}")

        for idx, post in enumerate(post_elements):
            if limit_reached():
                break

            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", post)
                time.sleep(1)

                permalink = _find_permalink(post, group_url)
                if permalink:
                    if permalink not in seen_permalinks:
                        seen_permalinks.add(permalink)
                        permalinks.append(permalink)
                    continue

                print("β οΈ Tidak ada permalink & tidak bisa generate. Tetap lanjut simpan data.")
                record = _capture_post(post, group_name, group_url, group_url)
                key = (record["author"], record["caption"])
                if key in seen_inline:
                    continue
                seen_inline.add(key)
                print(f"✅ Post captured: {record['author']} | {record['caption'][:60]}... | {len(record['comments'])} komentar")
                posts.append(record)

            except Exception as e:
                print(f"β οΈ Error baca postingan {idx}: {e}")
                continue

        if limit_reached():
            break

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # ---- Phase 2: visit each permalink and read the post from its page ----
    for idx, permalink in enumerate(permalinks):
        try:
            driver.get(permalink)
            time.sleep(3)
            ensure_logged_in()
            post_context = driver.find_element(By.XPATH, "//div[@role='article']")
        except Exception as e:
            print(f"β οΈ Gagal buka permalink {permalink}: {e}")
            continue

        try:
            record = _capture_post(post_context, group_name, group_url, permalink)
        except Exception as e:
            print(f"β οΈ Error baca postingan {idx}: {e}")
            continue

        print(f"✅ Post captured: {record['author']} | {record['caption'][:60]}... | {len(record['comments'])} komentar")
        posts.append(record)

    return posts
|
| 362 |
+
|
| 363 |
+
# ========== MAIN ==========
all_data = []

try:
    fb_login()

    for g in GROUP_INPUTS:
        group_url = open_group(g)
        if group_url:
            all_data.extend(scrape_group(group_url, g))
finally:
    # Always release the browser, even when login or scraping raised.
    try:
        driver.quit()
    except Exception:
        # Shutdown problems are deliberately ignored.
        pass

# Save to CSV.
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["group_name", "group_url", "post_url", "author", "caption", "comments"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_data)

# Save to JSON.
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

print(f"✅ Selesai. Data disimpan ke {OUTPUT_CSV} dan {OUTPUT_JSON}")
|
medos_scraping.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from json import JSONDecodeError
|
| 8 |
+
from selenium import webdriver
|
| 9 |
+
from selenium.webdriver.common.by import By
|
| 10 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 11 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 12 |
+
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
| 13 |
+
from selenium.webdriver.common.keys import Keys
|
| 14 |
+
|
| 15 |
+
# ==============================================================================
|
| 16 |
+
# KONFIGURASI SELENIUM
|
| 17 |
+
# ==============================================================================
|
| 18 |
+
|
| 19 |
+
def setup_driver():
    """Create the Selenium Chrome WebDriver used for scraping.

    Returns:
        The ``webdriver.Chrome`` instance, or ``None`` when Chrome could not
        be started (the error is printed).
    """
    chrome_options = webdriver.ChromeOptions()
    # '--headless' is not enabled here.
    chrome_flags = (
        '--disable-gpu',
        '--log-level=3',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    try:
        return webdriver.Chrome(options=chrome_options)
    except Exception as startup_error:
        print(f"Error saat memulai WebDriver: {startup_error}")
        print("Pastikan chromedriver sudah diunduh dan berada di folder yang sama.")
        return None
|
| 35 |
+
|
| 36 |
+
# ==============================================================================
|
| 37 |
+
# FUNGSI COOKIES & CAPTCHA
|
| 38 |
+
# ==============================================================================
|
| 39 |
+
|
| 40 |
+
def save_cookies(driver, path):
    """Dump the browser session's cookies to *path* as indented JSON."""
    serialized = json.dumps(driver.get_cookies(), indent=2)
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f"\nCookies berhasil disimpan ke {path}")
|
| 45 |
+
|
| 46 |
+
# [PERBAIKAN] Fungsi ini dibuat lebih tangguh terhadap file kosong/rusak
|
| 47 |
+
def load_cookies(driver, path):
    """Load cookies from the JSON file at *path* into *driver*.

    Returns:
        True when every cookie was added. False when the file is missing or
        empty, is not valid JSON, does not hold a list, or any other error
        occurs; the reason is printed in each case.
    """
    file_missing = not os.path.exists(path)
    if file_missing or os.path.getsize(path) == 0:
        print(f"File cookies '{path}' tidak ditemukan atau kosong.")
        return False

    try:
        with open(path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        if isinstance(payload, list):
            for entry in payload:
                driver.add_cookie(entry)
            print(f"Cookies berhasil dimuat dari {path}")
            return True

        print(f"Format data di '{path}' tidak valid (bukan list).")
        return False
    except JSONDecodeError:
        print(f"Gagal membaca '{path}' karena file rusak (JSONDecodeError).")
        return False
    except Exception as load_error:
        print(f"Terjadi error saat memuat cookies dari '{path}': {load_error}")
        return False
|
| 71 |
+
|
| 72 |
+
def _profile_feed_loaded(driver, timeout=10):
    """Return True once a profile video link is present, False on timeout."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a'))
        )
        return True
    except TimeoutException:
        return False


def establish_and_verify_session(driver, base_cookies_path, profile_cookies_path, profile_url):
    """Build a TikTok session in two stages, asking for manual CAPTCHA help.

    Stage 1 opens the TikTok home page and loads the base cookies; when they
    cannot be loaded the user is prompted to solve the CAPTCHA and the
    resulting cookies are saved. Stage 2 opens *profile_url* and tries the
    profile cookies; when those are missing or the video grid does not
    appear, the user is prompted again and the profile cookies are re-saved.

    Args:
        driver: Selenium WebDriver to operate.
        base_cookies_path: JSON cookie file for the home-page session.
        profile_cookies_path: JSON cookie file for the profile-page session.
        profile_url: URL of the profile page to verify against.

    Returns:
        True when a profile video link became visible, False otherwise.
    """
    # --- STAGE 1: BUILD THE BASE SESSION (HOMEPAGE) ---
    print("\n--- Tahap 1: Membangun Sesi Dasar di tiktok.com ---")
    driver.get("https://www.tiktok.com/")

    if not load_cookies(driver, base_cookies_path):
        print("\n" + "="*50)
        print("βΌοΈ TINDAKAN AWAL DIPERLUKAN βΌοΈ")
        input("File cookies dasar tidak valid/tidak ada. Selesaikan CAPTCHA di tiktok.com, lalu tekan [Enter]...")
        save_cookies(driver, base_cookies_path)

    driver.refresh()
    print("Sesi dasar telah dibuat/dimuat.")

    # --- STAGE 2: VERIFY THE PROFILE SESSION ---
    print("\n--- Tahap 2: Verifikasi Sesi di Halaman Profil ---")
    driver.get(profile_url)

    if load_cookies(driver, profile_cookies_path):
        print("Mencoba memvalidasi sesi dengan cookies profil...")
        driver.refresh()
        if _profile_feed_loaded(driver):
            print("✅ Sesi profil berhasil dipulihkan.")
            return True
        print("β οΈ Cookies profil tidak valid. Diperlukan verifikasi manual.")

    # Fall back to manual verification by the user.
    print("\n" + "="*50)
    print("βΌοΈ VERIFIKASI SEBELUM SCRAPING βΌοΈ")
    input("Halaman profil telah dimuat. Jika ada CAPTCHA, selesaikan sekarang. Tekan [Enter]...")
    save_cookies(driver, profile_cookies_path)

    if _profile_feed_loaded(driver):
        print("✅ Sesi profil berhasil dibuat/diperbarui.")
        return True

    print("β Gagal memverifikasi halaman profil.")
    return False
|
| 121 |
+
|
| 122 |
+
# ==============================================================================
|
| 123 |
+
# FUNGSI-FUNGSI BANTUAN SCRAPING (Tidak Berubah)
|
| 124 |
+
# ==============================================================================
|
| 125 |
+
|
| 126 |
+
def get_video_links(driver, max_videos):
    """Collect video links from the currently open profile page.

    The page is scrolled to the bottom repeatedly until *max_videos* unique
    links have been gathered or a pass adds no new link.

    Args:
        driver: Selenium WebDriver already showing the profile page.
        max_videos: Maximum number of links to return.

    Returns:
        Up to *max_videos* unique links in the order they were first seen on
        the page, or an empty list when no video link appears within the
        initial wait.
    """
    print(f"\nπ Mulai mengumpulkan link video (target: {max_videos} video)...")
    # A dict is used as an insertion-ordered set so that the final slice
    # keeps the first links found instead of an arbitrary subset.
    video_links = {}

    try:
        # 1. Wait for the first video element to appear.
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a'))
        )
        print("✅ Halaman profil berhasil dimuat.")

        # 2. Scroll and collect links.
        while len(video_links) < max_videos:
            # Remember the count from before this pass to detect the end of
            # the page further down.
            links_before_scroll = len(video_links)

            # Collect every link currently present in the DOM.
            video_elements = driver.find_elements(By.CSS_SELECTOR, 'div[data-e2e="user-post-item"] a')
            for elem in video_elements:
                href = elem.get_attribute('href')
                if href:
                    video_links.setdefault(href, None)

            if len(video_links) >= max_videos:
                print(f"π― Target {max_videos} video tercapai ({len(video_links)} ditemukan). Berhenti scroll.")
                break

            print(f"π Scrolling... Ditemukan {len(video_links)}/{max_videos} video.")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Give newly loaded content time to render.
            time.sleep(3)

            # 3. Stop when this pass found nothing new, to avoid looping
            # forever at the bottom of the page.
            if len(video_links) == links_before_scroll:
                print("π Halaman sudah paling bawah atau tidak ada video baru yang dimuat.")
                break

    except TimeoutException:
        print("β Gagal memuat halaman profil atau tidak ada video ditemukan.")
        return []

    print(f"\nπ Selesai mengumpulkan. Total {len(video_links)} link video unik ditemukan.")

    # Never return more than max_videos links.
    return list(video_links)[:max_videos]
|
| 179 |
+
|
| 180 |
+
def check_for_captcha(driver):
    """Report whether a CAPTCHA prompt is present, looking inside iframes too.

    Every iframe is searched first, then the main document. The driver is
    switched back to the main document after each iframe is inspected.

    Returns:
        True when an element containing one of the known CAPTCHA phrases is
        found, False otherwise.
    """
    captcha_phrases = (
        "Drag the slider to fit the puzzle",
        "Drag the puzzle piece into place",
        "Geser puzzle untuk melengkapi gambar",
        "Verify to continue",
    )
    # contains(., '...') matches against an element's full text content.
    conditions = " or ".join(f"contains(., '{phrase}')" for phrase in captcha_phrases)
    captcha_xpath = f"//*[{conditions}]"

    # 1. Look inside the iframes first.
    try:
        frames = driver.find_elements(By.TAG_NAME, 'iframe')
        if frames:
            print(f"\n Mendeteksi {len(frames)} iFrame, memeriksa satu per satu untuk CAPTCHA...")
            for candidate in frames:
                try:
                    driver.switch_to.frame(candidate)
                    driver.find_element(By.XPATH, captcha_xpath)
                except NoSuchElementException:
                    # Nothing in this frame: go back and try the next one.
                    driver.switch_to.default_content()
                else:
                    print("\nβ οΈ CAPTCHA terdeteksi di dalam sebuah iFrame!")
                    # Return to the main document so later code is unaffected.
                    driver.switch_to.default_content()
                    return True
    except Exception as frame_error:
        print(f"\n Error saat memeriksa iFrame: {frame_error}")
        # Make sure the main document is active after an unexpected error.
        driver.switch_to.default_content()

    # 2. Fall back to the main document.
    try:
        driver.find_element(By.XPATH, captcha_xpath)
    except NoSuchElementException:
        return False
    print("\nβ οΈ CAPTCHA terdeteksi di halaman utama!")
    return True
|
| 224 |
+
|
| 225 |
+
def scrape_video_details(driver, video_url):
    """Scrape metadata, caption and the comment tree of a single video page.

    Args:
        driver: Selenium WebDriver used to load and inspect the page.
        video_url: URL of the video page to open.

    Returns:
        A dict with the keys ``url``, ``upload_date``, ``like_count``,
        ``caption_short``, ``caption_detail`` and ``comments`` (a list of
        ``{'author', 'comment', 'replies'}`` dicts, where each reply is an
        ``{'author', 'comment'}`` dict), or ``None`` when the page timed out
        without a CAPTCHA being detected or all retries were used up.
    """
    print(f"\n--- Memproses video: {video_url} ---")
    driver.get(video_url)

    max_retries = 2
    # Every top-level comment and every reply is rendered in one of these wrappers.
    comment_item_xpath = '//div[contains(@class, "DivCommentItemWrapper")]'

    for attempt in range(max_retries):
        try:
            upload_date = "N/A"
            like_count = "N/A"

            try:
                date_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
                )
                upload_date = date_element.text
            except TimeoutException:
                print(" Β -> Info tanggal video tidak ditemukan.")

            try:
                like_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'strong[data-e2e="like-count"]'))
                )
                like_count = like_element.text
                print(f" Β -> Jumlah 'like' ditemukan: {like_count}")
            except TimeoutException:
                print(" Β -> Info jumlah 'like' tidak ditemukan.")

            video_data = {'url': video_url, 'upload_date': upload_date, 'like_count': like_count, 'caption_short': '', 'caption_detail': '', 'comments': []}

            # --- Caption handling ---
            try:
                # 1. Wait for the description container first.
                desc_container = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
                )

                # 2. The caption span is optional, so look it up in its own try block.
                try:
                    video_data['caption_short'] = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]').text
                    print(f" Β -> Caption singkat ditemukan: {video_data['caption_short'][:50]}...")

                    # 3. Only try to expand the caption when a caption exists.
                    try:
                        more_button = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
                        driver.execute_script("arguments[0].click();", more_button)
                        print(" Β -> Tombol 'more' (ikon) pada caption diklik.")
                        time.sleep(2)
                        detail_container = WebDriverWait(driver, 5).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
                        )
                        desc_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-desc']").text
                        keywords_text = ""
                        try:
                            keywords_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-keywords']").text
                        except NoSuchElementException:
                            # Keywords are optional; keep the empty default.
                            pass
                        video_data['caption_detail'] = f"Deskripsi: {desc_text}\nKeywords: {keywords_text}".strip()
                        print(f" Β -> Caption detail ditemukan: {video_data['caption_detail'][:50]}...")
                    except (NoSuchElementException, TimeoutException):
                        print(" Β -> Tidak ada tombol 'more' untuk caption detail.")

                except NoSuchElementException:
                    # No caption element: report it and carry on.
                    print(" Β -> Video ini tidak memiliki caption.")

            except TimeoutException:
                # NOTE(review): this timeout is swallowed here, so the CAPTCHA
                # retry handler at the bottom of the loop is not triggered by it.
                print(" Β -> Bagian deskripsi/caption tidak ditemukan, kemungkinan halaman terhalang.")

            # --- Comment handling ---
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
                )
                print(" Β -> Bagian komentar ditemukan. Memuat seluruh komentar...")
            except TimeoutException:
                print(" Β -> Bagian komentar tidak ditemukan.")
                return video_data

            try:
                print(" Β -> Memulai proses scroll dan klik balasan secara dinamis...")
                reply_button_xpath = "//span[contains(text(), 'balasan') or (contains(text(), 'View') and contains(text(), 'reply') or contains(text(), 'replies'))]"

                last_comment_count = 0
                stalled_attempts = 0
                max_stalled_attempts = 5
                # A reply button that stays on the page after being clicked would
                # otherwise be clicked forever (each click resets the stall
                # counter), so consecutive clicks without visible progress are capped.
                dead_clicks = 0
                max_dead_clicks = 3

                while stalled_attempts < max_stalled_attempts:
                    if dead_clicks < max_dead_clicks:
                        try:
                            view_buttons = driver.find_elements(By.XPATH, reply_button_xpath)
                            if view_buttons:
                                items_before = len(driver.find_elements(By.XPATH, comment_item_xpath))
                                print(f" Β Β -> Menemukan {len(view_buttons)} tombol balasan. Mengklik satu...")
                                driver.execute_script("arguments[0].click();", view_buttons[0])
                                time.sleep(2)
                                items_after = len(driver.find_elements(By.XPATH, comment_item_xpath))
                                buttons_after = len(driver.find_elements(By.XPATH, reply_button_xpath))
                                if items_after > items_before or buttons_after < len(view_buttons):
                                    dead_clicks = 0
                                else:
                                    dead_clicks += 1
                                    if dead_clicks >= max_dead_clicks:
                                        print(f"     -> Tombol balasan tidak merespons setelah {max_dead_clicks} klik, beralih ke scroll.")
                                stalled_attempts = 0
                                continue
                        except Exception as e:
                            print(f" Β Β -> Error minor saat mengklik tombol balasan: {e}")

                    print(" Β Β -> Tidak ada tombol balasan terlihat. Melakukan scroll...")
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(3)

                    current_comment_count = len(driver.find_elements(By.XPATH, comment_item_xpath))
                    if current_comment_count > last_comment_count:
                        print(f" Β Β -> Konten baru dimuat. Total item sekarang: {current_comment_count}")
                        last_comment_count = current_comment_count
                        stalled_attempts = 0
                        # New content may bring fresh, working reply buttons.
                        dead_clicks = 0
                    else:
                        stalled_attempts += 1
                        print(f" Β Β -> Konten tidak bertambah, percobaan ke-{stalled_attempts}/{max_stalled_attempts}.")

                print(" Β -> Scroll dan klik selesai. Memulai ekstraksi final...")

                comment_item_count = len(driver.find_elements(By.XPATH, comment_item_xpath))
                print(f" Β -> Ditemukan total {comment_item_count} item komentar. Memproses satu per satu...")

                for i in range(comment_item_count):
                    try:
                        # The list is looked up again on every iteration,
                        # presumably to avoid stale element references.
                        all_comment_items = driver.find_elements(By.XPATH, comment_item_xpath)
                        item = all_comment_items[i]

                        # Top-level comment: starts a new entry.
                        try:
                            author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
                            comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]')
                            new_comment = {
                                'author': author_element.text,
                                'comment': comment_element.text,
                                'replies': []
                            }
                            video_data['comments'].append(new_comment)
                            continue
                        except NoSuchElementException:
                            pass

                        # Reply: attached to the most recent top-level comment.
                        try:
                            reply_author_element = item.find_element(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
                            reply_comment_element = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]')
                            if video_data['comments']:
                                new_reply = {
                                    'author': reply_author_element.text,
                                    'comment': reply_comment_element.text
                                }
                                video_data['comments'][-1]['replies'].append(new_reply)
                        except NoSuchElementException:
                            pass
                    except IndexError:
                        print(f" Β Β -> Peringatan: Jumlah komentar berubah saat proses. Melewatkan indeks ke-{i}.")
                        break
                    except Exception as e:
                        print(f" Β Β -> Terjadi error pada item ke-{i}, melewati. Error: {e}")

                print(" Β -> Selesai. Berhasil memproses dan mengelompokkan komentar.")

            except Exception as e:
                # Best effort: keep whatever was collected before the failure.
                print(f" Β -> Gagal pada proses utama karena: {e}")

            return video_data

        except TimeoutException:
            print(" Β -> Gagal memuat elemen halaman (Timeout).")
            if check_for_captcha(driver):
                print("\n" + "="*50)
                print(f"β οΈ CAPTCHA terdeteksi pada percobaan ke-{attempt + 1} untuk video: {video_url}")
                input(" Β Silakan selesaikan CAPTCHA di browser, lalu tekan [Enter] untuk mencoba lagi...")
                driver.refresh()
                print(" Β Mencoba lagi...")
                continue
            else:
                print(" Β -> Tidak ada CAPTCHA. Melewati video ini.")
                return None

    print(f" Β -> Gagal memproses video setelah {max_retries} kali percobaan. Melewati video ini.")
    return None
|
| 401 |
+
# ==============================================================================
|
| 402 |
+
# EKSEKUSI UTAMA (Tidak Berubah)
|
| 403 |
+
# ==============================================================================
|
| 404 |
+
if __name__ == "__main__":
    # Profiles to scrape and the per-profile video limit.
    PROFILE_USERNAMES = ["rctvcirebon", "cirebonkabtv", "kang_jigus", "kangimron_", "info.cirebonan"]
    MAX_VIDEOS_PER_PROFILE = 200

    BASE_COOKIES_FILE = "tiktok_base_cookies.json"
    PROFILE_COOKIES_FILE = "tiktok_profile_cookies.json"

    all_data = []
    driver = setup_driver()

    if driver:
        try:
            if not PROFILE_USERNAMES:
                print("Daftar PROFILE_USERNAMES kosong.")
            else:
                first_profile_url = f"https://www.tiktok.com/@{PROFILE_USERNAMES[0]}"
                session_ok = establish_and_verify_session(driver, BASE_COOKIES_FILE, PROFILE_COOKIES_FILE, first_profile_url)

                if session_ok:
                    for username in PROFILE_USERNAMES:
                        print("\n" + "="*70)
                        print(f"MEMULAI SCRAPING UNTUK PROFIL: @{username}")
                        print("="*70)

                        profile_url = f"https://www.tiktok.com/@{username}"
                        driver.get(profile_url)

                        video_urls = get_video_links(driver, MAX_VIDEOS_PER_PROFILE)

                        for url in video_urls:
                            data = scrape_video_details(driver, url)
                            if data:
                                data['profile_username'] = username
                                data['scrape_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                all_data.append(data)
                            time.sleep(2)

        except Exception as e:
            print(f"\nTerjadi kesalahan fatal selama proses: {e}")
        finally:
            # Saving happens here so that a failure (or Ctrl-C) in the middle
            # of a long run still writes out everything collected so far.
            try:
                if all_data:
                    print("\nMenyimpan semua data yang terkumpul...")
                    df = pd.DataFrame(all_data)
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    output_filename = f"tiktok_data_multi_{timestamp}"
                    df.to_csv(f'{output_filename}.csv', index=False, encoding='utf-8-sig')
                    print(f"Data telah disimpan ke {output_filename}.csv")
                    with open(f'{output_filename}.json', 'w', encoding='utf-8') as f:
                        json.dump(all_data, f, ensure_ascii=False, indent=4)
                    print(f"Data telah disimpan ke {output_filename}.json")
                else:
                    print("\nTidak ada data yang berhasil dikumpulkan untuk disimpan.")
            except Exception as e:
                print(f"\nTerjadi kesalahan fatal selama proses: {e}")
            finally:
                print("\n--- PROSES SELESAI ---")
                driver.quit()
|
preparing.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Preparing.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/15vt4W7yYW7JIYujXVDkuQ-a28ZvoIHBg
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# NOTE: the Colab notebook this file was exported from installed its
# dependencies with IPython shell magics ("!pip ..."). Those lines are not
# valid Python and make this module fail with a SyntaxError when it is run or
# imported as a plain .py file, so they are kept here as comments only.
# Install the packages before running the script, for example:
#   pip install -U transformers accelerate torch
#   pip install torch torchvision torchaudio transformers --index-url https://download.pytorch.org/whl/cu118
|
| 16 |
+
|
| 17 |
+
import pandas as pd
|
| 18 |
+
import numpy as np
|
| 19 |
+
import matplotlib.pyplot as plt
|
| 20 |
+
import seaborn as sns
|
| 21 |
+
import os
|
| 22 |
+
import torch
|
| 23 |
+
import re
|
| 24 |
+
|
| 25 |
+
from textblob import TextBlob
|
| 26 |
+
from transformers import AutoConfig, pipeline
|
| 27 |
+
|
| 28 |
+
# Preview every CSV dataset found in the project folder.
folder_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis'

try:
    files = os.listdir(folder_path)

    for file_name in files:
        # Anything that is not a CSV file is ignored.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Membaca file: {file_name}")

        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Tidak dapat membaca file {file_name}. Error: {e}\n")
        else:
            # Print the frame, followed by a spacer between files.
            print(df)
            print("\n")

except FileNotFoundError:
    print(f"Error: Folder '{folder_path}' tidak ditemukan.")
except Exception as e:
    print(f"Terjadi error: {e}")
|
| 58 |
+
|
| 59 |
+
# Medsos
|
| 60 |
+
|
| 61 |
+
# 1. Instagram: load and rename columns to the shared schema.
ig_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/instagram_data_20250815_025750.csv'
df_ig = pd.read_csv(ig_path).rename(columns={
    'source_name': 'profile',
    'post_url': 'url'
})

# 2. TikTok: load, rename columns to the shared schema, drop the upload date.
tiktok_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/tiktok_data_multi_20250816_173832.csv'
df_tiktok = (
    pd.read_csv(tiktok_path)
    .rename(columns={
        'like_count': 'likes',
        'caption_short': 'caption',
        'profile_username': 'profile',
        'scrape_date': 'datetime'
    })
    .drop(columns=['upload_date'])
)

# --- Column selection and datetime conversion (done BEFORE merging) ---

kolom_yang_dipilih = ['profile', 'url', 'likes', 'caption', 'comments', 'datetime']

# Instagram subset, tagged with its origin.
df1_pilihan = df_ig[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Instagram',
)

# TikTok subset, tagged with its origin.
df2_pilihan = df_tiktok[kolom_yang_dipilih].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'),
    asal_dataset='Tiktok',
)

# --- Merge ---
df_gabungan = pd.concat([df1_pilihan, df2_pilihan], ignore_index=True)

# --- Preprocessing ---

# Like counts that cannot be parsed as numbers become 0.
likes_numeric = pd.to_numeric(df_gabungan['likes'], errors='coerce')
df_gabungan['likes'] = likes_numeric.fillna(0).astype(int)
|
| 101 |
+
|
| 102 |
+
def clean_text(text):
    """Return *text* lower-cased with URLs and non-letter characters removed.

    Missing values (as judged by ``pd.isna``) become an empty string.
    Everything that is not an ASCII letter or whitespace is replaced by a
    space, then whitespace runs are collapsed and the result is stripped.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text).lower()
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),  # URLs
        (r'[^a-zA-Z\s]', ' '),             # digits, punctuation, non-ASCII
        (r'\s+', ' '),                     # whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
| 109 |
+
|
| 110 |
+
def format_author(text):
    """Insert ``", "`` before every word starting with "author" except at position 0.

    The input is converted with ``str()`` first, so non-string values are
    formatted through their string representation.
    """
    as_text = str(text)
    return re.sub(r'(?<!^)\bauthor', r', author', as_text)
|
| 113 |
+
|
| 114 |
+
# Normalise the free-text columns.
df_gabungan['caption'] = df_gabungan['caption'].apply(clean_text)
df_gabungan['comments'] = df_gabungan['comments'].apply(clean_text)

# Remove stand-alone "br" tokens (presumably residue of <br> tags once
# clean_text has stripped the angle brackets). Only whole tokens are removed:
# a plain substring replace would also eat "br" inside ordinary words such as
# "februari" or "brebes".
df_gabungan['caption'] = (
    df_gabungan['caption']
    .str.replace(r'\bbr\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Same idea for the "replies" token in the flattened comment text.
df_gabungan['comments'] = (
    df_gabungan['comments']
    .str.replace(r'\breplies\b', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df_gabungan['comments'] = df_gabungan['comments'].apply(format_author)

# Drop rows with missing values in the key columns, then exact duplicates.
df_gabungan = df_gabungan.dropna(subset=['datetime', 'caption'])
df_gabungan = df_gabungan.drop_duplicates()

# --- Final result ---
print("\n--- HASIL AKHIR SETELAH PERBAIKAN FINAL ---")
print(f"Total baris Instagram: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Instagram'])}")
print(f"Total baris TikTok: {len(df_gabungan[df_gabungan['asal_dataset'] == 'Tiktok'])}")
df_gabungan.info()

# Save to a new CSV file.
save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/gabungan.csv'
df_gabungan.to_csv(save_path, index=False)
print(f"\nData berhasil disimpan di: {save_path}")
|
| 134 |
+
|
| 135 |
+
# Berita
|
| 136 |
+
|
| 137 |
+
df_berita = pd.read_csv('/content/drive/MyDrive/Machine Learning/Sentiment Analysis/power_ranger.csv')

# Lower-case the tags and drop the literal ", nan" fragments.
df_berita['tag'] = df_berita['tag'].str.lower().str.replace(', nan', '', regex=False)

# Keep only articles whose tag mentions "cirebon".
df_berita_filtered = df_berita[df_berita['tag'].str.contains('cirebon', na=False)].copy()

# dropna() without a subset already removes rows with a missing value in any
# column, so no separate per-column dropna is needed at this point.
df_berita = df_berita_filtered.dropna().drop_duplicates()

df_berita['tanggal'] = pd.to_datetime(df_berita['tanggal'], errors='coerce')
df_berita['tag'] = df_berita['tag'].apply(clean_text)
df_berita['judul'] = df_berita['judul'].apply(clean_text)
df_berita['isi_berita'] = df_berita['isi_berita'].str.lower()

# Exclude articles whose tag matches one of the unwanted topics. The explicit
# copy keeps the column assignment below from writing into a filtered slice.
df_berita = df_berita[~df_berita['tag'].str.contains(r'promo|diskon|iklan|daihatsu|sholat|shalat|rumah|puasa', regex=True)].copy()

# Strip boilerplate fragments from the article body.
df_berita['isi_berita'] = (
    df_berita['isi_berita']
    .str.replace(r'(?i)scroll.*?content', '', regex=True)
    .str.replace(r'(?i)h3:', '', regex=True)
    .str.replace(r'(?i)tonton.*?20detik\]', '', regex=True)
    .str.replace(r'(?i)editor.*?antara', '', regex=True)
    .str.replace(r'(?i)pewarta.*?antara', '', regex=True)
    .str.replace(r'(?i)copyright.*?(antara|com)', '', regex=True)
    .str.replace(r'(?i)dilarang.*?antara', '', regex=True)
    .str.replace(r'(?i)advertisement', '', regex=True)
    .str.replace(r'(?i)baca (juga )?[^.]+sini\.?', '', regex=True)
    .str.replace(r'(?i)\bradar\b.*?-', '', regex=True)
    .str.replace(r'(?i)(cirebon|kuningan|jawa|majalengka|indramayu|kendal|boyolali|jakarta|bandung|losarang|jatibarang|flores|brebes|sumedang|garut|madura|mataram|banda)\s*-\s*', '', regex=True)
    .str.replace(r'(?i)cek.*?(sumber:|reportase)', '', regex=True)
)

# Cleaning can create new duplicates, and date parsing can create new NaT values.
df_berita = df_berita.drop_duplicates()
df_berita = df_berita.dropna(subset=['isi_berita', 'tag', 'tanggal'])

print(df_berita)

save_path = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita2.csv'
df_berita.to_csv(save_path, index=False)
|
| 178 |
+
|
| 179 |
+
MODEL_ID = "taufiqdp/indonesian-sentiment" # IndoBERT fine-tuned (3 classes)

# (optional) if you need a Hugging Face token for a private repo:
# from huggingface_hub import login
# login("hf_xxx") # your token

# Model configuration; normalize_label reads config.id2label to resolve
# "LABEL_<n>" style outputs.
config = AutoConfig.from_pretrained(MODEL_ID)

# Text-classification pipeline used for all sentiment predictions in this script.
clf = pipeline(
    task="text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
    device=0 if torch.cuda.is_available() else -1,  # device 0 when CUDA is available, otherwise -1
    truncation=True,
    max_length=256,
    return_all_scores=False,  # NOTE(review): believed to be deprecated in recent transformers releases — confirm before upgrading
)
|
| 195 |
+
|
| 196 |
+
def normalize_label(lbl: str) -> str:
    """Map a classifier label to one of "positif", "negatif" or "netral".

    Indonesian and English spellings are accepted case-insensitively, and
    surrounding whitespace is ignored. Labels of the form ``LABEL_<n>`` are
    first resolved through the module-level ``config.id2label`` and then
    mapped the same way; if the index cannot be parsed or is unknown the
    result is "netral". Any other label is returned lower-cased.
    """
    canonical = {
        "positif": "positif", "positive": "positif",
        "negatif": "negatif", "negative": "negatif",
        "netral": "netral", "neutral": "netral",
    }

    l = str(lbl).strip().lower()

    # Fallback for the generic 'LABEL_0/1/2' format.
    if "label_" in l:
        try:
            idx = int(l.split("_")[-1])
            l = str(config.id2label[idx]).strip().lower()
        except (ValueError, KeyError, IndexError):
            # Unparseable or unknown index: treat as neutral rather than fail.
            return "netral"

    return canonical.get(l, l)
|
| 209 |
+
|
| 210 |
+
# Sentiment for the CAPTION column: missing values are scored as empty strings.
texts_caption = list(df_gabungan['caption'].fillna("").astype(str))
preds_caption = clf(texts_caption, batch_size=64)
df_gabungan['sentimen_caption'] = list(
    map(normalize_label, (pred['label'] for pred in preds_caption))
)

# Sentiment for the COMMENTS column, handled the same way.
texts_comments = list(df_gabungan['comments'].fillna("").astype(str))
preds_comments = clf(texts_comments, batch_size=64)
df_gabungan['sentimen_comments'] = list(
    map(normalize_label, (pred['label'] for pred in preds_comments))
)
|
| 219 |
+
|
| 220 |
+
# (opsional) buat kolom sentimen gabungan
|
| 221 |
+
# kalau caption netral/empty, ambil dari comments
|
| 222 |
+
def combine_sentiment(row):
    """Return the caption sentiment, or the comment sentiment when the caption is "netral".

    *row* is any mapping-like object with the keys ``sentimen_caption`` and
    ``sentimen_comments``; the latter is only read when it is needed.
    """
    caption_label = row['sentimen_caption']
    return row['sentimen_comments'] if caption_label == "netral" else caption_label
|
| 226 |
+
# Combined sentiment per post, written next to the script.
df_gabungan['sentimen'] = df_gabungan.apply(combine_sentiment, axis=1)

df_gabungan.to_csv('medsos2.csv', index=False)


# Same classification applied to the news frame (article body, 'isi_berita').
texts_b = list(df_berita['isi_berita'].fillna("").astype(str))
preds_b = clf(texts_b, batch_size=64)
df_berita['sentimen'] = list(
    map(normalize_label, (pred['label'] for pred in preds_b))
)

df_berita.to_csv('berita2.csv', index=False)
|
requirements.txt
ADDED
|
Binary file (3.95 kB). View file
|
|
|
runtime.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python-3.10
|
sentimentanalysis.py
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""SentimentAnalysis
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/fatihramadhan/sentimentanalysis.74f160cb-74cc-4609-ba85-0081c3654a18.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20260326/auto/storage/goog4_request%26X-Goog-Date%3D20260326T141800Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2fe877a762338b5e556a035ce46a5a6bf9c51c0d33c4b062e919cfd44e0297ff787b3a23bf4290b33ca0467d04cf7ba377d77c975cd79da4f1adfec176cb7d78d1eddf1eec10e87d86e656200eaed9b0781f5f5d215ee084957aa5a30c2e9fa1731c23b333d5f742767875bd84e34b83339d834639567639d817ad1295fbc8fd552a5ae92f938b90cb8d916b4a7190e208c6d0effdc10665a9405efffc12a2d4497159428e898204e32ad2d629a58e985c020c7febef459895fd34b052c37a041102284e207ed788a6490c64656ece6150fc355120a49cf2b2fdadda53018d3dba4f8aeda15faaa1eb9c9cef82a476c38be69504e5a5f98cf61686a2b337ea77
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
|
| 11 |
+
# THEN FEEL FREE TO DELETE THIS CELL.
|
| 12 |
+
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
|
| 13 |
+
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
|
| 14 |
+
# NOTEBOOK.
|
| 15 |
+
import kagglehub
|
| 16 |
+
fatihramadhan_sentimentdataset_path = kagglehub.dataset_download('fatihramadhan/sentimentdataset')
|
| 17 |
+
|
| 18 |
+
print('Data source import complete.')
|
| 19 |
+
|
| 20 |
+
import pandas as pd
|
| 21 |
+
import numpy as np
|
| 22 |
+
import matplotlib.pyplot as plt
|
| 23 |
+
|
| 24 |
+
import re
|
| 25 |
+
import html
|
| 26 |
+
import torch
|
| 27 |
+
import evaluate
|
| 28 |
+
import os
|
| 29 |
+
import transformers
|
| 30 |
+
import inspect
|
| 31 |
+
import joblib
|
| 32 |
+
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
from torch.utils.data import Dataset, DataLoader
|
| 35 |
+
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline
|
| 36 |
+
|
| 37 |
+
from sklearn.model_selection import train_test_split
|
| 38 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
| 39 |
+
from sklearn.metrics import accuracy_score, f1_score
|
| 40 |
+
from sklearn.utils import resample
|
| 41 |
+
|
| 42 |
+
# ----------------------------
|
| 43 |
+
# Konfigurasi
|
| 44 |
+
# ----------------------------
|
| 45 |
+
INPUT_PATH = "/kaggle/input/sentimentdataset/dataset_gabungan.csv"

# If you use a cased model (e.g. indobenchmark/indobert-base-p2), set this to False
APPLY_LOWERCASE = True

# Limit repeated characters
# (e.g. limit_repeated_chars("baguuuusss", 2) -> "baguuss")
LIMIT_REPEAT_CHARS = True
MAX_REPEAT = 2

# Column names (leave as None so they are guessed automatically)
TEXT_COL = None
LABEL_COL = None

# Supported labels (will be normalised to these forms); the keys include
# English spellings, abbreviations and a few misspellings.
CANON_LABELS = {"positif": "positif", "positive": "positif", "pos": "positif", 'positi': 'positif',
                "negatif": "negatif", "negative": "negatif", "neg": "negatif", 'negartif': 'negatif',
                "netral": "netral", "neutral": "netral", "neu": "netral", 'netr' : 'netral'}
|
| 62 |
+
|
| 63 |
+
# ----------------------------
|
| 64 |
+
# Utilitas
|
| 65 |
+
# ----------------------------
|
| 66 |
+
def guess_column(df: pd.DataFrame, candidates):
    """Choose a column of ``df``, preferring the names listed in ``candidates``.

    The candidate names are tried in the order given and the first one that
    exists as a column is returned. When none of them exists, the first
    column whose dtype is ``object`` is returned; when there is no such
    column either, the very first column of the frame is returned.
    """
    present = [name for name in candidates if name in df.columns]
    if present:
        return present[0]

    # Fallback: first object-typed column (NOT the longest one).
    object_columns = [col for col in df.columns if df[col].dtype == "object"]
    if object_columns:
        return object_columns[0]
    return df.columns[0]
|
| 73 |
+
|
| 74 |
+
url_pattern = re.compile(r"(https?://\S+|www\.\S+)")
|
| 75 |
+
mention_pattern = re.compile(r"@\w+")
|
| 76 |
+
hashtag_pattern = re.compile(r"#(\w+)")
|
| 77 |
+
multi_space_pattern = re.compile(r"\s+")
|
| 78 |
+
rt_fw_pattern = re.compile(r"\b(rt|fw|fwd)\b[:]?", flags=re.IGNORECASE)
|
| 79 |
+
|
| 80 |
+
# Optional: pola khusus yang sering ada di data komentar (hapus segmen "author ... comment")
|
| 81 |
+
author_comment_pattern = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)
|
| 82 |
+
|
| 83 |
+
def limit_repeated_chars(text: str, max_repeat: int = 2) -> str:
    """Shorten every run of one repeated character to ``max_repeat`` copies.

    A run is only touched when it is longer than ``max_repeat``. The pattern
    uses ``.``, so digits and spaces are shortened as well, not just letters.
    """
    # One character followed by at least `max_repeat` more copies of itself.
    run_pattern = r"(.)\1{" + ("%d" % max_repeat) + r",}"
    shortened_run = r"\1" * max_repeat
    return re.sub(run_pattern, shortened_run, text)
|
| 85 |
+
|
| 86 |
+
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that cleans raw text and maps label spellings to canonical names.

    Parameters
    ----------
    apply_lowercase : lowercase the cleaned text when true.
    limit_repeat : shorten runs of a repeated character when true.
    max_repeat : maximum run length kept when ``limit_repeat`` is true.
    canon_labels : mapping from a lowercase label spelling to its canonical
        label; a falsy value is stored as an empty dict.
    """

    def __init__(self,
                 apply_lowercase=True,
                 limit_repeat=True,
                 max_repeat=2,
                 canon_labels=None):
        self.apply_lowercase = apply_lowercase
        self.limit_repeat = limit_repeat
        self.max_repeat = max_repeat
        self.canon_labels = canon_labels if canon_labels else {}

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a Series holding the cleaned version of every item in ``X``."""
        # Missing values are turned into "" here so _clean_text only sees str.
        as_strings = pd.Series(X).fillna("").astype(str)
        return as_strings.apply(self._clean_text)

    def transform_labels(self, y):
        """Return a Series of canonical labels, or ``None`` when ``y`` is ``None``."""
        if y is None:
            return None
        return pd.Series(y).astype(str).apply(self._normalize_label)

    def _normalize_label(self, x):
        """Look up the stripped, lowercased label; unknown labels give ``None``."""
        if pd.isna(x):
            return None
        lookup_key = str(x).strip().lower()
        return self.canon_labels.get(lookup_key)

    def _clean_text(self, t: str) -> str:
        """Clean one text value; anything that is not a ``str`` becomes ``""``."""
        if not isinstance(t, str):
            return ""

        # Drop "author ... comment" segments.
        cleaned = author_comment_pattern.sub("", t)

        # Strip HTML tags / stray attributes, then unescape HTML entities.
        cleaned = html.unescape(remove_html_elements(cleaned))

        # Replace URLs and @mentions with placeholders. The punctuation strip
        # further down removes the angle brackets, leaving plain "url"/"user".
        cleaned = url_pattern.sub(" <url> ", cleaned)
        cleaned = mention_pattern.sub(" <user> ", cleaned)

        # "#word" -> "word"
        cleaned = hashtag_pattern.sub(lambda match: f"{match.group(1)}", cleaned)

        # Remove RT/FW/FWD tokens.
        cleaned = rt_fw_pattern.sub(" ", cleaned)

        # Keep only ASCII letters, digits and whitespace.
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", cleaned)

        # Collapse whitespace.
        cleaned = multi_space_pattern.sub(" ", cleaned).strip()

        if self.apply_lowercase:
            cleaned = cleaned.lower()

        if self.limit_repeat:
            cleaned = limit_repeated_chars(cleaned, self.max_repeat)

        return cleaned
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# Matches one complete <...> tag. Only the markup is matched, not the text
# between an opening and a closing tag.
TAG_RE = re.compile(r"<[^>]+>")

# Matches common HTML attributes that were left behind as plain text,
# e.g. class=foo or href="...".
ATTR_RE = re.compile(r"\b(class|id|style|role|tabindex|href|src|alt)=[^\s>]+", flags=re.IGNORECASE)


def remove_html_elements(text: str) -> str:
    """Strip HTML markup from ``text`` and return it with whitespace collapsed.

    HTML entities are unescaped first, then complete tags, leftover
    ``name=value`` attributes and any remaining ``<`` / ``>`` characters are
    replaced by spaces. A value that is not a ``str`` yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    result = html.unescape(text)

    # Tags first, then attribute fragments that survived as plain text.
    for markup_pattern in (TAG_RE, ATTR_RE):
        result = markup_pattern.sub(" ", result)

    # Any angle bracket still present did not belong to a complete tag.
    result = result.replace("<", " ").replace(">", " ")

    return re.sub(r"\s+", " ", result).strip()
|
| 183 |
+
|
| 184 |
+
# ----------------------------
|
| 185 |
+
# Load
|
| 186 |
+
# ----------------------------
|
| 187 |
+
path = Path(INPUT_PATH)
|
| 188 |
+
if not path.exists():
|
| 189 |
+
raise FileNotFoundError(f"File tidak ditemukan: {path.resolve()}")
|
| 190 |
+
|
| 191 |
+
df = pd.read_csv(path)
|
| 192 |
+
|
| 193 |
+
# ----------------------------
|
| 194 |
+
# Tentukan kolom teks & label
|
| 195 |
+
# ----------------------------
|
| 196 |
+
if TEXT_COL is None:
|
| 197 |
+
TEXT_COL = guess_column(df, ["text", "tweet", "content", "sentence", "caption", "judul", "deskripsi"])
|
| 198 |
+
if LABEL_COL is None:
|
| 199 |
+
LABEL_COL = guess_column(df, ["label", "sentiment", "polarity", "target", "kelas"])
|
| 200 |
+
|
| 201 |
+
print(f"Kolom teks terdeteksi : {TEXT_COL}")
|
| 202 |
+
print(f"Kolom label terdeteksi: {LABEL_COL}")
|
| 203 |
+
|
| 204 |
+
# ----------------------------
|
| 205 |
+
# Load Preproc
|
| 206 |
+
# ----------------------------
|
| 207 |
+
|
| 208 |
+
preproc = TextPreprocessor(
|
| 209 |
+
apply_lowercase=APPLY_LOWERCASE,
|
| 210 |
+
limit_repeat=LIMIT_REPEAT_CHARS,
|
| 211 |
+
max_repeat=MAX_REPEAT,
|
| 212 |
+
canon_labels=CANON_LABELS
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
# ----------------------------
|
| 216 |
+
# Penggunaan Preproc
|
| 217 |
+
# ----------------------------
|
| 218 |
+
# fit_transform teks
|
| 219 |
+
df["text"] = preproc.fit_transform(df[TEXT_COL])
|
| 220 |
+
df["sentiment"] = preproc.transform_labels(df[LABEL_COL])
|
| 221 |
+
|
| 222 |
+
# ----------------------------
|
| 223 |
+
# Drop Data jika Text Kosong
|
| 224 |
+
# ----------------------------
|
| 225 |
+
df = df[df["text"].str.strip().ne("")]
|
| 226 |
+
|
| 227 |
+
# ----------------------------
|
| 228 |
+
# Tampilkan contoh label tak dikenal
|
| 229 |
+
# ----------------------------
|
| 230 |
+
unknown = df[df["sentiment"].isna()]
|
| 231 |
+
print("\nContoh label tak dikenal yang akan dibuang:")
|
| 232 |
+
print(unknown[[LABEL_COL]].value_counts()) # tampilkan 10 teratas
|
| 233 |
+
|
| 234 |
+
# Buang label tak dikenal
|
| 235 |
+
before = len(df)
|
| 236 |
+
df = df[df["sentiment"].notna()]
|
| 237 |
+
dropped_unknown = before - len(df)
|
| 238 |
+
|
| 239 |
+
# ----------------------------
|
| 240 |
+
# Hapus duplikasi (berdasarkan teks bersih)
|
| 241 |
+
# ----------------------------
|
| 242 |
+
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
|
| 243 |
+
|
| 244 |
+
# ----------------------------
|
| 245 |
+
# Ringkasan
|
| 246 |
+
# ----------------------------
|
| 247 |
+
print("\nRingkasan setelah preprocessing:")
|
| 248 |
+
print(f" - Baris total : {len(df)}")
|
| 249 |
+
print(f" - Dibuang label tak dikenal: {dropped_unknown}")
|
| 250 |
+
print(" - Distribusi label:")
|
| 251 |
+
print(df["sentiment"].value_counts(dropna=False))
|
| 252 |
+
|
| 253 |
+
# Contoh pratinjau
|
| 254 |
+
print("\nContoh 5 baris:")
|
| 255 |
+
print(df[[TEXT_COL, "text", LABEL_COL, "sentiment"]].head(5))
|
| 256 |
+
|
| 257 |
+
# df.to_csv('/content/drive/MyDrive/Machine Learning/Latih Model/bersihhh.csv')
|
| 258 |
+
|
| 259 |
+
# ----------------------------
|
| 260 |
+
# Save Preproc
|
| 261 |
+
# ----------------------------
|
| 262 |
+
|
| 263 |
+
joblib.dump(preproc, "preprocessor.joblib")
|
| 264 |
+
|
| 265 |
+
# ============================
|
| 266 |
+
# PERBAIKAN LABEL BERDASARKAN KATA KUNCI
|
| 267 |
+
# ============================
|
| 268 |
+
|
| 269 |
+
# Definisikan kamus kata kunci untuk tiap label
|
| 270 |
+
NEGATIVE_KEYWORDS = {
    # Profanity / slang
    "bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
    "kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
    "kik", "goblog", "kntl",

    # Formal vocabulary
    "buruk", "lemah", "rendah", "gagal", "hancur", "rusak", "cacat",
    "jahat", "dusta", "bohong", "fitnah", "korup", "curang", "palsu",
    "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
    "tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
}

POSITIVE_KEYWORDS = {
    # General positive words
    "bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
    "cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
    "sempurna", "berhasil", "luas", "indah"
}

NEUTRAL_KEYWORDS = {
    # Neutral / generic words
    "ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
}


def _keyword_pattern(keywords):
    """Compile a regex that matches any keyword as a whole word or phrase."""
    # Longest first so a multi-word phrase is tried before its own parts.
    ordered = sorted(keywords, key=len, reverse=True)
    alternatives = "|".join(re.escape(keyword) for keyword in ordered)
    return re.compile(r"\b(?:" + alternatives + r")\b")


# Rules are checked in this order: negative wins over positive, positive over
# neutral (same priority as the original if-chain).
_KEYWORD_RULES = (
    ("negatif", _keyword_pattern(NEGATIVE_KEYWORDS)),
    ("positif", _keyword_pattern(POSITIVE_KEYWORDS)),
    ("netral", _keyword_pattern(NEUTRAL_KEYWORDS)),
)


def correct_label(row):
    """Override a row's sentiment label when its text contains a keyword.

    ``row`` must provide ``row["text"]`` (the cleaned text) and
    ``row["sentiment"]`` (the current label). The first rule whose keyword
    occurs in the text decides the returned label; when no rule matches the
    current label is returned unchanged.

    Keywords are matched as whole words/phrases. Plain substring matching
    fired inside unrelated words (for example "ok" inside "rokok", "salah"
    inside "masalah", "kik" inside "kikir") and silently relabelled those
    rows. The trade-off is that affixed forms such as "kebodohan" no longer
    match; add them to the keyword sets explicitly if they are wanted.
    """
    text = row["text"]
    label = row["sentiment"]

    for forced_label, pattern in _KEYWORD_RULES:
        if pattern.search(text):
            return forced_label

    # No rule matched: keep the original label.
    return label
|
| 311 |
+
|
| 312 |
+
# Terapkan perbaikan
|
| 313 |
+
df["sentiment"] = df.apply(correct_label, axis=1)
|
| 314 |
+
|
| 315 |
+
# Ringkasan distribusi setelah perbaikan
|
| 316 |
+
print("\nDistribusi label setelah perbaikan:")
|
| 317 |
+
print(df["sentiment"].value_counts())
|
| 318 |
+
|
| 319 |
+
# Pisahkan tiap kelas
|
| 320 |
+
# ============================
# SPLIT DATASET (train/val/test)
# ============================
# The split is done BEFORE oversampling. Oversampling draws rows with
# replacement, so balancing the whole frame first would put copies of the
# same text into train and into val/test and make the reported scores look
# better than they are. Validation and test keep the natural distribution.
X = df["text"].values
y = df["sentiment"].values

# 1. 80% train, 20% held out
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Held-out 20% -> 10% validation + 10% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# ============================
# BALANCE THE TRAINING SPLIT ONLY
# ============================
df_train = pd.DataFrame({"text": X_train, "sentiment": y_train})

# Separate the classes
df_negatif = df_train[df_train["sentiment"] == "negatif"]
df_positif = df_train[df_train["sentiment"] == "positif"]
df_netral = df_train[df_train["sentiment"] == "netral"]

# Target size per class: the size of the neutral class
target_count = df_netral.shape[0]

# Resample positive & negative (with replacement) to the target size
df_negatif_over = resample(df_negatif,
                           replace=True,
                           n_samples=target_count,
                           random_state=42)

df_positif_over = resample(df_positif,
                           replace=True,
                           n_samples=target_count,
                           random_state=42)

# Recombine into the balanced training set
df_balanced = pd.concat([df_netral, df_negatif_over, df_positif_over])

print("Distribusi setelah balancing:")
print(df_balanced["sentiment"].value_counts())

X_train = df_balanced["text"].values
y_train = df_balanced["sentiment"].values

# ============================
# LABEL DISTRIBUTION PLOTS
# ============================

# Fixed label order so each colour always belongs to the same label
# (value_counts() order is not meaningful when the counts are tied).
# presumably red=negatif, green=positif, blue=netral was intended — confirm.
label_counts = df_balanced["sentiment"].value_counts().reindex(
    ["negatif", "positif", "netral"], fill_value=0
)

# -------- Bar chart --------
plt.figure(figsize=(6,4))
label_counts.plot(kind="bar", color=["red","green","blue"])
plt.title("Distribusi Sentimen")
plt.xlabel("Label")
plt.ylabel("Jumlah")
plt.xticks(rotation=0)
plt.show()

print('\n')

# -------- Pie chart --------
plt.figure(figsize=(5,5))
label_counts.plot(kind="pie", autopct='%1.1f%%', startangle=90, colors=["red","green","blue"])
plt.title("Persentase Sentimen")
plt.ylabel("")  # hide the Y label
plt.show()

# Report the split sizes (train is the size after balancing)
print("Ukuran dataset:")
print(f"Train: {len(X_train)}")
print(f"Validation: {len(X_val)}")
print(f"Test: {len(X_test)}")
|
| 392 |
+
|
| 393 |
+
# ============================
|
| 394 |
+
# FINE-TUNING IndoBERT
|
| 395 |
+
# ============================
|
| 396 |
+
|
| 397 |
+
# pastikan pakai GPU kalau tersedia
|
| 398 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 399 |
+
print("Device:", device)
|
| 400 |
+
|
| 401 |
+
# SECURITY: never hard-code the Weights & Biases API key in source control.
# The key that used to be committed on this line must be treated as leaked
# and revoked. Provide WANDB_API_KEY through the environment instead (for
# example a Kaggle secret exported before this script runs).
# When no key is available, W&B reporting is disabled so training does not
# stop at an interactive login prompt.
os.environ["WANDB_DISABLED"] = "false" if os.environ.get("WANDB_API_KEY") else "true"
|
| 403 |
+
|
| 404 |
+
# ----------------------------
|
| 405 |
+
# 1. Tokenizer & Label Encoding
|
| 406 |
+
# ----------------------------
|
| 407 |
+
MODEL_NAME = "indobenchmark/indobert-base-p1" # model IndoBERT pre-trained
|
| 408 |
+
|
| 409 |
+
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
|
| 410 |
+
|
| 411 |
+
# mapping label ke angka
|
| 412 |
+
label2id = {"negatif": 0, "netral": 1, "positif": 2}
# Inverse mapping: class id -> label name.
id2label = {class_id: name for name, class_id in label2id.items()}


def encode_labels(labels):
    """Return the list of class ids for ``labels`` (KeyError on an unknown label)."""
    encoded = []
    for name in labels:
        encoded.append(label2id[name])
    return encoded
|
| 417 |
+
|
| 418 |
+
y_train_enc = encode_labels(y_train)
|
| 419 |
+
y_val_enc = encode_labels(y_val)
|
| 420 |
+
y_test_enc = encode_labels(y_test)
|
| 421 |
+
|
| 422 |
+
# ----------------------------
|
| 423 |
+
# 2. Dataset class
|
| 424 |
+
# ----------------------------
|
| 425 |
+
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label id.

    Args:
        texts: indexable collection of texts (each is passed through ``str``).
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` whose result provides
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch axis of size 1.
        max_len: length every item is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # squeeze(0) drops only the batch axis. A bare squeeze() would also
            # drop the sequence axis when max_len == 1 and return a 0-d tensor.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }
|
| 452 |
+
|
| 453 |
+
train_dataset = SentimentDataset(X_train, y_train_enc, tokenizer)
|
| 454 |
+
val_dataset = SentimentDataset(X_val, y_val_enc, tokenizer)
|
| 455 |
+
test_dataset = SentimentDataset(X_test, y_test_enc, tokenizer)
|
| 456 |
+
|
| 457 |
+
# ----------------------------
|
| 458 |
+
# 3. Model
|
| 459 |
+
# ----------------------------
|
| 460 |
+
model = BertForSequenceClassification.from_pretrained(
|
| 461 |
+
MODEL_NAME,
|
| 462 |
+
num_labels=3,
|
| 463 |
+
id2label=id2label,
|
| 464 |
+
label2id=label2id
|
| 465 |
+
).to(device)
|
| 466 |
+
|
| 467 |
+
# ----------------------------
|
| 468 |
+
# 4. Training Arguments
|
| 469 |
+
# ----------------------------
|
| 470 |
+
training_args = TrainingArguments(
|
| 471 |
+
output_dir="./results",
|
| 472 |
+
per_device_train_batch_size=32,
|
| 473 |
+
per_device_eval_batch_size=32,
|
| 474 |
+
num_train_epochs=5, # cukup 10β15, early stopping yang handle
|
| 475 |
+
learning_rate=2e-5, # lebih kecil β stabil
|
| 476 |
+
weight_decay=0.05, # lebih besar β regularisasi
|
| 477 |
+
warmup_ratio=0.1, # 10% step awal dipakai warmup
|
| 478 |
+
logging_dir="./logs",
|
| 479 |
+
logging_steps=500,
|
| 480 |
+
save_total_limit=2,
|
| 481 |
+
eval_strategy="epoch", # evaluasi setiap epoch
|
| 482 |
+
save_strategy="epoch", # simpan juga setiap epoch
|
| 483 |
+
load_best_model_at_end=True,
|
| 484 |
+
metric_for_best_model="f1",
|
| 485 |
+
greater_is_better=True
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
# ----------------------------
|
| 490 |
+
# 5. Metrics
|
| 491 |
+
# ----------------------------
|
| 492 |
+
|
| 493 |
+
metric_acc = evaluate.load("accuracy")
|
| 494 |
+
metric_f1 = evaluate.load("f1")
|
| 495 |
+
|
| 496 |
+
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = metric_acc.compute(predictions=predicted_ids, references=labels)
    f1_result = metric_f1.compute(predictions=predicted_ids, references=labels, average="weighted")

    scores = {}
    scores["accuracy"] = accuracy_result["accuracy"]
    scores["f1"] = f1_result["f1"]
    return scores
|
| 502 |
+
|
| 503 |
+
# ----------------------------
|
| 504 |
+
# 6. Trainer
|
| 505 |
+
# ----------------------------
|
| 506 |
+
trainer = Trainer(
|
| 507 |
+
model=model,
|
| 508 |
+
args=training_args,
|
| 509 |
+
train_dataset=train_dataset,
|
| 510 |
+
eval_dataset=val_dataset,
|
| 511 |
+
compute_metrics=compute_metrics,
|
| 512 |
+
callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] # stop kalau 2 epoch tidak membaik
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
# ----------------------------
|
| 516 |
+
# 7. Mulai Training
|
| 517 |
+
# ----------------------------
|
| 518 |
+
trainer.train()
|
| 519 |
+
|
| 520 |
+
# =============================
|
| 521 |
+
# 8. Evaluasi & Simpan Prediksi
|
| 522 |
+
# =============================
|
| 523 |
+
|
| 524 |
+
# hasil prediksi di test set
|
| 525 |
+
pred_results = trainer.predict(test_dataset)
|
| 526 |
+
|
| 527 |
+
# ambil logits β konversi ke label prediksi
|
| 528 |
+
pred_logits = pred_results.predictions
|
| 529 |
+
pred_labels = np.argmax(pred_logits, axis=1)
|
| 530 |
+
|
| 531 |
+
# konversi angka ke label teks
|
| 532 |
+
pred_text_labels = [id2label[i] for i in pred_labels]
|
| 533 |
+
true_text_labels = [id2label[i] for i in y_test_enc]
|
| 534 |
+
|
| 535 |
+
# gabungkan dengan teks asli
|
| 536 |
+
df_test_results = pd.DataFrame({
|
| 537 |
+
"text": X_test,
|
| 538 |
+
"true_label": true_text_labels,
|
| 539 |
+
"predicted_label": pred_text_labels
|
| 540 |
+
})
|
| 541 |
+
|
| 542 |
+
# simpan ke CSV
|
| 543 |
+
df_test_results.to_csv("test_predictions.csv", index=False)
|
| 544 |
+
print("β
Hasil prediksi test set sudah disimpan ke test_predictions.csv")
|
| 545 |
+
|
| 546 |
+
# ============================
|
| 547 |
+
# EVALUASI & SIMPAN MODEL
|
| 548 |
+
# ============================
|
| 549 |
+
|
| 550 |
+
# 1. Evaluasi di test set
|
| 551 |
+
print("\nEvaluasi di Test Set:")
|
| 552 |
+
test_result = trainer.evaluate(test_dataset)
|
| 553 |
+
print(test_result)
|
| 554 |
+
|
| 555 |
+
# 2. Prediksi label test set (opsional, untuk analisis lebih lanjut)
|
| 556 |
+
predictions = trainer.predict(test_dataset)
|
| 557 |
+
pred_labels = np.argmax(predictions.predictions, axis=-1)
|
| 558 |
+
|
| 559 |
+
# contoh lihat 10 prediksi pertama
|
| 560 |
+
for i in range(10):
|
| 561 |
+
print(f"Teks: {X_test[i]}")
|
| 562 |
+
print(f"Label Asli: {id2label[y_test_enc[i]]} | Prediksi: {id2label[pred_labels[i]]}")
|
| 563 |
+
print("---")
|
| 564 |
+
|
| 565 |
+
# 3. Simpan model + tokenizer
|
| 566 |
+
SAVE_DIR = "./indoBERT-sentiment"
|
| 567 |
+
|
| 568 |
+
trainer.save_model(SAVE_DIR)
|
| 569 |
+
tokenizer.save_pretrained(SAVE_DIR)
|
| 570 |
+
|
| 571 |
+
print(f"\nModel & tokenizer sudah disimpan ke: {SAVE_DIR}")
|
| 572 |
+
|
| 573 |
+
# ==========================
|
| 574 |
+
# LOAD MODEL & TOKENIZER
|
| 575 |
+
# ==========================
|
| 576 |
+
MODEL_DIR = "./indoBERT-sentiment"
|
| 577 |
+
|
| 578 |
+
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
|
| 579 |
+
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
|
| 580 |
+
|
| 581 |
+
device = 0 if torch.cuda.is_available() else -1
|
| 582 |
+
sentiment_pipeline = pipeline(
|
| 583 |
+
"text-classification",
|
| 584 |
+
model=model,
|
| 585 |
+
tokenizer=tokenizer,
|
| 586 |
+
device=device
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
+
# load preprocessor yang sudah disimpan
|
| 590 |
+
preproc = joblib.load("preprocessor.joblib")
|
| 591 |
+
|
| 592 |
+
# ==========================
|
| 593 |
+
# FUNGSI PREDIKSI
|
| 594 |
+
# ==========================
|
| 595 |
+
def predict_text(text):
    """Return the label predicted by ``sentiment_pipeline`` for ``text``.

    Anything that is not a ``str``, or a string containing only whitespace,
    yields the marker ``"EMPTY"`` without calling the pipeline. Inputs are
    truncated to 512 tokens.
    """
    if isinstance(text, str) and text.strip() != "":
        outputs = sentiment_pipeline(text, truncation=True, max_length=512)
        top_prediction = outputs[0]
        return top_prediction["label"]
    return "EMPTY"
|
| 600 |
+
|
| 601 |
+
# ==========================
|
| 602 |
+
# PREDIKSI FILE 1 (MEDIA SOSIAL)
|
| 603 |
+
# ==========================
|
| 604 |
+
file1 = pd.read_csv("/kaggle/input/sentimentdataset/gabungan (1).csv")
|
| 605 |
+
|
| 606 |
+
# Preprocessing caption
|
| 607 |
+
file1["caption"] = preproc.transform(file1["caption"])
|
| 608 |
+
|
| 609 |
+
# Preprocessing comment
|
| 610 |
+
file1["comments"] = preproc.transform(file1["comments"])
|
| 611 |
+
|
| 612 |
+
# drop NaN biar aman
|
| 613 |
+
file1 = file1.dropna(subset=["caption", "comments"])
|
| 614 |
+
|
| 615 |
+
outputs1 = []
|
| 616 |
+
|
| 617 |
+
for idx, row in file1.iterrows():
|
| 618 |
+
print(f"[File1] Proses baris {idx+1}/{len(file1)}")
|
| 619 |
+
|
| 620 |
+
# caption
|
| 621 |
+
caption_text = str(row["caption"]).strip()
|
| 622 |
+
caption_pred = predict_text(caption_text)
|
| 623 |
+
|
| 624 |
+
# comments
|
| 625 |
+
comments_text = str(row["comments"]).strip()
|
| 626 |
+
comments_pred_label = predict_text(comments_text)
|
| 627 |
+
|
| 628 |
+
outputs1.append({
|
| 629 |
+
"link": row.get("link", ""), # simpan link medsos
|
| 630 |
+
"caption": caption_text,
|
| 631 |
+
"caption_pred": caption_pred,
|
| 632 |
+
"comments_pred": comments_text, # simpan teks asli komentar
|
| 633 |
+
"comments_summary": comments_pred_label # hasil prediksi sentimen komentar
|
| 634 |
+
})
|
| 635 |
+
|
| 636 |
+
df_out1 = pd.DataFrame(outputs1)
|
| 637 |
+
df_out1.to_csv("medsos.csv", index=False, encoding="utf-8-sig")
|
| 638 |
+
print("β
Hasil prediksi file1 sudah disimpan ke medsos.csv")
|
| 639 |
+
|
| 640 |
+
# ==========================
|
| 641 |
+
# PREDIKSI FILE 2 (BERITA)
|
| 642 |
+
# ==========================
|
| 643 |
+
file2 = pd.read_csv("/kaggle/input/sentimentdataset/berita2 (1).csv")
|
| 644 |
+
|
| 645 |
+
# Preprocessing judul
|
| 646 |
+
file2["judul"] = preproc.transform(file2["judul"])
|
| 647 |
+
|
| 648 |
+
# Preprocessing tag (β
perbaikan: tidak menimpa judul)
|
| 649 |
+
file2["tag"] = preproc.transform(file2["tag"])
|
| 650 |
+
|
| 651 |
+
# Preprocessing isi_berita
|
| 652 |
+
file2["isi_berita"] = preproc.transform(file2["isi_berita"])
|
| 653 |
+
|
| 654 |
+
# drop NaN biar aman
|
| 655 |
+
file2 = file2.dropna(subset=["judul", "tag", "isi_berita"])
|
| 656 |
+
|
| 657 |
+
outputs2 = []
|
| 658 |
+
|
| 659 |
+
for idx, row in file2.iterrows():
|
| 660 |
+
print(f"[File2] Proses baris {idx+1}/{len(file2)}")
|
| 661 |
+
|
| 662 |
+
combined_text = f"{row['judul']} {row['tag']} {row['isi_berita']}"
|
| 663 |
+
pred = predict_text(combined_text)
|
| 664 |
+
|
| 665 |
+
outputs2.append({
|
| 666 |
+
"link": row.get("link", ""), # simpan link berita
|
| 667 |
+
"judul": row["judul"],
|
| 668 |
+
"tag": row["tag"],
|
| 669 |
+
"isi_berita": row["isi_berita"],
|
| 670 |
+
"prediction": pred
|
| 671 |
+
})
|
| 672 |
+
|
| 673 |
+
df_out2 = pd.DataFrame(outputs2)
|
| 674 |
+
df_out2.to_csv("berita.csv", index=False, encoding="utf-8-sig")
|
| 675 |
+
print("β
Hasil prediksi file2 sudah disimpan ke berita.csv")
|
services/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Empty init file to make 'services' a proper Python package
|
services/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (158 Bytes). View file
|
|
|
services/__pycache__/facebook.cpython-311.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
services/__pycache__/medos.cpython-311.pyc
ADDED
|
Binary file (15.3 kB). View file
|
|
|
services/__pycache__/news.cpython-311.pyc
ADDED
|
Binary file (22.8 kB). View file
|
|
|
services/__pycache__/preprocessing.cpython-311.pyc
ADDED
|
Binary file (5.7 kB). View file
|
|
|
services/__pycache__/sentiment.cpython-311.pyc
ADDED
|
Binary file (5.12 kB). View file
|
|
|
services/__pycache__/tiktok.cpython-311.pyc
ADDED
|
Binary file (11.1 kB). View file
|
|
|
services/__pycache__/wordcloud_service.cpython-311.pyc
ADDED
|
Binary file (5.59 kB). View file
|
|
|
services/_driver.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
_driver.py β Shared Selenium Chrome driver factory.
|
| 3 |
+
All scrapers import _create_driver() from here so that Docker env-vars
|
| 4 |
+
(CHROME_BIN, CHROMEDRIVER_PATH) are respected in one place.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
from selenium import webdriver
|
| 11 |
+
from selenium.webdriver.chrome.service import Service
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _create_driver(mobile: bool = False) -> webdriver.Chrome:
    """
    Build a headless Chrome/Chromium WebDriver configured for container use.

    Args:
        mobile: when true an iPhone Safari User-Agent is sent, otherwise a
            desktop Linux Chrome User-Agent.

    Environment variables read:
        CHROME_BIN         - path to the browser binary (default: /usr/bin/chromium)
        CHROMEDRIVER_PATH  - path to chromedriver (default: /usr/bin/chromedriver)
    """
    browser_path = os.environ.get("CHROME_BIN", "/usr/bin/chromium")
    chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "/usr/bin/chromedriver")

    options = webdriver.ChromeOptions()
    options.binary_location = browser_path

    # -- Headless & sandbox flags (added in this order) --
    launch_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-software-rasterizer",
        "--disable-extensions",
        "--disable-infobars",
        "--disable-notifications",
        "--disable-popup-blocking",
        "--disable-blink-features=AutomationControlled",
        "--ignore-certificate-errors",
        "--window-size=1920,1080",
        "--remote-debugging-port=0",  # avoid port conflicts
    )
    for flag in launch_flags:
        options.add_argument(flag)

    # -- User-Agent --
    if mobile:
        user_agent_flag = (
            "--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1"
        )
    else:
        user_agent_flag = (
            "--user-agent=Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        )
    options.add_argument(user_agent_flag)

    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=options,
    )

    # Make navigator.webdriver read as undefined on every new document.
    stealth_script = "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": stealth_script},
    )
    return driver
|
services/facebook.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
facebook.py β Facebook group scraper using Selenium.
|
| 3 |
+
Exports: scrape_facebook(username, password, groups) -> list[dict]
|
| 4 |
+
|
| 5 |
+
Returns structured data per-post:
|
| 6 |
+
group_name, group_url, post_url, author, caption, comments
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import time
|
| 13 |
+
|
| 14 |
+
from selenium.webdriver.common.by import By
|
| 15 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 16 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 17 |
+
|
| 18 |
+
from ._driver import _create_driver
|
| 19 |
+
|
| 20 |
+
COOKIES_FILE = "fb_cookies.json"
|
| 21 |
+
FB_BASE = "https://www.facebook.com"
|
| 22 |
+
MOBILE_FB = "https://m.facebook.com"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ββ Cookie helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
def _save_cookies(driver, path: str) -> None:
    """Write the cookies reported by ``driver.get_cookies()`` to ``path`` as JSON.

    Best effort: any failure is printed and swallowed, nothing is raised.
    """
    try:
        with open(path, "w") as cookie_file:
            current_cookies = driver.get_cookies()
            json.dump(current_cookies, cookie_file)
    except Exception as err:
        print(f"[Facebook] Gagal simpan cookies: {err}")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _load_cookies(driver, path: str) -> bool:
    """Add the cookies stored as JSON in ``path`` to ``driver``.

    Returns ``False`` when the file is missing or empty, or when reading or
    parsing it fails (the error is printed). Returns ``True`` otherwise, even
    if individual cookies were rejected by ``driver.add_cookie``.
    """
    has_content = os.path.exists(path) and os.path.getsize(path) != 0
    if not has_content:
        return False

    try:
        with open(path, "r") as cookie_file:
            stored_cookies = json.load(cookie_file)
        for entry in stored_cookies:
            try:
                driver.add_cookie(entry)
            except Exception:
                # Best effort: a cookie the driver rejects is skipped.
                continue
    except Exception as err:
        print(f"[Facebook] Gagal load cookies: {err}")
        return False
    return True
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# ββ Login ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
|
| 54 |
+
def _fb_login(driver, username: str, password: str) -> bool:
    """Log the browser session into Facebook.

    Strategy:
      1. Open the mobile site and try the cookies cached in ``COOKIES_FILE``.
      2. Otherwise submit the mobile login form with ``username``/``password``.
      3. If the mobile form cannot be driven, fall back to the desktop
         ``/login.php`` form.

    On success the session cookies are written back to ``COOKIES_FILE``.

    Returns:
        True when the browser ends up on a URL that contains neither
        "login" nor "checkpoint"; False otherwise.
    """

    def _blocked() -> bool:
        # Facebook keeps unauthenticated or challenged sessions on such URLs.
        url = driver.current_url
        return "login" in url or "checkpoint" in url

    wait = WebDriverWait(driver, 20)
    driver.get(MOBILE_FB)
    time.sleep(3)

    # 1. Saved cookies.
    if os.path.exists(COOKIES_FILE):
        try:
            # Only trust the refreshed page when cookies were really loaded.
            if _load_cookies(driver, COOKIES_FILE):
                driver.refresh()
                time.sleep(4)
                if not _blocked():
                    print("[Facebook] Login via cookies berhasil.")
                    return True
            # Stale or unusable cookies: start from a clean session.
            driver.delete_all_cookies()
            driver.get(MOBILE_FB)
            time.sleep(2)
        except Exception as e:
            print(f"[Facebook] Login via cookies gagal: {e}")

    # 2. Mobile login form.
    print("[Facebook] Login manual username/password...")
    try:
        email_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[name="email"]'))
        )
        pass_input = driver.find_element(By.CSS_SELECTOR, 'input[name="pass"]')
        email_input.clear()
        email_input.send_keys(username)
        pass_input.clear()
        pass_input.send_keys(password)
        pass_input.send_keys("\n")
        time.sleep(1)

        # Some form variants ignore ENTER; click the button if it is still there.
        try:
            login_btn = driver.find_element(
                By.CSS_SELECTOR,
                'button[name="login"], [data-sigil="m_login_button"], input[type="submit"]',
            )
            driver.execute_script("arguments[0].click();", login_btn)
        except Exception:
            # Expected once ENTER has already navigated away from the form.
            pass
    except Exception as e:
        # 3. Desktop login form.
        print(f"[Facebook] Form login mobile gagal ({e}), coba halaman desktop.")
        try:
            driver.get(f"{FB_BASE}/login.php")
            time.sleep(3)
            email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
            pass_input = driver.find_element(By.ID, "pass")
            email_input.clear()
            email_input.send_keys(username)
            pass_input.clear()
            pass_input.send_keys(password)
            driver.find_element(By.NAME, "login").click()
        except Exception as e2:
            print(f"[Facebook] Login manual gagal: {e2}")
            return False

    time.sleep(6)
    if _blocked():
        print("[Facebook] Login gagal: masih di halaman login/checkpoint.")
        return False

    _save_cookies(driver, COOKIES_FILE)
    return True
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def ensure_logged_in(driver, username, password):
    """Re-run the login flow when the current page shows a login wall.

    A login wall is detected when the current URL contains "login", or when
    a "See more on Facebook" prompt or a visible e-mail/text input is present.
    Never raises: probing errors are reported on stdout.

    NOTE(review): ``_fb_login`` navigates to the Facebook home page, so after
    a re-login the browser is no longer on the page the caller was scraping.
    """
    wall_markers = (
        '//div[contains(text(),"See more on Facebook")]',
        '//input[@type="email" or @type="text"]',
    )
    try:
        url = driver.current_url
        if url and "login" in url:
            _fb_login(driver, username, password)
            return

        for xpath in wall_markers:
            try:
                visible = driver.find_element(By.XPATH, xpath).is_displayed()
            except Exception:
                # Marker absent (or stale) means no wall of this kind.
                continue
            if visible:
                _fb_login(driver, username, password)
                return
    except Exception as e:
        # ``except Exception`` (not a bare except) lets Ctrl-C through.
        print(f"[Facebook] Gagal memeriksa status login: {e}")
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ── Scraping ──────────────────────────────────────────────────────────────────
|
| 135 |
+
|
| 136 |
+
def _find_post_permalink(post, group_url: str):
    """Return the post's own URL (query string stripped), or None if the feed element exposes none."""
    for fragment in ("/posts/", "/permalink/"):
        try:
            link_el = post.find_element(By.XPATH, f".//a[contains(@href,'{fragment}')]")
            href = link_el.get_attribute("href")
        except Exception:
            continue
        if href:
            return href.split("?")[0]

    # Fall back to the post id embedded in the ``data-ft`` attribute.
    try:
        raw = post.get_attribute("data-ft")
        if raw and "top_level_post_id" in raw:
            pid = json.loads(raw).get("top_level_post_id")
            if pid:
                # Strip the query before the trailing slash so no "//" is produced.
                return f"{group_url.split('?')[0].rstrip('/')}/posts/{pid}/"
    except Exception:
        pass
    return None


def _find_post_author(context) -> str:
    """Return the first non-empty author name found in ``context``, else "Unknown"."""
    candidates = (
        ".//h2//span//span",
        ".//strong//span",
        ".//span[contains(@class,'x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1nxh6w3 x1sibtaa x1s688f xi81zsa')]",
    )
    for xpath in candidates:
        try:
            name = context.find_element(By.XPATH, xpath).text.strip()
        except Exception:
            continue
        if name:
            return name
    return "Unknown"


def _click_until_gone(driver, context, xpath: str, max_clicks: int = 50) -> None:
    """Click the element matching ``xpath`` until it disappears or ``max_clicks`` is reached."""
    # The bound prevents an endless loop when a click does not remove the button.
    for _ in range(max_clicks):
        try:
            btn = context.find_element(By.XPATH, xpath)
            driver.execute_script("arguments[0].click();", btn)
        except Exception:
            break
        time.sleep(2)


def _read_caption(context) -> str:
    """Return the post text (at most 2000 characters), or "" when unavailable."""
    try:
        blocks = context.find_elements(
            By.XPATH, ".//div[@data-ad-rendering-role='story_message']//div[@dir='auto']"
        )
        lines = [b.text.strip() for b in blocks]
        return "\n".join(line for line in lines if line)[:2000]
    except Exception:
        return ""


def _read_comments(context) -> list:
    """Return the distinct, non-empty comment texts in page order."""
    comments: list = []
    seen: set = set()
    try:
        blocks = context.find_elements(
            By.XPATH, ".//div[@aria-label='Komentar' or @aria-label='Comment']//div[@dir='auto']"
        )
        for block in blocks:
            text = block.text.strip()
            if text and text not in seen:
                seen.add(text)
                comments.append(text)
    except Exception:
        # Keep whatever was read before the page changed underneath us.
        pass
    return comments


def _close_extra_tabs(driver, main_handle) -> None:
    """Close every tab except ``main_handle`` and give it the focus back."""
    try:
        for handle in list(driver.window_handles):
            if handle != main_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_handle)
    except Exception as e:
        print(f"[Facebook] Gagal menutup tab: {e}")


def _scrape_feed_post(driver, username, password, post, group_url: str, seen_links: set):
    """Scrape one feed element; return its data dict, or None when there is nothing new to record."""
    driver.execute_script("arguments[0].scrollIntoView(true);", post)
    time.sleep(1)

    permalink = _find_post_permalink(post, group_url)
    if permalink is not None:
        if permalink in seen_links:
            return None  # already handled on an earlier scroll
        seen_links.add(permalink)
        try:
            # The URL is passed as an argument, never spliced into the script.
            driver.execute_script("window.open(arguments[0], '_blank');", permalink)
            time.sleep(1)
            driver.switch_to.window(driver.window_handles[-1])
            time.sleep(3)
            ensure_logged_in(driver, username, password)
            context = driver.find_element(By.XPATH, "//div[@role='article']")
        except Exception:
            return None
    else:
        # No permalink: read the post in place instead of opening the group
        # page again, which would only ever yield the group's first post.
        context = post

    author = _find_post_author(context)
    _click_until_gone(
        driver,
        context,
        ".//span[contains(text(),'Lihat komentar') or contains(text(),'View more comments')]",
    )
    _click_until_gone(
        driver,
        context,
        ".//span[contains(text(),'Lihat') and contains(text(),'balasan')] | "
        ".//span[contains(text(),'View') and contains(text(),'replies')]",
    )
    caption = _read_caption(context)
    comments = _read_comments(context)
    if not (caption or comments):
        return None
    return {
        "post_url": permalink or group_url,
        "author": author,
        "caption": caption,
        "comments": comments,
    }


def _scrape_group(driver, username, password, group_url: str, max_scrolls: int = 5) -> list:
    """Scrape the posts of one Facebook group.

    The group feed is scrolled up to ``max_scrolls`` times (stopping early
    when the page height no longer grows). Each post with a permalink is
    opened in a temporary tab to read its author, caption and comments;
    every post is recorded at most once.

    Args:
        driver: Selenium WebDriver with an authenticated Facebook session.
        username: Facebook login, used if a login wall appears mid-scrape.
        password: Facebook password, used together with ``username``.
        group_url: Group URL; mobile/web hosts are rewritten to ``www``.
        max_scrolls: Maximum number of feed scrolls.

    Returns:
        List of dicts with keys ``group_name``, ``group_url``, ``post_url``,
        ``author``, ``caption`` and ``comments`` (list of str). Empty when
        the group page cannot be opened.
    """
    posts: list = []
    seen_links: set = set()
    seen_records: set = set()

    group_url = group_url.replace("m.facebook.com", "www.facebook.com").replace("web.facebook.com", "www.facebook.com")
    group_name = group_url.rstrip("/").split("/")[-1]
    print(f"[Facebook] Scraping grup: {group_url}")

    try:
        driver.get(group_url)
        time.sleep(6)
        ensure_logged_in(driver, username, password)
        main_handle = driver.current_window_handle
    except Exception as e:
        print(f"[Facebook] Gagal buka grup: {e}")
        return posts

    last_height = driver.execute_script("return document.body.scrollHeight")

    for scroll_n in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        ensure_logged_in(driver, username, password)

        post_elements = driver.find_elements(By.XPATH, '//div[@role="article"]')
        print(f"[Facebook] Scroll {scroll_n + 1} — {len(post_elements)} artikel ditemukan")

        for post in post_elements:
            record = None
            try:
                record = _scrape_feed_post(driver, username, password, post, group_url, seen_links)
            except Exception as e:
                print(f"[Facebook] Error baca post: {e}")
            finally:
                # Always return to the feed tab, even after a failure.
                _close_extra_tabs(driver, main_handle)

            if record is None:
                continue
            key = (record["post_url"], record["author"], record["caption"])
            if key in seen_records:
                continue
            seen_records.add(key)
            posts.append({
                "group_name": group_name,
                "group_url": group_url,
                "post_url": record["post_url"],
                "author": record["author"],
                "caption": record["caption"],
                "comments": record["comments"],
            })

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    return posts
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 272 |
+
|
| 273 |
+
def scrape_facebook(username: str, password: str, groups: list | None = None) -> list:
    """Log into Facebook and scrape the posts of the given groups.

    Args:
        username: Facebook login (e-mail or phone).
        password: Facebook password.
        groups: Group URLs. A single URL string is also accepted; blank
            entries, non-string entries and repeated URLs are ignored.

    Returns:
        List of per-post dicts as produced by ``_scrape_group``. Empty when
        credentials or usable group URLs are missing, or when login fails.
    """
    if not username or not password:
        print("[Facebook] Username/password tidak disediakan.")
        return []

    if isinstance(groups, str):
        groups = [groups]
    # Normalise before starting a browser so bad input costs nothing.
    urls = list(dict.fromkeys(
        g.strip() for g in (groups or []) if isinstance(g, str) and g.strip()
    ))
    if not urls:
        print("[Facebook] Tidak ada URL grup yang disediakan — skip.")
        return []

    driver = _create_driver(mobile=False)
    all_data: list = []

    try:
        if not _fb_login(driver, username, password):
            print("[Facebook] Login gagal, scraping dibatalkan.")
            return []

        for group_url in urls:
            all_data.extend(_scrape_group(driver, username, password, group_url))

    except Exception as e:
        # Top-level boundary: report and return whatever was collected.
        print(f"[Facebook] Fatal error: {e}")
    finally:
        try:
            driver.quit()
        except Exception:
            pass

    print(f"[Facebook] Total article posts dari Facebook: {len(all_data)}")
    return all_data
|
services/medos.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
medos.py — Instagram scraper using Selenium.
|
| 3 |
+
Exports: scrape_medos(username, password, target_account, mode) -> list[str]
|
| 4 |
+
|
| 5 |
+
Strategy:
|
| 6 |
+
1. Try saved cookies first (faster, avoids login throttling).
|
| 7 |
+
2. Fall back to username/password login via mobile IG version.
|
| 8 |
+
3. Collect post links from profile / hashtag page.
|
| 9 |
+
4. Scrape caption + visible comments from each post.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import time
|
| 16 |
+
from datetime import datetime, timedelta
|
| 17 |
+
|
| 18 |
+
from selenium.webdriver.common.by import By
|
| 19 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 20 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 21 |
+
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
| 22 |
+
|
| 23 |
+
from ._driver import _create_driver
|
| 24 |
+
|
| 25 |
+
IG_BASE = "https://www.instagram.com/"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Cookie helpers ────────────────────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
def _save_cookies(driver, path: str) -> None:
|
| 31 |
+
try:
|
| 32 |
+
with open(path, "w", encoding="utf-8") as f:
|
| 33 |
+
json.dump(driver.get_cookies(), f, ensure_ascii=False, indent=2)
|
| 34 |
+
except Exception as e:
|
| 35 |
+
print(f"[Medos] Gagal simpan cookies: {e}")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _load_cookies(driver, path: str) -> bool:
|
| 39 |
+
if not os.path.exists(path) or os.path.getsize(path) == 0:
|
| 40 |
+
return False
|
| 41 |
+
try:
|
| 42 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 43 |
+
cookies = json.load(f)
|
| 44 |
+
driver.get(IG_BASE)
|
| 45 |
+
time.sleep(2)
|
| 46 |
+
driver.delete_all_cookies()
|
| 47 |
+
for c in cookies:
|
| 48 |
+
allowed = {k: c[k] for k in c.keys() & {"name", "value", "domain", "path", "secure", "httpOnly", "expiry"}}
|
| 49 |
+
if "expiry" in allowed and isinstance(allowed["expiry"], float):
|
| 50 |
+
allowed["expiry"] = int(allowed["expiry"])
|
| 51 |
+
try:
|
| 52 |
+
driver.add_cookie(allowed)
|
| 53 |
+
except Exception:
|
| 54 |
+
allowed.pop("domain", None)
|
| 55 |
+
try:
|
| 56 |
+
driver.add_cookie(allowed)
|
| 57 |
+
except Exception:
|
| 58 |
+
pass
|
| 59 |
+
return True
|
| 60 |
+
except Exception as e:
|
| 61 |
+
print(f"[Medos] Gagal load cookies: {e}")
|
| 62 |
+
return False
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _is_logged_in(driver) -> bool:
|
| 66 |
+
"""Check if the session has a valid sessionid cookie on instagram."""
|
| 67 |
+
return any(c.get("name") == "sessionid" for c in driver.get_cookies())
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ── Login ─────────────────────────────────────────────────────────────────────
|
| 71 |
+
|
| 72 |
+
def _login(driver, username: str, password: str, cookies_file: str) -> bool:
    """Log the browser session into Instagram.

    Steps:
      1. Try the cookies stored in ``cookies_file``.
      2. Otherwise submit the login form with ``username``/``password``.
      3. Dismiss the "Not Now" prompts, confirm a ``sessionid`` cookie exists
         and save the cookies back to ``cookies_file``.

    Returns:
        True when the session is authenticated; False when the login page
        does not load, the form fields are missing, the redirect times out,
        or no session cookie is present afterwards.
    """
    # 1. Try saved cookies.
    if _load_cookies(driver, cookies_file):
        driver.get(IG_BASE)
        time.sleep(3)
        if _is_logged_in(driver):
            print("[Medos] Login via cookies OK.")
            return True
        print("[Medos] Cookies kadaluarsa, coba login manual.")

    # 2. Username/password login.
    login_url = f"{IG_BASE}accounts/login/"
    driver.get(login_url)
    print("[Medos] Membuka halaman login Instagram…")

    try:
        # Wait for the username OR the e-mail field.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='username'], input[name='email']"))
        )
    except TimeoutException:
        print("[Medos] Halaman login tidak termuat.")
        # Keep the page source and a screenshot for later diagnosis.
        try:
            with open("/app/static/output/ig_login_error.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot("/app/static/output/ig_login_error.png")
            print("[Medos] Log error HTML dan screenshot disimpan ke /app/static/output/")
        except Exception as e:
            print(f"[Medos] Gagal menyimpan log error: {e}")
        return False

    def _first_present(selectors):
        # Return the first element matching any selector, in order, or None.
        for sel in selectors:
            try:
                return driver.find_element(By.CSS_SELECTOR, sel)
            except NoSuchElementException:
                continue
        return None

    try:
        user_field = _first_present(("input[name='username']", "input[name='email']"))
        pass_field = _first_present(("input[name='password']", "input[name='pass']"))

        if not user_field or not pass_field:
            print("[Medos] Field login (username/password) tidak ditemukan.")
            return False

        user_field.clear()
        user_field.send_keys(username)
        time.sleep(0.8)
        pass_field.clear()
        pass_field.send_keys(password)
        time.sleep(0.5)

        # Submit the form by pressing ENTER inside the password field.
        pass_field.send_keys("\n")
        time.sleep(1)

        # Fallback click, only while the login page is still shown: once the
        # page has changed, the selectors could hit an unrelated control.
        # Real submit controls are tried before the generic role=button.
        if "login" in driver.current_url.lower():
            for sel in ("button[type='submit']", "input[type='submit']", "div[role='button']"):
                try:
                    submit_btn = driver.find_element(By.CSS_SELECTOR, sel)
                    driver.execute_script("arguments[0].click();", submit_btn)
                    break
                except Exception:
                    continue

        # Wait for the redirect away from the login page.
        WebDriverWait(driver, 20).until(
            lambda d: "login" not in d.current_url.lower()
        )
        print("[Medos] Login sukses.")
    except TimeoutException:
        print("[Medos] Login timeout — cek credentials atau akun ter-throttle.")
        return False
    except Exception as e:
        print(f"[Medos] Login gagal: {e}")
        return False

    # 3. Dismiss save-info / notification popups.
    for _ in range(2):
        try:
            WebDriverWait(driver, 6).until(
                EC.element_to_be_clickable((
                    By.XPATH,
                    "//button[contains(text(),'Not Now') or "
                    "contains(text(),'Bukan Sekarang') or "
                    "contains(text(),'Not now')]"
                ))
            ).click()
            time.sleep(1.5)
        except Exception:
            # No popup this round.
            pass

    # Leaving the login URL is not proof of authentication (a challenge page
    # also redirects), so apply the same check used for the cookie path.
    if not _is_logged_in(driver):
        print("[Medos] Login belum terverifikasi: cookie sessionid tidak ditemukan.")
        return False

    _save_cookies(driver, cookies_file)
    return True
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# ── Scraping helpers ──────────────────────────────────────────────────────────
|
| 175 |
+
|
| 176 |
+
def _collect_post_links(driver, target_url: str, max_scrolls: int = 5) -> list:
    """Collect post and reel links from a profile or hashtag page.

    The page is scrolled up to ``max_scrolls`` times; collection stops early
    after three consecutive scrolls that reveal no new link.

    Args:
        driver: Selenium WebDriver with an authenticated Instagram session.
        target_url: Profile or hashtag URL to open.
        max_scrolls: Maximum number of scroll rounds.

    Returns:
        Distinct links (query string stripped) in the order first seen, so
        that a caller truncating the list keeps the posts nearest the top.
    """
    print(f"[Medos] Membuka: {target_url}")
    driver.get(target_url)
    time.sleep(6)

    # A dict is used as an insertion-ordered set.
    links: dict = {}
    stall = 0

    for i in range(max_scrolls):
        prev_count = len(links)
        for el in driver.find_elements(By.CSS_SELECTOR, "a[href*='/p/'], a[href*='/reel/']"):
            try:
                href = el.get_attribute("href")
            except Exception:
                # The element was re-rendered while scrolling; skip it.
                continue
            if href:
                links.setdefault(href.split("?")[0], None)
        print(f"[Medos] Scroll {i+1}: {len(links)} link ditemukan.")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3.5)
        if len(links) == prev_count:
            stall += 1
            if stall >= 3:
                break
        else:
            stall = 0

    return list(links)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def _scrape_post(driver, link: str) -> list:
    """Open one post and return its texts: the caption (if found) followed by distinct visible comments."""
    driver.get(link)
    time.sleep(4)

    collected = []

    # Caption: the first locator that yields a usable text wins.
    caption_locators = (
        (By.XPATH, "//div[@data-testid='post-caption']"),
        (By.XPATH, "//h1"),
        (By.XPATH, "//span[contains(@class, 'x126k92a')]"),
        (By.CSS_SELECTOR, "article span[dir='auto']"),
    )
    for locator in caption_locators:
        try:
            node = WebDriverWait(driver, 3).until(EC.presence_of_element_located(locator))
            # Fall back to innerText when Selenium reports an empty text.
            caption = node.text.strip() or driver.execute_script("return arguments[0].innerText;", node)
            if caption and len(caption) > 3:
                collected.append(caption.strip())
                break
        except Exception:
            continue

    # Expand the comment list: at most five rounds, icon button first,
    # then the text button; stop as soon as neither is present.
    click_js = "arguments[0].click();"
    icon_css = "svg[aria-label='Load more comments'], svg[aria-label='Muat komentar lainnya']"
    label_xpath = "//div[@role='button']//span[contains(text(),'Load') or contains(text(),'Muat')]"
    rounds = 0
    while rounds < 5:
        rounds += 1
        try:
            driver.execute_script(click_js, driver.find_element(By.CSS_SELECTOR, icon_css))
            time.sleep(2)
            continue
        except Exception:
            pass
        try:
            driver.execute_script(click_js, driver.find_element(By.XPATH, label_xpath))
            time.sleep(2)
        except Exception:
            break

    # Visible comments, de-duplicated against the caption and each other.
    try:
        comment_xpaths = [
            "//div[contains(@class, 'x1cy8zhl')]/span",
            "//ul//li//span[@dir='auto']",
            "//div[@role='button']//span[@dir='auto']",
            "//div[contains(@class, 'x1xegmmw')]//span[@dir='auto']",
        ]
        known = set(collected)
        for xp in comment_xpaths:
            for node in driver.find_elements(By.XPATH, xp):
                try:
                    body = node.text.strip()
                except Exception:
                    continue
                if not body or len(body) <= 3 or body in known:
                    continue
                known.add(body)
                collected.append(body)
    except Exception as e:
        print(f"[Medos] Gagal ambil komentar: {e}")

    return collected
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 284 |
+
|
| 285 |
+
def scrape_medos(username: str, password: str, target_account: str, mode: str = "all", max_posts: int = 30) -> list:
    """Scrape an Instagram profile or hashtag and return the collected texts.

    Args:
        username: Instagram login.
        password: Instagram password.
        target_account: Profile name (optionally prefixed with "@"), a
            hashtag prefixed with "#", or a full http(s) URL.
        mode: 'all' | 'date'. Accepted for compatibility; this function does
            not currently use it.
        max_posts: Maximum number of posts to open.

    Returns:
        Flat list of caption and comment strings. Empty when a parameter is
        missing or blank, or when login fails.
    """
    if not username or not password or not target_account:
        print("[Medos] Parameter tidak lengkap.")
        return []

    # Resolve the target before starting a browser, so that blank input
    # ("   ", "#", "@") costs nothing and never scrapes the home page.
    account = target_account.strip()
    if account.lower().startswith(("http://", "https://")):
        target_url = account
    else:
        is_tag = account.startswith("#")
        handle = (account.lstrip("#") if is_tag else account.lstrip("@")).strip().strip("/")
        if not handle:
            print("[Medos] Parameter tidak lengkap.")
            return []
        target_url = f"{IG_BASE}explore/tags/{handle}/" if is_tag else f"{IG_BASE}{handle}/"

    # The username becomes part of a file name; keep only safe characters so
    # it cannot point outside the intended directory.
    safe_user = "".join(ch if ch.isalnum() or ch in "._-@+" else "_" for ch in username)
    cookies_file = f"/app/ig_cookies_{safe_user}.json"
    driver = _create_driver(mobile=False)
    texts_out: list = []

    try:
        if not _login(driver, username, password, cookies_file):
            print("[Medos] Login gagal, scraping dibatalkan.")
            return []

        post_links = _collect_post_links(driver, target_url, max_scrolls=5)
        print(f"[Medos] {len(post_links)} link postingan ditemukan untuk '{account}'.")

        for link in post_links[:max_posts]:
            try:
                result = _scrape_post(driver, link)
                texts_out.extend(result)
                print(f"[Medos] {link} -> {len(result)} teks")
            except Exception as e:
                # One broken post must not stop the remaining ones.
                print(f"[Medos] Error pada {link}: {e}")

    except Exception as e:
        # Top-level boundary: report and return whatever was collected.
        print(f"[Medos] Fatal error: {e}")
    finally:
        try:
            driver.quit()
        except Exception:
            pass

    print(f"[Medos] Total teks dari Instagram: {len(texts_out)}")
    return texts_out
|
services/news.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
news.py — News scraper dispatcher.
|
| 3 |
+
Exports: scrape_news(portal, pages, keyword) -> list[dict]
|
| 4 |
+
|
| 5 |
+
portal: 'detik', 'radar', 'antara', 'cnn', 'radarcirebon'
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import random
|
| 10 |
+
import re
|
| 11 |
+
import time
|
| 12 |
+
from urllib.parse import quote, quote_plus, urlparse, urlunparse
|
| 13 |
+
|
| 14 |
+
import requests
|
| 15 |
+
from bs4 import BeautifulSoup
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ── Shared HTTP session helpers ───────────────────────────────────────────────
|
| 19 |
+
|
| 20 |
+
_HEADERS = {
|
| 21 |
+
"User-Agent": (
|
| 22 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
| 23 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
| 24 |
+
),
|
| 25 |
+
"Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
|
| 26 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0):
|
| 30 |
+
for attempt in range(retries):
|
| 31 |
+
try:
|
| 32 |
+
r = sess.get(url, timeout=20, allow_redirects=True)
|
| 33 |
+
r.raise_for_status()
|
| 34 |
+
return r
|
| 35 |
+
except Exception as e:
|
| 36 |
+
if attempt < retries - 1:
|
| 37 |
+
time.sleep(delay)
|
| 38 |
+
return None
|
| 39 |
+
|
| 40 |
+
def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
|
| 41 |
+
container = None
|
| 42 |
+
for cls in container_classes:
|
| 43 |
+
container = soup.find("div", class_=cls)
|
| 44 |
+
if container:
|
| 45 |
+
break
|
| 46 |
+
scope = container if container else soup
|
| 47 |
+
texts = []
|
| 48 |
+
for p in scope.find_all("p"):
|
| 49 |
+
t = p.get_text(" ", strip=True)
|
| 50 |
+
if t and len(t) >= min_len and not t.lower().startswith(("baca juga", "lihat juga", "advertisement")):
|
| 51 |
+
texts.append(t)
|
| 52 |
+
return texts
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ── Detik.com ─────────────────────────────────────────────────────────────────
|
| 56 |
+
|
| 57 |
+
def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
    """Search detik.com for ``keyword`` and scrape every article found.

    Args:
        keyword: Raw search phrase; it is URL-encoded here.
        max_pages: Number of search-result pages to walk through.

    Returns:
        List of dicts with keys ``judul`` (title), ``tanggal`` (date as
        'YYYY-MM-DD HH:MM', or '' when unknown), ``tag`` (comma-separated),
        ``isi_berita`` (body text) and ``link``.
    """
    import datetime

    results = []
    # Encode once so spaces, '&' or '#' in the keyword cannot break the query.
    query = quote_plus(keyword)

    # The context manager releases the pooled connections when done.
    with requests.Session() as sess:
        sess.headers.update(_HEADERS)

        for page in range(1, max_pages + 1):
            r = _get(sess, f"https://www.detik.com/search/searchall?query={query}&sortby=time&page={page}&siteid=2")
            if not r:
                break
            soup = BeautifulSoup(r.text, "html.parser")
            news_list = soup.find_all('div', class_='media')
            if not news_list:
                break

            for news in news_list:
                link = None
                try:
                    title_tag = news.find('h3', class_='media__title')
                    if not title_tag:
                        continue
                    link_tag = title_tag.find('a', class_='media__link')
                    if not link_tag or not link_tag.has_attr('href'):
                        continue
                    link = link_tag['href']
                    title = link_tag.text.strip()

                    news_date = None
                    date_tag = news.find('div', class_='media__date')
                    span_tag = date_tag.find('span') if date_tag else None
                    if span_tag and span_tag.has_attr('d-time'):
                        try:
                            news_date = datetime.datetime.fromtimestamp(int(span_tag['d-time']))
                        except (ValueError, OverflowError, OSError):
                            # A bad timestamp must not cost us the article.
                            news_date = None

                    news_resp = _get(sess, link)
                    if not news_resp:
                        continue
                    news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                    content_div = news_soup.find('div', class_='detail__body-text') or news_soup.find('div', class_='detail_text')
                    content = ""
                    if content_div:
                        parts = []
                        for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                            text = tag.get_text(strip=True)
                            if text:
                                # Headings keep a marker such as "H2: ...".
                                prefix = tag.name.upper() if tag.name.startswith('h') else ''
                                parts.append(f"{prefix}: {text}" if prefix else text)
                        content = '\n'.join(parts)

                    nav_div = news_soup.find('div', class_='detail_tag') or news_soup.find('div', class_='tag__list') or news_soup.find('div', class_='nav')
                    tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else []

                    results.append({
                        'judul': title,
                        'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                        'tag': ', '.join(tags),
                        'isi_berita': content,
                        'link': link
                    })
                except Exception as e:
                    # One malformed article must not stop the others.
                    print(f"[News] Gagal memproses artikel detik ({link}): {e}")
            # Short pause between result pages.
            time.sleep(2)
    return results
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ── Radar ─────────────────────────────────────────────────────────────────────
|
| 118 |
+
|
| 119 |
+
def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
    """Scrape keyword search results from radarcirebon.disway.id.

    For every search-result page (up to *max_pages*) each listed article is
    fetched and reduced to a dict with the keys ``judul``, ``tanggal``,
    ``tag``, ``isi_berita`` and ``link``.

    Args:
        keyword: Search phrase; it is URL-encoded with ``quote_plus``.
        max_pages: Number of result pages to walk (30 results per page
            offset, as encoded in the search URL).

    Returns:
        List of article dicts in the order they were encountered. Articles
        that fail to download or parse are skipped (and reported on stdout).
    """
    BASE_HOST = "https://radarcirebon.disway.id"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []
    seen_links = set()  # avoid fetching the same article twice

    def _abs(href):
        """Return *href* as an absolute URL on BASE_HOST, or None when empty."""
        if not href:
            return None
        href = href.strip()
        if not href:
            return None
        return href if href.startswith("http") else BASE_HOST + "/" + href.lstrip("/")

    q = quote_plus(keyword)  # loop-invariant, encode once

    for page in range(1, max_pages + 1):
        offset = (page - 1) * 30
        if page > 1:
            url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num="
        else:
            url = f"{BASE_HOST}/search/kata/?c={q}&num="

        r = _get(sess, url)
        if not r:
            # _get yielded nothing usable for this page; stop paging.
            break
        soup = BeautifulSoup(r.text, "html.parser")

        news_list = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media')
        for item in news_list:
            link = None
            try:
                a = item.find('a', href=True)
                if not a:
                    continue
                link = _abs(a.get('href'))
                if not link or link in seen_links:
                    continue
                seen_links.add(link)
                title = a.get_text(strip=True)

                detail_r = _get(sess, link)
                if not detail_r:
                    continue
                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                # Prefer the headline on the article page; fall back to the
                # anchor text from the search listing.
                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else title

                date_text = ""
                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
                if date_detail_tag:
                    date_text = date_detail_tag.get_text(strip=True)

                content_container = (
                    detail_soup.find('div', class_='entry-content')
                    or detail_soup.find('div', class_='post-content')
                )
                content = ""
                if content_container:
                    paragraphs = (p.get_text(strip=True) for p in content_container.find_all('p'))
                    # Drop "Baca Juga:" cross-link paragraphs.
                    content = "\n".join(t for t in paragraphs if 'Baca Juga:' not in t)

                tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
                tags = [a_tag.get('title', '').strip() for a_tag in tag_links if a_tag.get('title')]

                results.append({
                    "judul": title_detail,
                    "tanggal": date_text,
                    "tag": ", ".join(tags) if tags else "-",
                    "isi_berita": content,
                    "link": link
                })
            except Exception as e:
                # Best effort: one broken article must not abort the scrape,
                # but the failure should be visible instead of swallowed.
                print(f"[News] Radar: gagal memproses {link or 'item'}: {e}")
        time.sleep(2)  # pause between result pages
    return results
|
| 176 |
+
|
| 177 |
+
# ── Antara ────────────────────────────────────────────────────────────────────
|
| 178 |
+
|
| 179 |
+
def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
    """Scrape keyword search results from www.antaranews.com.

    Collects every ``/berita/`` link on each search page, normalises it
    (query string and fragment removed), then downloads each article once.

    Args:
        keyword: Search phrase; it is URL-encoded with ``quote_plus``.
        max_pages: Number of search-result pages to walk.

    Returns:
        List of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``, in the order the links appear on the
        search pages.
    """
    BASE_HOST = "https://www.antaranews.com"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []
    seen_links = set()  # de-duplicate across pages as well as within one

    def _norm(href):
        """Return an absolute, query-less URL, or None if *href* is unusable."""
        if not href:
            return None
        href = href.strip()
        if href.startswith("/"):
            href = BASE_HOST + href
        elif not href.startswith("http"):
            return None
        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")

    q = quote_plus(keyword)  # loop-invariant, encode once

    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")

        anchors = soup.select('a[href*="/berita/"]')
        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # output order is stable between runs (a set would not guarantee it).
        links = dict.fromkeys(_norm(a.get('href')) for a in anchors if a.get('href'))

        for link in links:
            if not link or link in seen_links:
                continue
            seen_links.add(link)
            try:
                detail_r = _get(sess, link)
                if not detail_r:
                    continue
                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else ""

                # The date is read from the <li> that holds the calendar icon.
                date_detail = ""
                cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
                date_li = cal_icon.find_parent('li') if cal_icon else None
                if date_li:
                    date_detail = date_li.get_text(" ", strip=True)

                content_parts = _extract_paragraphs(
                    detail_soup, ["wrap__article-detail-content", "detail__body-text"]
                )

                tags = []
                for a in detail_soup.select('a[href*="/tag/"]'):
                    tag_text = a.get('title') or a.get_text(strip=True)
                    if tag_text:
                        tags.append(tag_text)

                results.append({
                    "judul": title_detail,
                    "tanggal": date_detail,
                    "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
                    "isi_berita": "\n".join(content_parts),
                    "link": link
                })
            except Exception as e:
                # Keep the articles already collected when one page is malformed.
                print(f"[News] Antara: gagal memproses {link}: {e}")
    return results
|
| 231 |
+
|
| 232 |
+
# ── CNN ───────────────────────────────────────────────────────────────────────
|
| 233 |
+
|
| 234 |
+
def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
    """Scrape keyword search results from www.cnnindonesia.com.

    The search page is loaded through a Selenium driver, and the code waits
    for the result list to appear before parsing it. The individual articles
    are then fetched with a plain ``requests`` session.

    Args:
        keyword: Search phrase; it is URL-encoded with ``quote``.
        max_pages: Number of search-result pages to walk.

    Returns:
        List of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from ._driver import _create_driver

    BASE_HOST = "https://www.cnnindonesia.com"
    results = []
    seen_links = set()
    q = quote(keyword)

    # One HTTP session for all article downloads (connection reuse).
    sess = requests.Session()
    sess.headers.update(_HEADERS)

    driver = _create_driver(mobile=False)
    try:
        for page in range(1, max_pages + 1):
            url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (f"&page={page}" if page > 1 else "")
            driver.get(url)

            if page == 1:
                try:
                    WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                    ).click()
                except Exception:
                    # The consent button is optional; carry on without it.
                    pass

            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                )
            except Exception:
                # No result list showed up on this page; try the next one.
                continue

            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Keep only links whose path matches the article-id pattern;
            # dict.fromkeys de-duplicates while preserving listing order.
            links = dict.fromkeys(
                a['href']
                for a in soup.select('div.nhl-list article a[href]')
                if re.search(r'/\d{14}-\d{2,3}-\d{6,}', urlparse(a['href']).path)
            )

            for link in links:
                if link in seen_links:
                    continue
                seen_links.add(link)
                try:
                    html = _get(sess, link)
                    if not html:
                        continue
                    ds = BeautifulSoup(html.text, "html.parser")

                    title_el = ds.select_one('h1')
                    title = title_el.get_text(strip=True) if title_el else "-"

                    date_el = ds.select_one('div.text-cnn_grey.text-sm')
                    date_text = date_el.get_text(strip=True) if date_el else "-"

                    # Tags are the anchors in the <div> that follows the
                    # "TOPIK TERKAIT" title box.
                    tags_list = []
                    tk_header = ds.find('div', class_='title-box', text=re.compile(r'\s*TOPIK TERKAIT\s*'))
                    tk_body = tk_header.find_next_sibling('div') if tk_header else None
                    if tk_body:
                        tags_list = [t.get_text(strip=True) for t in tk_body.select('a')]

                    content_container = ds.select_one("div.detail-text")
                    if content_container:
                        paragraphs = (p.get_text(" ", strip=True) for p in content_container.find_all('p'))
                        # Skip "lihat juga" cross-link paragraphs.
                        content = "\n".join(t for t in paragraphs if not t.lower().startswith("lihat juga"))
                    else:
                        content = "-"

                    results.append({
                        "judul": title,
                        "tanggal": date_text,
                        "tag": ", ".join(tags_list) if tags_list else "-",
                        "isi_berita": content,
                        "link": link
                    })
                except Exception as e:
                    # Keep what was collected so far when one article fails.
                    print(f"[News] CNN: gagal memproses {link}: {e}")
    finally:
        # Always shut the driver down, even when scraping raises.
        driver.quit()
    return results
|
| 291 |
+
|
| 292 |
+
# ── RadarCirebonID ────────────────────────────────────────────────────────────
|
| 293 |
+
|
| 294 |
+
def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
    """Scrape keyword search results from radarcirebon.id.

    Args:
        keyword: Search phrase; spaces are encoded as ``+`` in the URL path.
        max_pages: Number of search-result pages to walk.

    Returns:
        List of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``. Missing fields are reported as ``"-"``.
    """
    BASE_HOST = "https://radarcirebon.id"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    results = []
    seen_links = set()

    q = quote(keyword).replace('%20', '+')  # loop-invariant, encode once

    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search/{q}/" + (f"page/{page}/" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break

        soup = BeautifulSoup(r.text, "html.parser")
        # Only dated permalinks (/YYYY/MM/DD/) are articles. ``.get`` avoids
        # a KeyError on anchors without an href; dict.fromkeys de-duplicates
        # while keeping listing order.
        hrefs = (a.get('href') for a in soup.select('article .wp-block-latest-posts__post-title a'))
        links = dict.fromkeys(h for h in hrefs if h and re.search(r'/\d{4}/\d{2}/\d{2}/', h))

        for link in links:
            if link in seen_links:
                continue
            seen_links.add(link)
            try:
                detail_r = _get(sess, link)
                if not detail_r:
                    continue
                ds = BeautifulSoup(detail_r.text, "html.parser")

                title_el = ds.select_one('h1.entry-title')
                date_el = ds.select_one('time.entry-date')

                c_parts = []
                cc = ds.select_one('div.entry-content')
                if cc:
                    for p in cc.select('p'):
                        # Ignore paragraphs nested in a "read-also" box.
                        if p.find_parent(class_='read-also'):
                            continue
                        t = p.get_text(" ", strip=True)
                        if t:
                            c_parts.append(t)

                tc = ds.select_one('div.wp-block-tag-cloud')
                tags = [a.get_text(strip=True) for a in tc.select('a')] if tc else []

                results.append({
                    "judul": title_el.get_text(strip=True) if title_el else "-",
                    "tanggal": date_el.get_text(strip=True) if date_el else "-",
                    "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
                    "isi_berita": "\n".join(c_parts) if c_parts else "-",
                    "link": link
                })
            except Exception as e:
                # Keep what was collected so far when one article fails.
                print(f"[News] RadarCirebon: gagal memproses {link}: {e}")

    return results
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 340 |
+
|
| 341 |
+
# Alias -> scraper lookup used by scrape_news(). Each scraper is registered
# under a short name and under its domain. Insertion order is kept identical
# to the original literal because scrape_news() iterates this mapping.
_PORTAL_MAP = {
    **dict.fromkeys(("detik", "detik.com"), _scrape_detik),
    **dict.fromkeys(("radar", "radardisway", "radarcirebon.disway.id"), _scrape_radar),
    **dict.fromkeys(("antara", "antaranews", "antaranews.com"), _scrape_antara),
    **dict.fromkeys(("cnn", "cnnindonesia", "cnnindonesia.com"), _scrape_cnn),
    **dict.fromkeys(("radarcirebon", "radarcirebon.id"), _scrape_radarcirebon),
}
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
|
| 359 |
+
if not portal: return []
|
| 360 |
+
portal_key = portal.strip().lower().rstrip("/")
|
| 361 |
+
scraper = _PORTAL_MAP.get(portal_key)
|
| 362 |
+
|
| 363 |
+
if scraper is None:
|
| 364 |
+
for key, fn in _PORTAL_MAP.items():
|
| 365 |
+
if key in portal_key or portal_key in key:
|
| 366 |
+
scraper = fn
|
| 367 |
+
break
|
| 368 |
+
|
| 369 |
+
if scraper is None:
|
| 370 |
+
try:
|
| 371 |
+
domain = urlparse(portal).netloc or portal_key
|
| 372 |
+
for key, fn in _PORTAL_MAP.items():
|
| 373 |
+
if key in domain:
|
| 374 |
+
scraper = fn
|
| 375 |
+
break
|
| 376 |
+
except Exception: pass
|
| 377 |
+
|
| 378 |
+
if scraper is None:
|
| 379 |
+
print(f"[News] Portal '{portal}' tidak dikenali.")
|
| 380 |
+
return []
|
| 381 |
+
|
| 382 |
+
print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
|
| 383 |
+
try:
|
| 384 |
+
return scraper(keyword, max_pages=pages)
|
| 385 |
+
except Exception as e:
|
| 386 |
+
print(f"[News] Error saat scraping: {e}")
|
| 387 |
+
return []
|
services/preprocessing.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
preprocessing.py — Clean & preprocess text for sentiment analysis.
|
| 3 |
+
Only contains utility functions; no Colab/notebook code.
|
| 4 |
+
"""
|
| 5 |
+
import re
|
| 6 |
+
import html as html_lib
|
| 7 |
+
|
| 8 |
+
from bs4 import BeautifulSoup
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
| 12 |
+
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
|
| 13 |
+
_sastrawi_available = True
|
| 14 |
+
except ImportError:
|
| 15 |
+
_sastrawi_available = False
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from stop_words import get_stop_words
|
| 19 |
+
_stopwords_id = get_stop_words('indonesian')
|
| 20 |
+
except Exception:
|
| 21 |
+
_stopwords_id = []
|
| 22 |
+
|
| 23 |
+
# ── Stopwords ─────────────────────────────────────────────────────────────────
|
| 24 |
+
# Sastrawi is optional: without it there is no stemmer and no Sastrawi
# stopword list, and the rest of the module keeps working.
_stemmer = StemmerFactory().create_stemmer() if _sastrawi_available else None
_sastrawi_stopwords: list = (
    StopWordRemoverFactory().get_stop_words() if _sastrawi_available else []
)

# Hand-maintained additions to the library stopword lists.
_ADDITIONAL_STOPWORDS = [
    'yg', 'ga', 'gak', 'nggak', 'aja', 'saja', 'nya', 'oke', 'ok', 'bgt', 'jg', 'utk',
    'deh', 'sih', 'kok', 'dong', 'udah', 'sdh', 'blm', 'bgmn', 'dgn', 'lgi',
    'ya', 'lbh', 'digunakan', 'semangat', 'dah', 'sangat', 'penting',
    'lancar', 'cepat', 'senang', 'makasih', 'bermanfaat', 'keren', 'baik',
    'terimakasih', 'bagus', 'semoga', 'aplikasi', 'transaksi', 'banget', 'pakai',
    'hp', 'tolong', 'gimana', 'iya', 'jadi', 'ambil', 'buka', 'butuh', 'masuk',
    'baru', 'jelas', 'yuk', 'mohon', 'punya', 'cara', 'hari', 'kota', 'berita',
    # HTML attributes
    'class', 'id', 'span', 'div', 'href', 'src', 'style', 'alt', 'aria', 'role',
    'tabindex', 'button', 'label', 'img', 'input', 'placeholder', 'form',
    'field', 'hidden', 'value', 'by', 'link', 'tags',
]

# Meaningless fragments that are filtered out like stopwords.
_NOISE_STOPWORDS = [
    'xd', 'xyri', 'yu', 'uobl', 'ypdohk', 'xt', 'pz', 'lziwak',
    'rp', 'xdj', 'xggy', 'xjbqb', 'xstzfhl', 'hfl', 'xat',
    'qhh', 'dhg', 'cr', 'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq', 'psk',
    'hl', 'hd', 'sy', 'amp', 'fbf',
]

_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')

# Union of every stopword source plus all single ASCII letters.
_ALL_STOPWORD_LISTS = (
    _stopwords_id + _sastrawi_stopwords + _ADDITIONAL_STOPWORDS + _NOISE_STOPWORDS
)
FINAL_STOPWORDS: set = set(_ALL_STOPWORD_LISTS) | _SINGLE_LETTERS
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# ── Individual text cleaners ──────────────────────────────────────────────────
|
| 60 |
+
|
| 61 |
+
_AUTHOR_COMMENT_PATTERN = re.compile(r"author\b.*?\bcomment", flags=re.IGNORECASE|re.DOTALL)
|
| 62 |
+
|
| 63 |
+
def clean_html(text: str) -> str:
    """Return *text* without markup, with entities decoded and whitespace collapsed.

    ``<script>`` and ``<style>`` elements are removed together with their
    content. If HTML parsing fails for any reason, the raw string is used
    instead. Falsy input yields an empty string.
    """
    if not text:
        return ""

    raw = str(text)
    try:
        document = BeautifulSoup(raw, "html.parser")
        for node in document(["script", "style"]):
            node.decompose()
        plain = document.get_text(separator=" ")
    except Exception:
        plain = raw

    decoded = html_lib.unescape(plain)
    return re.sub(r"\s+", " ", decoded).strip()
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def clean_text(text: str) -> str:
    """Normalise one string: lowercase, strip URLs, keep only ASCII letters/digits.

    Steps, in order: lowercase; drop "author ... comment" spans; drop URL-like
    tokens; replace every character that is not an ASCII letter, a digit or
    whitespace with a space; collapse whitespace. Falsy input yields ``""``.
    """
    if not text:
        return ""

    lowered = str(text).lower()
    without_meta = _AUTHOR_COMMENT_PATTERN.sub("", lowered)
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', without_meta)
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_urls)
    return re.sub(r'\s+', ' ', alnum_only).strip()
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _preprocess_single(text: str) -> str:
    """Run the whole cleaning pipeline on one string and return the result.

    Pipeline: strip HTML -> basic text cleaning -> stemming (only when a
    stemmer is available) -> drop stopwords and one-character tokens ->
    drop tokens that contain no lowercase ASCII letter.
    """
    normalized = clean_text(clean_html(text))

    if _stemmer:
        normalized = _stemmer.stem(normalized)

    kept = []
    for word in normalized.split():
        # Stopwords and single characters carry no signal.
        if len(word) <= 1 or word in FINAL_STOPWORDS:
            continue
        # Purely numeric tokens are discarded as well.
        if re.search(r'[a-z]', word):
            kept.append(word)

    return " ".join(kept).strip()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 111 |
+
|
| 112 |
+
def preprocess_text(texts) -> list:
    """Preprocess one string or an iterable of strings.

    A bare string is treated as a one-element list. Items that are not
    strings are skipped, so the result can be shorter than the input.

    Returns:
        List with one cleaned string per string item in *texts*.
    """
    items = [texts] if isinstance(texts, str) else texts

    cleaned = []
    for item in items:
        if isinstance(item, str):
            cleaned.append(_preprocess_single(item))
    return cleaned
|
services/sentiment.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
sentiment.py — Sentiment analysis using IndoBERT / HuggingFace pipeline.
|
| 3 |
+
Model is loaded lazily (first call) to avoid crashing at import time.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
# ── Model configuration ───────────────────────────────────────────────────────
|
| 11 |
+
# If you have a local fine-tuned model, place it in the "indoBERT-sentiment"
# directory one level above this file (see _LOCAL_MODEL_DIR). When that
# directory is missing or empty, the model named by _HF_MODEL_ID is loaded
# from HuggingFace instead.
_LOCAL_MODEL_DIR = os.path.join(os.path.dirname(__file__), "..", "indoBERT-sentiment")
_HF_MODEL_ID = "taufiqdp/indonesian-sentiment"

# ── Lazy-loaded globals ───────────────────────────────────────────────────────
# Holds the pipeline created by _load_pipeline(); None until the first call.
_pipeline: Optional[object] = None
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _load_pipeline():
    """Return the shared text-classification pipeline, creating it on first use.

    The heavy imports (torch, transformers) happen here rather than at module
    import time. A non-empty local model directory takes precedence over the
    HuggingFace model id. The pipeline is cached in the module-level
    ``_pipeline`` variable.
    """
    global _pipeline

    if _pipeline is None:
        import torch
        from transformers import pipeline as hf_pipeline

        # Prefer local model if it exists (avoids repeated downloads in Docker)
        has_local_model = os.path.isdir(_LOCAL_MODEL_DIR) and os.listdir(_LOCAL_MODEL_DIR)
        if has_local_model:
            model_source = _LOCAL_MODEL_DIR
            print(f"[Sentiment] Loading model from local dir: {model_source}")
        else:
            model_source = _HF_MODEL_ID
            print(f"[Sentiment] Local model not found. Downloading from HuggingFace: {model_source}")

        # First CUDA device when available, otherwise CPU (-1).
        if torch.cuda.is_available():
            device = 0
        else:
            device = -1

        _pipeline = hf_pipeline(
            "text-classification",
            model=model_source,
            tokenizer=model_source,
            device=device,
            truncation=True,
            max_length=256,
            return_all_scores=False,
        )
        print("[Sentiment] Model loaded successfully.")

    return _pipeline
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ── Helpers ───────────────────────────────────────────────────────────────────
|
| 52 |
+
|
| 53 |
+
def _normalize_label(lbl: str) -> str:
|
| 54 |
+
"""Normalise raw model label to 'positif', 'negatif', or 'netral'."""
|
| 55 |
+
l = lbl.lower()
|
| 56 |
+
if l in ("positif", "positive", "pos"):
|
| 57 |
+
return "positif"
|
| 58 |
+
if l in ("negatif", "negative", "neg"):
|
| 59 |
+
return "negatif"
|
| 60 |
+
if l in ("netral", "neutral", "neu"):
|
| 61 |
+
return "netral"
|
| 62 |
+
if "label_" in l:
|
| 63 |
+
try:
|
| 64 |
+
from transformers import AutoConfig
|
| 65 |
+
cfg = AutoConfig.from_pretrained(_HF_MODEL_ID)
|
| 66 |
+
idx = int(l.split("_")[-1])
|
| 67 |
+
return _normalize_label(cfg.id2label[idx])
|
| 68 |
+
except Exception:
|
| 69 |
+
return "netral"
|
| 70 |
+
return "netral"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# ── Keywords Override ─────────────────────────────────────────────────────────
|
| 74 |
+
|
| 75 |
+
_NEGATIVE_KEYWORDS = {
|
| 76 |
+
"bego", "bodoh", "jelek", "goblok", "bangsat", "kampungan", "tolol",
|
| 77 |
+
"kontol", "kirik", "koplok", "anjing", "babi", "monyet", "belegug",
|
| 78 |
+
"kik", "goblog", "kntl", "buruk", "lemah", "rendah", "gagal", "hancur",
|
| 79 |
+
"rusak", "cacat", "jahat", "dusta", "bohong", "fitnah", "korup", "curang",
|
| 80 |
+
"palsu", "salah", "sesat", "kejam", "dendam", "malas", "lambat", "menyakitkan",
|
| 81 |
+
"tercela", "merugikan", "menghina", "melecehkan", "menyesatkan"
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
_POSITIVE_KEYWORDS = {
|
| 85 |
+
"bagus", "hebat", "mantap", "luar biasa", "keren", "canggih",
|
| 86 |
+
"cerdas", "pintar", "senang", "bahagia", "memuaskan", "unggul",
|
| 87 |
+
"sempurna", "berhasil", "luas", "indah"
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
_NEUTRAL_KEYWORDS = {
|
| 91 |
+
"ok", "oke", "biasa", "lumayan", "standar", "normal", "cukup", "agak"
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
def _override_label(text: str, model_label: str) -> str:
|
| 95 |
+
text_lower = text.lower()
|
| 96 |
+
|
| 97 |
+
if any(w in text_lower for w in _NEGATIVE_KEYWORDS):
|
| 98 |
+
return "negatif"
|
| 99 |
+
if any(w in text_lower for w in _POSITIVE_KEYWORDS):
|
| 100 |
+
return "positif"
|
| 101 |
+
if any(w in text_lower for w in _NEUTRAL_KEYWORDS):
|
| 102 |
+
return "netral"
|
| 103 |
+
|
| 104 |
+
return model_label
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 108 |
+
|
| 109 |
+
def analyze_sentiment(texts: list) -> dict:
    """
    Run sentiment analysis on a list of text strings.

    Args:
        texts: list of pre-processed strings. A single string is accepted
            and treated as a one-element list. Items that are not strings,
            or that are empty/whitespace-only, are ignored.

    Returns:
        dict with keys: positif, negatif, netral, total, detail
        Example:
        {
            "positif": 12, "negatif": 4, "netral": 6, "total": 22,
            "detail": [{"text": "...", "label": "positif", "score": 0.95}, ...]
        }
        When the model call fails, the counts are 0 and ``detail`` is empty,
        while ``total`` still reports how many texts were submitted.
    """
    def _empty_result(total: int = 0) -> dict:
        """Build a fresh zero-count result (never share the detail list)."""
        return {"positif": 0, "negatif": 0, "netral": 0, "total": total, "detail": []}

    if not texts:
        return _empty_result()

    if isinstance(texts, str):
        # Iterating a bare string would classify it character by character.
        texts = [texts]

    # Keep only non-blank strings; stray None/NaN/numeric items are dropped
    # instead of raising on ``.strip()``.
    texts = [t for t in texts if isinstance(t, str) and t.strip()]
    if not texts:
        return _empty_result()

    clf = _load_pipeline()

    try:
        preds = clf(texts, batch_size=16, truncation=True)
    except Exception as e:
        print(f"[Sentiment] Prediction error: {e}")
        return _empty_result(len(texts))

    counts = {"positif": 0, "negatif": 0, "netral": 0}
    detail = []
    for text, pred in zip(texts, preds):
        model_label = _normalize_label(pred["label"])
        # Keyword rules take precedence over the model's own label.
        final_label = _override_label(text, model_label)

        counts[final_label] += 1
        detail.append({
            "text": text[:200],  # detail keeps only the first 200 characters
            "label": final_label,
            "score": round(float(pred["score"]), 4),
        })

    return {
        "positif": counts["positif"],
        "negatif": counts["negatif"],
        "netral": counts["netral"],
        "total": len(texts),
        "detail": detail,
    }
|
services/tiktok.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tiktok.py — TikTok scraper using Selenium.
|
| 3 |
+
Exports: scrape_tiktok(cookie_str, target_username) -> list[dict]
|
| 4 |
+
|
| 5 |
+
Returns structured data per-video:
|
| 6 |
+
url, profile_username, upload_date, like_count,
|
| 7 |
+
caption_short, caption_detail, comments, scrape_date
|
| 8 |
+
|
| 9 |
+
cookie_str accepts:
|
| 10 |
+
1. Raw string: "sessionid=xxx; tt_webid=yyy; ..."
|
| 11 |
+
2. JSON array: [{"name":"sessionid","value":"xxx",...}, ...]
|
| 12 |
+
3. JSON object: {"sessionid": "xxx", "tt_webid": "yyy"}
|
| 13 |
+
"""
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import time
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
|
| 20 |
+
from selenium.webdriver.common.by import By
|
| 21 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 22 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 23 |
+
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
| 24 |
+
|
| 25 |
+
from ._driver import _create_driver
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Cookie injection ──────────────────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
def _inject_cookies(driver, cookie_str: str) -> bool:
    """Open tiktok.com and add the cookies described by *cookie_str*.

    Three formats are tried in this order: a JSON array of cookie objects, a
    JSON object mapping names to values, and a raw ``name=value; ...``
    string. If parsing a JSON format fails, the next format is tried.

    Returns:
        True when at least one cookie was accepted by the driver, else False
        (also when *cookie_str* is empty).
    """
    # The site is opened first, before any cookie is added.
    driver.get("https://www.tiktok.com/")
    time.sleep(3)

    raw = cookie_str.strip() if cookie_str else ""
    if not raw:
        print("[TikTok] Tidak ada cookie yang diberikan.")
        return False

    def _try_add(cookie: dict) -> bool:
        """Add one cookie; report success instead of raising."""
        try:
            driver.add_cookie(cookie)
        except Exception:
            return False
        return True

    def _reload(added: int) -> bool:
        """Refresh the page, wait, and report whether any cookie was added."""
        driver.refresh()
        time.sleep(3)
        return added > 0

    # Format 2 of the module docstring: JSON array of cookie objects.
    if raw.startswith("["):
        try:
            added = 0
            for entry in json.loads(raw):
                if not isinstance(entry, dict) or "name" not in entry:
                    continue
                cookie = {
                    field: entry[field]
                    for field in ("name", "value", "domain", "path", "secure", "httpOnly", "expiry")
                    if field in entry
                }
                cookie.setdefault("domain", ".tiktok.com")
                if _try_add(cookie):
                    added += 1
                    continue
                # Second attempt without an explicit domain.
                cookie.pop("domain", None)
                if _try_add(cookie):
                    added += 1
            return _reload(added)
        except Exception as e:
            print(f"[TikTok] JSON array error: {e}")

    # Format 3: JSON object {name: value}.
    if raw.startswith("{"):
        try:
            added = 0
            for name, value in json.loads(raw).items():
                if _try_add({"name": str(name), "value": str(value), "domain": ".tiktok.com"}):
                    added += 1
            return _reload(added)
        except Exception as e:
            print(f"[TikTok] JSON object error: {e}")

    # Format 1: raw "name=value; name2=value2" string.
    try:
        added = 0
        for chunk in raw.split(";"):
            name, sep, value = chunk.strip().partition("=")
            if not sep:
                continue
            if _try_add({"name": name.strip(), "value": value.strip(), "domain": ".tiktok.com"}):
                added += 1
        return _reload(added)
    except Exception as e:
        print(f"[TikTok] String cookie error: {e}")
        return False
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ── Scraping helpers ──────────────────────────────────────────────────────────
|
| 102 |
+
|
| 103 |
+
_VIDEO_LINK_SELECTORS = [
|
| 104 |
+
'div[data-e2e="user-post-item"] a',
|
| 105 |
+
'div[data-e2e="user-post-item-list"] a',
|
| 106 |
+
'a[href*="/video/"]',
|
| 107 |
+
'div[class*="DivItemContainerV2"] a',
|
| 108 |
+
'div[class*="DivWrapper"] a[href*="/video/"]',
|
| 109 |
+
]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _get_video_links(driver, profile_url: str, max_videos: int = 30) -> list:
    """Collect up to *max_videos* video URLs from a TikTok profile page.

    The page is scrolled to the bottom repeatedly; scrolling stops once
    enough links were found or three consecutive scrolls produced no new
    link.

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        profile_url: URL of the profile to open.
        max_videos: Upper bound on the number of links returned.

    Returns:
        Video URLs without query string, de-duplicated, in the order they
        were first seen on the page.
    """
    print(f"[TikTok] Membuka profil: {profile_url}")
    driver.get(profile_url)

    # Wait until any of the known selectors matches at least one element.
    loaded = False
    for sel in _VIDEO_LINK_SELECTORS:
        try:
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, sel)))
            loaded = True
            break
        except TimeoutException:
            continue

    if not loaded:
        # Nothing matched in time; pause once more before scanning anyway.
        time.sleep(5)

    # A dict is used as an ordered set: a plain set would lose page order and
    # make the final [:max_videos] cut pick arbitrary videos.
    links: dict = {}
    stall = 0

    while len(links) < max_videos:
        prev = len(links)
        for sel in _VIDEO_LINK_SELECTORS:
            for el in driver.find_elements(By.CSS_SELECTOR, sel):
                href = el.get_attribute("href")
                if href and "/video/" in href:
                    links.setdefault(href.split("?")[0], None)
        if len(links) >= max_videos:
            break

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        if len(links) == prev:
            stall += 1
            if stall >= 3:
                # Three scrolls in a row without a new link: stop.
                break
        else:
            stall = 0

    return list(links)[:max_videos]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _scrape_video(driver, video_url: str, profile_username: str) -> dict | None:
    """
    Open one video page and collect its metadata, caption and comments.

    Every lookup is best-effort: a missing element or a timed-out wait leaves
    the corresponding field at its default instead of aborting the scrape.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        video_url: URL of the video page to open.
        profile_username: Username stored verbatim in the result.

    Returns:
        Dict with keys ``url``, ``profile_username``, ``upload_date``,
        ``like_count``, ``caption_short``, ``caption_detail``, ``comments``
        and ``scrape_date``. The code shown here always returns the dict; no
        path returns ``None`` despite the annotation.
    """
    # NOTE(review): nesting was reconstructed from a copy of this file that
    # had lost its indentation — confirm against the repository version.
    print(f"[TikTok] Memproses: {video_url}")
    driver.get(video_url)
    time.sleep(5)

    # Defaults that survive when the matching element cannot be found.
    video_data = {
        "url": video_url,
        "profile_username": profile_username,
        "upload_date": "N/A",
        "like_count": "N/A",
        "caption_short": "",
        "caption_detail": "",
        "comments": [],
        "scrape_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    # Upload date (waits up to 8 s for the element).
    try:
        date_el = WebDriverWait(driver, 8).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-e2e="browser-video-meta-date"]'))
        )
        video_data["upload_date"] = date_el.text.strip()
    except TimeoutException:
        pass

    # Like counter, read without waiting.
    try:
        like_el = driver.find_element(By.CSS_SELECTOR, 'strong[data-e2e="like-count"]')
        video_data["like_count"] = like_el.text.strip()
    except NoSuchElementException:
        pass

    # Caption: short text first, then an attempt to expand the detail panel.
    try:
        desc_container = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='browse-video-desc']"))
        )
        try:
            cap_el = desc_container.find_element(By.CSS_SELECTOR, 'span[data-e2e="new-desc-span"]')
            video_data["caption_short"] = cap_el.text.strip()

            try:
                # Click through JavaScript rather than WebElement.click().
                more_btn = driver.find_element(By.CSS_SELECTOR, "span[class*='-SpanExpandIcon']")
                driver.execute_script("arguments[0].click();", more_btn)
                time.sleep(2)
                detail_container = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCustomTDKContainer']"))
                )
                desc_text = ""
                try:
                    desc_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-desc']").text
                except NoSuchElementException:
                    pass
                kw_text = ""
                try:
                    kw_text = detail_container.find_element(By.CSS_SELECTOR, "div[data-e2e='v2t-keywords']").text
                except NoSuchElementException:
                    pass
                video_data["caption_detail"] = f"Deskripsi: {desc_text}\nKeywords: {kw_text}".strip()
            except Exception:
                # Any failure while expanding leaves caption_detail empty.
                pass
        except NoSuchElementException:
            pass
    except TimeoutException:
        pass

    # Comments: load as many as possible, then parse them.
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='DivCommentListContainer']"))
        )

        # Matches spans whose text contains 'balasan', or both 'View' and 'repl'.
        reply_xpath = "//span[contains(text(), 'balasan') or (contains(text(), 'View') and contains(text(), 'repl'))]"
        stall = 0
        last_count = 0

        # At most 15 rounds; each round either clicks one matching span or
        # scrolls once to load more comments.
        for _ in range(15):
            try:
                btns = driver.find_elements(By.XPATH, reply_xpath)
                if btns:
                    driver.execute_script("arguments[0].click();", btns[0])
                    time.sleep(2)
                    stall = 0
                    continue
            except Exception:
                pass

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            cur = len(driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]'))
            if cur > last_count:
                last_count = cur
                stall = 0
            else:
                stall += 1
                # Stop after four scrolls in a row that loaded nothing new.
                if stall >= 4:
                    break

        items = driver.find_elements(By.XPATH, '//div[contains(@class, "DivCommentItemWrapper")]')
        for item in items:
            try:
                # Level-1 item: a top-level comment.
                author_el = item.find_elements(By.XPATH, './/div[@data-e2e="comment-username-1"]//p')
                if author_el:
                    cat_text = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-1"]').text.strip()
                    if cat_text:
                        video_data["comments"].append({
                            "author": author_el[0].text.strip(),
                            "comment": cat_text,
                            "replies": []
                        })
                    # NOTE(review): placement of this `continue` (here vs.
                    # inside `if cat_text`) could not be recovered — confirm.
                    continue

                # Check for replies (level 2)
                # A reply is attached to the most recently appended comment.
                r_author_el = item.find_elements(By.XPATH, './/div[@data-e2e="comment-username-2"]//p')
                if r_author_el and video_data["comments"]:
                    r_text = item.find_element(By.XPATH, './/span[@data-e2e="comment-level-2"]').text.strip()
                    if r_text:
                        video_data["comments"][-1]["replies"].append({
                            "author": r_author_el[0].text.strip(),
                            "comment": r_text
                        })
            except Exception:
                # One unparsable item must not discard the rest.
                pass
    except TimeoutException:
        pass

    return video_data
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 278 |
+
|
| 279 |
+
def scrape_tiktok(cookie_str: str, target_username: str, max_videos: int = 20) -> list:
    """
    Scrape captions & comments from a TikTok profile.

    Args:
        cookie_str: Cookie string passed to ``_inject_cookies``; skipped when
            empty or whitespace-only.
        target_username: Profile name, with or without a leading ``@``.
            Surrounding whitespace is ignored.
        max_videos: Maximum number of videos to process.

    Returns:
        list of dicts with: url, profile_username, upload_date, like_count,
        caption_short, caption_detail, comments, scrape_date.
        Empty when no usable username was given or when scraping failed
        before any video was processed.
    """
    # Normalise before the emptiness check so that " @user " works and a lone
    # "@" is rejected here instead of launching a browser for an empty profile.
    username = (target_username or "").strip().lstrip("@").strip()
    if not username:
        print("[TikTok] target_username tidak ada.")
        return []

    profile_url = f"https://www.tiktok.com/@{username}"

    driver = _create_driver(mobile=False)
    all_data: list = []

    try:
        if cookie_str and cookie_str.strip():
            _inject_cookies(driver, cookie_str)

        links = _get_video_links(driver, profile_url, max_videos)

        for url in links:
            try:
                data = _scrape_video(driver, url, username)
                if data:
                    all_data.append(data)
            except Exception as e:
                # A failing video is reported and skipped; the run continues.
                print(f"[TikTok] Error {url}: {e}")
            time.sleep(1.5)

    except Exception as e:
        print(f"[TikTok] Fatal error: {e}")
    finally:
        # Always release the browser; a failing quit() must not mask results.
        try:
            driver.quit()
        except Exception:
            pass

    return all_data
|
services/wordcloud_service.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
wordcloud_service.py — Generate a word-cloud image from a list of texts.
|
| 3 |
+
Stripped from the original Colab notebook; only the generation function remains.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import io
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import numpy as np
|
| 11 |
+
import matplotlib
|
| 12 |
+
matplotlib.use("Agg") # Must be before pyplot import β headless/no-display
|
| 13 |
+
import matplotlib.pyplot as plt
|
| 14 |
+
from wordcloud import WordCloud
|
| 15 |
+
|
| 16 |
+
# ── Stopwords (same set as preprocessing.py) ──────────────────────────────────
|
| 17 |
+
# Both stopword sources are optional: if the import or the lookup raises,
# the source simply contributes an empty list.
try:
    from stop_words import get_stop_words
    _stopwords_id = get_stop_words('indonesian')
except Exception:
    _stopwords_id = []

try:
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    _sastrawi_sw = StopWordRemoverFactory().get_stop_words()
except Exception:
    _sastrawi_sw = []

# Extra words added on top of the two library lists.
_EXTRA_STOPWORDS = [
    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg',
    'deh','sih','kok','dong','udah','ya','banget','pakai','jadi','baru',
]

# Union of all sources, plus every single ASCII letter.
_BLOCKLIST = set(_stopwords_id + _sastrawi_sw + _EXTRA_STOPWORDS)
_SINGLE_LETTERS = set('abcdefghijklmnopqrstuvwxyz')
WORDCLOUD_STOPWORDS = _BLOCKLIST | _SINGLE_LETTERS
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ── Internal helpers ──────────────────────────────────────────────────────────
|
| 40 |
+
|
| 41 |
+
def _merge_texts(texts: list) -> str:
    """Join a list of strings, keeping only alphabetic tokens.

    The texts are concatenated, lower-cased and split on whitespace. ASCII
    punctuation attached to either end of a token is removed first, so raw
    input such as ``"bagus,"`` or ``"#viral"`` contributes ``bagus`` /
    ``viral`` instead of being discarded. A token is kept only if it then
    consists solely of ``a``-``z``, is longer than two characters and is not
    in ``WORDCLOUD_STOPWORDS``.

    Args:
        texts: Iterable of values; falsy entries are skipped, the rest are
            converted with ``str()``.

    Returns:
        The kept tokens joined by single spaces (possibly an empty string).
    """
    import string  # local import: only needed for the punctuation table

    joined = " ".join(str(t) for t in texts if t)
    kept = []
    for raw in joined.lower().split():
        word = raw.strip(string.punctuation)
        if (
            len(word) > 2
            and re.fullmatch(r'[a-z]+', word)
            and word not in WORDCLOUD_STOPWORDS
        ):
            kept.append(word)
    return " ".join(kept)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _circular_mask(size: int = 400) -> np.ndarray:
|
| 53 |
+
x, y = np.ogrid[:size, :size]
|
| 54 |
+
center = size // 2
|
| 55 |
+
radius = center - 10
|
| 56 |
+
mask = (x - center) ** 2 + (y - center) ** 2 > radius ** 2
|
| 57 |
+
return (255 * mask).astype(np.uint8)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ── Public API ────────────────────────────────────────────────────────────────
|
| 61 |
+
|
| 62 |
+
def generate_wordcloud(texts: list, output_dest) -> bool:
    """
    Generate a circular wordcloud from a list of text strings.

    Args:
        texts: list of strings (raw or pre-processed)
        output_dest: file path (``str`` or ``os.PathLike``) OR a BytesIO buffer.
                     If a path is given, the PNG is saved to disk and the
                     parent directory is created when missing.
                     If a BytesIO buffer is given, the PNG is written there
                     (no file is created on disk).

    Returns:
        True on success, False on failure (no usable text, directory cannot
        be created, rendering or saving raised).
    """
    if not texts:
        print("[WordCloud] No texts provided.")
        return False

    text_data = _merge_texts(texts)
    if not text_data.strip():
        print("[WordCloud] All text was filtered out by stopwords; nothing to plot.")
        return False

    to_disk = isinstance(output_dest, (str, os.PathLike))
    fig = None

    try:
        # Inside the try so that an OSError yields the documented False.
        if to_disk:
            output_dir = os.path.dirname(os.fspath(output_dest))
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

        mask = _circular_mask(400)
        # NOTE(review): per the wordcloud docs, width/height are ignored when
        # a mask is supplied, so the cloud is rendered at the mask's size —
        # confirm whether an 800 px mask was intended.
        wc = WordCloud(
            width=800,
            height=800,
            background_color="white",
            colormap="viridis",
            mask=mask,
            contour_width=2,
            contour_color="steelblue",
            stopwords=WORDCLOUD_STOPWORDS,
            max_words=100,
        ).generate(text_data)

        fig, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        # Use the figure's own methods rather than pyplot's implicit
        # "current figure", which another caller may have changed.
        fig.tight_layout(pad=0)
        fig.savefig(output_dest, dpi=150, bbox_inches="tight", format="png")

        if to_disk:
            print(f"[WordCloud] Saved to {output_dest}")
        else:
            print("[WordCloud] Written to in-memory buffer (temporal).")
        return True

    except Exception as e:
        print(f"[WordCloud] Error generating wordcloud: {e}")
        return False
    finally:
        # pyplot keeps a reference to every open figure; close it on failure
        # too, otherwise each failed render leaks a figure.
        if fig is not None:
            plt.close(fig)
|
templates/index.html
ADDED
|
@@ -0,0 +1,1009 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="id">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>SentiScope — Sentiment Analysis Dashboard</title>
|
| 7 |
+
<meta name="description" content="Dashboard analisis sentimen media sosial dengan scraping otomatis, word cloud, dan indoBERT.">
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Space+Grotesk:wght@400;500;600;700&display=swap" rel="stylesheet">
|
| 11 |
+
<style>
|
| 12 |
+
/* ── Reset & Base ──────────────────────────────────────────────────── */
|
| 13 |
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
| 14 |
+
|
| 15 |
+
:root {
|
| 16 |
+
--bg: #07071a;
|
| 17 |
+
--surface: #0e0e28;
|
| 18 |
+
--surface-2: #14143a;
|
| 19 |
+
--border: rgba(130, 100, 255, 0.18);
|
| 20 |
+
--border-hover: rgba(130, 100, 255, 0.42);
|
| 21 |
+
--purple: #7c3aed;
|
| 22 |
+
--purple-light: #a855f7;
|
| 23 |
+
--cyan: #06b6d4;
|
| 24 |
+
--text: #e2e8f0;
|
| 25 |
+
--text-muted: #8892a4;
|
| 26 |
+
--text-dim: #4b5563;
|
| 27 |
+
--radius: 14px;
|
| 28 |
+
--radius-sm: 8px;
|
| 29 |
+
--transition: 0.22s cubic-bezier(0.4, 0, 0.2, 1);
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
html { scroll-behavior: smooth; }
|
| 33 |
+
|
| 34 |
+
body {
|
| 35 |
+
font-family: 'Inter', system-ui, sans-serif;
|
| 36 |
+
background: var(--bg);
|
| 37 |
+
color: var(--text);
|
| 38 |
+
min-height: 100vh;
|
| 39 |
+
overflow-x: hidden;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
body::before {
|
| 43 |
+
content: '';
|
| 44 |
+
position: fixed;
|
| 45 |
+
inset: 0;
|
| 46 |
+
background:
|
| 47 |
+
radial-gradient(ellipse 70% 50% at 15% 20%, rgba(124,58,237,0.12) 0%, transparent 60%),
|
| 48 |
+
radial-gradient(ellipse 50% 40% at 85% 75%, rgba(6,182,212,0.10) 0%, transparent 60%),
|
| 49 |
+
radial-gradient(ellipse 40% 35% at 50% 5%, rgba(168,85,247,0.08) 0%, transparent 55%);
|
| 50 |
+
pointer-events: none;
|
| 51 |
+
z-index: 0;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
/* ── Layout ────────────────────────────────────────────────────────── */
|
| 55 |
+
.wrapper {
|
| 56 |
+
position: relative;
|
| 57 |
+
z-index: 1;
|
| 58 |
+
max-width: 920px;
|
| 59 |
+
margin: 0 auto;
|
| 60 |
+
padding: 2.5rem 1.25rem 4rem;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
/* ── Hero ──────────────────────────────────────────────────────────── */
|
| 64 |
+
.hero { text-align: center; margin-bottom: 2.5rem; }
|
| 65 |
+
|
| 66 |
+
.hero-badge {
|
| 67 |
+
display: inline-flex;
|
| 68 |
+
align-items: center;
|
| 69 |
+
gap: 0.45rem;
|
| 70 |
+
background: rgba(124,58,237,0.15);
|
| 71 |
+
border: 1px solid rgba(124,58,237,0.35);
|
| 72 |
+
border-radius: 100px;
|
| 73 |
+
padding: 0.28rem 0.9rem;
|
| 74 |
+
font-size: 0.75rem;
|
| 75 |
+
font-weight: 600;
|
| 76 |
+
color: var(--purple-light);
|
| 77 |
+
letter-spacing: 0.04em;
|
| 78 |
+
text-transform: uppercase;
|
| 79 |
+
margin-bottom: 1rem;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
.hero h1 {
|
| 83 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 84 |
+
font-size: clamp(2rem, 5vw, 3.2rem);
|
| 85 |
+
font-weight: 700;
|
| 86 |
+
line-height: 1.15;
|
| 87 |
+
background: linear-gradient(135deg, #c084fc 0%, #818cf8 40%, #38bdf8 100%);
|
| 88 |
+
-webkit-background-clip: text;
|
| 89 |
+
-webkit-text-fill-color: transparent;
|
| 90 |
+
background-clip: text;
|
| 91 |
+
margin-bottom: 0.7rem;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
.hero p {
|
| 95 |
+
color: var(--text-muted);
|
| 96 |
+
font-size: 0.95rem;
|
| 97 |
+
max-width: 520px;
|
| 98 |
+
margin: 0 auto;
|
| 99 |
+
line-height: 1.6;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
/* ── Tab navigation ────────────────────────────────────────────────── */
|
| 103 |
+
.tab-nav {
|
| 104 |
+
display: flex;
|
| 105 |
+
gap: 0.5rem;
|
| 106 |
+
background: var(--surface);
|
| 107 |
+
border: 1px solid var(--border);
|
| 108 |
+
border-radius: var(--radius);
|
| 109 |
+
padding: 0.4rem;
|
| 110 |
+
margin-bottom: 2rem;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.tab-btn {
|
| 114 |
+
flex: 1;
|
| 115 |
+
display: flex;
|
| 116 |
+
align-items: center;
|
| 117 |
+
justify-content: center;
|
| 118 |
+
gap: 0.5rem;
|
| 119 |
+
padding: 0.7rem 1.2rem;
|
| 120 |
+
border: none;
|
| 121 |
+
border-radius: var(--radius-sm);
|
| 122 |
+
background: transparent;
|
| 123 |
+
color: var(--text-muted);
|
| 124 |
+
font-family: 'Inter', sans-serif;
|
| 125 |
+
font-size: 0.88rem;
|
| 126 |
+
font-weight: 500;
|
| 127 |
+
cursor: pointer;
|
| 128 |
+
transition: var(--transition);
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
.tab-btn:hover { color: var(--text); background: rgba(255,255,255,0.05); }
|
| 132 |
+
|
| 133 |
+
.tab-btn.active {
|
| 134 |
+
background: linear-gradient(135deg, rgba(124,58,237,0.35), rgba(6,182,212,0.2));
|
| 135 |
+
color: #fff;
|
| 136 |
+
font-weight: 600;
|
| 137 |
+
box-shadow: 0 0 0 1px rgba(124,58,237,0.5) inset;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
/* ── Tab panels ────────────────────────────────────────────────────── */
|
| 141 |
+
.tab-panel { display: none; }
|
| 142 |
+
.tab-panel.active { display: block; }
|
| 143 |
+
|
| 144 |
+
/* ── Glass card ────────────────────────────────────────────────────── */
|
| 145 |
+
.card {
|
| 146 |
+
background: linear-gradient(135deg, rgba(14,14,40,0.9) 0%, rgba(20,20,58,0.75) 100%);
|
| 147 |
+
border: 1px solid var(--border);
|
| 148 |
+
border-radius: var(--radius);
|
| 149 |
+
padding: 1.6rem;
|
| 150 |
+
margin-bottom: 1.25rem;
|
| 151 |
+
backdrop-filter: blur(12px);
|
| 152 |
+
transition: border-color var(--transition), box-shadow var(--transition);
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
+
.card:hover { border-color: var(--border-hover); }
|
| 156 |
+
|
| 157 |
+
/* ── Platform header ───────────────────────────────────────────────── */
|
| 158 |
+
.platform-header {
|
| 159 |
+
display: flex;
|
| 160 |
+
align-items: center;
|
| 161 |
+
justify-content: space-between;
|
| 162 |
+
margin-bottom: 1.1rem;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
.platform-title {
|
| 166 |
+
display: flex;
|
| 167 |
+
align-items: center;
|
| 168 |
+
gap: 0.6rem;
|
| 169 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 170 |
+
font-size: 1rem;
|
| 171 |
+
font-weight: 600;
|
| 172 |
+
color: #c4b5fd;
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
.platform-icon {
|
| 176 |
+
width: 32px;
|
| 177 |
+
height: 32px;
|
| 178 |
+
border-radius: 8px;
|
| 179 |
+
display: flex;
|
| 180 |
+
align-items: center;
|
| 181 |
+
justify-content: center;
|
| 182 |
+
font-size: 1rem;
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
.pi-instagram { background: linear-gradient(135deg, #f09433, #e6683c, #dc2743, #cc2366, #bc1888); }
|
| 186 |
+
.pi-tiktok { background: #161823; border: 1px solid #333; }
|
| 187 |
+
.pi-facebook { background: #1877f2; }
|
| 188 |
+
.pi-news { background: linear-gradient(135deg, #0ea5e9, #6366f1); }
|
| 189 |
+
.pi-dataset { background: linear-gradient(135deg, #059669, #0891b2); }
|
| 190 |
+
|
| 191 |
+
/* ── Toggle switch ─────────────────────────────────────────────────── */
|
| 192 |
+
.toggle-wrap { display: flex; align-items: center; gap: 0.6rem; }
|
| 193 |
+
|
| 194 |
+
.toggle-label { font-size: 0.78rem; color: var(--text-dim); font-weight: 500; }
|
| 195 |
+
|
| 196 |
+
.toggle { position: relative; width: 42px; height: 24px; }
|
| 197 |
+
|
| 198 |
+
.toggle input { opacity: 0; width: 0; height: 0; }
|
| 199 |
+
|
| 200 |
+
.slider {
|
| 201 |
+
position: absolute;
|
| 202 |
+
inset: 0;
|
| 203 |
+
background: rgba(255,255,255,0.1);
|
| 204 |
+
border-radius: 100px;
|
| 205 |
+
cursor: pointer;
|
| 206 |
+
transition: var(--transition);
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
.slider::before {
|
| 210 |
+
content: '';
|
| 211 |
+
position: absolute;
|
| 212 |
+
width: 18px;
|
| 213 |
+
height: 18px;
|
| 214 |
+
left: 3px;
|
| 215 |
+
top: 3px;
|
| 216 |
+
background: white;
|
| 217 |
+
border-radius: 50%;
|
| 218 |
+
transition: var(--transition);
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
.toggle input:checked + .slider { background: linear-gradient(135deg, var(--purple), var(--cyan)); }
|
| 222 |
+
.toggle input:checked + .slider::before { transform: translateX(18px); }
|
| 223 |
+
|
| 224 |
+
.platform-fields {
|
| 225 |
+
overflow: hidden;
|
| 226 |
+
transition: max-height 0.35s ease, opacity 0.3s ease;
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
.platform-fields.collapsed {
|
| 230 |
+
max-height: 0 !important;
|
| 231 |
+
opacity: 0;
|
| 232 |
+
pointer-events: none;
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
/* ── Form elements ─────────────────────────────────────────────────── */
|
| 236 |
+
.form-row { display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; }
|
| 237 |
+
.form-row.cols-3 { grid-template-columns: 1fr 1fr 1fr; }
|
| 238 |
+
.form-group { display: flex; flex-direction: column; gap: 0.3rem; }
|
| 239 |
+
.form-group.full { grid-column: 1 / -1; }
|
| 240 |
+
|
| 241 |
+
label { font-size: 0.78rem; color: var(--text-muted); font-weight: 500; letter-spacing: 0.01em; }
|
| 242 |
+
|
| 243 |
+
input[type="text"],
|
| 244 |
+
input[type="password"],
|
| 245 |
+
input[type="number"],
|
| 246 |
+
textarea,
|
| 247 |
+
select {
|
| 248 |
+
background: rgba(7,7,26,0.7);
|
| 249 |
+
border: 1px solid rgba(130,100,255,0.2);
|
| 250 |
+
border-radius: var(--radius-sm);
|
| 251 |
+
color: var(--text);
|
| 252 |
+
padding: 0.65rem 0.9rem;
|
| 253 |
+
font-family: 'Inter', sans-serif;
|
| 254 |
+
font-size: 0.88rem;
|
| 255 |
+
width: 100%;
|
| 256 |
+
transition: border-color var(--transition), box-shadow var(--transition);
|
| 257 |
+
outline: none;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
input::placeholder, textarea::placeholder { color: var(--text-dim); }
|
| 261 |
+
|
| 262 |
+
input:focus, textarea:focus, select:focus {
|
| 263 |
+
border-color: var(--purple);
|
| 264 |
+
box-shadow: 0 0 0 3px rgba(124,58,237,0.2);
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
select option { background: var(--surface-2); }
|
| 268 |
+
textarea { resize: vertical; min-height: 88px; line-height: 1.5; }
|
| 269 |
+
|
| 270 |
+
.field-hint { font-size: 0.72rem; color: var(--text-dim); line-height: 1.4; margin-top: 0.2rem; }
|
| 271 |
+
|
| 272 |
+
/* ── Cookie tabs ───────────────────────────────────────────────────── */
|
| 273 |
+
.cookie-tabs { display: flex; gap: 0.3rem; margin-bottom: 0.5rem; }
|
| 274 |
+
|
| 275 |
+
.cookie-tab-btn {
|
| 276 |
+
padding: 0.25rem 0.7rem;
|
| 277 |
+
font-size: 0.72rem;
|
| 278 |
+
font-weight: 600;
|
| 279 |
+
border: 1px solid rgba(130,100,255,0.25);
|
| 280 |
+
border-radius: 6px;
|
| 281 |
+
background: transparent;
|
| 282 |
+
color: var(--text-muted);
|
| 283 |
+
cursor: pointer;
|
| 284 |
+
transition: var(--transition);
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.cookie-tab-btn.active {
|
| 288 |
+
background: rgba(124,58,237,0.25);
|
| 289 |
+
color: #c4b5fd;
|
| 290 |
+
border-color: rgba(124,58,237,0.5);
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
/* ── Tag hint ──────────────────────────────────────────────────────── */
|
| 294 |
+
.tag-hint {
|
| 295 |
+
display: inline-flex;
|
| 296 |
+
align-items: center;
|
| 297 |
+
gap: 0.3rem;
|
| 298 |
+
font-size: 0.72rem;
|
| 299 |
+
color: var(--cyan);
|
| 300 |
+
background: rgba(6,182,212,0.1);
|
| 301 |
+
border: 1px solid rgba(6,182,212,0.25);
|
| 302 |
+
border-radius: 6px;
|
| 303 |
+
padding: 0.15rem 0.55rem;
|
| 304 |
+
margin-top: 0.3rem;
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
/* ── Portal chips ──────────────────────────────────────────────────── */
|
| 308 |
+
.portal-grid {
|
| 309 |
+
display: grid;
|
| 310 |
+
grid-template-columns: repeat(auto-fill, minmax(160px, 1fr));
|
| 311 |
+
gap: 0.5rem;
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
.portal-chip {
|
| 315 |
+
display: flex;
|
| 316 |
+
align-items: center;
|
| 317 |
+
gap: 0.5rem;
|
| 318 |
+
padding: 0.55rem 0.75rem;
|
| 319 |
+
border: 1px solid rgba(130,100,255,0.2);
|
| 320 |
+
border-radius: var(--radius-sm);
|
| 321 |
+
cursor: pointer;
|
| 322 |
+
background: rgba(7,7,26,0.5);
|
| 323 |
+
transition: var(--transition);
|
| 324 |
+
user-select: none;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
.portal-chip:hover { border-color: rgba(130,100,255,0.45); background: rgba(124,58,237,0.1); }
|
| 328 |
+
.portal-chip input[type="checkbox"] { display: none; }
|
| 329 |
+
.portal-chip.checked { border-color: var(--purple); background: rgba(124,58,237,0.2); }
|
| 330 |
+
|
| 331 |
+
.chip-label { font-size: 0.82rem; font-weight: 500; color: var(--text-muted); }
|
| 332 |
+
.portal-chip.checked .chip-label { color: var(--text); }
|
| 333 |
+
|
| 334 |
+
.chip-dot {
|
| 335 |
+
width: 8px;
|
| 336 |
+
height: 8px;
|
| 337 |
+
border-radius: 50%;
|
| 338 |
+
background: var(--text-dim);
|
| 339 |
+
flex-shrink: 0;
|
| 340 |
+
transition: var(--transition);
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.portal-chip.checked .chip-dot { background: var(--purple-light); }
|
| 344 |
+
|
| 345 |
+
/* ── Submit button ─────────────────────────────────────────────────── */
|
| 346 |
+
.btn-submit {
|
| 347 |
+
display: flex;
|
| 348 |
+
align-items: center;
|
| 349 |
+
justify-content: center;
|
| 350 |
+
gap: 0.6rem;
|
| 351 |
+
width: 100%;
|
| 352 |
+
padding: 1rem;
|
| 353 |
+
background: linear-gradient(135deg, #7c3aed 0%, #4f46e5 50%, #0891b2 100%);
|
| 354 |
+
border: none;
|
| 355 |
+
border-radius: var(--radius);
|
| 356 |
+
color: #fff;
|
| 357 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 358 |
+
font-size: 1rem;
|
| 359 |
+
font-weight: 600;
|
| 360 |
+
cursor: pointer;
|
| 361 |
+
transition: opacity var(--transition), transform var(--transition), box-shadow var(--transition);
|
| 362 |
+
letter-spacing: 0.02em;
|
| 363 |
+
margin-top: 0.5rem;
|
| 364 |
+
position: relative;
|
| 365 |
+
overflow: hidden;
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
.btn-submit::before {
|
| 369 |
+
content: '';
|
| 370 |
+
position: absolute;
|
| 371 |
+
inset: 0;
|
| 372 |
+
background: linear-gradient(135deg, rgba(255,255,255,0.12), transparent);
|
| 373 |
+
opacity: 0;
|
| 374 |
+
transition: opacity var(--transition);
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
.btn-submit:hover::before { opacity: 1; }
|
| 378 |
+
.btn-submit:hover { transform: translateY(-2px); box-shadow: 0 8px 32px rgba(124,58,237,0.45); }
|
| 379 |
+
.btn-submit:active { transform: translateY(0); }
|
| 380 |
+
.btn-submit:disabled { opacity: 0.65; pointer-events: none; cursor: not-allowed; transform: none; }
|
| 381 |
+
|
| 382 |
+
/* ── Spinner ───────────────────────────────────────────────────────── */
|
| 383 |
+
.spinner {
|
| 384 |
+
display: none;
|
| 385 |
+
width: 18px;
|
| 386 |
+
height: 18px;
|
| 387 |
+
border: 2.5px solid rgba(255,255,255,0.3);
|
| 388 |
+
border-top-color: #fff;
|
| 389 |
+
border-radius: 50%;
|
| 390 |
+
animation: spin 0.7s linear infinite;
|
| 391 |
+
flex-shrink: 0;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 395 |
+
|
| 396 |
+
/* ββ Alert ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 397 |
+
.alert {
|
| 398 |
+
border-radius: var(--radius);
|
| 399 |
+
padding: 1rem 1.25rem;
|
| 400 |
+
margin-bottom: 1.5rem;
|
| 401 |
+
font-size: 0.88rem;
|
| 402 |
+
border: 1px solid;
|
| 403 |
+
display: flex;
|
| 404 |
+
gap: 0.6rem;
|
| 405 |
+
align-items: flex-start;
|
| 406 |
+
}
|
| 407 |
+
|
| 408 |
+
.alert-error {
|
| 409 |
+
background: rgba(239,68,68,0.08);
|
| 410 |
+
border-color: rgba(239,68,68,0.3);
|
| 411 |
+
color: #fca5a5;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
/* ββ Results section ββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 415 |
+
.results-section { margin-top: 2.5rem; }
|
| 416 |
+
|
| 417 |
+
.results-header {
|
| 418 |
+
display: flex;
|
| 419 |
+
align-items: center;
|
| 420 |
+
gap: 0.6rem;
|
| 421 |
+
margin-bottom: 1.5rem;
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
.results-header h2 {
|
| 425 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 426 |
+
font-size: 1.3rem;
|
| 427 |
+
font-weight: 700;
|
| 428 |
+
background: linear-gradient(135deg, var(--cyan), var(--purple-light));
|
| 429 |
+
-webkit-background-clip: text;
|
| 430 |
+
-webkit-text-fill-color: transparent;
|
| 431 |
+
background-clip: text;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
.stats-strip {
|
| 435 |
+
font-size: 0.8rem;
|
| 436 |
+
color: var(--text-dim);
|
| 437 |
+
background: rgba(255,255,255,0.04);
|
| 438 |
+
border: 1px solid var(--border);
|
| 439 |
+
border-radius: 8px;
|
| 440 |
+
padding: 0.4rem 0.9rem;
|
| 441 |
+
margin-left: auto;
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
/* ββ Sentiment cards βββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 445 |
+
.sentiment-grid {
|
| 446 |
+
display: grid;
|
| 447 |
+
grid-template-columns: repeat(3, 1fr);
|
| 448 |
+
gap: 1rem;
|
| 449 |
+
margin-bottom: 1.5rem;
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
.s-card {
|
| 453 |
+
border-radius: var(--radius);
|
| 454 |
+
padding: 1.4rem 1rem;
|
| 455 |
+
text-align: center;
|
| 456 |
+
border: 1px solid;
|
| 457 |
+
position: relative;
|
| 458 |
+
overflow: hidden;
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
.s-card::before { content: ''; position: absolute; inset: 0; opacity: 0.06; border-radius: inherit; }
|
| 462 |
+
|
| 463 |
+
.s-card.positif { background: rgba(34,197,94,0.08); border-color: rgba(34,197,94,0.3); }
|
| 464 |
+
.s-card.positif::before { background: #22c55e; }
|
| 465 |
+
.s-card.negatif { background: rgba(239,68,68,0.08); border-color: rgba(239,68,68,0.3); }
|
| 466 |
+
.s-card.negatif::before { background: #ef4444; }
|
| 467 |
+
.s-card.netral { background: rgba(148,163,184,0.06); border-color: rgba(148,163,184,0.2); }
|
| 468 |
+
.s-card.netral::before { background: #94a3b8; }
|
| 469 |
+
|
| 470 |
+
.s-count { font-family: 'Space Grotesk', sans-serif; font-size: 2.8rem; font-weight: 700; line-height: 1; margin-bottom: 0.3rem; }
|
| 471 |
+
.s-card.positif .s-count { color: #4ade80; }
|
| 472 |
+
.s-card.negatif .s-count { color: #f87171; }
|
| 473 |
+
.s-card.netral .s-count { color: #94a3b8; }
|
| 474 |
+
|
| 475 |
+
.s-label { font-size: 0.82rem; color: var(--text-muted); font-weight: 500; }
|
| 476 |
+
|
| 477 |
+
.s-bar-wrap { margin-top: 0.8rem; height: 4px; background: rgba(255,255,255,0.08); border-radius: 100px; overflow: hidden; }
|
| 478 |
+
.s-bar { height: 100%; border-radius: 100px; transition: width 1.2s cubic-bezier(0.4,0,0.2,1); }
|
| 479 |
+
.s-card.positif .s-bar { background: linear-gradient(90deg, #16a34a, #4ade80); }
|
| 480 |
+
.s-card.negatif .s-bar { background: linear-gradient(90deg, #b91c1c, #f87171); }
|
| 481 |
+
.s-card.netral .s-bar { background: linear-gradient(90deg, #475569, #94a3b8); }
|
| 482 |
+
|
| 483 |
+
/* ββ Word cloud βββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 484 |
+
.wordcloud-card {
|
| 485 |
+
background: var(--surface);
|
| 486 |
+
border: 1px solid var(--border);
|
| 487 |
+
border-radius: var(--radius);
|
| 488 |
+
padding: 1.5rem;
|
| 489 |
+
text-align: center;
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
.wordcloud-card h3 {
|
| 493 |
+
font-family: 'Space Grotesk', sans-serif;
|
| 494 |
+
font-size: 1rem;
|
| 495 |
+
color: var(--purple-light);
|
| 496 |
+
margin-bottom: 1rem;
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
.wordcloud-img { max-width: 100%; border-radius: 10px; border: 1px solid var(--border); }
|
| 500 |
+
|
| 501 |
+
/* ββ Divider ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 502 |
+
.divider {
|
| 503 |
+
display: flex;
|
| 504 |
+
align-items: center;
|
| 505 |
+
gap: 0.75rem;
|
| 506 |
+
color: var(--text-dim);
|
| 507 |
+
font-size: 0.75rem;
|
| 508 |
+
margin: 0.75rem 0;
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
.divider::before, .divider::after { content: ''; flex: 1; height: 1px; background: var(--border); }
|
| 512 |
+
|
| 513 |
+
/* ββ Section label ββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 514 |
+
.section-label {
|
| 515 |
+
font-size: 0.7rem;
|
| 516 |
+
font-weight: 700;
|
| 517 |
+
text-transform: uppercase;
|
| 518 |
+
letter-spacing: 0.08em;
|
| 519 |
+
color: var(--text-dim);
|
| 520 |
+
margin-bottom: 0.6rem;
|
| 521 |
+
}
|
| 522 |
+
|
| 523 |
+
/* ββ File upload ββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 524 |
+
.upload-zone {
|
| 525 |
+
border: 2px dashed rgba(130,100,255,0.28);
|
| 526 |
+
border-radius: var(--radius);
|
| 527 |
+
padding: 2.5rem 1.5rem;
|
| 528 |
+
text-align: center;
|
| 529 |
+
transition: var(--transition);
|
| 530 |
+
cursor: pointer;
|
| 531 |
+
background: rgba(124,58,237,0.04);
|
| 532 |
+
position: relative;
|
| 533 |
+
}
|
| 534 |
+
|
| 535 |
+
.upload-zone:hover, .upload-zone.drag-over { border-color: var(--purple); background: rgba(124,58,237,0.1); }
|
| 536 |
+
|
| 537 |
+
.upload-zone input[type="file"] { position: absolute; inset: 0; opacity: 0; cursor: pointer; width: 100%; height: 100%; }
|
| 538 |
+
|
| 539 |
+
.upload-icon { font-size: 2rem; margin-bottom: 0.5rem; }
|
| 540 |
+
.upload-text { font-size: 0.9rem; color: var(--text-muted); }
|
| 541 |
+
.upload-sub { font-size: 0.78rem; color: var(--text-dim); margin-top: 0.3rem; }
|
| 542 |
+
.upload-filename { display: none; margin-top: 0.6rem; font-size: 0.82rem; color: var(--cyan); font-weight: 500; }
|
| 543 |
+
|
| 544 |
+
/* ββ Responsive βββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 545 |
+
@media (max-width: 640px) {
|
| 546 |
+
.form-row { grid-template-columns: 1fr; }
|
| 547 |
+
.form-row.cols-3 { grid-template-columns: 1fr 1fr; }
|
| 548 |
+
.sentiment-grid { grid-template-columns: 1fr; }
|
| 549 |
+
.tab-btn span.tab-text { display: none; }
|
| 550 |
+
.hero h1 { font-size: 1.8rem; }
|
| 551 |
+
}
|
| 552 |
+
|
| 553 |
+
/* ββ Animations βββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 554 |
+
@keyframes fadeUp {
|
| 555 |
+
from { opacity: 0; transform: translateY(20px); }
|
| 556 |
+
to { opacity: 1; transform: translateY(0); }
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
.animate-in { animation: fadeUp 0.5s ease both; }
|
| 560 |
+
.delay-1 { animation-delay: 0.05s; }
|
| 561 |
+
.delay-2 { animation-delay: 0.10s; }
|
| 562 |
+
.delay-3 { animation-delay: 0.15s; }
|
| 563 |
+
.delay-4 { animation-delay: 0.20s; }
|
| 564 |
+
.delay-5 { animation-delay: 0.25s; }
|
| 565 |
+
</style>
|
| 566 |
+
</head>
|
| 567 |
+
<body>
|
| 568 |
+
<div class="wrapper">
|
| 569 |
+
|
| 570 |
+
<!-- Hero -->
|
| 571 |
+
<header class="hero animate-in">
|
| 572 |
+
<div class="hero-badge">π¬ AI-Powered</div>
|
| 573 |
+
<h1>SentiScope</h1>
|
| 574 |
+
<p>Analisis sentimen media sosial otomatis dengan IndoBERT β Instagram, TikTok, Facebook & Berita Online.</p>
|
| 575 |
+
</header>
|
| 576 |
+
|
| 577 |
+
<!-- Error alert -->
|
| 578 |
+
{% if error %}
|
| 579 |
+
<div class="alert alert-error animate-in" role="alert">
|
| 580 |
+
<span>β οΈ</span>
|
| 581 |
+
<span>{{ error }}</span>
|
| 582 |
+
</div>
|
| 583 |
+
{% endif %}
|
| 584 |
+
|
| 585 |
+
<!-- Tab navigation -->
|
| 586 |
+
<nav class="tab-nav animate-in delay-1" role="tablist">
|
| 587 |
+
<button class="tab-btn {% if active_tab != 'dataset' %}active{% endif %}"
|
| 588 |
+
id="tab-scraping" role="tab" onclick="switchTab('scraping')">
|
| 589 |
+
<span class="tab-icon">π·οΈ</span>
|
| 590 |
+
<span class="tab-text">Scraping Otomatis</span>
|
| 591 |
+
</button>
|
| 592 |
+
<button class="tab-btn {% if active_tab == 'dataset' %}active{% endif %}"
|
| 593 |
+
id="tab-dataset" role="tab" onclick="switchTab('dataset')">
|
| 594 |
+
<span class="tab-icon">π</span>
|
| 595 |
+
<span class="tab-text">Upload Dataset</span>
|
| 596 |
+
</button>
|
| 597 |
+
</nav>
|
| 598 |
+
|
| 599 |
+
<!-- βββββββββββββββββββββββ TAB 1: Scraping βββββββββββββββββββββββββββ -->
|
| 600 |
+
<div class="tab-panel {% if active_tab != 'dataset' %}active{% endif %}" id="panel-scraping">
|
| 601 |
+
<form id="scraping-form" action="/process" method="post">
|
| 602 |
+
|
| 603 |
+
<!-- Hidden enable flags β managed by JS toggles -->
|
| 604 |
+
<input type="hidden" id="enable_instagram" name="enable_instagram" value="">
|
| 605 |
+
<input type="hidden" id="enable_tiktok" name="enable_tiktok" value="">
|
| 606 |
+
<input type="hidden" id="enable_facebook" name="enable_facebook" value="">
|
| 607 |
+
<input type="hidden" id="enable_news" name="enable_news" value="">
|
| 608 |
+
|
| 609 |
+
<!-- ββ Instagram ββββββββββββββββββββββββββββββββββββββββββββββββ -->
|
| 610 |
+
<div class="card animate-in delay-2">
|
| 611 |
+
<div class="platform-header">
|
| 612 |
+
<div class="platform-title">
|
| 613 |
+
<div class="platform-icon pi-instagram">πΈ</div>
|
| 614 |
+
Instagram
|
| 615 |
+
</div>
|
| 616 |
+
<div class="toggle-wrap">
|
| 617 |
+
<span class="toggle-label" id="ig-toggle-label">Nonaktif</span>
|
| 618 |
+
<label class="toggle">
|
| 619 |
+
<input type="checkbox" id="ig-toggle" onchange="togglePlatform('ig')">
|
| 620 |
+
<span class="slider"></span>
|
| 621 |
+
</label>
|
| 622 |
+
</div>
|
| 623 |
+
</div>
|
| 624 |
+
<div class="platform-fields collapsed" id="ig-fields" style="max-height:600px;">
|
| 625 |
+
<div class="form-row" style="margin-bottom:0.9rem;">
|
| 626 |
+
<div class="form-group">
|
| 627 |
+
<label for="ig_username">Username Instagram</label>
|
| 628 |
+
<input id="ig_username" type="text" name="ig_username" placeholder="akun_instagram" autocomplete="username">
|
| 629 |
+
</div>
|
| 630 |
+
<div class="form-group">
|
| 631 |
+
<label for="ig_password">Password Instagram</label>
|
| 632 |
+
<input id="ig_password" type="password" name="ig_password" placeholder="β’β’β’β’β’β’β’β’" autocomplete="current-password">
|
| 633 |
+
</div>
|
| 634 |
+
</div>
|
| 635 |
+
<div class="form-row">
|
| 636 |
+
<div class="form-group full">
|
| 637 |
+
<label for="target_accounts">Target Akun / #Hashtag (satu per baris)</label>
|
| 638 |
+
<textarea id="target_accounts" name="target_accounts"
|
| 639 |
+
placeholder="cirebonkab @rctvcirebon #jalanrusak"></textarea>
|
| 640 |
+
<span class="tag-hint">β΅ Satu target per baris, @ dan # opsional</span>
|
| 641 |
+
</div>
|
| 642 |
+
<div class="form-group">
|
| 643 |
+
<label for="mode">Mode Waktu</label>
|
| 644 |
+
<select id="mode" name="mode">
|
| 645 |
+
<option value="all">Semua Postingan</option>
|
| 646 |
+
<option value="date">7 Bulan Terakhir</option>
|
| 647 |
+
</select>
|
| 648 |
+
</div>
|
| 649 |
+
</div>
|
| 650 |
+
</div>
|
| 651 |
+
</div>
|
| 652 |
+
|
| 653 |
+
<!-- ββ TikTok ββοΏ½οΏ½βββββββββββββββββββββββββββββββββββββββββββββββββ -->
|
| 654 |
+
<div class="card animate-in delay-3">
|
| 655 |
+
<div class="platform-header">
|
| 656 |
+
<div class="platform-title">
|
| 657 |
+
<div class="platform-icon pi-tiktok">π΅</div>
|
| 658 |
+
TikTok
|
| 659 |
+
</div>
|
| 660 |
+
<div class="toggle-wrap">
|
| 661 |
+
<span class="toggle-label" id="tt-toggle-label">Nonaktif</span>
|
| 662 |
+
<label class="toggle">
|
| 663 |
+
<input type="checkbox" id="tt-toggle" onchange="togglePlatform('tt')">
|
| 664 |
+
<span class="slider"></span>
|
| 665 |
+
</label>
|
| 666 |
+
</div>
|
| 667 |
+
</div>
|
| 668 |
+
<div class="platform-fields collapsed" id="tt-fields" style="max-height:500px;">
|
| 669 |
+
<div class="form-group" style="margin-bottom:0.9rem;">
|
| 670 |
+
<label>Format Cookie TikTok</label>
|
| 671 |
+
<div class="cookie-tabs">
|
| 672 |
+
<button type="button" class="cookie-tab-btn active" onclick="setCookieHint('raw',this)">String Mentah</button>
|
| 673 |
+
<button type="button" class="cookie-tab-btn" onclick="setCookieHint('json_arr',this)">JSON Array</button>
|
| 674 |
+
<button type="button" class="cookie-tab-btn" onclick="setCookieHint('json_obj',this)">JSON Object</button>
|
| 675 |
+
</div>
|
| 676 |
+
<textarea id="tiktok_cookie" name="tiktok_cookie"
|
| 677 |
+
placeholder="sessionid=xxx; tt_webid=yyy; ..."
|
| 678 |
+
style="min-height:70px;font-family:monospace;font-size:0.8rem;"></textarea>
|
| 679 |
+
<p class="field-hint" id="cookie-hint">
|
| 680 |
+
Format: <code>sessionid=ABC; tt_webid=123</code> β ambil dari DevTools β Application β Cookies β tiktok.com
|
| 681 |
+
</p>
|
| 682 |
+
</div>
|
| 683 |
+
<div class="form-group">
|
| 684 |
+
<label for="tiktok_targets">Target Username TikTok (satu per baris)</label>
|
| 685 |
+
<textarea id="tiktok_targets" name="tiktok_targets"
|
| 686 |
+
placeholder="@rctvcirebon @cirebonnews kuningan_update"></textarea>
|
| 687 |
+
<span class="tag-hint">β΅ Satu username per baris, @ opsional</span>
|
| 688 |
+
</div>
|
| 689 |
+
</div>
|
| 690 |
+
</div>
|
| 691 |
+
|
| 692 |
+
<!-- ββ Facebook ββββββββββββββββββββββββββββββββββββββββββββββββββ -->
|
| 693 |
+
<div class="card animate-in delay-3">
|
| 694 |
+
<div class="platform-header">
|
| 695 |
+
<div class="platform-title">
|
| 696 |
+
<div class="platform-icon pi-facebook">π</div>
|
| 697 |
+
Facebook
|
| 698 |
+
</div>
|
| 699 |
+
<div class="toggle-wrap">
|
| 700 |
+
<span class="toggle-label" id="fb-toggle-label">Nonaktif</span>
|
| 701 |
+
<label class="toggle">
|
| 702 |
+
<input type="checkbox" id="fb-toggle" onchange="togglePlatform('fb')">
|
| 703 |
+
<span class="slider"></span>
|
| 704 |
+
</label>
|
| 705 |
+
</div>
|
| 706 |
+
</div>
|
| 707 |
+
<div class="platform-fields collapsed" id="fb-fields" style="max-height:500px;">
|
| 708 |
+
<div class="form-row" style="margin-bottom:0.9rem;">
|
| 709 |
+
<div class="form-group">
|
| 710 |
+
<label for="fb_username">Email / No. HP Facebook</label>
|
| 711 |
+
<input id="fb_username" type="text" name="fb_username" placeholder="email@contoh.com" autocomplete="username">
|
| 712 |
+
</div>
|
| 713 |
+
<div class="form-group">
|
| 714 |
+
<label for="fb_password">Password Facebook</label>
|
| 715 |
+
<input id="fb_password" type="password" name="fb_password" placeholder="β’β’β’β’β’β’β’β’" autocomplete="current-password">
|
| 716 |
+
</div>
|
| 717 |
+
</div>
|
| 718 |
+
<div class="form-group">
|
| 719 |
+
<label for="facebook_groups">URL Grup Facebook (satu per baris, wajib diisi)</label>
|
| 720 |
+
<textarea id="facebook_groups" name="facebook_groups"
|
| 721 |
+
placeholder="https://web.facebook.com/groups/123456 https://web.facebook.com/groups/teraswarga"></textarea>
|
| 722 |
+
<p class="field-hint">β οΈ Harus diisi β tidak ada grup default. Jika kosong, Facebook tidak akan di-scrape.</p>
|
| 723 |
+
</div>
|
| 724 |
+
</div>
|
| 725 |
+
</div>
|
| 726 |
+
|
| 727 |
+
<!-- ββ Berita Online βββββββββββββββββββββββββββββββββββββββββββββ -->
|
| 728 |
+
<div class="card animate-in delay-4">
|
| 729 |
+
<div class="platform-header">
|
| 730 |
+
<div class="platform-title">
|
| 731 |
+
<div class="platform-icon pi-news">π°</div>
|
| 732 |
+
Berita Online
|
| 733 |
+
</div>
|
| 734 |
+
<div class="toggle-wrap">
|
| 735 |
+
<span class="toggle-label" id="news-toggle-label">Nonaktif</span>
|
| 736 |
+
<label class="toggle">
|
| 737 |
+
<input type="checkbox" id="news-toggle" onchange="togglePlatform('news')">
|
| 738 |
+
<span class="slider"></span>
|
| 739 |
+
</label>
|
| 740 |
+
</div>
|
| 741 |
+
</div>
|
| 742 |
+
<div class="platform-fields collapsed" id="news-fields" style="max-height:500px;">
|
| 743 |
+
<div class="section-label">Pilih Portal (bisa lebih dari satu)</div>
|
| 744 |
+
<div class="portal-grid" id="portal-grid">
|
| 745 |
+
<label class="portal-chip" onclick="toggleChip(this)">
|
| 746 |
+
<input type="checkbox" name="_portal_detik" value="detik">
|
| 747 |
+
<span class="chip-dot"></span><span class="chip-label">Detik.com</span>
|
| 748 |
+
</label>
|
| 749 |
+
<label class="portal-chip" onclick="toggleChip(this)">
|
| 750 |
+
<input type="checkbox" name="_portal_antara" value="antara">
|
| 751 |
+
<span class="chip-dot"></span><span class="chip-label">Antara News</span>
|
| 752 |
+
</label>
|
| 753 |
+
<label class="portal-chip" onclick="toggleChip(this)">
|
| 754 |
+
<input type="checkbox" name="_portal_radar" value="radar">
|
| 755 |
+
<span class="chip-dot"></span><span class="chip-label">Radar (Disway)</span>
|
| 756 |
+
</label>
|
| 757 |
+
<label class="portal-chip" onclick="toggleChip(this)">
|
| 758 |
+
<input type="checkbox" name="_portal_radarcirebon" value="radarcirebon">
|
| 759 |
+
<span class="chip-dot"></span><span class="chip-label">Radar Cirebon ID</span>
|
| 760 |
+
</label>
|
| 761 |
+
<label class="portal-chip" onclick="toggleChip(this)">
|
| 762 |
+
<input type="checkbox" name="_portal_cnn" value="cnn">
|
| 763 |
+
<span class="chip-dot"></span><span class="chip-label">CNN Indonesia</span>
|
| 764 |
+
</label>
|
| 765 |
+
</div>
|
| 766 |
+
<!-- Hidden field filled by JS -->
|
| 767 |
+
<input type="hidden" id="news_portals" name="news_portals" value="">
|
| 768 |
+
<div class="form-row" style="margin-top:1rem;">
|
| 769 |
+
<div class="form-group">
|
| 770 |
+
<label for="news_keyword">Keyword Pencarian</label>
|
| 771 |
+
<input id="news_keyword" type="text" name="news_keyword" value="kabupaten cirebon" placeholder="kabupaten cirebon">
|
| 772 |
+
</div>
|
| 773 |
+
<div class="form-group">
|
| 774 |
+
<label for="news_pages">Jumlah Halaman per Portal</label>
|
| 775 |
+
<input id="news_pages" type="number" name="news_pages" value="1" min="1" max="20">
|
| 776 |
+
</div>
|
| 777 |
+
</div>
|
| 778 |
+
</div>
|
| 779 |
+
</div>
|
| 780 |
+
|
| 781 |
+
<button class="btn-submit animate-in delay-5" type="submit" id="scraping-submit">
|
| 782 |
+
<span class="spinner" id="scraping-spinner"></span>
|
| 783 |
+
<span id="scraping-btn-text">β‘ Mulai Scraping & Analisis</span>
|
| 784 |
+
</button>
|
| 785 |
+
</form>
|
| 786 |
+
</div>
|
| 787 |
+
|
| 788 |
+
<!-- βββββββββββββββββββββββ TAB 2: Dataset ββββββββββββββββββββββββββββ -->
|
| 789 |
+
<div class="tab-panel {% if active_tab == 'dataset' %}active{% endif %}" id="panel-dataset">
|
| 790 |
+
<form id="dataset-form" action="/wordcloud-dataset" method="post" enctype="multipart/form-data">
|
| 791 |
+
<div class="card animate-in">
|
| 792 |
+
<div class="platform-header">
|
| 793 |
+
<div class="platform-title">
|
| 794 |
+
<div class="platform-icon pi-dataset">π</div>
|
| 795 |
+
Upload Dataset
|
| 796 |
+
</div>
|
| 797 |
+
</div>
|
| 798 |
+
|
| 799 |
+
<div class="form-group" style="margin-bottom:1.25rem;">
|
| 800 |
+
<label>File Dataset (CSV, JSON, atau TXT)</label>
|
| 801 |
+
<div class="upload-zone" id="upload-zone">
|
| 802 |
+
<input type="file" name="dataset_file" id="dataset_file"
|
| 803 |
+
accept=".csv,.json,.txt,.tsv"
|
| 804 |
+
onchange="showFilename(this)">
|
| 805 |
+
<div class="upload-icon">π</div>
|
| 806 |
+
<div class="upload-text">Klik atau seret file ke sini</div>
|
| 807 |
+
<div class="upload-sub">Mendukung .csv, .json, .txt β maks 50 MB</div>
|
| 808 |
+
<div class="upload-filename" id="upload-filename">β <span></span></div>
|
| 809 |
+
</div>
|
| 810 |
+
</div>
|
| 811 |
+
|
| 812 |
+
<div class="form-group" style="margin-bottom:1.25rem;">
|
| 813 |
+
<label for="text_column">Nama Kolom Teks (untuk CSV/JSON)</label>
|
| 814 |
+
<input id="text_column" type="text" name="text_column" value="text" placeholder="text / content / komentar">
|
| 815 |
+
<p class="field-hint">Kolom yang berisi teks yang akan dianalisis. Kosongkan untuk pakai kolom pertama.</p>
|
| 816 |
+
</div>
|
| 817 |
+
|
| 818 |
+
<div class="divider">atau paste teks langsung</div>
|
| 819 |
+
|
| 820 |
+
<div class="form-group">
|
| 821 |
+
<label for="dataset_text">Teks Dataset (satu dokumen/kalimat per baris)</label>
|
| 822 |
+
<textarea id="dataset_text" name="dataset_text" style="min-height:140px;"
|
| 823 |
+
placeholder="Masukkan teks di sini, satu kalimat per baris... Cirebon semakin maju dengan infrastruktur yang baik Jalan di daerah X masih rusak parah"></textarea>
|
| 824 |
+
</div>
|
| 825 |
+
</div>
|
| 826 |
+
|
| 827 |
+
<button class="btn-submit" type="submit" id="dataset-submit">
|
| 828 |
+
<span class="spinner" id="dataset-spinner"></span>
|
| 829 |
+
<span id="dataset-btn-text">βοΈ Buat Word Cloud & Analisis Sentimen</span>
|
| 830 |
+
</button>
|
| 831 |
+
</form>
|
| 832 |
+
</div>
|
| 833 |
+
|
| 834 |
+
<!-- βββββββββββββββββββββββ Hasil Analisis ββββββββββββββββββββββββββββ -->
|
| 835 |
+
{% if result %}
|
| 836 |
+
<section class="results-section animate-in">
|
| 837 |
+
<div class="results-header">
|
| 838 |
+
<h2>π Hasil Analisis Sentimen</h2>
|
| 839 |
+
<span class="stats-strip">{{ total_scraped }} teks dikumpulkan Β· {{ result.total }} dianalisis</span>
|
| 840 |
+
</div>
|
| 841 |
+
|
| 842 |
+
{% if csv_filename %}
|
| 843 |
+
<div style="margin-bottom: 1.5rem;">
|
| 844 |
+
<a href="{{ csv_filename }}" download class="btn-submit" style="display:inline-flex; width:auto; padding:0.7rem 1.25rem; background:linear-gradient(135deg, #059669, #10b981); text-decoration:none; font-size:0.9rem;">
|
| 845 |
+
π₯ Download Data Scraping (CSV)
|
| 846 |
+
</a>
|
| 847 |
+
</div>
|
| 848 |
+
{% endif %}
|
| 849 |
+
|
| 850 |
+
<div class="sentiment-grid">
|
| 851 |
+
{% set total = result.total if result.total > 0 else 1 %}
|
| 852 |
+
<div class="s-card positif">
|
| 853 |
+
<div class="s-count" id="count-pos">0</div>
|
| 854 |
+
<div class="s-label">π Positif</div>
|
| 855 |
+
<div class="s-bar-wrap"><div class="s-bar" id="bar-pos" style="width:0%"></div></div>
|
| 856 |
+
</div>
|
| 857 |
+
<div class="s-card negatif">
|
| 858 |
+
<div class="s-count" id="count-neg">0</div>
|
| 859 |
+
<div class="s-label">π Negatif</div>
|
| 860 |
+
<div class="s-bar-wrap"><div class="s-bar" id="bar-neg" style="width:0%"></div></div>
|
| 861 |
+
</div>
|
| 862 |
+
<div class="s-card netral">
|
| 863 |
+
<div class="s-count" id="count-neu">0</div>
|
| 864 |
+
<div class="s-label">π Netral</div>
|
| 865 |
+
<div class="s-bar-wrap"><div class="s-bar" id="bar-neu" style="width:0%"></div></div>
|
| 866 |
+
</div>
|
| 867 |
+
</div>
|
| 868 |
+
|
| 869 |
+
{% if image %}
|
| 870 |
+
<div class="wordcloud-card">
|
| 871 |
+
<h3>βοΈ Word Cloud</h3>
|
| 872 |
+
<img class="wordcloud-img" src="data:image/png;base64,{{ image }}" alt="Word Cloud">
|
| 873 |
+
</div>
|
| 874 |
+
{% endif %}
|
| 875 |
+
</section>
|
| 876 |
+
|
| 877 |
+
<script>
|
| 878 |
+
(function () {
|
| 879 |
+
var pos = {{ result.positif }};
|
| 880 |
+
var neg = {{ result.negatif }};
|
| 881 |
+
var neu = {{ result.netral }};
|
| 882 |
+
var total = {{ result.total if result.total > 0 else 1 }};
|
| 883 |
+
|
| 884 |
+
function animCount(el, target) {
|
| 885 |
+
var start = 0;
|
| 886 |
+
var step = Math.max(1, Math.ceil(target / 40));
|
| 887 |
+
var timer = setInterval(function () {
|
| 888 |
+
start = Math.min(start + step, target);
|
| 889 |
+
el.textContent = start;
|
| 890 |
+
if (start >= target) clearInterval(timer);
|
| 891 |
+
}, 25);
|
| 892 |
+
}
|
| 893 |
+
|
| 894 |
+
setTimeout(function () {
|
| 895 |
+
animCount(document.getElementById('count-pos'), pos);
|
| 896 |
+
animCount(document.getElementById('count-neg'), neg);
|
| 897 |
+
animCount(document.getElementById('count-neu'), neu);
|
| 898 |
+
document.getElementById('bar-pos').style.width = (pos / total * 100).toFixed(1) + '%';
|
| 899 |
+
document.getElementById('bar-neg').style.width = (neg / total * 100).toFixed(1) + '%';
|
| 900 |
+
document.getElementById('bar-neu').style.width = (neu / total * 100).toFixed(1) + '%';
|
| 901 |
+
}, 300);
|
| 902 |
+
})();
|
| 903 |
+
</script>
|
| 904 |
+
{% endif %}
|
| 905 |
+
|
| 906 |
+
</div><!-- /wrapper -->
|
| 907 |
+
|
| 908 |
+
<script>
|
| 909 |
+
// ββ Tab switching βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 910 |
+
/**
 * Activate one top-level tab and its panel, deactivating all the others.
 *
 * @param {string} name Tab suffix; the button is looked up by id
 *     `tab-<name>` and the panel by id `panel-<name>`.
 */
function switchTab(name) {
  var tabButton = document.getElementById('tab-' + name);
  var tabPanel = document.getElementById('panel-' + name);
  // Resolve both targets before touching anything: with an unknown name the
  // current tab stays visible instead of every "active" class being cleared
  // and the function then throwing on a null lookup.
  if (!tabButton || !tabPanel) return;

  document.querySelectorAll('.tab-btn').forEach(function (btn) {
    btn.classList.remove('active');
    // The buttons carry role="tab", so expose the selection state as well.
    btn.setAttribute('aria-selected', 'false');
  });
  document.querySelectorAll('.tab-panel').forEach(function (panel) {
    panel.classList.remove('active');
  });

  tabButton.classList.add('active');
  tabButton.setAttribute('aria-selected', 'true');
  tabPanel.classList.add('active');
}
|
| 916 |
+
|
| 917 |
+
// ββ Platform toggle βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 918 |
+
/**
 * Expand or collapse a platform's input fields to match its toggle switch,
 * update the "Aktif"/"Nonaktif" label and mirror the state into the hidden
 * enable flag that is posted with the scraping form.
 *
 * @param {string} id Platform prefix: 'ig', 'tt', 'fb' or 'news'.
 */
function togglePlatform(id) {
  // Maps the short prefix to the id of the hidden enable_* input.
  var flagMap = { ig: 'enable_instagram', tt: 'enable_tiktok', fb: 'enable_facebook', news: 'enable_news' };
  var fieldsEl = document.getElementById(id + '-fields');
  var switchEl = document.getElementById(id + '-toggle');
  var labelEl = document.getElementById(id + '-toggle-label');
  var enabled = switchEl.checked;

  fieldsEl.classList.toggle('collapsed', !enabled);
  if (labelEl) {
    labelEl.textContent = enabled ? 'Aktif' : 'Nonaktif';
  }
  // An empty string is what the form sends for a disabled platform.
  document.getElementById(flagMap[id]).value = enabled ? '1' : '';
}
|
| 934 |
+
|
| 935 |
+
// ββ Portal chip multi-select ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 936 |
+
/**
 * Sync a portal chip's "checked" class with its checkbox and refresh the
 * hidden news_portals field.
 *
 * The handler is attached to the <label> that wraps the checkbox, so the
 * browser already toggles the checkbox as the label's default action and the
 * resulting click on the input bubbles back to this handler. Inverting
 * `cb.checked` by hand therefore only worked because the handler ran twice on
 * a label click, and it undid any click or keyboard activation made directly
 * on the checkbox. Reading the native state is correct in every case: the
 * call triggered by the input's own click always sees the updated value.
 *
 * @param {HTMLElement} label The .portal-chip label that was clicked.
 */
function toggleChip(label) {
  var cb = label.querySelector('input[type="checkbox"]');
  if (!cb) return;
  label.classList.toggle('checked', cb.checked);
  updatePortalField();
}
|
| 942 |
+
|
| 943 |
+
/**
 * Write the values of all selected portal chips into the hidden
 * `news_portals` input as a comma-separated list.
 * A chip counts as selected when its label carries the "checked" class.
 */
function updatePortalField() {
  var selectedInputs = document.querySelectorAll('#portal-grid .portal-chip.checked input');
  var portalValues = Array.prototype.map.call(selectedInputs, function (input) {
    return input.value;
  });
  document.getElementById('news_portals').value = portalValues.join(',');
}
|
| 950 |
+
|
| 951 |
+
// ββ Cookie format hints βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 952 |
+
// Help text shown under the TikTok cookie textarea, keyed by cookie format.
// These strings are assigned through innerHTML, so the dash and arrows are
// written as HTML entities: that keeps them intact regardless of the file's
// encoding (the raw hint previously contained corrupted replacement
// characters in place of the separator).
var cookieHints = {
  raw: 'Format: <code>sessionid=ABC; tt_webid=123</code> &mdash; ambil dari DevTools &rarr; Application &rarr; Cookies &rarr; tiktok.com',
  json_arr: 'Format JSON Array: <code>[{"name":"sessionid","value":"ABC","domain":".tiktok.com"}]</code>',
  json_obj: 'Format JSON Object: <code>{"sessionid": "ABC", "tt_webid": "123"}</code>',
};

// Placeholder text for the TikTok cookie textarea, keyed by cookie format.
var cookiePlaceholders = {
  raw: 'sessionid=xxx; tt_webid=yyy; ...',
  json_arr: '[{"name":"sessionid","value":"xxx","domain":".tiktok.com"},...]',
  json_obj: '{"sessionid": "xxx", "tt_webid": "yyy"}',
};
|
| 963 |
+
|
| 964 |
+
/**
 * Switch the TikTok cookie helper to another input format: highlight the
 * clicked format button and swap the hint text and textarea placeholder.
 *
 * @param {string} fmt Key into cookieHints / cookiePlaceholders
 *     ('raw', 'json_arr' or 'json_obj').
 * @param {HTMLElement} btn The format button that was clicked.
 */
function setCookieHint(fmt, btn) {
  var formatButtons = document.querySelectorAll('.cookie-tab-btn');
  for (var i = 0; i < formatButtons.length; i++) {
    formatButtons[i].classList.remove('active');
  }
  btn.classList.add('active');

  var hintEl = document.getElementById('cookie-hint');
  var cookieInput = document.getElementById('tiktok_cookie');
  hintEl.innerHTML = cookieHints[fmt];
  cookieInput.placeholder = cookiePlaceholders[fmt];
}
|
| 970 |
+
|
| 971 |
+
// ββ File upload label βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 972 |
+
/**
 * Show the name of the chosen dataset file inside the upload zone, or hide
 * the name row again when the selection is empty.
 *
 * @param {HTMLInputElement} input The file input whose selection changed.
 */
function showFilename(input) {
  var nameRow = document.getElementById('upload-filename');
  var chosen = input.files && input.files[0];
  if (!chosen) {
    nameRow.style.display = 'none';
    return;
  }
  nameRow.style.display = 'block';
  nameRow.querySelector('span').textContent = chosen.name;
}
|
| 981 |
+
|
| 982 |
+
// Drag-over styling
|
| 983 |
+
var zone = document.getElementById('upload-zone');
if (zone) {
  // Highlight the zone while a file is dragged over it. preventDefault on
  // dragover marks the zone as a valid drop target.
  zone.addEventListener('dragover', function (evt) {
    evt.preventDefault();
    zone.classList.add('drag-over');
  });
  // Leaving the zone and dropping onto it both clear the highlight.
  ['dragleave', 'drop'].forEach(function (eventName) {
    zone.addEventListener(eventName, function () {
      zone.classList.remove('drag-over');
    });
  });
}
|
| 989 |
+
|
| 990 |
+
// ββ Form submit spinners ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 991 |
+
/**
 * Put a form's submit button into a loading state when the form is submitted:
 * disable the button, reveal its spinner and replace its caption.
 * Does nothing when the form is not present on the page.
 *
 * @param {string} formId      id of the form to watch.
 * @param {string} spinnerId   id of the spinner element inside the button.
 * @param {string} btnTextId   id of the element holding the button caption.
 * @param {string} btnId       id of the submit button.
 * @param {string} loadingText Caption shown while the request is running;
 *     inserted as HTML, followed by the animated dots markup.
 */
function bindSubmit(formId, spinnerId, btnTextId, btnId, loadingText) {
  var targetForm = document.getElementById(formId);
  if (targetForm) {
    targetForm.addEventListener('submit', function () {
      var button = document.getElementById(btnId);
      var spinner = document.getElementById(spinnerId);
      var caption = document.getElementById(btnTextId);
      button.disabled = true;
      spinner.style.display = 'inline-block';
      caption.innerHTML = loadingText + '<span class="dots"><span></span><span></span><span></span></span>';
    });
  }
}
|
| 1000 |
+
|
| 1001 |
+
// Wire the loading state for both forms.
[
  ['scraping-form', 'scraping-spinner', 'scraping-btn-text', 'scraping-submit', 'Memproses (mungkin beberapa menit)'],
  ['dataset-form', 'dataset-spinner', 'dataset-btn-text', 'dataset-submit', 'Memproses dataset']
].forEach(function (args) {
  bindSubmit.apply(null, args);
});

// Rebuild the hidden news_portals value when the scraping form is submitted
// (listener registered for the capture phase).
var sf = document.getElementById('scraping-form');
if (sf) {
  sf.addEventListener('submit', updatePortalField, true);
}
|
| 1007 |
+
</script>
|
| 1008 |
+
</body>
|
| 1009 |
+
</html>
|
web_scrapping.py
ADDED
|
@@ -0,0 +1,1026 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Web Scrapping.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1OLoBK18jpB685Ivi8Zi3SzuVYiXJ9jRa
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# NOTE: "!pip install ..." is IPython/Colab shell-magic syntax and raises a
# SyntaxError in a plain .py module. Install the dependencies from a shell
# before running this script instead:
#   pip install selenium webdriver-manager
|
| 12 |
+
|
| 13 |
+
# Detik.com
|
| 14 |
+
|
| 15 |
+
import requests
|
| 16 |
+
from bs4 import BeautifulSoup
|
| 17 |
+
import pandas as pd
|
| 18 |
+
import time
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
|
| 21 |
+
def scrape_detik_search(keyword, max_pages=1, timeout=15):
    """Scrape Detik.com search results for *keyword* into a DataFrame.

    Each search-result page is fetched, then every listed article is
    downloaded to extract its body text and navigation tags.

    Args:
        keyword: Search phrase sent to the Detik search endpoint.
        max_pages: Number of result pages to walk (1-based, inclusive).
        timeout: Seconds to wait for each HTTP request. Without a timeout a
            stalled connection would block the scraper forever.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link`` (empty when nothing was scraped).
    """
    base_search_url = "https://www.detik.com/search/searchall"
    results = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }

    for page in range(1, max_pages + 1):
        params = {
            'query': keyword,
            'siteid': '2',
            'sortby': 'time',
            'page': page,
        }
        print(f"Scraping page {page}...")
        try:
            r = requests.get(base_search_url, params=params, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Keep whatever was collected so far instead of crashing.
            print(f"Gagal akses halaman ({e}), hentikan scraping.")
            break
        if r.status_code != 200:
            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        news_list = soup.find_all('div', class_='media')
        if not news_list:
            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
            break

        for news in news_list:
            # Best-effort per article: one broken entry must not abort the run.
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The publication time is a Unix timestamp in the span's
                # "d-time" attribute.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                span_tag = date_tag.find('span') if date_tag else None
                if span_tag and span_tag.has_attr('d-time'):
                    news_date = datetime.fromtimestamp(int(span_tag['d-time']))

                # Fetch the article detail page.
                news_resp = requests.get(link, headers=headers, timeout=timeout)
                if news_resp.status_code != 200:
                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
                    continue

                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = (
                    news_soup.find('div', class_='detail__body-text')
                    or news_soup.find('div', class_='detail_text')
                )

                content_parts = []
                if content_div:
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are prefixed with their tag name ("H2: ...").
                        if tag.name.startswith('h'):
                            content_parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            content_parts.append(text)
                content = '\n'.join(content_parts)

                # Tags are taken from the nav > a.nav__item elements.
                nav_div = news_soup.find('div', class_='nav')
                tags = []
                if nav_div:
                    tags = [a.text.strip() for a in nav_div.find_all('a', class_='nav__item')]

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link,
                })

                print(f"Berhasil scrape berita: {title}")

                time.sleep(1)

            except Exception as e:
                print(f"Error saat memproses berita: {e}")
                continue

        time.sleep(2)

    return pd.DataFrame(results)
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
    # Script entry point: scrape Detik for the keyword and store the result.
    keyword = "Kabupaten Cirebon"
    output_path = "detik_berita_cirebonnn.csv"
    df = scrape_detik_search(keyword)
    if not df.empty:
        df.to_csv(output_path, index=False, encoding='utf-8-sig')
        # Report the file that was actually written; the message used to name
        # a different file than the one passed to to_csv().
        print(f"Selesai menyimpan data berita ke {output_path}")
    else:
        print("Tidak ada data yang berhasil di-scrape.")
|
| 136 |
+
|
| 137 |
+
# Radar Cirebon KW
|
| 138 |
+
|
| 139 |
+
import requests
|
| 140 |
+
from bs4 import BeautifulSoup
|
| 141 |
+
import pandas as pd
|
| 142 |
+
import time
|
| 143 |
+
from urllib.parse import quote_plus
|
| 144 |
+
|
| 145 |
+
# Root URL of the Radar Cirebon (disway.id) site; relative links are
# resolved against it.
BASE_HOST = "https://radarcirebon.disway.id"
# Keyword-search endpoint under BASE_HOST.
BASE_SEARCH = f"{BASE_HOST}/search/kata/"
|
| 147 |
+
|
| 148 |
+
def make_search_url(keyword, page, per_page=30):
    """Build the Radar Cirebon search URL for a 1-based result page.

    Page 1 uses the bare search endpoint. Later pages insert the result
    offset, ``(page - 1) * per_page``, twice as path segments before the
    query string.
    """
    encoded = quote_plus(keyword)
    if page != 1:
        skip = (page - 1) * per_page
        return f"{BASE_SEARCH}{skip}/{skip}/?c={encoded}&num="
    return f"{BASE_SEARCH}?c={encoded}&num="
|
| 155 |
+
|
| 156 |
+
def absolute_url(href):
    """Return *href* as an absolute URL, or None when *href* is empty.

    Absolute http(s) URLs are returned unchanged (whitespace stripped).
    Protocol-relative links ("//host/path") get an "https:" scheme. Any other
    value is treated as a path and resolved against the module-level
    BASE_HOST.
    """
    if not href:
        return None
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Protocol-relative link: it already names its own host, so prepending
        # BASE_HOST would yield a bogus "https://site//other-host/..." URL.
        return "https:" + href
    if href.startswith("/"):
        return BASE_HOST + href
    return BASE_HOST + "/" + href
|
| 165 |
+
|
| 166 |
+
def scrape_radar_cirebon(keyword, max_pages=100, per_page=30, delay_between_items=1.0, delay_between_pages=2.0):
    """Scrape radarcirebon.disway.id search results for *keyword*.

    Walks the search-result pages, follows each not-yet-seen article link and
    extracts the title, date, tags and body text.

    Args:
        keyword: Search phrase.
        max_pages: Number of search pages to request (1-based, inclusive).
        per_page: Page size passed to make_search_url() for the offset.
        delay_between_items: Seconds to sleep after each scraped article.
        delay_between_pages: Seconds to sleep after each search page.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    # Imported locally: the file does not import `re` before this function is
    # first run, so the regex date fallback used to raise a NameError that the
    # per-item handler swallowed, dropping the article.
    import re

    # Fallback pattern for dates such as "Rabu 22-08-2024" or
    # "Selasa, 21 Agustus 2024"; compiled once instead of per article.
    date_pattern = re.compile(r'\w+,\s*\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{2}-\d{2}-\d{4}')
    # Candidate CSS classes for the article body container, tried in order.
    content_classes = (
        'entry-content', 'post-content', 'article-body', 'detail__body-text',
        'detail_text', 'content', 'article__content',
    )

    results = []
    seen_links = set()

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as sess:
        sess.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
        })

        for page in range(1, max_pages + 1):
            url = make_search_url(keyword, page, per_page)
            print(f"\nScraping page {page} -> {url}")
            try:
                r = sess.get(url, timeout=15)
            except Exception as e:
                print(f" ERROR: Gagal request halaman search: {e}")
                break

            if r.status_code != 200:
                print(f" ERROR: status code {r.status_code}, hentikan scraping.")
                break

            soup = BeautifulSoup(r.text, "html.parser")

            # Locate the result items, trying progressively looser selectors.
            news_list = (
                soup.find_all(class_='media-heading')
                or soup.find_all('div', class_='media')
                or soup.find_all('article')
                or soup.select('ul.search-results li')
                or soup.select('div.search-result')
                or []
            )

            if not news_list:
                print(" Tidak ada berita ditemukan di halaman ini.")
                continue

            print(f" Ketemu {len(news_list)} item.")

            for item in news_list:
                # Best-effort per article: one broken item must not abort the run.
                try:
                    a = item.find('a', href=True) or item.select_one('a[href]')
                    if not a:
                        continue

                    link = absolute_url(a.get('href'))
                    if not link or link in seen_links:
                        continue
                    seen_links.add(link)

                    title = a.get_text(strip=True)

                    # Fetch the article detail page.
                    try:
                        detail_r = sess.get(link, timeout=15)
                    except Exception as e:
                        print(f" ERROR request detail {link}: {e}")
                        continue
                    if detail_r.status_code != 200:
                        print(f" ERROR status {detail_r.status_code} for {link}")
                        continue

                    detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                    # Title: prefer the detail page heading, fall back to the link text.
                    h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
                    title_detail = h1.get_text(strip=True) if h1 else title

                    # Date, option 1: an element with class "date".
                    date_text = None
                    date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
                    if date_detail_tag:
                        print("Ditemukan dengan Target Langsung")
                        date_text = date_detail_tag.get_text(strip=True)

                    # Date, option 2: look inside the "post-info" container.
                    if not date_text:
                        post_info_div = detail_soup.find('div', class_='post-info')
                        if post_info_div:
                            tag_tanggal = post_info_div.find('span', class_='date')
                            if tag_tanggal:
                                print("Ditemukan dengan Target Kontainer")
                                date_text = tag_tanggal.get_text(strip=True)

                    # Date, option 3: last resort, match a date-looking text node.
                    if not date_text:
                        found_text = detail_soup.find(string=date_pattern)
                        if found_text:
                            print("Ditemukan dengan Target Pola Teks (Regex)")
                            date_text = found_text.strip()

                    # Body text: first matching container, else <article>, else whole page.
                    content_container = None
                    for cls in content_classes:
                        content_container = detail_soup.find('div', class_=cls)
                        if content_container:
                            break
                    if not content_container:
                        content_container = detail_soup.find('article')

                    search_scope = content_container if content_container else detail_soup
                    content_parts = []
                    for p in search_scope.find_all('p'):
                        text = p.get_text(strip=True)
                        # Skip "Baca Juga:" (read also) cross-link paragraphs.
                        if text and 'Baca Juga:' not in text:
                            content_parts.append(text)
                    content = "\n".join(content_parts)

                    tags = []
                    try:
                        # Tag links are the anchors whose href contains '/listtag/';
                        # the tag name is read from the 'title' attribute.
                        tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
                        for a_tag in tag_links:
                            tag_text = a_tag.get('title', '').strip()
                            if tag_text:
                                tags.append(tag_text)
                    except Exception as e:
                        # Tag extraction is optional; keep the article without tags.
                        print(f" Terjadi error saat mencari tag: {e}")

                    final_tags = ", ".join(tags) if tags else "-"

                    results.append({
                        "judul": title_detail,
                        "tanggal": date_text,
                        "tag": final_tags,
                        "isi_berita": content,
                        "link": link,
                    })

                    print(f" Berhasil: {title_detail} | Tags: {final_tags}")

                    time.sleep(delay_between_items)

                except Exception as e:
                    print(f" Error saat memproses item: {e}")
                    continue

            time.sleep(delay_between_pages)

    return pd.DataFrame(results)
|
| 323 |
+
|
| 324 |
+
if __name__ == "__main__":
    # Script entry point: scrape Radar Cirebon and store the result.
    keyword = "kabupaten cirebon"
    # NOTE: this is a Google Drive path that only exists inside Colab.
    output_path = "/content/drive/MyDrive/Machine Learning/Sentiment Analysis/radarcirebondisway_berita.csv"
    df = scrape_radar_cirebon(keyword, max_pages=100)
    if not df.empty:
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
        # Report the file that was actually written; the message used to name
        # a different file than the one passed to to_csv().
        print(f"\nSelesai menyimpan data berita ke {output_path}")
    else:
        print("\nTidak ada data yang berhasil di-scrape.")
|
| 332 |
+
|
| 333 |
+
# Antara News
|
| 334 |
+
|
| 335 |
+
import requests
|
| 336 |
+
from bs4 import BeautifulSoup
|
| 337 |
+
import pandas as pd
|
| 338 |
+
import time
|
| 339 |
+
import re
|
| 340 |
+
import random
|
| 341 |
+
from urllib.parse import quote_plus, urlparse, urlunparse
|
| 342 |
+
|
| 343 |
+
# Root URL of the Antara News site; relative links are resolved against it.
BASE_HOST = "https://www.antaranews.com"
# Search endpoint under BASE_HOST.
BASE_SEARCH = f"{BASE_HOST}/search"
|
| 345 |
+
|
| 346 |
+
def make_search_url(keyword, page):
    """Build the Antara News search URL for a 1-based result page.

    Page 1 carries only the ``q`` parameter; later pages append ``&page=N``.
    """
    first_page = f"{BASE_SEARCH}?q={quote_plus(keyword)}"
    return first_page if page == 1 else f"{first_page}&page={page}"
|
| 352 |
+
|
| 353 |
+
def absolute_url(href):
    """Return *href* as an absolute URL, or None when *href* is empty.

    Absolute http(s) URLs are returned unchanged (whitespace stripped).
    Protocol-relative links ("//host/path") get an "https:" scheme. Any other
    value is treated as a path and resolved against the module-level
    BASE_HOST.
    """
    if not href:
        return None
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Protocol-relative link: it already names its own host, so prepending
        # BASE_HOST would yield a bogus "https://site//other-host/..." URL.
        return "https:" + href
    if href.startswith("/"):
        return BASE_HOST + href
    return BASE_HOST + "/" + href
|
| 362 |
+
|
| 363 |
+
def normalize_url(href):
    """Return a canonical form of *href* for de-duplication.

    The link is made absolute via absolute_url(), its query string and
    fragment are dropped, and trailing slashes are removed. Empty input
    yields None.
    """
    if not href:
        return None
    parts = urlparse(absolute_url(href))
    stripped = parts._replace(query="", fragment="")
    return urlunparse(stripped).rstrip("/")
|
| 371 |
+
|
| 372 |
+
def get_with_retry(sess, url, max_retries=3, delay_range=(2, 5)):
    """GET *url* through *sess*, retrying on any failure.

    A response counts as successful when ``raise_for_status()`` does not
    raise. Between failed attempts (but not after the last one) the function
    sleeps for a random duration drawn from *delay_range* seconds.

    Returns:
        The successful response, or None when every attempt failed.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            response = sess.get(url, timeout=15)
            response.raise_for_status()
        except Exception as e:
            print(f" Percobaan {attempt+1} gagal: {e}")
            if attempt < final_attempt:
                low, high = delay_range
                time.sleep(random.uniform(low, high))
        else:
            return response
        attempt += 1
    return None
|
| 384 |
+
|
| 385 |
+
def scrape_antaranews(keyword, max_pages=5, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
    """Scrape Antara News search results for *keyword* into a DataFrame.

    For every search page, all links containing "/berita/" are normalized
    and de-duplicated against links seen on earlier pages. Each new article
    is fetched to extract its title, date text, body and tags. Both delay
    arguments are (min, max) second ranges sampled with random.uniform().

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    def tag_label(anchor):
        # Prefer the anchor's title attribute; fall back to its visible text.
        return anchor.get('title') or anchor.get_text(strip=True)

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/115.0 Safari/537.36'
    })

    rows = []
    visited = set()

    for page in range(1, max_pages + 1):
        url = make_search_url(keyword, page)
        print(f"\nScraping page {page} -> {url}")

        listing = get_with_retry(session, url)
        if not listing:
            print(" ERROR: Gagal request halaman search setelah retry.")
            continue

        listing_soup = BeautifulSoup(listing.text, "html.parser")

        # Collect every anchor pointing at an article (paths containing /berita/).
        page_links = set()
        for anchor in listing_soup.select('a[href*="/berita/"]'):
            raw_href = anchor.get('href')
            if not raw_href:
                continue
            cleaned = normalize_url(raw_href)
            if cleaned:
                page_links.add(cleaned)

        fresh_links = page_links - visited
        print(f" Ketemu {len(page_links)} link artikel di halaman ini, {len(fresh_links)} link baru.")
        visited |= page_links

        for link in sorted(fresh_links):
            detail = get_with_retry(session, link)
            if not detail:
                print(f" ERROR: Gagal request detail {link}")
                continue

            article = BeautifulSoup(detail.text, "html.parser")

            # Title
            heading = article.select_one('div.wrap__article-detail-title h1') or article.find('h1')
            title_detail = heading.get_text(strip=True) if heading else ""

            # Date: text next to the calendar icon, else a regex over the page text.
            date_detail = ""
            calendar = article.select_one('i.fa-calendar') or article.select_one('i.fas.fa-calendar')
            if calendar:
                holder = calendar.find_parent('li') or calendar.find_parent()
                if holder:
                    date_detail = holder.get_text(" ", strip=True)
            if not date_detail:
                page_text = article.get_text(" ", strip=True)
                hit = re.search(r'\b(?:[A-Za-z]+,\s*\d{1,2}\s+[A-Za-z]+ \d{4}\s*\d{1,2}:\d{2}\s*WIB|\d+\s+jam lalu|\bWIB\b)', page_text)
                if hit:
                    date_detail = hit.group(0)

            # Body: paragraphs of the first matching container (or whole page),
            # skipping "baca juga" (read also) cross-links.
            body = (
                article.find('div', class_='wrap__article-detail-content')
                or article.find('div', class_='detail__body-text')
                or article.find('article')
            )
            scope = body if body else article
            paragraphs = (p.get_text(strip=True) for p in scope.find_all('p'))
            content = "\n".join(
                text for text in paragraphs
                if text and not text.lower().startswith("baca juga")
            )

            # Tags: first inline list that carries a tags icon and yields labels.
            tags = []
            for ul in article.find_all('ul', class_='list-inline'):
                if not (ul.find('i', class_='fa-tags') or ul.find('i', class_='fas fa-tags')):
                    continue
                for anchor in ul.find_all('a', href=True):
                    if '/tag/' not in anchor['href']:
                        continue
                    label = tag_label(anchor)
                    if label:
                        tags.append(label)
                if tags:
                    break
            if not tags:
                # Fallback: any anchor on the page that links to a /tag/ URL.
                for anchor in article.select('a[href*="/tag/"]'):
                    label = tag_label(anchor)
                    if label:
                        tags.append(label)
            # Drop duplicates while keeping first-seen order.
            tags = list(dict.fromkeys(tags))
            joined_tags = ", ".join(tags) if tags else "-"

            rows.append({
                "judul": title_detail,
                "tanggal": date_detail,
                "tag": joined_tags,
                "isi_berita": content,
                "link": link,
            })

            print(f" Berhasil: {title_detail} | Tanggal: {date_detail or '-'} | Tags: {joined_tags}")

            time.sleep(random.uniform(*delay_between_items))

        time.sleep(random.uniform(*delay_between_pages))

    return pd.DataFrame(rows)
|
| 489 |
+
|
| 490 |
+
if __name__ == "__main__":
    # Script entry point: scrape Antara News and store the result as CSV.
    keyword = "kabupaten cirebon"
    df = scrape_antaranews(keyword, max_pages=100)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        df.to_csv("antaranews_berita.csv", index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke antaranews_berita.csv")
|
| 498 |
+
|
| 499 |
+
# Jalanin di IDE lokal karena butuh chrome (CNN)
|
| 500 |
+
|
| 501 |
+
import requests
|
| 502 |
+
from bs4 import BeautifulSoup
|
| 503 |
+
import pandas as pd
|
| 504 |
+
import time
|
| 505 |
+
import random
|
| 506 |
+
from urllib.parse import quote, urlparse, urlunparse
|
| 507 |
+
import re
|
| 508 |
+
|
| 509 |
+
from selenium import webdriver
|
| 510 |
+
from selenium.webdriver.chrome.service import Service
|
| 511 |
+
from selenium.webdriver.common.by import By
|
| 512 |
+
from selenium.webdriver.support.ui import WebDriverWait
|
| 513 |
+
from selenium.webdriver.support import expected_conditions as EC
|
| 514 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 515 |
+
from selenium.common.exceptions import TimeoutException
|
| 516 |
+
|
| 517 |
+
# Root URL of the CNN Indonesia site; search URLs and relative links are built from it.
BASE_HOST = "https://www.cnnindonesia.com"
|
| 518 |
+
|
| 519 |
+
# <<< DIUBAH: Fungsi ini dimodifikasi untuk menangani nomor halaman >>>
|
| 520 |
+
def make_search_url(keyword, page):
    """Build the CNN Indonesia "latest" search URL for a 1-based page number.

    Page 1 omits the ``page`` parameter; later pages append ``&page=N``.
    """
    first_page = f"{BASE_HOST}/search?query={quote(keyword)}&result_type=latest"
    if page != 1:
        return f"{first_page}&page={page}"
    return first_page
|
| 530 |
+
|
| 531 |
+
# --- Fungsi-fungsi pembantu lainnya tidak ada perubahan ---
|
| 532 |
+
def absolute_url(href):
    """Return *href* as an absolute URL, or None when *href* is empty.

    Absolute http(s) URLs are returned unchanged (whitespace stripped).
    Protocol-relative links ("//host/path") get an "https:" scheme. Any other
    value is treated as a path and resolved against the module-level
    BASE_HOST.
    """
    if not href:
        return None
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Protocol-relative link: it already names its own host, so prepending
        # BASE_HOST would yield a bogus "https://site//other-host/..." URL.
        return "https:" + href
    if href.startswith("/"):
        return BASE_HOST + href
    return BASE_HOST + "/" + href
|
| 538 |
+
|
| 539 |
+
def normalize_url(href):
    """Return a canonical form of *href* for de-duplication.

    The link is made absolute via absolute_url(), its query string and
    fragment are dropped, and trailing slashes are removed. Empty input
    yields None.
    """
    if not href:
        return None
    scheme, netloc, path, params, _query, _fragment = urlparse(absolute_url(href))
    return urlunparse((scheme, netloc, path, params, "", "")).rstrip("/")
|
| 545 |
+
|
| 546 |
+
def parse_cnn_date(raw_date):
    """Convert a CNN Indonesia timestamp to ``YYYY-MM-DD HH:MM``.

    Accepts text such as ``"Selasa, 12 Agu 2025 11:01 WIB"``. When the text
    contains ``|``, the second ``|``-separated segment is taken as the date.
    Month names are resolved with a built-in Indonesian/English table rather
    than ``locale.setlocale``: switching the process-wide locale on every call
    is a global side effect, and parsing silently failed wherever the
    ``id_ID.UTF-8`` locale is not installed.

    Returns:
        "-" for empty input, the normalized timestamp on success, otherwise
        the (stripped) input text unchanged.
    """
    import re
    from datetime import datetime

    if not raw_date:
        return "-"
    if '|' in raw_date:
        raw_date = raw_date.split('|')[1]
    raw = raw_date.replace(" WIB", "").strip()

    # Keyed by the first three letters (lower-case) so both abbreviations and
    # full month names, Indonesian or English, resolve.
    months = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'mei': 5, 'may': 5,
        'jun': 6, 'jul': 7, 'agu': 8, 'ags': 8, 'aug': 8, 'sep': 9,
        'okt': 10, 'oct': 10, 'nov': 11, 'des': 12, 'dec': 12,
    }

    # "<weekday>, <day> <month> <year> <hour>:<minute>"; the weekday is optional.
    match = re.fullmatch(
        r'(?:[^\W\d_]+,\s*)?(\d{1,2})\s+([^\W\d_]+)\.?\s+(\d{4})\s+(\d{1,2}):(\d{2})',
        raw,
    )
    if not match:
        return raw_date.strip()

    day, month_name, year, hour, minute = match.groups()
    month = months.get(month_name[:3].lower())
    if month is None:
        return raw_date.strip()

    try:
        dt = datetime(int(year), month, int(day), int(hour), int(minute))
    except ValueError:
        # Impossible calendar value (e.g. 31 Feb): keep the original text.
        return raw_date.strip()
    return dt.strftime("%Y-%m-%d %H:%M")
|
| 558 |
+
|
| 559 |
+
def looks_like_article_href(href):
    """Tell whether *href* looks like a CNN Indonesia article link.

    A link qualifies when its path contains none of the listing/section
    markers below and includes a segment shaped like
    ``/<14 digits>-<2-3 digits>-<6+ digits>``.
    """
    if not href:
        return False
    path = urlparse(href.strip()).path
    non_article_markers = ('/search', '/tag', '/kategori', '/author', '/channel', '/indeks', '/video', '/foto')
    for marker in non_article_markers:
        if marker in path:
            return False
    return re.search(r'/\d{14}-\d{2,3}-\d{6,}', path) is not None
|
| 566 |
+
|
| 567 |
+
# Request headers (desktop Chrome User-Agent) sent by fetch_article_detail().
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"}
|
| 568 |
+
|
| 569 |
+
def fetch_article_detail(url, retries=3, delay=3):
    """Download *url* and return its HTML, retrying on failure.

    Args:
        url: Article URL to fetch with the module-level HEADERS.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The response body on HTTP 200, or None when every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            if resp.status_code == 200:
                return resp.text
            print(f" WARNING: HTTP {resp.status_code} saat akses {url}")
        except Exception as e:
            # Best-effort: log and retry on any failure.
            print(f" WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
        # Wait only when another attempt follows; sleeping after the final
        # failure just delayed the None result.
        if attempt < retries:
            time.sleep(delay)
    return None
|
| 578 |
+
|
| 579 |
+
def scrape_cnn_with_selenium(keyword, max_pages=3, delay_between_items=(1,2)):
    """Scrape CNN Indonesia search results for *keyword* using headless Chrome.

    Selenium renders each search page; article pages are then downloaded
    with fetch_article_detail() and parsed with BeautifulSoup.

    Args:
        keyword: Search phrase.
        max_pages: Number of search pages to open (1-based, inclusive).
        delay_between_items: (min, max) seconds to sleep after each article.

    Returns:
        pandas.DataFrame with the columns ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    results = []
    seen_links = set()

    print("Menginisialisasi browser Chrome...")
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=service, options=options)
    # try/finally guarantees the browser is closed even when a page load or
    # parsing step raises; otherwise the Chrome process would be leaked.
    try:
        driver.set_page_load_timeout(30)

        for page in range(1, max_pages + 1):
            url = make_search_url(keyword, page)
            print(f"\nMembuka halaman {page} -> {url}")
            driver.get(url)

            # The cookie pop-up only needs handling on the first page load.
            if page == 1:
                try:
                    print("Mencari pop-up cookie...")
                    cookie_agree_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                    )
                    cookie_agree_button.click()
                    print(" Pop-up cookie ditemukan dan ditutup.")
                    time.sleep(2)
                except TimeoutException:
                    print(" Pop-up cookie tidak ditemukan, melanjutkan proses.")

            print(f"Mengambil data dari halaman {page}...")

            try:
                # Wait until the result list has been rendered.
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                )
            except TimeoutException:
                print(f" WARNING: Waktu habis menunggu konten di halaman {page}. Mungkin halaman ini kosong.")
                continue  # move on to the next page, if any

            soup = BeautifulSoup(driver.page_source, "html.parser")
            link_elements = soup.select('div.nhl-list article a[href]')

            all_links_in_page = {
                normalize_url(a['href'])
                for a in link_elements
                if looks_like_article_href(a['href'])
            }
            new_links = all_links_in_page - seen_links

            if not new_links:
                # Do not stop: a later page may still contain new links.
                print(" Tidak ada link baru yang ditemukan di halaman ini.")

            print(f" Ditemukan {len(new_links)} link baru.")
            seen_links.update(new_links)

            for link in sorted(new_links):
                print(f" -> Memproses: {link}")
                html_detail = fetch_article_detail(link)
                if not html_detail:
                    continue
                detail_soup = BeautifulSoup(html_detail, "html.parser")

                title_el = detail_soup.select_one('h1')
                title_text = title_el.get_text(strip=True) if title_el else "-"

                date_el = detail_soup.select_one('div.text-cnn_grey.text-sm')
                date_text = parse_cnn_date(date_el.get_text(strip=True)) if date_el else "-"

                # Tags are the links in the sibling that follows the
                # "TOPIK TERKAIT" title box. `string=` replaces the deprecated
                # `text=` keyword of BeautifulSoup's find().
                tags_list = []
                topik_terkait_header = detail_soup.find(
                    'div', class_='title-box', string=re.compile(r'\s*TOPIK TERKAIT\s*')
                )
                if topik_terkait_header:
                    tags_container = topik_terkait_header.find_next_sibling('div')
                    if tags_container:
                        tags_list = [tag.get_text(strip=True) for tag in tags_container.select('a')]

                content_parts = []
                content_container = detail_soup.select_one("div.detail-text")
                if content_container:
                    for p in content_container.find_all('p'):
                        text = p.get_text(" ", strip=True)
                        # Skip "lihat juga" (see also) and ad-scroll filler paragraphs.
                        if text and not text.lower().startswith(("lihat juga", "scroll to continue")):
                            content_parts.append(text)

                results.append({
                    "judul": title_text,
                    "tanggal": date_text,
                    "tag": ", ".join(tags_list) if tags_list else "-",
                    "isi_berita": "\n".join(content_parts) if content_parts else "-",
                    "link": link,
                })
                print(f" Berhasil: {title_text} | Tanggal: {date_text}")
                time.sleep(random.uniform(*delay_between_items))
    finally:
        print("\nMenutup browser...")
        driver.quit()

    return pd.DataFrame(results)
|
| 680 |
+
|
| 681 |
+
if __name__ == "__main__":
    # Script entry point: scrape CNN Indonesia and store the result as CSV.
    keyword = "kabupaten cirebon"
    df = scrape_cnn_with_selenium(keyword, max_pages=100)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        df.to_csv("cnnindonesia_berita_final.csv", index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke cnnindonesia_berita_final.csv")
|
| 689 |
+
|
| 690 |
+
# Radar Cirebon ID
|
| 691 |
+
|
| 692 |
+
import requests
|
| 693 |
+
from bs4 import BeautifulSoup
|
| 694 |
+
import pandas as pd
|
| 695 |
+
import time
|
| 696 |
+
import random
|
| 697 |
+
from urllib.parse import quote, urlparse, urlunparse
|
| 698 |
+
import re
|
| 699 |
+
|
| 700 |
+
# Mengganti BASE_HOST ke situs target yang baru
|
| 701 |
+
BASE_HOST = "https://radarcirebon.id"
|
| 702 |
+
|
| 703 |
+
def make_search_url(keyword, page):
    """Build the radarcirebon.id search URL for a keyword and result page.

    Spaces are rendered as '+', matching the site's URL format, e.g.
    https://radarcirebon.id/search/kabupaten+cirebon/page/2/

    Args:
        keyword: Search phrase.
        page: 1-based result page number; page 1 has no ``/page/N/`` suffix.

    Returns:
        The absolute search URL.
    """
    # safe="" also percent-encodes "/", so a keyword containing a slash cannot
    # add extra path segments to the search URL.
    q = quote(keyword, safe="").replace('%20', '+')
    if page == 1:
        return f"{BASE_HOST}/search/{q}/"
    return f"{BASE_HOST}/search/{q}/page/{page}/"
|
| 714 |
+
|
| 715 |
+
def normalize_url(href):
    """Return *href* as a clean absolute URL, or None when it is unusable.

    Protocol-relative links ("//host/...") get an https scheme and
    root-relative paths ("/...") are prefixed with BASE_HOST. Anything else
    that does not start with "http" is rejected. The query string, fragment
    and trailing slashes are removed from the result.
    """
    if not href:
        return None

    candidate = href.strip()
    if candidate.startswith("//"):
        candidate = "https:" + candidate
    elif candidate.startswith("/"):
        candidate = BASE_HOST + candidate
    elif not candidate.startswith("http"):
        # Not a web link (e.g. mailto:, javascript:, bare relative path).
        return None

    stripped = urlparse(candidate)._replace(query="", fragment="")
    return urlunparse(stripped).rstrip("/")
|
| 733 |
+
|
| 734 |
+
def parse_radarcirebon_date(raw_date):
    """Convert a date like 'Selasa, 12 Agu 2025 - 11:01' to 'YYYY-MM-DD HH:MM'.

    Args:
        raw_date: Date text as scraped from the article page.

    Returns:
        The normalized timestamp string. "-" when *raw_date* is empty. When
        the text does not follow the expected layout, or the month name is
        not recognized, the stripped original text is returned instead of a
        fabricated date.
    """
    if not raw_date:
        return "-"

    # Indonesian three-letter month abbreviations.
    month_map = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'Mei': '05', 'Jun': '06',
        'Jul': '07', 'Agu': '08', 'Sep': '09', 'Okt': '10', 'Nov': '11', 'Des': '12'
    }
    try:
        # Drop the weekday, then split "12 Agu 2025 - 11:01" into its parts.
        date_part = raw_date.split(', ')[1]
        parts = date_part.replace(' - ', ' ').split()
        day, month_name, year, time_str = parts[0], parts[1], parts[2], parts[3]
        # Using the first three letters, case-insensitively, also accepts
        # full month names such as "Agustus".
        month = month_map[month_name[:3].capitalize()]
    except (IndexError, KeyError, AttributeError):
        # Unknown layout or month: hand back the raw text rather than
        # inventing an invalid date such as month "00".
        return str(raw_date).strip()

    return f"{year}-{month}-{day.zfill(2)} {time_str}"
|
| 762 |
+
|
| 763 |
+
def looks_like_article_href(href):
    """Tell whether *href* looks like an article link.

    A link qualifies when it contains a /YYYY/MM/DD/ path segment, for
    example /2025/08/12/article-slug/. Empty or missing hrefs do not qualify.
    """
    if href:
        return re.search(r'/\d{4}/\d{2}/\d{2}/', href) is not None
    return False
|
| 772 |
+
|
| 773 |
+
HEADERS = {
|
| 774 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
| 775 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
| 776 |
+
"Chrome/126.0.0.0 Safari/537.36"
|
| 777 |
+
}
|
| 778 |
+
|
| 779 |
+
def fetch_url(url, retries=3, delay=3):
    """Fetch *url* and return the response body, retrying on failure.

    Args:
        url: Address to request (sent with the module-level HEADERS).
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        The response text of the first HTTP 200 answer, or None when every
        attempt failed or returned another status.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
        except Exception as e:
            # Best effort by design: report the failure and try again.
            print(f" WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
        else:
            if resp.status_code == 200:
                return resp.text
            print(f" WARNING: HTTP {resp.status_code} saat akses {url}")
        # Wait only when another attempt follows; sleeping after the final
        # failure would just delay the caller.
        if attempt < retries:
            time.sleep(delay)
    return None
|
| 794 |
+
|
| 795 |
+
def _extract_radarcirebon_article(detail_soup, link):
    """Build one result record (title, date, tags, body, link) from a parsed article page."""
    title_el = detail_soup.select_one('h1.entry-title')
    title_detail = title_el.get_text(strip=True) if title_el else "-"

    date_el = detail_soup.select_one('time.entry-date')
    date_detail = parse_radarcirebon_date(date_el.get_text(strip=True)) if date_el else "-"

    paragraphs = []
    body = detail_soup.select_one('div.entry-content')
    if body:
        for p in body.select('p'):
            # Paragraphs nested in a "read-also" box are skipped.
            if p.find_parent(class_='read-also'):
                continue
            text = p.get_text(" ", strip=True)
            if text:
                paragraphs.append(text)
    content = "\n".join(paragraphs)

    tag_box = detail_soup.select_one('div.wp-block-tag-cloud')
    tags = [a.get_text(strip=True) for a in tag_box.select('a')] if tag_box else []
    tags = list(dict.fromkeys(tags))  # de-duplicate while keeping order

    return {
        "judul": title_detail,
        "tanggal": date_detail,
        "tag": ", ".join(tags) if tags else "-",
        "isi_berita": content if content else "-",
        "link": link
    }


def scrape_radarcirebon(keyword, max_pages=3, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
    """Scrape radarcirebon.id search results for *keyword* into a DataFrame.

    Args:
        keyword: Search phrase.
        max_pages: Number of search result pages to walk through.
        delay_between_items: (min, max) seconds to pause after each article.
        delay_between_pages: (min, max) seconds to pause after each page.

    Returns:
        A pandas DataFrame with the columns judul, tanggal, tag, isi_berita
        and link; one row per article that could be downloaded.
    """
    results = []
    seen_links = set()

    for page in range(1, max_pages + 1):
        url = make_search_url(keyword, page)
        print(f"\nScraping halaman {page} -> {url}")

        search_html = fetch_url(url)
        if not search_html:
            print(f" ERROR: Gagal mengambil halaman pencarian {page}")
            continue

        soup = BeautifulSoup(search_html, "html.parser")
        link_elements = soup.select('article .wp-block-latest-posts__post-title a')
        print(f" DEBUG: Ditemukan {len(link_elements)} elemen link di halaman {page}")

        # Collect the normalized article links found on this page.
        all_links_in_page = set()
        for anchor in link_elements:
            href_raw = anchor.get('href')
            if not (href_raw and looks_like_article_href(href_raw)):
                continue
            norm = normalize_url(href_raw)
            if norm:
                all_links_in_page.add(norm)

        new_links = all_links_in_page - seen_links
        print(f" Menemukan {len(all_links_in_page)} link artikel di halaman ini, {len(new_links)} link baru.")
        seen_links |= all_links_in_page

        for link in sorted(new_links):
            html_detail = fetch_url(link)
            if not html_detail:
                print(f" ERROR: Gagal mengambil artikel {link}")
                continue

            record = _extract_radarcirebon_article(BeautifulSoup(html_detail, "html.parser"), link)
            results.append(record)
            print(f" Berhasil: {record['judul']} | Tanggal: {record['tanggal']}")
            time.sleep(random.uniform(*delay_between_items))

        # Pause between pages so the server is not hammered.
        time.sleep(random.uniform(*delay_between_pages))

    return pd.DataFrame(results)
|
| 876 |
+
|
| 877 |
+
if __name__ == "__main__":
    # Script entry point: scrape a few search pages and export them to CSV.
    keyword = "kabupaten cirebon"
    # Adjust max_pages as needed; three pages keeps the run short.
    df = scrape_radarcirebon(keyword, max_pages=3)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        output_filename = "radarcirebon_berita.csv"
        df.to_csv(output_filename, index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke {output_filename}")
|
| 888 |
+
|
| 889 |
+
# Download html
|
| 890 |
+
|
| 891 |
+
import requests
|
| 892 |
+
|
| 893 |
+
# Debug helper: store one article page locally so its markup can be
# inspected offline when adjusting the CSS selectors.
url = "https://radarcirebon.id/2025/08/12/warga-resah-dprd-cirebon-panggil-dpkpp-untuk-tuntaskan-masalah-psu-di-dua-perumahan/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}
output_path = "detail.html"
# A timeout keeps the script from hanging forever on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=15)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(resp.text)
# Report the file that was actually written (the old message said page.html).
print(f"HTML halaman disimpan ke {output_path}")
|
| 901 |
+
|
| 902 |
+
# Detik.com memiliki batas waktu
|
| 903 |
+
|
| 904 |
+
import requests
|
| 905 |
+
from bs4 import BeautifulSoup
|
| 906 |
+
import pandas as pd
|
| 907 |
+
import time
|
| 908 |
+
from datetime import datetime
|
| 909 |
+
|
| 910 |
+
def _parse_detik_article(news_soup):
    """Extract ``(content, tags)`` from a parsed detik.com article page."""
    content_div = (news_soup.find('div', class_='detail__body-text')
                   or news_soup.find('div', class_='detail_text'))
    content_parts = []
    if content_div:
        for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
            text = tag.get_text(strip=True)
            if not text:
                continue
            # Headings stay in the text, labelled with their level ("H2: ...").
            if tag.name.startswith('h'):
                content_parts.append(f"{tag.name.upper()}: {text}")
            else:
                content_parts.append(text)

    tag_list_div = (news_soup.find('div', class_='tag__list')
                    or news_soup.find('div', class_='detail_tag'))
    tags = [t.text.strip() for t in tag_list_div.find_all('a')] if tag_list_div else []
    return '\n'.join(content_parts), tags


def scrape_detik_search(keyword, max_years=3, max_pages=100):
    """Scrape detik.com search results for *keyword*, newest first.

    Scraping stops at the first article older than *max_years* years, when a
    search page cannot be loaded or is empty, or after *max_pages* pages.

    Args:
        keyword: Search phrase.
        max_years: Maximum article age in years.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A pandas DataFrame with the columns judul, tanggal, tag, isi_berita
        and link.
    """
    base_search_url = "https://www.detik.com/search/searchall"
    request_timeout = 15  # seconds; without it a stalled server blocks forever
    results = []

    now = datetime.now()
    try:
        cutoff_date = now.replace(year=now.year - max_years)
    except ValueError:
        # 29 February has no counterpart in a non-leap target year.
        cutoff_date = now.replace(year=now.year - max_years, day=28)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }

    for page in range(1, max_pages + 1):
        params = {
            'query': keyword,
            'siteid': '2',
            'sortby': 'time',
            'page': page
        }
        print(f"Scraping page {page}...")
        try:
            r = requests.get(base_search_url, params=params, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            # Keep what was collected so far instead of losing it to a crash.
            print(f"Gagal akses halaman pencarian ({e}), hentikan scraping.")
            break
        if r.status_code != 200:
            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        news_list = soup.find_all('div', class_='media')
        if not news_list:
            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
            break

        for news in news_list:
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The publication time is read from the span's "d-time"
                # attribute and passed to datetime.fromtimestamp.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                if date_tag:
                    span_tag = date_tag.find('span')
                    if span_tag and span_tag.has_attr('d-time'):
                        news_date = datetime.fromtimestamp(int(span_tag['d-time']))

                # Results are sorted by time, so the first article that is
                # too old ends the whole run.
                if news_date and news_date < cutoff_date:
                    print(f"Berita sudah melewati batas waktu {max_years} tahun, hentikan scraping.")
                    return pd.DataFrame(results)

                news_resp = requests.get(link, headers=headers, timeout=request_timeout)
                if news_resp.status_code != 200:
                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
                    continue

                content, tags = _parse_detik_article(BeautifulSoup(news_resp.text, 'html.parser'))

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })
                print(f"Berhasil scrape berita: {title}")
                time.sleep(1)

            except Exception as e:
                # Best effort per article: report the problem and move on.
                print(f"Error saat memproses berita: {e}")
                continue

        time.sleep(2)

    return pd.DataFrame(results)
|
| 1018 |
+
|
| 1019 |
+
if __name__ == "__main__":
    # Script entry point: scrape detik.com and export the result to CSV.
    keyword = "Kabupaten Cirebon"
    df = scrape_detik_search(keyword)
    if not df.empty:
        # One variable for the path, so the message always names the file
        # that was really written (the two previously disagreed).
        output_filename = "detik_berita_cirebonnn.csv"
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"Selesai menyimpan data berita ke {output_filename}")
    else:
        print("Tidak ada data yang berhasil di-scrape.")
|
word_cloud.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""Word Cloud.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1rwyDXgYaTJQJvXu2FPeggecHOxIYQ3l3
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# NOTE: the Colab notebook installed its dependencies with IPython shell
# magics ("!pip install ..."). Those are a SyntaxError in a plain .py module,
# so install the packages from a shell before running this script:
#   pip install stop-words sastrawi transformers
|
| 13 |
+
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import numpy as np
|
| 16 |
+
import matplotlib.pyplot as plt
|
| 17 |
+
|
| 18 |
+
import html
|
| 19 |
+
import re
|
| 20 |
+
import json
|
| 21 |
+
|
| 22 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 23 |
+
from sklearn.decomposition import NMF
|
| 24 |
+
from wordcloud import WordCloud
|
| 25 |
+
from tqdm import tqdm
|
| 26 |
+
from IPython.display import display
|
| 27 |
+
from bs4 import BeautifulSoup
|
| 28 |
+
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
| 29 |
+
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
|
| 30 |
+
from stop_words import get_stop_words
|
| 31 |
+
from collections import Counter
|
| 32 |
+
from transformers import pipeline
|
| 33 |
+
|
| 34 |
+
# ===============================================
|
| 35 |
+
# --- Konfigurasi ---
|
| 36 |
+
# ===============================================
|
| 37 |
+
FILE_PATH = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/medsos (6).csv'
|
| 38 |
+
N_TOPICS = 15
|
| 39 |
+
N_TOP_WORDS = 10 # top kata per topik (juga dipakai untuk wordcloud)
|
| 40 |
+
SAMPLE_DATA_TO_SHOW = 5 # Jumlah sampel data yang ingin ditampilkan per sentimen
|
| 41 |
+
|
| 42 |
+
# ===============================================
|
| 43 |
+
# 1. Stopwords: stop_words + Sastrawi + tambahan
|
| 44 |
+
# ===============================================
|
| 45 |
+
stopwords_indonesia = get_stop_words('indonesian')
|
| 46 |
+
factory = StopWordRemoverFactory()
|
| 47 |
+
sastrawi_stopwords = factory.get_stop_words()
|
| 48 |
+
|
| 49 |
+
# Project-specific stop words added on top of the stop_words and Sastrawi lists.
additional_stopwords = [
    'yg','ga','gak','nggak','aja','saja','nya','oke','ok','bgt','jg','utk',
    'deh','sih','kok','dong','udah','sdh','blm','bgmn','dgn','lgi','apk',
    'sllu','apknya','sngt','joos','ni','kak',
    # common words
    'manfaatnya','ya','lbh','digunakan','semangat','dah','sangat','penting',
    'lancar','cepat','senang','makasih','bermanfaat','keren','berguna','baik',
    'indonesia','usaha','memudahkan','pokoknya','puas','mantap','dananya','luar',
    'hati','ber','terimakasih','tepat','memudah','terbaik','mempermudah','praktis',
    'simple','kadang','memuaskan','bagus','semoga','smoga','aplikasi','transaksi',
    'kesimpulan','sip','pelayanannya','orang','manfaat','untuk','proses','membantu',
    'pengiriman','muda','mantaap','kedepannya','pake','aktifitas','sejauh','untung',
    'tenang','bikin','pakek','saldo','keluhan','dimanapun','cukup','menggunakan',
    'sengat','banget','pakai','terpercaya','top','sukses',
    # found while inspecting earlier word clouds
    'hp','tolong','gimana','iya','jadi','ambil','buka','butuh','masuk','guna',
    'baru','jelas','level','selengkapnya','yuk','mohon','punya','cara','hari',
    'kota','news','baca','fitur','kasih','suruh',
    'besar','sapa','bawa','atas','hidup','jaga','moga','kali','balas','perintah',
    'masyarakat','ide','hadir','ikut','ingat','tali','alhamdulillah','sambut',
    'masa','tuju','terima','ibu','silaturahmi','pasang','bangun','dukung',
    'muhammad','teladan','tahun','insan','bulan','iman','erat','syukur',
    'kabupaten','cirebon','langsung','cinta','kuat','tebar','hubung','ikat',
    'resmi','giat','selenggara','luka','kendara','putih','fyp','reses','mulai',
    'rctvcirebon','radarcirebon','temu','satu','factor','harap','wararctv',
    'maksimal','salah','tiktokberita','kawasan','sangka','juang','merah','puluh',
    'ribu','omo','argo','role','jati','tingkat','kata','emis','majalengka',
    'madam','sebut','tawur','duga',
    # further additions
    'visi','saw','keras','sayang','bentuk','didik','jalin','keluarga','momen',
    'program','baginda','hikmah','panjang','lingkung','wewararctv', 'magelang',
    'kang', 'langkah', 'limpah', 'explore', 'tabindex', 'penuh', 'aa', 'rasa', 'tags',
    'notranslate', 'desa', 'daerah', 'lengkap', 'aa', 'kunjung', 'laku', 'klik', 'berkah',
    'aboutcirebon', 'jl', 'terus', 'hasil', 'instastory', 'taut', 'upaya', 'berita',
    'beri', 'lanjut', 'pemkabcirebon', 'warga', 'pemkabcirebon', 'selamat', 'wujud', 'maju',
    'wakil', 'ungkap', 'turut', 'pihak', 'wilayah', 'dinas', 'promo', 'pemkotcirebon', 'hadap',
    'barat', 'layan', 'siap', 'milik', 'lokasi', 'ujar', 'rupa', 'gratis', 'daftar', 'jawa', 'tengah',
    # The comma after 'wib' matters: without it Python joins the adjacent
    # literals into 'wibjanuari' and neither word is filtered.
    'kolaborasi', 'tempat', 'tegas', 'gelar', 'wib',
    # month names
    'januari', 'februari', 'maret', 'april', 'mei', 'juni', 'juli', 'agustus', 'september',
    'oktober', 'november', 'desember',
]
|
| 91 |
+
|
| 92 |
+
# ===== Tambahan stopwords untuk kata tidak jelas =====
|
| 93 |
+
noise_stopwords = [
|
| 94 |
+
'by','zd','xyri','yu','uobl','ypdohk','xt','pz','lziwak','mp',
|
| 95 |
+
'rp','xdj','xexx','xggy','xjbqb','xstzfhl','link','class','hfl','xat',
|
| 96 |
+
'qhh','dhg','cr', 'tdsg', 'ct', 'etr', 'nq', 'oe', 'ejq', 'psk', 'href',
|
| 97 |
+
'hl', 'hd' , 'sy', 'amp', 'fbf', 'tags'
|
| 98 |
+
]
|
| 99 |
+
|
| 100 |
+
CUSTOM_STOPWORDS = [
|
| 101 |
+
# HTML & atribut umum
|
| 102 |
+
"class", "id", "span", "div", "href", "src", "style", "alt",
|
| 103 |
+
"aria", "role", "tabindex", "button", "label", "img", "input",
|
| 104 |
+
"placeholder", "form", "field", "hidden", "value", 'aa',
|
| 105 |
+
|
| 106 |
+
# Token acak/huruf tunggal
|
| 107 |
+
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
|
| 108 |
+
"l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
|
| 109 |
+
"w", "x", "y", "z",
|
| 110 |
+
|
| 111 |
+
# Kata noise berulang dari teks kamu
|
| 112 |
+
"hfl", "xjbqb", "ejq", "ypdohk", "xexx", "hfr", "eyih",
|
| 113 |
+
"dwj", "hkzxv", "yuc", "igjr", "eqks", "oq", "kjzd", "oxk",
|
| 114 |
+
"zsgpy", "dycq", "g", "o", "wa", "wo", "ae", "ov", "vv", "uxc",
|
| 115 |
+
|
| 116 |
+
# Kata teknis netral
|
| 117 |
+
"content", "data", "video", "playlist", "source", "watch",
|
| 118 |
+
"channel", "views", "subscribe", "update", "next", "prev",
|
| 119 |
+
"click", "menu", "link", "button", "card", "section",
|
| 120 |
+
|
| 121 |
+
# Angka & simbol sering muncul
|
| 122 |
+
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
# Gabungkan semua stopwords
|
| 126 |
+
final_stopwords = list(set(stopwords_indonesia + sastrawi_stopwords + additional_stopwords + noise_stopwords + CUSTOM_STOPWORDS))
|
| 127 |
+
|
| 128 |
+
# ===============================================
|
| 129 |
+
# 2. Pembersihan HTML + Stemming Sastrawi
|
| 130 |
+
# ===============================================
|
| 131 |
+
stemmer = StemmerFactory().create_stemmer()
|
| 132 |
+
html_noise = ['fbf','tabindex','tags','notranslate','aria-label','div','span','class']
|
| 133 |
+
noise_words = set(noise_stopwords + CUSTOM_STOPWORDS + html_noise)
|
| 134 |
+
|
| 135 |
+
def clean_html(text):
    """Strip markup from *text* and return whitespace-normalized plain text.

    Missing values (as judged by ``pd.isna``) yield an empty string. The
    contents of <script> and <style> elements are discarded, HTML entities
    are unescaped and runs of whitespace collapse to a single space.
    """
    if pd.isna(text):
        return ""

    soup = BeautifulSoup(str(text), "html.parser")
    for node in soup(["script", "style"]):
        node.decompose()

    plain = html.unescape(soup.get_text(separator=" "))
    return re.sub(r"\s+", " ", plain).strip()
|
| 145 |
+
|
| 146 |
+
def remove_single_letters(text):
    """Delete every stand-alone single word character from *text*.

    Surrounding whitespace is left untouched, so gaps may remain where a
    character was removed.
    """
    lone_char = re.compile(r"\b\w\b")
    return lone_char.sub("", text)
|
| 148 |
+
|
| 149 |
+
def hapus(text):
    """Drop whitespace-separated tokens listed in ``noise_words``.

    The remaining tokens are re-joined with single spaces.
    """
    return " ".join(filter(lambda word: word not in noise_words, text.split()))
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def preprocess_text(text):
    """Turn a raw caption into a cleaned, stemmed token string.

    Steps: strip HTML, lowercase, stem with Sastrawi, drop stop words and
    HTML noise tokens, keep only tokens containing at least one ASCII letter,
    then remove stand-alone single characters.
    """
    stemmed = stemmer.stem(clean_html(text).lower())

    kept = []
    for word in stemmed.split():
        if word in final_stopwords or word in html_noise:
            continue
        # Tokens made only of digits or symbols carry no topic information.
        if re.search(r"[a-zA-Z]", word):
            kept.append(word)

    return remove_single_letters(" ".join(kept)).strip()
|
| 179 |
+
|
| 180 |
+
# ===============================================
|
| 181 |
+
# 3. Load & Preprocess Dataset
|
| 182 |
+
# ===============================================
|
| 183 |
+
# Load the CSV; only the read itself can raise FileNotFoundError, so the try
# body is limited to it.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"❌ Error: File '{FILE_PATH}' tidak ditemukan.")
    raise SystemExit

# Rows without a caption cannot be analysed.
df.dropna(subset=['caption'], inplace=True)
df['caption'] = df['caption'].astype(str)
# caption_clean feeds the models; caption itself only loses noise tokens so
# it stays readable when shown as evidence.
df['caption_clean'] = df['caption'].apply(preprocess_text)
df['caption'] = df['caption'].apply(hapus)

# The status glyphs are restored from mis-decoded UTF-8, which had split the
# first string literal across two lines.
print("✅ Dataset berhasil dimuat & dipreproses.")
print(f"Jumlah data: {len(df)} baris")
if 'caption_pred' in df.columns:
    print("\nDistribusi Sentimen (caption_pred):")
    print(df['caption_pred'].value_counts())
|
| 198 |
+
|
| 199 |
+
# ===============================================
|
| 200 |
+
# 4. Fungsi utilitas
|
| 201 |
+
# ===============================================
|
| 202 |
+
def get_top_words_per_topic(model, feature_names, n_top_words):
    """Map each topic index to its highest-weighted feature names.

    Args:
        model: Fitted model exposing ``components_`` (one weight row per topic).
        feature_names: Vocabulary indexed like the columns of ``components_``.
        n_top_words: Number of words to keep per topic.

    Returns:
        Dict of topic index to a list of words, strongest weight first.
    """
    return {
        topic_idx: [feature_names[i] for i in weights.argsort()[: -n_top_words - 1 : -1]]
        for topic_idx, weights in enumerate(model.components_)
    }
|
| 209 |
+
|
| 210 |
+
def format_topics_sentences(topics):
    """Render each topic's word list as one comma-separated string, keyed by topic index."""
    formatted = {}
    for topic_idx, words in topics.items():
        formatted[topic_idx] = ", ".join(words)
    return formatted
|
| 212 |
+
|
| 213 |
+
def create_circular_wordcloud(words_list, title, n_words=10):
    """Plot a circular word cloud built from the first *n_words* entries.

    Args:
        words_list: Words to draw.
        title: Figure title.
        n_words: How many leading entries of *words_list* to use.

    Prints a notice and returns without plotting when there is no text.
    """
    text_data = " ".join(words_list[:n_words])
    if not text_data.strip():
        print(f"Tidak ada kata untuk word cloud '{title}'.")
        return

    # Mask pixels outside a radius-190 circle centred in a 400x400 grid.
    rows, cols = np.ogrid[:400, :400]
    outside_circle = (rows - 200) ** 2 + (cols - 200) ** 2 > 190 ** 2
    mask = 255 * outside_circle.astype(int)

    cloud_options = dict(
        width=800, height=800, background_color='white',
        colormap='viridis', mask=mask,
        contour_width=3, contour_color='steelblue',
    )
    wc = WordCloud(**cloud_options).generate(text_data)

    plt.figure(figsize=(8, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.title(title, fontsize=18, pad=15)
    plt.axis('off')
    plt.show()
|
| 229 |
+
|
| 230 |
+
def get_top_words_by_doc_frequency(df_subset, n_top_words=10):
    """Rank words by the number of documents they occur in.

    Args:
        df_subset: DataFrame with a ``caption_clean`` column of token strings.
        n_top_words: Number of (word, document count) pairs to return.

    Returns:
        List of (word, document_count) tuples, most frequent first. Each word
        counts once per document; single lowercase letters are ignored.
    """
    doc_freq = Counter()
    for caption in df_subset['caption_clean'].fillna(""):
        doc_freq.update({w for w in caption.split() if not re.fullmatch(r"[a-z]", w)})
    return doc_freq.most_common(n_top_words)
|
| 237 |
+
|
| 238 |
+
summarizer = pipeline(
|
| 239 |
+
"summarization",
|
| 240 |
+
model="google/mt5-small",
|
| 241 |
+
tokenizer="google/mt5-small"
|
| 242 |
+
)
|
| 243 |
+
|
| 244 |
+
def generate_summary(text, max_length=60, min_length=20):
    """Summarize *text* with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize.
        max_length: Passed to the pipeline as the summary length upper bound.
        min_length: Passed to the pipeline as the summary length lower bound.

    Returns:
        The generated summary. Empty input or text shorter than ten words is
        returned unchanged, as is the input when the pipeline raises.
    """
    # Too little material to summarize: hand the text back untouched.
    if not text or len(text.split()) < 10:
        return text
    try:
        result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
        return result[0]['summary_text']
    except Exception as e:
        # Best effort: one failing row must not abort the whole DataFrame
        # pass. The warning glyph is restored from mis-decoded UTF-8.
        print(f"⚠️ Error summarizing: {e}")
        return text
|
| 253 |
+
|
| 254 |
+
def summarize_text(corpus, n_topics=5, n_words=10):
    """Condense *corpus* into a string of its top NMF topic keywords.

    Args:
        corpus: Iterable of documents.
        n_topics: Number of NMF components to fit.
        n_words: Keywords taken from each component.

    Returns:
        The keywords of all topics joined by spaces, topic by topic.
    """
    # NOTE(review): stop_words='english' is applied even though the rest of
    # this file builds Indonesian stop word lists - confirm this is intended.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf = vectorizer.fit_transform(corpus)

    nmf = NMF(n_components=n_topics, random_state=42)
    nmf.fit(tfidf)

    vocab = vectorizer.get_feature_names_out()
    return " ".join(
        vocab[i]
        for component in nmf.components_
        for i in component.argsort()[:-n_words - 1:-1]
    )
|
| 271 |
+
|
| 272 |
+
# ===============================================
|
| 273 |
+
# 5. GLOBAL Topic Modeling dan Pembuatan Ringkasan (PARAGRAF)
|
| 274 |
+
# ===============================================
|
| 275 |
+
# The status glyphs in this section's messages are restored from mis-decoded
# UTF-8, which had split one string literal across two lines.
print("\n--- 🧠 Memprediksi Topik dan Membuat Ringkasan untuk Semua Data ---")

# Merge the cleaned caption and the comments_pred column into one text per row.
df['combined_text'] = df['caption_clean'].fillna('') + " " + df['comments_pred'].fillna('')

# --- TF-IDF vectorizer ---
global_vectorizer = TfidfVectorizer(
    max_df=0.9,
    min_df=10,
    max_features=1000,
    stop_words=final_stopwords,
    ngram_range=(1, 2)
)

global_tfidf = global_vectorizer.fit_transform(df['combined_text'])
global_feature_names = global_vectorizer.get_feature_names_out()

# --- NMF + summary ---
if global_tfidf.shape[1] == 0:
    # No vocabulary survived preprocessing: fill in placeholder values.
    df['predicted_topic_id'] = -1
    df['predicted_topic'] = "Tidak ada fitur yang cukup untuk modeling"
    df['summary'] = "Tidak dapat membuat ringkasan"
    print("⚠️ Peringatan: Kosakata terlalu sedikit setelah preprocessing. Topic modeling tidak dapat dilakukan.")
else:
    global_nmf_model = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
    global_nmf_model.fit(global_tfidf)

    # Each document is assigned the topic with the largest weight.
    topic_distribution = global_nmf_model.transform(global_tfidf)
    df['predicted_topic_id'] = np.argmax(topic_distribution, axis=1)

    def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10):
        """Return the n_words highest-weighted feature names of one topic."""
        top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1]
        return [feature_names[i] for i in top_indices]

    # Map each topic id to its comma-separated main keywords.
    topic_keywords = {}
    for topic_idx in range(N_TOPICS):
        top_words = get_top_words_for_topic(global_nmf_model, global_feature_names, topic_idx, N_TOP_WORDS)
        topic_keywords[topic_idx] = ", ".join(top_words)

    df['predicted_topic'] = df['predicted_topic_id'].map(topic_keywords).fillna("Topik tidak teridentifikasi")

    # Summarize the combined text row by row with generate_summary.
    # NOTE(review): the message below mentions IndoBERT, but the visible
    # summarizer pipeline loads google/mt5-small - confirm which is meant.
    df['summary'] = df['combined_text'].apply(generate_summary)

    print("✅ Prediksi topik selesai, ringkasan memakai IndoBERT Summarization (gabungan caption + comment).")

# Show a sample of the results for verification.
print("\n--- ✨ Contoh Hasil Prediksi Topik dan Ringkasan ---")
display(df[['caption', 'comments_pred', 'predicted_topic', 'summary']].head(10))
|
| 327 |
+
|
| 328 |
+
# ===============================================
# 6. Per-sentiment analysis + word cloud + keyword-based evidence
# ===============================================
# NOTE(review): indentation was lost in the copy this section was restored
# from; the nesting below is reconstructed from the data flow -- confirm.
# NOTE(review): several emoji in the messages appear mis-encoded in this copy
# and are left as found; only the literal that had been split across two lines
# was repaired.
analysis_result = {}  # sentiment label -> list of per-keyword entries (dumped to JSON later)


def _is_missing(value):
    """Return True for None and float NaN (how pandas marks empty cells)."""
    return value is None or (isinstance(value, float) and value != value)


if 'caption_pred' in df.columns:
    sentiments = ['positif', 'negatif', 'netral']

    # Let pandas print full text instead of truncating columns.
    pd.set_option('display.max_colwidth', None)

    for sentiment in sentiments:
        print(f"\n\n=======================================================")
        print(f"π Analisis Mendalam untuk Sentimen: '{sentiment.upper()}'")
        print(f"=======================================================")

        subset_df = df[df['caption_pred'] == sentiment].copy()
        analysis_result[sentiment] = []  # stays empty when nothing is found

        if subset_df.empty:
            print(f"Tidak ada data untuk sentimen '{sentiment}'.")
            continue

        # 1. Most frequent words for this sentiment.
        top_words_tuples = get_top_words_by_doc_frequency(subset_df, n_top_words=N_TOP_WORDS)

        if not top_words_tuples:
            print(f"Tidak ada kata signifikan pada sentimen '{sentiment}' untuk dianalisis.")
            continue

        # 2. Word cloud built from those words.
        words_list_for_wc = [word for word, _ in top_words_tuples]
        create_circular_wordcloud(words_list_for_wc, f"WordCloud Sentimen {sentiment.upper()}", n_words=N_TOP_WORDS)

        # 3. Evidence rows for every keyword.
        print(f"\n--- π Bukti Ringkasan Berdasarkan Kata Kunci Populer ---")

        for word, _doc_count in top_words_tuples:
            # Whole-word, case-insensitive match; the keyword is escaped so it
            # is treated literally inside the regex.
            relevant_data = subset_df[
                subset_df['caption_clean'].str.contains(r'\b{}\b'.format(re.escape(word)), case=False, na=False)
            ]

            summaries_list = []
            if not relevant_data.empty:
                print(f"\n✅ Kata Kunci: '{word}' (ditemukan dalam {len(relevant_data)} data pada sentimen ini)")

                for i, row in enumerate(relevant_data.itertuples(index=False), 1):
                    caption = getattr(row, "caption_clean", "")

                    # NaN is truthy, so a plain `or` chain would keep a missing
                    # link as NaN and json.dump would later write a bare NaN
                    # (invalid JSON). Skip missing values explicitly.
                    link_candidates = (getattr(row, "link", None), getattr(row, "url", None))
                    link = next(
                        (candidate for candidate in link_candidates
                         if not _is_missing(candidate) and candidate),
                        "-",
                    )

                    comment = getattr(row, "comments_pred", "")
                    if _is_missing(comment):
                        comment = ""

                    print(f" {i}. {caption} π {link} π¬ {comment}")

                    summaries_list.append({
                        "caption": caption,
                        "link": link,
                        "comment": comment
                    })
            else:
                print(f"\nβ Kata Kunci: '{word}' (tidak ditemukan data relevan untuk ditampilkan)")

            # Record the keyword even when no rows matched.
            analysis_result[sentiment].append({
                "keyword": word,
                "count": int(len(relevant_data)),
                "summary": summaries_list
            })
else:
    print("\nKolom 'caption_pred' tidak ditemukan. Melewati analisis per sentimen.")
|
| 398 |
+
|
| 399 |
+
# ===============================================
# Save the results as JSON
# ===============================================
# Written as UTF-8 with non-ASCII characters kept verbatim and 4-space indent.
with open("sentiment_analysis_result.json", "w", encoding="utf-8") as result_file:
    result_file.write(json.dumps(analysis_result, ensure_ascii=False, indent=4))

print("\nπ Hasil analisis juga telah disimpan di 'sentiment_analysis_result.json'")
|
| 406 |
+
|
| 407 |
+
# ===============================================
# News dataset prediction (judul, isi_berita, tag, link)
# ===============================================
# NOTE(review): some emoji in the messages appear mis-encoded in this copy and
# are left as found; only the literal that had been split across two lines was
# repaired.

FILE_BERITA = '/content/drive/MyDrive/Machine Learning/Sentiment Analysis/berita (6).csv'

# Only the read itself can legitimately report "file not found", so it is the
# only statement guarded by the handler.
try:
    df_berita = pd.read_csv(FILE_BERITA)
except FileNotFoundError:
    print(f"β Error: File '{FILE_BERITA}' tidak ditemukan.")
    # Non-zero status so the caller can tell the run failed
    # (a bare SystemExit exits with status 0).
    raise SystemExit(1)

# Rows without an article body cannot be processed.
df_berita.dropna(subset=['isi_berita'], inplace=True)
df_berita['isi_berita'] = df_berita['isi_berita'].astype(str)

# Preprocess the article body.
df_berita['isi_berita_clean'] = df_berita['isi_berita'].apply(preprocess_text)

print("✅ Dataset berita berhasil dimuat & dipreproses.")
print(f"Jumlah data: {len(df_berita)} baris")
|
| 427 |
+
|
| 428 |
+
# ===============================================
# Topic modeling for the news dataset
# ===============================================
# NOTE(review): indentation was lost in the copy this section was restored
# from; the if/else nesting below is reconstructed from the data flow -- confirm.
# NOTE(review): some emoji in the messages appear mis-encoded in this copy and
# are left as found; only the literal that had been split across two lines was
# repaired.
print("\n--- π§ Memprediksi Topik & Ringkasan untuk Dataset Berita ---")

# Merge cleaned body + title + tag into a single text per article.
df_berita['combined_text'] = (
    df_berita['isi_berita_clean'].fillna('') + " " +
    df_berita['judul'].fillna('') + " " +
    df_berita['tag'].fillna('')
)

# --- TF-IDF vectorizer ---
vectorizer_berita = TfidfVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=1000,
    stop_words=final_stopwords,
    ngram_range=(1, 2)
)

# fit_transform raises ValueError (rather than returning a zero-column matrix)
# when no term survives stop-word removal and min_df/max_df pruning, so the
# fallback branch below is only reachable if that error is caught here.
try:
    tfidf_berita = vectorizer_berita.fit_transform(df_berita['combined_text'])
    feature_names_berita = vectorizer_berita.get_feature_names_out()
except ValueError:
    tfidf_berita = None
    feature_names_berita = []

if tfidf_berita is None or tfidf_berita.shape[1] == 0:
    # Not enough vocabulary: fill the output columns with placeholders.
    df_berita['predicted_topic_id'] = -1
    df_berita['predicted_topic'] = "Tidak cukup fitur untuk modeling"
    df_berita['summary'] = "Tidak dapat membuat ringkasan"
else:
    nmf_berita = NMF(n_components=N_TOPICS, random_state=42, max_iter=500, l1_ratio=0.5)
    nmf_berita.fit(tfidf_berita)

    # Per-article topic weights; the strongest topic becomes the prediction.
    topic_dist_berita = nmf_berita.transform(tfidf_berita)
    df_berita['predicted_topic_id'] = np.argmax(topic_dist_berita, axis=1)

    def get_top_words_for_topic(model, feature_names, topic_idx, n_words=10):
        """Return the n_words highest-weighted feature names of one topic.

        The reversed slice walks the ascending argsort from the end, so the
        result is ordered from the heaviest term downwards.
        """
        top_indices = model.components_[topic_idx].argsort()[:-n_words - 1:-1]
        return [feature_names[i] for i in top_indices]

    # Map each topic id to a comma-separated list of its top keywords.
    topic_keywords_berita = {}
    for topic_idx in range(N_TOPICS):
        top_words = get_top_words_for_topic(nmf_berita, feature_names_berita, topic_idx, N_TOP_WORDS)
        topic_keywords_berita[topic_idx] = ", ".join(top_words)

    df_berita['predicted_topic'] = df_berita['predicted_topic_id'].map(topic_keywords_berita).fillna("Topik tidak teridentifikasi")

    # Summarise the raw (uncleaned) article body.
    df_berita['summary'] = df_berita['isi_berita'].apply(generate_summary)

    print("✅ Prediksi topik & ringkasan berita selesai.")
|
| 479 |
+
|
| 480 |
+
# ===============================================
# Save the news results as JSON
# ===============================================
# Fields exported for every article, in output order.
_BERITA_OUTPUT_FIELDS = (
    "judul",
    "tag",
    "link",
    "isi_berita",
    "isi_berita_clean",
    "predicted_topic",
    "summary",
)


def _berita_field(row, name):
    """Return row.<name>, or "" when the attribute is absent, None or NaN.

    Empty CSV cells arrive as float NaN; json.dump would write them as a bare
    NaN token, which is not valid JSON for the API consumers.
    """
    value = getattr(row, name, "")
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value


output_data = [
    {name: _berita_field(row, name) for name in _BERITA_OUTPUT_FIELDS}
    for row in df_berita.itertuples(index=False)
]

with open("berita_analysis_result.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=4)

print("\nπ Hasil analisis berita disimpan di 'berita_analysis_result.json'")
|
| 499 |
+
|
| 500 |
+
# Notebook-only shell magic; it is not valid Python in a plain script, so it is
# kept here as a comment. Install the packages beforehand when running as a file.
# !pip install pyngrok flask

import json
import os

from flask import Flask, jsonify
from pyngrok import ngrok

# The ngrok auth token is a credential and must not live in source control:
# it is read from the NGROK_AUTHTOKEN environment variable instead.
# NOTE: the token that used to be hard-coded here was committed to the
# repository and should be revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_auth_token:
    raise SystemExit("Variabel lingkungan NGROK_AUTHTOKEN belum diatur.")
ngrok.set_auth_token(ngrok_auth_token)

# Load the sentiment analysis results.
with open("sentiment_analysis_result.json", "r", encoding="utf-8") as f:
    sentiment_result = json.load(f)

# Load the news analysis results.
with open("berita_analysis_result.json", "r", encoding="utf-8") as f:
    berita_result = json.load(f)

# Initialise Flask.
app = Flask(__name__)


# Sentiment endpoint.
@app.route("/api/sentiment", methods=["GET"])
def api_sentiment():
    """Return the sentiment analysis results loaded at start-up as JSON."""
    return jsonify(sentiment_result)


# News endpoint.
@app.route("/api/berita", methods=["GET"])
def api_berita():
    """Return the news analysis results loaded at start-up as JSON."""
    return jsonify(berita_result)


# Open the ngrok tunnel to port 5000, then start Flask on the same port.
port = 5000
public_url = ngrok.connect(port)
print("π Public URL:", public_url)

app.run(port=port)
|