File size: 16,547 Bytes
505dc0c ae4dc53 505dc0c ae4dc53 6bf1d73 ae4dc53 505dc0c ae4dc53 505dc0c 6bf1d73 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 6bf1d73 ae4dc53 505dc0c 6bf1d73 ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c 6bf1d73 ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 6bf1d73 505dc0c 6bf1d73 ae4dc53 f981f69 6bf1d73 ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c ae4dc53 505dc0c 6bf1d73 ae4dc53 505dc0c ae4dc53 46b0ee8 505dc0c ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 e7adf54 6bf1d73 e7adf54 ae4dc53 6bf1d73 ae4dc53 6bf1d73 ae4dc53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 |
import requests
from bs4 import BeautifulSoup
import json
import time
import logging
import re
from urllib.parse import urlparse
from groq import Groq
from requests.exceptions import HTTPError, RequestException, ReadTimeout
from http.client import RemoteDisconnected
import os
from datetime import datetime
import schedule
import threading
import sys
import gradio as gr
# === CUSTOM EXCEPTION ===
class RateLimitExceeded(Exception):
    """Raised when the Groq API or Pixabay API reports an HTTP 429 rate limit."""
# === KONFIGURASI ===
# All credentials can be overridden via environment variables; the literals
# below are only fallbacks.
# NOTE(review): real secrets are committed here as defaults — rotate these keys
# and supply them exclusively through the environment in production.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "51175753-096073a3b283350c4eca0022f")
GROQ_MODEL = "gemma2-9b-it"     # Groq chat model used for keywords and rewriting
REQUEST_TIMEOUT = 10            # seconds, plain HTTP requests
GROQ_TIMEOUT = 30               # seconds, Groq API calls
RETRY_BACKOFF_FACTOR = 2        # exponential backoff base in retry_request()
MAX_RETRIES = 3                 # attempts per HTTP request
DELAY_BETWEEN_REQUESTS = 3      # seconds between consecutive API calls
# Setup logging with timestamp-based file
_run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = f"scrape_rewrite_{_run_stamp}.log"
_log_handlers = [
    logging.FileHandler(log_file, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=_log_handlers,
)
# Module-wide Groq client, reused by generate_image_keywords() and rewrite_with_ai().
client = Groq(api_key=GROQ_API_KEY)
# === HELPER FUNCTIONS ===
def send_telegram_message(message):
    """Best-effort delivery of *message* to the configured Telegram chat.

    Failures are logged and swallowed so that notification problems can
    never abort the scraping pipeline.
    """
    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    body = {
        "chat_id": TELEGRAM_CHAT_ID,
        "text": message,
        "parse_mode": "Markdown",
    }
    try:
        reply = requests.post(endpoint, json=body, timeout=5)
        reply.raise_for_status()
    except Exception as exc:
        logging.error(f"Failed to send Telegram message: {str(exc)}")
    else:
        logging.info("Telegram message sent successfully")
def is_valid_url(url):
    """Return True when *url* parses as an absolute http(s) URL with a host."""
    try:
        parts = urlparse(url)
    except Exception:
        # Non-string input (e.g. None) lands here.
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)
def is_valid_html(html):
    """Return True when *html* is a non-empty string wrapped in <article>...</article>.

    Leading/trailing whitespace is tolerated. Always returns a strict bool
    (the original leaked falsy non-bool values such as "" or None through
    the short-circuiting `and` chain).
    """
    return bool(html and html.strip().startswith('<article') and html.strip().endswith('</article>'))
def retry_request(func, *args, **kwargs):
    """Retry HTTP requests with exponential backoff.

    Calls *func(*args, **kwargs)* up to MAX_RETRIES times and returns its
    result on first success. Handling per failure type:
      - HTTPError with status 429: sleep RETRY_BACKOFF_FACTOR ** attempt
        seconds, then retry (even on the last attempt the sleep happens,
        after which the loop ends and the generic Exception below is raised).
      - Other HTTPError / RemoteDisconnected: short sleep and retry; the
        final attempt re-raises to the caller.
      - ReadTimeout: fixed 2s sleep and retry; final attempt re-raises.
    """
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            if isinstance(e, HTTPError) and e.response.status_code == 429:
                # Rate limited: 1s, 2s, 4s, ... then fall through to retry.
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    # Non-429 failure: brief backoff, then next attempt.
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise  # last attempt exhausted: propagate original error
        except ReadTimeout as e:
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)
                continue
            raise
    # Only reachable when every attempt took the 429 branch above.
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
def generate_image_keywords(text):
    """Generate Pixabay search keywords for *text* using Groq AI.

    Returns a '+'-joined keyword string ready for URL use, or the fallback
    "default+image" on any non-rate-limit failure.
    Raises RateLimitExceeded when the Groq API answers with HTTP 429.
    """
    # Only the first 1000 characters are sent, for efficiency. (The original
    # had this remark INSIDE the f-string, so it was shipped to the model as
    # part of the prompt — that stray text has been removed from the prompt.)
    prompt = f"""
Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
- Relevan dengan topik utama artikel.
- Singkat dan spesifik (1-2 kata per frasa).
- Tidak mengandung nama merek atau orang.
- Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).
Teks artikel:
{text[:1000]}
Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_completion_tokens=50,
            timeout=GROQ_TIMEOUT
        )
        keywords = completion.choices[0].message.content.strip()
        # Format ke URL-encoded
        return keywords.replace(',', '+').replace(' ', '+')
    except RateLimitExceeded:
        raise
    except Exception as e:
        # Bug fix: the Groq SDK raises its own exception types (which expose a
        # .status_code attribute), not requests.HTTPError, so the old
        # `except HTTPError` 429 branch could never trigger for Groq calls.
        status = getattr(e, "status_code", None)
        if status is None:
            status = getattr(getattr(e, "response", None), "status_code", None)
        if status == 429:
            logging.error("Groq API rate limit exceeded for keyword generation")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"  # Fallback keyword
def fetch_pixabay_image(keywords):
    """Return an HD image URL from Pixabay for *keywords*, or "" when none found.

    Prefers largeImageURL (HD) over webformatURL. Raises RateLimitExceeded on
    an HTTP 429; every other failure is logged and mapped to an empty string.
    """
    query_url = (
        f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}"
        f"&image_type=photo&per_page=3&orientation=horizontal"
        f"&safesearch=true&min_width=1280"
    )
    try:
        def get_image():
            resp = requests.get(query_url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        payload = retry_request(get_image).json()
        hits = payload.get('hits') or []
        if not hits:
            logging.warning(f"No images found for keywords: {keywords}")
            return ""
        first = hits[0]
        image_url = first.get('largeImageURL', first['webformatURL'])
        logging.info(f"Fetched Pixabay image: {image_url}")
        return image_url
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Pixabay API rate limit exceeded")
            raise RateLimitExceeded("Pixabay API rate limit exceeded")
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
    except Exception as e:
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
# === CORE FUNCTIONS ===
def fetch_links(sheet_name="Sheet2"):
    """Fetch unprocessed rows from the Google Sheet.

    A row qualifies when its "judul" column is empty and its "link" column
    holds a valid http(s) URL. Returns a list of row dicts, or [] on failure.
    """
    try:
        def get_links():
            resp = requests.get(
                GAS_URL,
                params={"sheetName": sheet_name},
                timeout=REQUEST_TIMEOUT,
            )
            resp.raise_for_status()
            return resp.json()

        rows = retry_request(get_links)
        pending = [
            row for row in rows
            if not row.get("judul") and is_valid_url(row.get("link"))
        ]
        logging.info(f"Fetched {len(pending)} links from sheet {sheet_name}")
        return pending
    except Exception as e:
        logging.error(f"Failed to fetch links: {str(e)}")
        return []
def clean_html(soup):
    """Strip ads, scripts, iframes and empty elements from *soup* in place.

    Returns the same soup object. Elements containing an <img> or <video>
    descendant are kept even when they carry no text.
    """
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()
    # Drop elements with no text and no media children.
    # Fix: the original additionally tested the (already empty) stripped text
    # for Arabic characters — that regex could never match once `not text`
    # held, so the clause was dead code and has been removed. Arabic content
    # is preserved regardless, since get_text() returns it as non-empty text.
    for elem in soup.find_all():
        if not elem.get_text(strip=True) and not elem.find_all(['img', 'video']):
            elem.decompose()
    return soup
def scrape_detik(link):
    """Download *link* and return the cleaned article text, or None on failure.

    Tries several container selectors (Detik-specific first, then generic
    fallbacks), strips ads/empty nodes via clean_html(), and returns the
    plain text with newline separators.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }
    try:
        def get_article():
            resp = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        resp = retry_request(get_article)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')

        # First selector with a match wins.
        content = None
        for selector in ('.detail__body-text', 'article', '.entry-content', '.post-content'):
            content = soup.select_one(selector)
            if content:
                break
        if not content:
            logging.warning(f"No content found at {link}")
            return None

        text = clean_html(content).get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None
        return text
    except Exception as e:
        logging.error(f"Failed to scrape {link}: {str(e)}")
        return None
def rewrite_with_ai(text):
    """Rewrite *text* into SEO-friendly HTML via Groq (streaming), embedding a
    Pixabay image.

    Returns the HTML string (<article>...</article>) or None when the model
    output is invalid or the call fails. Raises RateLimitExceeded on an HTTP
    429 from Groq (and lets RateLimitExceeded from the keyword/image helpers
    propagate, since those calls happen before the try block).
    """
    # Generate keywords for Pixabay, pausing between API calls to stay under
    # rate limits.
    keywords = generate_image_keywords(text)
    time.sleep(DELAY_BETWEEN_REQUESTS)
    image_url = fetch_pixabay_image(keywords)
    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Ubah alur artikel, buat tata letak dan tampilan menarik,
- Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
- Sertakan gambar dari URL berikut (jika valid): {image_url}
Artikel asli:
{text}
Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT
        )
        # Collect streaming response chunk by chunk.
        html_content = ""
        for chunk in completion:
            content = chunk.choices[0].delta.content or ""
            html_content += content
        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except Exception as e:
        # Bug fix: the Groq SDK raises its own exception types (which expose a
        # .status_code attribute), not requests.HTTPError, so the old
        # `except HTTPError` 429 branch could never trigger here.
        status = getattr(e, "status_code", None)
        if status is None:
            status = getattr(getattr(e, "response", None), "status_code", None)
        if status == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
def extract_title_from_html(html):
    """Pull the first <h2> heading text out of rewritten article HTML.

    Returns the placeholder "Judul Tidak Ditemukan" when no <h2> exists or
    parsing fails.
    """
    fallback = "Judul Tidak Ditemukan"
    try:
        heading = BeautifulSoup(html, 'html.parser').find('h2')
        return heading.get_text(strip=True) if heading else fallback
    except Exception as e:
        logging.error(f"Failed to extract title: {str(e)}")
        return fallback
def kirim_ke_sheet(judul, konten_html, link):
    """Write the rewritten title and HTML back to the sheet row matching *link*.

    Empty title/content is skipped with a warning; transport errors are
    logged but never raised to the caller.
    """
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return
    payload = {
        "method": "updateRowByLink",
        "link": link,
        "judul": judul,
        "konten": konten_html,
    }

    def send_data():
        resp = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp

    try:
        retry_request(send_data)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as e:
        logging.error(f"Failed to send to sheet for {link}: {str(e)}")
# === MAIN ===
def main():
    """Main function to process articles.

    Pipeline per link: scrape -> AI rewrite (with Pixabay image) -> 60s
    cooldown -> title extraction -> write back to the sheet. Stops after
    MAX_ARTICLES successful articles. Sends a Telegram summary on completion;
    on RateLimitExceeded or any unexpected error it notifies Telegram and
    re-raises so the caller/scheduler sees the failure.
    """
    logging.info("Starting scrape and rewrite process")
    processed_count = 0
    MAX_ARTICLES = 20  # hard cap of articles per run
    try:
        rows = fetch_links()
        logging.info(f"Found {len(rows)} links to process")
        for idx, row in enumerate(rows, 1):
            if processed_count >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break
            link = row['link']
            logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
            # Scrape article; skip the row (don't count it) on empty content.
            artikel = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue
            # Rewrite with AI and fetch Pixabay image.
            rewrite_html = rewrite_with_ai(artikel)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue
            # Add 1-minute delay after AI rewrite to spread out API usage.
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)
            # Extract title from the generated HTML (first <h2>).
            judul = extract_title_from_html(rewrite_html)
            # Send to sheet (best-effort; errors are logged inside).
            kirim_ke_sheet(judul, rewrite_html, link)
            processed_count += 1
            # Delay to avoid rate limits between rows.
            time.sleep(DELAY_BETWEEN_REQUESTS)
        # Send success message
        message = f"✅ *Scrape and Rewrite Completed*\nProcessed {processed_count} articles successfully."
        send_telegram_message(message)
    except RateLimitExceeded as e:
        message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
        logging.error(str(e))
        send_telegram_message(message)
        raise
    except Exception as e:
        message = f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {processed_count} articles before termination."
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(message)
        raise
    finally:
        logging.info("Process ended")
# === SCHEDULER ===
def run_scheduler():
    """Run main() once per day at 05:00 UTC (12:00 WIB), polling every minute.

    Fix: the previous docstring and log message claimed runs at both 00:00
    and 12:00 WIB, but only the single 05:00 UTC job was ever registered —
    the messages now match the actual schedule. Never returns.
    """
    schedule.every().day.at("05:00").do(main)  # 05:00 UTC == 12:00 WIB
    logging.info("Scheduler started, next run daily at 05:00 UTC (12:00 WIB)")
    while True:
        schedule.run_pending()
        time.sleep(60)
# === GRADIO INTERFACE ===
def gradio_interface():
    """Gradio interface for manual execution and status.

    NOTE(review): main() runs synchronously here, so the returned message is
    only shown after the whole run finishes (and says "started") — consider
    running main() in a background thread if live status is wanted.
    """
    main()
    return "Manual execution started. Check logs for details."
if __name__ == "__main__":
    # "python script.py manual" runs the pipeline once, in the foreground.
    if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
        logging.info("Running in manual mode")
        main()
    else:
        # Default mode: background scheduler thread plus a Gradio UI for
        # manual triggering. Daemon thread dies with the main process.
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()
        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB."
        )
        logging.info("Starting Gradio interface")
        iface.launch()