|
|
import requests |
|
|
from bs4 import BeautifulSoup |
|
|
import json |
|
|
import time |
|
|
import logging |
|
|
import re |
|
|
from urllib.parse import urlparse |
|
|
from groq import Groq |
|
|
from requests.exceptions import HTTPError, RequestException, ReadTimeout |
|
|
from http.client import RemoteDisconnected |
|
|
import os |
|
|
from datetime import datetime |
|
|
import schedule |
|
|
import threading |
|
|
import sys |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
class RateLimitExceeded(Exception):
    """Raised when the Groq or Pixabay API reports that its rate limit was hit."""
|
|
|
|
|
|
|
|
# --- Configuration ---------------------------------------------------------
# NOTE(review): real API keys/tokens are committed below as defaults. They
# should be rotated and supplied exclusively via environment variables.
# GROQ_API_KEY and PIXABAY_API_KEY were previously hardcoded with no env
# override; they now follow the same os.getenv pattern as the other secrets.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "51175753-096073a3b283350c4eca0022f")

GROQ_MODEL = "gemma2-9b-it"      # Groq chat model used for all completions
REQUEST_TIMEOUT = 10             # seconds, for plain HTTP requests
GROQ_TIMEOUT = 30                # seconds, for Groq API calls
RETRY_BACKOFF_FACTOR = 2         # base of the exponential retry backoff
MAX_RETRIES = 3                  # attempts per HTTP request in retry_request()
DELAY_BETWEEN_REQUESTS = 3       # seconds of politeness delay between calls

# --- Logging ---------------------------------------------------------------
# One timestamped log file per run, mirrored to the console.
log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=[
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler()
    ]
)

# Shared Groq client (module-level singleton used by all AI helpers).
client = Groq(api_key=GROQ_API_KEY)
|
|
|
|
|
|
|
|
def send_telegram_message(message):
    """Post *message* to the configured Telegram chat.

    Failures are logged and swallowed (notification is best-effort).
    """
    try:
        endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        body = {
            "chat_id": TELEGRAM_CHAT_ID,
            "text": message,
            "parse_mode": "Markdown"
        }
        reply = requests.post(endpoint, json=body, timeout=5)
        reply.raise_for_status()
        logging.info("Telegram message sent successfully")
    except Exception as e:
        logging.error(f"Failed to send Telegram message: {str(e)}")
|
|
|
|
|
def is_valid_url(url):
    """Return True when *url* parses as an http(s) URL with a network location."""
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return parts.scheme in ['http', 'https'] and bool(parts.netloc)
|
|
|
|
|
def is_valid_html(html):
    """Return True when *html* is a non-empty string wrapped in an <article> element.

    The trimmed text must start with '<article' and end with '</article>'.
    Always returns a real boolean (the original returned the falsy input
    itself — e.g. None or "" — instead of False).
    """
    if not html:
        return False
    stripped = html.strip()  # strip once instead of twice
    return stripped.startswith('<article') and stripped.endswith('</article>')
|
|
|
|
|
def retry_request(func, *args, **kwargs):
    """Call *func* with retries and exponential backoff.

    Retries on HTTPError / RemoteDisconnected / ReadTimeout up to
    MAX_RETRIES times; HTTP 429 responses back off exponentially.

    Fix: if every attempt fails with 429, the LAST exception is re-raised
    instead of a generic Exception. The original raised a bare
    Exception("Max retries...") here, which destroyed the HTTPError that
    downstream callers inspect to raise RateLimitExceeded.

    Returns:
        Whatever *func* returns on the first successful attempt.

    Raises:
        The last underlying exception when all retries are exhausted.
    """
    last_exc = None
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            last_exc = e
            # e.response can be None for an HTTPError raised without a
            # response object — guard the status lookup.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status == 429:
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise
        except ReadTimeout as e:
            last_exc = e
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)
                continue
            raise
    # Only reachable when every attempt was a 429: surface the final
    # HTTPError so callers can convert it to RateLimitExceeded.
    if last_exc is not None:
        raise last_exc
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
|
|
|
|
|
def generate_image_keywords(text):
    """Generate image search keywords for *text* via the Groq chat API.

    Asks the model (prompt in Indonesian) for 3-5 short, brand-free
    keywords describing the article, then joins them with '+' so the
    result can be dropped straight into a Pixabay query URL.

    Parameters:
        text: Article body text; only the first 1000 characters are sent.

    Returns:
        A '+'-separated keyword string, or "default+image" on failure.

    Raises:
        RateLimitExceeded: when a Groq HTTP 429 is detected.
    """
    prompt = f"""
Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
- Relevan dengan topik utama artikel.
- Singkat dan spesifik (1-2 kata per frasa).
- Tidak mengandung nama merek atau orang.
- Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).

Teks artikel:
{text[:1000]} # Batasi ke 1000 karakter untuk efisiensi

Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_completion_tokens=50,  # keywords only — keep the reply short
            timeout=GROQ_TIMEOUT
        )
        keywords = completion.choices[0].message.content.strip()

        # Normalise "a, b c" into Pixabay's "a+b+c" query form.
        return keywords.replace(',', '+').replace(' ', '+')
    except HTTPError as e:
        # NOTE(review): the Groq SDK raises its own exception types, not
        # requests.HTTPError — confirm this branch is actually reachable.
        if e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded for keyword generation")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"
    except Exception as e:
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"
|
|
|
|
|
def fetch_pixabay_image(keywords):
    """Look up a large horizontal photo on Pixabay matching *keywords*.

    Returns the image URL, or "" when nothing matches or the request
    fails. Raises RateLimitExceeded on an HTTP 429 from Pixabay.
    """
    try:
        query_url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"

        def get_image():
            resp = requests.get(query_url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        data = retry_request(get_image).json()
        hits = data.get('hits')
        if not hits:
            logging.warning(f"No images found for keywords: {keywords}")
            return ""

        # Prefer the large rendition; fall back to the web-format one.
        first_hit = hits[0]
        image_url = first_hit.get('largeImageURL', first_hit['webformatURL'])
        logging.info(f"Fetched Pixabay image: {image_url}")
        return image_url
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Pixabay API rate limit exceeded")
            raise RateLimitExceeded("Pixabay API rate limit exceeded")
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
    except Exception as e:
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
|
|
|
|
|
|
|
|
def fetch_links(sheet_name="Sheet2"):
    """Return Google Sheet rows that still lack a title.

    Keeps only rows whose 'judul' field is empty and whose 'link' is a
    valid http(s) URL. Any failure yields an empty list.
    """
    try:
        def get_links():
            resp = requests.get(
                GAS_URL,
                params={"sheetName": sheet_name},
                timeout=REQUEST_TIMEOUT
            )
            resp.raise_for_status()
            return resp.json()

        rows = retry_request(get_links)
        pending = [
            row for row in rows
            if not row.get("judul") and is_valid_url(row.get("link"))
        ]
        logging.info(f"Fetched {len(pending)} links from sheet {sheet_name}")
        return pending
    except Exception as e:
        logging.error(f"Failed to fetch links: {str(e)}")
        return []
|
|
|
|
|
def clean_html(soup):
    """Strip ads, scripts, iframes and empty elements from a BeautifulSoup tree.

    Mutates *soup* in place and returns it for convenience.
    """
    # CSS selectors matching advertising / non-content noise.
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()

    # Drop elements that have no text, unless they contain media children.
    # FIX: the original additionally re-checked the (already empty) text for
    # Arabic characters — that condition could never match and was removed.
    for elem in soup.find_all():
        if not elem.get_text(strip=True) and not elem.find_all(['img', 'video']):
            elem.decompose()

    return soup
|
|
|
|
|
def scrape_detik(link):
    """Scrape and clean the article body text from a Detik-style page.

    Returns the plain text content, or None when the page cannot be
    fetched or yields no usable content.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }
    try:
        def get_article():
            resp = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        resp = retry_request(get_article)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Try the known Detik container first, then generic fallbacks.
        content = None
        for css in ('.detail__body-text', 'article', '.entry-content', '.post-content'):
            content = soup.select_one(css)
            if content:
                break
        if not content:
            logging.warning(f"No content found at {link}")
            return None

        text = clean_html(content).get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None

        return text
    except Exception as e:
        logging.error(f"Failed to scrape {link}: {str(e)}")
        return None
|
|
|
|
|
def rewrite_with_ai(text):
    """Rewrite an article into SEO-friendly HTML via Groq (streaming mode).

    Pipeline: ask the model for image keywords, fetch a matching Pixabay
    image, then prompt the model (in Indonesian) to rewrite *text* as a
    complete <article>...</article> HTML fragment embedding that image.

    Parameters:
        text: Plain-text article content to rewrite.

    Returns:
        The rewritten HTML string, or None on failure / invalid output.

    Raises:
        RateLimitExceeded: propagated from the keyword/image helpers, or
            when a Groq HTTP 429 is detected here.
    """
    # Pick an illustration first so its URL can be embedded in the prompt.
    keywords = generate_image_keywords(text)
    time.sleep(DELAY_BETWEEN_REQUESTS)  # politeness gap between API calls
    image_url = fetch_pixabay_image(keywords)

    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:

- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Ubah alur artikel, buat tata letak dan tampilan menarik,
- Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
- Sertakan gambar dari URL berikut (jika valid): {image_url}

Artikel asli:
{text}

Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,  # stream delta chunks; accumulated below
            stop=None,
            timeout=GROQ_TIMEOUT
        )

        # Accumulate the streamed delta chunks into one HTML string.
        html_content = ""
        for chunk in completion:
            content = chunk.choices[0].delta.content or ""
            html_content += content

        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except HTTPError as e:
        # NOTE(review): the Groq SDK raises its own exception types, not
        # requests.HTTPError — confirm this branch is actually reachable.
        if e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
|
|
|
|
|
def extract_title_from_html(html):
    """Return the text of the first <h2> in *html*, or a fallback title."""
    fallback = "Judul Tidak Ditemukan"
    try:
        heading = BeautifulSoup(html, 'html.parser').find('h2')
        if heading is None:
            return fallback
        return heading.get_text(strip=True)
    except Exception as e:
        logging.error(f"Failed to extract title: {str(e)}")
        return fallback
|
|
|
|
|
def kirim_ke_sheet(judul, konten_html, link):
    """Push the rewritten title and HTML content back to the Google Sheet.

    Skips (with a warning) when either the title or content is empty.
    """
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return

    payload = {
        "method": "updateRowByLink",
        "link": link,
        "judul": judul,
        "konten": konten_html
    }
    try:
        def send_data():
            resp = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        retry_request(send_data)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as e:
        logging.error(f"Failed to send to sheet for {link}: {str(e)}")
|
|
|
|
|
|
|
|
def main():
    """Fetch pending links, scrape each, rewrite with AI, and push results back.

    Processes at most MAX_ARTICLES rows per run and reports the outcome to
    Telegram; rate-limit and unexpected errors are reported and re-raised.
    """
    logging.info("Starting scrape and rewrite process")
    MAX_ARTICLES = 20
    processed_count = 0
    try:
        rows = fetch_links()
        total = len(rows)
        logging.info(f"Found {total} links to process")

        for idx, row in enumerate(rows, 1):
            if processed_count >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break

            link = row['link']
            logging.info(f"[{idx}/{total}] Processing: {link}")

            artikel = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue

            rewrite_html = rewrite_with_ai(artikel)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue

            # Cool-down after the AI call before writing results back.
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)

            judul = extract_title_from_html(rewrite_html)
            kirim_ke_sheet(judul, rewrite_html, link)
            processed_count += 1

            time.sleep(DELAY_BETWEEN_REQUESTS)

        send_telegram_message(
            f"✅ *Scrape and Rewrite Completed*\nProcessed {processed_count} articles successfully."
        )
    except RateLimitExceeded as e:
        logging.error(str(e))
        send_telegram_message(
            f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
        )
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(
            f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {processed_count} articles before termination."
        )
        raise
    finally:
        logging.info("Process ended")
|
|
|
|
|
|
|
|
def run_scheduler():
    """Run main() once per day at 05:00 (server local time), forever.

    FIX: the original docstring and log message claimed runs at 00:00 and
    12:00 WIB, but the only job ever registered was daily at "05:00" —
    the messages now reflect the actual schedule.
    """
    schedule.every().day.at("05:00").do(main)
    logging.info("Scheduler started, next run scheduled daily at 05:00")

    # Poll the schedule once a minute; main() runs inline in this thread.
    while True:
        schedule.run_pending()
        time.sleep(60)
|
|
|
|
|
|
|
|
def gradio_interface():
    """Gradio callback: run the full pipeline synchronously, then report back."""
    main()
    return "Manual execution started. Check logs for details."
|
|
|
|
|
if __name__ == "__main__":
    # "manual" as the first CLI argument runs the pipeline once and exits;
    # otherwise a background scheduler thread plus the Gradio UI are started.
    if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
        logging.info("Running in manual mode")
        main()
    else:
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()

        # FIX: the description previously advertised a scheduled run at
        # 00:00 WIB, but run_scheduler() registers a daily 05:00 job.
        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled daily run at 05:00."
        )
        logging.info("Starting Gradio interface")
        iface.launch()