| | import requests |
| | from bs4 import BeautifulSoup |
| | import json |
| | import time |
| | import logging |
| | import re |
| | from urllib.parse import urlparse |
| | from groq import Groq |
| | from requests.exceptions import HTTPError, RequestException, ReadTimeout |
| | from http.client import RemoteDisconnected |
| | import os |
| | from datetime import datetime |
| | import schedule |
| | import threading |
| | import sys |
| | import gradio as gr |
| |
|
| | |
class RateLimitExceeded(Exception):
    """Raised when the Groq API or the Pixabay API reports a rate limit."""
| |
|
| | |
# --- Configuration -----------------------------------------------------------
# SECURITY: API keys and tokens were previously hard-coded below. They are now
# read from the environment first; the inline fallbacks are kept only for
# backward compatibility and should be removed (and the exposed credentials
# rotated) once the environment is configured.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbw9-8PtNI5tyDSS4dvXLLzmhD2scr4enMGvXFQvFUZsorUyZyfpfBOH216DgkQc68PH/exec")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "51175753-096073a3b283350c4eca0022f")

# Model and request tuning knobs.
GROQ_MODEL = "gemma2-9b-it"    # Groq chat model used for rewriting
REQUEST_TIMEOUT = 10           # seconds for plain HTTP requests
GROQ_TIMEOUT = 30              # seconds for Groq API calls
RETRY_BACKOFF_FACTOR = 2       # exponential backoff base for retries
MAX_RETRIES = 3                # attempts per HTTP request
DELAY_BETWEEN_REQUESTS = 3     # polite delay (s) between external calls
| |
|
| | |
# Log to both a timestamped file and the console.
_run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = f"scrape_rewrite_{_run_stamp}.log"
_handlers = [
    logging.FileHandler(log_file, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    level=logging.INFO,
    handlers=_handlers,
)
| |
|
# Shared Groq client used by all AI calls in this module.
client = Groq(api_key=GROQ_API_KEY)
| |
|
| | |
def send_telegram_message(message):
    """Deliver a notification to the configured Telegram chat.

    Failures are logged and swallowed on purpose: a broken notification
    must never abort the scraping pipeline.
    """
    try:
        endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        body = {
            "chat_id": TELEGRAM_CHAT_ID,
            "text": message,
            "parse_mode": "Markdown",
        }
        reply = requests.post(endpoint, json=body, timeout=5)
        reply.raise_for_status()
    except Exception as e:
        logging.error(f"Failed to send Telegram message: {str(e)}")
    else:
        logging.info("Telegram message sent successfully")
| |
|
def is_valid_url(url):
    """Return True when *url* parses as an absolute http/https URL."""
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)
| |
|
def is_valid_html(html):
    """Return True when *html* is a non-empty string wrapped in an <article> element.

    The rewriter is instructed to emit exactly ``<article>...</article>``;
    anything else (None, empty string, stray text around the tags) is rejected.

    Fix over the previous version: this always returns a bool. The old code
    returned the falsy input itself (e.g. None) when *html* was empty, and it
    called ``strip()`` twice.
    """
    if not html:
        return False
    stripped = html.strip()
    return stripped.startswith('<article') and stripped.endswith('</article>')
| |
|
def retry_request(func, *args, **kwargs):
    """Call *func* with retries and exponential backoff.

    Retries up to MAX_RETRIES times on HTTPError, RemoteDisconnected and
    ReadTimeout; any other exception propagates immediately.

    Raises the last HTTPError/RemoteDisconnected when retries are exhausted
    on a non-429 error, the last ReadTimeout when retries are exhausted on
    timeouts, or a generic Exception when every attempt was rate-limited
    (the HTTP 429 path never re-raises inside the loop).
    """
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            # assumes e.response is set on HTTPError — TODO confirm for
            # errors raised without an attached response object
            if isinstance(e, HTTPError) and e.response.status_code == 429:
                # Rate limited: back off 2**attempt seconds, then fall
                # through to the next loop iteration (implicit continue).
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    # Precedence note: sleeps (2 ** attempt) + 1 seconds,
                    # not 2 ** (attempt + 1).
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise
        except ReadTimeout as e:
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)
                continue
            raise
    # Reached only when every attempt took the 429 branch above.
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
| |
|
def generate_image_keywords(text):
    """Ask the Groq model for Pixabay search keywords describing *text*.

    Returns a '+'-joined keyword string suitable for a query parameter, or
    "default+image" on any failure. Raises RateLimitExceeded when the API
    reports HTTP 429.
    """
    prompt = f"""
Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
- Relevan dengan topik utama artikel.
- Singkat dan spesifik (1-2 kata per frasa).
- Tidak mengandung nama merek atau orang.
- Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).

Teks artikel:
{text[:1000]} # Batasi ke 1000 karakter untuk efisiensi

Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
"""
    try:
        response = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_completion_tokens=50,
            timeout=GROQ_TIMEOUT,
        )
        raw = response.choices[0].message.content.strip()
        # Turn "a, b,c" into "a+b+c" so it can be embedded in a URL query.
        return raw.replace(',', '+').replace(' ', '+')
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded for keyword generation")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"
    except Exception as e:
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"
| |
|
def fetch_pixabay_image(keywords):
    """Look up a horizontal photo on Pixabay for '+'-joined *keywords*.

    Returns the image URL of the first hit (preferring largeImageURL), or
    "" when nothing matches or the lookup fails. Raises RateLimitExceeded
    when Pixabay answers HTTP 429.
    """
    try:
        query = (
            f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}"
            f"&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"
        )

        def _do_get():
            reply = requests.get(query, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply

        payload = retry_request(_do_get).json()
        hits = payload.get('hits')
        if not hits:
            logging.warning(f"No images found for keywords: {keywords}")
            return ""

        first = hits[0]
        image_url = first.get('largeImageURL', first['webformatURL'])
        logging.info(f"Fetched Pixabay image: {image_url}")
        return image_url
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Pixabay API rate limit exceeded")
            raise RateLimitExceeded("Pixabay API rate limit exceeded")
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
    except Exception as e:
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
| |
|
| | |
def fetch_links(sheet_name="Sheet2"):
    """Return rows from the Google Sheet that still need processing.

    A row qualifies when its "judul" (title) cell is empty and its "link"
    cell holds a valid http(s) URL. Returns an empty list on any failure.
    """
    try:
        def _pull():
            reply = requests.get(
                GAS_URL,
                params={"sheetName": sheet_name},
                timeout=REQUEST_TIMEOUT,
            )
            reply.raise_for_status()
            return reply.json()

        rows = retry_request(_pull)
        pending = [
            row for row in rows
            if not row.get("judul") and is_valid_url(row.get("link"))
        ]
        logging.info(f"Fetched {len(pending)} links from sheet {sheet_name}")
        return pending
    except Exception as e:
        logging.error(f"Failed to fetch links: {str(e)}")
        return []
| |
|
def clean_html(soup):
    """Strip ads, scripts and empty elements from a BeautifulSoup tree in place.

    Returns the same tree object so callers can chain off it.
    """
    # CSS selectors matching common ad / tracking / boilerplate containers.
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()

    # Drop elements that carry neither text nor media. find_all() snapshots
    # the tree, so decomposing a parent leaves its already-destroyed
    # descendants in the list; skip those to avoid touching decomposed nodes
    # (their behavior is undefined per the bs4 docs).
    # NOTE(review): the original also re-tested the (already known empty)
    # text for Arabic characters, which could never match — that dead check
    # is removed. If Arabic-bearing elements must be preserved, the check
    # belongs on non-empty text instead.
    for elem in soup.find_all():
        if getattr(elem, "decomposed", False):
            continue
        text = elem.get_text(strip=True)
        if not text and not elem.find_all(['img', 'video']):
            elem.decompose()

    return soup
| |
|
def scrape_detik(link):
    """Download a Detik article page and return its plain-text body.

    Returns None when the page cannot be fetched, has no recognizable
    content container, or is empty after ad/boilerplate cleanup.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }
    try:
        def _download():
            reply = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply

        page = retry_request(_download)
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, 'html.parser')

        # Try Detik's own body container first, then generic fallbacks.
        content = None
        for selector in ('.detail__body-text', 'article', '.entry-content', '.post-content'):
            content = soup.select_one(selector)
            if content:
                break
        if not content:
            logging.warning(f"No content found at {link}")
            return None

        text = clean_html(content).get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None
        return text
    except Exception as e:
        logging.error(f"Failed to scrape {link}: {str(e)}")
        return None
| |
|
def rewrite_with_ai(text):
    """Rewrite an article with the Groq model (streamed) and embed a Pixabay image.

    Returns the rewritten <article> HTML, or None when the model call fails
    or the output is not a valid article. Raises RateLimitExceeded on HTTP 429.
    """
    # Pick an illustration first so its URL can be baked into the prompt.
    keywords = generate_image_keywords(text)
    time.sleep(DELAY_BETWEEN_REQUESTS)
    image_url = fetch_pixabay_image(keywords)

    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:

- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Ubah alur artikel, buat tata letak dan tampilan menarik seperti berita pada umumnya,
- Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan nama tempat dan waktu,
- Wajib sertakan gambar dari URL berikut: {image_url}

Artikel asli:
{text}

Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        stream = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT,
        )

        # Collect streamed deltas and join once at the end.
        pieces = [chunk.choices[0].delta.content or "" for chunk in stream]
        html_content = "".join(pieces).strip()

        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
| |
|
def extract_title_from_html(html):
    """Pull the first <h2> text out of rewritten HTML to use as the title.

    Falls back to "Judul Tidak Ditemukan" when no <h2> exists or parsing fails.
    """
    try:
        heading = BeautifulSoup(html, 'html.parser').find('h2')
        if heading:
            return heading.get_text(strip=True)
        return "Judul Tidak Ditemukan"
    except Exception as e:
        logging.error(f"Failed to extract title: {str(e)}")
        return "Judul Tidak Ditemukan"
| |
|
def kirim_ke_sheet(judul, konten_html, link):
    """Push the rewritten title and HTML body back to the Google Sheet row.

    The Apps Script endpoint matches the row by its "link" column. Skips the
    call entirely when either the title or the content is empty; send
    failures are logged, not raised.
    """
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return

    try:
        payload = {
            "method": "updateRowByLink",
            "link": link,
            "judul": judul,
            "konten": konten_html
        }

        def _push():
            reply = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply

        retry_request(_push)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as e:
        logging.error(f"Failed to send to sheet for {link}: {str(e)}")
| |
|
| | |
def main():
    """Fetch pending links, scrape, rewrite with AI, and write results back.

    Processes at most MAX_ARTICLES rows per run and reports the outcome to
    Telegram. RateLimitExceeded and any unexpected error abort the run
    (after notifying Telegram) and are re-raised to the caller.
    """
    logging.info("Starting scrape and rewrite process")
    processed_count = 0
    MAX_ARTICLES = 21  # per-run cap, presumably to stay within API quotas — TODO confirm
    try:
        rows = fetch_links()
        logging.info(f"Found {len(rows)} links to process")

        for idx, row in enumerate(rows, 1):
            if processed_count >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break

            link = row['link']
            logging.info(f"[{idx}/{len(rows)}] Processing: {link}")

            # Scrape the original article; skip the row on failure.
            artikel = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue

            # Rewrite via the AI model; skip the row on failure.
            rewrite_html = rewrite_with_ai(artikel)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue

            # Long cool-down after each AI rewrite.
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)

            # Derive the sheet title from the rewritten HTML's first <h2>.
            judul = extract_title_from_html(rewrite_html)

            # Persist the result back to the Google Sheet.
            kirim_ke_sheet(judul, rewrite_html, link)
            processed_count += 1

            time.sleep(DELAY_BETWEEN_REQUESTS)

        message = f"✅ *Scrape and Rewrite Completed*\nProcessed {processed_count} articles successfully."
        send_telegram_message(message)

    except RateLimitExceeded as e:
        message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
        logging.error(str(e))
        send_telegram_message(message)
        raise
    except Exception as e:
        message = f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {processed_count} articles before termination."
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(message)
        raise
    finally:
        logging.info("Process ended")
| |
|
| | |
def run_scheduler():
    """Run main() once a day via the `schedule` library, polling every minute.

    NOTE(review): the original docstring and log message claimed runs at
    00:00 and 12:00 WIB, but only a single daily 06:00 job was ever
    registered. The log message now reflects the actual schedule; if two
    daily runs were intended, register a second schedule entry.
    """
    schedule.every().day.at("06:00").do(main)
    logging.info("Scheduler started, next run scheduled daily at 06:00")

    # Poll once a minute; run_pending() fires any jobs that have come due.
    while True:
        schedule.run_pending()
        time.sleep(60)
| |
|
| | |
def gradio_interface():
    """Gradio callback: run the full scrape/rewrite pipeline synchronously.

    NOTE(review): main() blocks until every article is processed, so the
    returned "started" message actually appears only after completion.
    """
    main()
    return "Manual execution started. Check logs for details."
| |
|
if __name__ == "__main__":
    # "python script.py manual" runs the pipeline once, without the UI.
    if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
        logging.info("Running in manual mode")
        main()
    else:
        # Daemon thread handles the daily schedule in the background while
        # the Gradio UI stays in the foreground for manual triggering.
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()
        # NOTE(review): the description mentions 00:00 WIB, but run_scheduler
        # registers a 06:00 job — confirm the intended schedule.
        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB."
        )
        logging.info("Starting Gradio interface")
        iface.launch()