import json
import logging
import os
import re
import sys
import threading
import time
from datetime import datetime
from http.client import RemoteDisconnected
from urllib.parse import urlparse

import gradio as gr
import requests
import schedule
from bs4 import BeautifulSoup
from groq import Groq
from requests.exceptions import HTTPError, ReadTimeout, RequestException


# === CUSTOM EXCEPTION ===
class RateLimitExceeded(Exception):
    """Exception raised when Groq API or Pixabay API rate limit is exceeded."""
    pass


# === CONFIGURATION ===
# SECURITY NOTE(review): the Groq and Pixabay credentials were previously
# hard-coded in source (and are therefore compromised). They are now read from
# the environment like GAS_URL / the Telegram settings already were, with the
# original literals kept as fallbacks so existing deployments keep working.
# Rotate these keys and delete the fallbacks as soon as possible.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "51175753-096073a3b283350c4eca0022f")

GROQ_MODEL = "gemma2-9b-it"
REQUEST_TIMEOUT = 10          # seconds, for plain HTTP requests
GROQ_TIMEOUT = 30             # seconds, for Groq completion calls
RETRY_BACKOFF_FACTOR = 2      # base of the exponential backoff
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 3    # seconds between consecutive API calls

# Setup logging with a timestamp-based file so each run gets its own log.
log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=[
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler()
    ]
)

client = Groq(api_key=GROQ_API_KEY)


# === HELPER FUNCTIONS ===
def send_telegram_message(message):
    """Send a Markdown-formatted notification to the configured Telegram chat.

    Best-effort: failures are logged, never raised, so a Telegram outage
    cannot abort the scraping run.
    """
    try:
        url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        payload = {
            "chat_id": TELEGRAM_CHAT_ID,
            "text": message,
            "parse_mode": "Markdown"
        }
        response = requests.post(url, json=payload, timeout=5)
        response.raise_for_status()
        logging.info("Telegram message sent successfully")
    except Exception as e:
        logging.error(f"Failed to send Telegram message: {str(e)}")


def is_valid_url(url):
    """Return True if *url* is an absolute http(s) URL with a host."""
    try:
        result = urlparse(url)
        return all([result.scheme in ['http', 'https'], result.netloc])
    except Exception:
        return False


def is_valid_html(html):
    """Check that the AI output is non-empty and starts with an <article> tag.

    NOTE(review): the previous check was ``startswith('')`` (always True for
    any non-empty string) because the tag literal was lost when the file was
    mangled; ``'<article'`` matches the tag the rewrite prompt requests
    (without the closing '>' so attributes on the tag still pass).
    """
    return bool(html) and html.strip().startswith('<article')


def retry_request(func, *args, **kwargs):
    """Call *func* with retries and exponential backoff.

    HTTP 429 responses back off ``RETRY_BACKOFF_FACTOR ** attempt`` seconds;
    other HTTP errors / dropped connections / read timeouts are retried up to
    MAX_RETRIES times, then the last exception is re-raised.

    Returns whatever *func* returns on the first successful call.
    """
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            if isinstance(e, HTTPError) and e.response.status_code == 429:
                # Rate limited: wait and fall through to the next attempt.
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise
        except ReadTimeout as e:
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)
                continue
            raise
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")


def generate_image_keywords(text):
    """Generate Pixabay search keywords for *text* using the Groq model.

    Returns a '+'-joined keyword string ready to embed in a Pixabay query URL,
    or the fallback "default+image" on failure.

    Raises:
        RateLimitExceeded: when the Groq API reports a 429 rate limit.
    """
    # Only the first 1000 characters are sent, for efficiency. (This note used
    # to live *inside* the prompt string itself and leaked to the model.)
    prompt = f"""
Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
- Relevan dengan topik utama artikel.
- Singkat dan spesifik (1-2 kata per frasa).
- Tidak mengandung nama merek atau orang.
- Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).

Teks artikel:
{text[:1000]}

Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_completion_tokens=50,
            timeout=GROQ_TIMEOUT
        )
        keywords = completion.choices[0].message.content.strip()
        # Make the comma-separated list URL-friendly: "a, b" -> "a+b".
        return keywords.replace(',', '+').replace(' ', '+')
    except Exception as e:
        # NOTE(review): the Groq SDK raises its own error types (e.g.
        # groq.RateLimitError with status_code == 429), NOT requests.HTTPError,
        # so detect rate limits via the status_code attribute. This also still
        # covers a plain HTTPError with a 429 response.
        status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
        if status == 429:
            logging.error("Groq API rate limit exceeded for keyword generation")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"  # Fallback keyword


def fetch_pixabay_image(keywords):
    """Fetch one horizontal HD photo URL from Pixabay for *keywords*.

    Returns the image URL, or "" when nothing is found or the request fails.

    Raises:
        RateLimitExceeded: when Pixabay reports a 429 rate limit.
    """
    try:
        url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"

        def get_image():
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        response = retry_request(get_image)
        data = response.json()
        if data.get('hits') and len(data['hits']) > 0:
            # Prefer largeImageURL for HD quality, fall back to webformatURL.
            image = data['hits'][0]
            image_url = image.get('largeImageURL', image['webformatURL'])
            logging.info(f"Fetched Pixabay image: {image_url}")
            return image_url
        else:
            logging.warning(f"No images found for keywords: {keywords}")
            return ""
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Pixabay API rate limit exceeded")
            raise RateLimitExceeded("Pixabay API rate limit exceeded")
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
    except Exception as e:
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""


# === CORE FUNCTIONS ===
def fetch_links(sheet_name="Sheet2"):
    """Fetch rows from the Google Sheet whose "judul" column is still empty.

    Only rows with a valid http(s) "link" are returned; an empty "judul"
    marks an article that has not been processed yet. Returns [] on failure.
    """
    try:
        def get_links():
            resp = requests.get(
                GAS_URL,
                params={"sheetName": sheet_name},
                timeout=REQUEST_TIMEOUT
            )
            resp.raise_for_status()
            return resp.json()

        data = retry_request(get_links)
        links = [item for item in data if not item.get("judul") and is_valid_url(item.get("link"))]
        logging.info(f"Fetched {len(links)} links from sheet {sheet_name}")
        return links
    except Exception as e:
        logging.error(f"Failed to fetch links: {str(e)}")
        return []


def clean_html(soup):
    """Remove ads, scripts, and empty elements from the parsed HTML in place.

    Returns the same *soup* object for convenience.
    """
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()
    # Remove elements with no text and no media children.
    # NOTE(review): the Arabic-range regex clause is vacuous here (text is
    # already known to be empty when it runs) but is kept to preserve the
    # original behavior; the apparent intent was to protect Arabic text.
    for elem in soup.find_all():
        text = elem.get_text(strip=True)
        if not text and not elem.find_all(['img', 'video']) and not re.search(r'[\u0600-\u06FF]', text):
            elem.decompose()
    return soup


def scrape_detik(link):
    """Scrape the article body text from a Detik (or similar) page.

    Returns the cleaned plain text, or None when the page has no recognizable
    content container or the request fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }
    try:
        def get_article():
            resp = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        resp = retry_request(get_article)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')

        # Try Detik's container first, then generic article selectors.
        content = (
            soup.select_one('.detail__body-text') or
            soup.select_one('article') or
            soup.select_one('.entry-content') or
            soup.select_one('.post-content')
        )
        if not content:
            logging.warning(f"No content found at {link}")
            return None

        cleaned_content = clean_html(content)
        text = cleaned_content.get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None
        return text
    except Exception as e:
        logging.error(f"Failed to scrape {link}: {str(e)}")
        return None


def rewrite_with_ai(text):
    """Rewrite an article with the Groq model (streaming) and embed an image.

    Generates Pixabay keywords, fetches an image URL, then asks the model for
    a full <article>...</article> HTML rewrite. Returns the HTML string, or
    None when the model output is not a valid article.

    Raises:
        RateLimitExceeded: when the Groq API reports a 429 rate limit.
    """
    # Generate keywords for Pixabay, pausing between API calls to stay under
    # the rate limits.
    keywords = generate_image_keywords(text)
    time.sleep(DELAY_BETWEEN_REQUESTS)
    image_url = fetch_pixabay_image(keywords)

    # NOTE(review): the HTML tag names below (<article>, <p>, <h2>/<h3>,
    # <img>, <h1>) were stripped from the prompt when the file was mangled and
    # have been reconstructed from context (extract_title_from_html expects an
    # <h2> title, is_valid_html expects a leading <article>).
    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Ubah alur artikel, buat tata letak dan tampilan menarik,
- Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> dan <h3> untuk subjudul, dan <img> untuk gambar (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
- Sertakan gambar dari URL berikut (jika valid): {image_url}

Artikel asli:
{text}

Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT
        )
        # Collect the streamed chunks into one HTML string.
        html_content = ""
        for chunk in completion:
            content = chunk.choices[0].delta.content or ""
            html_content += content
        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except Exception as e:
        # NOTE(review): the Groq SDK raises groq.RateLimitError (status_code
        # 429), not requests.HTTPError -- detect via status_code so the rate
        # limit actually aborts the run as intended.
        status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
        if status == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None


def extract_title_from_html(html):
    """Extract the article title (first <h2>) from the rewritten HTML."""
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('h2')
        title = title_tag.get_text(strip=True) if title_tag else "Judul Tidak Ditemukan"
        return title
    except Exception as e:
        logging.error(f"Failed to extract title: {str(e)}")
        return "Judul Tidak Ditemukan"


def kirim_ke_sheet(judul, konten_html, link):
    """Send the rewritten title and content back to the Google Sheet row
    identified by *link*. Failures are logged, not raised."""
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return
    try:
        payload = {
            "method": "updateRowByLink",
            "link": link,
            "judul": judul,
            "konten": konten_html
        }

        def send_data():
            resp = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp

        retry_request(send_data)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as e:
        logging.error(f"Failed to send to sheet for {link}: {str(e)}")


# === MAIN ===
def main():
    """Process up to MAX_ARTICLES pending links: scrape, rewrite, upload.

    Sends a Telegram summary on completion; on a rate limit or unexpected
    error it notifies Telegram and re-raises.
    """
    logging.info("Starting scrape and rewrite process")
    processed_count = 0
    MAX_ARTICLES = 20
    try:
        rows = fetch_links()
        logging.info(f"Found {len(rows)} links to process")
        for idx, row in enumerate(rows, 1):
            if processed_count >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break
            link = row['link']
            logging.info(f"[{idx}/{len(rows)}] Processing: {link}")

            # Scrape article
            artikel = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue

            # Rewrite with AI and fetch Pixabay image
            rewrite_html = rewrite_with_ai(artikel)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue

            # Cool-down after the AI call to stay well under rate limits.
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)

            # Extract title and persist the result.
            judul = extract_title_from_html(rewrite_html)
            kirim_ke_sheet(judul, rewrite_html, link)
            processed_count += 1

            # Delay to avoid rate limits between articles.
            time.sleep(DELAY_BETWEEN_REQUESTS)

        message = f"✅ *Scrape and Rewrite Completed*\nProcessed {processed_count} articles successfully."
        send_telegram_message(message)
    except RateLimitExceeded as e:
        message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
        logging.error(str(e))
        send_telegram_message(message)
        raise
    except Exception as e:
        message = f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {processed_count} articles before termination."
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(message)
        raise
    finally:
        logging.info("Process ended")


# === SCHEDULER ===
def run_scheduler():
    """Run main() once per day at 05:00 server time.

    NOTE(review): 05:00 is 12:00 WIB only if the server clock is UTC; the
    original comments claimed runs at both 00:00 and 12:00 WIB, but only this
    single job is scheduled -- confirm the intended schedule.
    """
    schedule.every().day.at("05:00").do(main)
    logging.info("Scheduler started, waiting untuk 00:00 WIB dan 12:00 WIB")
    while True:
        schedule.run_pending()
        time.sleep(60)


# === GRADIO INTERFACE ===
def gradio_interface():
    """Gradio callback for manual execution.

    Runs main() synchronously, so the UI only returns once the whole batch
    has been processed.
    """
    main()
    return "Manual execution started. Check logs for details."
if __name__ == "__main__":
    # "manual" as the first CLI argument runs one batch immediately;
    # otherwise start the daily scheduler in the background and serve the
    # Gradio UI for on-demand runs.
    manual_mode = len(sys.argv) > 1 and sys.argv[1].lower() == "manual"
    if manual_mode:
        logging.info("Running in manual mode")
        main()
    else:
        # Daemon thread so the scheduler dies together with the process.
        threading.Thread(target=run_scheduler, daemon=True).start()
        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB.",
        )
        logging.info("Starting Gradio interface")
        iface.launch()