# scraper/app.py — article scraper & AI rewriter (Hugging Face Space)
# Page metadata from the Space listing, kept for provenance:
#   aronsaras's picture / "Update app.py" / commit 46b0ee8 verified
import requests
from bs4 import BeautifulSoup
import json
import time
import logging
import re
from urllib.parse import urlparse
from groq import Groq
from requests.exceptions import HTTPError, RequestException, ReadTimeout
from http.client import RemoteDisconnected
import os
from datetime import datetime
import schedule
import threading
import sys
import gradio as gr
# === CUSTOM EXCEPTION ===
class RateLimitExceeded(Exception):
    """Raised when the Groq or Pixabay API reports a rate-limit (HTTP 429) condition."""
# === CONFIGURATION ===
# Every endpoint/credential can be overridden via environment variables.
# SECURITY NOTE(review): the fallback values below embed live API keys and a
# bot token directly in source control — rotate these credentials and supply
# them exclusively through the environment.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "51175753-096073a3b283350c4eca0022f")
GROQ_MODEL = "gemma2-9b-it"     # Groq chat model used for keywords and rewriting
REQUEST_TIMEOUT = 10            # seconds, for plain HTTP requests
GROQ_TIMEOUT = 30               # seconds, for Groq API calls
RETRY_BACKOFF_FACTOR = 2        # base of the exponential retry backoff
MAX_RETRIES = 3                 # attempts per HTTP request
DELAY_BETWEEN_REQUESTS = 3      # seconds between consecutive API calls
# Each run logs to its own timestamped file as well as to the console.
log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
_log_handlers = [
    logging.FileHandler(log_file, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=_log_handlers,
)
# Single shared Groq client, reused by both AI helpers below.
client = Groq(api_key=GROQ_API_KEY)
# === HELPER FUNCTIONS ===
def send_telegram_message(message):
    """Best-effort delivery of *message* to the configured Telegram chat.

    Any failure is logged and swallowed so that a notification problem
    never aborts the scraping run.
    """
    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    body = {
        "chat_id": TELEGRAM_CHAT_ID,
        "text": message,
        "parse_mode": "Markdown"
    }
    try:
        reply = requests.post(endpoint, json=body, timeout=5)
        reply.raise_for_status()
        logging.info("Telegram message sent successfully")
    except Exception as e:
        logging.error(f"Failed to send Telegram message: {str(e)}")
def is_valid_url(url):
    """Return True when *url* parses as an absolute http(s) URL, else False."""
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return bool(parts.netloc) and parts.scheme in ('http', 'https')
def is_valid_html(html):
    """Return True when *html* is a non-empty string wrapped in <article>...</article>.

    The original returned the falsy input itself ("" or None) instead of
    False; this version always returns a proper bool, which is still falsy-
    compatible for every caller (`if not is_valid_html(...)`).
    """
    if not html:
        return False
    stripped = html.strip()
    return stripped.startswith('<article') and stripped.endswith('</article>')
def retry_request(func, *args, **kwargs):
    """Call *func* with retries and exponential backoff.

    Retries on HTTP 429 (rate limit), other HTTP errors, dropped
    connections, and read timeouts; any other exception propagates
    immediately.  Raises the last caught error, or a generic Exception
    once all MAX_RETRIES attempts are exhausted.

    Fix: the 429 branch previously slept even on the final attempt and
    then fell through to the exhaustion raise — that last sleep was pure
    wasted time and is now skipped.
    """
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            if isinstance(e, HTTPError) and e.response.status_code == 429:
                # Rate limited: back off exponentially, then loop around.
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                if attempt < MAX_RETRIES - 1:  # no point sleeping when no retry follows
                    time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    # NOTE(review): `** attempt + 1` binds as `(base ** attempt) + 1`;
                    # if `base ** (attempt + 1)` was intended, add parentheses.
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise
        except ReadTimeout as e:
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)
                continue
            raise
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
def generate_image_keywords(text):
    """Ask the Groq model for Pixabay search keywords describing *text*.

    Returns a '+'-joined keyword string ready for a Pixabay query URL;
    falls back to "default+image" on any failure.  Raises RateLimitExceeded
    when the API answers HTTP 429.
    """
    prompt = f"""
Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
- Relevan dengan topik utama artikel.
- Singkat dan spesifik (1-2 kata per frasa).
- Tidak mengandung nama merek atau orang.
- Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).
Teks artikel:
{text[:1000]} # Batasi ke 1000 karakter untuk efisiensi
Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
"""
    try:
        response = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_completion_tokens=50,
            timeout=GROQ_TIMEOUT,
        )
        raw = response.choices[0].message.content.strip()
        # Turn "a, b, c" into a URL-ready "a+b+c" query fragment.
        return raw.replace(',', '+').replace(' ', '+')
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded for keyword generation")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"  # fallback keyword
    except Exception as e:
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"
def fetch_pixabay_image(keywords):
    """Return the URL of the best-matching Pixabay photo for *keywords*.

    Prefers the HD largeImageURL, falling back to webformatURL.  Returns
    "" when nothing matches or the request fails; raises RateLimitExceeded
    on HTTP 429.
    """
    try:
        url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"

        def _fetch():
            reply = requests.get(url, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply

        data = retry_request(_fetch).json()
        hits = data.get('hits') or []
        if not hits:
            logging.warning(f"No images found for keywords: {keywords}")
            return ""
        first = hits[0]
        # Prefer largeImageURL for HD, fall back to webformatURL.
        image_url = first.get('largeImageURL', first['webformatURL'])
        logging.info(f"Fetched Pixabay image: {image_url}")
        return image_url
    except HTTPError as e:
        if e.response.status_code == 429:
            logging.error("Pixabay API rate limit exceeded")
            raise RateLimitExceeded("Pixabay API rate limit exceeded")
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
    except Exception as e:
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
# === CORE FUNCTIONS ===
def fetch_links(sheet_name="Sheet2"):
    """Return sheet rows that still need processing.

    A row qualifies when its "judul" cell is empty and its "link" cell
    holds a valid http(s) URL.  Returns [] on any fetch error.
    """
    try:
        def _query():
            reply = requests.get(
                GAS_URL,
                params={"sheetName": sheet_name},
                timeout=REQUEST_TIMEOUT
            )
            reply.raise_for_status()
            return reply.json()

        rows = retry_request(_query)
        pending = [row for row in rows
                   if not row.get("judul") and is_valid_url(row.get("link"))]
        logging.info(f"Fetched {len(pending)} links from sheet {sheet_name}")
        return pending
    except Exception as e:
        logging.error(f"Failed to fetch links: {str(e)}")
        return []
def clean_html(soup):
    """Strip ads, scripts, and empty elements from *soup* in place; return it.

    Elements with no text are removed unless they contain <img>/<video>
    media.  Fixes vs. the original: (1) the "preserve Arabic text" regex
    ran only after `not text` was already true, i.e. on an empty string,
    so it could never match and was a no-op — it has been dropped;
    (2) nodes whose ancestor was already decomposed in an earlier
    iteration are now skipped instead of being touched after destruction.
    """
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()
    # find_all() snapshots the tree up front, so guard against elements
    # detached by an earlier decompose() (their .parent becomes None).
    for elem in soup.find_all():
        if elem.parent is None:
            continue
        if not elem.get_text(strip=True) and not elem.find_all(['img', 'video']):
            elem.decompose()
    return soup
def scrape_detik(link):
    """Download *link* and return its cleaned article text, or None on failure."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }
    try:
        def _download():
            reply = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply

        page = retry_request(_download)
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, 'html.parser')
        # Try Detik's own layout first, then generic article containers.
        container = None
        for selector in ('.detail__body-text', 'article', '.entry-content', '.post-content'):
            container = soup.select_one(selector)
            if container:
                break
        if not container:
            logging.warning(f"No content found at {link}")
            return None
        text = clean_html(container).get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None
        return text
    except Exception as e:
        logging.error(f"Failed to scrape {link}: {str(e)}")
        return None
def rewrite_with_ai(text):
    """Rewrite article using Groq AI model in streaming mode, include Pixabay image.

    Returns the rewritten <article>...</article> HTML string, or None when
    the model output is not a valid article or the request fails.  Raises
    RateLimitExceeded on an HTTP 429 from the Groq API.
    """
    # Pick the Pixabay illustration first so its URL can be embedded in the prompt.
    keywords = generate_image_keywords(text)
    time.sleep(DELAY_BETWEEN_REQUESTS)  # delay to avoid hitting the rate limit
    image_url = fetch_pixabay_image(keywords)
    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Ubah alur artikel, buat tata letak dan tampilan menarik,
- Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
- Sertakan gambar dari URL berikut (jika valid): {image_url}
Artikel asli:
{text}
Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT
        )
        # Accumulate the streamed chunks into one HTML string.
        html_content = ""
        for chunk in completion:
            content = chunk.choices[0].delta.content or ""
            html_content += content
        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except HTTPError as e:
        # NOTE(review): the Groq SDK raises its own exception types; whether a
        # requests HTTPError ever reaches this handler is unconfirmed.
        if e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
def extract_title_from_html(html):
    """Pull the article title (first <h2>) out of rewritten HTML.

    Returns the fallback string "Judul Tidak Ditemukan" when no <h2>
    exists or parsing fails.
    """
    fallback = "Judul Tidak Ditemukan"
    try:
        heading = BeautifulSoup(html, 'html.parser').find('h2')
        if heading is None:
            return fallback
        return heading.get_text(strip=True)
    except Exception as e:
        logging.error(f"Failed to extract title: {str(e)}")
        return fallback
def kirim_ke_sheet(judul, konten_html, link):
    """Write the rewritten title and HTML back to the sheet row matching *link*.

    Skips (with a warning) when either the title or the content is empty;
    failures are logged rather than raised.
    """
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return
    payload = {
        "method": "updateRowByLink",
        "link": link,
        "judul": judul,
        "konten": konten_html
    }
    try:
        def _post():
            reply = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
            reply.raise_for_status()
            return reply

        retry_request(_post)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as e:
        logging.error(f"Failed to send to sheet for {link}: {str(e)}")
# === MAIN ===
def main():
    """Main function to process articles.

    Fetches unprocessed links from the sheet, scrapes each article,
    rewrites it with AI, and writes the result back — capped at
    MAX_ARTICLES per run.  Sends a Telegram summary on completion or
    failure; rate-limit and unexpected errors are re-raised after
    notification.
    """
    logging.info("Starting scrape and rewrite process")
    processed_count = 0
    MAX_ARTICLES = 20  # safety cap per run
    try:
        rows = fetch_links()
        logging.info(f"Found {len(rows)} links to process")
        for idx, row in enumerate(rows, 1):
            if processed_count >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break
            link = row['link']
            logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
            # Scrape the source article text
            artikel = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue
            # Rewrite with AI (this also fetches a Pixabay illustration)
            rewrite_html = rewrite_with_ai(artikel)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue
            # Cool-down after the AI call to stay under rate limits
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)
            # Title comes from the first <h2> of the rewritten HTML
            judul = extract_title_from_html(rewrite_html)
            # Persist the result back to the sheet
            kirim_ke_sheet(judul, rewrite_html, link)
            processed_count += 1
            # Small delay between rows to avoid rate limits
            time.sleep(DELAY_BETWEEN_REQUESTS)
        # Success notification
        message = f"✅ *Scrape and Rewrite Completed*\nProcessed {processed_count} articles successfully."
        send_telegram_message(message)
    except RateLimitExceeded as e:
        message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
        logging.error(str(e))
        send_telegram_message(message)
        raise
    except Exception as e:
        message = f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {processed_count} articles before termination."
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(message)
        raise
    finally:
        logging.info("Process ended")
# === SCHEDULER ===
def run_scheduler():
    """Run main() once a day at 05:00 server time (UTC = 12:00 WIB).

    Fix: the original docstring and log line claimed runs at 00:00 AND
    12:00 WIB, but only a single job at "05:00" was ever registered —
    the message now matches what is actually scheduled.
    """
    schedule.every().day.at("05:00").do(main)  # 05:00 UTC == 12:00 WIB
    logging.info("Scheduler started, next run daily at 05:00 UTC (12:00 WIB)")
    while True:
        schedule.run_pending()
        time.sleep(60)
# === GRADIO INTERFACE ===
def gradio_interface():
    """Gradio callback: run the scrape-and-rewrite pipeline, then report."""
    main()
    return "Manual execution started. Check logs for details."
if __name__ == "__main__":
    # "python app.py manual" runs one pass immediately; otherwise the
    # scheduler runs in a background thread while Gradio serves the UI.
    if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
        logging.info("Running in manual mode")
        main()
    else:
        # Daemon thread so the process can exit when Gradio shuts down.
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()
        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB."
        )
        logging.info("Starting Gradio interface")
        iface.launch()