import requests
from bs4 import BeautifulSoup
import time
import logging
import re
from urllib.parse import urlparse
from groq import Groq, RateLimitError
from requests.exceptions import HTTPError, ReadTimeout
from http.client import RemoteDisconnected
import os
from datetime import datetime
import schedule
import threading
import sys
import gradio as gr

# === CUSTOM EXCEPTION ===
class RateLimitExceeded(Exception):
    """Exception raised when Groq API or Pixabay API rate limit is exceeded."""
    pass

# === KONFIGURASI ===
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
PIXABAY_API_KEY = os.getenv("PIXABAY_API_KEY", "51175753-096073a3b283350c4eca0022f")
GROQ_MODEL = "gemma2-9b-it"
REQUEST_TIMEOUT = 10
GROQ_TIMEOUT = 30
RETRY_BACKOFF_FACTOR = 2
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 3
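
# Secrets are read from the environment when set; the inline defaults above
# act as fallbacks. Example (assumed) shell setup before launching:
#   export GROQ_API_KEY="gsk_..."       # from https://console.groq.com
#   export PIXABAY_API_KEY="..."        # from https://pixabay.com/api/docs/
#   export TELEGRAM_BOT_TOKEN="..."     # issued by Telegram's @BotFather
#   export TELEGRAM_CHAT_ID="..."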

# Setup logging with timestamp-based file
log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=[
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler()
    ]
)

client = Groq(api_key=GROQ_API_KEY)

# === HELPER FUNCTIONS ===
def send_telegram_message(message):
    """Send message to Telegram chat."""
    try:
        url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        payload = {
            "chat_id": TELEGRAM_CHAT_ID,
            "text": message,
            "parse_mode": "Markdown"
        }
        response = requests.post(url, json=payload, timeout=5)
        response.raise_for_status()
        logging.info("Telegram message sent successfully")
    except Exception as e:
        logging.error(f"Failed to send Telegram message: {str(e)}")

def is_valid_url(url):
    """Validate URL format."""
    try:
        result = urlparse(url)
        return all([result.scheme in ['http', 'https'], result.netloc])
    except Exception:
        return False

def is_valid_html(html):
    """Check if HTML starts with <article> and is not empty."""
    return html and html.strip().startswith('<article') and html.strip().endswith('</article>')
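
# e.g. is_valid_html("<article><p>Hi</p></article>") -> True; model output
# wrapped in Markdown fences or missing the closing tag is rejected.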

def retry_request(func, *args, **kwargs):
    """Retry HTTP requests with exponential backoff."""
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            if isinstance(e, HTTPError) and e.response is not None and e.response.status_code == 429:
                if attempt == MAX_RETRIES - 1:
                    raise  # surface the 429 so callers can raise RateLimitExceeded
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    # Exponential backoff plus one second of slack.
                    time.sleep((RETRY_BACKOFF_FACTOR ** attempt) + 1)
                    continue
                raise
        except ReadTimeout as e:
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)
                continue
            raise
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
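
# Usage sketch: retry_request accepts a callable plus its arguments,
# e.g. retry_request(requests.get, url, timeout=REQUEST_TIMEOUT), or a small
# zero-argument closure as the callers below do.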

def generate_image_keywords(text):
    """Generate image search keywords using Groq AI."""
    prompt = f"""
    Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
    - Relevan dengan topik utama artikel.
    - Singkat dan spesifik (1-2 kata per frasa).
    - Tidak mengandung nama merek atau orang.
    - Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).
    
    Teks artikel:
    {text[:1000]}  # Batasi ke 1000 karakter untuk efisiensi
    
    Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
    """
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_completion_tokens=50,
            timeout=GROQ_TIMEOUT
        )
        keywords = completion.choices[0].message.content.strip()
        # Collapse commas and whitespace into single '+' separators so the
        # result can be dropped straight into the Pixabay query string.
        return re.sub(r'[,\s]+', '+', keywords).strip('+')
    except RateLimitError:
        # The Groq SDK raises its own RateLimitError on HTTP 429, not
        # requests' HTTPError.
        logging.error("Groq API rate limit exceeded for keyword generation")
        raise RateLimitExceeded("Groq API rate limit exceeded")
    except Exception as e:
        logging.error(f"Failed to generate keywords: {str(e)}")
        return "default+image"

def fetch_pixabay_image(keywords):
    """Fetch image URL from Pixabay API."""
    try:
        url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"
        
        def get_image():
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp
        
        response = retry_request(get_image)
        data = response.json()
        
        if data.get('hits') and len(data['hits']) > 0:
            # Prefer largeImageURL for HD quality, falling back to webformatURL
            image = data['hits'][0]
            image_url = image.get('largeImageURL', image['webformatURL'])
            logging.info(f"Fetched Pixabay image: {image_url}")
            return image_url
        else:
            logging.warning(f"No images found for keywords: {keywords}")
            return ""
    except HTTPError as e:
        if e.response is not None and e.response.status_code == 429:
            logging.error("Pixabay API rate limit exceeded")
            raise RateLimitExceeded("Pixabay API rate limit exceeded")
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""
    except Exception as e:
        logging.error(f"Failed to fetch Pixabay image: {str(e)}")
        return ""

# === CORE FUNCTIONS ===
def fetch_links(sheet_name="Sheet2"):
    """Fetch links from Google Sheet where judul is empty."""
    try:
        def get_links():
            resp = requests.get(
                GAS_URL,
                params={"sheetName": sheet_name},
                timeout=REQUEST_TIMEOUT
            )
            resp.raise_for_status()
            return resp.json()
        
        data = retry_request(get_links)
        links = [item for item in data if not item.get("judul") and is_valid_url(item.get("link"))]
        logging.info(f"Fetched {len(links)} links from sheet {sheet_name}")
        return links
    except Exception as e:
        logging.error(f"Failed to fetch links: {str(e)}")
        return []

def clean_html(soup):
    """Remove ads, scripts, and empty elements from HTML."""
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()
    
    # Remove elements with no text and no embedded media. An element with no
    # text cannot contain Arabic script, so Arabic passages are kept intact.
    for elem in list(soup.find_all()):
        if not elem.get_text(strip=True) and not elem.find_all(['img', 'video']):
            elem.decompose()
    
    return soup

def scrape_detik(link):
    """Scrape article content from Detik."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }
    try:
        def get_article():
            resp = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp
        
        resp = retry_request(get_article)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')
        
        # Extract content
        content = (
            soup.select_one('.detail__body-text') or
            soup.select_one('article') or
            soup.select_one('.entry-content') or
            soup.select_one('.post-content')
        )
        if not content:
            logging.warning(f"No content found at {link}")
            return None
        
        # Clean and get text
        cleaned_content = clean_html(content)
        text = cleaned_content.get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None
        
        return text
    except Exception as e:
        logging.error(f"Failed to scrape {link}: {str(e)}")
        return None

def rewrite_with_ai(text):
    """Rewrite article using Groq AI model in streaming mode, include Pixabay image."""
    # Generate keywords for Pixabay
    keywords = generate_image_keywords(text)
    time.sleep(DELAY_BETWEEN_REQUESTS)  # pause to avoid hitting rate limits
    image_url = fetch_pixabay_image(keywords)
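
    # Indonesian prompt below: rewrite the article as natural, SEO-friendly
    # HTML wrapped in <article>...</article>, preserve all facts and any
    # Arabic/Islamic quotations, avoid <h1>, and embed the Pixabay image URL
    # when it is valid.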
    
    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:

- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Ubah alur artikel, buat tata letak dan tampilan menarik,
- Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
- Sertakan gambar dari URL berikut (jika valid): {image_url}

Artikel asli:
{text}

Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT
        )
        
        # Collect streaming response
        html_content = ""
        for chunk in completion:
            content = chunk.choices[0].delta.content or ""
            html_content += content
        
        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except RateLimitError:
        # The Groq SDK raises RateLimitError on HTTP 429.
        logging.error("Groq API rate limit exceeded")
        raise RateLimitExceeded("Groq API rate limit exceeded")
    except Exception as e:
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None

def extract_title_from_html(html):
    """Extract title from rewritten HTML."""
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('h2')
        title = title_tag.get_text(strip=True) if title_tag else "Judul Tidak Ditemukan"
        return title
    except Exception as e:
        logging.error(f"Failed to extract title: {str(e)}")
        return "Judul Tidak Ditemukan"

def kirim_ke_sheet(judul, konten_html, link):
    """Send rewritten title and content to Google Sheet."""
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return
    
    try:
        payload = {
            "method": "updateRowByLink",
            "link": link,
            "judul": judul,
            "konten": konten_html
        }
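        # "updateRowByLink" is the method name the GAS deployment is expected
        # to implement: locate the row matching `link` and fill judul/konten.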
        
        def send_data():
            resp = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp
        
        retry_request(send_data)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as e:
        logging.error(f"Failed to send to sheet for {link}: {str(e)}")

# === MAIN ===
def main():
    """Main function to process articles."""
    logging.info("Starting scrape and rewrite process")
    processed_count = 0
    MAX_ARTICLES = 20
    try:
        rows = fetch_links()
        logging.info(f"Found {len(rows)} links to process")
        
        for idx, row in enumerate(rows, 1):
            if processed_count >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break
                
            link = row['link']
            logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
            
            # Scrape article
            artikel = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue
            
            # Rewrite with AI and fetch Pixabay image
            rewrite_html = rewrite_with_ai(artikel)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue
            
            # Add 1-minute delay after AI rewrite
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)
            
            # Extract title
            judul = extract_title_from_html(rewrite_html)
            
            # Send to sheet
            kirim_ke_sheet(judul, rewrite_html, link)
            processed_count += 1
            
            # Delay to avoid rate limits
            time.sleep(DELAY_BETWEEN_REQUESTS)
        
        # Send success message
        message = f"✅ *Scrape and Rewrite Completed*\nProcessed {processed_count} articles successfully."
        send_telegram_message(message)
    
    except RateLimitExceeded as e:
        message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
        logging.error(str(e))
        send_telegram_message(message)
        raise
    except Exception as e:
        message = f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {processed_count} articles before termination."
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(message)
        raise
    finally:
        logging.info("Process ended")

# === SCHEDULER ===
def run_scheduler():
    """Run scheduler untuk menjalankan main() pada pukul 00:00 WIB dan 12:00 WIB."""
   
    schedule.every().day.at("05:00").do(main)  # 12:00 WIB
    logging.info("Scheduler started, waiting untuk 00:00 WIB dan 12:00 WIB")
    
    while True:
        schedule.run_pending()
        time.sleep(60)

# === GRADIO INTERFACE ===
def gradio_interface():
    """Gradio interface for manual execution and status."""
    main()
    return "Manual execution started. Check logs for details."

if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
        logging.info("Running in manual mode")
        main()
    else:
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()
        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB."
        )
        logging.info("Starting Gradio interface")
        iface.launch()
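
# Typical invocations (script filename assumed):
#   python scrape_rewrite.py manual   # run one pass immediately
#   python scrape_rewrite.py          # start the scheduler thread and Gradio UI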