aronsaras committed on
Commit
ae4dc53
·
verified ·
1 Parent(s): b2b8c63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +308 -59
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
  import json
@@ -7,14 +6,23 @@ import logging
7
  import re
8
  from urllib.parse import urlparse
9
  from groq import Groq
10
- from requests.exceptions import HTTPError, ReadTimeout
11
  from http.client import RemoteDisconnected
12
  import os
13
  from datetime import datetime
 
 
 
 
 
 
 
 
 
14
 
15
  # === KONFIGURASI ===
16
  GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
17
- GROQ_API_KEY ="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"
18
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
19
  TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
20
  GROQ_MODEL = "gemma2-9b-it"
@@ -24,113 +32,354 @@ RETRY_BACKOFF_FACTOR = 2
24
  MAX_RETRIES = 3
25
  DELAY_BETWEEN_REQUESTS = 3
26
 
 
27
  log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
28
  logging.basicConfig(
29
  level=logging.INFO,
30
- format="%(asctime)s - %(levelname)s - %(message)s",
31
- handlers=[logging.FileHandler(log_file, encoding="utf-8"), logging.StreamHandler()]
 
 
 
32
  )
33
 
34
  client = Groq(api_key=GROQ_API_KEY)
35
 
36
- # --- FUNGSI PENDUKUNG ---
37
- class RateLimitExceeded(Exception):
38
- pass
39
-
40
  def send_telegram_message(message):
 
41
  try:
42
  url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
43
- payload = {"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "Markdown"}
44
- requests.post(url, json=payload, timeout=5)
45
- except: pass
 
 
 
 
 
 
 
46
 
47
  def is_valid_url(url):
 
48
  try:
49
  result = urlparse(url)
50
  return all([result.scheme in ['http', 'https'], result.netloc])
51
- except: return False
 
52
 
53
  def is_valid_html(html):
 
54
  return html and html.strip().startswith('<article') and html.strip().endswith('</article>')
55
 
56
  def retry_request(func, *args, **kwargs):
 
57
  for attempt in range(MAX_RETRIES):
58
  try:
59
  return func(*args, **kwargs)
60
- except (HTTPError, RemoteDisconnected, ReadTimeout):
61
- time.sleep(RETRY_BACKOFF_FACTOR ** attempt)
62
- raise Exception("Max retries exceeded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
 
64
  def fetch_links(sheet_name="Sheet2"):
 
65
  try:
66
  def get_links():
67
- resp = requests.get(GAS_URL, params={"sheetName": sheet_name}, timeout=REQUEST_TIMEOUT)
 
 
 
 
 
68
  return resp.json()
 
69
  data = retry_request(get_links)
70
- return [item for item in data if not item.get("judul") and is_valid_url(item.get("link"))]
71
- except: return []
 
 
 
 
72
 
73
  def clean_html(soup):
74
- for tag in soup(["script", "iframe"]): tag.decompose()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  return soup
76
 
77
  def extract_main_image(soup):
 
78
  try:
79
- img = soup.select_one('article img')
80
- return img['src'] if img and is_valid_url(img['src']) else ""
81
- except: return ""
 
 
 
 
 
 
 
 
82
 
83
  def scrape_detik(link):
 
 
 
 
 
84
  try:
85
- resp = retry_request(lambda: requests.get(link, timeout=REQUEST_TIMEOUT))
 
 
 
 
 
 
86
  soup = BeautifulSoup(resp.text, 'html.parser')
87
- content = soup.select_one('article')
 
 
 
 
 
 
 
 
 
 
 
 
88
  image_url = extract_main_image(soup)
89
- cleaned = clean_html(content)
90
- return cleaned.get_text("\n", strip=True), image_url
91
- except: return None, None
 
 
 
 
 
 
 
 
 
92
 
93
  def rewrite_with_ai(text, image_url):
94
- prompt = f"""Tulis ulang artikel berikut menjadi HTML <article>...\n\n{text}"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  try:
96
  completion = client.chat.completions.create(
97
  model=GROQ_MODEL,
98
  messages=[{"role": "user", "content": prompt}],
99
- stream=True
 
 
 
 
 
100
  )
101
- html = "".join(chunk.choices[0].delta.content or "" for chunk in completion)
102
- return html if is_valid_html(html) else None
103
- except: return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  def extract_title_from_html(html):
 
106
  try:
107
  soup = BeautifulSoup(html, 'html.parser')
108
- return soup.find('h2').get_text(strip=True)
109
- except: return "Tanpa Judul"
 
 
 
 
110
 
111
  def kirim_ke_sheet(judul, konten_html, link):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  try:
113
- payload = {"method": "updateRowByLink", "link": link, "judul": judul, "konten": konten_html}
114
- retry_request(lambda: requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT))
115
- except: pass
116
-
117
- # --- FUNGSI UTAMA YANG DIPANGGIL ---
118
- def jalankan_script():
119
- processed = 0
120
- rows = fetch_links()
121
- for row in rows[:3]: # BATAS 3 LINK
122
- link = row['link']
123
- artikel, img = scrape_detik(link)
124
- if not artikel: continue
125
- hasil = rewrite_with_ai(artikel, img)
126
- if not hasil: continue
127
- judul = extract_title_from_html(hasil)
128
- kirim_ke_sheet(judul, hasil, link)
129
- processed += 1
130
- time.sleep(DELAY_BETWEEN_REQUESTS)
131
- send_telegram_message(f"Selesai! {processed} artikel diproses.")
132
- return f"Sukses: {processed} artikel diproses"
133
-
134
- # --- GRADIO UNTUK HUGGING FACE ---
135
- iface = gr.Interface(fn=jalankan_script, inputs=[], outputs="text")
136
- app = gr.mount_gradio_app(app=None, blocks=iface, path="/run")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import json
 
6
  import re
7
  from urllib.parse import urlparse
8
  from groq import Groq
9
+ from requests.exceptions import HTTPError, RequestException, ReadTimeout
10
  from http.client import RemoteDisconnected
11
  import os
12
  from datetime import datetime
13
+ import schedule
14
+ import threading
15
+ import sys
16
+ import gradio as gr
17
+
18
# === CUSTOM EXCEPTION ===
class RateLimitExceeded(Exception):
    """Raised when the Groq API reports that its rate limit was exceeded."""
22
 
23
# === KONFIGURASI ===
# NOTE(review): real credentials appear below as hard-coded fallbacks. Once
# committed they are leaked — rotate them and set the environment variables
# instead of relying on the defaults.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
# Fix: read the Groq key from the environment like every other secret here
# (it was previously a bare hard-coded literal); the fallback keeps behavior
# identical when the variable is unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
GROQ_MODEL = "gemma2-9b-it"
# (REQUEST_TIMEOUT, GROQ_TIMEOUT and RETRY_BACKOFF_FACTOR are defined in the
#  unchanged region not shown in this view.)
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 3
34
 
35
# Setup logging: each run gets its own timestamped log file plus stderr output.
log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

_log_handlers = [
    logging.FileHandler(log_file, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=_log_handlers,
)

# Shared Groq client used by rewrite_with_ai().
client = Groq(api_key=GROQ_API_KEY)
47
 
48
# === HELPER FUNCTIONS ===
def send_telegram_message(message):
    """Best-effort delivery of *message* to the configured Telegram chat.

    Failures are logged and swallowed — notifications must never abort the
    scraping pipeline.
    """
    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    body = {
        "chat_id": TELEGRAM_CHAT_ID,
        "text": message,
        "parse_mode": "Markdown",
    }
    try:
        resp = requests.post(endpoint, json=body, timeout=5)
        resp.raise_for_status()
        logging.info("Telegram message sent successfully")
    except Exception as exc:
        logging.error(f"Failed to send Telegram message: {str(exc)}")
63
 
64
def is_valid_url(url):
    """Return True when *url* parses as an absolute http(s) URL."""
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)
71
 
72
def is_valid_html(html):
    """Truthy when *html* is a complete <article>...</article> document."""
    if not html:
        return html  # preserve the original falsy passthrough (None / "")
    stripped = html.strip()
    return stripped.startswith('<article') and stripped.endswith('</article>')
75
 
76
def retry_request(func, *args, **kwargs):
    """Call *func(*args, **kwargs)* with retries and exponential backoff.

    Retries on HTTP errors, dropped connections and read timeouts, up to
    MAX_RETRIES attempts.  HTTP 429 responses back off exponentially; other
    failures get one backoff delay per remaining attempt.

    Returns:
        Whatever *func* returns on the first successful attempt.
    Raises:
        HTTPError / RemoteDisconnected / ReadTimeout: re-raised on the final
            attempt for non-429 failures.
        Exception: when all attempts were consumed by 429 backoffs (chained
            to the last underlying error).
    """
    last_exc = None
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            last_exc = e
            # Fix: HTTPError.response can be None (error raised without a
            # response object); guard before touching status_code so we don't
            # mask the real failure with an AttributeError.
            is_rate_limit = (
                isinstance(e, HTTPError)
                and e.response is not None
                and e.response.status_code == 429
            )
            if is_rate_limit:
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    # NOTE(review): `** attempt + 1` binds as
                    # (RETRY_BACKOFF_FACTOR ** attempt) + 1; kept as-is, but
                    # `** (attempt + 1)` may have been the intent — confirm.
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise
        except ReadTimeout as e:
            last_exc = e
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)  # fixed delay for timeouts
                continue
            raise
    # Only reachable when the last attempt hit the 429 path; keep the
    # original cause attached for debugging instead of dropping it.
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded") from last_exc
99
 
100
# === CORE FUNCTIONS ===
def fetch_links(sheet_name="Sheet2"):
    """Fetch rows from the Google Sheet that still lack a title (judul).

    Returns a list of row dicts with a valid, unprocessed link; an empty
    list on any failure.
    """
    def _download():
        # Single GET against the Apps Script endpoint; retried by retry_request.
        response = requests.get(
            GAS_URL,
            params={"sheetName": sheet_name},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    try:
        rows = retry_request(_download)
        pending = [
            row for row in rows
            if not row.get("judul") and is_valid_url(row.get("link"))
        ]
        logging.info(f"Fetched {len(pending)} links from sheet {sheet_name}")
        return pending
    except Exception as exc:
        logging.error(f"Failed to fetch links: {str(exc)}")
        return []
120
 
121
def clean_html(soup):
    """Strip ads, scripts, iframes and empty elements from *soup* in place.

    Returns the same (mutated) BeautifulSoup tag/soup object.
    """
    # Containers for ads/widgets/promos commonly found on Detik pages.
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()

    # Drop elements that have no text and carry no media children.
    # Fix: the previous version additionally ran an Arabic-character regex on
    # `text`, but only after `not text` had already passed — the text was
    # always empty there, so the clause could never match. It has been
    # removed; behavior is unchanged. (Elements that actually contain Arabic
    # text have non-empty text and were never decomposed anyway.)
    for elem in soup.find_all():
        if not elem.get_text(strip=True) and not elem.find_all(['img', 'video']):
            elem.decompose()

    return soup
140
 
141
def extract_main_image(soup):
    """Return the URL of the article's main image, or "" when none is usable."""
    try:
        # First truthy match wins, in priority order. The `or` chain is kept
        # deliberately: it preserves bs4 Tag truthiness semantics exactly.
        candidate = (
            soup.select_one('.detail__media img')
            or soup.select_one('article img')
            or soup.select_one('img[alt*="main"]')
            or soup.select_one('img[data-testid*="main-image"]')
        )
        url = candidate['src'] if candidate and candidate.get('src') else ""
        return url if is_valid_url(url) else ""
    except Exception as exc:
        logging.warning(f"Failed to extract main image: {str(exc)}")
        return ""
155
 
156
def scrape_detik(link):
    """Download a Detik article and return ``(plain_text, image_url)``.

    Returns ``(None, None)`` when the page cannot be fetched, has no
    recognizable content container, or is empty after cleaning.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }

    def _fetch():
        page = requests.get(link, headers=request_headers, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
        return page

    try:
        page = retry_request(_fetch)
        page.encoding = 'utf-8'  # force UTF-8 before reading .text

        soup = BeautifulSoup(page.text, 'html.parser')

        # First truthy selector wins; covers several Detik page layouts.
        body = (
            soup.select_one('.detail__body-text')
            or soup.select_one('article')
            or soup.select_one('.entry-content')
            or soup.select_one('.post-content')
        )
        if not body:
            logging.warning(f"No content found at {link}")
            return None, None

        image_url = extract_main_image(soup)

        text = clean_html(body).get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None, None

        return text, image_url
    except Exception as exc:
        logging.error(f"Failed to scrape {link}: {str(exc)}")
        return None, None
197
 
198
def rewrite_with_ai(text, image_url):
    """Rewrite *text* into an ``<article>`` HTML document via the Groq API.

    Streams the completion, validates the result with is_valid_html and
    returns the HTML string, or None on failure / invalid output.

    Raises:
        RateLimitExceeded: when the API answers with HTTP 429, so the caller
            can abort the whole run instead of hammering the API.
    """
    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:

- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Kamu boleh mengubah alur artikel, dan buat tata letak dan tampilan menarik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar utama (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada.

Artikel asli:
{text}

URL gambar utama (jika ada):
{image_url}

Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT
        )

        # Collect streaming response
        html_content = ""
        for chunk in completion:
            html_content += chunk.choices[0].delta.content or ""

        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except HTTPError as e:
        # Guard: HTTPError.response may be None.
        if e.response is not None and e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
    except Exception as e:
        # Fix: the Groq SDK raises its own exception types, not requests'
        # HTTPError, so 429s previously fell into this branch and were
        # silently swallowed — the rate-limit abort in main() never fired.
        # Detect the status code generically before giving up.
        status = getattr(e, "status_code", None)
        if status is None:
            status = getattr(getattr(e, "response", None), "status_code", None)
        if status == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
253
 
254
def extract_title_from_html(html):
    """Pull the article title (first <h2>) out of the rewritten HTML."""
    fallback = "Judul Tidak Ditemukan"
    try:
        heading = BeautifulSoup(html, 'html.parser').find('h2')
        return heading.get_text(strip=True) if heading else fallback
    except Exception as exc:
        logging.error(f"Failed to extract title: {str(exc)}")
        return fallback
264
 
265
def kirim_ke_sheet(judul, konten_html, link):
    """Write the rewritten title/content back to the sheet row matching *link*.

    Silently skips (with a warning) when either field is empty; send errors
    are logged, never raised.
    """
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return

    payload = {
        "method": "updateRowByLink",
        "link": link,
        "judul": judul,
        "konten": konten_html,
    }

    def _post():
        response = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response

    try:
        retry_request(_post)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as exc:
        logging.error(f"Failed to send to sheet for {link}: {str(exc)}")
288
+
289
# === MAIN ===
def main():
    """Scrape pending links, rewrite them with AI and push results to the sheet.

    Sends a Telegram summary on completion; on RateLimitExceeded or any other
    unexpected error it notifies Telegram and re-raises.
    """
    logging.info("Starting scrape and rewrite process")
    done = 0
    MAX_ARTICLES = 40  # hard cap per run
    try:
        rows = fetch_links()
        total = len(rows)
        logging.info(f"Found {total} links to process")

        for idx, row in enumerate(rows, 1):
            if done >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break

            link = row['link']
            logging.info(f"[{idx}/{total}] Processing: {link}")

            artikel, image_url = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue

            rewrite_html = rewrite_with_ai(artikel, image_url)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue

            # Cool-down after each AI call to stay under the API quota.
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)

            judul = extract_title_from_html(rewrite_html)
            kirim_ke_sheet(judul, rewrite_html, link)
            done += 1

            # Delay between rows to avoid hammering the endpoints.
            time.sleep(DELAY_BETWEEN_REQUESTS)

        send_telegram_message(
            f"✅ *Scrape and Rewrite Completed*\nProcessed {done} articles successfully."
        )
    except RateLimitExceeded as e:
        logging.error(str(e))
        send_telegram_message(
            f"❌ *Script Terminated*: Groq API rate limit exceeded.\nProcessed {done} articles before termination."
        )
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(
            f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {done} articles before termination."
        )
        raise
    finally:
        logging.info("Process ended")
349
+
350
# === SCHEDULER ===
def run_scheduler():
    """Block forever, firing main() every day at 00:00 WIB."""
    schedule.every().day.at("00:00").do(main)
    logging.info("Scheduler started, waiting for 00:00 WIB")
    while True:
        schedule.run_pending()
        time.sleep(60)  # poll once a minute
359
+
360
# === GRADIO INTERFACE ===
def gradio_interface():
    """Gradio callback: run the pipeline synchronously and report back."""
    # NOTE(review): this blocks the UI thread until the whole run finishes.
    main()
    return "Manual execution started. Check logs for details."
365
+
366
if __name__ == "__main__":
    # `python app.py manual` runs one pass immediately, without the UI.
    manual_mode = len(sys.argv) > 1 and sys.argv[1].lower() == "manual"
    if manual_mode:
        logging.info("Running in manual mode")
        main()
    else:
        # Daily scheduler runs in the background; Gradio serves the UI.
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()

        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB."
        )
        logging.info("Starting Gradio interface")
        iface.launch()