aronsaras committed on
Commit
6bf1d73
·
verified ·
1 Parent(s): f981f69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -48
app.py CHANGED
@@ -17,7 +17,7 @@ import gradio as gr
17
 
18
  # === CUSTOM EXCEPTION ===
19
  class RateLimitExceeded(Exception):
20
- """Exception raised when Groq API rate limit is exceeded."""
21
  pass
22
 
23
  # === KONFIGURASI ===
@@ -25,6 +25,7 @@ GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh
25
  GROQ_API_KEY = "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"
26
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
27
  TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
 
28
  GROQ_MODEL = "gemma2-9b-it"
29
  REQUEST_TIMEOUT = 10
30
  GROQ_TIMEOUT = 30
@@ -92,11 +93,78 @@ def retry_request(func, *args, **kwargs):
92
  except ReadTimeout as e:
93
  logging.error(f"Read timeout: {str(e)}")
94
  if attempt < MAX_RETRIES - 1:
95
- time.sleep(2) # Specific delay for timeout
96
  continue
97
  raise
98
  raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # === CORE FUNCTIONS ===
101
  def fetch_links(sheet_name="Sheet2"):
102
  """Fetch links from Google Sheet where judul is empty."""
@@ -138,23 +206,8 @@ def clean_html(soup):
138
 
139
  return soup
140
 
141
- def extract_main_image(soup):
142
- """Extract URL of the main image from the article."""
143
- try:
144
- img = (
145
- soup.select_one('.detail__media img') or
146
- soup.select_one('article img') or
147
- soup.select_one('img[alt*="main"]') or
148
- soup.select_one('img[data-testid*="main-image"]')
149
- )
150
- src = img['src'] if img and img.get('src') else ""
151
- return src if is_valid_url(src) else ""
152
- except Exception as e:
153
- logging.warning(f"Failed to extract main image: {str(e)}")
154
- return ""
155
-
156
  def scrape_detik(link):
157
- """Scrape article content and main image from Detik."""
158
  headers = {
159
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
160
  "Accept-Language": "id-ID,id;q=0.9",
@@ -178,45 +231,45 @@ def scrape_detik(link):
178
  )
179
  if not content:
180
  logging.warning(f"No content found at {link}")
181
- return None, None
182
-
183
- # Extract main image
184
- image_url = extract_main_image(soup)
185
 
186
  # Clean and get text
187
  cleaned_content = clean_html(content)
188
  text = cleaned_content.get_text(separator='\n', strip=True)
189
  if not text:
190
  logging.warning(f"Empty content after cleaning at {link}")
191
- return None, None
192
 
193
- return text, image_url
194
  except Exception as e:
195
  logging.error(f"Failed to scrape {link}: {str(e)}")
196
- return None, None
197
 
198
- def rewrite_with_ai(text, image_url):
199
- """Rewrite article using Groq AI model in streaming mode."""
 
 
 
 
 
200
  prompt = f"""
201
  Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
202
 
203
  - Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
204
  - Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
205
  - Ubah alur artikel, buat tata letak dan tampilan menarik,
206
- - Tambah kalima atau paragraf yang relevan dengan topik agar semakin artikel semakin unik,
207
  - Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
208
  - Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
209
- - Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar utama (jika ada),
210
  - Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
211
  - Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
212
- - Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada.
 
213
 
214
  Artikel asli:
215
  {text}
216
 
217
- URL gambar utama (jika ada):
218
- {image_url}
219
-
220
  Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
221
  """
222
  try:
@@ -256,7 +309,7 @@ def extract_title_from_html(html):
256
  """Extract title from rewritten HTML."""
257
  try:
258
  soup = BeautifulSoup(html, 'html.parser')
259
- title_tag = soup.find('h2') # Only h2 as per prompt
260
  title = title_tag.get_text(strip=True) if title_tag else "Judul Tidak Ditemukan"
261
  return title
262
  except Exception as e:
@@ -306,13 +359,13 @@ def main():
306
  logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
307
 
308
  # Scrape article
309
- artikel, image_url = scrape_detik(link)
310
  if not artikel:
311
  logging.warning(f"Skipping {link} due to empty content")
312
  continue
313
 
314
- # Rewrite with AI
315
- rewrite_html = rewrite_with_ai(artikel, image_url)
316
  if not rewrite_html:
317
  logging.warning(f"Skipping {link} due to rewrite failure")
318
  continue
@@ -336,7 +389,7 @@ def main():
336
  send_telegram_message(message)
337
 
338
  except RateLimitExceeded as e:
339
- message = f"❌ *Script Terminated*: Groq API rate limit exceeded.\nProcessed {processed_count} articles before termination."
340
  logging.error(str(e))
341
  send_telegram_message(message)
342
  raise
@@ -348,36 +401,30 @@ def main():
348
  finally:
349
  logging.info("Process ended")
350
 
351
-
352
- # === SCHEDULER ===
353
  def run_scheduler():
354
  """Run scheduler untuk menjalankan main() pada pukul 00:00 WIB dan 12:00 WIB."""
355
- # Asumsi server di UTC: 00:00 WIB = 17:00 UTC, 12:00 WIB = 05:00 UTC
356
- schedule.every().day.at("17:00").do(main) # 00:00 WIB
357
  schedule.every().day.at("05:00").do(main) # 12:00 WIB
358
  logging.info("Scheduler started, waiting untuk 00:00 WIB dan 12:00 WIB")
359
 
360
  while True:
361
  schedule.run_pending()
362
- time.sleep(60) # Cek setiap menit
363
 
364
  # === GRADIO INTERFACE ===
365
  def gradio_interface():
366
  """Gradio interface for manual execution and status."""
367
- main() # Run main() manually when button is clicked
368
  return "Manual execution started. Check logs for details."
369
 
370
  if __name__ == "__main__":
371
- # Check for manual execution via command line argument
372
  if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
373
  logging.info("Running in manual mode")
374
  main()
375
  else:
376
- # Start scheduler in a separate thread
377
  scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
378
  scheduler_thread.start()
379
-
380
- # Launch Gradio interface
381
  iface = gr.Interface(
382
  fn=gradio_interface,
383
  inputs=None,
 
17
 
18
  # === CUSTOM EXCEPTION ===
19
  class RateLimitExceeded(Exception):
20
+ """Exception raised when Groq API or Pixabay API rate limit is exceeded."""
21
  pass
22
 
23
  # === KONFIGURASI ===
 
25
  GROQ_API_KEY = "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"
26
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
27
  TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
28
+ PIXABAY_API_KEY = "51175753-096073a3b283350c4eca0022f"
29
  GROQ_MODEL = "gemma2-9b-it"
30
  REQUEST_TIMEOUT = 10
31
  GROQ_TIMEOUT = 30
 
93
  except ReadTimeout as e:
94
  logging.error(f"Read timeout: {str(e)}")
95
  if attempt < MAX_RETRIES - 1:
96
+ time.sleep(2)
97
  continue
98
  raise
99
  raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
100
 
101
+ def generate_image_keywords(text):
102
+ """Generate image search keywords using Groq AI."""
103
+ prompt = f"""
104
+ Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
105
+ - Relevan dengan topik utama artikel.
106
+ - Singkat dan spesifik (1-2 kata per frasa).
107
+ - Tidak mengandung nama merek atau orang.
108
+ - Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).
109
+
110
+ Teks artikel:
111
+ {text[:1000]} # Batasi ke 1000 karakter untuk efisiensi
112
+
113
+ Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
114
+ """
115
+ try:
116
+ completion = client.chat.completions.create(
117
+ model=GROQ_MODEL,
118
+ messages=[{"role": "user", "content": prompt}],
119
+ temperature=0.7,
120
+ max_completion_tokens=50,
121
+ timeout=GROQ_TIMEOUT
122
+ )
123
+ keywords = completion.choices[0].message.content.strip()
124
+ # Format ke URL-encoded
125
+ return keywords.replace(',', '+').replace(' ', '+')
126
+ except HTTPError as e:
127
+ if e.response.status_code == 429:
128
+ logging.error("Groq API rate limit exceeded for keyword generation")
129
+ raise RateLimitExceeded("Groq API rate limit exceeded")
130
+ logging.error(f"Failed to generate keywords: {str(e)}")
131
+ return "default+image" # Fallback keyword
132
+ except Exception as e:
133
+ logging.error(f"Failed to generate keywords: {str(e)}")
134
+ return "default+image"
135
+
136
+ def fetch_pixabay_image(keywords):
137
+ """Fetch image URL from Pixabay API."""
138
+ try:
139
+ url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"
140
+
141
+ def get_image():
142
+ resp = requests.get(url, timeout=REQUEST_TIMEOUT)
143
+ resp.raise_for_status()
144
+ return resp
145
+
146
+ response = retry_request(get_image)
147
+ data = response.json()
148
+
149
+ if data.get('hits') and len(data['hits']) > 0:
150
+ # Prioritaskan largeImageURL untuk HD, fallback ke webformatURL
151
+ image = data['hits'][0]
152
+ image_url = image.get('largeImageURL', image['webformatURL'])
153
+ logging.info(f"Fetched Pixabay image: {image_url}")
154
+ return image_url
155
+ else:
156
+ logging.warning(f"No images found for keywords: {keywords}")
157
+ return ""
158
+ except HTTPError as e:
159
+ if e.response.status_code == 429:
160
+ logging.error("Pixabay API rate limit exceeded")
161
+ raise RateLimitExceeded("Pixabay API rate limit exceeded")
162
+ logging.error(f"Failed to fetch Pixabay image: {str(e)}")
163
+ return ""
164
+ except Exception as e:
165
+ logging.error(f"Failed to fetch Pixabay image: {str(e)}")
166
+ return ""
167
+
168
  # === CORE FUNCTIONS ===
169
  def fetch_links(sheet_name="Sheet2"):
170
  """Fetch links from Google Sheet where judul is empty."""
 
206
 
207
  return soup
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def scrape_detik(link):
210
+ """Scrape article content from Detik."""
211
  headers = {
212
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
213
  "Accept-Language": "id-ID,id;q=0.9",
 
231
  )
232
  if not content:
233
  logging.warning(f"No content found at {link}")
234
+ return None
 
 
 
235
 
236
  # Clean and get text
237
  cleaned_content = clean_html(content)
238
  text = cleaned_content.get_text(separator='\n', strip=True)
239
  if not text:
240
  logging.warning(f"Empty content after cleaning at {link}")
241
+ return None
242
 
243
+ return text
244
  except Exception as e:
245
  logging.error(f"Failed to scrape {link}: {str(e)}")
246
+ return None
247
 
248
+ def rewrite_with_ai(text):
249
+ """Rewrite article using Groq AI model in streaming mode, include Pixabay image."""
250
+ # Generate keywords for Pixabay
251
+ keywords = generate_image_keywords(text)
252
+ time.sleep(DELAY_BETWEEN_REQUESTS) # Delay untuk menghindari rate limit
253
+ image_url = fetch_pixabay_image(keywords)
254
+
255
  prompt = f"""
256
  Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
257
 
258
  - Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
259
  - Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
260
  - Ubah alur artikel, buat tata letak dan tampilan menarik,
261
+ - Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
262
  - Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
263
  - Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
264
+ - Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
265
  - Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
266
  - Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
267
+ - Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
268
+ - Sertakan gambar dari URL berikut (jika valid): {image_url}
269
 
270
  Artikel asli:
271
  {text}
272
 
 
 
 
273
  Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
274
  """
275
  try:
 
309
  """Extract title from rewritten HTML."""
310
  try:
311
  soup = BeautifulSoup(html, 'html.parser')
312
+ title_tag = soup.find('h2')
313
  title = title_tag.get_text(strip=True) if title_tag else "Judul Tidak Ditemukan"
314
  return title
315
  except Exception as e:
 
359
  logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
360
 
361
  # Scrape article
362
+ artikel = scrape_detik(link)
363
  if not artikel:
364
  logging.warning(f"Skipping {link} due to empty content")
365
  continue
366
 
367
+ # Rewrite with AI and fetch Pixabay image
368
+ rewrite_html = rewrite_with_ai(artikel)
369
  if not rewrite_html:
370
  logging.warning(f"Skipping {link} due to rewrite failure")
371
  continue
 
389
  send_telegram_message(message)
390
 
391
  except RateLimitExceeded as e:
392
+ message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
393
  logging.error(str(e))
394
  send_telegram_message(message)
395
  raise
 
401
  finally:
402
  logging.info("Process ended")
403
 
404
+ # === SCHEDULER ===
 
405
  def run_scheduler():
406
  """Run scheduler untuk menjalankan main() pada pukul 00:00 WIB dan 12:00 WIB."""
407
+
 
408
  schedule.every().day.at("05:00").do(main) # 12:00 WIB
409
  logging.info("Scheduler started, waiting untuk 00:00 WIB dan 12:00 WIB")
410
 
411
  while True:
412
  schedule.run_pending()
413
+ time.sleep(60)
414
 
415
  # === GRADIO INTERFACE ===
416
  def gradio_interface():
417
  """Gradio interface for manual execution and status."""
418
+ main()
419
  return "Manual execution started. Check logs for details."
420
 
421
  if __name__ == "__main__":
 
422
  if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
423
  logging.info("Running in manual mode")
424
  main()
425
  else:
 
426
  scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
427
  scheduler_thread.start()
 
 
428
  iface = gr.Interface(
429
  fn=gradio_interface,
430
  inputs=None,