aronsaras committed on
Commit
6bf1d73
·
verified ·
1 Parent(s): f981f69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -48
app.py CHANGED
@@ -17,7 +17,7 @@ import gradio as gr
17
 
18
  # === CUSTOM EXCEPTION ===
19
  class RateLimitExceeded(Exception):
20
- """Exception raised when Groq API rate limit is exceeded."""
21
  pass
22
 
23
  # === KONFIGURASI ===
@@ -25,6 +25,7 @@ GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh
25
  GROQ_API_KEY = "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"
26
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
27
  TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
 
28
  GROQ_MODEL = "gemma2-9b-it"
29
  REQUEST_TIMEOUT = 10
30
  GROQ_TIMEOUT = 30
@@ -92,11 +93,78 @@ def retry_request(func, *args, **kwargs):
92
  except ReadTimeout as e:
93
  logging.error(f"Read timeout: {str(e)}")
94
  if attempt < MAX_RETRIES - 1:
95
- time.sleep(2) # Specific delay for timeout
96
  continue
97
  raise
98
  raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # === CORE FUNCTIONS ===
101
  def fetch_links(sheet_name="Sheet2"):
102
  """Fetch links from Google Sheet where judul is empty."""
@@ -138,23 +206,8 @@ def clean_html(soup):
138
 
139
  return soup
140
 
141
- def extract_main_image(soup):
142
- """Extract URL of the main image from the article."""
143
- try:
144
- img = (
145
- soup.select_one('.detail__media img') or
146
- soup.select_one('article img') or
147
- soup.select_one('img[alt*="main"]') or
148
- soup.select_one('img[data-testid*="main-image"]')
149
- )
150
- src = img['src'] if img and img.get('src') else ""
151
- return src if is_valid_url(src) else ""
152
- except Exception as e:
153
- logging.warning(f"Failed to extract main image: {str(e)}")
154
- return ""
155
-
156
  def scrape_detik(link):
157
- """Scrape article content and main image from Detik."""
158
  headers = {
159
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
160
  "Accept-Language": "id-ID,id;q=0.9",
@@ -178,45 +231,45 @@ def scrape_detik(link):
178
  )
179
  if not content:
180
  logging.warning(f"No content found at {link}")
181
- return None, None
182
-
183
- # Extract main image
184
- image_url = extract_main_image(soup)
185
 
186
  # Clean and get text
187
  cleaned_content = clean_html(content)
188
  text = cleaned_content.get_text(separator='\n', strip=True)
189
  if not text:
190
  logging.warning(f"Empty content after cleaning at {link}")
191
- return None, None
192
 
193
- return text, image_url
194
  except Exception as e:
195
  logging.error(f"Failed to scrape {link}: {str(e)}")
196
- return None, None
197
 
198
- def rewrite_with_ai(text, image_url):
199
- """Rewrite article using Groq AI model in streaming mode."""
 
 
 
 
 
200
  prompt = f"""
201
  Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
202
 
203
  - Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
204
  - Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
205
  - Ubah alur artikel, buat tata letak dan tampilan menarik,
206
- - Tambah kalima atau paragraf yang relevan dengan topik agar semakin artikel semakin unik,
207
  - Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
208
  - Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
209
- - Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar utama (jika ada),
210
  - Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
211
  - Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
212
- - Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada.
 
213
 
214
  Artikel asli:
215
  {text}
216
 
217
- URL gambar utama (jika ada):
218
- {image_url}
219
-
220
  Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
221
  """
222
  try:
@@ -256,7 +309,7 @@ def extract_title_from_html(html):
256
  """Extract title from rewritten HTML."""
257
  try:
258
  soup = BeautifulSoup(html, 'html.parser')
259
- title_tag = soup.find('h2') # Only h2 as per prompt
260
  title = title_tag.get_text(strip=True) if title_tag else "Judul Tidak Ditemukan"
261
  return title
262
  except Exception as e:
@@ -306,13 +359,13 @@ def main():
306
  logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
307
 
308
  # Scrape article
309
- artikel, image_url = scrape_detik(link)
310
  if not artikel:
311
  logging.warning(f"Skipping {link} due to empty content")
312
  continue
313
 
314
- # Rewrite with AI
315
- rewrite_html = rewrite_with_ai(artikel, image_url)
316
  if not rewrite_html:
317
  logging.warning(f"Skipping {link} due to rewrite failure")
318
  continue
@@ -336,7 +389,7 @@ def main():
336
  send_telegram_message(message)
337
 
338
  except RateLimitExceeded as e:
339
- message = f"❌ *Script Terminated*: Groq API rate limit exceeded.\nProcessed {processed_count} articles before termination."
340
  logging.error(str(e))
341
  send_telegram_message(message)
342
  raise
@@ -348,36 +401,30 @@ def main():
348
  finally:
349
  logging.info("Process ended")
350
 
351
-
352
- # === SCHEDULER ===
353
  def run_scheduler():
354
  """Run scheduler untuk menjalankan main() pada pukul 00:00 WIB dan 12:00 WIB."""
355
- # Asumsi server di UTC: 00:00 WIB = 17:00 UTC, 12:00 WIB = 05:00 UTC
356
- schedule.every().day.at("17:00").do(main) # 00:00 WIB
357
  schedule.every().day.at("05:00").do(main) # 12:00 WIB
358
  logging.info("Scheduler started, waiting untuk 00:00 WIB dan 12:00 WIB")
359
 
360
  while True:
361
  schedule.run_pending()
362
- time.sleep(60) # Cek setiap menit
363
 
364
  # === GRADIO INTERFACE ===
365
  def gradio_interface():
366
  """Gradio interface for manual execution and status."""
367
- main() # Run main() manually when button is clicked
368
  return "Manual execution started. Check logs for details."
369
 
370
  if __name__ == "__main__":
371
- # Check for manual execution via command line argument
372
  if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
373
  logging.info("Running in manual mode")
374
  main()
375
  else:
376
- # Start scheduler in a separate thread
377
  scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
378
  scheduler_thread.start()
379
-
380
- # Launch Gradio interface
381
  iface = gr.Interface(
382
  fn=gradio_interface,
383
  inputs=None,
 
17
 
18
  # === CUSTOM EXCEPTION ===
19
  class RateLimitExceeded(Exception):
20
+ """Exception raised when Groq API or Pixabay API rate limit is exceeded."""
21
  pass
22
 
23
  # === KONFIGURASI ===
 
25
  GROQ_API_KEY = "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"
26
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
27
  TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
28
+ PIXABAY_API_KEY = "51175753-096073a3b283350c4eca0022f"
29
  GROQ_MODEL = "gemma2-9b-it"
30
  REQUEST_TIMEOUT = 10
31
  GROQ_TIMEOUT = 30
 
93
  except ReadTimeout as e:
94
  logging.error(f"Read timeout: {str(e)}")
95
  if attempt < MAX_RETRIES - 1:
96
+ time.sleep(2)
97
  continue
98
  raise
99
  raise Exception(f"Max retries ({MAX_RETRIES}) exceeded")
100
 
101
+ def generate_image_keywords(text):
102
+ """Generate image search keywords using Groq AI."""
103
+ prompt = f"""
104
+ Kamu adalah asisten AI yang menganalisis artikel berbahasa Indonesia. Berdasarkan teks artikel berikut, hasilkan 3-5 kata kunci (dalam bahasa Indonesia) yang relevan untuk mencari gambar di Pixabay. Kata kunci harus:
105
+ - Relevan dengan topik utama artikel.
106
+ - Singkat dan spesifik (1-2 kata per frasa).
107
+ - Tidak mengandung nama merek atau orang.
108
+ - Cocok untuk pencarian gambar (misal, objek, tempat, atau konsep).
109
+
110
+ Teks artikel:
111
+ {text[:1000]} # Batasi ke 1000 karakter untuk efisiensi
112
+
113
+ Hasilkan kata kunci dalam format: keyword1,keyword2,keyword3
114
+ """
115
+ try:
116
+ completion = client.chat.completions.create(
117
+ model=GROQ_MODEL,
118
+ messages=[{"role": "user", "content": prompt}],
119
+ temperature=0.7,
120
+ max_completion_tokens=50,
121
+ timeout=GROQ_TIMEOUT
122
+ )
123
+ keywords = completion.choices[0].message.content.strip()
124
+ # Format ke URL-encoded
125
+ return keywords.replace(',', '+').replace(' ', '+')
126
+ except HTTPError as e:
127
+ if e.response.status_code == 429:
128
+ logging.error("Groq API rate limit exceeded for keyword generation")
129
+ raise RateLimitExceeded("Groq API rate limit exceeded")
130
+ logging.error(f"Failed to generate keywords: {str(e)}")
131
+ return "default+image" # Fallback keyword
132
+ except Exception as e:
133
+ logging.error(f"Failed to generate keywords: {str(e)}")
134
+ return "default+image"
135
+
136
+ def fetch_pixabay_image(keywords):
137
+ """Fetch image URL from Pixabay API."""
138
+ try:
139
+ url = f"https://pixabay.com/api/?key={PIXABAY_API_KEY}&q={keywords}&image_type=photo&per_page=3&orientation=horizontal&safesearch=true&min_width=1280"
140
+
141
+ def get_image():
142
+ resp = requests.get(url, timeout=REQUEST_TIMEOUT)
143
+ resp.raise_for_status()
144
+ return resp
145
+
146
+ response = retry_request(get_image)
147
+ data = response.json()
148
+
149
+ if data.get('hits') and len(data['hits']) > 0:
150
+ # Prioritaskan largeImageURL untuk HD, fallback ke webformatURL
151
+ image = data['hits'][0]
152
+ image_url = image.get('largeImageURL', image['webformatURL'])
153
+ logging.info(f"Fetched Pixabay image: {image_url}")
154
+ return image_url
155
+ else:
156
+ logging.warning(f"No images found for keywords: {keywords}")
157
+ return ""
158
+ except HTTPError as e:
159
+ if e.response.status_code == 429:
160
+ logging.error("Pixabay API rate limit exceeded")
161
+ raise RateLimitExceeded("Pixabay API rate limit exceeded")
162
+ logging.error(f"Failed to fetch Pixabay image: {str(e)}")
163
+ return ""
164
+ except Exception as e:
165
+ logging.error(f"Failed to fetch Pixabay image: {str(e)}")
166
+ return ""
167
+
168
  # === CORE FUNCTIONS ===
169
  def fetch_links(sheet_name="Sheet2"):
170
  """Fetch links from Google Sheet where judul is empty."""
 
206
 
207
  return soup
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def scrape_detik(link):
210
+ """Scrape article content from Detik."""
211
  headers = {
212
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
213
  "Accept-Language": "id-ID,id;q=0.9",
 
231
  )
232
  if not content:
233
  logging.warning(f"No content found at {link}")
234
+ return None
 
 
 
235
 
236
  # Clean and get text
237
  cleaned_content = clean_html(content)
238
  text = cleaned_content.get_text(separator='\n', strip=True)
239
  if not text:
240
  logging.warning(f"Empty content after cleaning at {link}")
241
+ return None
242
 
243
+ return text
244
  except Exception as e:
245
  logging.error(f"Failed to scrape {link}: {str(e)}")
246
+ return None
247
 
248
+ def rewrite_with_ai(text):
249
+ """Rewrite article using Groq AI model in streaming mode, include Pixabay image."""
250
+ # Generate keywords for Pixabay
251
+ keywords = generate_image_keywords(text)
252
+ time.sleep(DELAY_BETWEEN_REQUESTS) # Delay untuk menghindari rate limit
253
+ image_url = fetch_pixabay_image(keywords)
254
+
255
  prompt = f"""
256
  Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:
257
 
258
  - Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
259
  - Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
260
  - Ubah alur artikel, buat tata letak dan tampilan menarik,
261
+ - Tambah kalimat atau paragraf yang relevan dengan topik agar artikel semakin unik,
262
  - Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
263
  - Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
264
+ - Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar (jika ada),
265
  - Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
266
  - Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
267
+ - Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada,
268
+ - Sertakan gambar dari URL berikut (jika valid): {image_url}
269
 
270
  Artikel asli:
271
  {text}
272
 
 
 
 
273
  Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
274
  """
275
  try:
 
309
  """Extract title from rewritten HTML."""
310
  try:
311
  soup = BeautifulSoup(html, 'html.parser')
312
+ title_tag = soup.find('h2')
313
  title = title_tag.get_text(strip=True) if title_tag else "Judul Tidak Ditemukan"
314
  return title
315
  except Exception as e:
 
359
  logging.info(f"[{idx}/{len(rows)}] Processing: {link}")
360
 
361
  # Scrape article
362
+ artikel = scrape_detik(link)
363
  if not artikel:
364
  logging.warning(f"Skipping {link} due to empty content")
365
  continue
366
 
367
+ # Rewrite with AI and fetch Pixabay image
368
+ rewrite_html = rewrite_with_ai(artikel)
369
  if not rewrite_html:
370
  logging.warning(f"Skipping {link} due to rewrite failure")
371
  continue
 
389
  send_telegram_message(message)
390
 
391
  except RateLimitExceeded as e:
392
+ message = f"❌ *Script Terminated*: API rate limit exceeded.\nProcessed {processed_count} articles before termination."
393
  logging.error(str(e))
394
  send_telegram_message(message)
395
  raise
 
401
  finally:
402
  logging.info("Process ended")
403
 
404
+ # === SCHEDULER ===
 
405
  def run_scheduler():
406
  """Run scheduler untuk menjalankan main() pada pukul 00:00 WIB dan 12:00 WIB."""
407
+
 
408
  schedule.every().day.at("05:00").do(main) # 12:00 WIB
409
  logging.info("Scheduler started, waiting untuk 00:00 WIB dan 12:00 WIB")
410
 
411
  while True:
412
  schedule.run_pending()
413
+ time.sleep(60)
414
 
415
  # === GRADIO INTERFACE ===
416
  def gradio_interface():
417
  """Gradio interface for manual execution and status."""
418
+ main()
419
  return "Manual execution started. Check logs for details."
420
 
421
  if __name__ == "__main__":
 
422
  if len(sys.argv) > 1 and sys.argv[1].lower() == "manual":
423
  logging.info("Running in manual mode")
424
  main()
425
  else:
 
426
  scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
427
  scheduler_thread.start()
 
 
428
  iface = gr.Interface(
429
  fn=gradio_interface,
430
  inputs=None,