aronsaras committed on
Commit
ae4dc53
·
verified ·
1 Parent(s): b2b8c63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +308 -59
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
  import json
@@ -7,14 +6,23 @@ import logging
7
  import re
8
  from urllib.parse import urlparse
9
  from groq import Groq
10
- from requests.exceptions import HTTPError, ReadTimeout
11
  from http.client import RemoteDisconnected
12
  import os
13
  from datetime import datetime
 
 
 
 
 
 
 
 
 
14
 
15
  # === KONFIGURASI ===
16
  GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
17
- GROQ_API_KEY ="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"
18
  TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
19
  TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
20
  GROQ_MODEL = "gemma2-9b-it"
@@ -24,113 +32,354 @@ RETRY_BACKOFF_FACTOR = 2
24
  MAX_RETRIES = 3
25
  DELAY_BETWEEN_REQUESTS = 3
26
 
 
27
  log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
28
  logging.basicConfig(
29
  level=logging.INFO,
30
- format="%(asctime)s - %(levelname)s - %(message)s",
31
- handlers=[logging.FileHandler(log_file, encoding="utf-8"), logging.StreamHandler()]
 
 
 
32
  )
33
 
34
  client = Groq(api_key=GROQ_API_KEY)
35
 
36
- # --- FUNGSI PENDUKUNG ---
37
- class RateLimitExceeded(Exception):
38
- pass
39
-
40
  def send_telegram_message(message):
 
41
  try:
42
  url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
43
- payload = {"chat_id": TELEGRAM_CHAT_ID, "text": message, "parse_mode": "Markdown"}
44
- requests.post(url, json=payload, timeout=5)
45
- except: pass
 
 
 
 
 
 
 
46
 
47
  def is_valid_url(url):
 
48
  try:
49
  result = urlparse(url)
50
  return all([result.scheme in ['http', 'https'], result.netloc])
51
- except: return False
 
52
 
53
  def is_valid_html(html):
 
54
  return html and html.strip().startswith('<article') and html.strip().endswith('</article>')
55
 
56
  def retry_request(func, *args, **kwargs):
 
57
  for attempt in range(MAX_RETRIES):
58
  try:
59
  return func(*args, **kwargs)
60
- except (HTTPError, RemoteDisconnected, ReadTimeout):
61
- time.sleep(RETRY_BACKOFF_FACTOR ** attempt)
62
- raise Exception("Max retries exceeded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
 
64
  def fetch_links(sheet_name="Sheet2"):
 
65
  try:
66
  def get_links():
67
- resp = requests.get(GAS_URL, params={"sheetName": sheet_name}, timeout=REQUEST_TIMEOUT)
 
 
 
 
 
68
  return resp.json()
 
69
  data = retry_request(get_links)
70
- return [item for item in data if not item.get("judul") and is_valid_url(item.get("link"))]
71
- except: return []
 
 
 
 
72
 
73
  def clean_html(soup):
74
- for tag in soup(["script", "iframe"]): tag.decompose()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  return soup
76
 
77
  def extract_main_image(soup):
 
78
  try:
79
- img = soup.select_one('article img')
80
- return img['src'] if img and is_valid_url(img['src']) else ""
81
- except: return ""
 
 
 
 
 
 
 
 
82
 
83
  def scrape_detik(link):
 
 
 
 
 
84
  try:
85
- resp = retry_request(lambda: requests.get(link, timeout=REQUEST_TIMEOUT))
 
 
 
 
 
 
86
  soup = BeautifulSoup(resp.text, 'html.parser')
87
- content = soup.select_one('article')
 
 
 
 
 
 
 
 
 
 
 
 
88
  image_url = extract_main_image(soup)
89
- cleaned = clean_html(content)
90
- return cleaned.get_text("\n", strip=True), image_url
91
- except: return None, None
 
 
 
 
 
 
 
 
 
92
 
93
  def rewrite_with_ai(text, image_url):
94
- prompt = f"""Tulis ulang artikel berikut menjadi HTML <article>...\n\n{text}"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  try:
96
  completion = client.chat.completions.create(
97
  model=GROQ_MODEL,
98
  messages=[{"role": "user", "content": prompt}],
99
- stream=True
 
 
 
 
 
100
  )
101
- html = "".join(chunk.choices[0].delta.content or "" for chunk in completion)
102
- return html if is_valid_html(html) else None
103
- except: return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  def extract_title_from_html(html):
 
106
  try:
107
  soup = BeautifulSoup(html, 'html.parser')
108
- return soup.find('h2').get_text(strip=True)
109
- except: return "Tanpa Judul"
 
 
 
 
110
 
111
  def kirim_ke_sheet(judul, konten_html, link):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  try:
113
- payload = {"method": "updateRowByLink", "link": link, "judul": judul, "konten": konten_html}
114
- retry_request(lambda: requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT))
115
- except: pass
116
-
117
- # --- FUNGSI UTAMA YANG DIPANGGIL ---
118
- def jalankan_script():
119
- processed = 0
120
- rows = fetch_links()
121
- for row in rows[:3]: # BATAS 3 LINK
122
- link = row['link']
123
- artikel, img = scrape_detik(link)
124
- if not artikel: continue
125
- hasil = rewrite_with_ai(artikel, img)
126
- if not hasil: continue
127
- judul = extract_title_from_html(hasil)
128
- kirim_ke_sheet(judul, hasil, link)
129
- processed += 1
130
- time.sleep(DELAY_BETWEEN_REQUESTS)
131
- send_telegram_message(f"Selesai! {processed} artikel diproses.")
132
- return f"Sukses: {processed} artikel diproses"
133
-
134
- # --- GRADIO UNTUK HUGGING FACE ---
135
- iface = gr.Interface(fn=jalankan_script, inputs=[], outputs="text")
136
- app = gr.mount_gradio_app(app=None, blocks=iface, path="/run")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
  import json
 
6
  import re
7
  from urllib.parse import urlparse
8
  from groq import Groq
9
+ from requests.exceptions import HTTPError, RequestException, ReadTimeout
10
  from http.client import RemoteDisconnected
11
  import os
12
  from datetime import datetime
13
+ import schedule
14
+ import threading
15
+ import sys
16
+ import gradio as gr
17
+
18
# === CUSTOM EXCEPTION ===
class RateLimitExceeded(Exception):
    """Raised when the Groq API reports that its rate limit was exceeded."""
22
 
23
# === KONFIGURASI ===
# NOTE(review): real credentials appear below as hard-coded fallbacks. Once
# committed they are leaked — rotate them and set the environment variables
# instead of relying on the defaults.
GAS_URL = os.getenv("GAS_URL", "https://script.google.com/macros/s/AKfycbwstcoUh2CQmuoTgxapW9cUhzQFx6glp25DaCqrvBdwKrb77wqeMN0RzB8UMpiAQ2PtQA/exec")
# Fix: read the Groq key from the environment like every other secret here
# (it was previously a bare hard-coded literal); the fallback keeps behavior
# identical when the variable is unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "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")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "7166094967:AAHb5S2hN6L527y1-GoXPzBdU4RB8jnYelk")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "6929677613")
GROQ_MODEL = "gemma2-9b-it"
# (REQUEST_TIMEOUT, GROQ_TIMEOUT and RETRY_BACKOFF_FACTOR are defined in the
#  unchanged region not shown in this view.)
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 3
34
 
35
# Setup logging: each run gets its own timestamped log file plus stderr output.
log_file = f"scrape_rewrite_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

_log_handlers = [
    logging.FileHandler(log_file, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
    handlers=_log_handlers,
)

# Shared Groq client used by rewrite_with_ai().
client = Groq(api_key=GROQ_API_KEY)
47
 
48
# === HELPER FUNCTIONS ===
def send_telegram_message(message):
    """Best-effort delivery of *message* to the configured Telegram chat.

    Failures are logged and swallowed — notifications must never abort the
    scraping pipeline.
    """
    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    body = {
        "chat_id": TELEGRAM_CHAT_ID,
        "text": message,
        "parse_mode": "Markdown",
    }
    try:
        resp = requests.post(endpoint, json=body, timeout=5)
        resp.raise_for_status()
        logging.info("Telegram message sent successfully")
    except Exception as exc:
        logging.error(f"Failed to send Telegram message: {str(exc)}")
63
 
64
def is_valid_url(url):
    """Return True when *url* parses as an absolute http(s) URL."""
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)
71
 
72
def is_valid_html(html):
    """Truthy when *html* is a complete <article>...</article> document."""
    if not html:
        return html  # preserve the original falsy passthrough (None / "")
    stripped = html.strip()
    return stripped.startswith('<article') and stripped.endswith('</article>')
75
 
76
def retry_request(func, *args, **kwargs):
    """Call *func(*args, **kwargs)* with retries and exponential backoff.

    Retries on HTTP errors, dropped connections and read timeouts, up to
    MAX_RETRIES attempts.  HTTP 429 responses back off exponentially; other
    failures get one backoff delay per remaining attempt.

    Returns:
        Whatever *func* returns on the first successful attempt.
    Raises:
        HTTPError / RemoteDisconnected / ReadTimeout: re-raised on the final
            attempt for non-429 failures.
        Exception: when all attempts were consumed by 429 backoffs (chained
            to the last underlying error).
    """
    last_exc = None
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except (HTTPError, RemoteDisconnected) as e:
            last_exc = e
            # Fix: HTTPError.response can be None (error raised without a
            # response object); guard before touching status_code so we don't
            # mask the real failure with an AttributeError.
            is_rate_limit = (
                isinstance(e, HTTPError)
                and e.response is not None
                and e.response.status_code == 429
            )
            if is_rate_limit:
                sleep_time = RETRY_BACKOFF_FACTOR ** attempt
                logging.warning(f"Rate limit hit, retrying in {sleep_time}s...")
                time.sleep(sleep_time)
            else:
                logging.error(f"Request failed: {str(e)}")
                if attempt < MAX_RETRIES - 1:
                    # NOTE(review): `** attempt + 1` binds as
                    # (RETRY_BACKOFF_FACTOR ** attempt) + 1; kept as-is, but
                    # `** (attempt + 1)` may have been the intent — confirm.
                    time.sleep(RETRY_BACKOFF_FACTOR ** attempt + 1)
                    continue
                raise
        except ReadTimeout as e:
            last_exc = e
            logging.error(f"Read timeout: {str(e)}")
            if attempt < MAX_RETRIES - 1:
                time.sleep(2)  # fixed delay for timeouts
                continue
            raise
    # Only reachable when the last attempt hit the 429 path; keep the
    # original cause attached for debugging instead of dropping it.
    raise Exception(f"Max retries ({MAX_RETRIES}) exceeded") from last_exc
99
 
100
# === CORE FUNCTIONS ===
def fetch_links(sheet_name="Sheet2"):
    """Fetch rows from the Google Sheet that still lack a title (judul).

    Returns a list of row dicts with a valid, unprocessed link; an empty
    list on any failure.
    """
    def _download():
        # Single GET against the Apps Script endpoint; retried by retry_request.
        response = requests.get(
            GAS_URL,
            params={"sheetName": sheet_name},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    try:
        rows = retry_request(_download)
        pending = [
            row for row in rows
            if not row.get("judul") and is_valid_url(row.get("link"))
        ]
        logging.info(f"Fetched {len(pending)} links from sheet {sheet_name}")
        return pending
    except Exception as exc:
        logging.error(f"Failed to fetch links: {str(exc)}")
        return []
120
 
121
def clean_html(soup):
    """Strip ads, scripts, iframes and empty elements from *soup* in place.

    Returns the same (mutated) BeautifulSoup tag/soup object.
    """
    # Containers for ads/widgets/promos commonly found on Detik pages.
    ad_selectors = [
        'div[class*="ads"]', 'div[class*="advert"]', 'div[class*="banner"]',
        'div[id*="ads"]', 'div[id*="advert"]', 'div[id*="banner"]',
        'script', 'iframe', '[class*="sponsored"]', 'div.parallaxindetail',
        'div[class*="promo"]', 'div[class*="widget"]'
    ]
    for selector in ad_selectors:
        for element in soup.select(selector):
            element.decompose()

    # Drop elements that have no text and carry no media children.
    # Fix: the previous version additionally ran an Arabic-character regex on
    # `text`, but only after `not text` had already passed — the text was
    # always empty there, so the clause could never match. It has been
    # removed; behavior is unchanged. (Elements that actually contain Arabic
    # text have non-empty text and were never decomposed anyway.)
    for elem in soup.find_all():
        if not elem.get_text(strip=True) and not elem.find_all(['img', 'video']):
            elem.decompose()

    return soup
140
 
141
def extract_main_image(soup):
    """Return the URL of the article's main image, or "" when none is usable."""
    try:
        # First truthy match wins, in priority order. The `or` chain is kept
        # deliberately: it preserves bs4 Tag truthiness semantics exactly.
        candidate = (
            soup.select_one('.detail__media img')
            or soup.select_one('article img')
            or soup.select_one('img[alt*="main"]')
            or soup.select_one('img[data-testid*="main-image"]')
        )
        url = candidate['src'] if candidate and candidate.get('src') else ""
        return url if is_valid_url(url) else ""
    except Exception as exc:
        logging.warning(f"Failed to extract main image: {str(exc)}")
        return ""
155
 
156
def scrape_detik(link):
    """Download a Detik article and return ``(plain_text, image_url)``.

    Returns ``(None, None)`` when the page cannot be fetched, has no
    recognizable content container, or is empty after cleaning.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept-Language": "id-ID,id;q=0.9",
    }

    def _fetch():
        page = requests.get(link, headers=request_headers, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
        return page

    try:
        page = retry_request(_fetch)
        page.encoding = 'utf-8'  # force UTF-8 before reading .text

        soup = BeautifulSoup(page.text, 'html.parser')

        # First truthy selector wins; covers several Detik page layouts.
        body = (
            soup.select_one('.detail__body-text')
            or soup.select_one('article')
            or soup.select_one('.entry-content')
            or soup.select_one('.post-content')
        )
        if not body:
            logging.warning(f"No content found at {link}")
            return None, None

        image_url = extract_main_image(soup)

        text = clean_html(body).get_text(separator='\n', strip=True)
        if not text:
            logging.warning(f"Empty content after cleaning at {link}")
            return None, None

        return text, image_url
    except Exception as exc:
        logging.error(f"Failed to scrape {link}: {str(exc)}")
        return None, None
197
 
198
def rewrite_with_ai(text, image_url):
    """Rewrite *text* into an ``<article>`` HTML document via the Groq API.

    Streams the completion, validates the result with is_valid_html and
    returns the HTML string, or None on failure / invalid output.

    Raises:
        RateLimitExceeded: when the API answers with HTTP 429, so the caller
            can abort the whole run instead of hammering the API.
    """
    prompt = f"""
Kamu adalah jurnalis profesional di Indonesia. Tugasmu adalah menulis ulang artikel berikut agar:

- Terlihat ditulis manusia, tidak seperti AI (gunakan bahasa alami, tidak repetitif, tidak datar),
- Tidak dianggap plagiat: gunakan kalimat yang berbeda, namun makna dan informasi tetap utuh,
- Kamu boleh mengubah alur artikel, dan buat tata letak dan tampilan menarik,
- Artikel dapat terindeks Google dan memenuhi prinsip E-E-A-T (Experience, Expertise, Authoritativeness, Trust),
- Artikel disusun dalam format HTML lengkap dan valid, dimulai dari tag <article> dan diakhiri dengan </article>,
- Struktur konten SEO-friendly: gunakan <p> untuk paragraf, <h2> untuk subjudul, dan <img> untuk gambar utama (jika ada),
- Jangan gunakan <h1> dalam artikel, karena sudah dipakai di luar artikel,
- Jangan menambahkan fakta atau narasi baru, tapi boleh buat intro dan penutup yang relevan dan netral,
- Pertahankan teks Arab, kutipan hadis atau ayat, dan gaya islami jika ada.

Artikel asli:
{text}

URL gambar utama (jika ada):
{image_url}

Hasilkan hanya kode HTML mulai dari <article> hingga </article>.
"""
    try:
        completion = client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            max_completion_tokens=1691,
            top_p=1,
            stream=True,
            stop=None,
            timeout=GROQ_TIMEOUT
        )

        # Collect streaming response
        html_content = ""
        for chunk in completion:
            html_content += chunk.choices[0].delta.content or ""

        html_content = html_content.strip()
        if not is_valid_html(html_content):
            logging.warning("AI output is not valid HTML article")
            return None
        return html_content
    except HTTPError as e:
        # Guard: HTTPError.response may be None.
        if e.response is not None and e.response.status_code == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
    except Exception as e:
        # Fix: the Groq SDK raises its own exception types, not requests'
        # HTTPError, so 429s previously fell into this branch and were
        # silently swallowed — the rate-limit abort in main() never fired.
        # Detect the status code generically before giving up.
        status = getattr(e, "status_code", None)
        if status is None:
            status = getattr(getattr(e, "response", None), "status_code", None)
        if status == 429:
            logging.error("Groq API rate limit exceeded")
            raise RateLimitExceeded("Groq API rate limit exceeded")
        logging.error(f"Failed to rewrite article: {str(e)}")
        return None
253
 
254
def extract_title_from_html(html):
    """Pull the article title (first <h2>) out of the rewritten HTML."""
    fallback = "Judul Tidak Ditemukan"
    try:
        heading = BeautifulSoup(html, 'html.parser').find('h2')
        return heading.get_text(strip=True) if heading else fallback
    except Exception as exc:
        logging.error(f"Failed to extract title: {str(exc)}")
        return fallback
264
 
265
def kirim_ke_sheet(judul, konten_html, link):
    """Write the rewritten title/content back to the sheet row matching *link*.

    Silently skips (with a warning) when either field is empty; send errors
    are logged, never raised.
    """
    if not judul or not konten_html:
        logging.warning(f"Empty title or content for link {link}")
        return

    payload = {
        "method": "updateRowByLink",
        "link": link,
        "judul": judul,
        "konten": konten_html,
    }

    def _post():
        response = requests.post(GAS_URL, json=payload, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response

    try:
        retry_request(_post)
        logging.info(f"Successfully sent to sheet: {judul}")
    except Exception as exc:
        logging.error(f"Failed to send to sheet for {link}: {str(exc)}")
288
+
289
# === MAIN ===
def main():
    """Scrape pending links, rewrite them with AI and push results to the sheet.

    Sends a Telegram summary on completion; on RateLimitExceeded or any other
    unexpected error it notifies Telegram and re-raises.
    """
    logging.info("Starting scrape and rewrite process")
    done = 0
    MAX_ARTICLES = 40  # hard cap per run
    try:
        rows = fetch_links()
        total = len(rows)
        logging.info(f"Found {total} links to process")

        for idx, row in enumerate(rows, 1):
            if done >= MAX_ARTICLES:
                logging.info(f"Reached maximum article limit of {MAX_ARTICLES}")
                break

            link = row['link']
            logging.info(f"[{idx}/{total}] Processing: {link}")

            artikel, image_url = scrape_detik(link)
            if not artikel:
                logging.warning(f"Skipping {link} due to empty content")
                continue

            rewrite_html = rewrite_with_ai(artikel, image_url)
            if not rewrite_html:
                logging.warning(f"Skipping {link} due to rewrite failure")
                continue

            # Cool-down after each AI call to stay under the API quota.
            logging.info("Waiting for 60 seconds after AI rewrite...")
            time.sleep(60)

            judul = extract_title_from_html(rewrite_html)
            kirim_ke_sheet(judul, rewrite_html, link)
            done += 1

            # Delay between rows to avoid hammering the endpoints.
            time.sleep(DELAY_BETWEEN_REQUESTS)

        send_telegram_message(
            f"✅ *Scrape and Rewrite Completed*\nProcessed {done} articles successfully."
        )
    except RateLimitExceeded as e:
        logging.error(str(e))
        send_telegram_message(
            f"❌ *Script Terminated*: Groq API rate limit exceeded.\nProcessed {done} articles before termination."
        )
        raise
    except Exception as e:
        logging.error(f"Unexpected error: {str(e)}")
        send_telegram_message(
            f"❌ *Script Terminated*: Unexpected error: {str(e)}\nProcessed {done} articles before termination."
        )
        raise
    finally:
        logging.info("Process ended")
349
+
350
# === SCHEDULER ===
def run_scheduler():
    """Block forever, firing main() every day at 00:00 WIB."""
    schedule.every().day.at("00:00").do(main)
    logging.info("Scheduler started, waiting for 00:00 WIB")
    while True:
        schedule.run_pending()
        time.sleep(60)  # poll once a minute
359
+
360
# === GRADIO INTERFACE ===
def gradio_interface():
    """Gradio callback: run the pipeline synchronously and report back."""
    # NOTE(review): this blocks the UI thread until the whole run finishes.
    main()
    return "Manual execution started. Check logs for details."
365
+
366
if __name__ == "__main__":
    # `python app.py manual` runs one pass immediately, without the UI.
    manual_mode = len(sys.argv) > 1 and sys.argv[1].lower() == "manual"
    if manual_mode:
        logging.info("Running in manual mode")
        main()
    else:
        # Daily scheduler runs in the background; Gradio serves the UI.
        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()

        iface = gr.Interface(
            fn=gradio_interface,
            inputs=None,
            outputs="text",
            title="Article Scraper and Rewriter",
            description="Click 'Submit' to run the scraper manually or wait for the scheduled run at 00:00 WIB."
        )
        logging.info("Starting Gradio interface")
        iface.launch()