| |
| """Web Scrapping.ipynb |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1OLoBK18jpB685Ivi8Zi3SzuVYiXJ9jRa |
| """ |
|
|
| !pip install selenium |
| !pip install webdriver-manager |
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import pandas as pd |
| import time |
| from datetime import datetime |
|
|
def scrape_detik_search(keyword, max_pages=1):
    """Scrape detik.com search results for ``keyword`` together with article bodies.

    Requests up to ``max_pages`` result pages (``sortby=time``), opens every
    listed article and collects its title, listing timestamp, navigation
    labels and body text.

    Args:
        keyword: Search phrase sent as the ``query`` parameter.
        max_pages: Maximum number of result pages to visit.

    Returns:
        A ``pandas.DataFrame`` with the columns ``judul``, ``tanggal``,
        ``tag``, ``isi_berita`` and ``link``. Rows collected before a failure
        are still returned; the frame is empty when nothing was scraped.
    """
    base_search_url = "https://www.detik.com/search/searchall"
    # Without a timeout, requests waits forever on a stalled connection.
    request_timeout = 15
    results = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }

    for page in range(1, max_pages + 1):
        params = {
            'query': keyword,
            'siteid': '2',
            'sortby': 'time',
            'page': page
        }
        print(f"Scraping page {page}...")
        try:
            r = requests.get(base_search_url, params=params, headers=headers,
                             timeout=request_timeout)
        except requests.RequestException as e:
            # Stop paging but keep what has been collected so far.
            print(f"Gagal akses halaman ({e}), hentikan scraping.")
            break
        if r.status_code != 200:
            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        news_list = soup.find_all('div', class_='media')

        if not news_list:
            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
            break

        for news in news_list:
            # Per-item boundary: one malformed entry must not abort the run.
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The listing carries the publish time as an epoch value in
                # the span's ``d-time`` attribute.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                span_tag = date_tag.find('span') if date_tag else None
                if span_tag and span_tag.has_attr('d-time'):
                    news_date = datetime.fromtimestamp(int(span_tag['d-time']))

                news_resp = requests.get(link, headers=headers, timeout=request_timeout)
                if news_resp.status_code != 200:
                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
                    continue

                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = news_soup.find('div', class_='detail__body-text') or \
                    news_soup.find('div', class_='detail_text')

                content_parts = []
                if content_div:
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are kept, prefixed with their level (e.g. "H2: ...").
                        if tag.name.startswith('h'):
                            content_parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            content_parts.append(text)
                content = '\n'.join(content_parts)

                # Labels are read from the ``a.nav__item`` links inside ``div.nav``.
                nav_div = news_soup.find('div', class_='nav')
                tags = []
                if nav_div:
                    tags = [a.text.strip() for a in nav_div.find_all('a', class_='nav__item')]

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })

                print(f"Berhasil scrape berita: {title}")

                time.sleep(1)

            except Exception as e:
                print(f"Error saat memproses berita: {e}")
                continue

        time.sleep(2)

    return pd.DataFrame(results)
|
|
if __name__ == "__main__":
    keyword = "Kabupaten Cirebon"
    # One variable for the output path so the log message always names the
    # file that was actually written.
    output_filename = "detik_berita_cirebonnn.csv"
    df = scrape_detik_search(keyword)
    if not df.empty:
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"Selesai menyimpan data berita ke {output_filename}")
    else:
        print("Tidak ada data yang berhasil di-scrape.")
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import pandas as pd |
| import time |
| from urllib.parse import quote_plus |
|
|
# Site root and search endpoint of radarcirebon.disway.id.
BASE_HOST = "https://radarcirebon.disway.id"
BASE_SEARCH = f"{BASE_HOST}/search/kata/"


def make_search_url(keyword, page, per_page=30):
    """Build the search URL for a 1-based result page.

    Page 1 uses the bare search endpoint. Later pages insert the result
    offset ``(page - 1) * per_page`` twice into the path.
    """
    query_suffix = f"?c={quote_plus(keyword)}&num="
    if page == 1:
        return BASE_SEARCH + query_suffix
    skip = (page - 1) * per_page
    return f"{BASE_SEARCH}{skip}/{skip}/{query_suffix}"
|
|
def absolute_url(href):
    """Return ``href`` as an absolute URL, or ``None`` when it is empty.

    Links that already start with ``http://`` or ``https://`` are returned
    stripped; anything else is appended to ``BASE_HOST`` with exactly one
    slash in between.
    """
    if not href:
        return None
    cleaned = href.strip()
    if cleaned.startswith(("http://", "https://")):
        return cleaned
    separator = "" if cleaned.startswith("/") else "/"
    return f"{BASE_HOST}{separator}{cleaned}"
|
|
def scrape_radar_cirebon(keyword, max_pages=100, per_page=30, delay_between_items=1.0, delay_between_pages=2.0):
    """Scrape radarcirebon.disway.id search results and the articles they link to.

    Args:
        keyword: Search phrase.
        max_pages: Maximum number of result pages to request.
        per_page: Page size used to compute the result offset in the URL.
        delay_between_items: Seconds to sleep after each scraped article.
        delay_between_pages: Seconds to sleep after each processed result page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``judul``, ``tanggal``,
        ``tag``, ``isi_berita`` and ``link``. ``tanggal`` is the raw date text
        found on the page, or ``None``; ``tag`` is ``"-"`` when no tag link
        was found.
    """
    # Imported here because this scraper's own import block does not provide
    # ``re``; without it the regex date fallback raised NameError, which the
    # per-item handler swallowed, silently dropping the article.
    import re

    # Compiled once instead of once per article.
    date_pattern = re.compile(r'\w+,\s*\d{1,2}\s+\w+\s+\d{4}|\w+\s+\d{2}-\d{2}-\d{4}')
    content_classes = ('entry-content', 'post-content', 'article-body', 'detail__body-text',
                       'detail_text', 'content', 'article__content')

    sess = requests.Session()
    sess.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    })

    results = []
    seen_links = set()

    for page in range(1, max_pages + 1):
        url = make_search_url(keyword, page, per_page)
        print(f"\nScraping page {page} -> {url}")
        try:
            r = sess.get(url, timeout=15)
        except Exception as e:
            print(f" ERROR: Gagal request halaman search: {e}")
            break

        if r.status_code != 200:
            print(f" ERROR: status code {r.status_code}, hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, "html.parser")

        # Try progressively more generic selectors until one yields items.
        news_list = (
            soup.find_all(class_='media-heading')
            or soup.find_all('div', class_='media')
            or soup.find_all('article')
            or soup.select('ul.search-results li')
            or soup.select('div.search-result')
            or []
        )

        if not news_list:
            print(" Tidak ada berita ditemukan di halaman ini.")
            continue

        print(f" Ketemu {len(news_list)} item.")

        for item in news_list:
            # Per-item boundary: a broken entry is logged and skipped.
            try:
                a = item.find('a', href=True) or item.select_one('a[href]')
                if not a:
                    continue

                link = absolute_url(a.get('href'))
                if not link or link in seen_links:
                    continue
                seen_links.add(link)

                title = a.get_text(strip=True)

                try:
                    detail_r = sess.get(link, timeout=15)
                except Exception as e:
                    print(f" ERROR request detail {link}: {e}")
                    continue
                if detail_r.status_code != 200:
                    print(f" ERROR status {detail_r.status_code} for {link}")
                    continue

                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                # Prefer the article's own heading over the listing anchor text.
                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else title

                # Date lookup, attempt 1: any element with class "date".
                date_text = None
                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
                if date_detail_tag:
                    print("Ditemukan dengan Target Langsung")
                    date_text = date_detail_tag.get_text(strip=True)

                # Attempt 2: a date span inside the post-info container.
                if not date_text:
                    post_info_div = detail_soup.find('div', class_='post-info')
                    if post_info_div:
                        tag_tanggal = post_info_div.find('span', class_='date')
                        if tag_tanggal:
                            print("Ditemukan dengan Target Kontainer")
                            date_text = tag_tanggal.get_text(strip=True)

                # Attempt 3: the first text node that looks like a date.
                if not date_text:
                    found_text = detail_soup.find(string=date_pattern)
                    if found_text:
                        print("Ditemukan dengan Target Pola Teks (Regex)")
                        date_text = found_text.strip()

                # The first matching container class wins, then <article>,
                # then the whole document.
                content_container = None
                for cls in content_classes:
                    content_container = detail_soup.find('div', class_=cls)
                    if content_container:
                        break
                if not content_container:
                    content_container = detail_soup.find('article')

                search_scope = content_container if content_container else detail_soup
                content_parts = []
                for p in search_scope.find_all('p'):
                    text = p.get_text(strip=True)
                    if text and 'Baca Juga:' not in text:
                        content_parts.append(text)
                content = "\n".join(content_parts)

                # Tags come from the ``title`` attribute of links whose href
                # contains "/listtag/". A failure here must not discard the
                # article, hence the separate handler.
                tags = []
                try:
                    tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
                    for a_tag in tag_links:
                        tag_text = a_tag.get('title', '').strip()
                        if tag_text:
                            tags.append(tag_text)
                except Exception as e:
                    print(f" Terjadi error saat mencari tag: {e}")

                final_tags = ", ".join(tags) if tags else "-"

                results.append({
                    "judul": title_detail,
                    "tanggal": date_text,
                    "tag": final_tags,
                    "isi_berita": content,
                    "link": link
                })

                print(f" Berhasil: {title_detail} | Tags: {final_tags}")

                time.sleep(delay_between_items)

            except Exception as e:
                print(f" Error saat memproses item: {e}")
                continue

        time.sleep(delay_between_pages)

    df = pd.DataFrame(results)
    return df
|
|
if __name__ == "__main__":
    keyword = "kabupaten cirebon"
    # One variable for the output path so the log message always names the
    # file that was actually written.
    output_path = "/content/drive/MyDrive/Machine Learning/Sentiment Analysis/radarcirebondisway_berita.csv"
    df = scrape_radar_cirebon(keyword, max_pages=100)
    if not df.empty:
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan data berita ke {output_path}")
    else:
        print("\nTidak ada data yang berhasil di-scrape.")
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import pandas as pd |
| import time |
| import re |
| import random |
| from urllib.parse import quote_plus, urlparse, urlunparse |
|
|
# Site root and search endpoint of antaranews.com.
BASE_HOST = "https://www.antaranews.com"
BASE_SEARCH = f"{BASE_HOST}/search"


def make_search_url(keyword, page):
    """Build the search URL for a 1-based result page.

    The ``page`` query parameter is appended only from page 2 onwards.
    """
    first_page_url = f"{BASE_SEARCH}?q={quote_plus(keyword)}"
    return first_page_url if page == 1 else f"{first_page_url}&page={page}"
|
|
def absolute_url(href):
    """Resolve ``href`` against ``BASE_HOST``; return ``None`` for empty input.

    Already-absolute http(s) links are returned stripped and otherwise
    unchanged.
    """
    if not href:
        return None
    target = href.strip()
    has_scheme = target.startswith("http://") or target.startswith("https://")
    if has_scheme:
        return target
    if not target.startswith("/"):
        target = "/" + target
    return BASE_HOST + target
|
|
def normalize_url(href):
    """Return a canonical form of ``href`` for de-duplication.

    The link is made absolute, its query string and fragment are dropped,
    and trailing slashes are removed. Empty input yields ``None``.
    """
    if not href:
        return None
    parts = urlparse(absolute_url(href))
    without_extras = urlunparse(
        (parts.scheme, parts.netloc, parts.path, parts.params, "", "")
    )
    return without_extras.rstrip("/")
|
|
def get_with_retry(sess, url, max_retries=3, delay_range=(2, 5)):
    """GET ``url`` through ``sess``, retrying on any failure.

    A non-2xx/3xx status counts as a failure (``raise_for_status``). Between
    attempts the function sleeps for a random duration drawn from
    ``delay_range``; there is no sleep after the last attempt.

    Returns:
        The successful response, or ``None`` once all attempts have failed.
    """
    attempt_no = 0
    while attempt_no < max_retries:
        attempt_no += 1
        try:
            response = sess.get(url, timeout=15)
            response.raise_for_status()
        except Exception as err:
            print(f" Percobaan {attempt_no} gagal: {err}")
            if attempt_no < max_retries:
                time.sleep(random.uniform(*delay_range))
        else:
            return response
    return None
|
|
def scrape_antaranews(keyword, max_pages=5, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
    """Scrape antaranews.com search results and the articles they link to.

    Args:
        keyword: Search phrase.
        max_pages: Number of result pages to request.
        delay_between_items: ``(min, max)`` seconds to sleep after each article.
        delay_between_pages: ``(min, max)`` seconds to sleep after each page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``judul``, ``tanggal``,
        ``tag``, ``isi_berita`` and ``link``.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    })

    rows = []
    visited = set()

    for page in range(1, max_pages + 1):
        url = make_search_url(keyword, page)
        print(f"\nScraping page {page} -> {url}")

        listing = get_with_retry(session, url)
        if not listing:
            print(" ERROR: Gagal request halaman search setelah retry.")
            continue

        soup = BeautifulSoup(listing.text, "html.parser")

        # Every anchor pointing at "/berita/" is treated as an article link;
        # links are normalised so repeats across pages collapse.
        page_links = set()
        for anchor in soup.select('a[href*="/berita/"]'):
            normalized = normalize_url(anchor.get('href'))
            if normalized:
                page_links.add(normalized)
        fresh_links = page_links - visited
        print(f" Ketemu {len(page_links)} link artikel di halaman ini, {len(fresh_links)} link baru.")

        visited |= page_links

        for link in sorted(fresh_links):
            detail = get_with_retry(session, link)
            if not detail:
                print(f" ERROR: Gagal request detail {link}")
                continue

            article = BeautifulSoup(detail.text, "html.parser")

            # Title: dedicated wrapper first, then any <h1>.
            heading = article.select_one('div.wrap__article-detail-title h1') or article.find('h1')
            title_detail = heading.get_text(strip=True) if heading else ""

            # Date: text next to the calendar icon, else a regex over the page text.
            date_detail = ""
            calendar = article.select_one('i.fa-calendar') or article.select_one('i.fas.fa-calendar')
            if calendar:
                holder = calendar.find_parent('li') or calendar.find_parent()
                if holder:
                    date_detail = holder.get_text(" ", strip=True)
            if not date_detail:
                hit = re.search(
                    r'\b(?:[A-Za-z]+,\s*\d{1,2}\s+[A-Za-z]+ \d{4}\s*\d{1,2}:\d{2}\s*WIB|\d+\s+jam lalu|\bWIB\b)',
                    article.get_text(" ", strip=True),
                )
                if hit:
                    date_detail = hit.group(0)

            # Body: paragraphs of the article container, minus "baca juga" teasers.
            body = (
                article.find('div', class_='wrap__article-detail-content')
                or article.find('div', class_='detail__body-text')
                or article.find('article')
            )
            scope = body or article
            paragraph_texts = (p.get_text(strip=True) for p in scope.find_all('p'))
            content = "\n".join(
                text for text in paragraph_texts
                if text and not text.lower().startswith("baca juga")
            )

            # Tags: the first inline list carrying a tags icon that yields
            # any tag links; otherwise every "/tag/" link on the page.
            tags = []
            for ul in article.find_all('ul', class_='list-inline'):
                if ul.find('i', class_='fa-tags') or ul.find('i', class_='fas fa-tags'):
                    for a in ul.find_all('a', href=True):
                        if '/tag/' not in a['href']:
                            continue
                        label = a.get('title') or a.get_text(strip=True)
                        if label:
                            tags.append(label)
                    if tags:
                        break
            if not tags:
                for a in article.select('a[href*="/tag/"]'):
                    label = a.get('title') or a.get_text(strip=True)
                    if label:
                        tags.append(label)
            # De-duplicate while keeping first-seen order.
            tags = list(dict.fromkeys(tags))
            tags_label = ", ".join(tags) if tags else "-"

            rows.append({
                "judul": title_detail,
                "tanggal": date_detail,
                "tag": tags_label,
                "isi_berita": content,
                "link": link
            })

            print(f" Berhasil: {title_detail} | Tanggal: {date_detail or '-'} | Tags: {tags_label}")

            time.sleep(random.uniform(*delay_between_items))

        time.sleep(random.uniform(*delay_between_pages))

    return pd.DataFrame(rows)
|
|
if __name__ == "__main__":
    # Script entry point: scrape up to 100 result pages and store them as CSV.
    keyword = "kabupaten cirebon"
    df = scrape_antaranews(keyword, max_pages=100)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        df.to_csv("antaranews_berita.csv", index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke antaranews_berita.csv")
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import pandas as pd |
| import time |
| import random |
| from urllib.parse import quote, urlparse, urlunparse |
| import re |
|
|
| from selenium import webdriver |
| from selenium.webdriver.chrome.service import Service |
| from selenium.webdriver.common.by import By |
| from selenium.webdriver.support.ui import WebDriverWait |
| from selenium.webdriver.support import expected_conditions as EC |
| from webdriver_manager.chrome import ChromeDriverManager |
| from selenium.common.exceptions import TimeoutException |
|
|
# Site root of cnnindonesia.com.
BASE_HOST = "https://www.cnnindonesia.com"


def make_search_url(keyword, page):
    """Build the "latest" search URL for a 1-based result page.

    The ``page`` query parameter is appended only from page 2 onwards.
    """
    url = f"{BASE_HOST}/search?query={quote(keyword)}&result_type=latest"
    if page != 1:
        url = f"{url}&page={page}"
    return url
|
|
| |
def absolute_url(href):
    """Return ``href`` as an absolute URL on ``BASE_HOST``, or ``None`` if empty."""
    if not href:
        return None
    link = href.strip()
    if link.startswith(("http://", "https://")):
        return link
    # Join with exactly one slash between host and path.
    return BASE_HOST + (link if link.startswith("/") else "/" + link)
|
|
def normalize_url(href):
    """Canonicalise ``href``: absolute, no query/fragment, no trailing slash.

    Returns ``None`` for empty input.
    """
    if not href:
        return None
    pieces = urlparse(absolute_url(href))
    bare = pieces._replace(query="", fragment="")
    return urlunparse(bare).rstrip("/")
|
|
def parse_cnn_date(raw_date):
    """Convert a CNN Indonesia timestamp to ``YYYY-MM-DD HH:MM``.

    Accepts text such as ``"Selasa, 12 Agu 2025 11:01 WIB"``, optionally
    preceded by a ``"... | "`` prefix. Month names are mapped explicitly
    (Indonesian and English, abbreviated or full) rather than through
    ``locale.setlocale``. Parsing therefore works where the ``id_ID`` locale
    is not installed and never changes process-wide locale state.

    Args:
        raw_date: Date text scraped from the article page; may be empty/None.

    Returns:
        The normalised timestamp; ``"-"`` for empty input; otherwise the
        stripped input text (after any ``|`` prefix) when it cannot be parsed.
    """
    import re
    from datetime import datetime

    if not raw_date:
        return "-"
    if '|' in raw_date:
        raw_date = raw_date.split('|')[1]
    raw = raw_date.replace(" WIB", "").strip()

    # Keyed by the first three letters of the month name, lower-cased.
    month_numbers = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'mei': 5, 'may': 5,
        'jun': 6, 'jul': 7, 'agu': 8, 'aug': 8, 'sep': 9, 'okt': 10,
        'oct': 10, 'nov': 11, 'des': 12, 'dec': 12,
    }
    match = re.fullmatch(
        r'[^,]+,\s*(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+(\d{1,2}):(\d{2})', raw
    )
    if not match:
        return raw_date.strip()

    day, month_name, year, hour, minute = match.groups()
    month = month_numbers.get(month_name[:3].lower())
    if month is None:
        return raw_date.strip()
    try:
        # Building a datetime rejects impossible values such as day 32.
        parsed = datetime(int(year), month, int(day), int(hour), int(minute))
    except ValueError:
        return raw_date.strip()
    return parsed.strftime("%Y-%m-%d %H:%M")
|
|
def looks_like_article_href(href):
    """Return True when ``href`` has the shape of a CNN Indonesia article URL.

    Paths containing a listing/section marker (search, tag, video, ...) are
    rejected. Otherwise the path must contain a segment of the form
    ``/<14 digits>-<2-3 digits>-<6+ digits>``.
    """
    if not href:
        return False
    path = urlparse(href.strip()).path
    non_article_markers = ('/search', '/tag', '/kategori', '/author',
                           '/channel', '/indeks', '/video', '/foto')
    for marker in non_article_markers:
        if marker in path:
            return False
    return re.search(r'/\d{14}-\d{2,3}-\d{6,}', path) is not None
|
|
# Browser-like User-Agent sent with every article request.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"}


def fetch_article_detail(url, retries=3, delay=3):
    """Download ``url`` and return its HTML, retrying on failure.

    Args:
        url: Article URL to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The response text of the first HTTP 200 answer, or ``None`` when
        every attempt failed or returned another status.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
        except Exception as e:
            # Best-effort fetch: any failure is logged and retried.
            print(f" WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
        else:
            if resp.status_code == 200:
                return resp.text
            print(f" WARNING: HTTP {resp.status_code} saat akses {url}")
        # Wait only when another attempt follows; sleeping after the final
        # failure just delays the caller.
        if attempt < retries:
            time.sleep(delay)
    return None
|
|
def _parse_cnn_article(html_detail, link):
    """Build one result row (title, date, tags, body, link) from article HTML."""
    detail_soup = BeautifulSoup(html_detail, "html.parser")

    title_el = detail_soup.select_one('h1')
    title_text = title_el.get_text(strip=True) if title_el else "-"

    date_el = detail_soup.select_one('div.text-cnn_grey.text-sm')
    date_text = parse_cnn_date(date_el.get_text(strip=True)) if date_el else "-"

    # Tags are the links in the sibling <div> after the "TOPIK TERKAIT" header.
    # ``string=`` is the current name of Beautiful Soup's former ``text=`` filter.
    tags_list = []
    topik_terkait_header = detail_soup.find(
        'div', class_='title-box', string=re.compile(r'\s*TOPIK TERKAIT\s*')
    )
    if topik_terkait_header:
        tags_container = topik_terkait_header.find_next_sibling('div')
        if tags_container:
            tags_list = [tag.get_text(strip=True) for tag in tags_container.select('a')]

    # Body paragraphs, without "lihat juga" teasers and ad placeholders.
    content_parts = []
    content_container = detail_soup.select_one("div.detail-text")
    if content_container:
        for p in content_container.find_all('p'):
            text = p.get_text(" ", strip=True)
            if text and not text.lower().startswith(("lihat juga", "scroll to continue")):
                content_parts.append(text)

    return {
        "judul": title_text, "tanggal": date_text,
        "tag": ", ".join(tags_list) if tags_list else "-",
        "isi_berita": "\n".join(content_parts) if content_parts else "-", "link": link
    }


def scrape_cnn_with_selenium(keyword, max_pages=3, delay_between_items=(1,2)):
    """Scrape cnnindonesia.com search results using a headless Chrome browser.

    Search pages are rendered with Selenium; the linked articles are then
    downloaded with plain HTTP requests and parsed.

    Args:
        keyword: Search phrase.
        max_pages: Number of search result pages to open.
        delay_between_items: ``(min, max)`` seconds to sleep after each article.

    Returns:
        A ``pandas.DataFrame`` with the columns ``judul``, ``tanggal``,
        ``tag``, ``isi_berita`` and ``link``.
    """
    results = []
    seen_links = set()

    print("Menginisialisasi browser Chrome...")
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(service=service, options=options)
    # The ``finally`` clause guarantees the browser process is shut down even
    # when scraping aborts with an exception.
    try:
        driver.set_page_load_timeout(30)
        cookie_banner_checked = False

        for page in range(1, max_pages + 1):
            url = make_search_url(keyword, page)
            print(f"\nMembuka halaman {page} -> {url}")
            try:
                driver.get(url)
            except TimeoutException:
                # The page-load timeout set above makes get() raise; skip the
                # page instead of aborting the run.
                print(f" WARNING: Waktu habis memuat halaman {page}, lanjut ke halaman berikutnya.")
                continue

            # The cookie consent pop-up is looked for once, on the first page
            # that loads.
            if not cookie_banner_checked:
                cookie_banner_checked = True
                try:
                    print("Mencari pop-up cookie...")
                    cookie_agree_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                    )
                    cookie_agree_button.click()
                    print(" Pop-up cookie ditemukan dan ditutup.")
                    time.sleep(2)
                except TimeoutException:
                    print(" Pop-up cookie tidak ditemukan, melanjutkan proses.")

            print(f"Mengambil data dari halaman {page}...")

            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                )
            except TimeoutException:
                print(f" WARNING: Waktu habis menunggu konten di halaman {page}. Mungkin halaman ini kosong.")
                continue

            soup = BeautifulSoup(driver.page_source, "html.parser")
            link_elements = soup.select('div.nhl-list article a[href]')

            all_links_in_page = {
                normalize_url(a['href'])
                for a in link_elements
                if looks_like_article_href(a['href'])
            }
            new_links = all_links_in_page - seen_links

            if not new_links:
                print(" Tidak ada link baru yang ditemukan di halaman ini.")
                continue

            print(f" Ditemukan {len(new_links)} link baru.")
            seen_links.update(new_links)

            for link in sorted(new_links):
                print(f" -> Memproses: {link}")
                html_detail = fetch_article_detail(link)
                if not html_detail:
                    continue
                row = _parse_cnn_article(html_detail, link)
                results.append(row)
                print(f" Berhasil: {row['judul']} | Tanggal: {row['tanggal']}")
                time.sleep(random.uniform(*delay_between_items))
    finally:
        print("\nMenutup browser...")
        driver.quit()

    return pd.DataFrame(results)
|
|
if __name__ == "__main__":
    # Script entry point: scrape up to 100 search pages and store them as CSV.
    keyword = "kabupaten cirebon"
    df = scrape_cnn_with_selenium(keyword, max_pages=100)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        df.to_csv("cnnindonesia_berita_final.csv", index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke cnnindonesia_berita_final.csv")
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import pandas as pd |
| import time |
| import random |
| from urllib.parse import quote, urlparse, urlunparse |
| import re |
|
|
| |
# Site root of radarcirebon.id.
BASE_HOST = "https://radarcirebon.id"


def make_search_url(keyword, page):
    """Build the radarcirebon.id search URL for a 1-based result page.

    Spaces in the keyword become ``+``; pages after the first add a
    ``page/<n>/`` path segment, e.g.
    ``https://radarcirebon.id/search/kabupaten+cirebon/page/2/``.
    """
    slug = quote(keyword).replace('%20', '+')
    first_page_url = f"{BASE_HOST}/search/{slug}/"
    if page == 1:
        return first_page_url
    return f"{first_page_url}page/{page}/"
|
|
def normalize_url(href):
    """Return ``href`` as a clean absolute URL, or ``None`` if unusable.

    Protocol-relative links get an ``https:`` scheme, root-relative links
    are prefixed with ``BASE_HOST``, and anything else that does not start
    with ``http`` is rejected. The query string, fragment and trailing
    slashes are removed from the result.
    """
    if not href:
        return None
    candidate = href.strip()

    if candidate.startswith("//"):
        candidate = f"https:{candidate}"
    elif candidate.startswith("/"):
        candidate = f"{BASE_HOST}{candidate}"
    elif not candidate.startswith("http"):
        return None

    scheme, netloc, path, params, _query, _fragment = urlparse(candidate)
    return urlunparse((scheme, netloc, path, params, "", "")).rstrip("/")
|
|
# Indonesian month abbreviations -> two-digit month number.
_ID_MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'Mei': '05', 'Jun': '06',
    'Jul': '07', 'Agu': '08', 'Sep': '09', 'Okt': '10', 'Nov': '11', 'Des': '12',
}


def parse_radarcirebon_date(raw_date):
    """Convert ``'Selasa, 12 Agu 2025 - 11:01'`` to ``'YYYY-MM-DD HH:MM'``.

    Args:
        raw_date: Date text scraped from the article page; may be empty/None.

    Returns:
        The normalised timestamp; ``"-"`` for empty input; the stripped input
        text when it does not have the expected shape or names an unknown
        month. An unknown month is never rendered as month ``00``.
    """
    if not raw_date:
        return "-"
    try:
        # Drop the weekday ("Selasa, ") and the " - " separator before the time.
        date_part = raw_date.split(', ')[1]
        day, month_name, year, time_str = date_part.replace(' - ', ' ').split()[:4]
        # The first three letters also cover full names such as "Agustus".
        month = _ID_MONTH_NUMBERS[month_name[:3].capitalize()]
    except (IndexError, ValueError, KeyError):
        return raw_date.strip()
    return f"{year}-{month}-{day.zfill(2)} {time_str}"
|
|
def looks_like_article_href(href):
    """Return True when ``href`` contains a ``/YYYY/MM/DD/`` date path.

    Article URLs on the site look like ``/2025/08/12/nama-artikel/``; empty
    or ``None`` input is never an article.
    """
    if not href:
        return False
    dated_path = re.search(r'/\d{4}/\d{2}/\d{2}/', href)
    return dated_path is not None
|
|
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/126.0.0.0 Safari/537.36"
}


def fetch_url(url, retries=3, delay=3):
    """Download ``url`` and return its HTML, retrying on failure.

    Args:
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The response text of the first HTTP 200 answer, or ``None`` when
        every attempt failed or returned another status.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
        except Exception as e:
            # Best-effort fetch: any failure is logged and retried.
            print(f" WARNING: Gagal akses {url} ({attempt}/{retries}): {e}")
        else:
            if resp.status_code == 200:
                return resp.text
            print(f" WARNING: HTTP {resp.status_code} saat akses {url}")
        # Wait only when another attempt follows; sleeping after the final
        # failure just delays the caller.
        if attempt < retries:
            time.sleep(delay)
    return None
|
|
def scrape_radarcirebon(keyword, max_pages=3, delay_between_items=(1, 2), delay_between_pages=(2, 4)):
    """Scrape radarcirebon.id search results and the articles they link to.

    Args:
        keyword: Search phrase.
        max_pages: Number of result pages to request.
        delay_between_items: ``(min, max)`` seconds to sleep after each article.
        delay_between_pages: ``(min, max)`` seconds to sleep after each page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``judul``, ``tanggal``,
        ``tag``, ``isi_berita`` and ``link``; missing values are ``"-"``.
    """
    rows = []
    visited = set()

    for page in range(1, max_pages + 1):
        url = make_search_url(keyword, page)
        print(f"\nScraping halaman {page} -> {url}")

        listing_html = fetch_url(url)
        if not listing_html:
            print(f" ERROR: Gagal mengambil halaman pencarian {page}")
            continue

        listing = BeautifulSoup(listing_html, "html.parser")

        anchors = listing.select('article .wp-block-latest-posts__post-title a')
        print(f" DEBUG: Ditemukan {len(anchors)} elemen link di halaman {page}")

        # Keep only date-pathed article links, normalised for de-duplication.
        page_links = set()
        for anchor in anchors:
            raw_href = anchor.get('href')
            if not raw_href or not looks_like_article_href(raw_href):
                continue
            normalized = normalize_url(raw_href)
            if normalized:
                page_links.add(normalized)

        fresh_links = page_links - visited
        print(f" Menemukan {len(page_links)} link artikel di halaman ini, {len(fresh_links)} link baru.")
        visited |= page_links

        for link in sorted(fresh_links):
            article_html = fetch_url(link)
            if not article_html:
                print(f" ERROR: Gagal mengambil artikel {link}")
                continue

            article = BeautifulSoup(article_html, "html.parser")

            heading = article.select_one('h1.entry-title')
            title_detail = heading.get_text(strip=True) if heading else "-"

            stamp = article.select_one('time.entry-date')
            date_detail = parse_radarcirebon_date(stamp.get_text(strip=True)) if stamp else "-"

            # Body paragraphs, skipping those nested in a "read-also" box.
            paragraphs = []
            body = article.select_one('div.entry-content')
            if body:
                candidate_texts = (
                    p.get_text(" ", strip=True)
                    for p in body.select('p')
                    if not p.find_parent(class_='read-also')
                )
                paragraphs = [text for text in candidate_texts if text]
            content = "\n".join(paragraphs)

            # Tags from the tag-cloud block, de-duplicated in first-seen order.
            tag_cloud = article.select_one('div.wp-block-tag-cloud')
            tag_names = [a.get_text(strip=True) for a in tag_cloud.select('a')] if tag_cloud else []
            tag_names = list(dict.fromkeys(tag_names))

            rows.append({
                "judul": title_detail,
                "tanggal": date_detail,
                "tag": ", ".join(tag_names) if tag_names else "-",
                "isi_berita": content if content else "-",
                "link": link
            })
            print(f" Berhasil: {title_detail} | Tanggal: {date_detail}")
            time.sleep(random.uniform(*delay_between_items))

        time.sleep(random.uniform(*delay_between_pages))

    return pd.DataFrame(rows)
|
|
if __name__ == "__main__":
    # Script entry point: scrape three result pages and store them as CSV.
    keyword = "kabupaten cirebon"

    df = scrape_radarcirebon(keyword, max_pages=3)
    if df.empty:
        print("\nTidak ada data yang berhasil di-scrape.")
    else:
        output_filename = "radarcirebon_berita.csv"
        df.to_csv(output_filename, index=False, encoding="utf-8-sig")
        print(f"\nSelesai menyimpan {len(df)} data berita ke {output_filename}")
|
|
| |
|
|
| import requests |
|
|
# Debug helper: download one article page and save its raw HTML for inspection.
url = "https://radarcirebon.id/2025/08/12/warga-resah-dprd-cirebon-panggil-dpkpp-untuk-tuntaskan-masalah-psu-di-dua-perumahan/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}
# One variable for the output path so the log message always names the file
# that was actually written.
output_path = "detail.html"
# Without a timeout, requests waits forever on a stalled connection.
resp = requests.get(url, headers=headers, timeout=15)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(resp.text)
print(f"HTML halaman disimpan ke {output_path}")
|
|
| |
|
|
| import requests |
| from bs4 import BeautifulSoup |
| import pandas as pd |
| import time |
| from datetime import datetime |
|
|
def scrape_detik_search(keyword, max_years=3, max_pages=100):
    """Scrape detik.com search results for ``keyword``, limited by article age.

    Requests up to ``max_pages`` result pages (``sortby=time``) and opens
    every listed article. Scraping ends as soon as a listing entry is older
    than ``max_years`` years.

    Args:
        keyword: Search phrase sent as the ``query`` parameter.
        max_years: Maximum article age in years.
        max_pages: Maximum number of result pages to visit.

    Returns:
        A ``pandas.DataFrame`` with the columns ``judul``, ``tanggal``,
        ``tag``, ``isi_berita`` and ``link``. Rows collected before a failure
        or before the age cut-off are still returned.
    """
    base_search_url = "https://www.detik.com/search/searchall"
    # Without a timeout, requests waits forever on a stalled connection.
    request_timeout = 15
    results = []

    now = datetime.now()
    try:
        cutoff_date = now.replace(year=now.year - max_years)
    except ValueError:
        # 29 February has no counterpart in a non-leap target year.
        cutoff_date = now.replace(year=now.year - max_years, day=28)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
    }

    for page in range(1, max_pages + 1):
        params = {
            'query': keyword,
            'siteid': '2',
            'sortby': 'time',
            'page': page
        }
        print(f"Scraping page {page}...")
        try:
            r = requests.get(base_search_url, params=params, headers=headers,
                             timeout=request_timeout)
        except requests.RequestException as e:
            # Stop paging but keep what has been collected so far.
            print(f"Gagal akses halaman ({e}), hentikan scraping.")
            break
        if r.status_code != 200:
            print(f"Gagal akses halaman (status {r.status_code}), hentikan scraping.")
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        news_list = soup.find_all('div', class_='media')

        if not news_list:
            print("Tidak ada berita ditemukan di halaman ini, hentikan scraping.")
            break

        for news in news_list:
            # Per-item boundary: one malformed entry must not abort the run.
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The listing carries the publish time as an epoch value in
                # the span's ``d-time`` attribute.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                span_tag = date_tag.find('span') if date_tag else None
                if span_tag and span_tag.has_attr('d-time'):
                    news_date = datetime.fromtimestamp(int(span_tag['d-time']))

                # The first entry older than the cut-off ends the whole run.
                if news_date and news_date < cutoff_date:
                    print(f"Berita sudah melewati batas waktu {max_years} tahun, hentikan scraping.")
                    return pd.DataFrame(results)

                news_resp = requests.get(link, headers=headers, timeout=request_timeout)
                if news_resp.status_code != 200:
                    print(f"Gagal akses detail berita: {link} (status {news_resp.status_code}), skip berita ini.")
                    continue

                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = news_soup.find('div', class_='detail__body-text') or \
                    news_soup.find('div', class_='detail_text')

                content_parts = []
                if content_div:
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are kept, prefixed with their level (e.g. "H2: ...").
                        if tag.name.startswith('h'):
                            content_parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            content_parts.append(text)
                content = '\n'.join(content_parts)

                tag_list_div = news_soup.find('div', class_='tag__list') or \
                    news_soup.find('div', class_='detail_tag')

                tags = []
                if tag_list_div:
                    tags = [t.text.strip() for t in tag_list_div.find_all('a')]

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })

                print(f"Berhasil scrape berita: {title}")

                time.sleep(1)

            except Exception as e:
                print(f"Error saat memproses berita: {e}")
                continue

        time.sleep(2)

    return pd.DataFrame(results)
|
|
if __name__ == "__main__":
    keyword = "Kabupaten Cirebon"
    # One variable for the output path so the log message always names the
    # file that was actually written.
    output_filename = "detik_berita_cirebonnn.csv"
    df = scrape_detik_search(keyword)
    if not df.empty:
        df.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"Selesai menyimpan data berita ke {output_filename}")
    else:
        print("Tidak ada data yang berhasil di-scrape.")