Spaces:
Sleeping
Sleeping
FauziIsyrinApridal committed on
Commit ·
4f5cf5c
1
Parent(s): 39cd32c
...
Browse files- requirements.txt +1 -3
- scrapping/dosen_scrap.py +306 -141
- scrapping/jadwal_scrap.py +402 -227
- scrapping/jurusan_scrap.py +315 -119
- scrapping/pnp_scrap.py +411 -94
- scrapping/utils/crawl4ai_utils.py +0 -168
requirements.txt
CHANGED
|
@@ -2,6 +2,4 @@ scrapy
|
|
| 2 |
supabase
|
| 3 |
python-dotenv
|
| 4 |
requests
|
| 5 |
-
beautifulsoup4
|
| 6 |
-
crawl4ai
|
| 7 |
-
playwright
|
|
|
|
| 2 |
supabase
|
| 3 |
python-dotenv
|
| 4 |
requests
|
| 5 |
+
beautifulsoup4
|
|
|
|
|
|
scrapping/dosen_scrap.py
CHANGED
|
@@ -1,160 +1,325 @@
|
|
|
|
|
|
|
|
| 1 |
from datetime import datetime
|
| 2 |
import re
|
| 3 |
from supabase import create_client
|
| 4 |
import os
|
| 5 |
-
|
| 6 |
-
from bs4 import BeautifulSoup
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
try:
|
| 10 |
-
from utils.crawl4ai_utils import crawl_domain_parallel_sync
|
| 11 |
-
except Exception:
|
| 12 |
-
import sys as _sys
|
| 13 |
-
import os as _os
|
| 14 |
-
_sys.path.append(_os.path.join(_os.path.dirname(__file__), 'utils'))
|
| 15 |
-
from crawl4ai_utils import crawl_domain_parallel_sync
|
| 16 |
-
|
| 17 |
-
# Dedup upload utility
|
| 18 |
try:
|
| 19 |
from utils.supabase_utils import upload_if_changed
|
| 20 |
except Exception:
|
| 21 |
-
|
| 22 |
-
import os as _os2
|
| 23 |
-
_sys2.path.append(_os2.path.join(_os2.path.dirname(__file__), 'utils'))
|
| 24 |
from supabase_utils import upload_if_changed
|
| 25 |
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def parse_tables(html: str, page_url: str = '') -> Dict[str, List[Dict[str, str]]]:
|
| 41 |
-
soup = BeautifulSoup(html or '', 'html.parser')
|
| 42 |
-
data: Dict[str, List[Dict[str, str]]] = {
|
| 43 |
-
'jabatan': [],
|
| 44 |
-
'staff_pengajar': [],
|
| 45 |
-
'staff_administrasi': [],
|
| 46 |
-
'staff_teknisi': [],
|
| 47 |
}
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
for url, html in all_pages.items():
|
| 96 |
-
if not html:
|
| 97 |
-
continue
|
| 98 |
-
parsed = parse_tables(html, page_url=url)
|
| 99 |
-
for k, v in parsed.items():
|
| 100 |
-
merged[k].extend(v)
|
| 101 |
-
return merged
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
if not items:
|
| 117 |
-
continue
|
| 118 |
-
lines.append(f"# {title}\n")
|
| 119 |
-
lines.append(f"Jumlah data: {len(items)}\n\n")
|
| 120 |
-
for it in items:
|
| 121 |
-
if key == 'jabatan':
|
| 122 |
-
paragraph = f"{it.get('pejabat','')} menjabat sebagai {it.get('jabatan','')}"
|
| 123 |
-
else:
|
| 124 |
-
paragraph = f"{it.get('nama','')} adalah staf dengan NIP {it.get('nip','')}"
|
| 125 |
-
if it.get('jurusan'):
|
| 126 |
-
paragraph += f" dan bertugas di {it['jurusan']}"
|
| 127 |
-
lines.append(paragraph.strip() + "\n")
|
| 128 |
-
lines.append("\n")
|
| 129 |
-
return ''.join(lines)
|
| 130 |
|
|
|
|
| 131 |
|
| 132 |
if __name__ == '__main__':
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
)
|
| 137 |
-
bucket = os.environ.get('NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET', 'pnp-bot-storage')
|
| 138 |
-
|
| 139 |
-
pages = crawl_domain_parallel_sync(
|
| 140 |
-
seed_url=SEED_URL,
|
| 141 |
-
max_pages=40,
|
| 142 |
-
max_concurrency=6,
|
| 143 |
-
only_important=True,
|
| 144 |
-
timeout=30,
|
| 145 |
-
headless=True,
|
| 146 |
-
)
|
| 147 |
-
collected = merge_collections(pages)
|
| 148 |
-
text = build_text(collected)
|
| 149 |
-
ts = datetime.now().strftime('%Y%m%d_%H%M')
|
| 150 |
-
filename = f"data_dosen_{ts}.txt"
|
| 151 |
-
try:
|
| 152 |
-
result = upload_if_changed(supabase, bucket, filename, text)
|
| 153 |
-
if result.get('result') == 'uploaded':
|
| 154 |
-
print(f"✅ Uploaded {filename}")
|
| 155 |
-
elif result.get('result') == 'skipped':
|
| 156 |
-
print(f"⏭️ Skipped (unchanged) {filename}")
|
| 157 |
-
else:
|
| 158 |
-
print(f"❌ Upload error: {result.get('error')}")
|
| 159 |
-
except Exception as e:
|
| 160 |
-
print(f"❌ Error uploading: {e}")
|
|
|
|
| 1 |
+
import scrapy
|
| 2 |
+
from scrapy.crawler import CrawlerProcess
|
| 3 |
from datetime import datetime
|
| 4 |
import re
|
| 5 |
from supabase import create_client
|
| 6 |
import os
|
| 7 |
+
import sys
|
|
|
|
| 8 |
|
| 9 |
+
# Try import shared dedup upload utility
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
try:
|
| 11 |
from utils.supabase_utils import upload_if_changed
|
| 12 |
except Exception:
|
| 13 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
|
|
|
|
|
|
|
| 14 |
from supabase_utils import upload_if_changed
|
| 15 |
|
| 16 |
|
| 17 |
+
class DosenSpider(scrapy.Spider):
|
| 18 |
+
name = 'dosen_spider'
|
| 19 |
+
start_urls = ['https://sipeg.pnp.ac.id/']
|
| 20 |
+
|
| 21 |
+
custom_settings = {
|
| 22 |
+
'DOWNLOAD_DELAY': 1,
|
| 23 |
+
'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
|
| 24 |
+
'ROBOTSTXT_OBEY': True,
|
| 25 |
+
'LOG_LEVEL': 'INFO',
|
| 26 |
+
'CONCURRENT_REQUESTS': 1,
|
| 27 |
+
'HTTPCACHE_ENABLED': False,
|
| 28 |
+
'RETRY_TIMES': 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
}
|
| 30 |
+
|
| 31 |
+
def __init__(self, *args, **kwargs):
    """Set up the spider's Supabase client, target bucket and result list.

    All positional and keyword arguments are forwarded unchanged to the
    base spider constructor.
    """
    super(DosenSpider, self).__init__(*args, **kwargs)

    # Connection settings are read from the process environment; a missing
    # variable is passed on as None, exactly as os.environ.get returns it.
    env = os.environ
    supabase_url = env.get("NEXT_PUBLIC_SUPABASE_URL")
    service_key = env.get("SUPABASE_SERVICE_KEY")
    self.supabase = create_client(supabase_url, service_key)

    self.storage_bucket = env.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")

    # Every item yielded by the table extractors is also kept here so that
    # closed() can render the whole crawl into one text document.
    self.collected_data = []
|
| 40 |
+
|
| 41 |
+
def parse(self, response):
    """Walk the navigation menu and schedule every linked page for scraping.

    For each ``li.level1`` entry, and then for each ``li.level2`` entry
    nested inside it, a request is yielded for the entry's first link.
    The entry's ``span.bg`` label travels along as ``page_title`` and the
    page counter starts at 1.

    Yields:
        scrapy.Request objects handled by ``parse_page``.
    """
    def request_for(entry):
        # Build the request for one menu entry, or None when it has no link.
        label = entry.css('span.bg::text').get('').strip()
        href = entry.css('a::attr(href)').get()
        if not href:
            return None
        return scrapy.Request(
            url=response.urljoin(href),
            callback=self.parse_page,
            meta={'page_title': label, 'page_number': 1}
        )

    for top_entry in response.css('li.level1'):
        top_request = request_for(top_entry)
        if top_request is not None:
            yield top_request

        for sub_entry in top_entry.css('li.level2'):
            sub_request = request_for(sub_entry)
            if sub_request is not None:
                yield sub_request
|
| 74 |
+
|
| 75 |
+
def parse_page(self, response):
    """Scrape every recognised table on one page, then follow its pagination.

    Args:
        response: Scrapy response of the page. ``response.meta`` may carry
            ``page_title`` (menu label, default ``''``) and ``page_number``
            (1-based page counter, default ``1``).

    Yields:
        The item dicts produced by ``extract_officials_table`` and
        ``extract_staff_table``, followed by at most one ``scrapy.Request``
        for the next page (handled by this same callback).
    """
    def offset_in(url):
        # Integer value of the "p" query parameter of url, or None if absent.
        found = re.search(r'[?&]p=(-?\d+)', url)
        return int(found.group(1)) if found else None

    page_title = response.meta.get('page_title', '')
    page_number = response.meta.get('page_number', 1)
    staff_type = self.determine_simple_staff_type(page_title)

    # A "data ... belum tersedia" notice ends the listing (presumably what
    # the site shows once the offset runs past the last record).
    page_text = ' '.join(response.css('body ::text').getall()).lower()
    unavailable_messages = (
        'data staf pengajar belum tersedia',
        'data staf administrasi belum tersedia',
        'data staf teknisi belum tersedia',
    )
    if any(msg in page_text for msg in unavailable_messages):
        self.logger.info(f"Data tidak tersedia pada halaman: {response.url}")
        return

    # Classify each table by its header cells and hand it to the matching
    # extractor, counting rows so pagination can tell whether the page had data.
    tables = response.css('table.table-landscape, table.table, table.table-bordered')
    if not tables:
        self.logger.info(f"No tables found on page: {response.url}")

    staff_rows = 0
    rows_found = 0
    for table in tables:
        headers = [h.strip() for h in table.css('th::text').getall()]
        if 'Jabatan' in headers and 'Pejabat' in headers:
            for item in self.extract_officials_table(table, page_title):
                rows_found += 1
                yield item
        elif 'Nama' in headers and 'NIP' in headers:
            for item in self.extract_staff_table(table, page_title, staff_type, page_number):
                staff_rows += 1
                rows_found += 1
                yield item

    current_url = response.url
    base_url = current_url.split('?')[0]
    current_p = offset_in(current_url) or 0

    # Listing size per page for each staff type; 0 means "no offset paging".
    items_per_page = {
        'staff_pengajar': 30,
        'staff_administrasi': 25,
        'staff_teknisi': 25,
    }.get(staff_type, 0)

    # 1) Prefer an explicit "Next" link rendered by the site.
    next_page = None
    next_link = response.xpath('//span[@class="table-link"]/a[contains(text(), "Next")]/@href').get()
    if next_link:
        next_page = response.urljoin(next_link)
    elif current_p >= 0 and items_per_page > 0 and staff_rows > 0:
        # 2) Otherwise guess the next offset, but only when this page
        # actually produced staff rows. Guessing from an empty page would
        # keep requesting ever larger offsets and never terminate.
        next_page = f"{base_url}?p={current_p + items_per_page}"
        self.logger.info(f"Constructed next page URL with p parameter: {next_page}")

    # 3) Common pagination widgets.
    if not next_page:
        pagination_xpath_patterns = [
            '//ul[contains(@class, "pagination")]/li/a[contains(text(), "Next")]/@href',
            '//ul[contains(@class, "pagination")]/li/a[contains(text(), "»")]/@href',
            # Exact label match: contains() would also accept "12" for page 2.
            f'//ul[contains(@class, "pagination")]/li/a[normalize-space(text())="{page_number + 1}"]/@href',
            '//a[@class="next page-numbers"]/@href',
        ]
        for xpath in pagination_xpath_patterns:
            next_page_link = response.xpath(xpath).get()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                self.logger.info(f"Found next page link using XPath: {next_page}")
                break

    # 4) Last resort: bump a page counter already present in the URL.
    # Only the first marker found is tried. Like step 2 this is a guess, so
    # it is skipped for pages without data. No "p=" marker is needed here:
    # step 2 already covers every case where the offset can advance.
    if not next_page and rows_found:
        for marker in ('page=', 'halaman=', 'page/'):
            if marker in current_url:
                candidate = current_url.replace(
                    f'{marker}{page_number}', f'{marker}{page_number + 1}'
                )
                if candidate != current_url:
                    next_page = candidate
                break

    if next_page:
        next_page_number = page_number + 1
        next_p = offset_in(next_page)
        # Derive the page number from the offset when possible; skipped for
        # unpaged types, where items_per_page is 0 and division is undefined.
        if next_p is not None and items_per_page > 0:
            next_page_number = (next_p // items_per_page) + 1

        self.logger.info(f"Following to next page: {next_page} (Page {next_page_number})")
        yield scrapy.Request(
            url=next_page,
            callback=self.parse_page,
            meta={'page_title': page_title, 'page_number': next_page_number}
        )
|
| 184 |
+
|
| 185 |
+
def determine_simple_staff_type(self, page_title):
    """Classify a page by keywords found in its title.

    Matching is a case-insensitive substring test; the groups are checked
    in the order teaching, administration, technical and the first group
    with a hit wins.

    Args:
        page_title: Menu label of the page. ``None`` or an empty string is
            treated as "no keywords".

    Returns:
        One of ``'staff_pengajar'``, ``'staff_administrasi'``,
        ``'staff_teknisi'`` or ``'staff_lainnya'`` when nothing matches.
    """
    title = (page_title or '').lower()

    keyword_groups = (
        ('staff_pengajar', ('dosen', 'pengajar', 'akademik', 'jurusan')),
        ('staff_administrasi', ('administrasi', 'admin', 'tata usaha', 'pegawai')),
        ('staff_teknisi', ('teknisi', 'lab', 'teknik', 'laboratorium')),
    )
    for staff_type, keywords in keyword_groups:
        if any(word in title for word in keywords):
            return staff_type

    return 'staff_lainnya'
|
| 197 |
+
|
| 198 |
+
def extract_officials_table(self, table, page_title):
    """Turn the rows of a "Jabatan / Pejabat" table into item dicts.

    Rows with fewer than three ``td`` cells (such as header rows) and rows
    that carry neither a position nor an official are skipped. Each item is
    appended to ``self.collected_data`` and yielded.

    Args:
        table: Selector of the ``<table>`` element.
        page_title: Menu label of the page, stored as ``'halaman'``.

    Yields:
        Dicts with the keys ``halaman``, ``tipe`` (always ``'jabatan'``),
        ``nomor``, ``jabatan``, ``pejabat`` and ``periode``.
    """
    # The period cell is read from an HTML comment inside the row.
    # DOTALL lets the commented-out cell span several source lines.
    period_pattern = re.compile(r'<!--\s*<td[^>]*>(.*?)</td>\s*-->', re.DOTALL)

    def first_text(cell):
        # First non-blank text node of the cell; the very first node is
        # often just the whitespace that precedes nested markup.
        for fragment in cell.css('::text').getall():
            fragment = fragment.strip()
            if fragment:
                return fragment
        return ''

    for row in table.css('tr'):
        cells = row.css('td')
        if len(cells) < 3:
            continue

        position = first_text(cells[1])
        official = first_text(cells[2])
        if not position and not official:
            # Same rule as the staff table: a row without content is noise.
            continue

        period_match = period_pattern.search(row.get())
        period = period_match.group(1).strip() if period_match else ""

        item = {
            'halaman': page_title,
            'tipe': 'jabatan',
            'nomor': first_text(cells[0]),
            'jabatan': position,
            'pejabat': official,
            'periode': period
        }
        self.collected_data.append(item)
        yield item
|
| 225 |
+
|
| 226 |
+
def extract_staff_table(self, table, page_title, staff_type, page_number):
    """Turn the rows of a "Nama / NIP" staff table into item dicts.

    The first ``tr`` is skipped as the header row. Rows with fewer than
    three ``td`` cells, or with neither a name nor a NIP, are skipped.
    Each item is appended to ``self.collected_data`` and yielded.

    Args:
        table: Selector of the ``<table>`` element.
        page_title: Menu label of the page, stored as ``'halaman'``.
        staff_type: Category stored as ``'tipe'``.
        page_number: Page counter stored as ``'halaman_ke'``.

    Yields:
        Dicts with the keys ``halaman``, ``tipe``, ``halaman_ke``,
        ``nomor``, ``nama``, ``nip``, ``jurusan`` and ``detail``.
        ``detail`` is the raw ``href`` of the name link, or ``None`` when
        the name is not a link.
    """
    def first_text(cell):
        # First non-blank text node of the cell; the very first node is
        # often just the whitespace that precedes nested markup.
        for fragment in cell.css('::text').getall():
            fragment = fragment.strip()
            if fragment:
                return fragment
        return ''

    for row in table.css('tr')[1:]:
        cells = row.css('td')
        if len(cells) < 3:
            continue

        # The name is normally a link to the detail page; fall back to the
        # cell's own text when it is not.
        name_cell = cells[1]
        link_text = name_cell.css('a::text').get()
        name = link_text.strip() if link_text else first_text(name_cell)
        # Read unconditionally so the name is always bound when the item is built.
        detail_url = name_cell.css('a::attr(href)').get()

        nip = first_text(cells[2])
        if not name and not nip:
            continue

        item = {
            'halaman': page_title,
            'tipe': staff_type,
            'halaman_ke': page_number,
            'nomor': first_text(cells[0]),
            'nama': name,
            'nip': nip,
            # The department column is optional.
            'jurusan': first_text(cells[3]) if len(cells) > 3 else "",
            'detail': detail_url
        }
        self.collected_data.append(item)
        yield item
|
| 263 |
+
|
| 264 |
+
def closed(self, reason):
    """Render the collected items to text and upload them when the spider closes.

    Nothing is uploaded when no item was collected, so a failed crawl does
    not publish a file that contains only headings. Upload problems are
    logged and never raised.

    Args:
        reason: Close reason passed in by Scrapy; only used in the log
            message for the empty case.
    """
    if not self.collected_data:
        self.logger.warning(
            f"No data collected (close reason: {reason}); skipping upload to Supabase"
        )
        return

    text_content = self.generate_text_output()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"data_dosen_{timestamp}.txt"

    # Upload to Supabase with deduplication
    try:
        result = upload_if_changed(self.supabase, self.storage_bucket, filename, text_content)
        outcome = result.get('result')
        if outcome == 'uploaded':
            self.logger.info(f"Successfully uploaded {filename} to Supabase storage")
        elif outcome == 'skipped':
            self.logger.info(f"Skipped upload for {filename} (content unchanged)")
        else:
            self.logger.error(f"Failed to upload {filename} to Supabase: {result.get('error')}")
    except Exception as e:
        # Top-level boundary of the crawl: report the failure, do not crash the shutdown.
        self.logger.error(f"Error uploading to Supabase: {str(e)}")
|
| 283 |
+
|
| 284 |
+
def generate_text_output(self):
    """Render ``self.collected_data`` as one plain-text document.

    The document starts with a title and an update timestamp, followed by
    one section per item type. Known types appear in the fixed order of
    ``section_titles``; any other type follows in alphabetical order, so
    the layout does not depend on the order in which pages were crawled.
    Missing item fields render as empty text instead of raising.

    Returns:
        The complete document as a single string.
    """
    section_titles = {
        'jabatan': 'Daftar Jabatan Struktural',
        'staff_pengajar': 'Daftar Dosen dan Pengajar',
        'staff_administrasi': 'Daftar Staff Administrasi',
        'staff_teknisi': 'Daftar Staff Teknisi',
        'staff_lainnya': 'Daftar Staff Lainnya'
    }

    grouped = {}
    for item in self.collected_data:
        grouped.setdefault(item.get('tipe', 'lainnya'), []).append(item)

    ordered_types = [tipe for tipe in section_titles if tipe in grouped]
    ordered_types += sorted(tipe for tipe in grouped if tipe not in section_titles)

    # NOTE(review): the timestamp line makes the text differ on every run;
    # confirm upload_if_changed tolerates that, otherwise it never skips.
    output = [
        "# Data Dosen dan Staff PNP\n",
        f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n",
    ]

    for tipe in ordered_types:
        items = grouped[tipe]
        title = section_titles.get(tipe, tipe.capitalize())
        output.append(f"# {title}\n")
        output.append(f"Jumlah data: {len(items)}\n\n")

        for item in items:
            if tipe == 'jabatan':
                paragraph = f"{item.get('pejabat', '')} menjabat sebagai {item.get('jabatan', '')}."
                if item.get('periode'):
                    paragraph += f" Masa jabatan berlangsung selama {item['periode']}."
            else:
                paragraph = f"{item.get('nama', '')} adalah staf dengan NIP {item.get('nip', '')}."
                if item.get('jurusan'):
                    paragraph += f" Ia bertugas di {item['jurusan']}."
                if item.get('detail'):
                    paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
            output.append(paragraph + "\n\n")

    return ''.join(output)
|
| 321 |
|
| 322 |
if __name__ == '__main__':
|
| 323 |
+
process = CrawlerProcess()
|
| 324 |
+
process.crawl(DosenSpider)
|
| 325 |
+
process.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapping/jadwal_scrap.py
CHANGED
|
@@ -1,241 +1,416 @@
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
from datetime import datetime
|
| 4 |
from supabase import create_client
|
| 5 |
from io import StringIO
|
| 6 |
-
from typing import Dict, List, Tuple
|
| 7 |
-
from bs4 import BeautifulSoup
|
| 8 |
-
|
| 9 |
-
# Crawl4AI helper for rendered fetching
|
| 10 |
-
try:
|
| 11 |
-
from utils.crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
|
| 12 |
-
except Exception:
|
| 13 |
-
import sys as _sys
|
| 14 |
-
_sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
|
| 15 |
-
from crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
|
| 16 |
-
|
| 17 |
-
# Shared dedup upload utility
|
| 18 |
-
try:
|
| 19 |
-
from utils.supabase_utils import upload_if_changed
|
| 20 |
-
except Exception:
|
| 21 |
-
import sys as _sys2
|
| 22 |
-
_sys2.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
|
| 23 |
-
from supabase_utils import upload_if_changed
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
# =====================
|
| 27 |
-
# Standalone helpers for non-Scrapy execution below
|
| 28 |
-
# =====================
|
| 29 |
-
|
| 30 |
-
# Constants for targeted pages
|
| 31 |
-
BASE_PRESENSI = 'https://presensi.pnp.ac.id/'
|
| 32 |
-
ELEKTRO_URL = 'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
|
| 33 |
-
EXCLUDED = ['elektronika', 'telkom', 'listrik']
|
| 34 |
-
|
| 35 |
-
# Initialize Supabase for standalone run
|
| 36 |
-
_SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
|
| 37 |
-
_SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
|
| 38 |
-
supabase = create_client(_SUPABASE_URL, _SUPABASE_KEY)
|
| 39 |
-
bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
|
| 40 |
-
|
| 41 |
-
# Buffers for aggregated uploads keyed by jurusan_id
|
| 42 |
-
file_buffers: Dict[str, StringIO] = {}
|
| 43 |
-
|
| 44 |
-
def _init_buffer(jurusan_id: str, jurusan_name: str):
    """Create the output buffer for a department on first use.

    When ``jurusan_id`` has no buffer yet, a new ``StringIO`` is registered
    in ``file_buffers`` and primed with the document header (title,
    department, update date, source, separator). Later calls for the same
    id leave the existing buffer untouched.
    """
    if jurusan_id in file_buffers:
        return

    header = StringIO()
    file_buffers[jurusan_id] = header

    update_date = datetime.now().strftime("%Y-%m-%d")
    header_lines = (
        f"# Jadwal Perkuliahan {jurusan_name}\n\n",
        f"**Jurusan:** {jurusan_name}\n",
        f"**Tanggal Update:** {update_date}\n",
        "**Sumber:** Politeknik Negeri Padang\n\n",
        "---\n\n",
    )
    for line in header_lines:
        header.write(line)
|
| 54 |
-
|
| 55 |
-
def clean_text_list(nodes) -> List[str]:
    """Return the whitespace-normalised, non-empty text of each node.

    Each node's text is taken with ``get_text(' ', strip=True)`` and every
    run of whitespace is collapsed to one space. Entries that are not
    usable nodes (for example ``None``) and nodes without text are left
    out of the result.

    Args:
        nodes: Iterable of objects offering ``get_text``.

    Returns:
        The cleaned texts, in input order.
    """
    out: List[str] = []
    for n in nodes:
        try:
            txt = ' '.join(n.get_text(' ', strip=True).split())
        except (AttributeError, TypeError):
            # Not a text-bearing node: skip it, but let real errors surface.
            continue
        if txt:
            out.append(txt)
    return out
|
| 65 |
-
|
| 66 |
-
def build_schedule_grid_bs(days: List[str], time_slots: List[str]):
    """Build an empty timetable: one dict per day with every slot set to 'kosong'."""
    grid = {}
    for day in days:
        # Each day gets its own dict so filling one day never touches another.
        grid[day] = dict.fromkeys(time_slots, 'kosong')
    return grid
|
| 68 |
-
|
| 69 |
-
def write_schedule_to_buffer_bs(buffer: StringIO, schedule_grid: Dict[str, Dict[str, str]], days: List[str], time_slots: List[str]):
    """Write a timetable grid to ``buffer`` as one bullet line per lesson.

    For every day, consecutive time slots holding the same course are
    merged into a single entry ``- <day> <time range> | <course>``. Slots
    that are empty or marked ``kosong`` (any letter case) are not written.
    A blank line is written after each day.

    Args:
        buffer: Text buffer that receives the output.
        schedule_grid: ``grid[day][slot]`` gives the course in that slot.
        days: Day names in output order.
        time_slots: Slot labels in chronological order, each of the form
            ``"start - end"`` (split on ``-`` to merge ranges).
    """
    # Local import: the file's import block does not provide itertools.
    from itertools import groupby

    for day in days:
        slots_of_day = schedule_grid[day]
        for course, run in groupby(time_slots, key=lambda slot: slots_of_day[slot]):
            if not course or course.lower() == 'kosong':
                continue
            run = list(run)
            if len(run) > 1:
                # Merged lesson: start of its first slot to end of its last.
                first_start = run[0].split('-')[0].strip()
                last_end = run[-1].split('-')[-1].strip()
                time_range = f"{first_start} - {last_end}"
            else:
                # A single slot keeps its label exactly as scraped.
                time_range = run[0]
            buffer.write(f"- {day} {time_range} | {course}\n")
        buffer.write("\n")
|
| 94 |
-
|
| 95 |
-
def process_table(tbl, jurusan_id: str, jurusan_name: str, idx: int):
|
| 96 |
-
_init_buffer(jurusan_id, jurusan_name)
|
| 97 |
-
buf = file_buffers[jurusan_id]
|
| 98 |
-
# Caption or fallback
|
| 99 |
-
cap_tag = tbl.find('caption')
|
| 100 |
-
caption_text = cap_tag.get_text(' ', strip=True) if cap_tag else f"Jadwal Kelas {idx + 1}"
|
| 101 |
-
thead = tbl.find('thead')
|
| 102 |
-
if thead:
|
| 103 |
-
thead_text = ' '.join(thead.get_text(' ', strip=True).split())
|
| 104 |
-
if thead_text:
|
| 105 |
-
caption_text = f"{caption_text} {thead_text}"
|
| 106 |
-
caption_text = re.sub(r'\s+', ' ', caption_text).strip()
|
| 107 |
-
# Header lists
|
| 108 |
-
days = clean_text_list(thead.select('th.xAxis')) if thead else []
|
| 109 |
-
if not days and thead:
|
| 110 |
-
days = clean_text_list(thead.select('th[class*="xAxis"]'))
|
| 111 |
-
tbody = tbl.find('tbody')
|
| 112 |
-
time_slots = clean_text_list(tbody.select('tr:not(.foot) th.yAxis')) if tbody else []
|
| 113 |
-
if not time_slots and tbody:
|
| 114 |
-
time_slots = clean_text_list(tbody.select('th[class*="yAxis"]'))
|
| 115 |
-
if not days or not time_slots:
|
| 116 |
-
return
|
| 117 |
-
# Section header
|
| 118 |
-
buf.write(f"## Jadwal Perkuliahan {caption_text}\n\n")
|
| 119 |
-
buf.write("Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n")
|
| 120 |
-
# Build grid and fill
|
| 121 |
-
grid = build_schedule_grid_bs(days, time_slots)
|
| 122 |
-
rows = tbody.select('tr:not(.foot)') if tbody else []
|
| 123 |
-
active_rowspans: Dict[Tuple[int, int], Tuple[int, str]] = {}
|
| 124 |
-
for row_idx, row in enumerate(rows):
|
| 125 |
-
if row_idx >= len(time_slots):
|
| 126 |
-
continue
|
| 127 |
-
current_time = time_slots[row_idx]
|
| 128 |
-
filled_cols = set()
|
| 129 |
-
# apply rowspans
|
| 130 |
-
to_remove = []
|
| 131 |
-
for (rs_col, rs_start), (rs_left, content) in list(active_rowspans.items()):
|
| 132 |
-
if rs_left > 0 and rs_col < len(days):
|
| 133 |
-
grid[days[rs_col]][current_time] = content
|
| 134 |
-
filled_cols.add(rs_col)
|
| 135 |
-
active_rowspans[(rs_col, rs_start)] = (rs_left - 1, content)
|
| 136 |
-
if rs_left - 1 <= 0:
|
| 137 |
-
to_remove.append((rs_col, rs_start))
|
| 138 |
-
for k in to_remove:
|
| 139 |
-
del active_rowspans[k]
|
| 140 |
-
# cells
|
| 141 |
-
cells = row.select('td')
|
| 142 |
-
col_idx = 0
|
| 143 |
-
for cell in cells:
|
| 144 |
-
while col_idx < len(days) and col_idx in filled_cols:
|
| 145 |
-
col_idx += 1
|
| 146 |
-
if col_idx >= len(days):
|
| 147 |
-
break
|
| 148 |
-
cell_text = ' '.join(cell.get_text(' ', strip=True).split())
|
| 149 |
-
cell_text = 'kosong' if not cell_text or cell_text == '---' else cell_text
|
| 150 |
-
rowspan = int(cell.get('rowspan', '1') or '1')
|
| 151 |
-
colspan = int(cell.get('colspan', '1') or '1')
|
| 152 |
-
for c in range(colspan):
|
| 153 |
-
cur = col_idx + c
|
| 154 |
-
if cur < len(days):
|
| 155 |
-
grid[days[cur]][current_time] = cell_text
|
| 156 |
-
if rowspan > 1:
|
| 157 |
-
for c in range(colspan):
|
| 158 |
-
active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, cell_text)
|
| 159 |
-
col_idx += colspan
|
| 160 |
-
write_schedule_to_buffer_bs(buf, grid, days, time_slots)
|
| 161 |
-
|
| 162 |
-
def run_parallel():
|
| 163 |
-
# 1) Special Elektro page (single target page)
|
| 164 |
-
try:
|
| 165 |
-
elektro_html = fetch_html_sync(ELEKTRO_URL)
|
| 166 |
-
esoup = BeautifulSoup(elektro_html, 'html.parser')
|
| 167 |
-
tables = esoup.select('table')
|
| 168 |
-
if tables:
|
| 169 |
-
jurusan_id = 'teknik_elektro'
|
| 170 |
-
jurusan_name = 'Jurusan Teknik Elektro'
|
| 171 |
-
for idx, tbl in enumerate(tables):
|
| 172 |
-
process_table(tbl, jurusan_id, jurusan_name, idx)
|
| 173 |
-
except Exception as e:
|
| 174 |
-
print(f"[Jadwal] Error fetching Elektro page: {e}")
|
| 175 |
-
|
| 176 |
-
# 2) Parallel crawl within presensi domain to discover pages and schedule tables
|
| 177 |
-
try:
|
| 178 |
-
crawled: Dict[str, str] = crawl_domain_parallel_sync(
|
| 179 |
-
seed_url=BASE_PRESENSI,
|
| 180 |
-
max_pages=40,
|
| 181 |
-
max_concurrency=6,
|
| 182 |
-
only_important=False, # we need to find 'groups_days_horizontal' links which may not match keywords
|
| 183 |
-
timeout=40,
|
| 184 |
-
headless=True,
|
| 185 |
-
)
|
| 186 |
-
for url, html in crawled.items():
|
| 187 |
-
if not html:
|
| 188 |
-
continue
|
| 189 |
-
try:
|
| 190 |
-
soup = BeautifulSoup(html, 'html.parser')
|
| 191 |
-
# If this page itself is a groups_days_horizontal schedule page, parse tables directly
|
| 192 |
-
if 'groups_days_horizontal' in url and 'subgroups_days_horizontal' not in url:
|
| 193 |
-
title = soup.title.get_text(strip=True) if soup.title else 'Jadwal'
|
| 194 |
-
jurusan_id = title.replace(' ', '_')
|
| 195 |
-
jurusan_name = title
|
| 196 |
-
for idx, tbl in enumerate(soup.select('table[id^="table_"], table')):
|
| 197 |
-
process_table(tbl, jurusan_id=jurusan_id, jurusan_name=jurusan_name, idx=idx)
|
| 198 |
-
continue
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
print(f"✅ Successfully uploaded {filename}")
|
| 230 |
-
elif
|
| 231 |
print(f"⏭️ Skipped upload for {filename} (content unchanged)")
|
| 232 |
else:
|
| 233 |
print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
|
| 234 |
-
except Exception as e:
|
| 235 |
-
print(f"❌ Error uploading {filename}: {e}")
|
| 236 |
-
finally:
|
| 237 |
buffer.close()
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
if __name__ == "__main__":
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import scrapy
|
| 2 |
+
from scrapy.crawler import CrawlerProcess
|
| 3 |
import os
|
| 4 |
import re
|
| 5 |
from datetime import datetime
|
| 6 |
from supabase import create_client
|
| 7 |
from io import StringIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class PnpSpider(scrapy.Spider):
|
| 12 |
+
name = 'pnp_spider'
|
| 13 |
+
allowed_domains = ['presensi.pnp.ac.id', 'elektro.pnp.ac.id']
|
| 14 |
+
start_urls = [
|
| 15 |
+
'https://presensi.pnp.ac.id/',
|
| 16 |
+
'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
excluded_departments = ['elektronika', 'telkom', 'listrik']
|
| 20 |
+
|
| 21 |
+
def __init__(self, *args, **kwargs):
    """Create the spider and its per-run state.

    Builds the Supabase client from NEXT_PUBLIC_SUPABASE_URL and
    SUPABASE_SERVICE_KEY, remembers the bucket named by
    NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET, and prepares the in-memory buffers
    that the parse callbacks fill and ``closed`` uploads.
    """
    super(PnpSpider, self).__init__(*args, **kwargs)

    env = os.environ
    supabase_url = env.get("NEXT_PUBLIC_SUPABASE_URL")
    service_key = env.get("SUPABASE_SERVICE_KEY")
    self.supabase = create_client(supabase_url, service_key)
    self.storage_bucket = env.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")

    # One StringIO per department id; each becomes one uploaded text file.
    self.file_buffers = {}
    self.current_date = datetime.now().strftime("%Y-%m-%d")
|
| 32 |
+
|
| 33 |
+
def closed(self, reason):
    """Upload every buffered document once the crawl has finished.

    Each buffer is sent through ``upload_to_supabase`` under the name
    "<jurusan_id>_<YYYYmmdd_HHMMSS>.txt"; the reported outcome is printed
    and the buffer is closed afterwards.

    Args:
        reason: Close reason passed in by the caller; only printed.
    """
    print(f"Spider closing with reason: {reason}")
    print(f"Uploading {len(self.file_buffers)} files to Supabase...")

    # A single timestamp is shared by all files of this run.
    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    for jurusan_id, buffer in self.file_buffers.items():
        filename = f"{jurusan_id}_{run_stamp}.txt"
        content = buffer.getvalue()
        print(f"Uploading {filename} with content length: {len(content)}")

        result = self.upload_to_supabase(jurusan_id, filename, content)
        outcome = result.get('result')
        if outcome == 'uploaded':
            print(f"✅ Successfully uploaded {filename}")
        elif outcome == 'skipped':
            print(f"⏭️ Skipped upload for {filename} (content unchanged)")
        else:
            print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
        buffer.close()
|
| 49 |
|
| 50 |
+
def upload_to_supabase(self, jurusan_id, filename, content):
    """Store ``content`` in the bucket unless the newest stored copy is identical.

    The newest existing file for ``jurusan_id`` (as reported by
    ``_get_latest_existing_filename``) is downloaded and compared with
    ``content``; an exact match skips the upload. A failed download is only
    logged and the upload proceeds.

    Args:
        jurusan_id: Prefix identifying the department's files.
        filename: Object path to upload to.
        content: Text to upload (encoded as UTF-8).

    Returns:
        dict: {'result': 'uploaded'}, {'result': 'skipped'} or
        {'result': 'error', 'error': <message>}.
    """
    try:
        previous_name = self._get_latest_existing_filename(jurusan_id)
        if previous_name:
            try:
                raw = self.supabase.storage.from_(self.storage_bucket).download(previous_name)
                if isinstance(raw, (bytes, bytearray)):
                    stored_text = raw.decode('utf-8')
                else:
                    stored_text = str(raw)
                if stored_text == content:
                    return {"result": "skipped"}
            except Exception as inner_e:
                # Comparison is best effort: fall through to the upload.
                print(f"Warning: failed to download existing file '{previous_name}' for comparison: {inner_e}")

        payload = content.encode('utf-8')
        self.supabase.storage.from_(self.storage_bucket).upload(
            path=filename,
            file=payload,
            file_options={"content-type": "text/plain"},
        )
        return {"result": "uploaded"}
    except Exception as e:
        return {"result": "error", "error": str(e)}
|
| 79 |
+
|
| 80 |
+
def _get_latest_existing_filename(self, jurusan_id):
|
| 81 |
+
"""Return the latest existing filename in the bucket for a given jurusan_id or None.
|
| 82 |
+
|
| 83 |
+
It expects files following the pattern: f"{jurusan_id}_YYYYMMDD_HHMMSS.txt"
|
| 84 |
+
"""
|
| 85 |
+
try:
|
| 86 |
+
# List files at the root of the bucket
|
| 87 |
+
files = self.supabase.storage.from_(self.storage_bucket).list()
|
| 88 |
+
if not files:
|
| 89 |
+
return None
|
| 90 |
+
|
| 91 |
+
# files could be list of dicts with 'name' key depending on supabase-py version
|
| 92 |
+
names = []
|
| 93 |
+
for f in files:
|
| 94 |
+
try:
|
| 95 |
+
name = f.get('name') if isinstance(f, dict) else getattr(f, 'name', None)
|
| 96 |
+
except Exception:
|
| 97 |
+
name = None
|
| 98 |
+
if not name:
|
| 99 |
+
continue
|
| 100 |
+
names.append(name)
|
| 101 |
+
|
| 102 |
+
# Filter by jurusan_id prefix and timestamp pattern
|
| 103 |
+
pattern = re.compile(rf"^{re.escape(jurusan_id)}_\d{{8}}_\d{{6}}\.txt$")
|
| 104 |
+
matched = [n for n in names if pattern.match(n)]
|
| 105 |
+
if not matched:
|
| 106 |
+
return None
|
| 107 |
+
|
| 108 |
+
# Sort by timestamp extracted from filename
|
| 109 |
+
def extract_ts(name: str):
|
| 110 |
+
m = re.search(r"_(\d{8}_\d{6})\.txt$", name)
|
| 111 |
+
return m.group(1) if m else "00000000_000000"
|
| 112 |
+
|
| 113 |
+
matched.sort(key=extract_ts, reverse=True)
|
| 114 |
+
return matched[0]
|
| 115 |
+
except Exception as e:
|
| 116 |
+
print(f"Warning: could not list existing files for comparison: {e}")
|
| 117 |
+
return None
|
| 118 |
+
|
| 119 |
+
def parse(self, response):
    """Dispatch a start page: either the Elektro timetable or the index page.

    URLs on elektro.pnp.ac.id are handed straight to ``parse_elektro_page``.
    Because this function is a generator, that call happens when the result
    is iterated, and nothing is yielded for it. For any other page, one
    request per distinct department link is yielded, skipping links that
    contain one of ``excluded_departments``.
    """
    if 'elektro.pnp.ac.id' in response.url:
        return self.parse_elektro_page(response, 'teknik_elektro', 'Jurusan Teknik Elektro')

    print("Memulai scraping dari halaman utama...")
    hrefs = response.xpath('//article[contains(@class, "section")]//a/@href').getall()

    # set() removes duplicate links before any request is created.
    for link in set(hrefs):
        lowered = link.lower()
        if any(excluded in lowered for excluded in self.excluded_departments):
            continue

        yield scrapy.Request(
            response.urljoin(link),
            callback=self.parse_jurusan,
            meta={'jurusan_id': self.extract_jurusan_id(link)},
        )
|
| 137 |
+
|
| 138 |
+
def parse_elektro_page(self, response, jurusan_id, jurusan_name):
    """Convert every timetable on an Elektro page into buffered text.

    The department buffer is created on first use. Tables without class
    information, without day headers or without time-slot headers are
    skipped.

    Args:
        response: Page response whose ``//table`` elements are read.
        jurusan_id: Key of the buffer to append to.
        jurusan_name: Display name used when the buffer is first created.
    """
    if jurusan_id not in self.file_buffers:
        self.initialize_document_buffer(jurusan_id, jurusan_name)
    output_buffer = self.file_buffers[jurusan_id]

    tables = response.xpath('//table')
    if not tables:
        return

    for table_idx, table in enumerate(tables):
        class_info = self.clean_class_info(self.get_table_caption(table, table_idx), table)
        if not class_info:
            continue

        self.write_section_header(output_buffer, class_info)

        # Try the exact class first, then fall back to a "contains" match.
        days = (
            table.xpath('.//thead//th[@class="xAxis"]/text()').getall()
            or table.xpath('.//thead//th[contains(@class, "xAxis")]/text()').getall()
        )
        time_slots = (
            table.xpath('.//tbody//th[@class="yAxis"]/text()').getall()
            or table.xpath('.//tbody//th[contains(@class, "yAxis")]/text()').getall()
        )
        if not (days and time_slots):
            continue

        schedule_grid = self.build_schedule_grid(days, time_slots)
        self.process_table_rows(table, schedule_grid, days, time_slots)
        self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
|
| 168 |
+
|
| 169 |
+
def initialize_document_buffer(self, jurusan_id, jurusan_name):
    """Create the buffer for ``jurusan_id`` and write the document preamble.

    The preamble holds the title, department name, ``self.current_date``
    and the source, followed by a horizontal rule. An existing buffer under
    the same key is replaced.
    """
    document = StringIO()
    self.file_buffers[jurusan_id] = document

    document.writelines((
        f"# Jadwal Perkuliahan {jurusan_name}\n\n",
        f"**Jurusan:** {jurusan_name}\n",
        f"**Tanggal Update:** {self.current_date}\n",
        "**Sumber:** Politeknik Negeri Padang\n\n",
        "---\n\n",
    ))
|
| 180 |
+
|
| 181 |
+
def get_table_caption(self, table, table_idx):
    """Return a caption for ``table``.

    Preference order: the table's own <caption> text, then the first text
    node returned by the preceding h2/h3/h4 query, then the generic label
    "Jadwal Kelas <table_idx + 1>".
    """
    own_caption = ' '.join(table.xpath('.//caption//text()').getall()).strip()
    if own_caption:
        return own_caption

    heading = table.xpath(
        'preceding::h2[1]//text()|preceding::h3[1]//text()|preceding::h4[1]//text()'
    ).get()
    if heading:
        return heading.strip()
    return f"Jadwal Kelas {table_idx + 1}"
|
| 191 |
+
|
| 192 |
+
def clean_class_info(self, caption_text, table):
    """Join the caption with the first header row and collapse whitespace.

    Returns:
        str: ``caption_text`` followed by the text of ``thead/tr[1]`` (when
        that row has any text), with every whitespace run reduced to one
        space and the ends trimmed.
    """
    header_text = ' '.join(table.xpath('.//thead/tr[1]//text()').getall()).strip()

    combined = caption_text
    if header_text:
        combined = f"{caption_text} {header_text}"

    collapsed = re.sub(r'\s+', ' ', combined)
    return collapsed.strip()
|
| 197 |
+
|
| 198 |
+
def write_section_header(self, buffer, class_info):
    """Append the heading and intro sentence that open one class schedule."""
    buffer.writelines((
        f"## Jadwal Perkuliahan {class_info}\n\n",
        "Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n",
    ))
|
| 202 |
+
|
| 203 |
+
def build_schedule_grid(self, days, time_slots):
    """Return a day -> time slot -> content mapping with every cell 'kosong'.

    Each day gets its own inner dict, so cells of different days can be
    updated independently.
    """
    grid = {}
    for day in days:
        grid[day] = dict.fromkeys(time_slots, 'kosong')
    return grid
|
| 206 |
+
|
| 207 |
+
def process_table_rows(self, table, schedule_grid, days, time_slots):
    """Fill ``schedule_grid`` from the table body, honouring row/col spans.

    Body rows (excluding rows whose class contains "foot") are paired with
    ``time_slots`` by position; rows beyond the last time slot are ignored.
    For each row, cells carried over by earlier rowspans are applied first,
    then the row's own <td> cells are placed into the remaining columns.
    """
    carried_spans = {}
    day_count = len(days)
    rows = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]')

    # zip() stops at the shorter sequence, which drops surplus rows.
    for row_idx, (current_time, row) in enumerate(zip(time_slots, rows)):
        filled_columns = set()
        self.apply_active_rowspans(
            carried_spans, schedule_grid, days, current_time, filled_columns, row_idx
        )

        col_idx = 0
        for cell in row.xpath('./td'):
            # Skip columns already occupied by a rowspan from above.
            while col_idx < day_count and col_idx in filled_columns:
                col_idx += 1
            if col_idx >= day_count:
                break

            cell_content = self.process_cell_content(cell)
            rowspan = int(cell.xpath('./@rowspan').get() or 1)
            colspan = int(cell.xpath('./@colspan').get() or 1)

            self.update_schedule_grid(schedule_grid, days, current_time, col_idx, colspan, cell_content)
            self.update_active_rowspans(carried_spans, row_idx, col_idx, colspan, rowspan, cell_content)
            col_idx += colspan
|
| 241 |
+
|
| 242 |
+
def apply_active_rowspans(self, active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx):
    """Copy cells that span down from earlier rows into the current row.

    ``active_rowspans`` maps (column, start_row) to (rows_left, content).
    Every entry with rows left and a column inside ``days`` writes its
    content at ``current_time``, marks the column in ``filled_columns`` and
    has its counter decremented; entries that reach zero are removed.
    ``row_idx`` is accepted but not used.
    """
    finished = []
    day_count = len(days)

    for key, (rows_left, content) in active_rowspans.items():
        column = key[0]
        if rows_left <= 0 or column >= day_count:
            continue

        schedule_grid[days[column]][current_time] = content
        filled_columns.add(column)

        rows_left -= 1
        # Re-assigning an existing key keeps the dict size unchanged, so
        # this is safe during iteration; removals are deferred below.
        active_rowspans[key] = (rows_left, content)
        if rows_left <= 0:
            finished.append(key)

    for key in finished:
        del active_rowspans[key]
|
| 258 |
+
|
| 259 |
+
def process_cell_content(self, cell):
    """Return the cell's joined, trimmed text, or 'kosong' for an empty cell.

    A cell counts as empty when it has no text or its text is exactly '---'.
    """
    text = ' '.join(cell.xpath('.//text()').getall()).strip()
    if text in ('', '---'):
        return 'kosong'
    return text
|
| 263 |
+
|
| 264 |
+
def update_schedule_grid(self, schedule_grid, days, current_time, col_idx, colspan, content):
    """Write ``content`` at ``current_time`` for each day covered by the cell.

    The cell covers ``colspan`` columns starting at ``col_idx``; columns
    past the last day are ignored.
    """
    last_exclusive = min(col_idx + colspan, len(days))
    for column in range(col_idx, last_exclusive):
        schedule_grid[days[column]][current_time] = content
|
| 270 |
+
|
| 271 |
+
def update_active_rowspans(self, active_rowspans, row_idx, col_idx, colspan, rowspan, content):
    """Register a multi-row cell so later rows can inherit its content.

    Nothing is recorded for cells with ``rowspan`` of 1 or less. Otherwise
    each covered column gets an entry keyed by (column, row_idx) holding
    the number of further rows to fill and the content.
    """
    if rowspan <= 1:
        return

    pending = (rowspan - 1, content)
    active_rowspans.update(
        {(col_idx + offset, row_idx): pending for offset in range(colspan)}
    )
|
| 276 |
+
|
| 277 |
+
def format_course_entry(self, time_slots, course_info):
    """Split a raw cell string into code, name, lecturer, room and time range.

    Two layouts are recognised:

    * text containing "_": split at "_P"; the part before it (minus the
      course code) is the name, and of the part after it the last word is
      the room and the rest is the lecturer. Without a "_P" marker these
      fields stay empty.
    * text without "_": the last two words are lecturer and room, the words
      between the first word and those are the name.

    The course code is the first word when it is three letters followed by
    four digits, otherwise "".

    Returns:
        dict: keys "time_range", "course_code", "course_name", "lecturer"
        and "room".
    """
    words = course_info.split()
    first_word = words[0] if words else ""
    is_code = len(first_word) == 7 and first_word[:3].isalpha() and first_word[3:].isdigit()
    course_code = first_word if is_code else ""

    course_name = lecturer = room = ""

    if "_" not in course_info:
        # Plain layout: "<code> <name words...> <lecturer> <room>".
        if len(words) > 3:
            course_name = " ".join(words[1:-2])
        else:
            course_name = course_info.replace(course_code, "").strip()
        if len(words) > 1:
            lecturer = words[-2]
        if words:
            room = words[-1]
    else:
        # Marked layout: "<code> <Course_Name>_P <lecturer...> <room>".
        segments = course_info.split("_P")
        if len(segments) > 1:
            course_name = segments[0].replace(course_code, "").strip()
            tail_words = segments[1].strip().split()
            lecturer = " ".join(tail_words[:-1])
            if tail_words:
                room = tail_words[-1]

    return {
        "time_range": self.format_time_range(time_slots),
        "course_code": course_code,
        "course_name": course_name,
        "lecturer": lecturer,
        "room": room,
    }
|
| 312 |
+
|
| 313 |
+
def write_schedule_to_buffer(self, buffer, schedule_grid, days, time_slots):
    """Write one bullet line per block of consecutive identical cells.

    For each day, neighbouring time slots holding the same content are
    merged into one entry "- <day> <time range> | <content>". Empty content
    and 'kosong' (any letter case) are not written. A blank line follows
    every day, including days without entries.
    """
    for day in days:
        cells = schedule_grid[day]

        # Group consecutive slots by content: [content, [slot, ...]] pairs.
        runs = []
        for slot in time_slots:
            course = cells[slot]
            if runs and runs[-1][0] == course:
                runs[-1][1].append(slot)
            else:
                runs.append([course, [slot]])

        for course, slots in runs:
            if not course or course.lower() == 'kosong':
                continue
            time_range = self.format_time_range(slots)
            buffer.write(f"- {day} {time_range} | {course}" + "\n")

        buffer.write("\n")  # blank line between days
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
def format_time_range(self, time_slots):
    """Collapse a run of time slots into a single readable range.

    Args:
        time_slots: Ordered slot labels; each label is split on '-' to find
            its start and end (presumably "HH:MM - HH:MM" — confirm against
            the scraped pages).

    Returns:
        str: "" for an empty list, the label itself for a single slot,
        otherwise "<start of first slot> - <end of last slot>".
    """
    # An empty list used to raise IndexError on time_slots[0].
    if not time_slots:
        return ""
    if len(time_slots) == 1:
        return time_slots[0]

    first_start = time_slots[0].split('-')[0].strip()
    last_end = time_slots[-1].split('-')[-1].strip()
    return f"{first_start} - {last_end}"
|
| 353 |
+
|
| 354 |
+
def extract_jurusan_id(self, link):
    """Return the department id encoded in ``link``.

    Args:
        link: Href taken from the index page.

    Returns:
        str: The digits of a "department?dep=<digits>" query when present,
        otherwise "unknown_<n>" where <n> is a checksum of the link in the
        range 0-999.
    """
    # Local import: zlib is not among this module's top-level imports.
    import zlib

    match = re.search(r'department\?dep=(\d+)', link)
    if match:
        return match.group(1)

    # The built-in hash() of a str is randomized per process, which made the
    # fallback id differ between runs; CRC32 gives the same id every time.
    checksum = zlib.crc32(link.encode('utf-8'))
    return f"unknown_{checksum % 1000}"
|
| 357 |
+
|
| 358 |
+
def parse_jurusan(self, response):
    """Follow a department page to its "groups_days_horizontal" timetable.

    The first matching link that is not a "subgroups_days_horizontal" link
    is requested with ``parse_jadwal`` as callback. The request meta carries
    the page title as 'jurusan_name' and a sanitised copy of it (characters
    other than word characters, '-', '_', '.', and space replaced by '_')
    as 'jurusan_id'. Nothing is yielded when no such link exists.
    """
    jurusan_name = self.extract_title_jurusan_name(response)

    timetable_href = response.xpath(
        '//td/a[contains(@href, "groups_days_horizontal") and not(contains(@href, "subgroups_days_horizontal"))]/@href'
    ).get()
    if not timetable_href:
        return

    yield scrapy.Request(
        response.urljoin(timetable_href),
        callback=self.parse_jadwal,
        meta={
            'jurusan_id': re.sub(r'[^\w\-_\. ]', '_', jurusan_name),
            'jurusan_name': jurusan_name,
        },
    )
|
| 371 |
+
|
| 372 |
+
def parse_jadwal(self, response):
    """Convert every timetable on a department schedule page into text.

    The buffer is keyed by the 'jurusan_id' found in the request meta and is
    created on first use with 'jurusan_name'. Tables whose id contains
    "table_" are preferred; if there are none, every table on the page is
    used. Tables without class information, day headers or time-slot
    headers are skipped.
    """
    jurusan_id = response.meta.get('jurusan_id')
    jurusan_name = response.meta.get('jurusan_name')

    if jurusan_id not in self.file_buffers:
        self.initialize_document_buffer(jurusan_id, jurusan_name)

    output_buffer = self.file_buffers[jurusan_id]
    tables = response.xpath('//table[contains(@id, "table_")]') or response.xpath('//table')

    # Pass the real table position so the fallback caption is numbered per
    # table; a constant 0 labelled every caption-less table "Jadwal Kelas 1".
    for table_idx, table in enumerate(tables):
        caption_text = self.get_table_caption(table, table_idx)
        class_info = self.clean_class_info(caption_text, table)

        if not class_info:
            continue

        self.write_section_header(output_buffer, class_info)

        days = table.xpath('.//thead//th[@class="xAxis"]/text()').getall()
        time_slots = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]/th[@class="yAxis"]/text()').getall()

        if not days or not time_slots:
            continue

        schedule_grid = self.build_schedule_grid(days, time_slots)
        self.process_table_rows(table, schedule_grid, days, time_slots)
        self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
|
| 400 |
+
|
| 401 |
+
def extract_title_jurusan_name(self, response):
    """Return the trimmed page <title>, or "Jurusan_<id>" when there is none.

    The fallback id is the 'jurusan_id' value of ``response.meta``.
    """
    page_title = response.xpath('//title/text()').get()
    if page_title:
        return page_title.strip()
    return f"Jurusan_{response.meta.get('jurusan_id')}"
|
| 404 |
|
| 405 |
if __name__ == "__main__":
    # Script entry point: crawl with PnpSpider using one request at a time
    # and a one-second download delay.
    crawler_settings = {
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        'ROBOTSTXT_OBEY': True,
        'LOG_LEVEL': 'INFO',
        'HTTPCACHE_ENABLED': False,
        'CONCURRENT_REQUESTS': 1,
        'RETRY_TIMES': 3,
    }
    process = CrawlerProcess(settings=crawler_settings)
    process.crawl(PnpSpider)
    process.start()
|
scrapping/jurusan_scrap.py
CHANGED
|
@@ -1,130 +1,326 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
| 3 |
-
from datetime import datetime
|
| 4 |
-
from typing import Dict, List
|
| 5 |
-
|
| 6 |
from bs4 import BeautifulSoup
|
|
|
|
| 7 |
from supabase import create_client
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
try:
|
| 11 |
-
from utils.crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
|
| 12 |
-
except Exception:
|
| 13 |
-
import sys as _sys
|
| 14 |
-
import os as _os
|
| 15 |
-
_sys.path.append(_os.path.join(_os.path.dirname(__file__), 'utils'))
|
| 16 |
-
from crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
|
| 17 |
-
|
| 18 |
-
# Shared dedup upload utility
|
| 19 |
try:
|
| 20 |
from utils.supabase_utils import upload_if_changed
|
| 21 |
except Exception:
|
| 22 |
-
|
| 23 |
-
import os as _os2
|
| 24 |
-
_sys2.path.append(_os2.path.join(_os2.path.dirname(__file__), 'utils'))
|
| 25 |
from supabase_utils import upload_if_changed
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
try:
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
except Exception as e:
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
if status == 'uploaded':
|
| 123 |
-
print(f"✅ Uploaded rekap: {rekap_filename}")
|
| 124 |
-
elif status == 'skipped':
|
| 125 |
-
print(f"⏭️ Skipped upload (unchanged): {rekap_filename}")
|
| 126 |
-
else:
|
| 127 |
-
print(f"❌ Upload error for {rekap_filename}: {result.get('error')}")
|
| 128 |
-
except Exception as e:
|
| 129 |
-
print(f"❌ Error uploading rekap: {e}")
|
| 130 |
-
# End of minimal Crawl4AI rekap script
|
|
|
|
| 1 |
+
import scrapy
|
| 2 |
+
from scrapy.crawler import CrawlerProcess
|
|
|
|
|
|
|
|
|
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
from supabase import create_client
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
import os, re, tempfile
|
| 8 |
+
import sys
|
| 9 |
|
| 10 |
+
# Try import shared dedup upload utility
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
try:
|
| 12 |
from utils.supabase_utils import upload_if_changed
|
| 13 |
except Exception:
|
| 14 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
|
|
|
|
|
|
|
| 15 |
from supabase_utils import upload_if_changed
|
| 16 |
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
|
| 20 |
+
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
|
| 21 |
+
SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Case-insensitive prefixes that mark a link text as a study-programme name.
_PRODI_PREFIX = re.compile(
    r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b',
    re.I,
)


def is_valid_prodi(nama):
    """Return True when ``nama`` starts with a recognised programme prefix.

    The prefix must end at a word boundary, so "D3 Teknik" matches while
    "D3Teknik" does not.
    """
    return _PRODI_PREFIX.match(nama) is not None
|
| 29 |
+
|
| 30 |
+
class JurusanSpider(scrapy.Spider):
|
| 31 |
+
name = "jurusan"
|
| 32 |
+
custom_settings = {
|
| 33 |
+
'DOWNLOAD_DELAY': 1,
|
| 34 |
+
'USER_AGENT': "PNPBot/1.2",
|
| 35 |
+
'ROBOTSTXT_OBEY': True,
|
| 36 |
+
'LOG_LEVEL': 'INFO',
|
| 37 |
+
'CONCURRENT_REQUESTS': 1,
|
| 38 |
+
'RETRY_TIMES': 3
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
domain_to_name = {
|
| 42 |
+
'akt.pnp.ac.id': 'Akuntansi',
|
| 43 |
+
'an.pnp.ac.id': 'Administrasi_Niaga',
|
| 44 |
+
'bing.pnp.ac.id': 'Bahasa_Inggris',
|
| 45 |
+
'elektro.pnp.ac.id': 'Teknik_Elektro',
|
| 46 |
+
'me.pnp.ac.id': 'Teknik_Mesin',
|
| 47 |
+
'sipil.pnp.ac.id': 'Teknik_Sipil',
|
| 48 |
+
'ti.pnp.ac.id': 'Teknologi_Informasi',
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
|
| 52 |
+
|
| 53 |
+
def __init__(self, *args, **kwargs):
    """Create the spider, its Supabase client and its per-run collections.

    Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
    that spider arguments are accepted instead of raising TypeError, and so
    that the base class initialiser actually runs.
    """
    super(JurusanSpider, self).__init__(*args, **kwargs)

    self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
    self.bucket = SUPABASE_BUCKET
    # Minute-resolution stamp created once per run.
    self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    self.per_jurusan_pages = {}
    # Department name -> list of study-programme names found on its site.
    self.rekap_prodi = {}
|
| 59 |
+
|
| 60 |
+
def parse(self, response):
    """Collect study programmes from a department home page and follow links.

    The host part of the URL selects the department name (falling back to
    the host itself). Every anchor whose text passes ``is_valid_prodi`` is
    recorded once in ``self.rekap_prodi`` and, when it has an href, requested
    with ``parse_detail``. Afterwards every absolute link containing the
    host and every link starting with "/" is requested as well.
    """
    domain = response.url.split("//")[1].split("/")[0]
    jurusan = self.domain_to_name.get(domain, domain)
    soup = BeautifulSoup(response.text, "html.parser")

    program_studi = []

    # Pass 1: anchors whose text looks like a programme name (D3, D4, ...).
    for anchor in soup.find_all("a"):
        label = anchor.get_text(strip=True)
        target = anchor.get("href")
        if not label or label in program_studi or not is_valid_prodi(label):
            continue

        program_studi.append(label)
        if not target:
            continue

        prodi_url = response.urljoin(target)
        self.logger.info(f"[🧩] Ditemukan prodi: {label} ({prodi_url}) di jurusan {jurusan}")
        yield scrapy.Request(
            prodi_url,
            callback=self.parse_detail,
            meta={"jurusan": jurusan, "url": prodi_url},
        )

    # Keep the programme list for the per-department recap.
    self.rekap_prodi[jurusan] = program_studi

    # Pass 2: follow the remaining same-site links as a fallback crawl.
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if href.startswith("http") and domain in href:
            follow_url = href
        elif href.startswith("/"):
            follow_url = response.urljoin(href)
        else:
            continue
        yield scrapy.Request(
            follow_url,
            callback=self.parse_detail,
            meta={"jurusan": jurusan, "url": follow_url},
        )
|
| 88 |
+
|
| 89 |
+
def parse_detail(self, response):
    """Turn one department page into a text document and keep it in memory.

    Reads ``jurusan`` and ``url`` from ``response.meta``, parses the HTML with
    BeautifulSoup, and appends a ``{"url", "title", "content"}`` dict to
    ``self.per_jurusan_pages[jurusan]``. Nothing is returned or yielded.

    Two URLs of the TI department get dedicated parsing (leadership page and
    lecturer gallery); every other page goes through the generic text + table
    extraction of the main content area.
    """
    jurusan = response.meta["jurusan"]
    url = response.meta["url"]
    soup = BeautifulSoup(response.text, "html.parser")
    accessed_at = datetime.now().strftime('%d %B %Y %H:%M')

    # Pick the main content area: among the usual content containers, the one
    # holding the most text wins; fall back to <body> (or the whole document).
    candidates = soup.select(
        "main, article, #content, #primary, .site-content, .entry-content, .post-content, .content, .page-content, .container main, .elementor-section.elementor-top-section, .elementor-container, .elementor-widget-theme-post-content"
    )

    def text_len(el):
        try:
            return len(el.get_text(" ", strip=True))
        except Exception:
            return 0

    main_area = max(candidates, key=text_len) if candidates else soup.body or soup

    # Strip navigation, chrome, scripts, ads and similar noise from the main area.
    blacklist_selectors = [
        'header', 'footer', 'nav', 'aside', 'menu', 'form',
        '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
        '.site-header', '.site-footer', '#site-header', '#colophon', '.widget', '.widget-area',
        '.breadcrumbs', '.pagination', '.navigation', '.page-links',
        'script', 'style', 'noscript', 'iframe',
        '.social-links', '.share-buttons', '.newsletter',
        '.ad-container', '.ads', '.advert', '[role="navigation"]', '[aria-label*="breadcrumb" i]'
    ]
    for selector in blacklist_selectors:
        for tag in main_area.select(selector):
            tag.decompose()

    # Remove leftover empty leaf elements inside the main area.
    # <img> is deliberately kept: it has no text and no children, so it would
    # always be pruned here, and the lecturer-gallery branch below would then
    # never find a photo URL.
    for element in list(main_area.find_all(True)):
        if element.name == "img":
            continue
        if not element.get_text(strip=True) and not element.find_all(True):
            element.decompose()

    title_tag = main_area.find("h1") or soup.find("title")
    page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"

    # ==== SPECIAL CASE: TI department leadership page ====
    if url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
        leadership_data = {
            "Pimpinan Jurusan": [],
            "Koordinator Program Studi": [],
            "Kepala Labor": []
        }

        for member in soup.find_all(class_="member-item"):
            name_tag = member.find(class_="item-title")
            name = name_tag.get_text(strip=True) if name_tag else "N/A"
            position_tag = member.find(class_="small-text")
            position = position_tag.get_text(strip=True) if position_tag else "N/A"

            # Bucket each member by keywords found in the position text.
            if "Ketua Jurusan" in position or "Sekretaris Jurusan" in position:
                leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
            elif "Koordinator Program Studi" in position or "Koordinator PSDKU" in position:
                leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
            elif "Kepala Labor" in position:
                leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})

        naratif = ["## Pimpinan Jurusan"]
        for leader in leadership_data["Pimpinan Jurusan"]:
            naratif.append(f"- {leader['jabatan']}: {leader['nama']}")

        naratif.append("\n## Koordinator Program Studi")
        for coordinator in leadership_data["Koordinator Program Studi"]:
            naratif.append(f"- {coordinator['jabatan']}: {coordinator['nama']}")

        naratif.append("\n## Kepala Labor")
        for lab_head in leadership_data["Kepala Labor"]:
            naratif.append(f"- {lab_head['jabatan']}: {lab_head['nama']}")

        content_text = (
            "# Pimpinan Jurusan Teknologi Informasi\n"
            "\n"
            f"URL: {url}\n"
            "Jurusan: Teknologi Informasi\n"
            f"Tanggal Akses: {accessed_at}\n"
            "\n"
        ) + "\n".join(naratif)

        self.per_jurusan_pages.setdefault(jurusan, []).append({
            "url": url,
            "title": "Pimpinan Jurusan Teknologi Informasi",
            "content": content_text
        })
        return

    # ==== SPECIAL CASE: TI department lecturer gallery ====
    if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
        dosen_data = []
        gallery = soup.find('div', class_='gallery')
        if gallery:
            for item in gallery.find_all('dl', class_='gallery-item'):
                caption = item.find('dd', class_='wp-caption-text')
                link_tag = item.find('a')
                img_tag = item.find('img')
                dosen_data.append({
                    "nama_gelar": caption.get_text(strip=True) if caption else "",
                    "link_profil": link_tag['href'] if link_tag and link_tag.has_attr('href') else "",
                    "foto": img_tag['src'] if img_tag and img_tag.has_attr('src') else ""
                })

        parts = [
            "# Daftar Dosen Staf Pengajar Jurusan Teknologi Informasi\n"
            "\n"
            f"URL: {url}\n"
            "Jurusan: Teknologi Informasi\n"
            f"Tanggal Akses: {accessed_at}\n"
            f"Jumlah Dosen: {len(dosen_data)}\n"
            "\n"
            "## Daftar Dosen:\n"
        ]
        for idx, dosen in enumerate(dosen_data, 1):
            parts.append(f"\n### {idx}. {dosen['nama_gelar']}")
            if dosen['link_profil']:
                parts.append(f"\n- Link Profil: {dosen['link_profil']}")
            if dosen['foto']:
                parts.append(f"\n- Foto: {dosen['foto']}")
            parts.append("\n")
        content_text = "".join(parts)

        self.per_jurusan_pages.setdefault(jurusan, []).append({
            "url": url,
            "title": "Daftar Dosen Staf Pengajar Jurusan Teknologi Informasi",
            "content": content_text
        })
        return

    # ==== STANDARD PARSING ====
    body_text = []
    for p in main_area.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
        txt = p.get_text(strip=True)
        if txt:
            body_text.append(txt)

    jurusan_label = jurusan.replace('_', ' ')
    content_text = (
        f"# {page_title}\n"
        "\n"
        f"URL: {url}\n"
        f"Jurusan: {jurusan_label}\n"
        f"Tanggal Akses: {accessed_at}\n"
        "\n"
    ) + "\n\n".join(body_text)

    # Append every table found in the main area as pipe-separated rows.
    for i, table in enumerate(main_area.find_all("table")):
        content_text += f"\n\nTabel {i+1}\n\n"
        for row in table.find_all("tr"):
            cols = row.find_all(["td", "th"])
            row_data = [col.get_text(strip=True) for col in cols]
            content_text += " | ".join(row_data) + "\n"

    self.per_jurusan_pages.setdefault(jurusan, []).append({
        "url": url,
        "title": page_title,
        "content": content_text
    })
|
| 247 |
+
|
| 248 |
+
def closed(self, reason):
    """Upload the collected pages when the spider closes.

    Writes one text document per department (all pages joined with a
    ``---`` separator) and one recap document listing the study programs per
    department, and hands each to ``upload_if_changed``. Every document is
    round-tripped through a temporary file before upload; the temporary file
    is always removed afterwards.

    Args:
        reason: Close reason passed by the caller; not used here.
    """
    # One file per department.
    for jurusan, pages in self.per_jurusan_pages.items():
        filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
        # Reset per iteration so ``finally`` never sees an unbound or stale path.
        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
                # Record the path before writing so a failed write still gets cleaned up.
                temp_path = f.name
                for page in pages:
                    f.write(page["content"] + "\n\n---\n\n")
            # Read content back to ensure consistent comparison behavior
            with open(temp_path, 'r', encoding='utf-8') as rf:
                content_text = rf.read()
            result = upload_if_changed(self.supabase, self.bucket, filename, content_text)
            if result.get('result') == 'uploaded':
                self.logger.info(f"✅ Uploaded file jurusan: {filename}")
            elif result.get('result') == 'skipped':
                self.logger.info(f"⏭️ Skipped upload for {filename} (content unchanged)")
            else:
                self.logger.error(f"❌ Gagal upload {filename}: {result.get('error')}")
        except Exception as e:
            self.logger.error(f"❌ Gagal upload {filename}: {e}")
        finally:
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

    # Study-program recap across all departments.
    rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
            temp_path = f.name
            f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")

            total_prodi = 0
            jumlah_jurusan = 0

            for jurusan, daftar in self.rekap_prodi.items():
                valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]

                # Departments without any valid program are left out of the recap.
                if not valid_prodi:
                    continue

                # The printed list and the printed count use the same
                # de-duplicated collection so they can never disagree.
                unique_prodi = sorted(set(valid_prodi))

                jurusan_baca = jurusan.replace("_", " ")
                f.write(f"{jurusan_baca}:\n")
                for p in unique_prodi:
                    f.write(f"- {p}\n")
                jumlah_prodi = len(unique_prodi)
                f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")

                total_prodi += jumlah_prodi
                jumlah_jurusan += 1

            f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
            f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")

        # Read content then use dedup upload
        with open(temp_path, 'r', encoding='utf-8') as rf:
            rekap_text = rf.read()
        result = upload_if_changed(self.supabase, self.bucket, rekap_filename, rekap_text)
        if result.get('result') == 'uploaded':
            self.logger.info(f"✅ Uploaded file rekap: {rekap_filename}")
        elif result.get('result') == 'skipped':
            self.logger.info(f"⏭️ Skipped upload for rekap {rekap_filename} (content unchanged)")
        else:
            self.logger.error(f"❌ Gagal upload rekap {rekap_filename}: {result.get('error')}")

    except Exception as e:
        self.logger.error(f"❌ Gagal upload rekap: {e}")
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
|
| 322 |
+
|
| 323 |
+
if __name__ == "__main__":
    # Script entry point: run JurusanSpider through a CrawlerProcess.
    crawler = CrawlerProcess()
    crawler.crawl(JurusanSpider)
    crawler.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scrapping/pnp_scrap.py
CHANGED
|
@@ -1,18 +1,10 @@
|
|
|
|
|
|
|
|
| 1 |
from datetime import datetime
|
| 2 |
import re
|
| 3 |
import os
|
| 4 |
from supabase import create_client, Client
|
| 5 |
import html
|
| 6 |
-
from typing import List
|
| 7 |
-
from urllib.parse import urljoin
|
| 8 |
-
|
| 9 |
-
from bs4 import BeautifulSoup
|
| 10 |
-
try:
|
| 11 |
-
from utils.crawl4ai_utils import crawl_domain_parallel_sync
|
| 12 |
-
except Exception:
|
| 13 |
-
import sys
|
| 14 |
-
sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
|
| 15 |
-
from crawl4ai_utils import crawl_domain_parallel_sync
|
| 16 |
|
| 17 |
SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
|
| 18 |
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
|
|
@@ -28,103 +20,428 @@ except Exception:
|
|
| 28 |
from supabase_utils import upload_if_changed
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
def
|
|
|
|
| 37 |
if not text:
|
| 38 |
-
return
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
'div.entry-content', 'article.post', 'main.site-main',
|
| 48 |
'div.content', 'div.main-content', 'div#content', 'div.page-content'
|
| 49 |
]
|
| 50 |
-
|
| 51 |
-
for
|
| 52 |
-
content_area =
|
| 53 |
if content_area:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
for
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
|
| 98 |
safe_title = re.sub(r'[-\s]+', '-', safe_title)
|
| 99 |
-
timestamp = datetime.now().strftime(
|
| 100 |
filename = f"{safe_title}_{timestamp}.txt"
|
| 101 |
try:
|
| 102 |
result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
except Exception as e:
|
| 105 |
-
|
| 106 |
return f"failed_{filename}"
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import scrapy
|
| 2 |
+
from scrapy.crawler import CrawlerProcess
|
| 3 |
from datetime import datetime
|
| 4 |
import re
|
| 5 |
import os
|
| 6 |
from supabase import create_client, Client
|
| 7 |
import html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
|
| 10 |
SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
|
|
|
|
| 20 |
from supabase_utils import upload_if_changed
|
| 21 |
|
| 22 |
|
| 23 |
+
class PNPContentSpider(scrapy.Spider):
    """Spider that walks the navigation menus of the PNP sites and uploads page text."""

    name = "pnp_content_spider"
    start_urls = [
        "https://www.pnp.ac.id",
        "https://penerimaan.pnp.ac.id",
    ]

    # Department subdomains. NOTE(review): no method visible in this part of
    # the file reads this list - confirm where (or whether) it is applied.
    excluded_subdomains = [
        "akt.pnp.ac.id",
        "an.pnp.ac.id",
        "bing.pnp.ac.id",
        "elektro.pnp.ac.id",
        "me.pnp.ac.id",
        "sipil.pnp.ac.id",
        "ti.pnp.ac.id",
    ]

    # Per-spider Scrapy settings: one request at a time with a delay,
    # retries enabled, robots.txt honoured, HTTP cache off.
    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RETRY_TIMES": 3,
        "HTTPCACHE_ENABLED": False,
        "ROBOTSTXT_OBEY": True,
        "CONCURRENT_REQUESTS": 1,
        "RETRY_ENABLED": True,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "LOG_LEVEL": "INFO",
    }
|
| 47 |
|
| 48 |
+
def clean_text(self, text: str) -> str:
    """Clean and normalize text content.

    Decodes HTML entities, collapses every run of whitespace to a single
    space, and repairs common mojibake left behind when UTF-8 punctuation
    was decoded as Windows-1252 (curly quotes and dashes).

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    if not text:
        return ""

    # Decode HTML entities
    text = html.unescape(text)

    # Remove extra whitespace and normalize
    text = ' '.join(text.split())

    # Fix common encoding issues. The sequences are written with explicit
    # escapes because several of them contain characters that are invisible
    # or easily altered in an editor. Order matters: every broken sequence
    # starts with the same two characters, so the specific three-character
    # forms must be replaced before the bare two-character prefix.
    mojibake_fixes = (
        ('\u00e2\u20ac\u0153', '"'),       # left double quotation mark
        ('\u00e2\u20ac\u009d', '"'),       # right double quotation mark
        ('\u00e2\u20ac\u2122', "'"),       # right single quotation mark
        ('\u00e2\u20ac\u201d', '\u2014'),  # em dash
        ('\u00e2\u20ac\u201c', '\u2013'),  # en dash
        ('\u00e2\u20ac', '"'),             # prefix alone (third character lost)
    )
    for broken, fixed in mojibake_fixes:
        text = text.replace(broken, fixed)

    return text.strip()
|
| 64 |
+
|
| 65 |
+
def format_paragraph(self, text: str) -> str:
|
| 66 |
+
text = self.clean_text(text)
|
| 67 |
+
sentences = re.split(r'(?<=[.!?]) +', text)
|
| 68 |
+
paragraph = ''
|
| 69 |
+
word_count = 0
|
| 70 |
+
for sentence in sentences:
|
| 71 |
+
words = sentence.split()
|
| 72 |
+
word_count += len(words)
|
| 73 |
+
paragraph += sentence + ' '
|
| 74 |
+
if 50 <= word_count <= 150:
|
| 75 |
+
break
|
| 76 |
+
return paragraph.strip()
|
| 77 |
+
|
| 78 |
+
def parse(self, response):
    """Walk the top navigation menu and schedule every menu / submenu page.

    Yields one ``scrapy.Request`` (callback ``parse_content``) per menu link
    that is not a ``#`` anchor. Links containing "jurusan" are skipped; a
    skipped top-level entry also skips its submenu.
    """
    label_css = 'a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text'
    anchor_text_css = 'a.wp-block-navigation-item__content::text'
    href_css = 'a.wp-block-navigation-item__content::attr(href)'

    self.logger.info(f"Processing main page: {response.url}")
    top_entries = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
    for entry in top_entries:
        # Prefer the label span; fall back to the anchor's own text.
        main_title = entry.css(label_css).get()
        if not main_title:
            main_title = entry.css(anchor_text_css).get('').strip()

        main_link = entry.css(href_css).get()
        if main_link and not main_link.startswith('#'):
            main_link = response.urljoin(main_link)
            if "jurusan" in main_link.lower():
                continue
            yield scrapy.Request(main_link, callback=self.parse_content, meta={'page_title': main_title, 'menu_path': main_title})

        for child in entry.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item'):
            submenu_title = child.css(label_css).get()
            if not submenu_title:
                submenu_title = child.css(anchor_text_css).get('').strip()

            submenu_link = child.css(href_css).get()
            if not submenu_link or submenu_link.startswith('#'):
                continue
            submenu_link = response.urljoin(submenu_link)
            if "jurusan" in submenu_link.lower():
                continue
            if main_title:
                menu_path = f"{main_title} > {submenu_title}"
            else:
                menu_path = submenu_title
            yield scrapy.Request(submenu_link, callback=self.parse_content, meta={'page_title': submenu_title, 'menu_path': menu_path})
|
| 103 |
+
|
| 104 |
+
def extract_leadership_info(self, response):
    """Extract leadership information from the special leadership page.

    First reads label/value tables: each table becomes one dict of
    ``label -> value`` pairs, plus a ``'Posisi'`` entry when a heading cell
    names a position. If no table yields data, falls back to scanning
    content sections for leadership keywords and returns dicts holding the
    section text (``'description'``) and, when matched, ``'Nama'`` / ``'NIDN'``.

    Returns:
        A list of dicts, one per leader found; empty when nothing matched.
    """
    self.logger.info("Extracting leadership information from special page")

    leaders_data = []

    # Try multiple table selectors based on the HTML structure shown
    tables = response.css('table, .wp-block-table table, .entry-content table, tbody')

    if tables:
        # Process each table
        for table_idx, table in enumerate(tables):
            self.logger.info(f"Processing table {table_idx + 1}")

            rows = table.css('tr')
            if not rows:
                continue

            leader_info = {}
            position_title = ""

            # Look for position title (like "DIREKTUR")
            title_elements = table.css('strong, .position-title, th')
            for title_elem in title_elements:
                title_text = self.clean_text(' '.join(title_elem.css('*::text').getall()))
                if any(pos in title_text.upper() for pos in ['DIREKTUR', 'WAKIL DIREKTUR', 'KETUA', 'SEKRETARIS']):
                    position_title = title_text
                    break

            # Extract key-value pairs from table rows
            for row in rows:
                cells = row.css('td, th')

                if len(cells) >= 3:
                    # Format: Label | : | Value (3 columns)
                    key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
                    separator = self.clean_text(' '.join(cells[1].css('*::text').getall()))
                    value = self.clean_text(' '.join(cells[2].css('*::text').getall()))

                    if key and value and separator == ":":
                        leader_info[key] = value

                elif len(cells) == 2:
                    # Format: Label | Value (2 columns)
                    key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
                    value = self.clean_text(' '.join(cells[1].css('*::text').getall()))

                    if key and value and key != value:
                        # Drop any colon from the label ("Label:" -> "Label")
                        clean_key = key.replace(':', '').strip()
                        leader_info[clean_key] = value

            # Add position title if found
            if position_title:
                leader_info['Posisi'] = position_title

            # The selector above matches both a <table> and the <tbody>
            # inside it, so the same rows are seen twice; keep a record only
            # the first time it appears.
            if leader_info and leader_info not in leaders_data:
                leaders_data.append(leader_info)
                self.logger.info(f"Extracted leader data: {list(leader_info.keys())}")

    # Fallback: Extract from general content structure
    if not leaders_data:
        self.logger.info("No table data found, trying general content extraction")

        # Look for profile sections
        profile_sections = response.css('.wp-block-group, .entry-content > div, .profile-section')

        for section in profile_sections:
            section_text = self.clean_text(' '.join(section.css('*::text').getall()))

            # Check if this section contains leadership info
            if any(keyword in section_text.lower() for keyword in ['direktur', 'wakil direktur', 'dr.', 's.t.', 'm.kom', 'nidn']):
                # Keep the raw text and try to pull structured details from it
                leader_info = {'description': section_text}

                # Try to extract specific details using regex
                name_match = re.search(r'(Dr\.|Ir\.|Prof\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(S\.T\.|M\.Kom|M\.T\.|S\.E\.|M\.M\.)*', section_text)
                if name_match:
                    leader_info['Nama'] = name_match.group(0).strip()

                nidn_match = re.search(r'NIDN[:\s]*(\d+)', section_text)
                if nidn_match:
                    leader_info['NIDN'] = nidn_match.group(1)

                leaders_data.append(leader_info)

    return leaders_data
|
| 192 |
+
|
| 193 |
+
def format_leadership_content(self, leaders_data):
    """Render leader dicts as Markdown sections.

    A dict holding only ``'description'`` becomes a numbered section with
    that text. Any other dict becomes a heading, a one-paragraph narrative
    built from the known fields, then one bold ``key: value`` line per
    remaining field and, if present, the description. Non-dict entries are
    ignored.

    Returns:
        A list with one formatted string per dict entry.
    """
    narrative_fields = ('Posisi', 'Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi', 'description')
    # Optional sentences, in the order they are appended to the narrative.
    optional_sentences = (
        ("Jabatan Akademik", " Secara akademik, beliau menjabat sebagai {}."),
        ("Jurusan", " Beliau berasal dari Jurusan {}."),
        ("Program Studi", " Program studi yang diampu adalah {}."),
        ("NIDN", " NIDN beliau adalah {}."),
    )

    sections = []
    for number, entry in enumerate(leaders_data, 1):
        if not isinstance(entry, dict):
            continue

        if len(entry) == 1 and 'description' in entry:
            # Description-only record: no structured narrative possible.
            sections.append(f"## Pimpinan {number}\n\n{entry['description']}".strip())
            continue

        posisi = entry.get("Posisi", "")
        nama = entry.get("Nama", "")
        if posisi and nama:
            heading = f"## {posisi}\n\n"
            pieces = [f"{posisi} Politeknik Negeri Padang adalah {nama}."]
        elif nama:
            heading = f"## Pimpinan {number}\n\n"
            pieces = [f"Pimpinan ini adalah {nama}."]
        else:
            heading = f"## Pimpinan {number}\n\n"
            pieces = ["Informasi pimpinan:"]

        for field, template in optional_sentences:
            field_value = entry.get(field, "")
            if field_value:
                pieces.append(template.format(field_value))

        blocks = [heading, "".join(pieces), "\n\n"]

        # Fields that did not feed the narrative are listed verbatim.
        for key, value in entry.items():
            if key not in narrative_fields:
                blocks.append(f"**{key}**: {value}\n\n")

        if 'description' in entry:
            blocks.append(f"**Informasi Tambahan**: {entry['description']}\n\n")

        sections.append("".join(blocks).strip())

    return sections
|
| 253 |
+
|
| 254 |
+
def parse_content(self, response):
    """Extract one content page, upload it, and report what was done.

    Reads ``page_title`` / ``menu_path`` from ``response.meta``. Leadership
    pages (URL or title contains "pimpinan" on a pnp.ac.id URL) use the
    dedicated leadership extraction, falling back to the general one when
    it finds nothing; all other pages use the general extraction and also
    get their tables appended. The final text is passed to
    ``upload_content``.

    Yields:
        One summary dict for the page, followed by any requests produced by
        ``process_additional_links``.
    """
    page_title = response.meta.get('page_title', 'Unknown Page')
    menu_path = response.meta.get('menu_path', '')
    if page_title == 'Unknown Page':
        page_title = self.clean_text(response.css('h1.entry-title::text, h1.page-title::text').get(''))

    self.logger.info(f"Extracting content from: {response.url} ({page_title})")

    # Evaluated once; used for extraction, table handling and the summary.
    is_leadership_page = (
        ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower())
        and "pnp.ac.id" in response.url
    )

    # Special case: PNP leadership page
    if is_leadership_page:
        self.logger.info("Detected leadership page - using special extraction")

        leaders_data = self.extract_leadership_info(response)
        self.logger.info(f"Found {len(leaders_data)} leadership entries")

        if leaders_data:
            paragraphs = self.format_leadership_content(leaders_data)

            # Also extract any additional content from the page
            additional_content = self.extract_general_content(response)
            if additional_content:
                paragraphs.extend(["## Informasi Tambahan"] + additional_content)
        else:
            # Fallback to general content extraction
            self.logger.warning("Leadership extraction failed, falling back to general extraction")
            paragraphs = self.extract_general_content(response)
    else:
        # Normal content extraction
        paragraphs = self.extract_general_content(response)

    # Create final content
    content_text = self.create_final_content(page_title, response.url, paragraphs)

    # Add table data if any (but skip for leadership pages to avoid duplication)
    if not is_leadership_page:
        table_content = self.extract_table_data(response)
        if table_content:
            content_text += "\n\n## Data Tabel\n\n" + table_content

    # Upload to Supabase
    filename = self.upload_content(page_title, content_text)

    yield {
        'url': response.url,
        'title': page_title,
        'menu_path': menu_path,
        'uploaded_as': filename,
        'timestamp': datetime.now().isoformat(),
        'content_length': len(content_text),
        'leadership_page': is_leadership_page
    }

    # process_additional_links is a generator: it must be iterated for its
    # requests to be emitted, so delegate to it instead of merely calling it.
    yield from self.process_additional_links(response, menu_path)
|
| 311 |
+
|
| 312 |
+
def extract_general_content(self, response):
    """Collect the page's readable paragraphs.

    Tries a list of content-container selectors in order and uses the first
    one that yields text elements of at least five words, appending every
    non-anchor link found in an element to its text. When no container
    yields anything, the whole body text is split into chunks of at most
    50 words. Only paragraphs of ten words or more are kept, each passed
    through ``format_paragraph``.
    """
    container_selectors = (
        'div.entry-content', 'article.post', 'main.site-main',
        'div.content', 'div.main-content', 'div#content', 'div.page-content'
    )

    collected = []
    for container_css in container_selectors:
        area = response.css(container_css)
        if not area:
            continue
        for node in area.css('p, h1, h2, h3, h4, h5, h6, li, div.wp-block-group'):
            node_text = self.clean_text(' '.join(node.css('*::text').getall()))
            if not node_text or len(node_text.split()) < 5:
                continue
            # Add links if any
            for href in node.css('a::attr(href)').getall():
                if href and not href.startswith('#'):
                    node_text += f" (Link: {response.urljoin(href)})"
            collected.append(node_text)
        if collected:
            break

    # Fallback: extract from body
    if not collected:
        whole_body = self.clean_text(' '.join(response.css('body *::text').getall()))
        if whole_body:
            # Split into meaningful chunks of at most 50 words
            buffer = ""
            for piece in re.split(r'(?<=[.!?])\s+', whole_body):
                candidate = buffer + " " + piece
                if len(candidate.split()) <= 50:
                    buffer = candidate
                    continue
                if buffer.strip():
                    collected.append(buffer.strip())
                buffer = piece
            if buffer.strip():
                collected.append(buffer.strip())

    # Format paragraphs
    return [
        self.format_paragraph(paragraph)
        for paragraph in collected
        if len(paragraph.split()) >= 10
    ]
|
| 362 |
+
|
| 363 |
+
def extract_table_data(self, response):
    """Render every table on the page as pipe-separated text.

    Each non-empty cell contributes its cleaned text, followed by the first
    link in the cell when one exists. Empty cells, empty rows and empty
    tables are omitted. Tables are numbered by their position on the page.

    Returns:
        All rendered tables joined by blank lines, or ``""`` when none.
    """
    rendered_tables = []

    for number, table in enumerate(response.css('table'), start=1):
        lines = []
        for tr in table.css('tr'):
            cell_texts = []
            for cell in tr.css('th, td'):
                content = self.clean_text(' '.join(cell.css('*::text').getall()))
                href = cell.css('a::attr(href)').get()
                if href:
                    content += f" (Link: {response.urljoin(href)})"
                if content:
                    cell_texts.append(content)
            if cell_texts:
                lines.append(" | ".join(cell_texts))

        if lines:
            rendered_tables.append(f"### Tabel {number}\n\n" + "\n".join(lines))

    return "\n\n".join(rendered_tables)
|
| 386 |
|
| 387 |
+
def create_final_content(self, page_title, url, paragraphs):
    """Assemble the final document: title, date, URL, then one paragraph per line."""
    today = datetime.now().strftime('%d %B %Y')
    header = (
        f"# {page_title}\n"
        "\n"
        f"**Tanggal**: {today}\n"
        f"**URL**: {url}\n"
        "\n"
    )
    return header + "\n".join(paragraphs)
|
| 395 |
+
|
| 396 |
+
def upload_content(self, page_title, content_text):
    """Upload content to Supabase with content-based deduplication.

    A file name is derived from a slug of ``page_title`` plus the current
    local timestamp, and the upload itself is delegated to
    ``upload_if_changed``.

    Args:
        page_title: Page title used to build the file name. ``None`` or a
            title without any word characters falls back to "untitled".
        content_text: Text content to upload.

    Returns:
        The file name when the content was uploaded, "skipped_<filename>"
        when ``upload_if_changed`` reports the content as unchanged, and
        "failed_<filename>" when it reports an error or raises.
    """
    # Slugify: drop punctuation, collapse whitespace/hyphen runs into "-",
    # and trim hyphens left at either end.
    safe_title = re.sub(r'[^\w\s-]', '', str(page_title or '')).strip().lower()
    safe_title = re.sub(r'[-\s]+', '-', safe_title).strip('-')
    if not safe_title:
        # Avoid file names that consist only of "_<timestamp>.txt".
        safe_title = 'untitled'

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{safe_title}_{timestamp}.txt"
    try:
        result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
        status = result.get('result')
        if status == 'uploaded':
            self.logger.info(f"Uploaded {filename} successfully.")
            return filename
        if status == 'skipped':
            self.logger.info(f"Skipped upload for {filename} (content unchanged)")
            return f"skipped_{filename}"
        self.logger.error(f"Upload error for {filename}: {result.get('error')}")
        return f"failed_{filename}"
    except Exception as e:
        # Boundary of the upload step: log and report failure so that one
        # failed upload does not abort the crawl.
        self.logger.error(f"Upload error for {filename}: {str(e)}")
        return f"failed_{filename}"
|
| 416 |
|
| 417 |
+
def process_additional_links(self, response, menu_path):
    """Yield requests for same-site header/navigation links of a page.

    Nothing is yielded when the page's host contains "pnp.ac.id" or when the
    page URL has no host. For any other host, links found under ``header``,
    ``nav`` and ``.navbar`` are collected, fragment-only and ``javascript:``
    links are dropped, and a request is yielded for every remaining link
    whose host is the page's host or one of its subdomains.

    Args:
        response: Response-like object exposing ``url``, ``css()`` and
            ``urljoin()``.
        menu_path: Menu breadcrumb of the current page; " > Header" is
            appended to it for the yielded requests.

    Yields:
        ``scrapy.Request`` objects handled by ``self.parse_content``.
    """
    from urllib.parse import urlparse

    current_domain = urlparse(response.url).netloc
    if not current_domain or 'pnp.ac.id' in current_domain:
        return

    header_links = []
    for sel in ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']:
        header_links.extend(response.css(sel).getall())

    candidates = {
        link for link in header_links
        if link and not link.startswith(('#', 'javascript:'))
    }
    for link in candidates:
        full_link = response.urljoin(link)
        # Compare hosts rather than searching the whole URL, so a foreign URL
        # that merely mentions this domain in its path or query is rejected.
        target_host = urlparse(full_link).netloc
        if target_host == current_domain or target_host.endswith('.' + current_domain):
            yield scrapy.Request(
                url=full_link,
                callback=self.parse_content,
                meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
            )
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
if __name__ == '__main__':
    # Script entry point: run PNPContentSpider in a standalone crawler
    # process with one request at a time and a 2-second download delay.
    crawler_settings = {
        'USER_AGENT': 'PNPBot/1.0',
        'DOWNLOAD_DELAY': 2,
        'ROBOTSTXT_OBEY': True,
        'LOG_LEVEL': 'INFO',
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_TIMEOUT': 100,
        'RETRY_TIMES': 3,
        'HTTPCACHE_ENABLED': False,
        'FEED_EXPORT_ENCODING': 'utf-8',
    }
    process = CrawlerProcess(crawler_settings)
    process.crawl(PNPContentSpider)
    process.start()
|
scrapping/utils/crawl4ai_utils.py
DELETED
|
@@ -1,168 +0,0 @@
|
|
| 1 |
-
import asyncio
|
| 2 |
-
from typing import Optional, List, Dict, Set
|
| 3 |
-
from urllib.parse import urlparse, urljoin
|
| 4 |
-
|
| 5 |
-
from bs4 import BeautifulSoup
|
| 6 |
-
|
| 7 |
-
try:
|
| 8 |
-
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
| 9 |
-
except Exception as e:
|
| 10 |
-
AsyncWebCrawler = None # type: ignore
|
| 11 |
-
BrowserConfig = None # type: ignore
|
| 12 |
-
CrawlerRunConfig = None # type: ignore
|
| 13 |
-
CacheMode = None # type: ignore
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
class Crawl4AIUnavailable(Exception):
    """Raised by the crawl helpers when crawl4ai could not be imported."""
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
async def fetch_html(url: str, timeout: int = 30, headless: bool = True) -> str:
    """Fetch the rendered HTML of one URL through Crawl4AI.

    Args:
        url: Page to fetch.
        timeout: Value forwarded as ``timeout`` to ``CrawlerRunConfig``.
        headless: Whether the browser is configured as headless.

    Returns:
        The first non-empty of the result's ``html`` and ``content``
        attributes, otherwise its ``markdown`` attribute ("" when absent).

    Raises:
        Crawl4AIUnavailable: If crawl4ai is not installed.
    """
    if AsyncWebCrawler is None:
        raise Crawl4AIUnavailable(
            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
        )

    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
    # NOTE(review): confirm that the installed crawl4ai version accepts a
    # `timeout` keyword on CrawlerRunConfig.
    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(url=url, config=run_conf)
        # Prefer the original HTML; some versions expose only "content" or
        # markdown, so fall back through those in order.
        for attr in ("html", "content"):
            payload = getattr(result, attr, None)
            if payload:
                return payload
        return getattr(result, "markdown", "")
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
def fetch_html_sync(url: str, timeout: int = 30, headless: bool = True) -> str:
    """Run ``fetch_html`` to completion via ``asyncio.run`` and return its result."""
    pending = fetch_html(url, timeout=timeout, headless=headless)
    return asyncio.run(pending)
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
# ---------------- Parallel in-domain crawling helpers ---------------- #
|
| 44 |
-
|
| 45 |
-
# Substrings that mark a URL as worth crawling when only_important=True.
IMPORTANT_KEYWORDS = [
    # Indonesian
    "profil", "tentang", "visi", "misi",
    "struktur", "pimpinan", "akademik", "kurikulum",
    "dosen", "staf", "jadwal", "kalender",
    "pengumuman", "berita", "pengabdian", "penelitian",
    "organisasi", "program-studi", "prodi", "sarjana",
    "diploma", "magister",
] + [
    # English fallbacks
    "about", "profile", "leadership", "faculty",
    "staff", "schedule", "announcement", "news",
    "curriculum", "study-program",
]
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def _same_domain(url: str, base_netloc: str) -> bool:
|
| 58 |
-
try:
|
| 59 |
-
return urlparse(url).netloc.endswith(base_netloc)
|
| 60 |
-
except Exception:
|
| 61 |
-
return False
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def _discover_links(base_url: str, html: str) -> List[str]:
    """Collect the absolute link targets found in an HTML document.

    Fragment-only and ``javascript:`` hrefs are ignored. Every other href is
    resolved against *base_url*, has its "#fragment" removed, and is returned
    once, in order of first appearance, so that anchors into the same page do
    not show up as separate URLs.

    Args:
        base_url: URL of the page the HTML came from.
        html: Markup to scan; ``None`` or "" yields an empty list.

    Returns:
        De-duplicated absolute URLs without fragments.
    """
    from urllib.parse import urldefrag

    soup = BeautifulSoup(html or "", "html.parser")
    seen = set()
    links: List[str] = []
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        if href.startswith("#") or href.lower().startswith("javascript:"):
            continue
        abs_url = urldefrag(urljoin(base_url, href)).url
        if abs_url not in seen:
            seen.add(abs_url)
            links.append(abs_url)
    return links
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
def _is_important(url: str) -> bool:
    """Return True when the lower-cased URL contains any IMPORTANT_KEYWORDS entry."""
    lowered = url.lower()
    for keyword in IMPORTANT_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
async def crawl_domain_parallel(
    seed_url: str,
    max_pages: int = 20,
    max_concurrency: int = 5,
    only_important: bool = True,
    timeout: int = 30,
    headless: bool = True,
) -> Dict[str, str]:
    """Crawl pages on the seed URL's domain in parallel batches.

    A single ``AsyncWebCrawler`` session is shared by all fetches. URLs are
    taken from a FIFO frontier that starts with *seed_url*; each batch of up
    to *max_concurrency* URLs is fetched concurrently, and same-domain links
    found in the fetched pages are appended to the frontier.

    Args:
        seed_url: Starting URL; its network location defines the domain.
        max_pages: Maximum number of URLs to fetch.
        max_concurrency: Maximum number of concurrent fetches per batch.
        only_important: When True, every URL other than *seed_url* must
            contain one of the important keywords to be fetched.
        timeout: Value forwarded as ``timeout`` to ``CrawlerRunConfig``.
        headless: Whether the browser is configured as headless.

    Returns:
        ``{url: html}`` for every URL that was attempted. A fetch that raised
        or returned nothing is recorded with an empty string.

    Raises:
        Crawl4AIUnavailable: If crawl4ai is not installed.
    """
    from collections import deque

    if AsyncWebCrawler is None:
        raise Crawl4AIUnavailable(
            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
        )

    base_netloc = urlparse(seed_url).netloc

    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)

    results: Dict[str, str] = {}
    visited: Set[str] = set()
    # deque gives O(1) pops from the front; `queued` keeps a URL from being
    # enqueued more than once, so the frontier cannot grow without bound.
    frontier = deque([seed_url])
    queued: Set[str] = {seed_url}
    sem = asyncio.Semaphore(max_concurrency)

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        async def fetch_one(url: str) -> str:
            async with sem:
                try:
                    res = await crawler.arun(url=url, config=run_conf)
                    html = getattr(res, "html", None) or getattr(res, "content", None) or getattr(res, "markdown", "")
                    results[url] = html or ""
                    return html or ""
                except Exception:
                    # Best effort: a failing page is recorded as empty and
                    # must not abort the rest of the crawl.
                    results[url] = ""
                    return ""

        while frontier and len(visited) < max_pages:
            batch: List[str] = []
            # Batch members are added to `visited` as they are selected, so
            # len(visited) alone is the page budget used so far.
            while frontier and len(batch) < max_concurrency and len(visited) < max_pages:
                candidate = frontier.popleft()
                if candidate in visited:
                    continue
                if not _same_domain(candidate, base_netloc):
                    continue
                if only_important and not _is_important(candidate) and candidate != seed_url:
                    continue
                visited.add(candidate)
                batch.append(candidate)

            if not batch:
                break

            pages = await asyncio.gather(*(fetch_one(u) for u in batch))
            # Discover more links from the pages that returned content.
            for page_url, html in zip(batch, pages):
                if not html:
                    continue
                for link in _discover_links(page_url, html):
                    if link in queued or link in visited:
                        continue
                    if _same_domain(link, base_netloc):
                        queued.add(link)
                        frontier.append(link)

    return results
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
def crawl_domain_parallel_sync(
    seed_url: str,
    max_pages: int = 20,
    max_concurrency: int = 5,
    only_important: bool = True,
    timeout: int = 30,
    headless: bool = True,
) -> Dict[str, str]:
    """Run ``crawl_domain_parallel`` to completion via ``asyncio.run``.

    All arguments are forwarded unchanged, and the coroutine's
    ``{url: html}`` result is returned.
    """
    options = {
        "seed_url": seed_url,
        "max_pages": max_pages,
        "max_concurrency": max_concurrency,
        "only_important": only_important,
        "timeout": timeout,
        "headless": headless,
    }
    return asyncio.run(crawl_domain_parallel(**options))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|