Spaces:

Yozora721
/

pnp-chatbot-admin-v1

Sleeping

App Files Files Community

FauziIsyrinApridal committed on Aug 17, 2025

Commit

4f5cf5c

1 Parent(s): 39cd32c

...

Browse files

Files changed (6) hide show

requirements.txt +1 -3
scrapping/dosen_scrap.py +306 -141
scrapping/jadwal_scrap.py +402 -227
scrapping/jurusan_scrap.py +315 -119
scrapping/pnp_scrap.py +411 -94
scrapping/utils/crawl4ai_utils.py +0 -168

requirements.txt CHANGED Viewed

@@ -2,6 +2,4 @@ scrapy
 supabase
 python-dotenv
 requests
-beautifulsoup4
-crawl4ai
-playwright

 supabase
 python-dotenv
 requests
+beautifulsoup4

scrapping/dosen_scrap.py CHANGED Viewed

@@ -1,160 +1,325 @@
 from datetime import datetime
 import re
 from supabase import create_client
 import os
-from typing import List, Dict
-from bs4 import BeautifulSoup
-# Parallel Crawl4AI helpers
-try:
-    from utils.crawl4ai_utils import crawl_domain_parallel_sync
-except Exception:
-    import sys as _sys
-    import os as _os
-    _sys.path.append(_os.path.join(_os.path.dirname(__file__), 'utils'))
-    from crawl4ai_utils import crawl_domain_parallel_sync
-# Dedup upload utility
 try:
     from utils.supabase_utils import upload_if_changed
 except Exception:
-    import sys as _sys2
-    import os as _os2
-    _sys2.path.append(_os2.path.join(_os2.path.dirname(__file__), 'utils'))
     from supabase_utils import upload_if_changed
-SEED_URL = 'https://sipeg.pnp.ac.id/'
-def _infer_staff_type_from_context(page_url: str, headers: List[str]) -> str:
-    u = (page_url or '').lower()
-    h = ' '.join(headers).lower()
-    if any(k in u for k in ['administrasi', 'tata-usaha', 'pegawai']) or any(k in h for k in ['administrasi', 'tata usaha', 'pegawai']):
-        return 'staff_administrasi'
-    if any(k in u for k in ['teknisi', 'lab', 'laboratorium']) or any(k in h for k in ['teknisi', 'laboratorium', 'lab']):
-        return 'staff_teknisi'
-    return 'staff_pengajar'
-def parse_tables(html: str, page_url: str = '') -> Dict[str, List[Dict[str, str]]]:
-    soup = BeautifulSoup(html or '', 'html.parser')
-    data: Dict[str, List[Dict[str, str]]] = {
-        'jabatan': [],
-        'staff_pengajar': [],
-        'staff_administrasi': [],
-        'staff_teknisi': [],
     }
-    for table in soup.select('table, .table, .table-bordered, .table-landscape'):
-        headers = [th.get_text(' ', strip=True) for th in table.select('th')]
-        rows = table.select('tr')
-        if not rows:
-            continue
-        # Officials table
-        if any('Jabatan' in h for h in headers) and any('Pejabat' in h for h in headers):
-            for tr in rows:
-                tds = tr.select('td')
-                if len(tds) >= 3:
-                    number = tds[0].get_text(' ', strip=True)
-                    position = tds[1].get_text(' ', strip=True)
-                    official = tds[2].get_text(' ', strip=True)
-                    if position or official:
-                        data['jabatan'].append({
-                            'nomor': number,
-                            'jabatan': position,
-                            'pejabat': official,
-                        })
-        # Staff tables
-        if any('Nama' in h for h in headers) and any('NIP' in h for h in headers):
-            # Infer staff type using URL or headers
-            staff_type = _infer_staff_type_from_context(page_url, headers)
-            for tr in rows[1:]:
-                tds = tr.select('td')
-                if len(tds) >= 3:
-                    nomor = tds[0].get_text(' ', strip=True)
-                    nama = tds[1].get_text(' ', strip=True)
-                    nip = tds[2].get_text(' ', strip=True)
-                    jur = tds[3].get_text(' ', strip=True) if len(tds) > 3 else ''
-                    if nama or nip:
-                        data[staff_type].append({
-                            'nomor': nomor,
-                            'nama': nama,
-                            'nip': nip,
-                            'jurusan': jur,
-                        })
-    return data
-def merge_collections(all_pages: Dict[str, str]) -> Dict[str, List[Dict[str, str]]]:
-    merged: Dict[str, List[Dict[str, str]]] = {
-        'jabatan': [],
-        'staff_pengajar': [],
-        'staff_administrasi': [],
-        'staff_teknisi': [],
-    }
-    for url, html in all_pages.items():
-        if not html:
-            continue
-        parsed = parse_tables(html, page_url=url)
-        for k, v in parsed.items():
-            merged[k].extend(v)
-    return merged
-def build_text(collected: Dict[str, List[Dict[str, str]]]) -> str:
-    lines: List[str] = []
-    lines.append('# Data Dosen dan Staff PNP\n')
-    lines.append(f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
-    sections = [
-        ('jabatan', 'Daftar Jabatan Struktural'),
-        ('staff_pengajar', 'Daftar Dosen dan Pengajar'),
-        ('staff_administrasi', 'Daftar Staff Administrasi'),
-        ('staff_teknisi', 'Daftar Staff Teknisi'),
-    ]
-    for key, title in sections:
-        items = collected.get(key, [])
-        if not items:
-            continue
-        lines.append(f"# {title}\n")
-        lines.append(f"Jumlah data: {len(items)}\n\n")
-        for it in items:
-            if key == 'jabatan':
-                paragraph = f"{it.get('pejabat','')} menjabat sebagai {it.get('jabatan','')}"
-            else:
-                paragraph = f"{it.get('nama','')} adalah staf dengan NIP {it.get('nip','')}"
-                if it.get('jurusan'):
-                    paragraph += f" dan bertugas di {it['jurusan']}"
-            lines.append(paragraph.strip() + "\n")
-        lines.append("\n")
-    return ''.join(lines)
 if __name__ == '__main__':
-    supabase = create_client(
-        os.environ.get('NEXT_PUBLIC_SUPABASE_URL'),
-        os.environ.get('SUPABASE_SERVICE_KEY'),
-    )
-    bucket = os.environ.get('NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET', 'pnp-bot-storage')
-    pages = crawl_domain_parallel_sync(
-        seed_url=SEED_URL,
-        max_pages=40,
-        max_concurrency=6,
-        only_important=True,
-        timeout=30,
-        headless=True,
-    )
-    collected = merge_collections(pages)
-    text = build_text(collected)
-    ts = datetime.now().strftime('%Y%m%d_%H%M')
-    filename = f"data_dosen_{ts}.txt"
-    try:
-        result = upload_if_changed(supabase, bucket, filename, text)
-        if result.get('result') == 'uploaded':
-            print(f"✅ Uploaded {filename}")
-        elif result.get('result') == 'skipped':
-            print(f"⏭️ Skipped (unchanged) {filename}")
-        else:
-            print(f"❌ Upload error: {result.get('error')}")
-    except Exception as e:
-        print(f"❌ Error uploading: {e}")

+import scrapy
+from scrapy.crawler import CrawlerProcess
 from datetime import datetime
 import re
 from supabase import create_client
 import os
+import sys
+# Try import shared dedup upload utility
 try:
     from utils.supabase_utils import upload_if_changed
 except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
     from supabase_utils import upload_if_changed
+class DosenSpider(scrapy.Spider):
+    """Crawl sipeg.pnp.ac.id and publish the officials/staff directory as text.
+    The level-1 and level-2 menu links of the start page are followed.  On each
+    page, tables with 'Jabatan' and 'Pejabat' headers are read as officials and
+    tables with 'Nama' and 'NIP' headers are read as staff.  Rows accumulate in
+    ``collected_data``; ``closed`` renders them to a text report and hands it
+    to ``upload_if_changed``.
+    """
+    name = 'dosen_spider'
+    start_urls = ['https://sipeg.pnp.ac.id/']
+    custom_settings = {
+        'DOWNLOAD_DELAY': 1,
+        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'CONCURRENT_REQUESTS': 1,
+        'HTTPCACHE_ENABLED': False,
+        'RETRY_TIMES': 3
     }
+    # Page texts meaning "data not yet available"; pages carrying one are skipped.
+    _UNAVAILABLE_MESSAGES = (
+        'data staf pengajar belum tersedia',
+        'data staf administrasi belum tersedia',
+        'data staf teknisi belum tersedia',
+    )
+    # Title keywords per staff type, checked in this order (first match wins).
+    _STAFF_TYPE_KEYWORDS = (
+        ('staff_pengajar', ('dosen', 'pengajar', 'akademik', 'jurusan')),
+        ('staff_administrasi', ('administrasi', 'admin', 'tata usaha', 'pegawai')),
+        ('staff_teknisi', ('teknisi', 'lab', 'teknik', 'laboratorium')),
+    )
+    # Step of the ``p`` offset parameter per staff type; other types are not
+    # offset-paginated.
+    _ITEMS_PER_PAGE = {
+        'staff_pengajar': 30,
+        'staff_administrasi': 25,
+        'staff_teknisi': 25,
+    }
+    # Report section heading for each item 'tipe'.
+    _SECTION_TITLES = {
+        'jabatan': 'Daftar Jabatan Struktural',
+        'staff_pengajar': 'Daftar Dosen dan Pengajar',
+        'staff_administrasi': 'Daftar Staff Administrasi',
+        'staff_teknisi': 'Daftar Staff Teknisi',
+        'staff_lainnya': 'Daftar Staff Lainnya'
+    }
+    # The term of office is read from a commented-out <td> in the row markup.
+    _PERIOD_RE = re.compile(r'<!--\s*<td[^>]*>(.*?)</td>\s*-->')
+    # ``p`` query parameter; anchored on ?/& so that e.g. ``nip=`` is not matched.
+    _OFFSET_RE = re.compile(r'[?&]p=(\d+)')
+    def __init__(self, *args, **kwargs):
+        """Create the Supabase client and the empty result list.
+        Connection settings come from the NEXT_PUBLIC_SUPABASE_URL,
+        SUPABASE_SERVICE_KEY and NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET
+        environment variables.
+        """
+        super().__init__(*args, **kwargs)
+        self.supabase = create_client(
+            os.environ.get("NEXT_PUBLIC_SUPABASE_URL"),
+            os.environ.get("SUPABASE_SERVICE_KEY")
+        )
+        self.storage_bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
+        # Every extracted row dict, in crawl order; rendered by closed().
+        self.collected_data = []
+    def parse(self, response):
+        """Follow each level-1 menu link and the level-2 links nested under it."""
+        for menu_item in response.css('li.level1'):
+            main_link = menu_item.css('a::attr(href)').get()
+            if not main_link:
+                continue
+            yield self._page_request(response, main_link, menu_item)
+            for submenu in menu_item.css('li.level2'):
+                submenu_link = submenu.css('a::attr(href)').get()
+                if submenu_link:
+                    yield self._page_request(response, submenu_link, submenu)
+    def _page_request(self, response, href, menu_node):
+        """Build the first-page request for one menu entry, titled by its span.bg text."""
+        title = menu_node.css('span.bg::text').get('').strip()
+        return scrapy.Request(
+            url=response.urljoin(href),
+            callback=self.parse_page,
+            meta={'page_title': title, 'page_number': 1}
+        )
+    def parse_page(self, response):
+        """Extract the tables of one page, then schedule the following page.
+        Yields the extracted row dicts and at most one ``scrapy.Request`` for
+        the next page.  Pages carrying a "belum tersedia" message yield nothing.
+        """
+        page_title = response.meta.get('page_title', '')
+        page_number = response.meta.get('page_number', 1)
+        page_text = ' '.join(response.css('body ::text').getall()).lower()
+        if any(msg in page_text for msg in self._UNAVAILABLE_MESSAGES):
+            self.logger.info(f"Data tidak tersedia pada halaman: {response.url}")
+            return
+        # The staff type is derived from the menu title, not from the table.
+        staff_type = self.determine_simple_staff_type(page_title)
+        official_rows = 0
+        staff_rows = 0
+        tables = response.css('table.table-landscape, table.table, table.table-bordered')
+        if not tables:
+            self.logger.info(f"No tables found on page: {response.url}")
+        for table in tables:
+            # The header cells decide which kind of table this is.
+            headers = [h.strip() for h in table.css('th::text').getall()]
+            if 'Jabatan' in headers and 'Pejabat' in headers:
+                for item in self.extract_officials_table(table, page_title):
+                    official_rows += 1
+                    yield item
+            elif 'Nama' in headers and 'NIP' in headers:
+                for item in self.extract_staff_table(table, page_title, staff_type, page_number):
+                    staff_rows += 1
+                    yield item
+        items_per_page = self._ITEMS_PER_PAGE.get(staff_type, 0)
+        next_page = self._find_next_page(
+            response, page_number, items_per_page, staff_rows, official_rows + staff_rows
+        )
+        if not next_page:
+            return
+        next_page_number = page_number + 1
+        offset = self._page_offset(next_page)
+        # Only derive the page number from the offset when a page size is
+        # known; dividing by a zero page size used to raise ZeroDivisionError.
+        if offset is not None and items_per_page > 0:
+            next_page_number = (offset // items_per_page) + 1
+        self.logger.info(f"Following to next page: {next_page} (Page {next_page_number})")
+        yield scrapy.Request(
+            url=next_page,
+            callback=self.parse_page,
+            meta={'page_title': page_title, 'page_number': next_page_number}
+        )
+    @classmethod
+    def _page_offset(cls, url):
+        """Return the integer ``p`` query value of ``url``, or None when absent."""
+        match = cls._OFFSET_RE.search(url)
+        return int(match.group(1)) if match else None
+    def _find_next_page(self, response, page_number, items_per_page, staff_rows, total_rows):
+        """Return the URL of the page following ``response``, or None.
+        Links present in the page are always used.  A URL is only guessed (by
+        bumping a query or path parameter) while the current page still
+        produced rows, so the crawl stops at the first empty page instead of
+        requesting ever larger offsets.
+        """
+        current_url = response.url
+        next_link = response.xpath('//span[@class="table-link"]/a[contains(text(), "Next")]/@href').get()
+        if next_link:
+            return response.urljoin(next_link)
+        if items_per_page > 0:
+            # Offset-paginated staff listing without a Next link.
+            if staff_rows == 0:
+                return None
+            base_url = current_url.split('?')[0]
+            next_p = (self._page_offset(current_url) or 0) + items_per_page
+            next_page = f"{base_url}?p={next_p}"
+            self.logger.info(f"Constructed next page URL with p parameter: {next_page}")
+            return next_page
+        # Other common pagination link shapes.
+        pagination_xpath_patterns = [
+            '//ul[contains(@class, "pagination")]/li/a[contains(text(), "Next")]/@href',
+            '//ul[contains(@class, "pagination")]/li/a[contains(text(), "»")]/@href',
+            f'//ul[contains(@class, "pagination")]/li/a[contains(text(), "{page_number + 1}")]/@href',
+            '//a[@class="next page-numbers"]/@href',
+        ]
+        for xpath in pagination_xpath_patterns:
+            next_page_link = response.xpath(xpath).get()
+            if next_page_link:
+                next_page = response.urljoin(next_page_link)
+                self.logger.info(f"Found next page link using XPath: {next_page}")
+                return next_page
+        # Last resort: bump a page counter that is already part of the URL.
+        if total_rows == 0:
+            return None
+        for marker in ('page=', 'halaman=', 'page/'):
+            if marker in current_url:
+                guess = current_url.replace(f'{marker}{page_number}', f'{marker}{page_number + 1}')
+                # An unchanged URL means the counter was not found; do not re-request it.
+                return guess if guess != current_url else None
+        return None
+    def determine_simple_staff_type(self, page_title):
+        """Map a menu title to a staff type by case-insensitive keyword match.
+        Returns 'staff_pengajar', 'staff_administrasi', 'staff_teknisi', or
+        'staff_lainnya' when no keyword occurs in the title.
+        """
+        page_title_lower = page_title.lower()
+        for staff_type, keywords in self._STAFF_TYPE_KEYWORDS:
+            if any(word in page_title_lower for word in keywords):
+                return staff_type
+        return 'staff_lainnya'
+    def extract_officials_table(self, table, page_title):
+        """Yield one 'jabatan' item per data row of an officials table.
+        Rows with fewer than three cells (such as the header row) and rows
+        whose position and official are both empty are skipped.  Each item is
+        also appended to ``collected_data``.
+        """
+        for row in table.css('tr'):
+            cells = row.css('td')
+            if len(cells) < 3:
+                continue
+            number = cells[0].css('::text').get('').strip()
+            position = cells[1].css('::text').get('').strip()
+            official = cells[2].css('::text').get('').strip()
+            # Same rule as the staff extractor: an empty row is not a record.
+            if not position and not official:
+                continue
+            period_match = self._PERIOD_RE.search(row.get())
+            period = period_match.group(1).strip() if period_match else ""
+            item = {
+                'halaman': page_title,
+                'tipe': 'jabatan',
+                'nomor': number,
+                'jabatan': position,
+                'pejabat': official,
+                'periode': period
+            }
+            self.collected_data.append(item)
+            yield item
+    def extract_staff_table(self, table, page_title, staff_type, page_number):
+        """Yield one staff item per data row of a staff table.
+        The first row is treated as the header.  Rows with fewer than three
+        cells, or with neither a name nor a NIP, are skipped.  'detail' holds
+        the href of the name link exactly as written in the page, or None.
+        Each item is also appended to ``collected_data``.
+        """
+        for row in table.css('tr')[1:]:
+            cells = row.css('td')
+            if len(cells) < 3:
+                continue
+            number = cells[0].css('::text').get('').strip()
+            name_cell = cells[1]
+            # Prefer the link text; fall back to the first text node of the cell.
+            name_link = name_cell.css('a::text').get()
+            name = name_link.strip() if name_link else name_cell.css('::text').get('').strip()
+            # Always bound (None without a link) so building the item cannot fail.
+            detail_url = name_cell.css('a::attr(href)').get()
+            nip = cells[2].css('::text').get('').strip()
+            department = cells[3].css('::text').get('').strip() if len(cells) > 3 else ""
+            if not name and not nip:
+                continue
+            item = {
+                'halaman': page_title,
+                'tipe': staff_type,
+                'halaman_ke': page_number,
+                'nomor': number,
+                'nama': name,
+                'nip': nip,
+                'jurusan': department,
+                'detail': detail_url
+            }
+            self.collected_data.append(item)
+            yield item
+    def closed(self, reason):
+        """Render the collected rows and upload them when the spider closes.
+        Nothing is uploaded when no rows were collected, so a failed crawl
+        does not publish a header-only report.  Upload problems are logged,
+        never raised.
+        """
+        if not self.collected_data:
+            self.logger.warning(f"No data collected (close reason: {reason}); skipping upload")
+            return
+        text_content = self.generate_text_output()
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"data_dosen_{timestamp}.txt"
+        # NOTE(review): the report embeds the current time, so two runs never
+        # produce identical text; confirm upload_if_changed ignores that line,
+        # otherwise the 'skipped' outcome can never happen.
+        try:
+            result = upload_if_changed(self.supabase, self.storage_bucket, filename, text_content)
+            if result.get('result') == 'uploaded':
+                self.logger.info(f"Successfully uploaded {filename} to Supabase storage")
+            elif result.get('result') == 'skipped':
+                self.logger.info(f"Skipped upload for {filename} (content unchanged)")
+            else:
+                self.logger.error(f"Failed to upload {filename} to Supabase: {result.get('error')}")
+        except Exception as e:
+            self.logger.error(f"Error uploading to Supabase: {str(e)}")
+    def generate_text_output(self):
+        """Return the text report built from ``collected_data``.
+        Items are grouped by 'tipe'; sections appear in the order in which
+        each type was first collected.  Every item becomes one paragraph.
+        """
+        grouped = {}
+        for item in self.collected_data:
+            grouped.setdefault(item.get('tipe', 'lainnya'), []).append(item)
+        output = [
+            "# Data Dosen dan Staff PNP\n",
+            f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n",
+        ]
+        for tipe, items in grouped.items():
+            title = self._SECTION_TITLES.get(tipe, tipe.capitalize())
+            output.append(f"# {title}\n")
+            output.append(f"Jumlah data: {len(items)}\n\n")
+            for item in items:
+                if tipe == 'jabatan':
+                    paragraph = f"{item['pejabat']} menjabat sebagai {item['jabatan']}."
+                    if item.get('periode'):
+                        paragraph += f" Masa jabatan berlangsung selama {item['periode']}."
+                else:
+                    paragraph = f"{item['nama']} adalah staf dengan NIP {item['nip']}."
+                    if item.get('jurusan'):
+                        paragraph += f" Ia bertugas di {item['jurusan']}."
+                    if item.get('detail'):
+                        paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
+                output.append(paragraph + "\n\n")
+        return ''.join(output)
 if __name__ == '__main__':
+    # Run DosenSpider through Scrapy's CrawlerProcess when executed as a script.
+    crawler = CrawlerProcess()
+    crawler.crawl(DosenSpider)
+    crawler.start()

scrapping/jadwal_scrap.py CHANGED Viewed

@@ -1,241 +1,416 @@
 import os
 import re
 from datetime import datetime
 from supabase import create_client
 from io import StringIO
-from typing import Dict, List, Tuple
-from bs4 import BeautifulSoup
-# Crawl4AI helper for rendered fetching
-try:
-    from utils.crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
-except Exception:
-    import sys as _sys
-    _sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
-    from crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
-# Shared dedup upload utility
-try:
-    from utils.supabase_utils import upload_if_changed
-except Exception:
-    import sys as _sys2
-    _sys2.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
-    from supabase_utils import upload_if_changed
-# =====================
-# Standalone helpers for non-Scrapy execution below
-# =====================
-# Constants for targeted pages
-BASE_PRESENSI = 'https://presensi.pnp.ac.id/'
-ELEKTRO_URL = 'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
-EXCLUDED = ['elektronika', 'telkom', 'listrik']
-# Initialize Supabase for standalone run
-_SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
-_SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
-supabase = create_client(_SUPABASE_URL, _SUPABASE_KEY)
-bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
-# Buffers for aggregated uploads keyed by jurusan_id
-file_buffers: Dict[str, StringIO] = {}
-def _init_buffer(jurusan_id: str, jurusan_name: str):
-    if jurusan_id not in file_buffers:
-        file_buffers[jurusan_id] = StringIO()
-        buf = file_buffers[jurusan_id]
-        today = datetime.now().strftime("%Y-%m-%d")
-        buf.write(f"# Jadwal Perkuliahan {jurusan_name}\n\n")
-        buf.write(f"**Jurusan:** {jurusan_name}\n")
-        buf.write(f"**Tanggal Update:** {today}\n")
-        buf.write(f"**Sumber:** Politeknik Negeri Padang\n\n")
-        buf.write("---\n\n")
-def clean_text_list(nodes) -> List[str]:
-    out: List[str] = []
-    for n in nodes:
-        try:
-            txt = ' '.join(n.get_text(' ', strip=True).split())
-        except Exception:
-            txt = ''
-        if txt:
-            out.append(txt)
-    return out
-def build_schedule_grid_bs(days: List[str], time_slots: List[str]):
-    return {day: {t: 'kosong' for t in time_slots} for day in days}
-def write_schedule_to_buffer_bs(buffer: StringIO, schedule_grid: Dict[str, Dict[str, str]], days: List[str], time_slots: List[str]):
-    for day in days:
-        current_course = None
-        current_times: List[str] = []
-        day_schedule: List[str] = []
-        for t in time_slots:
-            course = schedule_grid[day][t]
-            if course == current_course:
-                current_times.append(t)
-            else:
-                if current_course and current_course.lower() != 'kosong':
-                    first_start = current_times[0].split('-')[0].strip()
-                    last_end = current_times[-1].split('-')[-1].strip()
-                    time_range = f"{first_start} - {last_end}" if len(current_times) > 1 else current_times[0]
-                    day_schedule.append(f"- {day} {time_range} | {current_course}")
-                current_course = course
-                current_times = [t]
-        if current_course and current_course.lower() != 'kosong':
-            first_start = current_times[0].split('-')[0].strip()
-            last_end = current_times[-1].split('-')[-1].strip()
-            time_range = f"{first_start} - {last_end}" if len(current_times) > 1 else current_times[0]
-            day_schedule.append(f"- {day} {time_range} | {current_course}")
-        for entry in day_schedule:
-            buffer.write(entry + "\n")
-        buffer.write("\n")
-def process_table(tbl, jurusan_id: str, jurusan_name: str, idx: int):
-    _init_buffer(jurusan_id, jurusan_name)
-    buf = file_buffers[jurusan_id]
-    # Caption or fallback
-    cap_tag = tbl.find('caption')
-    caption_text = cap_tag.get_text(' ', strip=True) if cap_tag else f"Jadwal Kelas {idx + 1}"
-    thead = tbl.find('thead')
-    if thead:
-        thead_text = ' '.join(thead.get_text(' ', strip=True).split())
-        if thead_text:
-            caption_text = f"{caption_text} {thead_text}"
-    caption_text = re.sub(r'\s+', ' ', caption_text).strip()
-    # Header lists
-    days = clean_text_list(thead.select('th.xAxis')) if thead else []
-    if not days and thead:
-        days = clean_text_list(thead.select('th[class*="xAxis"]'))
-    tbody = tbl.find('tbody')
-    time_slots = clean_text_list(tbody.select('tr:not(.foot) th.yAxis')) if tbody else []
-    if not time_slots and tbody:
-        time_slots = clean_text_list(tbody.select('th[class*="yAxis"]'))
-    if not days or not time_slots:
-        return
-    # Section header
-    buf.write(f"## Jadwal Perkuliahan {caption_text}\n\n")
-    buf.write("Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n")
-    # Build grid and fill
-    grid = build_schedule_grid_bs(days, time_slots)
-    rows = tbody.select('tr:not(.foot)') if tbody else []
-    active_rowspans: Dict[Tuple[int, int], Tuple[int, str]] = {}
-    for row_idx, row in enumerate(rows):
-        if row_idx >= len(time_slots):
-            continue
-        current_time = time_slots[row_idx]
-        filled_cols = set()
-        # apply rowspans
-        to_remove = []
-        for (rs_col, rs_start), (rs_left, content) in list(active_rowspans.items()):
-            if rs_left > 0 and rs_col < len(days):
-                grid[days[rs_col]][current_time] = content
-                filled_cols.add(rs_col)
-                active_rowspans[(rs_col, rs_start)] = (rs_left - 1, content)
-                if rs_left - 1 <= 0:
-                    to_remove.append((rs_col, rs_start))
-        for k in to_remove:
-            del active_rowspans[k]
-        # cells
-        cells = row.select('td')
-        col_idx = 0
-        for cell in cells:
-            while col_idx < len(days) and col_idx in filled_cols:
-                col_idx += 1
-            if col_idx >= len(days):
-                break
-            cell_text = ' '.join(cell.get_text(' ', strip=True).split())
-            cell_text = 'kosong' if not cell_text or cell_text == '---' else cell_text
-            rowspan = int(cell.get('rowspan', '1') or '1')
-            colspan = int(cell.get('colspan', '1') or '1')
-            for c in range(colspan):
-                cur = col_idx + c
-                if cur < len(days):
-                    grid[days[cur]][current_time] = cell_text
-            if rowspan > 1:
-                for c in range(colspan):
-                    active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, cell_text)
-            col_idx += colspan
-    write_schedule_to_buffer_bs(buf, grid, days, time_slots)
-def run_parallel():
-    # 1) Special Elektro page (single target page)
-    try:
-        elektro_html = fetch_html_sync(ELEKTRO_URL)
-        esoup = BeautifulSoup(elektro_html, 'html.parser')
-        tables = esoup.select('table')
-        if tables:
-            jurusan_id = 'teknik_elektro'
-            jurusan_name = 'Jurusan Teknik Elektro'
-            for idx, tbl in enumerate(tables):
-                process_table(tbl, jurusan_id, jurusan_name, idx)
-    except Exception as e:
-        print(f"[Jadwal] Error fetching Elektro page: {e}")
-    # 2) Parallel crawl within presensi domain to discover pages and schedule tables
-    try:
-        crawled: Dict[str, str] = crawl_domain_parallel_sync(
-            seed_url=BASE_PRESENSI,
-            max_pages=40,
-            max_concurrency=6,
-            only_important=False,  # we need to find 'groups_days_horizontal' links which may not match keywords
-            timeout=40,
-            headless=True,
-        )
-        for url, html in crawled.items():
-            if not html:
-                continue
-            try:
-                soup = BeautifulSoup(html, 'html.parser')
-                # If this page itself is a groups_days_horizontal schedule page, parse tables directly
-                if 'groups_days_horizontal' in url and 'subgroups_days_horizontal' not in url:
-                    title = soup.title.get_text(strip=True) if soup.title else 'Jadwal'
-                    jurusan_id = title.replace(' ', '_')
-                    jurusan_name = title
-                    for idx, tbl in enumerate(soup.select('table[id^="table_"], table')):
-                        process_table(tbl, jurusan_id=jurusan_id, jurusan_name=jurusan_name, idx=idx)
-                    continue
-                # Otherwise, try to find the schedule link from this page
-                g_link = None
-                for a in soup.select('td a[href]'):
-                    href = a.get('href')
-                    if href and 'groups_days_horizontal' in href and 'subgroups_days_horizontal' not in href:
-                        g_link = href
-                        break
-                if not g_link:
-                    continue
-                g_url = g_link if g_link.startswith('http') else (BASE_PRESENSI + g_link.lstrip('/'))
-                g_html = fetch_html_sync(g_url)
-                gsoup = BeautifulSoup(g_html, 'html.parser')
-                title = gsoup.title.get_text(strip=True) if gsoup.title else 'Jadwal'
-                for idx, tbl in enumerate(gsoup.select('table[id^="table_"], table')):
-                    process_table(tbl, jurusan_id=title.replace(' ', '_'), jurusan_name=title, idx=idx)
-            except Exception as inner:
-                print(f"[Jadwal] Error processing crawled page {url}: {inner}")
-    except Exception as e:
-        print(f"[Jadwal] Error during parallel crawl: {e}")
-    # 3) Upload all buffers with dedup
-    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
-    for jurusan_id, buffer in list(file_buffers.items()):
-        filename = f"{jurusan_id}_{ts}.txt"
-        content = buffer.getvalue()
-        try:
-            result = upload_if_changed(supabase, bucket, filename, content)
-            status = result.get('result')
-            if status == 'uploaded':
                 print(f"✅ Successfully uploaded {filename}")
-            elif status == 'skipped':
                 print(f"⏭️ Skipped upload for {filename} (content unchanged)")
             else:
                 print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
-        except Exception as e:
-            print(f"❌ Error uploading {filename}: {e}")
-        finally:
             buffer.close()
 if __name__ == "__main__":
-    run_parallel()

+import scrapy
+from scrapy.crawler import CrawlerProcess
 import os
 import re
 from datetime import datetime
 from supabase import create_client
 from io import StringIO
+class PnpSpider(scrapy.Spider):
+    name = 'pnp_spider'
+    allowed_domains = ['presensi.pnp.ac.id', 'elektro.pnp.ac.id']
+    start_urls = [
+        'https://presensi.pnp.ac.id/',
+        'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
+    ]
+    excluded_departments = ['elektronika', 'telkom', 'listrik']
+    def __init__(self, *args, **kwargs):
+        """Set up the Supabase client and the per-jurusan output buffers.
+        Connection settings are read from the NEXT_PUBLIC_SUPABASE_URL,
+        SUPABASE_SERVICE_KEY and NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET
+        environment variables.
+        """
+        super().__init__(*args, **kwargs)
+        env = os.environ
+        self.supabase = create_client(
+            env.get("NEXT_PUBLIC_SUPABASE_URL"),
+            env.get("SUPABASE_SERVICE_KEY"),
+        )
+        self.storage_bucket = env.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
+        # StringIO buffers keyed by jurusan_id; closed() uploads one file per key.
+        self.file_buffers = {}
+        # Date on which this spider instance was created, as YYYY-MM-DD.
+        self.current_date = datetime.now().strftime("%Y-%m-%d")
+    def closed(self, reason):
+        print(f"Spider closing with reason: {reason}")
+        print(f"Uploading {len(self.file_buffers)} files to Supabase...")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        for jurusan_id, buffer in self.file_buffers.items():
+            filename = f"{jurusan_id}_{timestamp}.txt"
+            content = buffer.getvalue()
+            print(f"Uploading {filename} with content length: {len(content)}")
+            result = self.upload_to_supabase(jurusan_id, filename, content)
+            if result.get('result') == 'uploaded':
                 print(f"✅ Successfully uploaded {filename}")
+            elif result.get('result') == 'skipped':
                 print(f"⏭️ Skipped upload for {filename} (content unchanged)")
             else:
                 print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
             buffer.close()
+    def upload_to_supabase(self, jurusan_id, filename, content):
+        """Upload content to Supabase Storage with deduplication by content.
+        It compares the new content with the most recent existing file for the same jurusan_id
+        (files named like f"{jurusan_id}_YYYYMMDD_HHMMSS.txt"). If identical, skip upload.
+        Returns dict: {'result': 'uploaded'|'skipped'|'error', 'error': Optional[str]}
+        """
+        try:
+            # 1) Try to find the latest existing file for this jurusan_id
+            latest_name = self._get_latest_existing_filename(jurusan_id)
+            if latest_name:
+                try:
+                    existing_bytes = self.supabase.storage.from_(self.storage_bucket).download(latest_name)
+                    existing_content = existing_bytes.decode('utf-8') if isinstance(existing_bytes, (bytes, bytearray)) else str(existing_bytes)
+                    if existing_content == content:
+                        return {"result": "skipped"}
+                except Exception as inner_e:
+                    # If download fails, proceed to upload as fallback, but log
+                    print(f"Warning: failed to download existing file '{latest_name}' for comparison: {inner_e}")
+            # 2) Upload new content
+            self.supabase.storage.from_(self.storage_bucket).upload(
+                path=filename,
+                file=content.encode('utf-8'),
+                file_options={"content-type": "text/plain"}
+            )
+            return {"result": "uploaded"}
+        except Exception as e:
+            return {"result": "error", "error": str(e)}
+    def _get_latest_existing_filename(self, jurusan_id):
+        """Return the latest existing filename in the bucket for a given jurusan_id or None.
+        It expects files following the pattern: f"{jurusan_id}_YYYYMMDD_HHMMSS.txt"
+        """
+        try:
+            # List files at the root of the bucket
+            files = self.supabase.storage.from_(self.storage_bucket).list()
+            if not files:
+                return None
+            # files could be list of dicts with 'name' key depending on supabase-py version
+            names = []
+            for f in files:
+                try:
+                    name = f.get('name') if isinstance(f, dict) else getattr(f, 'name', None)
+                except Exception:
+                    name = None
+                if not name:
+                    continue
+                names.append(name)
+            # Filter by jurusan_id prefix and timestamp pattern
+            pattern = re.compile(rf"^{re.escape(jurusan_id)}_\d{{8}}_\d{{6}}\.txt$")
+            matched = [n for n in names if pattern.match(n)]
+            if not matched:
+                return None
+            # Sort by timestamp extracted from filename
+            def extract_ts(name: str):
+                m = re.search(r"_(\d{8}_\d{6})\.txt$", name)
+                return m.group(1) if m else "00000000_000000"
+            matched.sort(key=extract_ts, reverse=True)
+            return matched[0]
+        except Exception as e:
+            print(f"Warning: could not list existing files for comparison: {e}")
+            return None
+    def parse(self, response):
+        if 'elektro.pnp.ac.id' in response.url:
+            jurusan_id = 'teknik_elektro'
+            jurusan_name = 'Jurusan Teknik Elektro'
+            return self.parse_elektro_page(response, jurusan_id, jurusan_name)
+        print("Memulai scraping dari halaman utama...")
+        jurusan_links = set(response.xpath('//article[contains(@class, "section")]//a/@href').getall())
+        for link in jurusan_links:
+            if any(excluded in link.lower() for excluded in self.excluded_departments):
+                continue
+            jurusan_url = response.urljoin(link)
+            jurusan_id = self.extract_jurusan_id(link)
+            yield scrapy.Request(jurusan_url,
+                               callback=self.parse_jurusan,
+                               meta={'jurusan_id': jurusan_id})
+    def parse_elektro_page(self, response, jurusan_id, jurusan_name):
+        if jurusan_id not in self.file_buffers:
+            self.initialize_document_buffer(jurusan_id, jurusan_name)
+        output_buffer = self.file_buffers[jurusan_id]
+        tables = response.xpath('//table')
+        if not tables:
+            return
+        for table_idx, table in enumerate(tables):
+            caption_text = self.get_table_caption(table, table_idx)
+            class_info = self.clean_class_info(caption_text, table)
+            if not class_info:
+                continue
+            self.write_section_header(output_buffer, class_info)
+            days = table.xpath('.//thead//th[@class="xAxis"]/text()').getall() or \
+                   table.xpath('.//thead//th[contains(@class, "xAxis")]/text()').getall()
+            time_slots = table.xpath('.//tbody//th[@class="yAxis"]/text()').getall() or \
+                         table.xpath('.//tbody//th[contains(@class, "yAxis")]/text()').getall()
+            if not days or not time_slots:
+                continue
+            schedule_grid = self.build_schedule_grid(days, time_slots)
+            self.process_table_rows(table, schedule_grid, days, time_slots)
+            self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
+    def initialize_document_buffer(self, jurusan_id, jurusan_name):
+        """Initialize a new document with proper title and metadata"""
+        self.file_buffers[jurusan_id] = StringIO()
+        buffer = self.file_buffers[jurusan_id]
+        # Write document title and metadata
+        buffer.write(f"# Jadwal Perkuliahan {jurusan_name}\n\n")
+        buffer.write(f"**Jurusan:** {jurusan_name}\n")
+        buffer.write(f"**Tanggal Update:** {self.current_date}\n")
+        buffer.write(f"**Sumber:** Politeknik Negeri Padang\n\n")
+        buffer.write("---\n\n")
+    def get_table_caption(self, table, table_idx):
+        """Extract and clean table caption text"""
+        caption = table.xpath('.//caption//text()').getall()
+        caption_text = ' '.join(caption).strip()
+        if not caption_text:
+            caption_text = table.xpath('preceding::h2[1]//text()|preceding::h3[1]//text()|preceding::h4[1]//text()').get()
+            caption_text = caption_text.strip() if caption_text else f"Jadwal Kelas {table_idx + 1}"
+        return caption_text
+    def clean_class_info(self, caption_text, table):
+        """Combine and clean class information"""
+        thead_class_info = ' '.join(table.xpath('.//thead/tr[1]//text()').getall()).strip()
+        class_info = f"{caption_text} {thead_class_info}" if thead_class_info else caption_text
+        return re.sub(r'\s+', ' ', class_info).strip()
+    def write_section_header(self, buffer, class_info):
+        """Write a section header for each class schedule"""
+        buffer.write(f"## Jadwal Perkuliahan {class_info}\n\n")
+        buffer.write("Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n")
+    def build_schedule_grid(self, days, time_slots):
+        """Initialize the schedule grid structure"""
+        return {day: {time: 'kosong' for time in time_slots} for day in days}
+    def process_table_rows(self, table, schedule_grid, days, time_slots):
+        """Process table rows respecting rowspans and colspans"""
+        rows = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]')
+        active_rowspans = {}
+        for row_idx, row in enumerate(rows):
+            if row_idx >= len(time_slots):
+                continue
+            current_time = time_slots[row_idx]
+            filled_columns = set()
+            # Apply active rowspans
+            self.apply_active_rowspans(active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx)
+            # Process current row cells
+            cells = row.xpath('./td')
+            col_idx = 0
+            for cell in cells:
+                while col_idx < len(days) and col_idx in filled_columns:
+                    col_idx += 1
+                if col_idx >= len(days):
+                    break
+                cell_content = self.process_cell_content(cell)
+                rowspan = int(cell.xpath('./@rowspan').get() or 1)
+                colspan = int(cell.xpath('./@colspan').get() or 1)
+                self.update_schedule_grid(schedule_grid, days, current_time, col_idx, colspan, cell_content)
+                self.update_active_rowspans(active_rowspans, row_idx, col_idx, colspan, rowspan, cell_content)
+                col_idx += colspan
+    def apply_active_rowspans(self, active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx):
+        """Apply content from cells with rowspan to current row"""
+        rowspans_to_remove = []
+        for (rs_col_idx, rs_row_start_idx), (rowspan_left, content) in active_rowspans.items():
+            if rowspan_left > 0 and rs_col_idx < len(days):
+                day = days[rs_col_idx]
+                schedule_grid[day][current_time] = content
+                filled_columns.add(rs_col_idx)
+                active_rowspans[(rs_col_idx, rs_row_start_idx)] = (rowspan_left - 1, content)
+                if rowspan_left - 1 <= 0:
+                    rowspans_to_remove.append((rs_col_idx, rs_row_start_idx))
+        for key in rowspans_to_remove:
+            del active_rowspans[key]
+    def process_cell_content(self, cell):
+        """Extract and clean cell content"""
+        content = ' '.join(cell.xpath('.//text()').getall()).strip()
+        return 'kosong' if not content or content == '---' else content
+    def update_schedule_grid(self, schedule_grid, days, current_time, col_idx, colspan, content):
+        """Update schedule grid with cell content"""
+        for c in range(colspan):
+            current_col_idx = col_idx + c
+            if current_col_idx < len(days):
+                schedule_grid[days[current_col_idx]][current_time] = content
+    def update_active_rowspans(self, active_rowspans, row_idx, col_idx, colspan, rowspan, content):
+        """Track cells with rowspan for future rows"""
+        if rowspan > 1:
+            for c in range(colspan):
+                active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, content)
+    def format_course_entry(self, time_slots, course_info):
+        """Format a course entry for optimal RAG retrieval"""
+        # Parse course information
+        parts = course_info.split()
+        course_code = parts[0] if parts and len(parts[0]) == 7 and parts[0][:3].isalpha() and parts[0][3:].isdigit() else ""
+        course_name = ""
+        lecturer = ""
+        room = ""
+        # Extract course name, lecturer, and room
+        if "_" in course_info:
+            # Format: COURSE_CODE Course_Name_P Lecturer Room
+            course_parts = course_info.split("_P")
+            if len(course_parts) > 1:
+                course_name = course_parts[0].replace(course_code, "").strip()
+                remaining = course_parts[1].strip().split()
+                lecturer = " ".join(remaining[:-1])
+                room = remaining[-1] if remaining else ""
+        else:
+            # Alternative format
+            course_name = " ".join(parts[1:-2]) if len(parts) > 3 else course_info.replace(course_code, "").strip()
+            lecturer = parts[-2] if len(parts) > 1 else ""
+            room = parts[-1] if parts else ""
+        # Format time range
+        time_range = self.format_time_range(time_slots)
+        # Create structured information
+        return {
+            "time_range": time_range,
+            "course_code": course_code,
+            "course_name": course_name,
+            "lecturer": lecturer,
+            "room": room
+        }
+    def write_schedule_to_buffer(self, buffer, schedule_grid, days, time_slots):
+        for day in days:
+            current_course = None
+            current_times = []
+            day_schedule = []
+            for time_slot in time_slots:
+                course = schedule_grid[day][time_slot]
+                if course == current_course:
+                    current_times.append(time_slot)
+                else:
+                    if current_course and current_course.lower() != 'kosong':
+                        time_range = self.format_time_range(current_times)
+                        entry = f"- {day} {time_range} | {current_course}"
+                        day_schedule.append(entry)
+                    current_course = course
+                    current_times = [time_slot]
+            # Tambahkan entri terakhir
+            if current_course and current_course.lower() != 'kosong':
+                time_range = self.format_time_range(current_times)
+                entry = f"- {day} {time_range} | {current_course}"
+                day_schedule.append(entry)
+            # Tulis hasil ke buffer
+            for entry in day_schedule:
+                buffer.write(entry + "\n")
+            buffer.write("\n")  # spasi antar hari
+    def format_time_range(self, time_slots):
+        """Format multiple time slots into a readable range"""
+        if len(time_slots) == 1:
+            return time_slots[0]
+        first_start = time_slots[0].split('-')[0].strip()
+        last_end = time_slots[-1].split('-')[-1].strip()
+        return f"{first_start} - {last_end}"
+    def extract_jurusan_id(self, link):
+        match = re.search(r'department\?dep=(\d+)', link)
+        return match.group(1) if match else f"unknown_{hash(link) % 1000}"
+    def parse_jurusan(self, response):
+        jurusan_id = response.meta.get('jurusan_id')
+        jurusan_name = self.extract_title_jurusan_name(response)
+        groups_days_horizontal_link = response.xpath('//td/a[contains(@href, "groups_days_horizontal") and not(contains(@href, "subgroups_days_horizontal"))]/@href').get()
+        if groups_days_horizontal_link:
+            groups_days_horizontal_url = response.urljoin(groups_days_horizontal_link)
+            safe_jurusan_name = re.sub(r'[^\w\-_\. ]', '_', jurusan_name)
+            yield scrapy.Request(groups_days_horizontal_url,
+                            callback=self.parse_jadwal,
+                            meta={'jurusan_id': safe_jurusan_name, 'jurusan_name': jurusan_name})
+    def parse_jadwal(self, response):
+        jurusan_id = response.meta.get('jurusan_id')
+        jurusan_name = response.meta.get('jurusan_name')
+        if jurusan_id not in self.file_buffers:
+            self.initialize_document_buffer(jurusan_id, jurusan_name)
+        output_buffer = self.file_buffers[jurusan_id]
+        tables = response.xpath('//table[contains(@id, "table_")]') or response.xpath('//table')
+        for table in tables:
+            caption_text = self.get_table_caption(table, 0)
+            class_info = self.clean_class_info(caption_text, table)
+            if not class_info:
+                continue
+            self.write_section_header(output_buffer, class_info)
+            days = table.xpath('.//thead//th[@class="xAxis"]/text()').getall()
+            time_slots = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]/th[@class="yAxis"]/text()').getall()
+            if not days or not time_slots:
+                continue
+            schedule_grid = self.build_schedule_grid(days, time_slots)
+            self.process_table_rows(table, schedule_grid, days, time_slots)
+            self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
+    def extract_title_jurusan_name(self, response):
+        title = response.xpath('//title/text()').get()
+        return title.strip() if title else f"Jurusan_{response.meta.get('jurusan_id')}"
 if __name__ == "__main__":
+    # Crawl politely: one request at a time, one second apart, honouring robots.txt.
+    crawler_settings = {
+        'DOWNLOAD_DELAY': 1,
+        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'HTTPCACHE_ENABLED': False,
+        'CONCURRENT_REQUESTS': 1,
+        'RETRY_TIMES': 3
+    }
+    crawler = CrawlerProcess(settings=crawler_settings)
+    crawler.crawl(PnpSpider)
+    # start() blocks until the crawl has finished.
+    crawler.start()

scrapping/jurusan_scrap.py CHANGED Viewed

@@ -1,130 +1,326 @@
-import os
-import re
-from datetime import datetime
-from typing import Dict, List
 from bs4 import BeautifulSoup
 from supabase import create_client
-# Crawl4AI helper for rendered fetching
-try:
-    from utils.crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
-except Exception:
-    import sys as _sys
-    import os as _os
-    _sys.path.append(_os.path.join(_os.path.dirname(__file__), 'utils'))
-    from crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
-# Shared dedup upload utility
 try:
     from utils.supabase_utils import upload_if_changed
 except Exception:
-    import sys as _sys2
-    import os as _os2
-    _sys2.path.append(_os2.path.join(_os2.path.dirname(__file__), 'utils'))
     from supabase_utils import upload_if_changed
-DOMAIN_TO_NAME: Dict[str, str] = {
-    'akt.pnp.ac.id': 'Akuntansi',
-    'an.pnp.ac.id': 'Administrasi_Niaga',
-    'bing.pnp.ac.id': 'Bahasa_Inggris',
-    'elektro.pnp.ac.id': 'Teknik_Elektro',
-    'me.pnp.ac.id': 'Teknik_Mesin',
-    'sipil.pnp.ac.id': 'Teknik_Sipil',
-    'ti.pnp.ac.id': 'Teknologi_Informasi',
-}
-START_URLS: List[str] = [f"https://{d}/" for d in DOMAIN_TO_NAME.keys()]
-PRODI_PATTERN = re.compile(r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b', re.I)
-def is_valid_prodi(nama: str) -> bool:
-    return bool(PRODI_PATTERN.match(nama or ""))
-def extract_prodi_from_html(html: str) -> List[str]:
-    soup = BeautifulSoup(html, 'html.parser')
-    found: List[str] = []
-    for a in soup.find_all('a'):
-        txt = a.get_text(strip=True)
-        if txt and is_valid_prodi(txt) and txt not in found:
-            found.append(txt)
-    return found
-def build_rekap_text(rekap: Dict[str, List[str]]) -> str:
-    lines: List[str] = []
-    lines.append("# REKAP PROGRAM STUDI PNP\n")
-    lines.append(f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
-    total_prodi = 0
-    jumlah_jurusan = 0
-    for jurusan_key, daftar in rekap.items():
-        valid = [p.strip() for p in daftar if is_valid_prodi(p)]
-        if not valid:
-            continue
-        jur_baca = jurusan_key.replace('_', ' ')
-        lines.append(f"{jur_baca}:\n")
-        for p in sorted(set(valid)):
-            lines.append(f"- {p}\n")
-        jumlah = len(valid)
-        lines.append(f"Jumlah program studi jurusan {jur_baca}: {jumlah}\n\n")
-        total_prodi += jumlah
-        jumlah_jurusan += 1
-    lines.append(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
-    lines.append(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
-    return ''.join(lines)
-if __name__ == '__main__':
-    # Supabase client
-    supabase = create_client(
-        os.environ.get('NEXT_PUBLIC_SUPABASE_URL'),
-        os.environ.get('SUPABASE_SERVICE_KEY'),
-    )
-    bucket = os.environ.get('NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET', 'pnp-bot-storage')
-    # Crawl each jurusan domain in parallel batches, collect prodi across important URLs
-    rekap_prodi: Dict[str, List[str]] = {}
-    for url in START_URLS:
         try:
-            domain = url.split('//')[1].strip('/').lower()
-            jurusan = DOMAIN_TO_NAME.get(domain, domain)
-            pages = crawl_domain_parallel_sync(
-                seed_url=url,
-                max_pages=30,
-                max_concurrency=6,
-                only_important=True,
-                timeout=30,
-                headless=True,
-            )
-            prodi_set = set()
-            for page_url, html in pages.items():
-                if not html:
-                    continue
-                for p in extract_prodi_from_html(html):
-                    prodi_set.add(p)
-            prodi_list = sorted(prodi_set)
-            rekap_prodi[jurusan] = prodi_list
-            print(f"[Jurusan] {jurusan}: {len(prodi_list)} prodi ditemukan dari {len(pages)} halaman penting")
         except Exception as e:
-            print(f"[Jurusan] Gagal fetch {url}: {e}")
-    # Build single REKAP file and upload with dedup
-    timestamp = datetime.now().strftime('%Y%m%d_%H%M')
-    rekap_filename = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
-    rekap_text = build_rekap_text(rekap_prodi)
-    try:
-        result = upload_if_changed(supabase, bucket, rekap_filename, rekap_text)
-        status = result.get('result')
-        if status == 'uploaded':
-            print(f"✅ Uploaded rekap: {rekap_filename}")
-        elif status == 'skipped':
-            print(f"⏭️ Skipped upload (unchanged): {rekap_filename}")
-        else:
-            print(f"❌ Upload error for {rekap_filename}: {result.get('error')}")
-    except Exception as e:
-        print(f"❌ Error uploading rekap: {e}")
-    # End of minimal Crawl4AI rekap script

+import scrapy
+from scrapy.crawler import CrawlerProcess
 from bs4 import BeautifulSoup
+from dotenv import load_dotenv
 from supabase import create_client
+from datetime import datetime
+import os, re, tempfile
+import sys
+# Try import shared dedup upload utility
 try:
     from utils.supabase_utils import upload_if_changed
 except Exception:
+    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
     from supabase_utils import upload_if_changed
+# python-dotenv: read a local .env file (if present) into the environment
+# before the settings below are looked up.
+load_dotenv()
+# Supabase connection settings taken from the environment; URL and key are
+# None when the variables are unset.
+SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
+SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
+# Storage bucket name; falls back to "pnp-bot-storage" when the variable is unset.
+SUPABASE_BUCKET = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")
+# Compiled once at import time: a study-programme name must start with a degree
+# marker such as "D3", "D-4", "Diploma 3", "Magister" or "Sarjana Terapan".
+_PRODI_PATTERN = re.compile(
+    r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b',
+    re.I
+)
+def is_valid_prodi(nama):
+    """Return True when nama starts with a recognised study-programme prefix.
+    None and the empty string are treated as "not a programme" instead of raising.
+    """
+    return bool(_PRODI_PATTERN.match(nama or ""))
+class JurusanSpider(scrapy.Spider):
+    name = "jurusan"
+    custom_settings = {
+        'DOWNLOAD_DELAY': 1,
+        'USER_AGENT': "PNPBot/1.2",
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'CONCURRENT_REQUESTS': 1,
+        'RETRY_TIMES': 3
+    }
+    domain_to_name = {
+        'akt.pnp.ac.id': 'Akuntansi',
+        'an.pnp.ac.id': 'Administrasi_Niaga',
+        'bing.pnp.ac.id': 'Bahasa_Inggris',
+        'elektro.pnp.ac.id': 'Teknik_Elektro',
+        'me.pnp.ac.id': 'Teknik_Mesin',
+        'sipil.pnp.ac.id': 'Teknik_Sipil',
+        'ti.pnp.ac.id': 'Teknologi_Informasi',
+    }
+    start_urls = [f"https://{d}/" for d in domain_to_name.keys()]
+    def __init__(self, *args, **kwargs):
+        """Set up the Supabase client and the per-run result containers.
+        Positional and keyword arguments are forwarded to scrapy.Spider so the
+        base class is initialised and spider arguments are accepted.
+        """
+        super().__init__(*args, **kwargs)
+        self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
+        self.bucket = SUPABASE_BUCKET
+        # One timestamp per run, used in the names of the uploaded files.
+        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
+        self.per_jurusan_pages = {}  # jurusan -> list of {"url", "title", "content"} records
+        self.rekap_prodi = {}  # jurusan -> list of study-programme names found on its home page
+    def parse(self, response):
+        """Parse a department home page.
+        Collects the study-programme names found in link texts, requests each
+        programme page, and then requests every same-site link so parse_detail
+        can archive the page content.
+        """
+        from urllib.parse import urlparse
+        domain = urlparse(response.url).netloc
+        jurusan = self.domain_to_name.get(domain, domain)
+        soup = BeautifulSoup(response.text, "html.parser")
+        site_host = domain.lower()
+        def is_same_site(link_url):
+            # Compare host names rather than searching the raw URL text, so a
+            # foreign URL that merely mentions the domain is not followed.
+            host = urlparse(link_url).netloc.lower()
+            return host == site_host or host.endswith("." + site_host)
+        program_studi = []
+        # Every <a> whose text looks like a study programme (D3, D4, ...).
+        for a_tag in soup.find_all("a"):
+            item = a_tag.get_text(strip=True)
+            href = a_tag.get("href")
+            if item and is_valid_prodi(item) and item not in program_studi:
+                program_studi.append(item)
+                if href:
+                    prodi_url = response.urljoin(href)
+                    self.logger.info(f"[🧩] Ditemukan prodi: {item} ({prodi_url}) di jurusan {jurusan}")
+                    yield scrapy.Request(prodi_url, callback=self.parse_detail, meta={"jurusan": jurusan, "url": prodi_url})
+        # Keep the programme list for the recap.
+        self.rekap_prodi[jurusan] = program_studi
+        # Also follow the site's own links as a backup crawl.
+        for a in soup.find_all("a", href=True):
+            href = a["href"]
+            if href.startswith("http"):
+                target = href
+            elif href.startswith("/"):
+                # Covers root-relative and protocol-relative ("//host/...") links.
+                target = response.urljoin(href)
+            else:
+                continue
+            if is_same_site(target):
+                yield scrapy.Request(target, callback=self.parse_detail, meta={"jurusan": jurusan, "url": target})
+    def parse_detail(self, response):
+        """Turn one crawled page into a text record stored in per_jurusan_pages.
+        Two TI pages (department leadership and lecturer gallery) get dedicated
+        extraction; every other page is reduced to its headings, paragraphs,
+        list items and tables taken from the main content area.
+        NOTE(review): the continuation lines of the triple-quoted templates start
+        with four spaces that end up in the stored text; kept as-is here.
+        """
+        jurusan = response.meta["jurusan"]
+        url = response.meta["url"]
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Choose the candidate container with the most text as the main content area.
+        candidates = soup.select(
+            "main, article, #content, #primary, .site-content, .entry-content, .post-content, .content, .page-content, .container main, .elementor-section.elementor-top-section, .elementor-container, .elementor-widget-theme-post-content"
+        )
+        def text_len(el):
+            try:
+                return len(el.get_text(" ", strip=True))
+            except Exception:
+                return 0
+        main_area = max(candidates, key=text_len) if candidates else soup.body or soup
+        # Strip navigation, chrome, scripts and advertising from the main area.
+        blacklist_selectors = [
+            'header', 'footer', 'nav', 'aside', 'menu', 'form',
+            '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
+            '.site-header', '.site-footer', '#site-header', '#colophon', '.widget', '.widget-area',
+            '.breadcrumbs', '.pagination', '.navigation', '.page-links',
+            'script', 'style', 'noscript', 'iframe',
+            '.social-links', '.share-buttons', '.newsletter',
+            '.ad-container', '.ads', '.advert', '[role="navigation"]', '[aria-label*="breadcrumb" i]'
+        ]
+        for selector in blacklist_selectors:
+            for tag in main_area.select(selector):
+                tag.decompose()
+        # Drop leftover elements that have neither text nor children.
+        for element in list(main_area.find_all(True)):
+            if element.name == "img":
+                # Images are empty by nature; the lecturer gallery below still
+                # needs their src attribute.
+                continue
+            if not element.get_text(strip=True) and not element.find_all(True):
+                element.decompose()
+        title_tag = main_area.find("h1") or soup.find("title")
+        page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"
+        # ==== SPECIAL CASE: TI department leadership page ====
+        if url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
+            leadership_data = {
+                "Pimpinan Jurusan": [],
+                "Koordinator Program Studi": [],
+                "Kepala Labor": []
+            }
+            for member in soup.find_all(class_="member-item"):
+                name_tag = member.find(class_="item-title")
+                name = name_tag.get_text(strip=True) if name_tag else "N/A"
+                position_tag = member.find(class_="small-text")
+                position = position_tag.get_text(strip=True) if position_tag else "N/A"
+                # Sort each person into a group by keywords in the position title.
+                if "Ketua Jurusan" in position or "Sekretaris Jurusan" in position:
+                    leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
+                elif "Koordinator Program Studi" in position or "Koordinator PSDKU" in position:
+                    leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
+                elif "Kepala Labor" in position:
+                    leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})
+            naratif = []
+            naratif.append("## Pimpinan Jurusan")
+            for leader in leadership_data["Pimpinan Jurusan"]:
+                naratif.append(f"- {leader['jabatan']}: {leader['nama']}")
+            naratif.append("\n## Koordinator Program Studi")
+            for coordinator in leadership_data["Koordinator Program Studi"]:
+                naratif.append(f"- {coordinator['jabatan']}: {coordinator['nama']}")
+            naratif.append("\n## Kepala Labor")
+            for lab_head in leadership_data["Kepala Labor"]:
+                naratif.append(f"- {lab_head['jabatan']}: {lab_head['nama']}")
+            content_text = f"""# Pimpinan Jurusan Teknologi Informasi
+    URL: {url}
+    Jurusan: Teknologi Informasi
+    Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
+    """ + "\n".join(naratif)
+            self.per_jurusan_pages.setdefault(jurusan, []).append({
+                "url": url,
+                "title": "Pimpinan Jurusan Teknologi Informasi",
+                "content": content_text
+            })
+            return
+        # ==== SPECIAL CASE: TI lecturer gallery page ====
+        if url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
+            dosen_data = []
+            gallery = soup.find('div', class_='gallery')
+            if gallery:
+                for item in gallery.find_all('dl', class_='gallery-item'):
+                    caption = item.find('dd', class_='wp-caption-text')
+                    nama_gelar = caption.get_text(strip=True) if caption else ""
+                    link_tag = item.find('a')
+                    link = link_tag['href'] if link_tag and link_tag.has_attr('href') else ""
+                    img_tag = item.find('img')
+                    foto = img_tag['src'] if img_tag and img_tag.has_attr('src') else ""
+                    dosen_data.append({
+                        "nama_gelar": nama_gelar,
+                        "link_profil": link,
+                        "foto": foto
+                    })
+            content_text = f"""# Daftar Dosen Staf Pengajar Jurusan Teknologi Informasi
+    URL: {url}
+    Jurusan: Teknologi Informasi
+    Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
+    Jumlah Dosen: {len(dosen_data)}
+    ## Daftar Dosen:
+    """
+            for idx, dosen in enumerate(dosen_data, 1):
+                content_text += f"\n### {idx}. {dosen['nama_gelar']}"
+                if dosen['link_profil']:
+                    content_text += f"\n- Link Profil: {dosen['link_profil']}"
+                if dosen['foto']:
+                    content_text += f"\n- Foto: {dosen['foto']}"
+                content_text += "\n"
+            self.per_jurusan_pages.setdefault(jurusan, []).append({
+                "url": url,
+                "title": "Daftar Dosen Staf Pengajar Jurusan Teknologi Informasi",
+                "content": content_text
+            })
+            return
+        # ==== STANDARD PARSING ====
+        body_text = []
+        for p in main_area.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
+            txt = p.get_text(strip=True)
+            if txt:
+                body_text.append(txt)
+        content_text = f"""# {page_title}
+    URL: {url}
+    Jurusan: {jurusan.replace('_', ' ')}
+    Tanggal Akses: {datetime.now().strftime('%d %B %Y %H:%M')}
+    """ + "\n\n".join(body_text)
+        # Append every table of the main area, one " | "-separated line per row.
+        for i, table in enumerate(main_area.find_all("table")):
+            content_text += f"\n\nTabel {i+1}\n\n"
+            for row in table.find_all("tr"):
+                cols = row.find_all(["td", "th"])
+                row_data = [col.get_text(strip=True) for col in cols]
+                content_text += " | ".join(row_data) + "\n"
+        self.per_jurusan_pages.setdefault(jurusan, []).append({
+            "url": url,
+            "title": page_title,
+            "content": content_text
+        })
+    def closed(self, reason):
+        """Upload one text file per jurusan plus the program-studi recap when the spider closes.
+
+        Each document is written to a temporary file, read back, and handed to
+        ``upload_if_changed``. Failures are logged and never re-raised, and the
+        temporary file is always removed afterwards.
+
+        Args:
+            reason: Close reason supplied by the caller; not used here.
+        """
+        # Save one file per jurusan
+        for jurusan, pages in self.per_jurusan_pages.items():
+            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
+            # Reset on every iteration so the cleanup below never sees an
+            # unbound name or a stale path left over from the previous jurusan.
+            temp_path = None
+            try:
+                with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                    # Record the path first: with delete=False we own the
+                    # cleanup even when one of the writes below fails.
+                    temp_path = f.name
+                    for page in pages:
+                        f.write(page["content"] + "\n\n---\n\n")
+                # Read content back to ensure consistent comparison behavior
+                with open(temp_path, 'r', encoding='utf-8') as rf:
+                    content_text = rf.read()
+                result = upload_if_changed(self.supabase, self.bucket, filename, content_text)
+                if result.get('result') == 'uploaded':
+                    self.logger.info(f"✅ Uploaded file jurusan: {filename}")
+                elif result.get('result') == 'skipped':
+                    self.logger.info(f"⏭️ Skipped upload for {filename} (content unchanged)")
+                else:
+                    self.logger.error(f"❌ Gagal upload {filename}: {result.get('error')}")
+            except Exception as e:
+                self.logger.error(f"❌ Gagal upload {filename}: {e}")
+            finally:
+                if temp_path and os.path.exists(temp_path):
+                    os.remove(temp_path)
+        # Program-studi recap
+        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
+        temp_path = None
         try:
+            with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False, suffix=".txt") as f:
+                temp_path = f.name
+                f.write(f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
+                total_prodi = 0
+                jumlah_jurusan = 0
+                for jurusan, daftar in self.rekap_prodi.items():
+                    valid_prodi = [p.strip() for p in daftar if is_valid_prodi(p)]
+                    if not valid_prodi:
+                        continue
+                    jurusan_baca = jurusan.replace("_", " ")
+                    f.write(f"{jurusan_baca}:\n")
+                    unique_prodi = sorted(set(valid_prodi))
+                    for p in unique_prodi:
+                        f.write(f"- {p}\n")
+                    # Count the de-duplicated list so the reported number
+                    # matches the entries that were actually written above.
+                    jumlah_prodi = len(unique_prodi)
+                    f.write(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")
+                    total_prodi += jumlah_prodi
+                    jumlah_jurusan += 1
+                f.write(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
+                f.write(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
+            # Read content then use dedup upload
+            with open(temp_path, 'r', encoding='utf-8') as rf:
+                rekap_text = rf.read()
+            result = upload_if_changed(self.supabase, self.bucket, rekap_filename, rekap_text)
+            if result.get('result') == 'uploaded':
+                self.logger.info(f"✅ Uploaded file rekap: {rekap_filename}")
+            elif result.get('result') == 'skipped':
+                self.logger.info(f"⏭️ Skipped upload for rekap {rekap_filename} (content unchanged)")
+            else:
+                self.logger.error(f"❌ Gagal upload rekap {rekap_filename}: {result.get('error')}")
         except Exception as e:
+            self.logger.error(f"❌ Gagal upload rekap: {e}")
+        finally:
+            if temp_path and os.path.exists(temp_path):
+                os.remove(temp_path)
+if __name__ == "__main__":
+    # Script entry point: run JurusanSpider in-process with default settings.
+    crawler_process = CrawlerProcess()
+    crawler_process.crawl(JurusanSpider)
+    crawler_process.start()

scrapping/pnp_scrap.py CHANGED Viewed

@@ -1,18 +1,10 @@
 from datetime import datetime
 import re
 import os
 from supabase import create_client, Client
 import html
-from typing import List
-from urllib.parse import urljoin
-from bs4 import BeautifulSoup
-try:
-    from utils.crawl4ai_utils import crawl_domain_parallel_sync
-except Exception:
-    import sys
-    sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
-    from crawl4ai_utils import crawl_domain_parallel_sync
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
@@ -28,103 +20,428 @@ except Exception:
     from supabase_utils import upload_if_changed
-if __name__ == '__main__':
-    # Crawl4AI-based lightweight runner to fetch and upload core pages
-    START_URLS = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']
-    def _clean_text(text: str) -> str:
         if not text:
-            return ''
-        t = html.unescape(' '.join(text.split()))
-        t = t.replace('â€œ', '"').replace('â€', '"').replace('â€™', "'")
-        t = t.replace('â€"', '—').replace('â€"', '–')
-        return t.strip()
-    def _extract_paragraphs(html_text: str, base_url: str) -> List[str]:
-        soup = BeautifulSoup(html_text, 'html.parser')
-        selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
             'div.content', 'div.main-content', 'div#content', 'div.page-content'
         ]
-        content_area = None
-        for sel in selectors:
-            content_area = soup.select_one(sel)
             if content_area:
-                break
-        nodes = content_area.select('p, h1, h2, h3, h4, h5, h6, li') if content_area else soup.select('p, h1, h2, h3, h4, h5, h6, li')
-        out: List[str] = []
-        for node in nodes:
-            text = _clean_text(node.get_text(' ', strip=True))
-            if text and len(text.split()) >= 5:
-                for a in node.find_all('a', href=True):
-                    href = a['href']
-                    if href and not href.startswith('#'):
-                        abs_url = href if href.startswith('http') else urljoin(base_url, href)
-                        text += f" (Link: {abs_url})"
-                out.append(text)
-        return out
-    def _extract_tables(html_text: str, base_url: str) -> str:
-        soup = BeautifulSoup(html_text, 'html.parser')
-        blocks: List[str] = []
-        for ti, table in enumerate(soup.select('table')):
-            rows = []
-            for tr in table.select('tr'):
-                cells = []
-                for c in tr.select('th, td'):
-                    tx = _clean_text(c.get_text(' ', strip=True))
-                    a = c.find('a', href=True)
-                    if a and a['href']:
-                        href = a['href']
-                        abs_url = href if href.startswith('http') else urljoin(base_url, href)
-                        tx += f" (Link: {abs_url})"
-                    if tx:
-                        cells.append(tx)
-                if cells:
-                    rows.append(' | '.join(cells))
-            if rows:
-                blocks.append(f"### Tabel {ti + 1}\n\n" + "\n".join(rows))
-        return "\n\n".join(blocks)
-    def _final_md(title: str, url: str, paras: List[str], tables: str) -> str:
-        md = f"# {title}\n\n**Tanggal**: {datetime.now().strftime('%d %B %Y')}\n**URL**: {url}\n\n" + "\n".join(paras)
-        if tables:
-            md += "\n\n## Data Tabel\n\n" + tables
-        return md
-    def _upload(page_title: str, content_text: str) -> str:
         safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
         safe_title = re.sub(r'[-\s]+', '-', safe_title)
-        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
         filename = f"{safe_title}_{timestamp}.txt"
         try:
             result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
-            return filename if result.get('result') == 'uploaded' else f"skipped_{filename}"
         except Exception as e:
-            print(f"Upload error: {e}")
             return f"failed_{filename}"
-    for seed in START_URLS:
-        try:
-            pages = crawl_domain_parallel_sync(
-                seed_url=seed,
-                max_pages=40,
-                max_concurrency=6,
-                only_important=True,
-                timeout=30,
-                headless=True,
-            )
-            for page_url, html_text in pages.items():
-                if not html_text:
-                    continue
-                soup = BeautifulSoup(html_text, 'html.parser')
-                title_node = soup.select_one('h1.entry-title, h1.page-title')
-                page_title = title_node.get_text(strip=True) if title_node else (soup.title.string.strip() if soup.title and soup.title.string else 'Unknown Page')
-                paras = _extract_paragraphs(html_text, page_url)
-                tables = _extract_tables(html_text, page_url)
-                content = _final_md(page_title, page_url, paras, tables)
-                up = _upload(page_title, content)
-                print(f"[PNP crawl] {page_url} -> {up}")
-        except Exception as e:
-            print(f"[PNP crawl] Error processing seed {seed}: {e}")

+import scrapy
+from scrapy.crawler import CrawlerProcess
 from datetime import datetime
 import re
 import os
 from supabase import create_client, Client
 import html
 SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
 SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
     from supabase_utils import upload_if_changed
+class PNPContentSpider(scrapy.Spider):
+    """Spider that walks the navigation menus of the PNP sites and uploads each page as text."""
+    name = 'pnp_content_spider'
+    # Entry pages; their navigation menus are expanded in parse().
+    start_urls = ['https://www.pnp.ac.id','https://penerimaan.pnp.ac.id']
+    # Department sub-sites.
+    # NOTE(review): this list is not referenced by any method of this class;
+    # confirm whether it is meant to filter requests.
+    excluded_subdomains = [
+        'akt.pnp.ac.id',
+        'an.pnp.ac.id',
+        'bing.pnp.ac.id',
+        'elektro.pnp.ac.id',
+        'me.pnp.ac.id',
+        'sipil.pnp.ac.id',
+        'ti.pnp.ac.id'
+    ]
+    # Per-spider Scrapy settings: one request at a time with a 1 s delay.
+    # NOTE(review): the CrawlerProcess settings in the __main__ block set some
+    # of the same keys with different values; confirm which set wins.
+    custom_settings = {
+        'DOWNLOAD_DELAY': 1,
+        'RETRY_TIMES': 3,
+        'HTTPCACHE_ENABLED': False,
+        'ROBOTSTXT_OBEY': True,
+        'CONCURRENT_REQUESTS': 1,
+        'RETRY_ENABLED': True,
+        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
+        'LOG_LEVEL': 'INFO',
+    }
+    def clean_text(self, text: str) -> str:
+        """Return ``text`` with HTML entities decoded and whitespace collapsed.
+
+        Falsy input yields an empty string. Runs of whitespace become a single
+        space, a few mis-decoded punctuation sequences are replaced, and the
+        result is stripped.
+        """
         if not text:
+            return ""
+        # Decode HTML entities
+        text = html.unescape(text)
+        # Remove extra whitespace and normalize
+        text = ' '.join(text.split())
+        # Fix common encoding issues
+        # NOTE(review): the shorter 'â€' sequence is replaced before the longer
+        # sequences that start with it, so those later replacements may never
+        # match -- confirm the exact literals and their order.
+        text = text.replace('â€œ', '"').replace('â€', '"').replace('â€™', "'")
+        text = text.replace('â€"', '—').replace('â€"', '–')
+        return text.strip()
+    def format_paragraph(self, text: str) -> str:
+        """Return the leading sentences of ``text`` after cleaning.
+
+        Sentences are appended one by one; collection stops as soon as the
+        running word count lands between 50 and 150 inclusive. If it never
+        lands in that range, every sentence is kept.
+        """
+        cleaned = self.clean_text(text)
+        kept_sentences = []
+        running_words = 0
+        for piece in re.split(r'(?<=[.!?]) +', cleaned):
+            kept_sentences.append(piece)
+            running_words += len(piece.split())
+            if not (running_words < 50 or running_words > 150):
+                break
+        return ' '.join(kept_sentences).strip()
+    def parse(self, response):
+        """Walk the navigation menu and schedule each menu/submenu page for parse_content.
+
+        Links that are missing, start with '#', or whose absolute URL contains
+        "jurusan" are not requested. A top-level item whose link contains
+        "jurusan" is skipped together with its submenus.
+        """
+        self.logger.info(f"Processing main page: {response.url}")
+        label_css = 'a.wp-block-navigation-item__content span.wp-block-navigation-item__label::text'
+        anchor_text_css = 'a.wp-block-navigation-item__content::text'
+        anchor_href_css = 'a.wp-block-navigation-item__content::attr(href)'
+        for nav_entry in response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item'):
+            # Prefer the <span> label; fall back to the anchor's own text.
+            main_title = nav_entry.css(label_css).get() or nav_entry.css(anchor_text_css).get('').strip()
+            top_href = nav_entry.css(anchor_href_css).get()
+            if top_href and not top_href.startswith('#'):
+                top_url = response.urljoin(top_href)
+                if "jurusan" in top_url.lower():
+                    continue
+                yield scrapy.Request(top_url, callback=self.parse_content, meta={'page_title': main_title, 'menu_path': main_title})
+            for sub_entry in nav_entry.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item'):
+                submenu_title = sub_entry.css(label_css).get() or sub_entry.css(anchor_text_css).get('').strip()
+                sub_href = sub_entry.css(anchor_href_css).get()
+                if not sub_href or sub_href.startswith('#'):
+                    continue
+                sub_url = response.urljoin(sub_href)
+                if "jurusan" in sub_url.lower():
+                    continue
+                if main_title:
+                    menu_path = f"{main_title} > {submenu_title}"
+                else:
+                    menu_path = submenu_title
+                yield scrapy.Request(sub_url, callback=self.parse_content, meta={'page_title': submenu_title, 'menu_path': menu_path})
+    def extract_leadership_info(self, response):
+        """Extract leadership records from the leadership page.
+
+        Every ``<table>`` is read as a set of label/value rows, either
+        ``Label | : | Value`` or ``Label | Value``. When no table yields data,
+        content sections mentioning leadership keywords are scanned instead.
+
+        Args:
+            response: The page response to read.
+
+        Returns:
+            A list of dicts, one per table or matching section. Table records
+            map row labels to values (plus ``'Posisi'`` when a position title
+            is found); fallback records hold ``'description'`` and optionally
+            ``'Nama'`` / ``'NIDN'``.
+        """
+        self.logger.info("Extracting leadership information from special page")
+        leaders_data = []
+        # Select tables only. Also selecting 'tbody' would match each table a
+        # second time through its own body and duplicate every record.
+        tables = response.css('table')
+        if tables:
+            # Process each table
+            for table_idx, table in enumerate(tables):
+                self.logger.info(f"Processing table {table_idx + 1}")
+                rows = table.css('tr')
+                if not rows:
+                    continue
+                leader_info = {}
+                position_title = ""
+                # Look for position title (like "DIREKTUR")
+                title_elements = table.css('strong, .position-title, th')
+                for title_elem in title_elements:
+                    title_text = self.clean_text(' '.join(title_elem.css('*::text').getall()))
+                    if any(pos in title_text.upper() for pos in ['DIREKTUR', 'WAKIL DIREKTUR', 'KETUA', 'SEKRETARIS']):
+                        position_title = title_text
+                        break
+                # Extract key-value pairs from table rows
+                for row in rows:
+                    cells = row.css('td, th')
+                    if len(cells) >= 3:
+                        # Format: Label | : | Value (3 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        separator = self.clean_text(' '.join(cells[1].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[2].css('*::text').getall()))
+                        if key and value and separator == ":":
+                            leader_info[key] = value
+                    elif len(cells) == 2:
+                        # Format: Label | Value (2 columns)
+                        key = self.clean_text(' '.join(cells[0].css('*::text').getall()))
+                        value = self.clean_text(' '.join(cells[1].css('*::text').getall()))
+                        if key and value and key != value:
+                            # Drop colons from the label (e.g. "Nama:" -> "Nama")
+                            clean_key = key.replace(':', '').strip()
+                            leader_info[clean_key] = value
+                # Add position title if found
+                if position_title:
+                    leader_info['Posisi'] = position_title
+                # If we found structured data, add it
+                if leader_info:
+                    leaders_data.append(leader_info)
+                    self.logger.info(f"Extracted leader data: {list(leader_info.keys())}")
+        # Fallback: Extract from general content structure
+        if not leaders_data:
+            self.logger.info("No table data found, trying general content extraction")
+            # Look for profile sections
+            profile_sections = response.css('.wp-block-group, .entry-content > div, .profile-section')
+            for section in profile_sections:
+                section_text = self.clean_text(' '.join(section.css('*::text').getall()))
+                # Check if this section contains leadership info
+                if any(keyword in section_text.lower() for keyword in ['direktur', 'wakil direktur', 'dr.', 's.t.', 'm.kom', 'nidn']):
+                    # Try to extract structured info from the text
+                    leader_info = {'description': section_text}
+                    # Try to extract specific details using regex
+                    name_match = re.search(r'(Dr\.|Ir\.|Prof\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(S\.T\.|M\.Kom|M\.T\.|S\.E\.|M\.M\.)*', section_text)
+                    if name_match:
+                        leader_info['Nama'] = name_match.group(0).strip()
+                    nidn_match = re.search(r'NIDN[:\s]*(\d+)', section_text)
+                    if nidn_match:
+                        leader_info['NIDN'] = nidn_match.group(1)
+                    leaders_data.append(leader_info)
+        return leaders_data
+    def format_leadership_content(self, leaders_data):
+        """Turn leadership records into markdown sections.
+
+        A record holding only ``'description'`` becomes a plain section. Any
+        other dict becomes a heading plus a narrative sentence, followed by
+        the remaining fields as ``**key**: value`` lines. Non-dict entries are
+        ignored but still consume a section number.
+
+        Returns:
+            A list of stripped markdown strings, one per dict record.
+        """
+        narrated_keys = ['Posisi', 'Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi', 'description']
+        sections = []
+        for number, entry in enumerate(leaders_data, 1):
+            if not isinstance(entry, dict):
+                continue
+            if len(entry) == 1 and 'description' in entry:
+                # Simple description format
+                sections.append(f"## Pimpinan {number}\n\n{entry['description']}".strip())
+                continue
+            role = entry.get("Posisi", "")
+            person = entry.get("Nama", "")
+            academic_rank = entry.get("Jabatan Akademik", "")
+            department = entry.get("Jurusan", "")
+            study_program = entry.get("Program Studi", "")
+            lecturer_id = entry.get("NIDN", "")
+            # Heading and opening sentence depend on which of role/name are known.
+            if role and person:
+                heading = f"## {role}\n\n"
+                story = f"{role} Politeknik Negeri Padang adalah {person}."
+            elif person:
+                heading = f"## Pimpinan {number}\n\n"
+                story = f"Pimpinan ini adalah {person}."
+            else:
+                heading = f"## Pimpinan {number}\n\n"
+                story = "Informasi pimpinan:"
+            if academic_rank:
+                story += f" Secara akademik, beliau menjabat sebagai {academic_rank}."
+            if department:
+                story += f" Beliau berasal dari Jurusan {department}."
+            if study_program:
+                story += f" Program studi yang diampu adalah {study_program}."
+            if lecturer_id:
+                story += f" NIDN beliau adalah {lecturer_id}."
+            chunks = [heading, story, "\n\n"]
+            # Fields not already covered by the narrative are listed verbatim.
+            chunks.extend(
+                f"**{label}**: {detail}\n\n"
+                for label, detail in entry.items()
+                if label not in narrated_keys
+            )
+            if 'description' in entry:
+                chunks.append(f"**Informasi Tambahan**: {entry['description']}\n\n")
+            sections.append("".join(chunks).strip())
+        return sections
+    def parse_content(self, response):
+        """Extract one page, upload it, and yield a summary item plus follow-up requests.
+
+        The title comes from ``response.meta['page_title']``; when that is
+        missing or empty the page's ``<h1>`` is used, and ``'Unknown Page'`` is
+        the last resort. Leadership ("pimpinan") pages on pnp.ac.id get the
+        structured extraction; all other pages get the general extraction plus
+        a table dump.
+
+        Yields:
+            A dict describing the uploaded page, followed by any requests
+            produced by ``process_additional_links``.
+        """
+        page_title = response.meta.get('page_title', 'Unknown Page')
+        menu_path = response.meta.get('menu_path', '')
+        if not page_title or page_title == 'Unknown Page':
+            # An empty menu label would otherwise produce an empty title and filename.
+            page_title = self.clean_text(response.css('h1.entry-title::text, h1.page-title::text').get('')) or 'Unknown Page'
+        self.logger.info(f"Extracting content from: {response.url} ({page_title})")
+        # Evaluate once; the same test drives extraction, the table step and the item.
+        is_leadership_page = (
+            ("pimpinan" in response.url.lower() or "pimpinan" in page_title.lower())
+            and "pnp.ac.id" in response.url
+        )
+        paragraphs = []
+        # Special case: PNP leadership page
+        if is_leadership_page:
+            self.logger.info("Detected leadership page - using special extraction")
+            leaders_data = self.extract_leadership_info(response)
+            self.logger.info(f"Found {len(leaders_data)} leadership entries")
+            if leaders_data:
+                paragraphs = self.format_leadership_content(leaders_data)
+                # Also extract any additional content from the page
+                additional_content = self.extract_general_content(response)
+                if additional_content:
+                    paragraphs.extend(["## Informasi Tambahan"] + additional_content)
+            else:
+                # Fallback to general content extraction
+                self.logger.warning("Leadership extraction failed, falling back to general extraction")
+                paragraphs = self.extract_general_content(response)
+        else:
+            # Normal content extraction
+            paragraphs = self.extract_general_content(response)
+        # Create final content
+        content_text = self.create_final_content(page_title, response.url, paragraphs)
+        # Add table data if any (but skip for leadership pages to avoid duplication)
+        if not is_leadership_page:
+            table_content = self.extract_table_data(response)
+            if table_content:
+                content_text += "\n\n## Data Tabel\n\n" + table_content
+        # Upload to Supabase
+        filename = self.upload_content(page_title, content_text)
+        yield {
+            'url': response.url,
+            'title': page_title,
+            'menu_path': menu_path,
+            'uploaded_as': filename,
+            'timestamp': datetime.now().isoformat(),
+            'content_length': len(content_text),
+            'leadership_page': is_leadership_page
+        }
+        # process_additional_links is a generator: calling it without iterating
+        # it does nothing, so its requests must be re-yielded from here.
+        yield from self.process_additional_links(response, menu_path)
+    def extract_general_content(self, response):
+        """Return formatted paragraphs taken from the page's main content.
+
+        The first content container that yields at least one text element of
+        five or more words is used. If none does, the whole ``<body>`` text is
+        split into chunks of at most 50 words. Only entries of ten or more
+        words are kept, each passed through ``format_paragraph``.
+        """
+        collected = []
+        content_selectors = [
             'div.entry-content', 'article.post', 'main.site-main',
             'div.content', 'div.main-content', 'div#content', 'div.page-content'
         ]
+        for selector in content_selectors:
+            area = response.css(selector)
+            if not area:
+                continue
+            for node in area.css('p, h1, h2, h3, h4, h5, h6, li, div.wp-block-group'):
+                text = self.clean_text(' '.join(node.css('*::text').getall()))
+                if not text or len(text.split()) < 5:
+                    continue
+                # Append every non-anchor link found inside the element.
+                for href in node.css('a::attr(href)').getall():
+                    if href and not href.startswith('#'):
+                        text += f" (Link: {response.urljoin(href)})"
+                collected.append(text)
+            if collected:
+                break
+        # Fallback: extract from body
+        if not collected:
+            whole_text = self.clean_text(' '.join(response.css('body *::text').getall()))
+            if whole_text:
+                # Greedily pack sentences into chunks of at most 50 words.
+                chunk = ""
+                for sentence in re.split(r'(?<=[.!?])\s+', whole_text):
+                    candidate = chunk + " " + sentence
+                    if len(candidate.split()) <= 50:
+                        chunk = candidate
+                    else:
+                        if chunk.strip():
+                            collected.append(chunk.strip())
+                        chunk = sentence
+                if chunk.strip():
+                    collected.append(chunk.strip())
+        # Format paragraphs
+        return [self.format_paragraph(entry) for entry in collected if len(entry.split()) >= 10]
+    def extract_table_data(self, response):
+        """Render every non-empty table on the page as pipe-separated text.
+
+        Empty cells and empty rows are dropped; a cell's first link, if any,
+        is appended to its text. Tables are numbered by their position on the
+        page, so numbering may skip tables that produced no rows.
+        """
+        rendered_tables = []
+        for position, table in enumerate(response.css('table')):
+            lines = []
+            for tr in table.css('tr'):
+                cell_values = []
+                for cell in tr.css('th, td'):
+                    value = self.clean_text(' '.join(cell.css('*::text').getall()))
+                    first_href = cell.css('a::attr(href)').get()
+                    if first_href:
+                        value += f" (Link: {response.urljoin(first_href)})"
+                    if value:
+                        cell_values.append(value)
+                if cell_values:
+                    lines.append(" | ".join(cell_values))
+            if lines:
+                rendered_tables.append(f"### Tabel {position + 1}\n\n" + "\n".join(lines))
+        return "\n\n".join(rendered_tables)
+    def create_final_content(self, page_title, url, paragraphs):
+        """Assemble the document: title line, today's date, URL, then the paragraphs one per line."""
+        document_lines = [
+            f"# {page_title}",
+            f"**Tanggal**: {datetime.now().strftime('%d %B %Y')}",
+            f"**URL**: {url}",
+            "\n".join(paragraphs),
+        ]
+        return "\n".join(document_lines)
+    def upload_content(self, page_title, content_text):
+        """Upload ``content_text`` via ``upload_if_changed`` under a slugified, timestamped name.
+
+        Returns:
+            The filename when uploaded, ``skipped_<filename>`` when the helper
+            reports a skip, and ``failed_<filename>`` on any other result or
+            on an exception.
+        """
+        # Slugify: drop punctuation, then collapse runs of dashes/whitespace into one dash.
+        slug = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
+        slug = re.sub(r'[-\s]+', '-', slug)
+        filename = f"{slug}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
         try:
+            outcome = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
+            status = outcome.get('result')
+            if status == 'uploaded':
+                self.logger.info(f"Uploaded {filename} successfully.")
+                return filename
+            if status == 'skipped':
+                self.logger.info(f"Skipped upload for {filename} (content unchanged)")
+                return f"skipped_{filename}"
+            self.logger.error(f"Upload error for {filename}: {outcome.get('error')}")
+            return f"failed_{filename}"
         except Exception as e:
+            self.logger.error(f"Upload error for {filename}: {str(e)}")
             return f"failed_{filename}"
+    def process_additional_links(self, response, menu_path):
+        """Yield requests for header/navigation links that stay on the current host.
+
+        This is a generator; the caller must iterate it (``yield from``) for
+        the requests to be scheduled.
+
+        Args:
+            response: The page whose header links are inspected.
+            menu_path: Menu trail of the current page; " > Header" is appended
+                for the follow-up requests.
+        """
+        from urllib.parse import urlparse  # local import keeps this block self-contained
+
+        # Parse the host properly instead of splitting on '//' (which raises
+        # IndexError on a URL without a scheme separator).
+        current_domain = urlparse(response.url).netloc.lower()
+        # NOTE(review): links are only followed when the host is NOT under
+        # pnp.ac.id, as in the original condition -- confirm this is intended.
+        if not current_domain or 'pnp.ac.id' in current_domain:
+            return
+        header_links = []
+        for sel in ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']:
+            header_links.extend(response.css(sel).getall())
+        candidates = {link for link in header_links if link and not link.startswith(('#', 'javascript:'))}
+        for link in candidates:
+            full_link = response.urljoin(link)
+            # Compare hosts exactly; a substring test on the whole URL would
+            # also match the host name appearing in another site's path or query.
+            if urlparse(full_link).netloc.lower() == current_domain:
+                yield scrapy.Request(
+                    url=full_link,
+                    callback=self.parse_content,
+                    meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
+                )
+if __name__ == '__main__':
+    # Script entry point: run PNPContentSpider in-process.
+    # NOTE(review): several keys below are also set in the spider's
+    # custom_settings with different values -- confirm which set takes effect.
+    process_settings = {
+        'USER_AGENT': 'PNPBot/1.0',
+        'DOWNLOAD_DELAY': 2,
+        'ROBOTSTXT_OBEY': True,
+        'LOG_LEVEL': 'INFO',
+        'CONCURRENT_REQUESTS': 1,
+        'DOWNLOAD_TIMEOUT': 100,
+        'RETRY_TIMES': 3,
+        'HTTPCACHE_ENABLED': False,
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    }
+    crawler_process = CrawlerProcess(process_settings)
+    crawler_process.crawl(PNPContentSpider)
+    crawler_process.start()

scrapping/utils/crawl4ai_utils.py DELETED Viewed

@@ -1,168 +0,0 @@
-import asyncio
-from typing import Optional, List, Dict, Set
-from urllib.parse import urlparse, urljoin
-from bs4 import BeautifulSoup
-try:
-    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-except Exception as e:
-    AsyncWebCrawler = None  # type: ignore
-    BrowserConfig = None  # type: ignore
-    CrawlerRunConfig = None  # type: ignore
-    CacheMode = None  # type: ignore
-class Crawl4AIUnavailable(Exception):
-    pass
-async def fetch_html(url: str, timeout: int = 30, headless: bool = True) -> str:
-    """Fetch rendered HTML using Crawl4AI. Raises Crawl4AIUnavailable if not installed."""
-    if AsyncWebCrawler is None:
-        raise Crawl4AIUnavailable(
-            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
-        )
-    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
-    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
-    async with AsyncWebCrawler(config=browser_conf) as crawler:
-        result = await crawler.arun(url=url, config=run_conf)
-        # Prefer original HTML when available; fallback to markdown->html isn't provided, so use result.html
-        html = getattr(result, "html", None)
-        if not html:
-            # Some versions expose "content" or only markdown. Fallback to markdown as plain text if needed.
-            html = getattr(result, "content", None) or getattr(result, "markdown", "")
-        return html
-def fetch_html_sync(url: str, timeout: int = 30, headless: bool = True) -> str:
-    """Synchronous wrapper for fetch_html."""
-    return asyncio.run(fetch_html(url, timeout=timeout, headless=headless))
-# ---------------- Parallel in-domain crawling helpers ---------------- #
-# Substrings that mark a URL as worth crawling; _is_important() checks whether
-# any of them occurs in the lowercased URL.
-IMPORTANT_KEYWORDS = [
-    # Indonesian (Bahasa Indonesia) terms
-    "profil", "tentang", "visi", "misi", "struktur", "pimpinan",
-    "akademik", "kurikulum", "dosen", "staf", "jadwal", "kalender",
-    "pengumuman", "berita", "pengabdian", "penelitian", "organisasi",
-    "program-studi", "prodi", "sarjana", "diploma", "magister",
-    # English fallbacks
-    "about", "profile", "leadership", "faculty", "staff", "schedule",
-    "announcement", "news", "curriculum", "study-program"
-]
-def _same_domain(url: str, base_netloc: str) -> bool:
-    try:
-        return urlparse(url).netloc.endswith(base_netloc)
-    except Exception:
-        return False
-def _discover_links(base_url: str, html: str) -> List[str]:
-    soup = BeautifulSoup(html or "", "html.parser")
-    links: List[str] = []
-    for a in soup.find_all("a", href=True):
-        href = a["href"].strip()
-        if href.startswith("#") or href.lower().startswith("javascript:"):
-            continue
-        abs_url = urljoin(base_url, href)
-        links.append(abs_url)
-    return links
-def _is_important(url: str) -> bool:
-    lu = url.lower()
-    return any(k in lu for k in IMPORTANT_KEYWORDS)
-async def crawl_domain_parallel(
-    seed_url: str,
-    max_pages: int = 20,
-    max_concurrency: int = 5,
-    only_important: bool = True,
-    timeout: int = 30,
-    headless: bool = True,
-) -> Dict[str, str]:
-    """
-    Crawl pages in the same domain as seed_url in parallel using a single AsyncWebCrawler session.
-    Returns {url: html} for fetched pages. If only_important=True, limits to URLs containing important keywords.
-    """
-    if AsyncWebCrawler is None:
-        raise Crawl4AIUnavailable(
-            "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
-        )
-    parsed = urlparse(seed_url)
-    base_netloc = parsed.netloc
-    browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
-    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
-    results: Dict[str, str] = {}
-    visited: Set[str] = set()
-    frontier: List[str] = [seed_url]
-    sem = asyncio.Semaphore(max_concurrency)
-    async with AsyncWebCrawler(config=browser_conf) as crawler:
-        async def fetch_one(url: str):
-            async with sem:
-                try:
-                    res = await crawler.arun(url=url, config=run_conf)
-                    html = getattr(res, "html", None) or getattr(res, "content", None) or getattr(res, "markdown", "")
-                    results[url] = html or ""
-                    return html or ""
-                except Exception:
-                    results[url] = ""
-                    return ""
-        while frontier and len(visited) < max_pages:
-            batch: List[str] = []
-            # Build a batch from frontier
-            while frontier and len(batch) < max_concurrency and len(visited) + len(batch) < max_pages:
-                u = frontier.pop(0)
-                if u in visited:
-                    continue
-                if not _same_domain(u, base_netloc):
-                    continue
-                if only_important and not _is_important(u) and u != seed_url:
-                    continue
-                visited.add(u)
-                batch.append(u)
-            if not batch:
-                break
-            pages = await asyncio.gather(*(fetch_one(u) for u in batch))
-            # Discover more links from fetched pages
-            for u, html in zip(batch, pages):
-                if not html:
-                    continue
-                for link in _discover_links(u, html):
-                    if link not in visited and _same_domain(link, base_netloc):
-                        frontier.append(link)
-    return results
-def crawl_domain_parallel_sync(
-    seed_url: str,
-    max_pages: int = 20,
-    max_concurrency: int = 5,
-    only_important: bool = True,
-    timeout: int = 30,
-    headless: bool = True,
-) -> Dict[str, str]:
-    """Sync wrapper around crawl_domain_parallel."""
-    return asyncio.run(
-        crawl_domain_parallel(
-            seed_url=seed_url,
-            max_pages=max_pages,
-            max_concurrency=max_concurrency,
-            only_important=only_important,
-            timeout=timeout,
-            headless=headless,
-        )
-    )