FauziIsyrinApridal committed on
Commit
4f5cf5c
·
1 Parent(s): 39cd32c
requirements.txt CHANGED
@@ -2,6 +2,4 @@ scrapy
2
  supabase
3
  python-dotenv
4
  requests
5
- beautifulsoup4
6
- crawl4ai
7
- playwright
 
2
  supabase
3
  python-dotenv
4
  requests
5
+ beautifulsoup4
 
 
scrapping/dosen_scrap.py CHANGED
@@ -1,160 +1,325 @@
 
 
1
  from datetime import datetime
2
  import re
3
  from supabase import create_client
4
  import os
5
- from typing import List, Dict
6
- from bs4 import BeautifulSoup
7
 
8
- # Parallel Crawl4AI helpers
9
- try:
10
- from utils.crawl4ai_utils import crawl_domain_parallel_sync
11
- except Exception:
12
- import sys as _sys
13
- import os as _os
14
- _sys.path.append(_os.path.join(_os.path.dirname(__file__), 'utils'))
15
- from crawl4ai_utils import crawl_domain_parallel_sync
16
-
17
- # Dedup upload utility
18
  try:
19
  from utils.supabase_utils import upload_if_changed
20
  except Exception:
21
- import sys as _sys2
22
- import os as _os2
23
- _sys2.path.append(_os2.path.join(_os2.path.dirname(__file__), 'utils'))
24
  from supabase_utils import upload_if_changed
25
 
26
 
27
- SEED_URL = 'https://sipeg.pnp.ac.id/'
28
-
29
-
30
- def _infer_staff_type_from_context(page_url: str, headers: List[str]) -> str:
31
- u = (page_url or '').lower()
32
- h = ' '.join(headers).lower()
33
- if any(k in u for k in ['administrasi', 'tata-usaha', 'pegawai']) or any(k in h for k in ['administrasi', 'tata usaha', 'pegawai']):
34
- return 'staff_administrasi'
35
- if any(k in u for k in ['teknisi', 'lab', 'laboratorium']) or any(k in h for k in ['teknisi', 'laboratorium', 'lab']):
36
- return 'staff_teknisi'
37
- return 'staff_pengajar'
38
-
39
-
40
- def parse_tables(html: str, page_url: str = '') -> Dict[str, List[Dict[str, str]]]:
41
- soup = BeautifulSoup(html or '', 'html.parser')
42
- data: Dict[str, List[Dict[str, str]]] = {
43
- 'jabatan': [],
44
- 'staff_pengajar': [],
45
- 'staff_administrasi': [],
46
- 'staff_teknisi': [],
47
  }
48
- for table in soup.select('table, .table, .table-bordered, .table-landscape'):
49
- headers = [th.get_text(' ', strip=True) for th in table.select('th')]
50
- rows = table.select('tr')
51
- if not rows:
52
- continue
53
- # Officials table
54
- if any('Jabatan' in h for h in headers) and any('Pejabat' in h for h in headers):
55
- for tr in rows:
56
- tds = tr.select('td')
57
- if len(tds) >= 3:
58
- number = tds[0].get_text(' ', strip=True)
59
- position = tds[1].get_text(' ', strip=True)
60
- official = tds[2].get_text(' ', strip=True)
61
- if position or official:
62
- data['jabatan'].append({
63
- 'nomor': number,
64
- 'jabatan': position,
65
- 'pejabat': official,
66
- })
67
- # Staff tables
68
- if any('Nama' in h for h in headers) and any('NIP' in h for h in headers):
69
- # Infer staff type using URL or headers
70
- staff_type = _infer_staff_type_from_context(page_url, headers)
71
- for tr in rows[1:]:
72
- tds = tr.select('td')
73
- if len(tds) >= 3:
74
- nomor = tds[0].get_text(' ', strip=True)
75
- nama = tds[1].get_text(' ', strip=True)
76
- nip = tds[2].get_text(' ', strip=True)
77
- jur = tds[3].get_text(' ', strip=True) if len(tds) > 3 else ''
78
- if nama or nip:
79
- data[staff_type].append({
80
- 'nomor': nomor,
81
- 'nama': nama,
82
- 'nip': nip,
83
- 'jurusan': jur,
84
- })
85
- return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
 
 
 
 
87
 
88
- def merge_collections(all_pages: Dict[str, str]) -> Dict[str, List[Dict[str, str]]]:
89
- merged: Dict[str, List[Dict[str, str]]] = {
90
- 'jabatan': [],
91
- 'staff_pengajar': [],
92
- 'staff_administrasi': [],
93
- 'staff_teknisi': [],
94
- }
95
- for url, html in all_pages.items():
96
- if not html:
97
- continue
98
- parsed = parse_tables(html, page_url=url)
99
- for k, v in parsed.items():
100
- merged[k].extend(v)
101
- return merged
102
 
 
 
 
 
103
 
104
- def build_text(collected: Dict[str, List[Dict[str, str]]]) -> str:
105
- lines: List[str] = []
106
- lines.append('# Data Dosen dan Staff PNP\n')
107
- lines.append(f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
108
- sections = [
109
- ('jabatan', 'Daftar Jabatan Struktural'),
110
- ('staff_pengajar', 'Daftar Dosen dan Pengajar'),
111
- ('staff_administrasi', 'Daftar Staff Administrasi'),
112
- ('staff_teknisi', 'Daftar Staff Teknisi'),
113
- ]
114
- for key, title in sections:
115
- items = collected.get(key, [])
116
- if not items:
117
- continue
118
- lines.append(f"# {title}\n")
119
- lines.append(f"Jumlah data: {len(items)}\n\n")
120
- for it in items:
121
- if key == 'jabatan':
122
- paragraph = f"{it.get('pejabat','')} menjabat sebagai {it.get('jabatan','')}"
123
- else:
124
- paragraph = f"{it.get('nama','')} adalah staf dengan NIP {it.get('nip','')}"
125
- if it.get('jurusan'):
126
- paragraph += f" dan bertugas di {it['jurusan']}"
127
- lines.append(paragraph.strip() + "\n")
128
- lines.append("\n")
129
- return ''.join(lines)
130
 
 
131
 
132
  if __name__ == '__main__':
133
- supabase = create_client(
134
- os.environ.get('NEXT_PUBLIC_SUPABASE_URL'),
135
- os.environ.get('SUPABASE_SERVICE_KEY'),
136
- )
137
- bucket = os.environ.get('NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET', 'pnp-bot-storage')
138
-
139
- pages = crawl_domain_parallel_sync(
140
- seed_url=SEED_URL,
141
- max_pages=40,
142
- max_concurrency=6,
143
- only_important=True,
144
- timeout=30,
145
- headless=True,
146
- )
147
- collected = merge_collections(pages)
148
- text = build_text(collected)
149
- ts = datetime.now().strftime('%Y%m%d_%H%M')
150
- filename = f"data_dosen_{ts}.txt"
151
- try:
152
- result = upload_if_changed(supabase, bucket, filename, text)
153
- if result.get('result') == 'uploaded':
154
- print(f"✅ Uploaded {filename}")
155
- elif result.get('result') == 'skipped':
156
- print(f"⏭️ Skipped (unchanged) {filename}")
157
- else:
158
- print(f"❌ Upload error: {result.get('error')}")
159
- except Exception as e:
160
- print(f"❌ Error uploading: {e}")
 
1
+ import scrapy
2
+ from scrapy.crawler import CrawlerProcess
3
  from datetime import datetime
4
  import re
5
  from supabase import create_client
6
  import os
7
+ import sys
 
8
 
9
+ # Try import shared dedup upload utility
 
 
 
 
 
 
 
 
 
10
  try:
11
  from utils.supabase_utils import upload_if_changed
12
  except Exception:
13
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
 
 
14
  from supabase_utils import upload_if_changed
15
 
16
 
17
+ class DosenSpider(scrapy.Spider):
18
+ name = 'dosen_spider'
19
+ start_urls = ['https://sipeg.pnp.ac.id/']
20
+
21
+ custom_settings = {
22
+ 'DOWNLOAD_DELAY': 1,
23
+ 'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
24
+ 'ROBOTSTXT_OBEY': True,
25
+ 'LOG_LEVEL': 'INFO',
26
+ 'CONCURRENT_REQUESTS': 1,
27
+ 'HTTPCACHE_ENABLED': False,
28
+ 'RETRY_TIMES': 3
 
 
 
 
 
 
 
 
29
  }
30
+
31
def __init__(self, *args, **kwargs):
    """Set up the spider: Supabase client, target bucket and result list.

    Connection settings are read from the environment variables
    ``NEXT_PUBLIC_SUPABASE_URL``, ``SUPABASE_SERVICE_KEY`` and
    ``NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET``. All positional and keyword
    arguments are forwarded unchanged to the parent initializer.
    """
    super(DosenSpider, self).__init__(*args, **kwargs)

    env = os.environ
    supabase_url = env.get("NEXT_PUBLIC_SUPABASE_URL")
    service_key = env.get("SUPABASE_SERVICE_KEY")
    self.supabase = create_client(supabase_url, service_key)

    # Bucket name has no fallback: it is None when the variable is unset.
    self.storage_bucket = env.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")

    # Every item yielded by the extract_* methods is also appended here.
    self.collected_data = []
40
+
41
def parse(self, response):
    """Schedule a ``parse_page`` request for every menu and sub-menu link.

    Each ``li.level1`` entry is handled in document order: its own link is
    requested first, followed by the links of its ``li.level2`` children.
    The text of the entry's ``span.bg`` is forwarded as ``page_title`` and
    every request starts with ``page_number`` 1. Entries without a link are
    skipped.
    """

    def build_request(menu_node):
        """Return a request for *menu_node*'s first link, or None if it has none."""
        caption = menu_node.css('span.bg::text').get('').strip()
        href = menu_node.css('a::attr(href)').get()
        if not href:
            return None
        return scrapy.Request(
            url=response.urljoin(href),
            callback=self.parse_page,
            meta={'page_title': caption, 'page_number': 1}
        )

    for top_entry in response.css('li.level1'):
        # Main entry first, then its sub-menu entries.
        for node in (top_entry, *top_entry.css('li.level2')):
            request = build_request(node)
            if request is not None:
                yield request
74
+
75
def parse_page(self, response):
    """Extract table rows from one listing page, then follow its pagination.

    Reads ``page_title`` and ``page_number`` from ``response.meta``. Pages
    whose text contains one of the "belum tersedia" messages are skipped.
    Tables with ``Jabatan``/``Pejabat`` headers go to
    ``extract_officials_table``; tables with ``Nama``/``NIP`` headers go to
    ``extract_staff_table``. Every extracted item is yielded.

    Pagination rules:
      * A "Next" link (or one of the generic pagination links) found in the
        page is always followed.
      * A next URL is *guessed* (``?p=<offset>``, ``page=``, ``halaman=``,
        ``page/``) only when this page actually produced rows, so the crawl
        stops instead of requesting ever-growing offsets on empty pages.

    Yields:
        dict items from the extract_* helpers and, at most, one
        ``scrapy.Request`` for the following page.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import parse_qs, urlsplit

    def offset_of(url):
        """Return the integer ``p`` query parameter of *url*, or None."""
        values = parse_qs(urlsplit(url).query).get('p')
        if not values:
            return None
        try:
            return int(values[0])
        except ValueError:
            return None

    page_title = response.meta.get('page_title', '')
    page_number = response.meta.get('page_number', 1)

    # Check for the "data not available yet" messages.
    page_text = ' '.join(response.css('body ::text').getall()).lower()
    unavailable_messages = (
        'data staf pengajar belum tersedia',
        'data staf administrasi belum tersedia',
        'data staf teknisi belum tersedia',
    )
    if any(msg in page_text for msg in unavailable_messages):
        self.logger.info(f"Data tidak tersedia pada halaman: {response.url}")
        return

    # The staff category is derived from the page title only.
    staff_type = self.determine_simple_staff_type(page_title)

    rows_found = 0
    tables = response.css('table.table-landscape, table.table, table.table-bordered')
    if tables:
        for table in tables:
            # The header cells decide which extractor handles the table.
            headers = [h.strip() for h in table.css('th::text').getall()]
            if 'Jabatan' in headers and 'Pejabat' in headers:
                extracted = self.extract_officials_table(table, page_title)
            elif 'Nama' in headers and 'NIP' in headers:
                extracted = self.extract_staff_table(table, page_title, staff_type, page_number)
            else:
                continue
            for item in extracted:
                rows_found += 1
                yield item
    else:
        self.logger.info(f"No tables found on page: {response.url}")

    current_url = response.url
    base_url = current_url.split('?')[0]
    current_p = offset_of(current_url) or 0

    # Page size used to build/interpret the ``p`` offset; 0 = not paginated.
    if staff_type == 'staff_pengajar':
        items_per_page = 30
    elif staff_type in ('staff_administrasi', 'staff_teknisi'):
        items_per_page = 25
    else:
        items_per_page = 0

    next_page = None
    next_link = response.xpath('//span[@class="table-link"]/a[contains(text(), "Next")]/@href').get()
    if next_link:
        next_page = response.urljoin(next_link)
    elif items_per_page > 0 and current_p >= 0 and rows_found > 0:
        # No explicit link: advance the offset, but only past a page that had data.
        next_page = f"{base_url}?p={current_p + items_per_page}"
        self.logger.info(f"Constructed next page URL with p parameter: {next_page}")

    # Fall back to other pagination links present in the markup.
    if not next_page:
        pagination_xpath_patterns = [
            '//ul[contains(@class, "pagination")]/li/a[contains(text(), "Next")]/@href',
            '//ul[contains(@class, "pagination")]/li/a[contains(text(), "»")]/@href',
            f'//ul[contains(@class, "pagination")]/li/a[contains(text(), "{page_number + 1}")]/@href',
            '//a[@class="next page-numbers"]/@href',
        ]
        for xpath in pagination_xpath_patterns:
            next_page_link = response.xpath(xpath).get()
            if next_page_link:
                next_page = response.urljoin(next_page_link)
                self.logger.info(f"Found next page link using XPath: {next_page}")
                break

    # Last resort: bump a page counter found in the URL. Guarded by
    # rows_found so an empty page never triggers another guess. The ``p=``
    # offset case is already covered above whenever a page size is known.
    if not next_page and rows_found > 0:
        if 'page=' in current_url:
            next_page = current_url.replace(f'page={page_number}', f'page={page_number + 1}')
        elif 'halaman=' in current_url:
            next_page = current_url.replace(f'halaman={page_number}', f'halaman={page_number + 1}')
        elif 'page/' in current_url:
            next_page = current_url.replace(f'page/{page_number}', f'page/{page_number + 1}')

    # A replacement that changed nothing would only re-request this page.
    if next_page and next_page != current_url:
        next_page_number = page_number + 1
        next_offset = offset_of(next_page)
        if next_offset is not None and items_per_page > 0:
            # Derive the page number from the offset; skipped when the page
            # size is unknown to avoid dividing by zero.
            next_page_number = (next_offset // items_per_page) + 1

        self.logger.info(f"Following to next page: {next_page} (Page {next_page_number})")
        yield scrapy.Request(
            url=next_page,
            callback=self.parse_page,
            meta={'page_title': page_title, 'page_number': next_page_number}
        )
184
+
185
def determine_simple_staff_type(self, page_title):
    """Classify a staff category from keywords found in the page title.

    The title is lower-cased and checked for substrings, category by
    category, in this order: teaching staff, administrative staff,
    technical staff. The first category with a matching keyword wins;
    ``'staff_lainnya'`` is returned when nothing matches.
    """
    keyword_table = (
        ('staff_pengajar', ('dosen', 'pengajar', 'akademik', 'jurusan')),
        ('staff_administrasi', ('administrasi', 'admin', 'tata usaha', 'pegawai')),
        ('staff_teknisi', ('teknisi', 'lab', 'teknik', 'laboratorium')),
    )

    lowered = page_title.lower()
    for category, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return category

    return 'staff_lainnya'
197
+
198
def extract_officials_table(self, table, page_title):
    """Yield one ``'jabatan'`` item per data row of an officials table.

    A data row needs at least three ``td`` cells: number, position and
    official. The term of office (``periode``) is recovered from a
    commented-out ``<td>`` in the row's HTML when one is present.

    Compared with a plain "first text node" read, this version:
      * takes the first *non-blank* text node of each cell, so leading
        whitespace nodes do not produce empty values;
      * skips rows that have neither a position nor an official, matching
        how the staff extractor skips empty rows;
      * lets the period pattern span several lines and collapses its
        whitespace.

    Args:
        table: selector for the ``<table>`` element.
        page_title: title of the page the table came from (stored as
            ``halaman``).

    Yields:
        dict with keys ``halaman``, ``tipe``, ``nomor``, ``jabatan``,
        ``pejabat`` and ``periode``. Each item is also appended to
        ``self.collected_data``.
    """

    def first_text(cell):
        """Return the first non-blank text node selected by ``::text``, stripped."""
        for fragment in cell.css('::text').getall():
            cleaned = fragment.strip()
            if cleaned:
                return cleaned
        return ''

    # DOTALL: the commented-out cell may be spread over several lines.
    period_pattern = re.compile(r'<!--\s*<td[^>]*>(.*?)</td>\s*-->', re.DOTALL)

    for row in table.css('tr'):
        cells = row.css('td')
        if len(cells) < 3:
            # Header rows and malformed rows carry no data.
            continue

        period_match = period_pattern.search(row.get())
        period = ' '.join(period_match.group(1).split()) if period_match else ""

        number = first_text(cells[0])
        position = first_text(cells[1])
        official = first_text(cells[2])

        if not position and not official:
            continue

        item = {
            'halaman': page_title,
            'tipe': 'jabatan',
            'nomor': number,
            'jabatan': position,
            'pejabat': official,
            'periode': period
        }
        self.collected_data.append(item)
        yield item
225
+
226
def extract_staff_table(self, table, page_title, staff_type, page_number):
    """Yield one staff item per data row of a ``Nama``/``NIP`` table.

    The first ``tr`` is treated as the header and skipped. A data row needs
    at least three ``td`` cells (number, name, NIP); a fourth cell, when
    present, is the department. The name is taken from the link text inside
    the name cell when available, otherwise from the cell text, and the
    link's ``href`` is kept as ``detail``.

    Each value is the first *non-blank* text node of its cell, so leading
    whitespace nodes do not produce empty values. ``detail`` is always
    bound (``None`` when the name cell has no link). Rows with neither a
    name nor a NIP are skipped.

    Args:
        table: selector for the ``<table>`` element.
        page_title: title of the source page (stored as ``halaman``).
        staff_type: category label stored as ``tipe``.
        page_number: pagination index stored as ``halaman_ke``.

    Yields:
        dict with keys ``halaman``, ``tipe``, ``halaman_ke``, ``nomor``,
        ``nama``, ``nip``, ``jurusan`` and ``detail``. Each item is also
        appended to ``self.collected_data``.
    """

    def first_text(selection):
        """Return the first non-blank string in *selection*, stripped."""
        for fragment in selection.getall():
            cleaned = fragment.strip()
            if cleaned:
                return cleaned
        return ''

    # rows[0] is the header row.
    for row in table.css('tr')[1:]:
        cells = row.css('td')
        if len(cells) < 3:
            continue

        number = first_text(cells[0].css('::text'))

        name_cell = cells[1]
        # Prefer the link text; fall back to any text in the cell.
        name = first_text(name_cell.css('a::text')) or first_text(name_cell.css('::text'))
        detail_url = name_cell.css('a::attr(href)').get()

        nip = first_text(cells[2].css('::text'))
        department = first_text(cells[3].css('::text')) if len(cells) > 3 else ""

        if not name and not nip:
            continue

        item = {
            'halaman': page_title,
            'tipe': staff_type,
            'halaman_ke': page_number,
            'nomor': number,
            'nama': name,
            'nip': nip,
            'jurusan': department,
            'detail': detail_url
        }
        self.collected_data.append(item)
        yield item
263
+
264
def closed(self, reason):
    """Format the collected rows and upload them when the spider closes.

    When nothing was collected the upload is skipped and a warning is
    logged, so a failed or empty crawl does not publish a header-only
    document. Otherwise the text from ``generate_text_output`` is passed to
    ``upload_if_changed`` under the name ``data_dosen_<timestamp>.txt`` and
    the outcome is logged according to the returned dict's ``'result'``
    value (``'uploaded'``, ``'skipped'`` or anything else = failure).

    Args:
        reason: close reason supplied by the caller; only used in the
            warning logged for an empty crawl.
    """
    if not self.collected_data:
        self.logger.warning(f"No data collected (close reason: {reason}); skipping upload")
        return

    text_content = self.generate_text_output()

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"data_dosen_{timestamp}.txt"

    # Best effort: an upload problem is logged and never raised to the caller.
    try:
        result = upload_if_changed(self.supabase, self.storage_bucket, filename, text_content)
        status = result.get('result')
        if status == 'uploaded':
            self.logger.info(f"Successfully uploaded {filename} to Supabase storage")
        elif status == 'skipped':
            self.logger.info(f"Skipped upload for {filename} (content unchanged)")
        else:
            self.logger.error(f"Failed to upload {filename} to Supabase: {result.get('error')}")
    except Exception as e:
        self.logger.error(f"Error uploading to Supabase: {str(e)}")
283
+
284
def generate_text_output(self):
    """Render ``self.collected_data`` as one plain-text document.

    Items are grouped by their ``'tipe'`` value (``'lainnya'`` when
    missing). Sections are written in a fixed order: the known categories
    in the order listed in ``section_titles``, then any other category in
    first-seen order. A fixed order keeps the document layout independent
    of the order in which pages happened to be crawled.

    Each section has a ``# <title>`` line, a ``Jumlah data`` count and one
    paragraph per item.

    Returns:
        The complete document as a single string.
    """
    section_titles = {
        'jabatan': 'Daftar Jabatan Struktural',
        'staff_pengajar': 'Daftar Dosen dan Pengajar',
        'staff_administrasi': 'Daftar Staff Administrasi',
        'staff_teknisi': 'Daftar Staff Teknisi',
        'staff_lainnya': 'Daftar Staff Lainnya'
    }

    grouped = {}
    for item in self.collected_data:
        grouped.setdefault(item.get('tipe', 'lainnya'), []).append(item)

    # Known sections first (fixed order), unknown ones afterwards.
    ordered_types = [tipe for tipe in section_titles if tipe in grouped]
    ordered_types.extend(tipe for tipe in grouped if tipe not in section_titles)

    # NOTE(review): the timestamp line makes every document differ from the
    # previous one; confirm whether upload_if_changed accounts for that.
    output = [
        "# Data Dosen dan Staff PNP\n",
        f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n",
    ]

    for tipe in ordered_types:
        items = grouped[tipe]
        title = section_titles.get(tipe, tipe.capitalize())
        output.append(f"# {title}\n")
        output.append(f"Jumlah data: {len(items)}\n\n")

        for item in items:
            if tipe == 'jabatan':
                paragraph = f"{item['pejabat']} menjabat sebagai {item['jabatan']}."
                if item.get('periode'):
                    paragraph += f" Masa jabatan berlangsung selama {item['periode']}."
            else:
                paragraph = f"{item['nama']} adalah staf dengan NIP {item['nip']}."
                if item.get('jurusan'):
                    paragraph += f" Ia bertugas di {item['jurusan']}."
                if item.get('detail'):
                    paragraph += f" Informasi lebih lengkap tersedia di {item['detail']}."
            output.append(paragraph + "\n\n")

    return ''.join(output)
321
 
322
if __name__ == '__main__':
    # Script entry point: run DosenSpider in a standalone crawler process.
    # start() is the last call, after the spider has been registered.
    crawler = CrawlerProcess()
    crawler.crawl(DosenSpider)
    crawler.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scrapping/jadwal_scrap.py CHANGED
@@ -1,241 +1,416 @@
 
 
1
  import os
2
  import re
3
  from datetime import datetime
4
  from supabase import create_client
5
  from io import StringIO
6
- from typing import Dict, List, Tuple
7
- from bs4 import BeautifulSoup
8
-
9
- # Crawl4AI helper for rendered fetching
10
- try:
11
- from utils.crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
12
- except Exception:
13
- import sys as _sys
14
- _sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
15
- from crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
16
-
17
- # Shared dedup upload utility
18
- try:
19
- from utils.supabase_utils import upload_if_changed
20
- except Exception:
21
- import sys as _sys2
22
- _sys2.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
23
- from supabase_utils import upload_if_changed
24
-
25
-
26
- # =====================
27
- # Standalone helpers for non-Scrapy execution below
28
- # =====================
29
-
30
- # Constants for targeted pages
31
- BASE_PRESENSI = 'https://presensi.pnp.ac.id/'
32
- ELEKTRO_URL = 'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
33
- EXCLUDED = ['elektronika', 'telkom', 'listrik']
34
-
35
- # Initialize Supabase for standalone run
36
- _SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
37
- _SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
38
- supabase = create_client(_SUPABASE_URL, _SUPABASE_KEY)
39
- bucket = os.environ.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")
40
-
41
- # Buffers for aggregated uploads keyed by jurusan_id
42
- file_buffers: Dict[str, StringIO] = {}
43
-
44
- def _init_buffer(jurusan_id: str, jurusan_name: str):
45
- if jurusan_id not in file_buffers:
46
- file_buffers[jurusan_id] = StringIO()
47
- buf = file_buffers[jurusan_id]
48
- today = datetime.now().strftime("%Y-%m-%d")
49
- buf.write(f"# Jadwal Perkuliahan {jurusan_name}\n\n")
50
- buf.write(f"**Jurusan:** {jurusan_name}\n")
51
- buf.write(f"**Tanggal Update:** {today}\n")
52
- buf.write(f"**Sumber:** Politeknik Negeri Padang\n\n")
53
- buf.write("---\n\n")
54
-
55
- def clean_text_list(nodes) -> List[str]:
56
- out: List[str] = []
57
- for n in nodes:
58
- try:
59
- txt = ' '.join(n.get_text(' ', strip=True).split())
60
- except Exception:
61
- txt = ''
62
- if txt:
63
- out.append(txt)
64
- return out
65
-
66
- def build_schedule_grid_bs(days: List[str], time_slots: List[str]):
67
- return {day: {t: 'kosong' for t in time_slots} for day in days}
68
-
69
- def write_schedule_to_buffer_bs(buffer: StringIO, schedule_grid: Dict[str, Dict[str, str]], days: List[str], time_slots: List[str]):
70
- for day in days:
71
- current_course = None
72
- current_times: List[str] = []
73
- day_schedule: List[str] = []
74
- for t in time_slots:
75
- course = schedule_grid[day][t]
76
- if course == current_course:
77
- current_times.append(t)
78
- else:
79
- if current_course and current_course.lower() != 'kosong':
80
- first_start = current_times[0].split('-')[0].strip()
81
- last_end = current_times[-1].split('-')[-1].strip()
82
- time_range = f"{first_start} - {last_end}" if len(current_times) > 1 else current_times[0]
83
- day_schedule.append(f"- {day} {time_range} | {current_course}")
84
- current_course = course
85
- current_times = [t]
86
- if current_course and current_course.lower() != 'kosong':
87
- first_start = current_times[0].split('-')[0].strip()
88
- last_end = current_times[-1].split('-')[-1].strip()
89
- time_range = f"{first_start} - {last_end}" if len(current_times) > 1 else current_times[0]
90
- day_schedule.append(f"- {day} {time_range} | {current_course}")
91
- for entry in day_schedule:
92
- buffer.write(entry + "\n")
93
- buffer.write("\n")
94
-
95
- def process_table(tbl, jurusan_id: str, jurusan_name: str, idx: int):
96
- _init_buffer(jurusan_id, jurusan_name)
97
- buf = file_buffers[jurusan_id]
98
- # Caption or fallback
99
- cap_tag = tbl.find('caption')
100
- caption_text = cap_tag.get_text(' ', strip=True) if cap_tag else f"Jadwal Kelas {idx + 1}"
101
- thead = tbl.find('thead')
102
- if thead:
103
- thead_text = ' '.join(thead.get_text(' ', strip=True).split())
104
- if thead_text:
105
- caption_text = f"{caption_text} {thead_text}"
106
- caption_text = re.sub(r'\s+', ' ', caption_text).strip()
107
- # Header lists
108
- days = clean_text_list(thead.select('th.xAxis')) if thead else []
109
- if not days and thead:
110
- days = clean_text_list(thead.select('th[class*="xAxis"]'))
111
- tbody = tbl.find('tbody')
112
- time_slots = clean_text_list(tbody.select('tr:not(.foot) th.yAxis')) if tbody else []
113
- if not time_slots and tbody:
114
- time_slots = clean_text_list(tbody.select('th[class*="yAxis"]'))
115
- if not days or not time_slots:
116
- return
117
- # Section header
118
- buf.write(f"## Jadwal Perkuliahan {caption_text}\n\n")
119
- buf.write("Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n")
120
- # Build grid and fill
121
- grid = build_schedule_grid_bs(days, time_slots)
122
- rows = tbody.select('tr:not(.foot)') if tbody else []
123
- active_rowspans: Dict[Tuple[int, int], Tuple[int, str]] = {}
124
- for row_idx, row in enumerate(rows):
125
- if row_idx >= len(time_slots):
126
- continue
127
- current_time = time_slots[row_idx]
128
- filled_cols = set()
129
- # apply rowspans
130
- to_remove = []
131
- for (rs_col, rs_start), (rs_left, content) in list(active_rowspans.items()):
132
- if rs_left > 0 and rs_col < len(days):
133
- grid[days[rs_col]][current_time] = content
134
- filled_cols.add(rs_col)
135
- active_rowspans[(rs_col, rs_start)] = (rs_left - 1, content)
136
- if rs_left - 1 <= 0:
137
- to_remove.append((rs_col, rs_start))
138
- for k in to_remove:
139
- del active_rowspans[k]
140
- # cells
141
- cells = row.select('td')
142
- col_idx = 0
143
- for cell in cells:
144
- while col_idx < len(days) and col_idx in filled_cols:
145
- col_idx += 1
146
- if col_idx >= len(days):
147
- break
148
- cell_text = ' '.join(cell.get_text(' ', strip=True).split())
149
- cell_text = 'kosong' if not cell_text or cell_text == '---' else cell_text
150
- rowspan = int(cell.get('rowspan', '1') or '1')
151
- colspan = int(cell.get('colspan', '1') or '1')
152
- for c in range(colspan):
153
- cur = col_idx + c
154
- if cur < len(days):
155
- grid[days[cur]][current_time] = cell_text
156
- if rowspan > 1:
157
- for c in range(colspan):
158
- active_rowspans[(col_idx + c, row_idx)] = (rowspan - 1, cell_text)
159
- col_idx += colspan
160
- write_schedule_to_buffer_bs(buf, grid, days, time_slots)
161
-
162
- def run_parallel():
163
- # 1) Special Elektro page (single target page)
164
- try:
165
- elektro_html = fetch_html_sync(ELEKTRO_URL)
166
- esoup = BeautifulSoup(elektro_html, 'html.parser')
167
- tables = esoup.select('table')
168
- if tables:
169
- jurusan_id = 'teknik_elektro'
170
- jurusan_name = 'Jurusan Teknik Elektro'
171
- for idx, tbl in enumerate(tables):
172
- process_table(tbl, jurusan_id, jurusan_name, idx)
173
- except Exception as e:
174
- print(f"[Jadwal] Error fetching Elektro page: {e}")
175
-
176
- # 2) Parallel crawl within presensi domain to discover pages and schedule tables
177
- try:
178
- crawled: Dict[str, str] = crawl_domain_parallel_sync(
179
- seed_url=BASE_PRESENSI,
180
- max_pages=40,
181
- max_concurrency=6,
182
- only_important=False, # we need to find 'groups_days_horizontal' links which may not match keywords
183
- timeout=40,
184
- headless=True,
185
- )
186
- for url, html in crawled.items():
187
- if not html:
188
- continue
189
- try:
190
- soup = BeautifulSoup(html, 'html.parser')
191
- # If this page itself is a groups_days_horizontal schedule page, parse tables directly
192
- if 'groups_days_horizontal' in url and 'subgroups_days_horizontal' not in url:
193
- title = soup.title.get_text(strip=True) if soup.title else 'Jadwal'
194
- jurusan_id = title.replace(' ', '_')
195
- jurusan_name = title
196
- for idx, tbl in enumerate(soup.select('table[id^="table_"], table')):
197
- process_table(tbl, jurusan_id=jurusan_id, jurusan_name=jurusan_name, idx=idx)
198
- continue
199
 
200
- # Otherwise, try to find the schedule link from this page
201
- g_link = None
202
- for a in soup.select('td a[href]'):
203
- href = a.get('href')
204
- if href and 'groups_days_horizontal' in href and 'subgroups_days_horizontal' not in href:
205
- g_link = href
206
- break
207
- if not g_link:
208
- continue
209
- g_url = g_link if g_link.startswith('http') else (BASE_PRESENSI + g_link.lstrip('/'))
210
- g_html = fetch_html_sync(g_url)
211
- gsoup = BeautifulSoup(g_html, 'html.parser')
212
- title = gsoup.title.get_text(strip=True) if gsoup.title else 'Jadwal'
213
- for idx, tbl in enumerate(gsoup.select('table[id^="table_"], table')):
214
- process_table(tbl, jurusan_id=title.replace(' ', '_'), jurusan_name=title, idx=idx)
215
- except Exception as inner:
216
- print(f"[Jadwal] Error processing crawled page {url}: {inner}")
217
- except Exception as e:
218
- print(f"[Jadwal] Error during parallel crawl: {e}")
219
-
220
- # 3) Upload all buffers with dedup
221
- ts = datetime.now().strftime("%Y%m%d_%H%M%S")
222
- for jurusan_id, buffer in list(file_buffers.items()):
223
- filename = f"{jurusan_id}_{ts}.txt"
224
- content = buffer.getvalue()
225
- try:
226
- result = upload_if_changed(supabase, bucket, filename, content)
227
- status = result.get('result')
228
- if status == 'uploaded':
 
 
 
 
 
229
  print(f"✅ Successfully uploaded {filename}")
230
- elif status == 'skipped':
231
  print(f"⏭️ Skipped upload for {filename} (content unchanged)")
232
  else:
233
  print(f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}")
234
- except Exception as e:
235
- print(f"❌ Error uploading {filename}: {e}")
236
- finally:
237
  buffer.close()
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  if __name__ == "__main__":
241
- run_parallel()
 
 
 
 
 
 
 
 
 
 
 
1
+ import scrapy
2
+ from scrapy.crawler import CrawlerProcess
3
  import os
4
  import re
5
  from datetime import datetime
6
  from supabase import create_client
7
  from io import StringIO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+
10
+
11
+ class PnpSpider(scrapy.Spider):
12
+ name = 'pnp_spider'
13
+ allowed_domains = ['presensi.pnp.ac.id', 'elektro.pnp.ac.id']
14
+ start_urls = [
15
+ 'https://presensi.pnp.ac.id/',
16
+ 'https://elektro.pnp.ac.id/jadwal-perkuliahan-jurusan-teknik-elektro/jadwal-perkuliahan-program-studi-teknik-listrik/'
17
+ ]
18
+
19
+ excluded_departments = ['elektronika', 'telkom', 'listrik']
20
+
21
def __init__(self, *args, **kwargs):
    """Set up the spider: Supabase client, bucket name and output buffers.

    Connection settings are read from the environment variables
    ``NEXT_PUBLIC_SUPABASE_URL``, ``SUPABASE_SERVICE_KEY`` and
    ``NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET``. All positional and keyword
    arguments are forwarded unchanged to the parent initializer.
    """
    super(PnpSpider, self).__init__(*args, **kwargs)

    env = os.environ
    self.supabase = create_client(
        env.get("NEXT_PUBLIC_SUPABASE_URL"),
        env.get("SUPABASE_SERVICE_KEY"),
    )
    self.storage_bucket = env.get("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET")

    # StringIO buffers holding the text to upload; uploaded in closed().
    self.file_buffers = {}
    # Date of this run, formatted YYYY-MM-DD.
    self.current_date = datetime.now().strftime("%Y-%m-%d")
32
+
33
def closed(self, reason):
    """Upload every buffered file when the spider closes, then close the buffers.

    Each entry of ``self.file_buffers`` is uploaded through
    ``upload_to_supabase`` as ``<jurusan_id>_<timestamp>.txt``; all files of
    one run share the same timestamp. Progress and the outcome of each
    upload are printed.

    Args:
        reason: close reason, printed for diagnostics.
    """
    print(f"Spider closing with reason: {reason}")
    print(f"Uploading {len(self.file_buffers)} files to Supabase...")

    run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    for jurusan_id, buffer in self.file_buffers.items():
        filename = f"{jurusan_id}_{run_stamp}.txt"
        content = buffer.getvalue()
        print(f"Uploading {filename} with content length: {len(content)}")

        result = self.upload_to_supabase(jurusan_id, filename, content)
        status = result.get('result')
        if status == 'uploaded':
            message = f"✅ Successfully uploaded {filename}"
        elif status == 'skipped':
            message = f"⏭️ Skipped upload for {filename} (content unchanged)"
        else:
            message = f"❌ Failed to upload {filename}: {result.get('error', 'unknown error')}"
        print(message)

        # The buffer is released whatever the upload outcome was.
        buffer.close()
49
 
50
def upload_to_supabase(self, jurusan_id, filename, content):
    """Upload *content* to the storage bucket unless it matches the latest file.

    The most recent existing file for *jurusan_id* (as reported by
    ``_get_latest_existing_filename``) is downloaded and compared with
    *content*. Identical content means nothing is uploaded. If the download
    or comparison fails, a warning is printed and the upload goes ahead.

    Args:
        jurusan_id: prefix identifying which existing files to compare with.
        filename: object path for the new upload.
        content: text to upload (encoded as UTF-8).

    Returns:
        ``{"result": "uploaded"}``, ``{"result": "skipped"}`` or
        ``{"result": "error", "error": <message>}``. Exceptions are never
        propagated.
    """
    try:
        previous_name = self._get_latest_existing_filename(jurusan_id)
        if previous_name:
            unchanged = False
            try:
                raw = self.supabase.storage.from_(self.storage_bucket).download(previous_name)
                if isinstance(raw, (bytes, bytearray)):
                    previous_text = raw.decode('utf-8')
                else:
                    previous_text = str(raw)
                unchanged = previous_text == content
            except Exception as inner_e:
                # Comparison is best effort; fall through to the upload.
                print(f"Warning: failed to download existing file '{previous_name}' for comparison: {inner_e}")
            if unchanged:
                return {"result": "skipped"}

        bucket = self.supabase.storage.from_(self.storage_bucket)
        bucket.upload(
            path=filename,
            file=content.encode('utf-8'),
            file_options={"content-type": "text/plain"}
        )
        return {"result": "uploaded"}
    except Exception as e:
        return {"result": "error", "error": str(e)}
79
+
80
def _get_latest_existing_filename(self, jurusan_id):
    """Return the newest stored filename for ``jurusan_id``, or ``None``.

    Only names of the form ``<jurusan_id>_YYYYMMDD_HHMMSS.txt`` at the root of
    the bucket are considered. Any failure while listing is reported on
    stdout and treated as "no previous file".
    """
    try:
        # NOTE(review): list() is called without options, so whatever default
        # page size the client applies also bounds this search - confirm.
        listing = self.supabase.storage.from_(self.storage_bucket).list()
        if not listing:
            return None

        # Entries may be dicts or objects depending on the client version.
        names = []
        for entry in listing:
            try:
                if isinstance(entry, dict):
                    name = entry.get('name')
                else:
                    name = getattr(entry, 'name', None)
            except Exception:
                name = None
            if name:
                names.append(name)

        wanted = re.compile(rf"^{re.escape(jurusan_id)}_\d{{8}}_\d{{6}}\.txt$")
        candidates = [name for name in names if wanted.match(name)]
        if not candidates:
            return None

        def stamp_of(name: str):
            found = re.search(r"_(\d{8}_\d{6})\.txt$", name)
            return found.group(1) if found else "00000000_000000"

        # Fixed-width timestamps order correctly as plain strings.
        return max(candidates, key=stamp_of)
    except Exception as e:
        print(f"Warning: could not list existing files for comparison: {e}")
        return None
118
+
119
def parse(self, response):
    """Dispatch the start page: parse it directly or follow department links.

    A response from ``elektro.pnp.ac.id`` is handed straight to
    ``parse_elektro_page``. Any other page is treated as the index: every
    unique link inside ``<article class="...section...">`` is followed with
    ``parse_jurusan`` unless it contains one of ``self.excluded_departments``.
    """
    if 'elektro.pnp.ac.id' in response.url:
        # This function is a generator, so the value after ``return`` is
        # discarded; the call is made for its side effects on the buffers.
        return self.parse_elektro_page(response, 'teknik_elektro', 'Jurusan Teknik Elektro')

    print("Memulai scraping dari halaman utama...")
    hrefs = response.xpath('//article[contains(@class, "section")]//a/@href').getall()

    for link in set(hrefs):
        lowered = link.lower()
        if not any(term in lowered for term in self.excluded_departments):
            target = response.urljoin(link)
            yield scrapy.Request(
                target,
                callback=self.parse_jurusan,
                meta={'jurusan_id': self.extract_jurusan_id(link)},
            )
137
+
138
def parse_elektro_page(self, response, jurusan_id, jurusan_name):
    """Convert every ``<table>`` on the page into schedule text.

    The department buffer is created on first use. Each table gets a section
    header; tables without day headers (``xAxis``) or time-slot headers
    (``yAxis``) contribute the header only.
    """
    if jurusan_id not in self.file_buffers:
        self.initialize_document_buffer(jurusan_id, jurusan_name)
    output_buffer = self.file_buffers[jurusan_id]

    def axis_labels(node, exact_query, loose_query):
        # Prefer an exact class match; fall back to a substring match.
        return node.xpath(exact_query).getall() or node.xpath(loose_query).getall()

    for table_idx, table in enumerate(response.xpath('//table')):
        caption_text = self.get_table_caption(table, table_idx)
        class_info = self.clean_class_info(caption_text, table)
        if not class_info:
            continue

        self.write_section_header(output_buffer, class_info)

        days = axis_labels(
            table,
            './/thead//th[@class="xAxis"]/text()',
            './/thead//th[contains(@class, "xAxis")]/text()',
        )
        time_slots = axis_labels(
            table,
            './/tbody//th[@class="yAxis"]/text()',
            './/tbody//th[contains(@class, "yAxis")]/text()',
        )
        if days and time_slots:
            schedule_grid = self.build_schedule_grid(days, time_slots)
            self.process_table_rows(table, schedule_grid, days, time_slots)
            self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
168
+
169
def initialize_document_buffer(self, jurusan_id, jurusan_name):
    """Create the in-memory document for a department and write its heading.

    Stores a fresh ``StringIO`` in ``self.file_buffers[jurusan_id]`` holding
    the title, department name, ``self.current_date`` and source line,
    followed by a horizontal rule.
    """
    document = StringIO()
    header_lines = (
        f"# Jadwal Perkuliahan {jurusan_name}\n\n",
        f"**Jurusan:** {jurusan_name}\n",
        f"**Tanggal Update:** {self.current_date}\n",
        "**Sumber:** Politeknik Negeri Padang\n\n",
        "---\n\n",
    )
    document.write("".join(header_lines))
    self.file_buffers[jurusan_id] = document
180
+
181
def get_table_caption(self, table, table_idx):
    """Return a caption for ``table``.

    Uses the table's own ``<caption>`` text when it is non-empty; otherwise
    the first text returned for the preceding h2/h3/h4 headings; otherwise
    the placeholder ``"Jadwal Kelas <table_idx + 1>"``.
    """
    own_caption = ' '.join(table.xpath('.//caption//text()').getall()).strip()
    if own_caption:
        return own_caption

    heading = table.xpath(
        'preceding::h2[1]//text()|preceding::h3[1]//text()|preceding::h4[1]//text()'
    ).get()
    if heading:
        return heading.strip()
    return f"Jadwal Kelas {table_idx + 1}"
191
+
192
def clean_class_info(self, caption_text, table):
    """Join the caption with the first header row and collapse whitespace."""
    first_header_row = ' '.join(table.xpath('.//thead/tr[1]//text()').getall()).strip()
    if first_header_row:
        combined = f"{caption_text} {first_header_row}"
    else:
        combined = caption_text
    return re.sub(r'\s+', ' ', combined).strip()
197
+
198
def write_section_header(self, buffer, class_info):
    """Write the heading and intro sentence that open one class schedule."""
    heading = f"## Jadwal Perkuliahan {class_info}\n\n"
    intro = "Berikut adalah jadwal perkuliahan untuk kelas tersebut, diurutkan berdasarkan hari dan waktu:\n\n"
    buffer.write(heading)
    buffer.write(intro)
202
+
203
def build_schedule_grid(self, days, time_slots):
    """Return ``{day: {time_slot: 'kosong'}}`` with every slot marked empty."""
    grid = {}
    for day in days:
        # A separate inner dict per day so later writes do not leak across days.
        grid[day] = dict.fromkeys(time_slots, 'kosong')
    return grid
206
+
207
def process_table_rows(self, table, schedule_grid, days, time_slots):
    """Fill ``schedule_grid`` from the table body, honouring rowspan/colspan.

    Body rows (excluding rows whose class contains ``foot``) are paired with
    ``time_slots`` in order; surplus rows are ignored. Cells carried over
    from earlier rows through ``rowspan`` are applied first, then the row's
    own ``<td>`` cells are placed in the remaining columns from left to right.

    Args:
        table: Selector for one schedule table.
        schedule_grid: Mapping ``day -> time_slot -> text``; updated in place.
        days: Column labels, left to right.
        time_slots: Row labels, top to bottom.
    """
    def span(cell, attribute):
        # A missing, empty, malformed, zero or negative span counts as 1 so a
        # single bad attribute can neither abort the parse nor stall or
        # rewind the column cursor.
        raw = cell.xpath(f'./@{attribute}').get()
        try:
            value = int(raw or 1)
        except (TypeError, ValueError):
            return 1
        return max(value, 1)

    rows = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]')
    active_rowspans = {}

    for row_idx, (row, current_time) in enumerate(zip(rows, time_slots)):
        filled_columns = set()

        # Columns still covered by a rowspan from a previous row.
        self.apply_active_rowspans(active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx)

        col_idx = 0
        for cell in row.xpath('./td'):
            # Skip over columns already occupied by a carried-over cell.
            while col_idx < len(days) and col_idx in filled_columns:
                col_idx += 1
            if col_idx >= len(days):
                break

            cell_content = self.process_cell_content(cell)
            rowspan = span(cell, 'rowspan')
            colspan = span(cell, 'colspan')

            self.update_schedule_grid(schedule_grid, days, current_time, col_idx, colspan, cell_content)
            self.update_active_rowspans(active_rowspans, row_idx, col_idx, colspan, rowspan, cell_content)

            col_idx += colspan
241
+
242
def apply_active_rowspans(self, active_rowspans, schedule_grid, days, current_time, filled_columns, row_idx):
    """Copy cells that span into the current row and age their counters.

    ``active_rowspans`` maps ``(column, start_row)`` to
    ``(rows_left, content)``. Each entry whose column exists in ``days`` is
    written to ``schedule_grid`` at ``current_time``, its column is added to
    ``filled_columns`` and its counter is decremented; entries that reach
    zero are removed. ``row_idx`` is accepted but not used.
    """
    exhausted = []

    for key in list(active_rowspans):
        remaining, text = active_rowspans[key]
        column = key[0]
        if remaining <= 0 or column >= len(days):
            continue

        schedule_grid[days[column]][current_time] = text
        filled_columns.add(column)

        remaining -= 1
        active_rowspans[key] = (remaining, text)
        if remaining <= 0:
            exhausted.append(key)

    for key in exhausted:
        del active_rowspans[key]
258
+
259
def process_cell_content(self, cell):
    """Return the cell's joined text, or ``'kosong'`` for empty / ``---`` cells."""
    text = ' '.join(cell.xpath('.//text()').getall()).strip()
    if text in ('', '---'):
        return 'kosong'
    return text
263
+
264
def update_schedule_grid(self, schedule_grid, days, current_time, col_idx, colspan, content):
    """Write ``content`` into ``colspan`` consecutive day columns at ``current_time``.

    Columns past the end of ``days`` are ignored.
    """
    day_count = len(days)
    for offset in range(colspan):
        column = col_idx + offset
        if column >= day_count:
            # Offsets only grow, so every later column is out of range too.
            break
        schedule_grid[days[column]][current_time] = content
270
+
271
def update_active_rowspans(self, active_rowspans, row_idx, col_idx, colspan, rowspan, content):
    """Register a multi-row cell so later rows can be filled from it.

    For ``rowspan > 1`` every column the cell covers gets an entry
    ``(column, row_idx) -> (rowspan - 1, content)``; otherwise nothing changes.
    """
    if rowspan <= 1:
        return
    rows_left = rowspan - 1
    active_rowspans.update(
        {(col_idx + offset, row_idx): (rows_left, content) for offset in range(colspan)}
    )
276
+
277
def format_course_entry(self, time_slots, course_info):
    """Split a raw schedule cell into course code, name, lecturer and room.

    Two layouts are recognised:

    * ``CODE Course_Name_P Lecturer Room`` - split at the ``_P`` marker. The
      marker that ends a word is preferred, so a name such as
      ``Prak_Pemrograman_P`` is not cut at its inner ``_P``.
    * whitespace separated - last token is the room, the one before it the
      lecturer, everything between the code and those two is the name.

    Text that contains an underscore but no ``_P`` marker is parsed with the
    whitespace layout instead of yielding empty fields.

    Args:
        time_slots: Time-slot labels covered by the course; formatted with
            ``self.format_time_range``.
        course_info: Raw cell text.

    Returns:
        Dict with keys ``time_range``, ``course_code``, ``course_name``,
        ``lecturer`` and ``room``.
    """
    parts = course_info.split()
    first = parts[0] if parts else ""
    # A course code is three letters followed by four digits.
    is_code = len(first) == 7 and first[:3].isalpha() and first[3:].isdigit()
    course_code = first if is_code else ""

    marker = re.search(r"_P(?=\s|$)", course_info)
    if marker:
        head, tail = course_info[:marker.start()], course_info[marker.end():]
    elif "_P" in course_info:
        # No word-final marker: keep splitting at the first occurrence.
        head, _, tail = course_info.partition("_P")
    else:
        head = tail = None

    if head is not None:
        course_name = head.replace(course_code, "").strip()
        remaining = tail.strip().split()
        lecturer = " ".join(remaining[:-1])
        room = remaining[-1] if remaining else ""
    else:
        if len(parts) > 3:
            course_name = " ".join(parts[1:-2])
        else:
            course_name = course_info.replace(course_code, "").strip()
        lecturer = parts[-2] if len(parts) > 1 else ""
        room = parts[-1] if parts else ""

    return {
        "time_range": self.format_time_range(time_slots),
        "course_code": course_code,
        "course_name": course_name,
        "lecturer": lecturer,
        "room": room,
    }
312
+
313
def write_schedule_to_buffer(self, buffer, schedule_grid, days, time_slots):
    """Write one bullet per course block, day by day.

    Consecutive time slots holding the same text are merged into a single
    entry ``- <day> <time range> | <course>``. Slots marked ``kosong`` are
    omitted. A blank line follows each day, even when it has no entries.
    """
    for day in days:
        # Collapse the day's column into runs of identical consecutive cells.
        runs = []
        for slot in time_slots:
            course = schedule_grid[day][slot]
            if runs and runs[-1][0] == course:
                runs[-1][1].append(slot)
            else:
                runs.append((course, [slot]))

        entries = [
            f"- {day} {self.format_time_range(slots)} | {course}"
            for course, slots in runs
            if course and course.lower() != 'kosong'
        ]

        for entry in entries:
            buffer.write(entry + "\n")
        buffer.write("\n")  # blank line between days
343
+
344
+
345
def format_time_range(self, time_slots):
    """Collapse consecutive slot labels into one ``start - end`` string.

    Args:
        time_slots: Slot labels such as ``"07:00 - 07:50"``, in order.

    Returns:
        The label itself for a single slot; the start of the first slot and
        the end of the last slot joined by `` - `` for several; an empty
        string when ``time_slots`` is empty.
    """
    if not time_slots:
        # Nothing to format; avoids an IndexError on the lookups below.
        return ""
    if len(time_slots) == 1:
        return time_slots[0]

    first_start = time_slots[0].split('-')[0].strip()
    last_end = time_slots[-1].split('-')[-1].strip()
    return f"{first_start} - {last_end}"
353
+
354
def extract_jurusan_id(self, link):
    """Return the department id encoded in ``link``.

    The id is the number in ``department?dep=<n>``. Links without it get a
    placeholder ``unknown_<0-999>`` derived from a CRC32 of the link, so the
    same link maps to the same id on every run (the built-in ``hash`` of a
    string is randomized per process).
    """
    import zlib

    match = re.search(r'department\?dep=(\d+)', link)
    if match:
        return match.group(1)
    return f"unknown_{zlib.crc32(link.encode('utf-8')) % 1000}"
357
+
358
def parse_jurusan(self, response):
    """Follow a department page to its "groups, days horizontal" timetable.

    The department name is taken from the page title. When the timetable
    link is present, a request for it is yielded with a filename-safe form of
    that name as ``jurusan_id``; otherwise nothing is yielded.
    """
    jurusan_name = self.extract_title_jurusan_name(response)

    timetable_link = response.xpath(
        '//td/a[contains(@href, "groups_days_horizontal") and not(contains(@href, "subgroups_days_horizontal"))]/@href'
    ).get()
    if not timetable_link:
        return

    timetable_url = response.urljoin(timetable_link)
    # Anything outside word characters, '-', '_', '.' and space becomes '_'.
    safe_jurusan_name = re.sub(r'[^\w\-_\. ]', '_', jurusan_name)

    yield scrapy.Request(
        timetable_url,
        callback=self.parse_jadwal,
        meta={'jurusan_id': safe_jurusan_name, 'jurusan_name': jurusan_name},
    )
371
+
372
def parse_jadwal(self, response):
    """Convert a department timetable page into schedule text.

    Reads ``jurusan_id`` and ``jurusan_name`` from ``response.meta``, creates
    the department buffer on first use and appends one section per table.
    Tables whose id contains ``table_`` are preferred; if there are none, all
    tables on the page are used. Tables without day or time-slot headers
    contribute the section header only.
    """
    jurusan_id = response.meta.get('jurusan_id')
    jurusan_name = response.meta.get('jurusan_name')

    if jurusan_id not in self.file_buffers:
        self.initialize_document_buffer(jurusan_id, jurusan_name)

    output_buffer = self.file_buffers[jurusan_id]
    tables = response.xpath('//table[contains(@id, "table_")]') or response.xpath('//table')

    for table_idx, table in enumerate(tables):
        # Pass the real position so uncaptioned tables get distinct fallback
        # names instead of all being "Jadwal Kelas 1".
        caption_text = self.get_table_caption(table, table_idx)
        class_info = self.clean_class_info(caption_text, table)

        if not class_info:
            continue

        self.write_section_header(output_buffer, class_info)

        days = table.xpath('.//thead//th[@class="xAxis"]/text()').getall()
        time_slots = table.xpath('.//tbody/tr[not(contains(@class, "foot"))]/th[@class="yAxis"]/text()').getall()

        if not days or not time_slots:
            continue

        schedule_grid = self.build_schedule_grid(days, time_slots)
        self.process_table_rows(table, schedule_grid, days, time_slots)
        self.write_schedule_to_buffer(output_buffer, schedule_grid, days, time_slots)
400
+
401
def extract_title_jurusan_name(self, response):
    """Return the stripped page ``<title>``, or ``Jurusan_<jurusan_id>`` if absent."""
    title = response.xpath('//title/text()').get()
    if title:
        return title.strip()
    return f"Jurusan_{response.meta.get('jurusan_id')}"
404
 
405
if __name__ == "__main__":
    # Script entry point: run PnpSpider in-process with one request at a time,
    # a download delay of 1, robots.txt honoured and the HTTP cache disabled.
    crawl_settings = dict(
        DOWNLOAD_DELAY=1,
        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        ROBOTSTXT_OBEY=True,
        LOG_LEVEL='INFO',
        HTTPCACHE_ENABLED=False,
        CONCURRENT_REQUESTS=1,
        RETRY_TIMES=3,
    )
    runner = CrawlerProcess(settings=crawl_settings)
    runner.crawl(PnpSpider)
    runner.start()
scrapping/jurusan_scrap.py CHANGED
@@ -1,130 +1,326 @@
1
- import os
2
- import re
3
- from datetime import datetime
4
- from typing import Dict, List
5
-
6
  from bs4 import BeautifulSoup
 
7
  from supabase import create_client
 
 
 
8
 
9
- # Crawl4AI helper for rendered fetching
10
- try:
11
- from utils.crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
12
- except Exception:
13
- import sys as _sys
14
- import os as _os
15
- _sys.path.append(_os.path.join(_os.path.dirname(__file__), 'utils'))
16
- from crawl4ai_utils import fetch_html_sync, crawl_domain_parallel_sync
17
-
18
- # Shared dedup upload utility
19
  try:
20
  from utils.supabase_utils import upload_if_changed
21
  except Exception:
22
- import sys as _sys2
23
- import os as _os2
24
- _sys2.path.append(_os2.path.join(_os2.path.dirname(__file__), 'utils'))
25
  from supabase_utils import upload_if_changed
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- DOMAIN_TO_NAME: Dict[str, str] = {
29
- 'akt.pnp.ac.id': 'Akuntansi',
30
- 'an.pnp.ac.id': 'Administrasi_Niaga',
31
- 'bing.pnp.ac.id': 'Bahasa_Inggris',
32
- 'elektro.pnp.ac.id': 'Teknik_Elektro',
33
- 'me.pnp.ac.id': 'Teknik_Mesin',
34
- 'sipil.pnp.ac.id': 'Teknik_Sipil',
35
- 'ti.pnp.ac.id': 'Teknologi_Informasi',
36
- }
37
-
38
- START_URLS: List[str] = [f"https://{d}/" for d in DOMAIN_TO_NAME.keys()]
39
-
40
-
41
- PRODI_PATTERN = re.compile(r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b', re.I)
42
-
43
-
44
- def is_valid_prodi(nama: str) -> bool:
45
- return bool(PRODI_PATTERN.match(nama or ""))
46
-
47
-
48
- def extract_prodi_from_html(html: str) -> List[str]:
49
- soup = BeautifulSoup(html, 'html.parser')
50
- found: List[str] = []
51
- for a in soup.find_all('a'):
52
- txt = a.get_text(strip=True)
53
- if txt and is_valid_prodi(txt) and txt not in found:
54
- found.append(txt)
55
- return found
56
-
57
-
58
- def build_rekap_text(rekap: Dict[str, List[str]]) -> str:
59
- lines: List[str] = []
60
- lines.append("# REKAP PROGRAM STUDI PNP\n")
61
- lines.append(f"Diperbarui pada: {datetime.now().strftime('%d %B %Y %H:%M')}\n\n")
62
- total_prodi = 0
63
- jumlah_jurusan = 0
64
- for jurusan_key, daftar in rekap.items():
65
- valid = [p.strip() for p in daftar if is_valid_prodi(p)]
66
- if not valid:
67
- continue
68
- jur_baca = jurusan_key.replace('_', ' ')
69
- lines.append(f"{jur_baca}:\n")
70
- for p in sorted(set(valid)):
71
- lines.append(f"- {p}\n")
72
- jumlah = len(valid)
73
- lines.append(f"Jumlah program studi jurusan {jur_baca}: {jumlah}\n\n")
74
- total_prodi += jumlah
75
- jumlah_jurusan += 1
76
- lines.append(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
77
- lines.append(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")
78
- return ''.join(lines)
79
-
80
-
81
- if __name__ == '__main__':
82
- # Supabase client
83
- supabase = create_client(
84
- os.environ.get('NEXT_PUBLIC_SUPABASE_URL'),
85
- os.environ.get('SUPABASE_SERVICE_KEY'),
86
- )
87
- bucket = os.environ.get('NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET', 'pnp-bot-storage')
88
-
89
- # Crawl each jurusan domain in parallel batches, collect prodi across important URLs
90
- rekap_prodi: Dict[str, List[str]] = {}
91
- for url in START_URLS:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  try:
93
- domain = url.split('//')[1].strip('/').lower()
94
- jurusan = DOMAIN_TO_NAME.get(domain, domain)
95
- pages = crawl_domain_parallel_sync(
96
- seed_url=url,
97
- max_pages=30,
98
- max_concurrency=6,
99
- only_important=True,
100
- timeout=30,
101
- headless=True,
102
- )
103
- prodi_set = set()
104
- for page_url, html in pages.items():
105
- if not html:
106
- continue
107
- for p in extract_prodi_from_html(html):
108
- prodi_set.add(p)
109
- prodi_list = sorted(prodi_set)
110
- rekap_prodi[jurusan] = prodi_list
111
- print(f"[Jurusan] {jurusan}: {len(prodi_list)} prodi ditemukan dari {len(pages)} halaman penting")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  except Exception as e:
113
- print(f"[Jurusan] Gagal fetch {url}: {e}")
114
-
115
- # Build single REKAP file and upload with dedup
116
- timestamp = datetime.now().strftime('%Y%m%d_%H%M')
117
- rekap_filename = f"REKAP_PROGRAM_STUDI_{timestamp}.txt"
118
- rekap_text = build_rekap_text(rekap_prodi)
119
- try:
120
- result = upload_if_changed(supabase, bucket, rekap_filename, rekap_text)
121
- status = result.get('result')
122
- if status == 'uploaded':
123
- print(f"✅ Uploaded rekap: {rekap_filename}")
124
- elif status == 'skipped':
125
- print(f"⏭️ Skipped upload (unchanged): {rekap_filename}")
126
- else:
127
- print(f"❌ Upload error for {rekap_filename}: {result.get('error')}")
128
- except Exception as e:
129
- print(f"❌ Error uploading rekap: {e}")
130
- # End of minimal Crawl4AI rekap script
 
1
+ import scrapy
2
+ from scrapy.crawler import CrawlerProcess
 
 
 
3
  from bs4 import BeautifulSoup
4
+ from dotenv import load_dotenv
5
  from supabase import create_client
6
+ from datetime import datetime
7
+ import os, re, tempfile
8
+ import sys
9
 
10
+ # Try import shared dedup upload utility
 
 
 
 
 
 
 
 
 
11
  try:
12
  from utils.supabase_utils import upload_if_changed
13
  except Exception:
14
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
 
 
15
  from supabase_utils import upload_if_changed
16
 
17
# Load variables from a .env file before the settings below are read.
load_dotenv()

# Supabase connection settings, read once at import time. Only the bucket
# name has a fallback; URL and key are None when the variables are unset.
SUPABASE_URL = os.getenv("NEXT_PUBLIC_SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
SUPABASE_BUCKET = os.getenv("NEXT_PUBLIC_SUPABASE_STORAGE_BUCKET", "pnp-bot-storage")
22
+
23
+
24
def is_valid_prodi(nama):
    """Return True when ``nama`` starts with a study-programme level prefix.

    Recognised prefixes (case-insensitive, followed by a word boundary):
    ``D2``-``D4`` with an optional dash or space, ``Diploma 2``-``4``,
    ``Magister``, ``Sarjana Terapan``, ``Teknologi Rekayasa`` and
    ``Prodi D3``. ``None`` and other non-string values are never valid.
    """
    if not isinstance(nama, str):
        # re.match would raise TypeError on None / non-string input.
        return False
    return bool(re.match(
        r'^(D[-\s]?[2-4]|Diploma[-\s]?[2-4]|Magister|Sarjana Terapan|Teknologi Rekayasa|Prodi D3)\b',
        nama, re.I
    ))
29
+
30
class JurusanSpider(scrapy.Spider):
    """Crawl the department sites and upload their text content.

    Every start page is scanned for study-programme links; the programme
    pages and all same-site links found on the start page are fetched and
    reduced to plain text. When the spider closes, one text file per
    department plus a study-programme recap file are uploaded through
    ``upload_if_changed``.
    """

    name = "jurusan"
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': "PNPBot/1.2",
        'ROBOTSTXT_OBEY': True,
        'LOG_LEVEL': 'INFO',
        'CONCURRENT_REQUESTS': 1,
        'RETRY_TIMES': 3
    }

    # Site host -> department key used in filenames and reports.
    domain_to_name = {
        'akt.pnp.ac.id': 'Akuntansi',
        'an.pnp.ac.id': 'Administrasi_Niaga',
        'bing.pnp.ac.id': 'Bahasa_Inggris',
        'elektro.pnp.ac.id': 'Teknik_Elektro',
        'me.pnp.ac.id': 'Teknik_Mesin',
        'sipil.pnp.ac.id': 'Teknik_Sipil',
        'ti.pnp.ac.id': 'Teknologi_Informasi',
    }

    start_urls = [f"https://{d}/" for d in domain_to_name.keys()]

    def __init__(self, *args, **kwargs):
        """Set up the storage client and the per-run collections.

        Positional and keyword arguments are forwarded to the base spider so
        spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        self.supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
        self.bucket = SUPABASE_BUCKET
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        self.per_jurusan_pages = {}  # department -> list of page dicts
        self.rekap_prodi = {}        # department -> programme names

    @staticmethod
    def _now_label():
        """Return the access timestamp used in page headers."""
        return datetime.now().strftime('%d %B %Y %H:%M')

    @staticmethod
    def _normalize_newlines(text):
        """Turn CRLF and bare CR into LF.

        Stands in for the newline translation the earlier write-then-read
        temp-file round trip applied before comparison.
        """
        return text.replace("\r\n", "\n").replace("\r", "\n")

    def _store_page(self, jurusan, url, title, content):
        """Append one processed page to the department's page list."""
        self.per_jurusan_pages.setdefault(jurusan, []).append({
            "url": url,
            "title": title,
            "content": content
        })

    def parse(self, response):
        """Collect programme names from a start page and schedule detail pages."""
        from urllib.parse import urlparse

        domain = urlparse(response.url).netloc
        jurusan = self.domain_to_name.get(domain, domain)
        soup = BeautifulSoup(response.text, "html.parser")

        program_studi = []

        # Links whose text looks like a study programme (D3, D4, ...).
        for a_tag in soup.find_all("a"):
            item = a_tag.get_text(strip=True)
            href = a_tag.get("href")
            if item and is_valid_prodi(item) and item not in program_studi:
                program_studi.append(item)
                if href:
                    prodi_url = response.urljoin(href)
                    self.logger.info(f"[🧩] Ditemukan prodi: {item} ({prodi_url}) di jurusan {jurusan}")
                    yield scrapy.Request(prodi_url, callback=self.parse_detail, meta={"jurusan": jurusan, "url": prodi_url})

        # Keep the programme list for the recap written in closed().
        self.rekap_prodi[jurusan] = program_studi

        # Also follow every absolute or root-relative link on this site.
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if not (href.startswith("http") or href.startswith("/")):
                continue
            target = response.urljoin(href)
            host = urlparse(target).netloc
            # Compare hosts rather than searching the whole URL, so a foreign
            # URL that merely mentions the domain (e.g. a share link) is skipped.
            if host == domain or host.endswith("." + domain):
                yield scrapy.Request(target, callback=self.parse_detail, meta={"jurusan": jurusan, "url": target})

    def parse_detail(self, response):
        """Reduce one page to text and store it under its department."""
        jurusan = response.meta["jurusan"]
        url = response.meta["url"]
        soup = BeautifulSoup(response.text, "html.parser")

        # Pick the candidate container holding the most text as main content.
        candidates = soup.select(
            "main, article, #content, #primary, .site-content, .entry-content, .post-content, .content, .page-content, .container main, .elementor-section.elementor-top-section, .elementor-container, .elementor-widget-theme-post-content"
        )

        def text_len(el):
            try:
                return len(el.get_text(" ", strip=True))
            except Exception:
                return 0

        main_area = max(candidates, key=text_len) if candidates else soup.body or soup

        # Strip navigation, chrome, scripts and similar non-content elements.
        blacklist_selectors = [
            'header', 'footer', 'nav', 'aside', 'menu', 'form',
            '.header', '.footer', '.navbar', '.nav', '.sidebar', '.menu',
            '.site-header', '.site-footer', '#site-header', '#colophon', '.widget', '.widget-area',
            '.breadcrumbs', '.pagination', '.navigation', '.page-links',
            'script', 'style', 'noscript', 'iframe',
            '.social-links', '.share-buttons', '.newsletter',
            '.ad-container', '.ads', '.advert', '[role="navigation"]', '[aria-label*="breadcrumb" i]'
        ]
        for selector in blacklist_selectors:
            for tag in main_area.select(selector):
                tag.decompose()

        # Drop leftover elements that have neither text nor children.
        for element in list(main_area.find_all(True)):
            if not element.get_text(strip=True) and not element.find_all(True):
                element.decompose()

        title_tag = main_area.find("h1") or soup.find("title")
        page_title = title_tag.get_text(strip=True) if title_tag else "Halaman"

        # NOTE(review): the header lines below are assumed to be flush-left in
        # the original multi-line literals - confirm against the real file.

        # ==== Special case: TI department leadership page ====
        if url == "https://ti.pnp.ac.id/index.php/pimpinan-jurusan/":
            leadership_data = {
                "Pimpinan Jurusan": [],
                "Koordinator Program Studi": [],
                "Kepala Labor": []
            }

            for member in soup.find_all(class_="member-item"):
                name_tag = member.find(class_="item-title")
                name = name_tag.get_text(strip=True) if name_tag else "N/A"
                position_tag = member.find(class_="small-text")
                position = position_tag.get_text(strip=True) if position_tag else "N/A"

                if "Ketua Jurusan" in position or "Sekretaris Jurusan" in position:
                    leadership_data["Pimpinan Jurusan"].append({"nama": name, "jabatan": position})
                elif "Koordinator Program Studi" in position or "Koordinator PSDKU" in position:
                    leadership_data["Koordinator Program Studi"].append({"nama": name, "jabatan": position})
                elif "Kepala Labor" in position:
                    leadership_data["Kepala Labor"].append({"nama": name, "jabatan": position})

            naratif = []
            naratif.append("## Pimpinan Jurusan")
            for leader in leadership_data["Pimpinan Jurusan"]:
                naratif.append(f"- {leader['jabatan']}: {leader['nama']}")

            naratif.append("\n## Koordinator Program Studi")
            for coordinator in leadership_data["Koordinator Program Studi"]:
                naratif.append(f"- {coordinator['jabatan']}: {coordinator['nama']}")

            naratif.append("\n## Kepala Labor")
            for lab_head in leadership_data["Kepala Labor"]:
                naratif.append(f"- {lab_head['jabatan']}: {lab_head['nama']}")

            content_text = (
                "# Pimpinan Jurusan Teknologi Informasi\n"
                "\n"
                f"URL: {url}\n"
                "Jurusan: Teknologi Informasi\n"
                f"Tanggal Akses: {self._now_label()}\n"
                "\n"
            ) + "\n".join(naratif)

            self._store_page(jurusan, url, "Pimpinan Jurusan Teknologi Informasi", content_text)
            return

        # ==== Special case: TI teaching staff page ====
        elif url == "https://ti.pnp.ac.id/index.php/dosen-staf-pengajar/":
            dosen_data = []
            gallery = soup.find('div', class_='gallery')
            if gallery:
                for item in gallery.find_all('dl', class_='gallery-item'):
                    caption = item.find('dd', class_='wp-caption-text')
                    nama_gelar = caption.get_text(strip=True) if caption else ""
                    link_tag = item.find('a')
                    link = link_tag['href'] if link_tag and link_tag.has_attr('href') else ""
                    img_tag = item.find('img')
                    foto = img_tag['src'] if img_tag and img_tag.has_attr('src') else ""
                    dosen_data.append({
                        "nama_gelar": nama_gelar,
                        "link_profil": link,
                        "foto": foto
                    })

            chunks = [
                "# Daftar Dosen Staf Pengajar Jurusan Teknologi Informasi\n"
                "\n"
                f"URL: {url}\n"
                "Jurusan: Teknologi Informasi\n"
                f"Tanggal Akses: {self._now_label()}\n"
                f"Jumlah Dosen: {len(dosen_data)}\n"
                "\n"
                "## Daftar Dosen:\n"
            ]
            for idx, dosen in enumerate(dosen_data, 1):
                chunks.append(f"\n### {idx}. {dosen['nama_gelar']}")
                if dosen['link_profil']:
                    chunks.append(f"\n- Link Profil: {dosen['link_profil']}")
                if dosen['foto']:
                    chunks.append(f"\n- Foto: {dosen['foto']}")
                chunks.append("\n")
            content_text = "".join(chunks)

            self._store_page(jurusan, url, "Daftar Dosen Staf Pengajar Jurusan Teknologi Informasi", content_text)
            return

        # ==== Standard parsing ====
        body_text = []
        for p in main_area.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]):
            txt = p.get_text(strip=True)
            if txt:
                body_text.append(txt)

        chunks = [
            (
                f"# {page_title}\n"
                "\n"
                f"URL: {url}\n"
                f"Jurusan: {jurusan.replace('_', ' ')}\n"
                f"Tanggal Akses: {self._now_label()}\n"
                "\n"
            ) + "\n\n".join(body_text)
        ]

        # Append every table of the main area as pipe-separated rows.
        for i, table in enumerate(main_area.find_all("table")):
            chunks.append(f"\n\nTabel {i+1}\n\n")
            for row in table.find_all("tr"):
                cols = row.find_all(["td", "th"])
                row_data = [col.get_text(strip=True) for col in cols]
                chunks.append(" | ".join(row_data) + "\n")

        self._store_page(jurusan, url, page_title, "".join(chunks))

    def closed(self, reason):
        """Upload one file per department and the programme recap.

        Texts are assembled in memory, so no temporary file has to be created
        or cleaned up and an upload failure cannot be masked by cleanup code.
        """
        # One file per department.
        for jurusan, pages in self.per_jurusan_pages.items():
            filename = f"{jurusan.replace(' ', '_').upper()}_{self.timestamp}.txt"
            try:
                content_text = self._normalize_newlines(
                    "".join(page["content"] + "\n\n---\n\n" for page in pages)
                )
                result = upload_if_changed(self.supabase, self.bucket, filename, content_text)
                if result.get('result') == 'uploaded':
                    self.logger.info(f"✅ Uploaded file jurusan: {filename}")
                elif result.get('result') == 'skipped':
                    self.logger.info(f"⏭️ Skipped upload for {filename} (content unchanged)")
                else:
                    self.logger.error(f"❌ Gagal upload {filename}: {result.get('error')}")
            except Exception as e:
                self.logger.error(f"❌ Gagal upload {filename}: {e}")

        # Study-programme recap.
        rekap_filename = f"REKAP_PROGRAM_STUDI_{self.timestamp}.txt"
        try:
            lines = [f"# REKAP PROGRAM STUDI PNP\nDiperbarui pada: {self._now_label()}\n\n"]

            total_prodi = 0
            jumlah_jurusan = 0

            for jurusan, daftar in self.rekap_prodi.items():
                # De-duplicate once so the printed list and the count agree.
                valid_prodi = sorted({p.strip() for p in daftar if is_valid_prodi(p)})
                if not valid_prodi:
                    continue

                jurusan_baca = jurusan.replace("_", " ")
                lines.append(f"{jurusan_baca}:\n")
                for p in valid_prodi:
                    lines.append(f"- {p}\n")
                jumlah_prodi = len(valid_prodi)
                lines.append(f"Jumlah program studi jurusan {jurusan_baca}: {jumlah_prodi}\n\n")

                total_prodi += jumlah_prodi
                jumlah_jurusan += 1

            lines.append(f"Jumlah jurusan di Politeknik Negeri Padang: {jumlah_jurusan}\n")
            lines.append(f"Jumlah seluruh program studi Politeknik Negeri Padang: {total_prodi}\n")

            rekap_text = self._normalize_newlines("".join(lines))
            result = upload_if_changed(self.supabase, self.bucket, rekap_filename, rekap_text)
            if result.get('result') == 'uploaded':
                self.logger.info(f"✅ Uploaded file rekap: {rekap_filename}")
            elif result.get('result') == 'skipped':
                self.logger.info(f"⏭️ Skipped upload for rekap {rekap_filename} (content unchanged)")
            else:
                self.logger.error(f"❌ Gagal upload rekap {rekap_filename}: {result.get('error')}")

        except Exception as e:
            self.logger.error(f" Gagal upload rekap: {e}")
322
+
323
if __name__ == "__main__":
    # Script entry point: run JurusanSpider in-process; the crawl settings
    # come from the spider's own custom_settings.
    crawler = CrawlerProcess()
    crawler.crawl(JurusanSpider)
    crawler.start()
 
 
 
 
 
 
 
 
 
scrapping/pnp_scrap.py CHANGED
@@ -1,18 +1,10 @@
 
 
1
  from datetime import datetime
2
  import re
3
  import os
4
  from supabase import create_client, Client
5
  import html
6
- from typing import List
7
- from urllib.parse import urljoin
8
-
9
- from bs4 import BeautifulSoup
10
- try:
11
- from utils.crawl4ai_utils import crawl_domain_parallel_sync
12
- except Exception:
13
- import sys
14
- sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
15
- from crawl4ai_utils import crawl_domain_parallel_sync
16
 
17
  SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
18
  SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
@@ -28,103 +20,428 @@ except Exception:
28
  from supabase_utils import upload_if_changed
29
 
30
 
 
 
 
31
 
32
- if __name__ == '__main__':
33
- # Crawl4AI-based lightweight runner to fetch and upload core pages
34
- START_URLS = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- def _clean_text(text: str) -> str:
 
37
  if not text:
38
- return ''
39
- t = html.unescape(' '.join(text.split()))
40
- t = t.replace('“', '"').replace('â€', '"').replace('’', "'")
41
- t = t.replace('â€"', '—').replace('â€"', '–')
42
- return t.strip()
43
-
44
- def _extract_paragraphs(html_text: str, base_url: str) -> List[str]:
45
- soup = BeautifulSoup(html_text, 'html.parser')
46
- selectors = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  'div.entry-content', 'article.post', 'main.site-main',
48
  'div.content', 'div.main-content', 'div#content', 'div.page-content'
49
  ]
50
- content_area = None
51
- for sel in selectors:
52
- content_area = soup.select_one(sel)
53
  if content_area:
54
- break
55
- nodes = content_area.select('p, h1, h2, h3, h4, h5, h6, li') if content_area else soup.select('p, h1, h2, h3, h4, h5, h6, li')
56
- out: List[str] = []
57
- for node in nodes:
58
- text = _clean_text(node.get_text(' ', strip=True))
59
- if text and len(text.split()) >= 5:
60
- for a in node.find_all('a', href=True):
61
- href = a['href']
62
- if href and not href.startswith('#'):
63
- abs_url = href if href.startswith('http') else urljoin(base_url, href)
64
- text += f" (Link: {abs_url})"
65
- out.append(text)
66
- return out
67
-
68
- def _extract_tables(html_text: str, base_url: str) -> str:
69
- soup = BeautifulSoup(html_text, 'html.parser')
70
- blocks: List[str] = []
71
- for ti, table in enumerate(soup.select('table')):
72
- rows = []
73
- for tr in table.select('tr'):
74
- cells = []
75
- for c in tr.select('th, td'):
76
- tx = _clean_text(c.get_text(' ', strip=True))
77
- a = c.find('a', href=True)
78
- if a and a['href']:
79
- href = a['href']
80
- abs_url = href if href.startswith('http') else urljoin(base_url, href)
81
- tx += f" (Link: {abs_url})"
82
- if tx:
83
- cells.append(tx)
84
- if cells:
85
- rows.append(' | '.join(cells))
86
- if rows:
87
- blocks.append(f"### Tabel {ti + 1}\n\n" + "\n".join(rows))
88
- return "\n\n".join(blocks)
89
-
90
- def _final_md(title: str, url: str, paras: List[str], tables: str) -> str:
91
- md = f"# {title}\n\n**Tanggal**: {datetime.now().strftime('%d %B %Y')}\n**URL**: {url}\n\n" + "\n".join(paras)
92
- if tables:
93
- md += "\n\n## Data Tabel\n\n" + tables
94
- return md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- def _upload(page_title: str, content_text: str) -> str:
 
 
 
 
 
 
 
 
 
 
97
  safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
98
  safe_title = re.sub(r'[-\s]+', '-', safe_title)
99
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
100
  filename = f"{safe_title}_{timestamp}.txt"
101
  try:
102
  result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
103
- return filename if result.get('result') == 'uploaded' else f"skipped_{filename}"
 
 
 
 
 
 
 
 
104
  except Exception as e:
105
- print(f"Upload error: {e}")
106
  return f"failed_{filename}"
107
 
108
- for seed in START_URLS:
109
- try:
110
- pages = crawl_domain_parallel_sync(
111
- seed_url=seed,
112
- max_pages=40,
113
- max_concurrency=6,
114
- only_important=True,
115
- timeout=30,
116
- headless=True,
117
- )
118
- for page_url, html_text in pages.items():
119
- if not html_text:
120
- continue
121
- soup = BeautifulSoup(html_text, 'html.parser')
122
- title_node = soup.select_one('h1.entry-title, h1.page-title')
123
- page_title = title_node.get_text(strip=True) if title_node else (soup.title.string.strip() if soup.title and soup.title.string else 'Unknown Page')
124
- paras = _extract_paragraphs(html_text, page_url)
125
- tables = _extract_tables(html_text, page_url)
126
- content = _final_md(page_title, page_url, paras, tables)
127
- up = _upload(page_title, content)
128
- print(f"[PNP crawl] {page_url} -> {up}")
129
- except Exception as e:
130
- print(f"[PNP crawl] Error processing seed {seed}: {e}")
 
 
 
 
 
 
 
 
 
1
+ import scrapy
2
+ from scrapy.crawler import CrawlerProcess
3
  from datetime import datetime
4
  import re
5
  import os
6
  from supabase import create_client, Client
7
  import html
 
 
 
 
 
 
 
 
 
 
8
 
9
  SUPABASE_URL = os.environ.get("NEXT_PUBLIC_SUPABASE_URL")
10
  SUPABASE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")
 
20
  from supabase_utils import upload_if_changed
21
 
22
 
23
class PNPContentSpider(scrapy.Spider):
    """Crawl the PNP main and admissions sites and upload each page as text.

    ``parse`` walks the WordPress navigation menu of every start URL and
    schedules one request per menu entry.  ``parse_content`` turns the page
    into a Markdown-like document (with a dedicated path for leadership
    pages) and hands it to ``upload_content``.
    """

    name = 'pnp_content_spider'
    start_urls = ['https://www.pnp.ac.id', 'https://penerimaan.pnp.ac.id']

    # Department sites that this spider must not follow.
    # NOTE(review): presumably they are covered by a separate spider - confirm.
    excluded_subdomains = [
        'akt.pnp.ac.id',
        'an.pnp.ac.id',
        'bing.pnp.ac.id',
        'elektro.pnp.ac.id',
        'me.pnp.ac.id',
        'sipil.pnp.ac.id',
        'ti.pnp.ac.id'
    ]

    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'RETRY_TIMES': 3,
        'HTTPCACHE_ENABLED': False,
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS': 1,
        'RETRY_ENABLED': True,
        'USER_AGENT': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        'LOG_LEVEL': 'INFO',
    }

    # Titles that say nothing about the page itself; parse_content replaces
    # them with the page heading when one is available.
    _PLACEHOLDER_TITLES = ('', 'Unknown Page', 'Header Link')

    # Ordered (broken, fixed) pairs.  Longer mis-decoded sequences MUST come
    # before the bare two-character prefix, otherwise the prefix is consumed
    # first and the dash replacements can never match.
    _TEXT_REPLACEMENTS = (
        ('\u00e2\u20ac\u0153', '"'),        # mis-decoded left double quote
        ('\u00e2\u20ac\u2122', "'"),        # mis-decoded right single quote
        ('\u00e2\u20ac\u201d', '\u2014'),   # mis-decoded em dash
        ('\u00e2\u20ac\u201c', '\u2013'),   # mis-decoded en dash
        ('\u00e2\u20ac"', '\u2014'),        # dash sequence with a flattened quote
        ('\u00e2\u20ac', '"'),              # remaining bare prefix
        ('\u201c', '"'),                    # typographic left double quote
        ('\u2019', "'"),                    # typographic apostrophe
    )

    # ------------------------------------------------------------------ #
    # Small helpers
    # ------------------------------------------------------------------ #

    def _node_text(self, node) -> str:
        """Return the cleaned, space-joined text of *node* and its descendants."""
        return self.clean_text(' '.join(node.css('*::text').getall()))

    def _is_excluded(self, url: str) -> bool:
        """Return True when *url* must not be followed by this spider.

        A URL is excluded when it contains "jurusan" (the original rule) or
        when its host is listed in ``excluded_subdomains``.
        """
        from urllib.parse import urlparse

        host = urlparse(url).hostname or ''
        return "jurusan" in url.lower() or host in self.excluded_subdomains

    @staticmethod
    def _is_leadership_page(url: str, page_title: str) -> bool:
        """Return True for a pnp.ac.id page whose URL or title mentions "pimpinan"."""
        mentions = "pimpinan" in url.lower() or "pimpinan" in page_title.lower()
        return mentions and "pnp.ac.id" in url

    @staticmethod
    def _nav_entry(node):
        """Return ``(title, href)`` of the first navigation anchor inside *node*."""
        anchor = 'a.wp-block-navigation-item__content'
        title = (node.css(f'{anchor} span.wp-block-navigation-item__label::text').get() or '').strip()
        if not title:
            title = node.css(f'{anchor}::text').get('').strip()
        href = node.css(f'{anchor}::attr(href)').get()
        return title, href

    # ------------------------------------------------------------------ #
    # Text utilities
    # ------------------------------------------------------------------ #

    def clean_text(self, text: str) -> str:
        """Clean and normalize text content.

        Decodes HTML entities, collapses every run of whitespace to a single
        space, then repairs common mis-decoded punctuation.  Returns ``""``
        for falsy input.
        """
        if not text:
            return ""

        # Decode HTML entities
        text = html.unescape(text)

        # Remove extra whitespace and normalize
        text = ' '.join(text.split())

        # Fix common encoding issues (order matters, see _TEXT_REPLACEMENTS)
        for broken, fixed in self._TEXT_REPLACEMENTS:
            text = text.replace(broken, fixed)

        return text.strip()

    def format_paragraph(self, text: str) -> str:
        """Return the leading sentences of *text* as one paragraph.

        Sentences are accumulated until the running word count lies between
        50 and 150 inclusive; whatever follows that sentence is dropped.
        When the count never lands in that window the whole text is kept.
        NOTE(review): the truncation discards content - confirm it is wanted.
        """
        text = self.clean_text(text)
        sentences = re.split(r'(?<=[.!?]) +', text)
        kept = []
        word_count = 0
        for sentence in sentences:
            kept.append(sentence)
            word_count += len(sentence.split())
            if 50 <= word_count <= 150:
                break
        return ' '.join(kept).strip()

    # ------------------------------------------------------------------ #
    # Crawl entry point
    # ------------------------------------------------------------------ #

    def parse(self, response):
        """Schedule one ``parse_content`` request per navigation menu entry.

        Top-level entries and their submenu entries are both followed.
        Fragment-only links are ignored; excluded URLs are not followed, and
        an excluded top-level entry also skips its submenu.
        """
        self.logger.info(f"Processing main page: {response.url}")
        nav_items = response.css('ul.wp-block-navigation__container > li.wp-block-navigation-item')
        for item in nav_items:
            main_title, main_link = self._nav_entry(item)
            if main_link and not main_link.startswith('#'):
                main_link = response.urljoin(main_link)
                if self._is_excluded(main_link):
                    continue
                # Only http(s) targets can be downloaded (skips mailto:, tel:, ...).
                if main_link.startswith(('http://', 'https://')):
                    yield scrapy.Request(
                        main_link,
                        callback=self.parse_content,
                        meta={'page_title': main_title, 'menu_path': main_title},
                    )

            submenus = item.css('ul.wp-block-navigation__submenu-container > li.wp-block-navigation-item')
            for submenu in submenus:
                submenu_title, submenu_link = self._nav_entry(submenu)
                if not submenu_link or submenu_link.startswith('#'):
                    continue
                submenu_link = response.urljoin(submenu_link)
                if self._is_excluded(submenu_link):
                    continue
                if not submenu_link.startswith(('http://', 'https://')):
                    continue
                menu_path = f"{main_title} > {submenu_title}" if main_title else submenu_title
                yield scrapy.Request(
                    submenu_link,
                    callback=self.parse_content,
                    meta={'page_title': submenu_title, 'menu_path': menu_path},
                )

    # ------------------------------------------------------------------ #
    # Leadership page handling
    # ------------------------------------------------------------------ #

    def extract_leadership_info(self, response):
        """Extract leadership information from the special leadership page.

        Returns a list of dicts.  Each table yields at most one dict of
        label/value pairs (plus ``'Posisi'`` when a position heading is
        found).  When no table produces data, matching content sections are
        returned as dicts with ``'description'`` and, if detected,
        ``'Nama'`` / ``'NIDN'``.
        """
        self.logger.info("Extracting leadership information from special page")

        leaders_data = []

        # Select each table exactly once.  Also selecting `tbody` made every
        # table with an explicit body contribute its rows twice.
        tables = response.css('table')

        for table_idx, table in enumerate(tables):
            self.logger.info(f"Processing table {table_idx + 1}")

            rows = table.css('tr')
            if not rows:
                continue

            leader_info = {}
            position_title = ""

            # Look for position title (like "DIREKTUR")
            for title_elem in table.css('strong, .position-title, th'):
                title_text = self._node_text(title_elem)
                if any(pos in title_text.upper() for pos in ['DIREKTUR', 'WAKIL DIREKTUR', 'KETUA', 'SEKRETARIS']):
                    position_title = title_text
                    break

            # Extract key-value pairs from table rows
            for row in rows:
                cells = row.css('td, th')

                if len(cells) >= 3:
                    # Format: Label | : | Value (3 columns)
                    key = self._node_text(cells[0])
                    separator = self._node_text(cells[1])
                    value = self._node_text(cells[2])

                    if key and value and separator == ":":
                        leader_info[key] = value

                elif len(cells) == 2:
                    # Format: Label | Value (2 columns)
                    key = self._node_text(cells[0])
                    value = self._node_text(cells[1])

                    if key and value and key != value:
                        # Drop any colon from the label ("Label:" -> "Label")
                        clean_key = key.replace(':', '').strip()
                        leader_info[clean_key] = value

            # Add position title if found
            if position_title:
                leader_info['Posisi'] = position_title

            # If we found structured data, add it
            if leader_info:
                leaders_data.append(leader_info)
                self.logger.info(f"Extracted leader data: {list(leader_info.keys())}")

        # Fallback: Extract from general content structure
        if not leaders_data:
            self.logger.info("No table data found, trying general content extraction")

            # Look for profile sections
            profile_sections = response.css('.wp-block-group, .entry-content > div, .profile-section')
            seen_sections = set()

            for section in profile_sections:
                section_text = self._node_text(section)

                # A wrapper and its only child carry identical text; keep one.
                if section_text in seen_sections:
                    continue
                seen_sections.add(section_text)

                # Check if this section contains leadership info
                if any(keyword in section_text.lower() for keyword in ['direktur', 'wakil direktur', 'dr.', 's.t.', 'm.kom', 'nidn']):
                    # Try to extract structured info from the text
                    leader_info = {'description': section_text}

                    # Try to extract specific details using regex
                    name_match = re.search(r'(Dr\.|Ir\.|Prof\.)?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*),?\s*(S\.T\.|M\.Kom|M\.T\.|S\.E\.|M\.M\.)*', section_text)
                    if name_match:
                        leader_info['Nama'] = name_match.group(0).strip()

                    nidn_match = re.search(r'NIDN[:\s]*(\d+)', section_text)
                    if nidn_match:
                        leader_info['NIDN'] = nidn_match.group(1)

                    leaders_data.append(leader_info)

        return leaders_data

    def format_leadership_content(self, leaders_data):
        """Format leadership data into readable content.

        Returns one Markdown section (a string) per dict in *leaders_data*;
        entries that are not dicts are ignored.
        """
        formatted_content = []

        for idx, leader in enumerate(leaders_data, 1):
            if not isinstance(leader, dict):
                continue

            if 'description' in leader and len(leader) == 1:
                # Simple description format
                content = f"## Pimpinan {idx}\n\n{leader['description']}"
            else:
                # Structured data format - create narrative
                position = leader.get("Posisi", "")
                nama = leader.get("Nama", "")
                nidn = leader.get("NIDN", "")
                jabatan_akademik = leader.get("Jabatan Akademik", "")
                jurusan = leader.get("Jurusan", "")
                program_studi = leader.get("Program Studi", "")

                # Create narrative starting with position
                if position and nama:
                    content = f"## {position}\n\n"
                    narrative = f"{position} Politeknik Negeri Padang adalah {nama}."
                elif nama:
                    content = f"## Pimpinan {idx}\n\n"
                    narrative = f"Pimpinan ini adalah {nama}."
                else:
                    content = f"## Pimpinan {idx}\n\n"
                    narrative = "Informasi pimpinan:"

                # Add academic position
                if jabatan_akademik:
                    narrative += f" Secara akademik, beliau menjabat sebagai {jabatan_akademik}."

                # Add department information
                if jurusan:
                    narrative += f" Beliau berasal dari Jurusan {jurusan}."

                # Add study program
                if program_studi:
                    narrative += f" Program studi yang diampu adalah {program_studi}."

                # Add NIDN
                if nidn:
                    narrative += f" NIDN beliau adalah {nidn}."

                content += narrative + "\n\n"

                # Add any remaining information that wasn't included in narrative
                used_keys = ['Posisi', 'Nama', 'NIDN', 'Jabatan Akademik', 'Jurusan', 'Program Studi', 'description']
                for key, value in leader.items():
                    if key not in used_keys:
                        content += f"**{key}**: {value}\n\n"

                # Add description if exists
                if 'description' in leader:
                    content += f"**Informasi Tambahan**: {leader['description']}\n\n"

            formatted_content.append(content.strip())

        return formatted_content

    # ------------------------------------------------------------------ #
    # Page processing
    # ------------------------------------------------------------------ #

    def parse_content(self, response):
        """Extract one page, upload it, and yield a summary item.

        After the item, any follow-up requests produced by
        ``process_additional_links`` are yielded as well.
        """
        page_title = response.meta.get('page_title', 'Unknown Page')
        menu_path = response.meta.get('menu_path', '')
        if not page_title or page_title in self._PLACEHOLDER_TITLES:
            # Prefer the page heading, then <title>; never leave the title
            # empty because it becomes part of the uploaded filename.
            heading = self.clean_text(response.css('h1.entry-title::text, h1.page-title::text').get(''))
            if not heading:
                heading = self.clean_text(response.css('title::text').get(''))
            page_title = heading or page_title or 'Unknown Page'

        self.logger.info(f"Extracting content from: {response.url} ({page_title})")

        is_leadership = self._is_leadership_page(response.url, page_title)
        paragraphs = []

        # Special case: PNP leadership page
        if is_leadership:
            self.logger.info("Detected leadership page - using special extraction")

            leaders_data = self.extract_leadership_info(response)
            self.logger.info(f"Found {len(leaders_data)} leadership entries")

            if leaders_data:
                paragraphs = self.format_leadership_content(leaders_data)

                # Also extract any additional content from the page
                additional_content = self.extract_general_content(response)
                if additional_content:
                    paragraphs.extend(["## Informasi Tambahan"] + additional_content)
            else:
                # Fallback to general content extraction
                self.logger.warning("Leadership extraction failed, falling back to general extraction")
                paragraphs = self.extract_general_content(response)
        else:
            # Normal content extraction
            paragraphs = self.extract_general_content(response)

        # Create final content
        content_text = self.create_final_content(page_title, response.url, paragraphs)

        # Add table data if any (but skip for leadership pages to avoid duplication)
        if not is_leadership:
            table_content = self.extract_table_data(response)
            if table_content:
                content_text += "\n\n## Data Tabel\n\n" + table_content

        # Upload to Supabase
        filename = self.upload_content(page_title, content_text)

        yield {
            'url': response.url,
            'title': page_title,
            'menu_path': menu_path,
            'uploaded_as': filename,
            'timestamp': datetime.now().isoformat(),
            'content_length': len(content_text),
            'leadership_page': is_leadership
        }

        # process_additional_links is a generator: its requests are only
        # scheduled when they are yielded from this callback.
        yield from self.process_additional_links(response, menu_path)

    def extract_general_content(self, response):
        """Extract general content from the page.

        Uses the first content container that yields text.  When none does,
        the whole body text is split into chunks of at most about 50 words.
        Only paragraphs of at least 10 words are returned, each passed
        through ``format_paragraph``.
        """
        paragraphs = []
        seen = set()

        content_selectors = [
            'div.entry-content', 'article.post', 'main.site-main',
            'div.content', 'div.main-content', 'div#content', 'div.page-content'
        ]

        for selector in content_selectors:
            content_area = response.css(selector)
            if content_area:
                elems = content_area.css('p, h1, h2, h3, h4, h5, h6, li, div.wp-block-group')
                for elem in elems:
                    text = self._node_text(elem)
                    if not text or len(text.split()) < 5:
                        continue
                    # Nested matches (for example a <p> inside an <li>) carry
                    # the same text; keep only the first occurrence.
                    if text in seen:
                        continue
                    seen.add(text)
                    # Add links if any
                    for link in elem.css('a::attr(href)').getall():
                        if link and not link.startswith(('#', 'javascript:')):
                            text += f" (Link: {response.urljoin(link)})"
                    paragraphs.append(text)
                if paragraphs:
                    break

        # Fallback: extract from body
        if not paragraphs:
            body_texts = response.css('body *::text').getall()
            combined_text = self.clean_text(' '.join(body_texts))
            if combined_text:
                # Split into meaningful chunks
                sentences = re.split(r'(?<=[.!?])\s+', combined_text)
                current_para = ""
                for sentence in sentences:
                    if len((current_para + " " + sentence).split()) <= 50:
                        current_para += " " + sentence
                    else:
                        if current_para.strip():
                            paragraphs.append(current_para.strip())
                        current_para = sentence
                if current_para.strip():
                    paragraphs.append(current_para.strip())

        # Format paragraphs
        return [
            self.format_paragraph(para)
            for para in paragraphs
            if len(para.split()) >= 10
        ]

    def extract_table_data(self, response):
        """Extract and format table data.

        Each table with at least one non-empty row becomes a
        ``### Tabel N`` section whose rows are cell texts joined by `` | ``.
        """
        tables = response.css('table')
        table_output = []

        for table_idx, table in enumerate(tables):
            table_rows = []
            for row in table.css('tr'):
                row_data = []
                for cell in row.css('th, td'):
                    cell_text = self._node_text(cell)
                    link = cell.css('a::attr(href)').get()
                    # Fragment and script pseudo-links carry no useful target.
                    if link and not link.startswith(('#', 'javascript:')):
                        cell_text += f" (Link: {response.urljoin(link)})"
                    if cell_text:
                        row_data.append(cell_text)
                if row_data:
                    table_rows.append(" | ".join(row_data))

            if table_rows:
                table_output.append(f"### Tabel {table_idx + 1}\n\n" + "\n".join(table_rows))

        return "\n\n".join(table_output)

    def create_final_content(self, page_title, url, paragraphs):
        """Create the final formatted content.

        The result is a title line, the current date and the source URL,
        followed by *paragraphs* joined with single newlines.
        """
        header = (
            f"# {page_title}\n\n"
            f"**Tanggal**: {datetime.now().strftime('%d %B %Y')}\n"
            f"**URL**: {url}\n\n"
        )
        return header + "\n".join(paragraphs)

    def upload_content(self, page_title, content_text):
        """Upload content to Supabase with content-based deduplication.

        Returns the filename on upload, ``skipped_<filename>`` when
        ``upload_if_changed`` reports a skip, and ``failed_<filename>`` on
        any error.
        """
        safe_title = re.sub(r'[^\w\s-]', '', page_title).strip().lower()
        safe_title = re.sub(r'[-\s]+', '-', safe_title)
        if not safe_title:
            # A title made only of punctuation would give "_<timestamp>.txt".
            safe_title = 'untitled'
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{safe_title}_{timestamp}.txt"
        try:
            result = upload_if_changed(supabase, SUPABASE_BUCKET, filename, content_text)
            if result.get('result') == 'uploaded':
                self.logger.info(f"Uploaded {filename} successfully.")
                return filename
            elif result.get('result') == 'skipped':
                self.logger.info(f"Skipped upload for {filename} (content unchanged)")
                return f"skipped_{filename}"
            else:
                self.logger.error(f"Upload error for {filename}: {result.get('error')}")
                return f"failed_{filename}"
        except Exception as e:
            # Best effort: one failed upload must not stop the crawl.
            self.logger.error(f"Upload error for {filename}: {str(e)}")
            return f"failed_{filename}"

    def process_additional_links(self, response, menu_path):
        """Process additional links from the same domain.

        Generator.  For pages whose host does not contain ``pnp.ac.id``,
        yields one ``parse_content`` request per distinct header/navigation
        link that stays on the page's own host.  Yields nothing otherwise.
        """
        from urllib.parse import urlparse

        current_domain = urlparse(response.url).netloc
        if 'pnp.ac.id' in current_domain:
            return

        header_links = []
        for sel in ['header a::attr(href)', 'nav a::attr(href)', '.navbar a::attr(href)']:
            header_links.extend(response.css(sel).getall())

        candidates = {
            link for link in header_links
            if link and not link.startswith(('#', 'javascript:', 'mailto:', 'tel:'))
        }
        # Sorted so the request order does not depend on set iteration order.
        for link in sorted(candidates):
            full_link = response.urljoin(link)
            # Compare hosts exactly; a substring test on the whole URL also
            # matched foreign URLs that merely mention this host.
            if urlparse(full_link).netloc != current_domain:
                continue
            if self._is_excluded(full_link):
                continue
            yield scrapy.Request(
                url=full_link,
                callback=self.parse_content,
                meta={'page_title': 'Header Link', 'menu_path': f"{menu_path} > Header"}
            )
432
+
433
+
434
if __name__ == '__main__':
    # Script entry point: run PNPContentSpider in a stand-alone Scrapy process.
    #
    # Only settings that the spider's own `custom_settings` does not define
    # are listed here.  Spider-level settings outrank the ones passed to
    # CrawlerProcess, so the previous USER_AGENT / DOWNLOAD_DELAY entries were
    # never in effect, and the remaining removed keys repeated the values
    # already set on the spider.  LOG_LEVEL stays because it is also passed to
    # the process itself.
    process = CrawlerProcess({
        'LOG_LEVEL': 'INFO',
        'DOWNLOAD_TIMEOUT': 100,
        'FEED_EXPORT_ENCODING': 'utf-8'
    })
    process.crawl(PNPContentSpider)
    process.start()
scrapping/utils/crawl4ai_utils.py DELETED
@@ -1,168 +0,0 @@
1
- import asyncio
2
- from typing import Optional, List, Dict, Set
3
- from urllib.parse import urlparse, urljoin
4
-
5
- from bs4 import BeautifulSoup
6
-
7
- try:
8
- from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
9
- except Exception as e:
10
- AsyncWebCrawler = None # type: ignore
11
- BrowserConfig = None # type: ignore
12
- CrawlerRunConfig = None # type: ignore
13
- CacheMode = None # type: ignore
14
-
15
-
16
- class Crawl4AIUnavailable(Exception):
17
- pass
18
-
19
-
20
- async def fetch_html(url: str, timeout: int = 30, headless: bool = True) -> str:
21
- """Fetch rendered HTML using Crawl4AI. Raises Crawl4AIUnavailable if not installed."""
22
- if AsyncWebCrawler is None:
23
- raise Crawl4AIUnavailable(
24
- "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
25
- )
26
- browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
27
- run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
28
- async with AsyncWebCrawler(config=browser_conf) as crawler:
29
- result = await crawler.arun(url=url, config=run_conf)
30
- # Prefer original HTML when available; fallback to markdown->html isn't provided, so use result.html
31
- html = getattr(result, "html", None)
32
- if not html:
33
- # Some versions expose "content" or only markdown. Fallback to markdown as plain text if needed.
34
- html = getattr(result, "content", None) or getattr(result, "markdown", "")
35
- return html
36
-
37
-
38
- def fetch_html_sync(url: str, timeout: int = 30, headless: bool = True) -> str:
39
- """Synchronous wrapper for fetch_html."""
40
- return asyncio.run(fetch_html(url, timeout=timeout, headless=headless))
41
-
42
-
43
- # ---------------- Parallel in-domain crawling helpers ---------------- #
44
-
45
- IMPORTANT_KEYWORDS = [
46
- # Bahasa Indonesia
47
- "profil", "tentang", "visi", "misi", "struktur", "pimpinan",
48
- "akademik", "kurikulum", "dosen", "staf", "jadwal", "kalender",
49
- "pengumuman", "berita", "pengabdian", "penelitian", "organisasi",
50
- "program-studi", "prodi", "sarjana", "diploma", "magister",
51
- # English fallbacks
52
- "about", "profile", "leadership", "faculty", "staff", "schedule",
53
- "announcement", "news", "curriculum", "study-program"
54
- ]
55
-
56
-
57
- def _same_domain(url: str, base_netloc: str) -> bool:
58
- try:
59
- return urlparse(url).netloc.endswith(base_netloc)
60
- except Exception:
61
- return False
62
-
63
-
64
- def _discover_links(base_url: str, html: str) -> List[str]:
65
- soup = BeautifulSoup(html or "", "html.parser")
66
- links: List[str] = []
67
- for a in soup.find_all("a", href=True):
68
- href = a["href"].strip()
69
- if href.startswith("#") or href.lower().startswith("javascript:"):
70
- continue
71
- abs_url = urljoin(base_url, href)
72
- links.append(abs_url)
73
- return links
74
-
75
-
76
- def _is_important(url: str) -> bool:
77
- lu = url.lower()
78
- return any(k in lu for k in IMPORTANT_KEYWORDS)
79
-
80
-
81
- async def crawl_domain_parallel(
82
- seed_url: str,
83
- max_pages: int = 20,
84
- max_concurrency: int = 5,
85
- only_important: bool = True,
86
- timeout: int = 30,
87
- headless: bool = True,
88
- ) -> Dict[str, str]:
89
- """
90
- Crawl pages in the same domain as seed_url in parallel using a single AsyncWebCrawler session.
91
- Returns {url: html} for fetched pages. If only_important=True, limits to URLs containing important keywords.
92
- """
93
- if AsyncWebCrawler is None:
94
- raise Crawl4AIUnavailable(
95
- "crawl4ai is not installed. Run: pip install crawl4ai playwright && python -m playwright install chromium"
96
- )
97
-
98
- parsed = urlparse(seed_url)
99
- base_netloc = parsed.netloc
100
-
101
- browser_conf = BrowserConfig(headless=headless, java_script_enabled=True)
102
- run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, timeout=timeout)
103
-
104
- results: Dict[str, str] = {}
105
- visited: Set[str] = set()
106
- frontier: List[str] = [seed_url]
107
- sem = asyncio.Semaphore(max_concurrency)
108
-
109
- async with AsyncWebCrawler(config=browser_conf) as crawler:
110
- async def fetch_one(url: str):
111
- async with sem:
112
- try:
113
- res = await crawler.arun(url=url, config=run_conf)
114
- html = getattr(res, "html", None) or getattr(res, "content", None) or getattr(res, "markdown", "")
115
- results[url] = html or ""
116
- return html or ""
117
- except Exception:
118
- results[url] = ""
119
- return ""
120
-
121
- while frontier and len(visited) < max_pages:
122
- batch: List[str] = []
123
- # Build a batch from frontier
124
- while frontier and len(batch) < max_concurrency and len(visited) + len(batch) < max_pages:
125
- u = frontier.pop(0)
126
- if u in visited:
127
- continue
128
- if not _same_domain(u, base_netloc):
129
- continue
130
- if only_important and not _is_important(u) and u != seed_url:
131
- continue
132
- visited.add(u)
133
- batch.append(u)
134
-
135
- if not batch:
136
- break
137
-
138
- pages = await asyncio.gather(*(fetch_one(u) for u in batch))
139
- # Discover more links from fetched pages
140
- for u, html in zip(batch, pages):
141
- if not html:
142
- continue
143
- for link in _discover_links(u, html):
144
- if link not in visited and _same_domain(link, base_netloc):
145
- frontier.append(link)
146
-
147
- return results
148
-
149
-
150
- def crawl_domain_parallel_sync(
151
- seed_url: str,
152
- max_pages: int = 20,
153
- max_concurrency: int = 5,
154
- only_important: bool = True,
155
- timeout: int = 30,
156
- headless: bool = True,
157
- ) -> Dict[str, str]:
158
- """Sync wrapper around crawl_domain_parallel."""
159
- return asyncio.run(
160
- crawl_domain_parallel(
161
- seed_url=seed_url,
162
- max_pages=max_pages,
163
- max_concurrency=max_concurrency,
164
- only_important=only_important,
165
- timeout=timeout,
166
- headless=headless,
167
- )
168
- )