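"""Scraper for ITMO master's program pages on abit.itmo.ru.

For each configured program this module collects the page title, a short
description, links to PDF documents (e.g. curricula), and a content hash
that is later used to detect page changes.
"""
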
import hashlib
import json
import os
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class HTMLScraper:
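    """Scrapes ITMO master's program pages and tracks content changes."""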

    def __init__(self):
        self.session = requests.Session()
        # A browser-like User-Agent reduces the chance of trivial bot blocking.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Program pages to scrape.
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
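        """Scrape every configured program page.

        Returns a mapping of program_id to the scraped data; programs that
        fail to scrape are reported and skipped.
        """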
        programs = {}

        for program_id, url in self.program_urls.items():
            try:
                print(f'Scraping program: {program_id}')
                program_data = self._scrape_program_page(url, program_id)
                if program_data:
                    programs[program_id] = program_data
                    print(f'Successfully processed program: {program_data["title"]}')
            except Exception as e:
                print(f'Error while scraping {program_id}: {e}')

        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
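        """Fetch a single program page and extract its structured data."""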
        response = self.session.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        title = self._extract_title(soup, program_id)
        description = self._extract_description(soup)
        pdf_links = self._extract_pdf_links(soup, url)
        # Hash the raw response body so check_updates() can detect changes later.
        content_hash = self._calculate_content_hash(response.content)

        return {
            'id': program_id,
            'title': title,
            'description': description,
            'url': url,
            'pdf_links': pdf_links,
            'content_hash': content_hash,
            'last_updated': response.headers.get('last-modified', '')
        }

    def _extract_title(self, soup: BeautifulSoup, program_id: str) -> str:
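        """Extract the program title; program_id is used for the fallback title."""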
        # Try progressively less specific CSS selectors.
        title_selectors = [
            'h1',
            '.program-title',
            '.title',
            '[class*="title"]',
            '[class*="header"]'
        ]

        for selector in title_selectors:
            title_elem = soup.select_one(selector)
            if title_elem and title_elem.get_text().strip():
                title = title_elem.get_text().strip()
                if len(title) > 5:
                    return title

        # Fall back to any heading that mentions the program. The Russian
        # keywords are kept as-is because they must match the page content.
        for elem in soup.find_all(['h1', 'h2', 'h3']):
            text = elem.get_text().strip()
            if any(keyword in text.lower() for keyword in ['искусственный интеллект', 'ai', 'продукт']):
                return text

        return f'Program {program_id.upper()}'

    def _extract_description(self, soup: BeautifulSoup) -> str:
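        """Extract a short program description, truncated to 500 characters."""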
        desc_selectors = [
            '.program-description',
            '.description',
            '.about',
            '[class*="description"]',
            '[class*="about"]',
            'p'
        ]

        for selector in desc_selectors:
            desc_elem = soup.select_one(selector)
            if desc_elem:
                desc = desc_elem.get_text().strip()
                if len(desc) > 50:
                    return desc[:500] + '...' if len(desc) > 500 else desc

        # Fall back to any paragraph that mentions the program. The Russian
        # keywords are kept as-is because they must match the page content.
        for elem in soup.find_all('p'):
            text = elem.get_text().strip()
            if any(keyword in text.lower() for keyword in ['магистратура', 'программа', 'обучение', 'подготовка']):
                if len(text) > 30:
                    return text[:500] + '...' if len(text) > 500 else text

        return "Description of an ITMO master's degree program"

    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
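        """Collect PDF links from the page, with a keyword-based fallback."""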
|
|
pdf_links = [] |
|
|
|
|
|
|
|
|
for link in soup.find_all('a', href=True): |
|
|
href = link['href'] |
|
|
text = link.get_text().strip().lower() |
|
|
|
|
|
|
|
|
if href.endswith('.pdf') or 'pdf' in href: |
|
|
|
|
|
doc_type = self._determine_document_type(text) |
|
|
|
|
|
|
|
|
if href.startswith('http'): |
|
|
full_url = href |
|
|
else: |
|
|
full_url = self._make_absolute_url(href, base_url) |
|
|
|
|
|
|
|
|
filename = self._generate_filename(href, doc_type) |
|
|
|
|
|
pdf_links.append({ |
|
|
'url': full_url, |
|
|
'filename': filename, |
|
|
'type': doc_type, |
|
|
'text': text |
|
|
}) |
|
|
|
|
|
|
|
|
if not pdf_links: |
|
|
pdf_links = self._search_pdf_by_keywords(soup, base_url) |
|
|
|
|
|
return pdf_links |
|
|
|
|
|
    def _determine_document_type(self, text: str) -> str:
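        """Classify a link by its anchor text (Russian and English keywords)."""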
        text_lower = text.lower()

        if any(word in text_lower for word in ['учебный план', 'curriculum', 'plan']):
            return 'curriculum'
        elif any(word in text_lower for word in ['программа', 'program']):
            return 'program'
        elif any(word in text_lower for word in ['описание', 'description']):
            return 'description'
        else:
            return 'document'

    def _make_absolute_url(self, href: str, base_url: str) -> str:
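        """Resolve a possibly relative href against the page URL."""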
|
|
if href.startswith('//'): |
|
|
return 'https:' + href |
|
|
elif href.startswith('/'): |
|
|
|
|
|
from urllib.parse import urlparse |
|
|
parsed = urlparse(base_url) |
|
|
return f"{parsed.scheme}://{parsed.netloc}{href}" |
|
|
else: |
|
|
return base_url.rstrip('/') + '/' + href.lstrip('/') |
|
|
|
|
|
    def _generate_filename(self, href: str, doc_type: str) -> str:
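        """Build a local filename of the form '<doc_type>_<basename>.pdf'."""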
        filename = href.split('/')[-1]
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        return f"{doc_type}_{filename}"

    def _search_pdf_by_keywords(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
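        """Fallback: find PDF links inside elements mentioning curriculum keywords."""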
        pdf_links = []
        seen_urls = set()

        # Russian keywords are kept as-is because they must match the page content.
        keywords = [
            'учебный план',
            'curriculum',
            'программа обучения',
            'образовательная программа'
        ]

        page_text = soup.get_text().lower()

        for keyword in keywords:
            if keyword not in page_text:
                continue
            # Inspect elements whose text mentions the keyword.
            for elem in soup.find_all(['a', 'p', 'div']):
                if keyword not in elem.get_text().lower():
                    continue
                for link in elem.find_all('a', href=True):
                    href = link['href']
                    if 'pdf' not in href:
                        continue
                    full_url = self._make_absolute_url(href, base_url)
                    # Skip duplicates when several keywords match the same element.
                    if full_url in seen_urls:
                        continue
                    seen_urls.add(full_url)
                    pdf_links.append({
                        'url': full_url,
                        'filename': f"curriculum_{href.split('/')[-1]}",
                        'type': 'curriculum',
                        'text': link.get_text().strip()
                    })

        return pdf_links

    def _calculate_content_hash(self, content: bytes) -> str:
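        """Return the SHA-256 hex digest of the raw page content."""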
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict):
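        """Write the scraped programs to data/processed/programs.json."""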
        os.makedirs('data/processed', exist_ok=True)

        with open('data/processed/programs.json', 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)

        print(f'Saved {len(programs)} programs')

    def check_updates(self, programs: Dict) -> Dict:
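        """Re-fetch each program page and report content-hash changes."""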
        updates = {}

        for program_id, program in programs.items():
            try:
                response = self.session.get(program['url'], timeout=30)
                response.raise_for_status()
                current_hash = self._calculate_content_hash(response.content)

                if current_hash != program.get('content_hash'):
                    updates[program_id] = {
                        'old_hash': program.get('content_hash'),
                        'new_hash': current_hash,
                        'last_modified': response.headers.get('last-modified', '')
                    }
                    print(f'Detected changes in program: {program_id}')
            except Exception as e:
                print(f'Error checking updates for {program_id}: {e}')

        return updates


def main():
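    """Scrape all configured programs, save them, and print a summary."""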
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)

    print(f'Programs processed: {len(programs)}')
    for program_id, program in programs.items():
        print(f'{program_id}: {program["title"]} - {len(program["pdf_links"])} PDF')


if __name__ == '__main__':
    main()