|
|
import hashlib
import os
import re
import tempfile
from typing import Dict, List, Optional
from urllib.parse import urlparse

import pdfplumber
import requests
|
|
|
|
|
class PDFParser:
    """Download curriculum PDFs and extract course records from them.

    Courses are looked for in the PDF's tables first; when fewer than
    ``MIN_TABLE_COURSES`` are found there, the page text is scanned line by
    line as a fallback.

    Every course is a plain dict with the keys ``id``, ``program_id``,
    ``semester``, ``name``, ``credits``, ``hours``, ``type``
    (``'required'`` or ``'elective'``), ``source_pdf`` and ``source_page``.
    """

    # Directory downloaded PDFs are written to (relative to the CWD).
    DOWNLOAD_DIR = 'data/raw'

    # Below this many table-derived courses the text fallback is also run.
    MIN_TABLE_COURSES = 5

    # Table cells that are column captions rather than course names.
    _HEADER_WORDS = frozenset({'название', 'дисциплина', 'курс', 'предмет'})

    # Substrings (lower-case) that mark a course as elective.
    _ELECTIVE_MARKERS = ('по выбору', 'электив', 'факультатив')

    # "<n> семестр..." for n in 1..4; the look-behind keeps "11 семестр"
    # from being read as semester 1. Applied to lower-cased text only.
    _SEMESTER_RE = re.compile(r'(?<!\d)([1-4]) семестр')

    # Course name: capital Cyrillic letter followed by Cyrillic letters,
    # whitespace, hyphens and parentheses. "Ё"/"ё" lie outside the А-я range
    # and therefore have to be listed explicitly.
    _NAME_GROUP = r'([А-ЯЁ][А-ЯЁа-яё\s\-\(\)]+?)'

    # (pattern, field) pairs tried in order. ``None`` means the pattern
    # captures "<credits> <hours>"; otherwise the single captured number
    # belongs to the named field, as stated by the unit in the pattern.
    _LINE_PATTERNS = (
        (re.compile(_NAME_GROUP + r'\s+(\d+)\s+(\d+)'), None),
        (re.compile(_NAME_GROUP + r'\s+(\d+)\s*кр'), 'credits'),
        (re.compile(_NAME_GROUP + r'\s+(\d+)\s*ч'), 'hours'),
    )

    def __init__(self):
        """Create the HTTP session used for downloads, with a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        """Download ``url`` and store it as ``DOWNLOAD_DIR/filename``.

        Args:
            url: Address of the PDF.
            filename: File name to save under, inside ``DOWNLOAD_DIR``.

        Returns:
            The path of the saved file, or ``None`` if anything failed (the
            error is printed, not raised).
        """
        try:
            print(f'Скачивание PDF: {filename}')
            # Used as a context manager so the streamed connection is
            # released even when writing the file fails half-way.
            with self.session.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()

                os.makedirs(self.DOWNLOAD_DIR, exist_ok=True)
                filepath = os.path.join(self.DOWNLOAD_DIR, filename)
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)

            print(f'PDF сохранен: {filepath}')
            return filepath

        except Exception as e:
            # Best-effort by design: callers check for None.
            print(f'Ошибка скачивания PDF {url}: {e}')
            return None

    def parse_pdf(self, filepath: str, program_id: str) -> List[Dict]:
        """Parse a PDF file and return the de-duplicated courses found in it.

        Args:
            filepath: Path of the PDF to open with pdfplumber.
            program_id: Identifier stored on every course and used in its id.

        Returns:
            A list of course dicts, each stamped with the PDF's base name in
            ``source_pdf``; an empty list if parsing failed (the error is
            printed, not raised).
        """
        courses = []

        try:
            print(f'Парсинг PDF: {filepath}')

            with pdfplumber.open(filepath) as pdf:
                table_courses = self._extract_from_tables(pdf, program_id)
                if table_courses:
                    courses.extend(table_courses)
                    print(f'Извлечено из таблиц: {len(table_courses)} курсов')

                # Tables gave too little: fall back to scanning the raw text.
                if len(courses) < self.MIN_TABLE_COURSES:
                    text_courses = self._extract_from_text(pdf, program_id)
                    courses.extend(text_courses)
                    print(f'Извлечено из текста: {len(text_courses)} курсов')

            courses = self._deduplicate_courses(courses)

            # The row/line helpers never see the file path, so the source
            # file is recorded here, once, for every course.
            source_pdf = os.path.basename(filepath)
            for course in courses:
                course['source_pdf'] = source_pdf

            print(f'Всего извлечено курсов: {len(courses)}')
            return courses

        except Exception as e:
            print(f'Ошибка парсинга PDF {filepath}: {e}')
            return []

    def _extract_from_tables(self, pdf, program_id: str) -> List[Dict]:
        """Collect courses from every table on every page of ``pdf``.

        The first row of each table is treated as its header: it is used for
        semester detection and is not parsed as a course. The detected
        semester carries over to following tables and pages. A failure on one
        page is printed and the remaining pages are still processed.
        """
        courses = []
        current_semester = 1

        for page_num, page in enumerate(pdf.pages, start=1):
            try:
                for table in page.extract_tables():
                    if not table or len(table) < 2:
                        continue

                    current_semester = self._detect_semester_from_table(
                        table, current_semester
                    )

                    for row in table[1:]:
                        course = self._parse_table_row(
                            row, program_id, current_semester, page_num
                        )
                        if course:
                            courses.append(course)

            except Exception as e:
                print(f'Ошибка обработки страницы {page_num}: {e}')
                continue

        return courses

    def _extract_from_text(self, pdf, program_id: str) -> List[Dict]:
        """Collect courses from the plain text of every page of ``pdf``.

        The semester detected on a page applies to that whole page and
        carries over to following pages. A failure on one page is printed and
        the remaining pages are still processed.
        """
        courses = []
        current_semester = 1

        for page_num, page in enumerate(pdf.pages, start=1):
            try:
                text = page.extract_text()
                if not text:
                    continue

                current_semester = self._detect_semester_from_text(
                    text, current_semester
                )
                courses.extend(
                    self._parse_text_for_courses(
                        text, program_id, current_semester, page_num
                    )
                )

            except Exception as e:
                print(f'Ошибка обработки текста страницы {page_num}: {e}')
                continue

        return courses

    def _find_semester(self, lowered_text: str, current_semester: int) -> int:
        """Return the lowest semester number (1-4) mentioned in ``lowered_text``.

        Falls back to ``current_semester`` when no "<n> семестр" is present.
        """
        found = self._SEMESTER_RE.findall(lowered_text)
        return min(map(int, found)) if found else current_semester

    def _detect_semester_from_table(self, table: List[List], current_semester: int) -> int:
        """Detect the semester from a table's header row.

        Returns ``current_semester`` when the table or its first row is empty
        or the header mentions no semester.
        """
        if not table or not table[0]:
            return current_semester

        header_text = ' '.join(str(cell) for cell in table[0] if cell).lower()
        return self._find_semester(header_text, current_semester)

    def _detect_semester_from_text(self, text: str, current_semester: int) -> int:
        """Detect the semester from page text; ``current_semester`` if none is mentioned."""
        return self._find_semester(text.lower(), current_semester)

    def _build_course(self, program_id: str, semester: int, name: str,
                      credits: int, hours: int, course_type: str, page: int) -> Dict:
        """Assemble a course dict; shared by the table and text extractors.

        The id ends in a digest of the course name so that different courses
        of one program and semester get different ids, while re-parsing the
        same PDF yields the same ids. ``source_pdf`` is left empty here and
        filled in by ``parse_pdf``.
        """
        digest = hashlib.sha256(name.encode('utf-8')).hexdigest()[:10]
        return {
            'id': f'{program_id}_{semester}_{digest}',
            'program_id': program_id,
            'semester': semester,
            'name': name,
            'credits': credits,
            'hours': hours,
            'type': course_type,
            'source_pdf': '',
            'source_page': page,
        }

    def _is_elective(self, text: str) -> bool:
        """Tell whether ``text`` contains one of the elective markers (case-insensitive)."""
        lowered = text.lower()
        return any(marker in lowered for marker in self._ELECTIVE_MARKERS)

    def _parse_table_row(self, row: List, program_id: str, semester: int, page: int) -> Optional[Dict]:
        """Turn one table row into a course dict.

        The course name is the first cell longer than 10 characters that is
        neither a column caption nor a number. Numeric cells in 1..12 are
        taken as credits and in 18..216 as hours; when several cells qualify
        the last one wins.

        Returns:
            The course dict, or ``None`` when the row has fewer than two
            cells or contains no usable name.
        """
        if not row or len(row) < 2:
            return None

        cells = [str(cell).strip() if cell else '' for cell in row]

        course_name = next(
            (
                cell for cell in cells
                if cell
                and cell.lower() not in self._HEADER_WORDS
                and len(cell) > 10
                and not cell.isdigit()
            ),
            '',
        )
        if not course_name:
            return None

        credits = 0
        hours = 0
        for cell in cells:
            # isdecimal(), not isdigit(): the latter is also true for
            # characters such as '²' that int() cannot convert.
            if not cell.isdecimal():
                continue
            num = int(cell)
            if 1 <= num <= 12:
                credits = num
            elif 18 <= num <= 216:
                hours = num

        course_type = 'elective' if self._is_elective(' '.join(cells)) else 'required'

        return self._build_course(
            program_id, semester, course_name, credits, hours, course_type, page
        )

    def _parse_text_for_courses(self, text: str, program_id: str, semester: int, page: int) -> List[Dict]:
        """Scan ``text`` line by line and return the courses recognised in it.

        Lines shorter than 10 characters (after stripping) are ignored.
        """
        courses = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if len(line) < 10:
                continue

            course = self._extract_course_from_line(line, program_id, semester, page)
            if course:
                courses.append(course)

        return courses

    def _extract_course_from_line(self, line: str, program_id: str, semester: int, page: int) -> Optional[Dict]:
        """Recognise a single course in a line of text.

        Three layouts are tried in order: ``<name> <credits> <hours>``,
        ``<name> <n> кр`` (credits) and ``<name> <n> ч`` (hours). A match
        whose name is shorter than 5 characters is discarded and the next
        layout is tried.

        Returns:
            The course dict, or ``None`` when no layout matches.
        """
        for pattern, field in self._LINE_PATTERNS:
            match = pattern.search(line)
            if not match:
                continue

            course_name = match.group(1).strip()
            if len(course_name) < 5:
                continue

            numbers = [int(group) for group in match.groups()[1:]]

            credits = 0
            hours = 0
            if field is None:
                credits, hours = numbers
            elif field == 'credits':
                credits = numbers[0]
            else:
                hours = numbers[0]

            course_type = 'elective' if self._is_elective(line) else 'required'

            return self._build_course(
                program_id, semester, course_name, credits, hours, course_type, page
            )

        return None

    def _deduplicate_courses(self, courses: List[Dict]) -> List[Dict]:
        """Drop repeated courses, keeping the first occurrence and the input order.

        Two courses are duplicates when name, semester and program id all match.
        """
        seen = set()
        unique_courses = []

        for course in courses:
            key = (course['name'], course['semester'], course['program_id'])
            if key not in seen:
                seen.add(key)
                unique_courses.append(course)

        return unique_courses
|
|
|
|
|
def main():
    """Smoke-run the parser: fetch a sample PDF and print the first few courses found."""
    pdf_parser = PDFParser()

    sample_url = "https://example.com/test.pdf"
    sample_name = "test_curriculum.pdf"

    saved_path = pdf_parser.download_pdf(sample_url, sample_name)
    if not saved_path:
        # Download failed; download_pdf has already reported the error.
        return

    parsed = pdf_parser.parse_pdf(saved_path, 'test_program')
    print(f'Извлечено курсов: {len(parsed)}')
    for item in parsed[:5]:
        print(f"- {item['name']} ({item['semester']} семестр, {item['credits']} кредитов)")
|
|
|
|
|
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|