|
|
import hashlib
import re
from typing import Dict, List, Optional
|
|
|
|
|
class DataNormalizer:
    """Normalize, deduplicate, tag and enrich course records.

    Courses are plain dictionaries. The keys read by this class are
    ``name``, ``description``, ``program_id``, ``semester``, ``credits``,
    ``hours``, ``type`` and, after normalization/enrichment, ``short_desc``,
    ``tags`` and ``difficulty``. Unknown keys are carried through untouched.
    """

    # Valid semester range; anything outside falls back to the first semester.
    _MIN_SEMESTER = 1
    _MAX_SEMESTER = 4

    # Limits used when deriving a short description.
    _SHORT_DESC_LIMIT = 220
    _LONG_NAME_THRESHOLD = 50

    # Quote characters removed from names: straight, curly and angle quotes.
    _QUOTE_TABLE = str.maketrans('', '', '"\u201c\u201d«»')
    _OPENING_TO_CLOSING = {'(': ')', '[': ']'}
    _CLOSING_TO_OPENING = {')': '(', ']': '['}
    # An empty "()" / "[]" pair sitting at the very start or end of a name.
    _EDGE_EMPTY_BRACKETS_RE = re.compile(
        r'^(?:\(\s*\)|\[\s*\])|(?:\(\s*\)|\[\s*\])$'
    )

    _PROGRAM_TITLES = {
        'ai': 'Искусственный интеллект',
        'ai_product': 'AI Product Management',
    }
    # Tag that every course of a given program receives.
    _PROGRAM_DEFAULT_TAGS = {'ai': 'ml', 'ai_product': 'product'}

    _REQUIRED_MARKERS = ('обязательная', 'required', 'обяз', 'базовая')
    _ELECTIVE_MARKERS = ('по выбору', 'elective', 'выбор', 'электив', 'факультатив')

    _ADVANCED_MARKERS = ('продвинутый', 'advanced', 'углубленный')
    _BEGINNER_MARKERS = ('базовый', 'basic', 'введение', 'вводный')

    # difficulty -> (programming, math) baseline; anything else maps to (1, 1).
    _BASE_EXPERIENCE = {'advanced': (4, 3), 'intermediate': (2, 2)}

    # Checked in order; the first rule sharing a tag with the course wins.
    _CATEGORY_RULES = (
        ('ai_core', ('ml', 'dl', 'nlp', 'cv')),
        ('product_management', ('product', 'business', 'pm')),
        ('mathematics', ('math', 'stats')),
        ('systems_data', ('systems', 'data')),
    )

    def __init__(self):
        """Set up the tag -> keyword table used by ``_generate_tags``."""
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель', 'классификация', 'регрессия'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer', 'нейросеть'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык', 'токенизация'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео', 'детекция', 'сегментация'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ', 'линейная алгебра', 'статистика'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability', 'теория вероятностей'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая', 'аналитика'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика', 'маркетинг'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект', 'agile', 'scrum'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура', 'разработка'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных', 'sql', 'nosql'],
            'research': ['исследование', 'research', 'наука', 'научный', 'диссертация', 'магистерская'],
            'python': ['python', 'питон', 'программирование'],
            'java': ['java', 'джава', 'программирование'],
            'sql': ['sql', 'база данных', 'database'],
            'git': ['git', 'версионирование', 'контроль версий'],
            'docker': ['docker', 'контейнеризация', 'контейнер'],
            'aws': ['aws', 'amazon', 'облако', 'cloud'],
            'tensorflow': ['tensorflow', 'tf', 'фреймворк'],
            'pytorch': ['pytorch', 'torch', 'фреймворк'],
            'scikit-learn': ['scikit-learn', 'sklearn', 'библиотека']
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Normalize every course and drop unusable or duplicate records.

        Args:
            courses: Raw course dictionaries; ``None`` is treated as empty.

        Returns:
            New normalized dictionaries in input order. Records without a
            usable name are skipped, and only the first record for each
            (name, program_id, semester) combination is kept.
        """
        unique: List[Dict] = []
        seen_hashes = set()
        for course in courses or []:
            record = self._normalize_course(course)
            if not record:
                continue
            digest = self._calculate_course_hash(record)
            if digest in seen_hashes:
                continue
            seen_hashes.add(digest)
            unique.append(record)
        return unique

    def _normalize_course(self, course: Dict) -> Optional[Dict]:
        """Return a normalized copy of *course*, or ``None`` if it has no name.

        The input dictionary is not modified. A name that is missing, falsy,
        or empty after cleaning (for example just quotes) yields ``None``.
        """
        if not course or not course.get('name'):
            return None
        name = self._normalize_name(course['name'])
        if not name:
            return None

        semester = self._normalize_semester(course.get('semester', 1))

        # Derived text fields are computed from the cleaned values so that the
        # description quotes the stored semester and the tags can see the
        # generated short description. This also makes a second normalization
        # pass over already-normalized records produce the same result.
        draft = {**course, 'name': name, 'semester': semester}
        short_desc = self._generate_short_desc(draft)
        draft['short_desc'] = short_desc
        tags = self._generate_tags(draft)

        normalized = course.copy()
        normalized['name'] = name
        normalized['short_desc'] = short_desc
        normalized['tags'] = tags
        normalized['semester'] = semester
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))
        return normalized

    def _normalize_name(self, name: str) -> str:
        """Clean a course name.

        Collapses whitespace runs, removes straight/curly/angle quote
        characters, and trims separators (spaces, hyphens) plus dangling or
        empty brackets from both ends. Balanced brackets such as a trailing
        ``"(английский)"`` are kept intact.

        Returns:
            The cleaned name, or ``''`` for a falsy input.
        """
        if not name:
            return ''
        cleaned = re.sub(r'\s+', ' ', str(name).strip())
        cleaned = cleaned.translate(self._QUOTE_TABLE)
        return self._trim_edges(cleaned)

    @classmethod
    def _trim_edges(cls, name: str) -> str:
        """Strip separators and unbalanced/empty brackets from the ends of *name*."""
        while True:
            # Whitespace was already collapsed to single spaces by the caller.
            trimmed = cls._EDGE_EMPTY_BRACKETS_RE.sub('', name.strip(' -'))
            head, tail = trimmed[:1], trimmed[-1:]
            if head in cls._CLOSING_TO_OPENING or (
                head in cls._OPENING_TO_CLOSING
                and cls._OPENING_TO_CLOSING[head] not in trimmed
            ):
                # Leading closer, or an opener that is never closed.
                trimmed = trimmed[1:]
            elif tail in cls._OPENING_TO_CLOSING or (
                tail in cls._CLOSING_TO_OPENING
                and cls._CLOSING_TO_OPENING[tail] not in trimmed
            ):
                # Trailing opener, or a closer that was never opened.
                trimmed = trimmed[:-1]
            if trimmed == name:
                return trimmed
            name = trimmed

    def _generate_short_desc(self, course: Dict) -> str:
        """Build a short description for *course*.

        Preference order: the stripped ``description`` (cut to 220 characters
        plus ``'...'`` when longer), then a name longer than 50 characters
        (cut to 220), then a generic sentence mentioning the program and
        semester. A whitespace-only description counts as missing.
        """
        desc = str(course.get('description') or '').strip()
        if desc:
            if len(desc) > self._SHORT_DESC_LIMIT:
                return desc[:self._SHORT_DESC_LIMIT] + '...'
            return desc

        name = str(course.get('name') or '')
        if len(name) > self._LONG_NAME_THRESHOLD:
            return name[:self._SHORT_DESC_LIMIT]

        semester = course.get('semester', 1)
        title = self._PROGRAM_TITLES.get(course.get('program_id', ''))
        if title:
            return f'Курс программы "{title}" ({semester} семестр)'
        return f'Курс из учебного плана программы ({semester} семестр)'

    def _generate_tags(self, course: Dict) -> List[str]:
        """Derive tags from the course ``name`` and ``short_desc``.

        A tag is assigned when any of its keywords occurs in the lower-cased
        text. Courses of the ``ai`` / ``ai_product`` programs always receive
        the ``ml`` / ``product`` tag respectively.

        Returns:
            Unique tags in a stable order: ``tag_keywords`` order first, then
            the program default tag if it was not already matched.
        """
        # NOTE(review): matching is plain substring containment, so short
        # keywords also hit inside longer words ('pm' in "development",
        # 'tf' in "platform"). Kept as-is; tightening it would change tags.
        text = f"{course.get('name') or ''} {course.get('short_desc') or ''}".lower()
        tags = [
            tag
            for tag, keywords in self.tag_keywords.items()
            if any(keyword in text for keyword in keywords)
        ]
        default_tag = self._PROGRAM_DEFAULT_TAGS.get(course.get('program_id', ''))
        if default_tag and default_tag not in tags:
            tags.append(default_tag)
        return tags

    @staticmethod
    def _to_int(value) -> Optional[int]:
        """Convert *value* with ``int()``; return ``None`` when that fails.

        ``OverflowError`` is included because ``int(float('inf'))`` raises it.
        """
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            return None

    def _normalize_semester(self, semester) -> int:
        """Return the semester as an int in 1..4, defaulting to 1."""
        number = self._to_int(semester)
        if number is not None and self._MIN_SEMESTER <= number <= self._MAX_SEMESTER:
            return number
        return self._MIN_SEMESTER

    def _normalize_credits(self, credits) -> int:
        """Return credits as a non-negative int, defaulting to 0."""
        number = self._to_int(credits)
        return number if number is not None and number >= 0 else 0

    def _normalize_hours(self, hours) -> int:
        """Return hours as a non-negative int, defaulting to 0."""
        number = self._to_int(hours)
        return number if number is not None and number >= 0 else 0

    def _normalize_type(self, course_type: str) -> str:
        """Map a free-form course type to ``'required'`` or ``'elective'``.

        Required markers are checked first; anything unrecognized or empty is
        treated as ``'required'``.
        """
        if not course_type:
            return 'required'
        lowered = str(course_type).lower()
        if any(marker in lowered for marker in self._REQUIRED_MARKERS):
            return 'required'
        if any(marker in lowered for marker in self._ELECTIVE_MARKERS):
            return 'elective'
        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        """Return the MD5 hex digest of name + program_id + semester.

        Used only as a deduplication key, not for anything security related.
        """
        identity = (
            f"{course.get('name', '')}"
            f"{course.get('program_id', '')}"
            f"{course.get('semester', '')}"
        )
        return hashlib.md5(identity.encode()).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        """Concatenate several course lists and normalize the result.

        Deduplication therefore works across the lists as well as within them.
        """
        combined = [course for courses in courses_list for course in courses]
        return self.normalize_courses(combined)

    def validate_course(self, course: Dict) -> bool:
        """Check that *course* has the mandatory fields and a usable name.

        ``name``, ``program_id`` and ``semester`` must be present and truthy,
        and the name must have at least 3 characters once surrounding
        whitespace is ignored.
        """
        if any(not course.get(field) for field in ('name', 'program_id', 'semester')):
            return False
        return len(str(course['name']).strip()) >= 3

    def get_statistics(self, courses: List[Dict]) -> Dict:
        """Count courses overall and per program, semester, type and tag.

        Missing fields are counted under ``'unknown'`` (program), ``1``
        (semester) and ``'required'`` (type).
        """
        stats = {
            'total_courses': len(courses),
            'by_program': {},
            'by_semester': {},
            'by_type': {},
            'by_tags': {}
        }

        def bump(bucket: str, key) -> None:
            counts = stats[bucket]
            counts[key] = counts.get(key, 0) + 1

        for course in courses:
            bump('by_program', course.get('program_id', 'unknown'))
            bump('by_semester', course.get('semester', 1))
            bump('by_type', course.get('type', 'required'))
            for tag in course.get('tags') or []:
                bump('by_tags', tag)
        return stats

    def enrich_courses(self, courses: List[Dict]) -> List[Dict]:
        """Add ``difficulty``, ``recommended_experience`` and ``category``.

        The dictionaries are updated in place and the same list is returned.
        """
        for course in courses:
            # Order matters: the experience estimate reads 'difficulty'.
            course['difficulty'] = self._calculate_difficulty(course)
            course['recommended_experience'] = self._calculate_recommended_experience(course)
            course['category'] = self._determine_category(course)
        return courses

    def _calculate_difficulty(self, course: Dict) -> str:
        """Estimate difficulty as ``'beginner'``, ``'intermediate'`` or ``'advanced'``.

        Marker words in the name decide first. Otherwise a course is a
        beginner course only when it has at most 3 credits and is taught in
        semester 1-2; everything else is intermediate. ``credits`` and
        ``semester`` are expected to be numeric (as produced by
        normalization); missing or ``None`` values count as 0 and 1.
        """
        name = str(course.get('name') or '').lower()
        if any(marker in name for marker in self._ADVANCED_MARKERS):
            return 'advanced'
        if any(marker in name for marker in self._BEGINNER_MARKERS):
            return 'beginner'

        credits = course.get('credits') or 0
        semester = course.get('semester') or 1
        if credits <= 3 and semester <= 2:
            return 'beginner'
        return 'intermediate'

    def _calculate_recommended_experience(self, course: Dict) -> Dict:
        """Return recommended experience levels for the course.

        The result has the keys ``programming``, ``math`` and ``ml``. The
        baseline comes from ``difficulty`` (defaulting to intermediate when
        the key is absent) and is raised by subject tags.
        """
        difficulty = course.get('difficulty', 'intermediate')
        tags = set(course.get('tags') or [])

        programming, math_level = self._BASE_EXPERIENCE.get(difficulty, (1, 1))
        experience = {'programming': programming, 'math': math_level, 'ml': 0}

        if tags & {'ml', 'dl'}:
            experience['ml'] = max(experience['ml'], 1)
        if tags & {'math', 'stats'}:
            experience['math'] = max(experience['math'], 2)
        if tags & {'python', 'java'}:
            experience['programming'] = max(experience['programming'], 2)
        return experience

    def _determine_category(self, course: Dict) -> str:
        """Pick a category from the course tags (and name for theses).

        Rules are checked in priority order: AI core, product management,
        mathematics, systems/data, research; otherwise ``'general'``.
        """
        tags = set(course.get('tags') or [])
        for category, markers in self._CATEGORY_RULES:
            if tags.intersection(markers):
                return category

        name = str(course.get('name') or '').lower()
        if 'research' in tags or 'диссертация' in name:
            return 'research'
        return 'general'
|
|
|
|
|
def main():
    """Run a small demo: normalize, enrich and summarize two sample courses."""
    samples = [
        dict(
            id='test_1',
            program_id='ai',
            name='Машинное обучение',
            semester=1,
            credits=6,
            type='required',
        ),
        dict(
            id='test_2',
            program_id='ai_product',
            name='Глубокое обучение',
            semester=2,
            credits=4,
            type='elective',
        ),
    ]

    pipeline = DataNormalizer()
    cleaned = pipeline.normalize_courses(samples)
    detailed = pipeline.enrich_courses(cleaned)
    summary = pipeline.get_statistics(detailed)

    print(f'Нормализовано курсов: {len(cleaned)}')
    print(f'Статистика: {summary}')

    # One line per course: name, tags and the computed difficulty.
    for item in detailed:
        print(f"- {item['name']}: {item['tags']} (сложность: {item['difficulty']})")
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|
|