# File size: 7,562 Bytes
# 946f233
import hashlib
import re
from typing import Dict, List, Optional
class DataNormalizer:
    """Normalise, tag and de-duplicate course records given as plain dicts.

    Each record is expected to carry at least a ``name``; the optional keys
    read by this class are ``description``, ``program_id``, ``semester``,
    ``credits``, ``hours``, ``type`` and ``tags``.
    """

    # Semesters outside MIN_SEMESTER..MAX_SEMESTER fall back to MIN_SEMESTER.
    MIN_SEMESTER = 1
    MAX_SEMESTER = 4
    # Maximum number of characters kept from a description or a long name.
    SHORT_DESC_LIMIT = 220
    # Names longer than this are reused as the short description.
    LONG_NAME_THRESHOLD = 50
    # Short description used when neither a description nor a long name exists.
    DEFAULT_SHORT_DESC = 'Курс из учебного плана программы'
    # Placed between hashed fields so that ('ab', 'c') and ('a', 'bc') do not
    # produce the same de-duplication key.
    _HASH_FIELD_SEPARATOR = '\x1f'

    def __init__(self):
        """Set up the tag -> keyword table used by ``_generate_tags``."""
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных'],
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Return normalised copies of ``courses`` with duplicates removed.

        Records without a usable name are skipped. Two records are duplicates
        when their normalised name, ``program_id`` and semester coincide; the
        first occurrence wins and input order is preserved.
        """
        unique_courses = []
        seen_hashes = set()
        for course in courses:
            normalized = self._normalize_course(course)
            if normalized is None:
                continue
            course_hash = self._calculate_course_hash(normalized)
            if course_hash in seen_hashes:
                continue
            seen_hashes.add(course_hash)
            unique_courses.append(normalized)
        return unique_courses

    def _normalize_course(self, course: Dict) -> Optional[Dict]:
        """Return a normalised shallow copy of ``course``.

        Returns ``None`` when the record has no name, or when the name is
        empty after normalisation (e.g. only whitespace or quotes).
        """
        name = self._normalize_name(course.get('name'))
        if not name:
            return None
        normalized = course.copy()
        normalized['name'] = name
        # Derived fields are built from the normalised record so that they
        # see the cleaned-up name rather than the raw input value.
        normalized['short_desc'] = self._generate_short_desc(normalized)
        normalized['tags'] = self._generate_tags(normalized)
        normalized['semester'] = self._normalize_semester(course.get('semester', 1))
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))
        return normalized

    def _normalize_name(self, name: str) -> str:
        """Drop double quotes, collapse whitespace runs and trim the name.

        Falsy input yields ``''``; any other value is converted with ``str``.
        """
        if not name:
            return ''
        # Quotes go first: removing them can expose padding such as the
        # spaces in '" Foo "', which the whitespace pass then cleans up.
        unquoted = str(name).replace('"', '')
        return re.sub(r'\s+', ' ', unquoted).strip()

    def _generate_short_desc(self, course: dict) -> str:
        """Build a short description for ``course``.

        Order of preference: the trimmed ``description`` (cut to
        ``SHORT_DESC_LIMIT`` characters plus ``'...'`` when longer), then a
        name longer than ``LONG_NAME_THRESHOLD`` characters (cut to
        ``SHORT_DESC_LIMIT``), then ``DEFAULT_SHORT_DESC``.
        """
        raw_desc = course.get('description')
        desc = str(raw_desc).strip() if raw_desc else ''
        if desc:
            if len(desc) > self.SHORT_DESC_LIMIT:
                return desc[:self.SHORT_DESC_LIMIT] + '...'
            return desc
        raw_name = course.get('name')
        name = str(raw_name) if raw_name else ''
        if len(name) > self.LONG_NAME_THRESHOLD:
            return name[:self.SHORT_DESC_LIMIT]
        return self.DEFAULT_SHORT_DESC

    def _generate_tags(self, course: Dict) -> List[str]:
        """Return every tag whose keyword list matches the course text.

        The text is the lower-cased ``name`` and ``short_desc`` joined by a
        space. Tags come back in ``tag_keywords`` order.
        """
        text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
        # NOTE: matching is by plain substring, so short keywords such as
        # 'ml' or 'pm' also match inside longer words ('html', 'development').
        return [
            tag
            for tag, keywords in self.tag_keywords.items()
            if any(keyword in text for keyword in keywords)
        ]

    @staticmethod
    def _coerce_int(value) -> Optional[int]:
        """Return ``int(value)``, or ``None`` when the conversion fails."""
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers int(float('inf')).
            return None

    def _non_negative_int(self, value) -> int:
        """Return ``value`` as an int, or 0 when it is invalid or negative."""
        number = self._coerce_int(value)
        if number is None or number < 0:
            return 0
        return number

    def _normalize_semester(self, semester) -> int:
        """Return the semester as an int within MIN_SEMESTER..MAX_SEMESTER.

        Unparseable or out-of-range values fall back to ``MIN_SEMESTER``.
        """
        number = self._coerce_int(semester)
        if number is not None and self.MIN_SEMESTER <= number <= self.MAX_SEMESTER:
            return number
        return self.MIN_SEMESTER

    def _normalize_credits(self, credits) -> int:
        """Return the credits as a non-negative int (0 when invalid)."""
        return self._non_negative_int(credits)

    def _normalize_hours(self, hours) -> int:
        """Return the hours as a non-negative int (0 when invalid)."""
        return self._non_negative_int(hours)

    def _normalize_type(self, course_type: str) -> str:
        """Map a free-form course type to ``'required'`` or ``'elective'``.

        Markers for "required" are checked first; anything that matches no
        marker, and any falsy input, is treated as ``'required'``.
        """
        if not course_type:
            return 'required'
        type_lower = str(course_type).lower()
        if any(word in type_lower for word in ('обязательная', 'required', 'обяз')):
            return 'required'
        if any(word in type_lower for word in ('по выбору', 'elective', 'выбор')):
            return 'elective'
        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        """Return the MD5 hex digest used as the de-duplication key.

        The key is built from ``name``, ``program_id`` and ``semester``
        (missing keys count as ``''``), joined with a separator so that
        adjacent fields cannot blend into each other.
        """
        parts = (
            course.get('name', ''),
            course.get('program_id', ''),
            course.get('semester', ''),
        )
        text = self._HASH_FIELD_SEPARATOR.join(str(part) for part in parts)
        return hashlib.md5(text.encode()).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        """Flatten several course lists and normalise the combined list."""
        all_courses = [course for courses in courses_list for course in courses]
        return self.normalize_courses(all_courses)

    def validate_course(self, course: Dict) -> bool:
        """Return True when ``course`` has the mandatory fields.

        ``name``, ``program_id`` and ``semester`` must all be truthy, and the
        name must be at least three characters long.
        """
        if any(not course.get(field) for field in ('name', 'program_id', 'semester')):
            return False
        # str() keeps non-string names (e.g. numbers) from raising in len().
        return len(str(course['name'])) >= 3

    def get_statistics(self, courses: List[Dict]) -> Dict:
        """Count courses in total and per program, semester, type and tag.

        Missing keys default to ``'unknown'`` (program), ``1`` (semester),
        ``'required'`` (type) and no tags.
        """
        stats = {
            'total_courses': len(courses),
            'by_program': {},
            'by_semester': {},
            'by_type': {},
            'by_tags': {},
        }

        def bump(bucket: str, key) -> None:
            counts = stats[bucket]
            counts[key] = counts.get(key, 0) + 1

        for course in courses:
            bump('by_program', course.get('program_id', 'unknown'))
            bump('by_semester', course.get('semester', 1))
            bump('by_type', course.get('type', 'required'))
            # `or []` tolerates records whose tags are explicitly None.
            for tag in course.get('tags') or []:
                bump('by_tags', tag)
        return stats
def main():
    """Demo entry point: normalise two sample courses and print a summary."""
    sample_courses = [
        dict(
            id='test_1',
            program_id='ai',
            name='Машинное обучение',
            semester=1,
            credits=6,
            type='required',
        ),
        dict(
            id='test_2',
            program_id='ai_product',
            name='Глубокое обучение',
            semester=2,
            credits=4,
            type='elective',
        ),
    ]

    engine = DataNormalizer()
    cleaned = engine.normalize_courses(sample_courses)
    summary = engine.get_statistics(cleaned)

    # Report how many courses survived normalisation, then the aggregates.
    print(f'Нормализовано курсов: {len(cleaned)}')
    print(f'Статистика: {summary}')
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|