|
|
import hashlib
import json
import os
import re
from collections import Counter
from typing import Dict, List
|
|
|
|
|
class KnowledgeBase:
    """Catalogue of study programs and their courses, backed by two JSON files.

    On construction the catalogue is read from ``courses.json`` and
    ``programs.json`` inside ``data_dir``. When a file is missing, a built-in
    demo data set is generated and written to disk. When the files exist but
    cannot be read or parsed, the demo data set is used in memory only, so
    whatever is on disk is never overwritten.

    Attributes:
        data_dir: Directory that holds the two JSON files.
        courses: List of course dicts. Keys read by this class: ``id``,
            ``program_id``, ``semester``, ``name``, ``short_desc``, ``type``
            and ``tags``.
        programs: Program records keyed by program id.
    """

    DEFAULT_DATA_DIR = 'data/processed'
    COURSES_FILE = 'courses.json'
    PROGRAMS_FILE = 'programs.json'

    # Keywords looked up as plain substrings of the lower-cased message.
    _SUBSTRING_KEYWORDS = (
        'итмо', 'магистратура', 'учебный план', 'дисциплина', 'курс',
        'ai product', 'институт ии', 'программа',
        'машинное обучение', 'глубокое обучение', 'nlp', 'компьютерное зрение',
        'продукт', 'аналитика', 'управление', 'обучение', 'учеба',
    )
    # Two-letter keywords are matched as whole words only: as substrings they
    # hit unrelated text ("России", "email"). The boundary is "no letter or
    # digit next to the keyword", so "ai_product" and "ИИ-технологии" match.
    _WHOLE_WORD_KEYWORDS = ('ии', 'ai')
    _WHOLE_WORD_RE = re.compile(
        r'(?<![^\W_])(?:' + '|'.join(map(re.escape, _WHOLE_WORD_KEYWORDS)) + r')(?![^\W_])'
    )

    def __init__(self, data_dir: str = DEFAULT_DATA_DIR):
        """Create the knowledge base and load its data.

        Args:
            data_dir: Directory containing ``courses.json`` and
                ``programs.json``. Defaults to ``data/processed``.
        """
        self.data_dir = data_dir
        self.courses = []
        self.programs = {}
        self._load_data()

    def _read_json(self, filename: str):
        """Return the parsed content of ``filename`` inside ``data_dir``."""
        with open(os.path.join(self.data_dir, filename), 'r', encoding='utf-8') as f:
            return json.load(f)

    def _load_data(self):
        """Load courses and programs from disk, falling back to demo data.

        Missing files trigger generation *and* persisting of the demo data.
        Unreadable or malformed files trigger the in-memory demo data only,
        leaving the files on disk untouched.
        """
        try:
            courses = self._read_json(self.COURSES_FILE)
            programs = self._read_json(self.PROGRAMS_FILE)
            # Every query method iterates ``courses`` as a list of dicts.
            if not isinstance(courses, list) or not isinstance(programs, (dict, list)):
                raise ValueError('unexpected JSON structure')
        except FileNotFoundError:
            print('Файлы данных не найдены, создаем тестовые данные...')
            self._create_test_data()
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            print(f'Ошибка загрузки данных: {e}, создаем тестовые данные...')
            # Do not persist: that would overwrite the existing data files.
            self._create_test_data(persist=False)
        else:
            self.courses = courses
            self.programs = programs
            print(f'Загружено {len(self.courses)} курсов и {len(self.programs)} программ')

    def _create_test_data(self, persist: bool = True):
        """Populate the knowledge base with the built-in demo data set.

        Args:
            persist: When true (the default), also write the demo data to
                ``data_dir``. A failure to write is reported but not raised,
                because the in-memory data is already usable.
        """
        self.programs = {
            'ai': {
                'id': 'ai',
                'title': 'Искусственный интеллект',
                'description': 'Программа подготовки специалистов в области ИИ, машинного обучения и анализа данных',
                'pdf_links': [],
            },
            'ai_product': {
                'id': 'ai_product',
                'title': 'AI Product Management',
                'description': 'Программа подготовки продуктовых менеджеров в области ИИ',
                'pdf_links': [],
            },
        }

        self.courses = [
            {
                'id': 'ai_1_1', 'program_id': 'ai', 'semester': 1,
                'name': 'Машинное обучение',
                'credits': 6, 'hours': 108, 'type': 'required',
                'tags': ['ml', 'math', 'stats'],
                'short_desc': 'Основы машинного обучения, алгоритмы классификации и регрессии',
            },
            {
                'id': 'ai_1_2', 'program_id': 'ai', 'semester': 1,
                'name': 'Глубокое обучение',
                'credits': 4, 'hours': 72, 'type': 'required',
                'tags': ['dl', 'ml', 'neural'],
                'short_desc': 'Нейронные сети, CNN, RNN, трансформеры',
            },
            {
                'id': 'ai_2_1', 'program_id': 'ai', 'semester': 2,
                'name': 'Обработка естественного языка',
                'credits': 5, 'hours': 90, 'type': 'required',
                'tags': ['nlp', 'dl', 'text'],
                'short_desc': 'Методы обработки текста, токенизация, эмбеддинги',
            },
            {
                'id': 'ai_2_2', 'program_id': 'ai', 'semester': 2,
                'name': 'Компьютерное зрение',
                'credits': 5, 'hours': 90, 'type': 'required',
                'tags': ['cv', 'dl', 'image'],
                'short_desc': 'Обработка изображений, детекция объектов, сегментация',
            },
            {
                'id': 'ai_3_1', 'program_id': 'ai', 'semester': 3,
                'name': 'Продвинутые методы машинного обучения',
                'credits': 6, 'hours': 108, 'type': 'required',
                'tags': ['ml', 'advanced', 'math'],
                'short_desc': 'Ансамблевые методы, байесовские подходы, оптимизация',
            },
            {
                'id': 'ai_4_1', 'program_id': 'ai', 'semester': 4,
                'name': 'Магистерская диссертация',
                'credits': 12, 'hours': 216, 'type': 'required',
                'tags': ['research', 'thesis'],
                'short_desc': 'Научно-исследовательская работа по выбранной теме',
            },
            {
                'id': 'ai_product_1_1', 'program_id': 'ai_product', 'semester': 1,
                'name': 'Продуктовая аналитика',
                'credits': 6, 'hours': 108, 'type': 'required',
                'tags': ['product', 'business', 'data'],
                'short_desc': 'Анализ продуктовых метрик, A/B тестирование',
            },
            {
                'id': 'ai_product_1_2', 'program_id': 'ai_product', 'semester': 1,
                'name': 'Управление проектами',
                'credits': 4, 'hours': 72, 'type': 'required',
                'tags': ['pm', 'business', 'management'],
                'short_desc': 'Методологии управления проектами, Agile, Scrum',
            },
            {
                'id': 'ai_product_2_1', 'program_id': 'ai_product', 'semester': 2,
                'name': 'AI в продуктах',
                'credits': 5, 'hours': 90, 'type': 'required',
                'tags': ['ai', 'product', 'ml'],
                'short_desc': 'Интеграция ИИ в продуктовые решения',
            },
            {
                'id': 'ai_product_3_1', 'program_id': 'ai_product', 'semester': 3,
                'name': 'Стратегия продукта',
                'credits': 6, 'hours': 108, 'type': 'required',
                'tags': ['strategy', 'business', 'product'],
                'short_desc': 'Стратегическое планирование и развитие продуктов',
            },
            {
                'id': 'ai_product_4_1', 'program_id': 'ai_product', 'semester': 4,
                'name': 'Магистерская диссертация',
                'credits': 12, 'hours': 216, 'type': 'required',
                'tags': ['research', 'thesis'],
                'short_desc': 'Научно-исследовательская работа по продуктовой тематике',
            },
        ]

        if persist:
            try:
                self._save_data()
            except OSError as e:
                # Best effort: the demo data is already available in memory.
                print(f'Не удалось сохранить тестовые данные: {e}')

    def _save_data(self):
        """Write ``courses`` and ``programs`` to their JSON files in ``data_dir``.

        Raises:
            OSError: If the directory or files cannot be created or written.
        """
        os.makedirs(self.data_dir, exist_ok=True)
        payloads = (
            (self.COURSES_FILE, self.courses),
            (self.PROGRAMS_FILE, self.programs),
        )
        for filename, payload in payloads:
            with open(os.path.join(self.data_dir, filename), 'w', encoding='utf-8') as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)

    def is_itmo_query(self, message: str) -> bool:
        """Tell whether ``message`` mentions any study-related keyword.

        Matching is case-insensitive. Most keywords are matched as substrings
        (so "курсы" matches "курс"); the two-letter keywords "ии" and "ai" are
        matched only as whole words to avoid hits inside unrelated words.

        Args:
            message: User message; ``None`` or an empty string yields ``False``.
        """
        if not message:
            return False
        message_lower = message.lower()
        if any(keyword in message_lower for keyword in self._SUBSTRING_KEYWORDS):
            return True
        return self._WHOLE_WORD_RE.search(message_lower) is not None

    def get_courses_by_semester(self, semester) -> List[Dict]:
        """Return the courses taught in the given semester.

        Args:
            semester: Semester number, or anything ``int()`` accepts. A value
                that cannot be converted falls back to semester 1.
        """
        try:
            semester = int(semester)
        except (ValueError, TypeError):
            semester = 1
        return [course for course in self.courses if course.get('semester') == semester]

    def get_course_by_id(self, course_id: str) -> Dict:
        """Return the first course whose ``id`` equals ``course_id``, or ``{}``."""
        return next(
            (course for course in self.courses if course.get('id') == course_id),
            {},
        )

    def get_courses_by_program(self, program_id: str) -> List[Dict]:
        """Return all courses whose ``program_id`` equals ``program_id``."""
        return [course for course in self.courses if course.get('program_id') == program_id]

    def get_courses_by_tag(self, tag: str) -> List[Dict]:
        """Return all courses whose ``tags`` contain ``tag``."""
        return [course for course in self.courses if tag in (course.get('tags') or [])]

    def search_courses(self, query: str) -> List[Dict]:
        """Return courses whose name or short description contains any query word.

        The query is lower-cased and split on whitespace; each word is then
        looked up as a substring of the lower-cased "name + short_desc" text.
        Courses lacking either field are searched on what they do have. An
        empty query matches nothing.
        """
        words = query.lower().split()
        if not words:
            return []

        results = []
        for course in self.courses:
            name = course.get('name') or ''
            short_desc = course.get('short_desc') or ''
            course_text = f"{name} {short_desc}".lower()
            if any(word in course_text for word in words):
                results.append(course)
        return results

    def recommend(self, profile: Dict, limit: int = 7) -> List[Dict]:
        """Return the best-scoring courses of the profile's semester.

        Args:
            profile: User profile. ``semester`` selects the candidate courses;
                the remaining keys feed ``_calculate_recommendation_score``.
            limit: Maximum number of courses returned (default 7).

        Returns:
            Courses ordered by descending score; courses with equal scores
            keep their catalogue order. Empty when the profile has no
            semester or the semester has no courses.
        """
        semester = profile.get('semester')
        if not semester:
            return []

        scored_courses = [
            (course, self._calculate_recommendation_score(course, profile))
            for course in self.get_courses_by_semester(semester)
        ]
        # sorted() is stable, also with reverse=True, so ties keep input order.
        ranked = sorted(scored_courses, key=lambda pair: pair[1], reverse=True)
        return [course for course, _score in ranked[:limit]]

    @staticmethod
    def _profile_level(profile: Dict, key: str, default: float = 2) -> float:
        """Read a numeric skill level from ``profile``.

        Returns ``default`` when the key is missing or its value is ``None``
        or not convertible to a number.
        """
        try:
            return float(profile.get(key, default))
        except (TypeError, ValueError):
            return default

    def _calculate_recommendation_score(self, course: Dict, profile: Dict) -> float:
        """Score how well ``course`` fits ``profile``.

        The score is the sum of:
          * 0.1 base value for every course;
          * up to 0.6, proportional to the share of the profile's
            ``interests`` that appear among the course tags;
          * 0.3 when ``programming_experience`` >= 3 and the course is tagged
            ``ml``, ``dl`` or ``systems``;
          * 0.3 when ``math_level`` >= 3 and the course is tagged ``math``,
            ``stats`` or ``dl``.

        Missing or invalid levels default to 2; missing interests or tags are
        treated as empty.
        """
        score = 0.1

        interests = profile.get('interests') or []
        course_tags = course.get('tags') or []

        matching_tags = [tag for tag in interests if tag in course_tags]
        if matching_tags:
            # interests is non-empty here, so the division is safe.
            score += 0.6 * (len(matching_tags) / len(interests))

        programming_exp = self._profile_level(profile, 'programming_experience')
        if programming_exp >= 3 and any(tag in course_tags for tag in ('ml', 'dl', 'systems')):
            score += 0.3

        math_level = self._profile_level(profile, 'math_level')
        if math_level >= 3 and any(tag in course_tags for tag in ('math', 'stats', 'dl')):
            score += 0.3

        return score

    def get_statistics(self) -> Dict:
        """Count courses overall and per program, semester, type and tag.

        Courses lacking a field are counted under its default: program
        ``'unknown'``, semester ``1``, type ``'required'``, no tags.

        Returns:
            Dict with ``total_courses`` (int) and the plain-dict counters
            ``by_program``, ``by_semester``, ``by_type`` and ``by_tags``.
        """
        by_program = Counter()
        by_semester = Counter()
        by_type = Counter()
        by_tags = Counter()

        for course in self.courses:
            by_program[course.get('program_id', 'unknown')] += 1
            by_semester[course.get('semester', 1)] += 1
            by_type[course.get('type', 'required')] += 1
            by_tags.update(course.get('tags') or [])

        return {
            'total_courses': len(self.courses),
            'by_program': dict(by_program),
            'by_semester': dict(by_semester),
            'by_type': dict(by_type),
            'by_tags': dict(by_tags),
        }
|
|
|