Upload 18 files
Browse files- .dockerignore +31 -0
- Dockerfile +22 -21
- README.md +206 -14
- app.py +136 -0
- data/processed/courses.json +80 -0
- data/processed/programs.json +30 -0
- data_layer.py +135 -0
- docker-compose.yml +12 -0
- llm.py +155 -0
- parser.py +313 -0
- prompts/system.txt +12 -0
- requirements.txt +8 -3
- scraper/html_scraper.py +144 -0
- scraper/normalize.py +206 -0
- scraper/pdf_parser.py +246 -0
- templates/index.html +399 -0
- tests/test_filter.py +33 -0
- tests/test_recommend.py +56 -0
.dockerignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
env
|
| 7 |
+
pip-log.txt
|
| 8 |
+
pip-delete-this-directory.txt
|
| 9 |
+
.tox
|
| 10 |
+
.coverage
|
| 11 |
+
.coverage.*
|
| 12 |
+
.cache
|
| 13 |
+
nosetests.xml
|
| 14 |
+
coverage.xml
|
| 15 |
+
*.cover
|
| 16 |
+
*.log
|
| 17 |
+
.git
|
| 18 |
+
.mypy_cache
|
| 19 |
+
.pytest_cache
|
| 20 |
+
.hypothesis
|
| 21 |
+
.DS_Store
|
| 22 |
+
.env
|
| 23 |
+
.venv
|
| 24 |
+
venv/
|
| 25 |
+
ENV/
|
| 26 |
+
env/
|
| 27 |
+
.idea/
|
| 28 |
+
.vscode/
|
| 29 |
+
*.swp
|
| 30 |
+
*.swo
|
| 31 |
+
*~
|
Dockerfile
CHANGED
|
@@ -1,21 +1,22 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
COPY
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Устанавливаем рабочую директорию
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Копируем requirements.txt
|
| 7 |
+
COPY requirements.txt .
|
| 8 |
+
|
| 9 |
+
# Устанавливаем зависимости
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
# Копируем код приложения
|
| 13 |
+
COPY . .
|
| 14 |
+
|
| 15 |
+
# Создаем директории для данных
|
| 16 |
+
RUN mkdir -p data/processed
|
| 17 |
+
|
| 18 |
+
# Открываем порт
|
| 19 |
+
EXPOSE 5000
|
| 20 |
+
|
| 21 |
+
# Запускаем приложение
|
| 22 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,20 +1,212 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
- streamlit
|
| 10 |
pinned: false
|
| 11 |
-
short_description: Streamlit template space
|
| 12 |
-
license: apache-2.0
|
| 13 |
---
|
| 14 |
|
| 15 |
-
#
|
| 16 |
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ITMO Магистратура - Чат-бот
|
| 3 |
+
emoji: 🤖
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
app_port: 5000
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
+
app_file: app.py
|
|
|
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# 🤖 ITMO Магистратура - Чат-бот (Docker)
|
| 13 |
|
| 14 |
+
Минимально работающий прототип чат-бота для абитуриентов магистратур ITMO с парсингом данных, диалоговой системой и персонализированными рекомендациями.
|
| 15 |
|
| 16 |
+
## 🎯 Ключевые функции
|
| 17 |
+
|
| 18 |
+
### 1. 📊 Парсинг данных с сайтов ITMO
|
| 19 |
+
- **Автоматический сбор** учебных планов с официальных страниц
|
| 20 |
+
- **Парсинг PDF файлов** с детальной информацией о курсах
|
| 21 |
+
- **Fallback курсы** при недоступности парсинга
|
| 22 |
+
- **Нормализация данных** в единый JSON формат
|
| 23 |
+
|
| 24 |
+
### 2. 💬 Диалоговая система (одна LLM, без RAG)
|
| 25 |
+
- **LLM-powered чат** на основе RuT5-base-multitask
|
| 26 |
+
- **Строгая релевантность** - отвечает только на вопросы об ITMO
|
| 27 |
+
- **Контекстный поиск** по курсам и программам
|
| 28 |
+
- **Fallback режим** при недоступности LLM
|
| 29 |
+
|
| 30 |
+
### 3. 🎯 Персонализированные рекомендации
|
| 31 |
+
- **Учет профиля студента** (опыт программирования, математика, интересы)
|
| 32 |
+
- **Алгоритм подбора курсов** на основе сложности и предпочтений
|
| 33 |
+
- **Рекомендации по семестрам** с объяснением выбора
|
| 34 |
+
- **LLM-генерация** персонализированных советов
|
| 35 |
+
|
| 36 |
+
## 🚀 Быстрый старт
|
| 37 |
+
|
| 38 |
+
### Локальный запуск с Docker
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
# Клонируем репозиторий
|
| 42 |
+
git clone <your-repo-url>
|
| 43 |
+
cd <your-repo-name>
|
| 44 |
+
|
| 45 |
+
# Запускаем с Docker Compose
|
| 46 |
+
docker-compose up --build
|
| 47 |
+
|
| 48 |
+
# Или с Docker напрямую
|
| 49 |
+
docker build -t itmo-chatbot .
|
| 50 |
+
docker run -p 5000:5000 itmo-chatbot
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### Ручной запуск
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
# Устанавливаем зависимости
|
| 57 |
+
pip install -r requirements.txt
|
| 58 |
+
|
| 59 |
+
# Запускаем приложение
|
| 60 |
+
python app.py
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
Приложение будет доступно по адресу: http://localhost:5000
|
| 64 |
+
|
| 65 |
+
## 📁 Структура проекта
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
├── app.py # Flask приложение
|
| 69 |
+
├── parser.py # Парсинг данных с сайтов ITMO
|
| 70 |
+
├── data_layer.py # Работа с данными и рекомендации
|
| 71 |
+
├── llm.py # LLM система
|
| 72 |
+
├── templates/ # HTML шаблоны
|
| 73 |
+
│ └── index.html # Главная страница
|
| 74 |
+
├── Dockerfile # Docker конфигурация
|
| 75 |
+
├── docker-compose.yml # Docker Compose
|
| 76 |
+
├── requirements.txt # Зависимости
|
| 77 |
+
└── README.md # Документация
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
## 🎯 Что работает
|
| 81 |
+
|
| 82 |
+
✅ **Парсинг данных** - автоматический сбор с сайтов ITMO
|
| 83 |
+
✅ **Диалоговая система** - LLM-powered чат с контекстным поиском
|
| 84 |
+
✅ **Рекомендации** - персонализированные по профилю студента
|
| 85 |
+
✅ **12 fallback курсов** - полные учебные планы ИИ и AI Product
|
| 86 |
+
✅ **Строгая релевантность** - отвечает только на вопросы об ITMO
|
| 87 |
+
✅ **Fallback режим** - работает без LLM
|
| 88 |
+
✅ **Docker поддержка** - легкий деплой и развертывание
|
| 89 |
+
✅ **REST API** - JSON API для интеграции
|
| 90 |
+
|
| 91 |
+
## 🔧 Технологии
|
| 92 |
+
|
| 93 |
+
- **Flask** - веб-фреймворк
|
| 94 |
+
- **Transformers** - LLM модель (RuT5-base-multitask)
|
| 95 |
+
- **BeautifulSoup** - парсинг HTML страниц
|
| 96 |
+
- **Requests** - HTTP запросы к сайтам ITMO
|
| 97 |
+
- **Docker** - контейнеризация
|
| 98 |
+
|
| 99 |
+
## 📊 Данные
|
| 100 |
+
|
| 101 |
+
### Программы
|
| 102 |
+
- **Искусственный интеллект** - ML, DL, NLP, CV, 4 семестра
|
| 103 |
+
- **AI Product Management** - продуктовая аналитика, управление, 4 семестра
|
| 104 |
+
|
| 105 |
+
### Курсы (12 fallback курсов)
|
| 106 |
+
- **Семестры 1-4** с полной информацией
|
| 107 |
+
- **Теги** для поиска и рекомендаций (ml, dl, nlp, cv, product, business, etc.)
|
| 108 |
+
- **Сложность** - beginner/intermediate/advanced
|
| 109 |
+
- **Кредиты и часы** обучения
|
| 110 |
+
- **Типы курсов** - required/elective
|
| 111 |
+
|
| 112 |
+
## 🎯 Примеры вопросов
|
| 113 |
+
|
| 114 |
+
### Вопросы о курсах:
|
| 115 |
+
- "Какие курсы по машинному обучению?"
|
| 116 |
+
- "Сколько кредитов за глубокое обучение?"
|
| 117 |
+
- "Какие дисциплины в 1 семестре программы ИИ?"
|
| 118 |
+
|
| 119 |
+
### Вопросы о программах:
|
| 120 |
+
- "Расскажи о программе AI Product"
|
| 121 |
+
- "Какая карьера после программы ИИ?"
|
| 122 |
+
- "Нужна ли математика для AI Product?"
|
| 123 |
+
|
| 124 |
+
### Вопросы о рекомендациях:
|
| 125 |
+
- "Какие курсы подходят для моего профиля?"
|
| 126 |
+
- "Что выбрать с опытом программирования 3/5?"
|
| 127 |
+
- "Рекомендации для 2 семестра"
|
| 128 |
+
|
| 129 |
+
## 🔄 Обновление данных
|
| 130 |
+
|
| 131 |
+
Кнопка "🔄 Обновить данные" выполняет:
|
| 132 |
+
- **Парсинг страниц** программ с сайта ITMO
|
| 133 |
+
- **Поиск PDF файлов** с учебными планами
|
| 134 |
+
- **Обновление курсов** и метаданных
|
| 135 |
+
- **Сохранение в JSON** для последующего использования
|
| 136 |
+
|
| 137 |
+
## 🎯 Система рекомендаций
|
| 138 |
+
|
| 139 |
+
### Входные параметры:
|
| 140 |
+
- **Опыт программирования** (0-5)
|
| 141 |
+
- **Уровень математики** (0-4)
|
| 142 |
+
- **Интересы** (ml, dl, nlp, cv, product, business, etc.)
|
| 143 |
+
- **Целевой семестр** (1-4)
|
| 144 |
+
|
| 145 |
+
### Алгоритм подбора:
|
| 146 |
+
1. **Фильтрация по семестру**
|
| 147 |
+
2. **Оценка сложности** курса vs опыт студента
|
| 148 |
+
3. **Совпадение интересов** с тегами курса
|
| 149 |
+
4. **Математические требования** vs уровень студента
|
| 150 |
+
5. **LLM-генерация** персонализированного объяснения
|
| 151 |
+
|
| 152 |
+
## 🔍 Релевантность вопросов
|
| 153 |
+
|
| 154 |
+
Бот отвечает только на вопросы, содержащие ключевые слова:
|
| 155 |
+
- `итмо`, `магистратура`, `учебный план`, `дисциплина`, `курс`
|
| 156 |
+
- `ии`, `ai`, `ai product`, `программа`
|
| 157 |
+
- `машинное обучение`, `глубокое обучение`, `nlp`, `компьютерное зрение`
|
| 158 |
+
- `продукт`, `аналитика`, `управление`, `обучение`, `учеба`
|
| 159 |
+
|
| 160 |
+
## 📝 Системные инструкции для LLM
|
| 161 |
+
|
| 162 |
+
- Отвечай только по предоставленному контексту
|
| 163 |
+
- Если в контексте нет нужной информации — ответь: "в предоставленных данных об этом не сказано"
|
| 164 |
+
- Отвечай кратко и по делу
|
| 165 |
+
- Не выдумывай факты и не давай общих ответов без ссылок на элементы контекста
|
| 166 |
+
|
| 167 |
+
## 🔧 Fallback-данные
|
| 168 |
+
|
| 169 |
+
12 встроенных курсов (по 6 на каждую программу) с заполненными полями:
|
| 170 |
+
- id, program_id, semester, name, credits, type, short_desc, tags
|
| 171 |
+
- Используются при пустом/недоступном парсинге
|
| 172 |
+
|
| 173 |
+
## 🐳 Docker деплой
|
| 174 |
+
|
| 175 |
+
### На любом сервере с Docker:
|
| 176 |
+
|
| 177 |
+
```bash
|
| 178 |
+
# Клонируем репозиторий
|
| 179 |
+
git clone <your-repo-url>
|
| 180 |
+
cd <your-repo-name>
|
| 181 |
+
|
| 182 |
+
# Запускаем
|
| 183 |
+
docker-compose up -d
|
| 184 |
+
|
| 185 |
+
# Проверяем логи
|
| 186 |
+
docker-compose logs -f
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
### На Hugging Face Spaces:
|
| 190 |
+
|
| 191 |
+
1. Создайте новый Space
|
| 192 |
+
2. Выберите **Docker SDK**
|
| 193 |
+
3. Загрузите все файлы в репозиторий
|
| 194 |
+
4. Space автоматически соберет и запустит Docker контейнер
|
| 195 |
+
|
| 196 |
+
## 📝 Лицензия
|
| 197 |
+
|
| 198 |
+
MIT License - свободное использование и модификация.
|
| 199 |
+
|
| 200 |
+
## 🤝 Поддержка
|
| 201 |
+
|
| 202 |
+
При возникновении проблем:
|
| 203 |
+
1. Проверьте логи Docker: `docker-compose logs`
|
| 204 |
+
2. Убедитесь в наличии всех файлов
|
| 205 |
+
3. Проверьте доступность сайта ITMO
|
| 206 |
+
4. Используйте fallback режим при проблемах с LLM
|
| 207 |
+
|
| 208 |
+
---
|
| 209 |
+
|
| 210 |
+
**Создано для абитуриентов магистратур ITMO** 🎓
|
| 211 |
+
|
| 212 |
+
*Минимально работающий прототип: парсинг + диалог + рекомендации + Docker*
|
app.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request, jsonify
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Импорты модулей
|
| 5 |
+
from parser import parse_all
|
| 6 |
+
from data_layer import load_courses, filter_courses, recommend_courses, is_relevant
|
| 7 |
+
from llm import answer, generate_recommendations
|
| 8 |
+
|
| 9 |
+
app = Flask(__name__)
|
| 10 |
+
|
| 11 |
+
# Load the course list once at import time. This module-level value backs the
# course counts shown by the index page and /api/status, and is replaced by
# /api/update after a successful re-parse.
|
| 12 |
+
courses = load_courses()
|
| 13 |
+
print(f'Загружено курсов: {len(courses)}')
|
| 14 |
+
|
| 15 |
+
@app.route('/')
def index():
    """Render the landing page, passing it the number of loaded courses."""
    total = len(courses)
    return render_template('index.html', courses_count=total)
|
| 19 |
+
|
| 20 |
+
@app.route('/api/chat', methods=['POST'])
def chat():
    """Answer a chat message about ITMO master's programmes.

    Expects a JSON object of the form ``{"message": "<text>"}``.

    Returns:
        ``{"response": ...}`` on success. ``{"error": ...}`` with status 400
        when the body is missing, is not a JSON object, or carries an empty
        or non-string message; with status 500 on an unexpected failure.
    """
    import re

    try:
        # silent=True yields None instead of raising on a missing or
        # malformed JSON body, so bad input is reported as 400 rather than
        # being caught below and turned into a 500.
        data = request.get_json(silent=True)
        message = data.get('message', '') if isinstance(data, dict) else ''
        message = message.strip() if isinstance(message, str) else ''

        if not message:
            return jsonify({'error': 'Пустое сообщение'}), 400

        # Off-topic questions get a hint with example questions instead of
        # an LLM answer.
        if not is_relevant(message):
            response = (
                'Похоже, вопрос не относится к магистратурам ITMO и их учебным планам.\n'
                '\n'
                'Попробуйте спросить, например:\n'
                '• "Какие дисциплины по NLP в 1 семестре программы ИИ?"\n'
                '• "Расскажи о программе AI Product"\n'
                '• "Какие курсы по машинному обучению есть в программе ИИ?"\n'
                '• "Сколько кредитов за дисциплину \'Глубокое обучение\'?"\n'
                '• "Какие курсы подходят для моего профиля?"'
            )
            return jsonify({'response': response})

        # Guess the programme from the message. The product programme is
        # checked first because "ai product" also contains "ai".
        program_id = None
        message_lower = message.lower()
        product_markers = ('ai product', 'продукт', 'менеджмент', 'аналитика')
        ai_phrases = ('машинное обучение', 'глубокое обучение')
        if any(marker in message_lower for marker in product_markers):
            program_id = 'ai_product'
        elif (
            # Short abbreviations must be whole words: as plain substrings
            # "ии" matches "информации" and "ai" matches "details".
            re.search(r'(?<!\w)(?:ии|ai|nlp|cv)(?!\w)', message_lower)
            or any(phrase in message_lower for phrase in ai_phrases)
        ):
            program_id = 'ai'

        # Pick up an explicit semester such as "1 семестр" / "1 семестре"
        # (the stem "семестр" covers the inflected forms).
        semester = next(
            (i for i in range(1, 5) if f'{i} семестр' in message_lower),
            None,
        )

        context = filter_courses(message, program_id, semester)

        if not context:
            response = 'К сожалению, не нашел релевантной информации в учебных планах ITMO. Попробуйте переформулировать вопрос.'
        else:
            # Let the LLM phrase an answer from the matched courses.
            response = answer(message, context)

        return jsonify({'response': response})

    except Exception as e:
        return jsonify({'error': f'Ошибка обработки: {str(e)}'}), 500
|
| 70 |
+
|
| 71 |
+
@app.route('/api/recommendations', methods=['POST'])
def get_recommendations():
    """Return personalised course recommendations for a student profile.

    Expects a JSON object with optional ``programming_exp`` and
    ``math_level`` (numbers, default 2), optional ``interests`` (list of
    tags) and a required ``semester``.

    Returns:
        ``{"response": ...}`` on success. ``{"error": ...}`` with status 400
        for a missing/invalid semester or malformed profile values, 404 when
        no course matches, and 500 on an unexpected failure.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body; an
        # empty dict then flows into the normal "semester missing" 400.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        semester = data.get('semester', '')
        if not semester:
            return jsonify({'error': 'Пожалуйста, укажите семестр для получения рекомендаций.'}), 400

        try:
            semester_int = int(semester)
        except (TypeError, ValueError):
            # TypeError covers non-scalar JSON values such as lists.
            return jsonify({'error': 'Пожалуйста, выберите корректный семестр.'}), 400

        # Numbers may arrive as strings ("3"); the scoring code compares
        # them with integers, so convert here and reject anything else.
        levels = []
        for key in ('programming_exp', 'math_level'):
            value = data.get(key, 2)
            if isinstance(value, str):
                try:
                    value = int(value)
                except ValueError:
                    value = None
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return jsonify({'error': 'Некорректные параметры профиля.'}), 400
            levels.append(value)
        programming_exp, math_level = levels

        # A single tag sent as a bare string would otherwise be iterated
        # character by character when matching interests to course tags.
        interests = data.get('interests') or []
        if isinstance(interests, str):
            interests = [interests]
        elif not isinstance(interests, list):
            return jsonify({'error': 'Некорректные параметры профиля.'}), 400

        profile = {
            'programming_experience': programming_exp,
            'math_level': math_level,
            'interests': interests,
            'semester': semester_int
        }

        recommended_courses = recommend_courses(profile)

        if not recommended_courses:
            return jsonify({'error': f'К сожалению, не найдено подходящих курсов для {semester} семестра.'}), 404

        # Let the LLM turn the ranked courses into a textual recommendation.
        response = generate_recommendations(recommended_courses, profile)

        return jsonify({'response': response})

    except Exception as e:
        return jsonify({'error': f'Ошибка получения рекомендаций: {str(e)}'}), 500
|
| 110 |
+
|
| 111 |
+
@app.route('/api/update', methods=['POST'])
def update_data():
    """Re-run the parser and reload the module-level course list.

    Returns a success message with the new course count, or an error
    payload with status 500 when parsing reports failure or raises.
    """
    global courses
    try:
        if not parse_all():
            return jsonify({'error': 'Ошибка при обновлении данных. Используются базовые курсы.'}), 500

        # Parsing succeeded: refresh the list used for the course counts.
        courses = load_courses()
        return jsonify({'message': f'Данные успешно обновлены! Загружено {len(courses)} курсов.'})
    except Exception as err:
        return jsonify({'error': f'Ошибка обновления данных: {str(err)}'}), 500
|
| 125 |
+
|
| 126 |
+
@app.route('/api/status')
def status():
    """Report service status and the number of loaded courses.

    NOTE: ``llm_available`` is a constant ``True``; the model is not probed.
    """
    payload = {
        'status': 'ok',
        'courses_count': len(courses),
        'llm_available': True,
    }
    return jsonify(payload)
|
| 134 |
+
|
| 135 |
+
if __name__ == '__main__':
    # Listen on all interfaces so the server is reachable from outside the
    # container. The port defaults to 5000 and can be overridden with the
    # PORT environment variable.
    app.run(host='0.0.0.0', port=int(os.environ.get('PORT', 5000)), debug=False)
|
data/processed/courses.json
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "ai_1_1",
|
| 4 |
+
"program_id": "ai",
|
| 5 |
+
"semester": 1,
|
| 6 |
+
"name": "Машинное обучение",
|
| 7 |
+
"credits": 6,
|
| 8 |
+
"hours": 108,
|
| 9 |
+
"type": "required",
|
| 10 |
+
"tags": ["ml", "math", "stats"],
|
| 11 |
+
"short_desc": "Основы машинного обучения, алгоритмы классификации и регрессии",
|
| 12 |
+
"source_pdf": "ai_curriculum.pdf",
|
| 13 |
+
"source_page": 1
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"id": "ai_1_2",
|
| 17 |
+
"program_id": "ai",
|
| 18 |
+
"semester": 1,
|
| 19 |
+
"name": "Глубокое обучение",
|
| 20 |
+
"credits": 4,
|
| 21 |
+
"hours": 72,
|
| 22 |
+
"type": "required",
|
| 23 |
+
"tags": ["dl", "ml", "neural"],
|
| 24 |
+
"short_desc": "Нейронные сети, CNN, RNN, трансформеры",
|
| 25 |
+
"source_pdf": "ai_curriculum.pdf",
|
| 26 |
+
"source_page": 1
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"id": "ai_2_1",
|
| 30 |
+
"program_id": "ai",
|
| 31 |
+
"semester": 2,
|
| 32 |
+
"name": "Обработка естественного языка",
|
| 33 |
+
"credits": 5,
|
| 34 |
+
"hours": 90,
|
| 35 |
+
"type": "required",
|
| 36 |
+
"tags": ["nlp", "dl", "text"],
|
| 37 |
+
"short_desc": "Методы обработки текста, токенизация, эмбеддинги",
|
| 38 |
+
"source_pdf": "ai_curriculum.pdf",
|
| 39 |
+
"source_page": 2
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"id": "ai_product_1_1",
|
| 43 |
+
"program_id": "ai_product",
|
| 44 |
+
"semester": 1,
|
| 45 |
+
"name": "Продуктовая аналитика",
|
| 46 |
+
"credits": 6,
|
| 47 |
+
"hours": 108,
|
| 48 |
+
"type": "required",
|
| 49 |
+
"tags": ["product", "business", "data"],
|
| 50 |
+
"short_desc": "Анализ продуктовых метрик, A/B тестирование",
|
| 51 |
+
"source_pdf": "ai_product_curriculum.pdf",
|
| 52 |
+
"source_page": 1
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"id": "ai_product_1_2",
|
| 56 |
+
"program_id": "ai_product",
|
| 57 |
+
"semester": 1,
|
| 58 |
+
"name": "Управление проектами",
|
| 59 |
+
"credits": 4,
|
| 60 |
+
"hours": 72,
|
| 61 |
+
"type": "required",
|
| 62 |
+
"tags": ["pm", "business", "management"],
|
| 63 |
+
"short_desc": "Методологии управления проектами, Agile, Scrum",
|
| 64 |
+
"source_pdf": "ai_product_curriculum.pdf",
|
| 65 |
+
"source_page": 1
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"id": "ai_product_2_1",
|
| 69 |
+
"program_id": "ai_product",
|
| 70 |
+
"semester": 2,
|
| 71 |
+
"name": "Компьютерное зрение",
|
| 72 |
+
"credits": 5,
|
| 73 |
+
"hours": 90,
|
| 74 |
+
"type": "elective",
|
| 75 |
+
"tags": ["cv", "dl", "image"],
|
| 76 |
+
"short_desc": "Обработка изображений, распознавание объектов",
|
| 77 |
+
"source_pdf": "ai_product_curriculum.pdf",
|
| 78 |
+
"source_page": 2
|
| 79 |
+
}
|
| 80 |
+
]
|
data/processed/programs.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"ai": {
|
| 3 |
+
"id": "ai",
|
| 4 |
+
"title": "Искусственный интеллект",
|
| 5 |
+
"description": "Магистерская программа по искусственному интеллекту в ITMO",
|
| 6 |
+
"url": "https://abit.itmo.ru/program/master/ai",
|
| 7 |
+
"pdf_links": [
|
| 8 |
+
{
|
| 9 |
+
"url": "https://abit.itmo.ru/program/master/ai/curriculum",
|
| 10 |
+
"text": "учебный план",
|
| 11 |
+
"filename": "ai_curriculum.pdf"
|
| 12 |
+
}
|
| 13 |
+
],
|
| 14 |
+
"hash": "test_hash_ai"
|
| 15 |
+
},
|
| 16 |
+
"ai_product": {
|
| 17 |
+
"id": "ai_product",
|
| 18 |
+
"title": "AI Product",
|
| 19 |
+
"description": "Магистерская программа по продуктовой разработке с ИИ",
|
| 20 |
+
"url": "https://abit.itmo.ru/program/master/ai_product",
|
| 21 |
+
"pdf_links": [
|
| 22 |
+
{
|
| 23 |
+
"url": "https://abit.itmo.ru/program/master/ai_product/curriculum",
|
| 24 |
+
"text": "учебный план",
|
| 25 |
+
"filename": "ai_product_curriculum.pdf"
|
| 26 |
+
}
|
| 27 |
+
],
|
| 28 |
+
"hash": "test_hash_ai_product"
|
| 29 |
+
}
|
| 30 |
+
}
|
data_layer.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from parser import get_fallback_courses
|
| 4 |
+
|
| 5 |
+
def load_courses():
    """Load the course list from ``data/processed/courses.json``.

    The path is relative to the current working directory. Falls back to
    ``get_fallback_courses()`` when the file is missing, cannot be read or
    decoded, or does not contain a non-empty JSON list.

    Returns:
        list: The courses read from the file, or the fallback courses.
    """
    courses_file = 'data/processed/courses.json'
    try:
        with open(courses_file, 'r', encoding='utf-8') as f:
            courses = json.load(f)
    except FileNotFoundError:
        # No parsed data yet: not an error, just use the built-in courses.
        return get_fallback_courses()
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f'Ошибка загрузки курсов: {e}')
        return get_fallback_courses()

    # An empty or non-list payload (e.g. a parse run that found nothing)
    # would leave the application without any courses.
    if not isinstance(courses, list) or not courses:
        print(f'Ошибка загрузки курсов: файл {courses_file} не содержит списка курсов')
        return get_fallback_courses()

    return courses
|
| 19 |
+
|
| 20 |
+
def filter_courses(query, program_id=None, semester=None):
    """Return up to 8 courses that match a free-text query.

    Args:
        query: User text; split on whitespace into lower-cased keywords.
        program_id: When truthy, keep only courses of this programme.
        semester: When truthy, keep only courses of this semester.

    Returns:
        list: Courses whose name, short description or tags contain at
        least one keyword as a substring, in file order, capped at 8.
    """
    # Strip surrounding punctuation so that e.g. "nlp?" still matches the
    # "nlp" tag. Tokens that become empty are dropped, because '' is a
    # substring of every string and would match every course.
    punctuation = '.,!?;:()"\'«»'
    keywords = [
        word
        for word in (token.strip(punctuation) for token in query.lower().split())
        if word
    ]

    filtered = []
    for course in load_courses():
        if program_id and course.get('program_id') != program_id:
            continue

        if semester and course.get('semester') != semester:
            continue

        # Search across name, description and tags at once.
        course_text = f"{course.get('name', '')} {course.get('short_desc', '')} {' '.join(course.get('tags', []))}".lower()

        if any(word in course_text for word in keywords):
            filtered.append(course)

    return filtered[:8]
|
| 43 |
+
|
| 44 |
+
def recommend_courses(profile):
    """Rank courses for a student profile and return the best seven.

    Args:
        profile: Dict with optional ``programming_experience`` and
            ``math_level`` (default 2), ``interests`` (default empty list)
            and ``semester``.

    Returns:
        list: Up to 7 courses with a positive score, highest score first.
    """
    candidates = load_courses()

    programming_exp = profile.get('programming_experience', 2)
    math_level = profile.get('math_level', 2)
    interests = profile.get('interests', [])
    semester = profile.get('semester')

    # Restrict to the requested semester when one is given.
    if semester:
        candidates = [item for item in candidates if item.get('semester') == semester]

    ranked = []
    for course in candidates:
        tags = course.get('tags', [])
        points = 0

        # Programming experience: at most one of the three bonuses applies.
        if programming_exp <= 2 and 'python' in tags:
            points += 2
        elif 2 <= programming_exp <= 4 and 'ml' in tags:
            points += 2
        elif programming_exp >= 4 and 'dl' in tags:
            points += 2

        # Maths level: both bonuses may apply.
        if math_level >= 2 and 'math' in tags:
            points += 2
        if math_level >= 3 and 'stats' in tags:
            points += 1

        # Three points for every interest that is also a course tag.
        points += 3 * len([tag for tag in interests if tag in tags])

        # Extra weight for AI Product courses when the student is
        # interested in product or business topics.
        if ('product' in interests or 'business' in interests) and course.get('program_id') == 'ai_product':
            points += 2

        if points > 0:
            ranked.append((course, points))

    best_first = sorted(ranked, key=lambda pair: pair[1], reverse=True)
    return [course for course, _ in best_first[:7]]
|
| 92 |
+
|
| 93 |
+
def is_relevant(message):
    """Tell whether a message is about ITMO master's programmes.

    A message is relevant when it contains one of the topic keywords or the
    full name of a known course (case-insensitive).

    Args:
        message: The user's message.

    Returns:
        bool: True when the message looks on-topic.
    """
    import re

    # Longer keywords are matched as substrings so inflected forms such as
    # "курсы" still hit "курс".
    itmo_keywords = [
        'итмо', 'магистратура', 'учебный план', 'дисциплина', 'курс',
        'ai product', 'институт ии', 'программа',
        'машинное обучение', 'глубокое обучение', 'nlp', 'компьютерное зрение',
        'продукт', 'аналитика', 'управление', 'обучение', 'учеба'
    ]

    message_lower = message.lower()

    # The two-letter abbreviations must be whole words: as substrings "ии"
    # matches "информации" and "ai" matches "Spain", which would let
    # unrelated questions through.
    if re.search(r'(?<!\w)(?:ии|ai)(?!\w)', message_lower):
        return True

    if any(keyword in message_lower for keyword in itmo_keywords):
        return True

    # Fall back to an exact course-name mention. Empty names are skipped
    # because '' is contained in every message.
    for course in load_courses():
        name = course.get('name', '').lower()
        if name and name in message_lower:
            return True

    return False
|
| 115 |
+
|
| 116 |
+
def get_program_info(program_id):
    """Return static information about a master's programme.

    Args:
        program_id: Programme identifier, ``'ai'`` or ``'ai_product'``.

    Returns:
        dict | None: A dict with ``name``, ``description``, ``duration``,
        ``credits_total`` and ``career``, or None for an unknown id.
    """
    programs = {
        'ai': {
            'name': 'Искусственный интеллект',
            # The word "естественного" was corrupted by a bad byte sequence.
            'description': 'Программа готовит специалистов в области машинного обучения, глубокого обучения, обработки естественного языка и компьютерного зрения.',
            'duration': '2 года (4 семестра)',
            'credits_total': 120,
            'career': 'ML Engineer, Data Scientist, Research Scientist, AI Developer'
        },
        'ai_product': {
            'name': 'AI Product Management',
            'description': 'Программа готовит продуктовых менеджеров, способных создавать и развивать ИИ-продукты.',
            'duration': '2 года (4 семестра)',
            'credits_total': 120,
            'career': 'Product Manager, AI Product Manager, Business Analyst, Product Owner'
        }
    }

    return programs.get(program_id)
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
itmo-chatbot:
|
| 5 |
+
build: .
|
| 6 |
+
ports:
|
| 7 |
+
- "5000:5000"
|
| 8 |
+
volumes:
|
| 9 |
+
- ./data:/app/data
|
| 10 |
+
environment:
|
| 11 |
+
- FLASK_ENV=production
|
| 12 |
+
restart: unless-stopped
|
llm.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
|
| 3 |
+
# Cached text2text pipeline; stays None until the first successful load.
_generator = None


def load_model():
    """Return the shared generation pipeline, creating it on first use.

    Returns None when the pipeline cannot be created. A failure is not
    cached, so the next call attempts the load again.
    """
    global _generator
    if _generator is not None:
        return _generator

    try:
        # Imported here so the module can be imported without transformers.
        from transformers import pipeline
        print('Загрузка LLM модели...')
        _generator = pipeline('text2text-generation', model='cointegrated/rut5-base-multitask')
        print('LLM модель загружена')
    except Exception as err:
        print(f'Ошибка загрузки LLM: {err}')
        _generator = None
    return _generator
|
| 19 |
+
|
| 20 |
+
def answer(question, context, system_prompt=None):
    """Generate an answer to *question* from the courses in *context*.

    Args:
        question: The user's question; interpolated into the prompt as-is.
        context: Sequence of course dicts. Each of the first six must provide
            ``name``, ``semester``, ``credits``, ``short_desc`` and ``tags``.
        system_prompt: Optional instruction text placed at the top of the
            prompt; a built-in default is used when ``None``.

    Returns:
        The stripped generated text, or the result of ``fallback_answer``
        when the model is unavailable, *context* is empty, or generation
        raises.
    """
    generator = load_model()

    # Nothing to generate from without a model or without any context.
    if not generator or not context:
        return fallback_answer(context)

    try:
        # Only the first six courses are included in the prompt.
        parts = ['Доступные курсы:\n']
        for i, course in enumerate(context[:6], 1):
            parts.append(
                f'{i}. {course["name"]} ({course["semester"]} семестр, {course["credits"]} кредитов)\n'
                f' Описание: {course["short_desc"]}\n'
                f' Теги: {", ".join(course["tags"])}\n\n'
            )
        context_text = ''.join(parts)

        if system_prompt is None:
            system_prompt = '''Ты помощник для абитуриентов магистратуры ITMO. Отвечай только по предоставленному контексту.
Если в контексте нет нужной информации — ответь: "в предоставленных данных об этом не сказано."
Отвечай кратко и по делу.
Не выдумывай факты и не давай общих ответов без ссылок на элементы контекста.'''

        prompt = f'''{system_prompt}

{context_text}

Вопрос: {question}'''

        response = generator(
            prompt,
            max_new_tokens=180,
            temperature=0.3,
            do_sample=True,
        )[0]['generated_text']

        return response.strip()

    except Exception:
        # Best-effort: any failure (missing course key, model error) degrades
        # to the template answer, but the traceback is kept in the log.
        logging.getLogger(__name__).exception('Ошибка генерации LLM')
        return fallback_answer(context)
|
| 62 |
+
|
| 63 |
+
def fallback_answer(context):
    """Build a template answer without calling the LLM.

    Args:
        context: Sequence of course dicts with ``name``, ``semester`` and
            ``credits`` keys; may be empty or ``None``.

    Returns:
        A sentence listing at most the first three courses, or a fixed
        "not stated in the data" message when *context* is empty.
    """
    if context:
        summaries = [
            f'{entry["name"]} ({entry["semester"]} семестр, {entry["credits"]} кредитов)'
            for entry in context[:3]
        ]
        return 'Найденные курсы: ' + ', '.join(summaries) + '.'
    return 'В предоставленных данных об этом не сказано.'
|
| 73 |
+
|
| 74 |
+
def generate_recommendations(courses, profile):
    """Generate personalised course recommendations with the LLM.

    Args:
        courses: Sequence of course dicts. Each of the first seven must
            provide ``name``, ``credits``, ``tags`` and ``short_desc``;
            ``difficulty`` is optional.
        profile: Mapping describing the student. Reads
            ``programming_experience`` (default 2), ``math_level``
            (default 2), ``interests`` (default empty list) and ``semester``.

    Returns:
        The stripped generated text, or the result of
        ``fallback_recommendations`` when the model is unavailable,
        *courses* is empty, or generation raises.
    """
    generator = load_model()

    if not generator or not courses:
        return fallback_recommendations(courses, profile)

    try:
        # Only the first seven courses are included in the prompt.
        parts = ['Доступные курсы:\n']
        for i, course in enumerate(courses[:7], 1):
            parts.append(
                f'{i}. {course["name"]} ({course["credits"]} кредитов)\n'
                f' Сложность: {course.get("difficulty", "не указана")}, Теги: {", ".join(course["tags"])}\n'
                f' Описание: {course["short_desc"]}\n\n'
            )
        courses_text = ''.join(parts)

        programming_exp = profile.get('programming_experience', 2)
        math_level = profile.get('math_level', 2)
        interests = profile.get('interests', [])
        semester = profile.get('semester', 'не указан')

        prompt = f'''Ты эксперт по выбору курсов. Дай персонализированные рекомендации студенту.

Профиль студента:
- Опыт программирования: {programming_exp}/5
- Уровень математики: {math_level}/4
- Интересы: {", ".join(interests)}
- Целевой семестр: {semester}

{courses_text}

Дай 5-7 лучших рекомендаций с объяснением почему они подходят. Учитывай уровень сложности и интересы. Отвечай кратко, по делу.'''

        response = generator(
            prompt,
            max_new_tokens=300,
            temperature=0.4,
            do_sample=True,
        )[0]['generated_text']

        return response.strip()

    except Exception:
        # Best-effort: degrade to rule-based recommendations, keeping the
        # traceback in the log.
        logging.getLogger(__name__).exception('Ошибка генерации рекомендаций')
        return fallback_recommendations(courses, profile)
|
| 119 |
+
|
| 120 |
+
def fallback_recommendations(courses, profile):
    """Build rule-based recommendations without calling the LLM.

    Args:
        courses: Sequence of course dicts with ``name`` and ``credits``;
            ``tags`` is optional. Only the first seven are listed.
        profile: Mapping with optional ``programming_experience``,
            ``math_level``, ``interests`` and ``semester`` entries.

    Returns:
        A multi-line text listing the courses, each followed by the reasons
        that matched the profile (if any), or a "no courses" message when
        *courses* is empty.
    """
    semester = profile.get('semester', 'не указан')
    if not courses:
        return f'Нет курсов для {semester} семестра.'

    programming_exp = profile.get('programming_experience', 2)
    math_level = profile.get('math_level', 2)
    interests = profile.get('interests', [])

    chunks = [f'🎯 Рекомендации для {semester} семестра:\n\n']

    for position, course in enumerate(courses[:7], 1):
        chunks.append(f'{position}. {course["name"]} ({course["credits"]} кредитов)\n')

        why = []

        # Interests that literally appear among the course tags.
        shared = [tag for tag in interests if tag in course.get('tags', [])]
        if shared:
            why.append(f'подходит по интересам: {", ".join(shared)}')

        if programming_exp <= 2 and 'python' in course.get('tags', []):
            why.append('подходит для начинающих программистов')
        elif programming_exp >= 4 and 'dl' in course.get('tags', []):
            why.append('подходит для опытных программистов')

        if math_level >= 2 and 'math' in course.get('tags', []):
            why.append('требует хорошую математическую подготовку')

        if why:
            chunks.append(f' Почему подходит: {"; ".join(why)}\n')

        chunks.append('\n')

    return ''.join(chunks)
|
parser.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import re
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def parse_program_page(url, program_id):
    """Fetch a programme page and extract its title, description and plan links.

    Args:
        url: Address of the programme page.
        program_id: Short programme identifier; used in log messages and in
            the placeholder title/description.

    Returns:
        A dict with keys ``title``, ``description``, ``pdf_links`` (list of
        absolute URLs) and ``source_url``. If anything fails (network error,
        HTTP error status, parsing error) the error is logged and a dict with
        placeholder title/description and an empty ``pdf_links`` is returned.
    """
    # Local import keeps this function self-contained; urljoin resolves
    # root-relative, path-relative and protocol-relative hrefs correctly.
    from urllib.parse import urljoin

    try:
        logger.info(f'Парсинг страницы {program_id}: {url}')
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Title: first <h1>, otherwise a placeholder.
        title = soup.find('h1')
        title_text = title.get_text().strip() if title else f'Программа {program_id}'

        # Description: a div.description if present, otherwise the first <p>.
        description = soup.find('div', class_='description') or soup.find('p')
        desc_text = description.get_text().strip() if description else f'Описание программы {program_id}'

        # Candidate curriculum links: any href mentioning .pdf/curriculum/plan.
        pdf_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            href_lower = href.lower()
            if '.pdf' in href_lower or 'curriculum' in href_lower or 'plan' in href_lower:
                # Resolve against the URL actually served (after redirects).
                pdf_links.append(urljoin(response.url, href))

        logger.info(f'Найдено {len(pdf_links)} PDF ссылок для {program_id}')

        return {
            'title': title_text,
            'description': desc_text,
            'pdf_links': pdf_links,
            'source_url': url
        }

    except Exception as e:
        # Best-effort boundary: the caller still gets a usable record, and
        # the traceback is kept in the log.
        logger.exception(f'Ошибка парсинга страницы {program_id}: {e}')
        return {
            'title': f'Программа {program_id}',
            'description': f'Описание программы {program_id}',
            'pdf_links': [],
            'source_url': url
        }
|
| 56 |
+
|
| 57 |
+
def parse_pdf(url, program_id):
    """Placeholder for curriculum PDF parsing.

    No PDF is downloaded or parsed: the function only logs the attempt and
    returns an empty list so the rest of the pipeline keeps working.

    Args:
        url: Address of the PDF (only logged).
        program_id: Programme identifier (currently unused).

    Returns:
        An empty list.
    """
    courses = []
    try:
        logger.info(f'Попытка парсинга PDF: {url}')
        # Real table extraction from the PDF would populate `courses` here.
    except Exception as err:
        logger.error(f'Ошибка парсинга PDF {url}: {err}')
    return courses
|
| 71 |
+
|
| 72 |
+
def _keyword_in_text(keyword, text):
    """Return True if *keyword* occurs in the lower-cased *text*.

    Short Latin abbreviations (up to three ASCII letters, e.g. ``ml``) must
    occur as whole words so they do not fire inside unrelated words such as
    "html" or "deadline"; every other keyword is matched as a substring so
    word stems keep working.
    """
    if keyword.isascii() and keyword.isalpha() and len(keyword) <= 3:
        return re.search(rf'\b{re.escape(keyword)}\b', text) is not None
    return keyword in text


def normalize_course(course_data, program_id):
    """Fill in derived fields of a course record.

    The dict is modified in place and also returned.

    Args:
        course_data: Course dict. ``name`` and ``short_desc`` are read if
            present; ``short_desc``, ``tags`` and ``program_id`` are written.
        program_id: Identifier stored under ``program_id``.

    Returns:
        The same *course_data* dict with ``short_desc`` (defaulting to the
        first 200 characters of the name), ``tags`` (derived from name and
        short description; any existing value is replaced) and
        ``program_id`` set.
    """
    if 'short_desc' not in course_data:
        course_data['short_desc'] = course_data.get('name', '')[:200]

    text = f"{course_data.get('name', '')} {course_data.get('short_desc', '')}".lower()

    # (tag, keywords) pairs; the order here is the order of the resulting tags.
    tag_rules = (
        ('ml', ('машинное обучение', 'ml', 'machine learning')),
        ('dl', ('глубокое обучение', 'dl', 'neural', 'нейрон')),
        ('nlp', ('nlp', 'язык', 'текст', 'natural language')),
        ('cv', ('зрение', 'vision', 'image', 'изображение')),
        ('product', ('продукт', 'product', 'менеджмент', 'management')),
        ('business', ('бизнес', 'business', 'аналитика', 'analytics')),
        ('research', ('исследование', 'research', 'наука')),
        ('data', ('данные', 'data', 'статистика')),
        ('systems', ('системы', 'systems', 'архитектура')),
        ('python', ('python', 'программирование')),
        ('math', ('математика', 'math', 'статистика', 'оптимизация')),
    )

    course_data['tags'] = [
        tag
        for tag, keywords in tag_rules
        if any(_keyword_in_text(keyword, text) for keyword in keywords)
    ]
    course_data['program_id'] = program_id

    return course_data
|
| 109 |
+
|
| 110 |
+
def get_fallback_courses():
    """Return the built-in course list used when parsing yields nothing.

    Returns:
        A new list of twelve course dicts (six for programme ``ai``, six for
        ``ai_product``). Every dict has the keys ``id``, ``program_id``,
        ``semester``, ``name``, ``credits``, ``hours``, ``type``,
        ``short_desc``, ``tags`` and ``source_url``, in that order.
    """
    # (id, program_id, semester, name, credits, hours, short_desc, tags)
    rows = (
        # AI programme
        ('ai_1_1', 'ai', 1, 'Машинное обучение', 6, 108,
         'Основы машинного обучения, алгоритмы классификации и регрессии',
         ('ml', 'math', 'stats', 'python')),
        ('ai_1_2', 'ai', 1, 'Глубокое обучение', 4, 72,
         'Нейронные сети, CNN, RNN, трансформеры',
         ('dl', 'ml', 'neural', 'python')),
        ('ai_2_1', 'ai', 2, 'Обработка естественного языка', 5, 90,
         'Методы обработки текста, токенизация, эмбеддинги',
         ('nlp', 'dl', 'text', 'transformers')),
        ('ai_2_2', 'ai', 2, 'Компьютерное зрение', 4, 72,
         'Обработка изображений, CNN, детекция объектов',
         ('cv', 'dl', 'image', 'cnn')),
        ('ai_3_1', 'ai', 3, 'Продвинутые методы машинного обучения', 5, 90,
         'Продвинутые алгоритмы ML, ансамбли, оптимизация',
         ('ml', 'advanced', 'algorithms')),
        ('ai_4_1', 'ai', 4, 'Магистерская диссертация', 12, 216,
         'Научно-исследовательская работа, защита диссертации',
         ('research', 'thesis', 'project')),
        # AI Product programme
        ('ai_product_1_1', 'ai_product', 1, 'Продуктовая аналитика', 6, 108,
         'Анализ продуктовых метрик, A/B тестирование',
         ('product', 'business', 'data', 'analytics')),
        ('ai_product_1_2', 'ai_product', 1, 'Управление проектами', 4, 72,
         'Методологии управления проектами, Agile, Scrum',
         ('pm', 'business', 'management', 'agile')),
        ('ai_product_2_1', 'ai_product', 2, 'UX/UI для ИИ продуктов', 4, 72,
         'Дизайн интерфейсов для ИИ, UX исследования',
         ('ux', 'ui', 'design', 'ai')),
        ('ai_product_2_2', 'ai_product', 2, 'Этика ИИ', 3, 54,
         'Этические принципы ИИ, справедливость, прозрачность',
         ('ethics', 'ai', 'responsible', 'fairness')),
        ('ai_product_3_1', 'ai_product', 3, 'Управление ИИ продуктами', 6, 108,
         'Стратегическое управление ИИ продуктами, команды',
         ('product', 'management', 'ai', 'leadership')),
        ('ai_product_4_1', 'ai_product', 4, 'Дипломный проект', 12, 216,
         'Разработка ИИ продукта, защита проекта',
         ('project', 'thesis', 'product')),
    )

    fallback = []
    for course_id, program_id, semester, name, credits, hours, short_desc, tags in rows:
        fallback.append({
            'id': course_id,
            'program_id': program_id,
            'semester': semester,
            'name': name,
            'credits': credits,
            'hours': hours,
            'type': 'required',
            'short_desc': short_desc,
            # Fresh list per call so callers may mutate the result freely.
            'tags': list(tags),
            'source_url': 'https://abit.itmo.ru/program/master/' + program_id,
        })
    return fallback
|
| 261 |
+
|
| 262 |
+
def parse_all():
    """Parse all programme pages and write the course list to JSON.

    For every known programme the page is fetched, each linked PDF is passed
    to ``parse_pdf`` and every resulting course is normalised. When no course
    is found at all, the built-in fallback list is used instead. The result
    is written to ``data/processed/courses.json``.

    Returns:
        ``True`` when a course file was written (parsed or fallback data),
        ``False`` when even the fallback file could not be written.
    """
    output_dir = 'data/processed'
    courses_file = 'data/processed/courses.json'

    def write_json(payload, path):
        # UTF-8 with ensure_ascii=False keeps the Cyrillic text readable.
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    try:
        logger.info('Начинаем парсинг всех данных')

        os.makedirs(output_dir, exist_ok=True)

        programs = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

        all_courses = []
        for program_id, url in programs.items():
            program_info = parse_program_page(url, program_id)
            for pdf_url in program_info['pdf_links']:
                all_courses.extend(
                    normalize_course(course, program_id)
                    for course in parse_pdf(pdf_url, program_id)
                )

        # Nothing parsed from any programme: use the built-in list.
        if not all_courses:
            logger.warning('Парсинг не дал результатов, используем fallback курсы')
            all_courses = get_fallback_courses()

        write_json(all_courses, courses_file)

        logger.info(f'Сохранено {len(all_courses)} курсов в {courses_file}')
        return True

    except Exception as e:
        logger.error(f'Ошибка парсинга: {e}')
        # Last resort: try to leave at least the fallback data on disk.
        try:
            os.makedirs(output_dir, exist_ok=True)
            write_json(get_fallback_courses(), 'data/processed/courses.json')
            logger.info('Сохранены fallback курсы')
            return True
        except Exception as e2:
            logger.error(f'Ошибка сохранения fallback курсов: {e2}')
            return False
|
prompts/system.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Ты - помощник для абитуриентов магистратур ITMO. Отвечай только по контексту, предоставленному ниже.
|
| 2 |
+
|
| 3 |
+
ПРАВИЛА:
|
| 4 |
+
1. Отвечай только на основе информации из контекста
|
| 5 |
+
2. Если в контексте нет ответа - прямо скажи: "в предоставленных данных об этом не сказано"
|
| 6 |
+
3. Отвечай кратко и по делу
|
| 7 |
+
4. Не выдумывай информацию
|
| 8 |
+
5. Если спрашивают о курсах - указывай семестр и количество кредитов
|
| 9 |
+
6. Если спрашивают о программах - давай краткое описание из контекста
|
| 10 |
+
7. Будь вежливым и полезным
|
| 11 |
+
|
| 12 |
+
Контекст содержит информацию о курсах из официальных учебных планов ITMO.
|
requirements.txt
CHANGED
|
@@ -1,3 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask==2.3.3
|
| 2 |
+
transformers==4.36.2
|
| 3 |
+
torch==2.1.0
|
| 4 |
+
requests==2.31.0
|
| 5 |
+
beautifulsoup4==4.12.2
|
| 6 |
+
numpy==1.24.3
|
| 7 |
+
sentencepiece==0.1.99
|
| 8 |
+
huggingface-hub==0.19.4
|
scraper/html_scraper.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import re
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import hashlib
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
class HTMLScraper:
    """Scrape programme pages and collect links to curriculum documents."""

    def __init__(self):
        """Create the HTTP session and the table of programme URLs."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # programme id -> page URL
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
        """Scrape every configured programme.

        A failure on one programme is printed and that programme is left out
        of the result; the remaining programmes are still processed.

        Returns:
            Mapping of programme id to the dict built by
            ``_scrape_program_page``.
        """
        programs = {}

        for program_id, url in self.program_urls.items():
            try:
                print(f'Скрапинг программы {program_id}...')
                programs[program_id] = self._scrape_program_page(url, program_id)
            except Exception as e:
                print(f'Ошибка при скрапинге {program_id}: {e}')

        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
        """Download one programme page and extract its data.

        Raises:
            Whatever the session raises for network errors, and the HTTP
            error raised by ``raise_for_status`` for error responses.
        """
        response = self.session.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        return {
            'id': program_id,
            'title': self._extract_title(soup),
            'description': self._extract_description(soup),
            'url': url,
            'pdf_links': self._extract_pdf_links(soup, url),
            # Hash of the raw page bytes.
            'hash': self._calculate_hash(response.content)
        }

    def _extract_title(self, soup: 'BeautifulSoup') -> str:
        """Return the stripped text of the first <h1> (or <title>), else ''."""
        title_elem = soup.find('h1') or soup.find('title')
        if title_elem:
            return title_elem.get_text().strip()
        return ''

    def _extract_description(self, soup: 'BeautifulSoup') -> str:
        """Return up to 500 characters of the first sufficiently long candidate.

        Selectors are tried in order; for each, only the first matching
        element is considered, and it is accepted when its stripped text is
        longer than 50 characters. Returns '' when nothing qualifies.
        """
        desc_selectors = [
            '.program-description',
            '.description',
            '.program-info',
            'p',
            '.content'
        ]

        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
                if len(text) > 50:
                    return text[:500]

        return ''

    def _extract_pdf_links(self, soup: 'BeautifulSoup', base_url: str) -> List[Dict]:
        """Collect every anchor that looks like a curriculum/PDF link.

        Returns:
            List of dicts with ``url`` (absolute), ``text`` (lower-cased link
            text) and ``filename``.
        """
        pdf_links = []

        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()

            if self._is_pdf_link(href, text):
                pdf_links.append({
                    'url': self._make_absolute_url(href, base_url),
                    'text': text,
                    'filename': self._extract_filename(href)
                })

        return pdf_links

    def _is_pdf_link(self, href: str, text: str) -> bool:
        """Return True if the href or the link text contains an indicator word.

        *text* is expected to be lower-cased already; *href* is lower-cased
        here.
        """
        # NOTE(review): 'program' and 'plan' are very broad: any link whose
        # href contains "program" is reported, which likely includes ordinary
        # navigation links on the programme pages. Confirm whether intended.
        pdf_indicators = (
            'учебный план', 'curriculum', 'plan',
            'pdf', '.pdf', 'программа', 'program'
        )

        href_lower = href.lower()
        return any(indicator in href_lower or indicator in text for indicator in pdf_indicators)

    def _make_absolute_url(self, href: str, base_url: str) -> str:
        """Resolve *href* against *base_url*.

        Uses the standard URL resolution rules, so absolute URLs are returned
        unchanged and root-relative, path-relative and protocol-relative
        (``//host/path``) references are all handled.
        """
        from urllib.parse import urljoin

        return urljoin(base_url, href)

    def _extract_filename(self, href: str) -> str:
        """Return the last path segment of *href* with a ``.pdf`` suffix.

        Query string and fragment are ignored; ``.pdf`` is appended unless
        the name already ends with it (case-insensitively).
        """
        import posixpath
        from urllib.parse import urlparse

        filename = posixpath.basename(urlparse(href).path)
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        return filename

    def _calculate_hash(self, content: bytes) -> str:
        """Return the SHA-256 hex digest of *content*."""
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        """Write *programs* as UTF-8 JSON to *output_path*.

        The parent directory is created when the path has one.
        """
        directory = os.path.dirname(output_path)
        if directory:
            # os.makedirs('') raises, so skip it for a bare filename.
            os.makedirs(directory, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)

        print(f'Программы сохранены в {output_path}')
|
| 131 |
+
|
| 132 |
+
def main():
    """Scrape all programmes, save them to JSON and print a short summary."""
    scraper = HTMLScraper()
    scraped = scraper.scrape_programs()
    scraper.save_programs(scraped)

    # The programme id is not needed for the summary, only the records.
    for record in scraped.values():
        print(f'\n{record["title"]}:')
        print(f'PDF ссылок найдено: {len(record["pdf_links"])}')
        for pdf in record['pdf_links']:
            print(f' - {pdf["filename"]}: {pdf["url"]}')


if __name__ == '__main__':
    main()
|
scraper/normalize.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import hashlib
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
|
| 5 |
+
class DataNormalizer:
|
| 6 |
+
def __init__(self):
|
| 7 |
+
self.tag_keywords = {
|
| 8 |
+
'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
|
| 9 |
+
'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
|
| 10 |
+
'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
|
| 11 |
+
'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
|
| 12 |
+
'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
|
| 13 |
+
'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
|
| 14 |
+
'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
|
| 15 |
+
'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
|
| 16 |
+
'pm': ['project management', 'управление проектами', 'pm', 'проект'],
|
| 17 |
+
'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
|
| 18 |
+
'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных']
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
|
| 22 |
+
normalized_courses = []
|
| 23 |
+
seen_hashes = set()
|
| 24 |
+
|
| 25 |
+
for course in courses:
|
| 26 |
+
normalized = self._normalize_course(course)
|
| 27 |
+
if normalized:
|
| 28 |
+
course_hash = self._calculate_course_hash(normalized)
|
| 29 |
+
if course_hash not in seen_hashes:
|
| 30 |
+
seen_hashes.add(course_hash)
|
| 31 |
+
normalized_courses.append(normalized)
|
| 32 |
+
|
| 33 |
+
return normalized_courses
|
| 34 |
+
|
| 35 |
+
def _normalize_course(self, course: Dict) -> Dict:
|
| 36 |
+
if not course.get('name'):
|
| 37 |
+
return None
|
| 38 |
+
|
| 39 |
+
normalized = course.copy()
|
| 40 |
+
|
| 41 |
+
normalized['name'] = self._normalize_name(course['name'])
|
| 42 |
+
normalized['short_desc'] = self._generate_short_desc(course)
|
| 43 |
+
normalized['tags'] = self._generate_tags(normalized)
|
| 44 |
+
|
| 45 |
+
normalized['semester'] = self._normalize_semester(course.get('semester', 1))
|
| 46 |
+
normalized['credits'] = self._normalize_credits(course.get('credits', 0))
|
| 47 |
+
normalized['hours'] = self._normalize_hours(course.get('hours', 0))
|
| 48 |
+
normalized['type'] = self._normalize_type(course.get('type', 'required'))
|
| 49 |
+
|
| 50 |
+
return normalized
|
| 51 |
+
|
| 52 |
+
def _normalize_name(self, name: str) -> str:
|
| 53 |
+
if not name:
|
| 54 |
+
return ''
|
| 55 |
+
|
| 56 |
+
name = str(name).strip()
|
| 57 |
+
name = re.sub(r'\s+', ' ', name)
|
| 58 |
+
name = name.replace('"', '').replace('"', '')
|
| 59 |
+
|
| 60 |
+
return name
|
| 61 |
+
|
| 62 |
+
def _generate_short_desc(self, course: dict) -> str:
|
| 63 |
+
name = course.get('name', '')
|
| 64 |
+
desc = course.get('description', '')
|
| 65 |
+
|
| 66 |
+
if desc:
|
| 67 |
+
desc = str(desc).strip()
|
| 68 |
+
if len(desc) > 220:
|
| 69 |
+
desc = desc[:220] + '...'
|
| 70 |
+
return desc
|
| 71 |
+
|
| 72 |
+
if name and len(name) > 50:
|
| 73 |
+
return name[:220]
|
| 74 |
+
|
| 75 |
+
return 'Курс из учебного плана программы'
|
| 76 |
+
|
| 77 |
+
def _generate_tags(self, course: Dict) -> List[str]:
|
| 78 |
+
text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
|
| 79 |
+
tags = []
|
| 80 |
+
|
| 81 |
+
for tag, keywords in self.tag_keywords.items():
|
| 82 |
+
if any(keyword in text for keyword in keywords):
|
| 83 |
+
tags.append(tag)
|
| 84 |
+
|
| 85 |
+
return tags
|
| 86 |
+
|
| 87 |
+
def _normalize_semester(self, semester) -> int:
|
| 88 |
+
try:
|
| 89 |
+
semester = int(semester)
|
| 90 |
+
if 1 <= semester <= 4:
|
| 91 |
+
return semester
|
| 92 |
+
except (ValueError, TypeError):
|
| 93 |
+
pass
|
| 94 |
+
|
| 95 |
+
return 1
|
| 96 |
+
|
| 97 |
+
def _normalize_credits(self, credits) -> int:
|
| 98 |
+
try:
|
| 99 |
+
credits = int(credits)
|
| 100 |
+
if credits >= 0:
|
| 101 |
+
return credits
|
| 102 |
+
except (ValueError, TypeError):
|
| 103 |
+
pass
|
| 104 |
+
|
| 105 |
+
return 0
|
| 106 |
+
|
| 107 |
+
def _normalize_hours(self, hours) -> int:
|
| 108 |
+
try:
|
| 109 |
+
hours = int(hours)
|
| 110 |
+
if hours >= 0:
|
| 111 |
+
return hours
|
| 112 |
+
except (ValueError, TypeError):
|
| 113 |
+
pass
|
| 114 |
+
|
| 115 |
+
return 0
|
| 116 |
+
|
| 117 |
+
def _normalize_type(self, course_type: str) -> str:
|
| 118 |
+
if not course_type:
|
| 119 |
+
return 'required'
|
| 120 |
+
|
| 121 |
+
type_lower = str(course_type).lower()
|
| 122 |
+
|
| 123 |
+
if any(word in type_lower for word in ['обязательная', 'required', 'обяз']):
|
| 124 |
+
return 'required'
|
| 125 |
+
elif any(word in type_lower for word in ['по выбору', 'elective', 'выбор']):
|
| 126 |
+
return 'elective'
|
| 127 |
+
|
| 128 |
+
return 'required'
|
| 129 |
+
|
| 130 |
+
def _calculate_course_hash(self, course: Dict) -> str:
|
| 131 |
+
text = f"{course.get('name', '')}{course.get('program_id', '')}{course.get('semester', '')}"
|
| 132 |
+
return hashlib.md5(text.encode()).hexdigest()
|
| 133 |
+
|
| 134 |
+
def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
    """Flatten several course lists into one and normalize the result.

    The lists are concatenated in order and passed to
    ``self.normalize_courses``, whose return value is returned unchanged.
    """
    combined = [course for batch in courses_list for course in batch]
    return self.normalize_courses(combined)
|
| 140 |
+
|
| 141 |
+
def validate_course(self, course: Dict) -> bool:
    """Return ``True`` when a course has all mandatory fields and a usable name.

    ``name``, ``program_id`` and ``semester`` must all be truthy, and the
    name must be at least three characters long.
    """
    mandatory = ('name', 'program_id', 'semester')
    if not all(course.get(field) for field in mandatory):
        return False

    return len(course.get('name', '')) >= 3
|
| 152 |
+
|
| 153 |
+
def get_statistics(self, courses: List[Dict]) -> Dict:
    """Count courses overall and per program, semester, type and tag.

    Missing fields are counted under the defaults ``'unknown'`` (program),
    ``1`` (semester) and ``'required'`` (type); a course without ``tags``
    contributes to no tag counter.
    """
    total = len(courses)
    by_program, by_semester, by_type, by_tags = {}, {}, {}, {}

    def bump(counter, key):
        # Increment ``key`` in ``counter``, starting from zero.
        counter[key] = counter.get(key, 0) + 1

    for entry in courses:
        program_id = entry.get('program_id', 'unknown')
        semester = entry.get('semester', 1)
        course_type = entry.get('type', 'required')
        tags = entry.get('tags', [])

        bump(by_program, program_id)
        bump(by_semester, semester)
        bump(by_type, course_type)
        for tag in tags:
            bump(by_tags, tag)

    return {
        'total_courses': total,
        'by_program': by_program,
        'by_semester': by_semester,
        'by_type': by_type,
        'by_tags': by_tags,
    }
|
| 176 |
+
|
| 177 |
+
def main():
    """Run the normalizer on two sample courses and print the outcome."""
    sample_courses = [
        {
            'id': 'test_1',
            'program_id': 'ai',
            'name': 'Машинное обучение',
            'semester': 1,
            'credits': 6,
            'type': 'required',
        },
        {
            'id': 'test_2',
            'program_id': 'ai_product',
            'name': 'Глубокое обучение',
            'semester': 2,
            'credits': 4,
            'type': 'elective',
        },
    ]

    normalizer = DataNormalizer()
    normalized = normalizer.normalize_courses(sample_courses)
    stats = normalizer.get_statistics(normalized)

    print(f'Нормализовано курсов: {len(normalized)}')
    print(f'Статистика: {stats}')


if __name__ == '__main__':
    main()
|
scraper/pdf_parser.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
import os
import re
from typing import Dict, List, Optional

import pdfplumber
import requests
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
class PDFParser:
    """Download curriculum PDFs and extract course records from them.

    Each page is parsed table-first; a page that yields no course from its
    tables is re-parsed with a line-by-line text heuristic.
    """

    # Directory where downloaded PDFs are stored.
    RAW_DIR = 'data/raw'

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def download_pdf(self, url: str, filename: str) -> Optional[str]:
        """Download ``url`` into ``RAW_DIR/filename`` unless it already exists.

        Returns:
            The local path on success (or when the file is already present),
            ``None`` when the download fails.
        """
        local_path = os.path.join(self.RAW_DIR, filename)

        if os.path.exists(local_path):
            print(f'PDF уже загружен: {filename}')
            return local_path

        # Download into a temporary sibling and rename at the end, so an
        # interrupted transfer never leaves a truncated file that the
        # "already downloaded" check above would later accept.
        tmp_path = f'{local_path}.part'
        try:
            print(f'Загрузка PDF: {url}')
            os.makedirs(os.path.dirname(local_path) or '.', exist_ok=True)

            # The context manager releases the streamed connection.
            with self.session.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            os.replace(tmp_path, local_path)
            print(f'PDF сохранен: {local_path}')
            return local_path

        except (requests.RequestException, OSError) as e:
            print(f'Ошибка загрузки PDF {url}: {e}')
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            return None

    def parse_pdf(self, pdf_path: str, program_id: str) -> List[Dict]:
        """Extract course records from every page of ``pdf_path``.

        Returns:
            The courses found. On a parsing error the courses collected so
            far are returned and the error is printed.
        """
        courses: List[Dict] = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                print(f'Парсинг PDF: {pdf_path}')

                for page_num, page in enumerate(tqdm(pdf.pages, desc='Страницы'), 1):
                    courses.extend(self._parse_page(page, page_num, program_id, pdf_path))

                print(f'Найдено курсов: {len(courses)}')

        except Exception as e:
            # Deliberate best-effort boundary: keep whatever was parsed.
            print(f'Ошибка парсинга PDF {pdf_path}: {e}')

        return courses

    def _parse_page(self, page, page_num: int, program_id: str,
                    pdf_path: str = '') -> List[Dict]:
        """Parse one page: tables first, plain-text fallback if they give nothing."""
        courses: List[Dict] = []

        try:
            for table in page.extract_tables():
                courses.extend(self._parse_table(table, page_num, program_id, pdf_path))

            if not courses:
                courses = self._parse_text_fallback(page, page_num, program_id, pdf_path)

        except Exception as e:
            # A broken page must not abort the rest of the document.
            print(f'Ошибка парсинга страницы {page_num}: {e}')

        return courses

    def _parse_table(self, table: list, page_num: int, program_id: str,
                     pdf_path: str = '') -> List[Dict]:
        """Turn an extracted table (first row = headers) into course records.

        Rows with fewer than three cells are skipped.
        """
        courses: List[Dict] = []

        if not table or len(table) < 2:
            return courses

        headers = [str(cell).lower().strip() if cell else '' for cell in table[0]]

        for row in table[1:]:
            if not row or len(row) < 3:
                continue

            course = self._extract_course_from_row(row, headers, page_num, program_id, pdf_path)
            if course:
                courses.append(course)

        return courses

    def _extract_course_from_row(self, row: list, headers: list, page_num: int,
                                 program_id: str, pdf_path: str = '') -> Optional[Dict]:
        """Build a course record from one table row.

        Args:
            row: Cells of the row.
            headers: Lower-cased header cells of the table.
            page_num: 1-based page number, stored as ``source_page``.
            program_id: Program the course belongs to.
            pdf_path: Path of the source PDF; its basename is stored as
                ``source_pdf``. Optional so existing callers keep working.

        Returns:
            The course dict, or ``None`` when the row has no name of at
            least three characters.
        """
        cells = [str(cell).strip() if cell else '' for cell in row]

        name = self._extract_name(cells, headers)
        if not name or len(name) < 3:
            return None

        return {
            'id': self._make_course_id(program_id, page_num, name),
            'program_id': program_id,
            'semester': self._extract_semester(cells, headers),
            'name': name,
            'credits': self._extract_credits(cells, headers),
            'hours': self._extract_hours(cells, headers),
            'type': self._extract_type(cells, headers),
            # Previously referenced an undefined variable here; the resulting
            # NameError was swallowed and every row was discarded.
            'source_pdf': os.path.basename(pdf_path),
            'source_page': page_num
        }

    @staticmethod
    def _make_course_id(program_id: str, page_num: int, text: str) -> str:
        """Return a ``<program>_<page>_<0..9999>`` id that is stable across runs.

        The built-in ``hash()`` of a string is salted per process, so ids
        derived from it change between runs; an MD5 digest does not.
        """
        digest = hashlib.md5(text.encode('utf-8')).hexdigest()
        return f'{program_id}_{page_num}_{int(digest[:8], 16) % 10000}'

    @staticmethod
    def _matching_cells(row: list, headers: list, indicators):
        """Yield non-empty cells whose header contains any of ``indicators``."""
        for cell, header in zip(row, headers):
            if cell and any(indicator in header for indicator in indicators):
                yield cell

    def _extract_int(self, row: list, headers: list, indicators, default: int) -> int:
        """Return the first integer found in a matching cell, else ``default``."""
        for cell in self._matching_cells(row, headers, indicators):
            match = re.search(r'\d+', cell)
            if match:
                return int(match.group())
        return default

    def _extract_name(self, row: list, headers: list) -> str:
        """Return the course name: a cell under a name-like header, else the first cell."""
        name_indicators = ('название', 'дисциплина', 'курс', 'предмет', 'name', 'course')

        for cell in self._matching_cells(row, headers, name_indicators):
            return cell

        if row and row[0]:
            return row[0]

        return ''

    def _extract_semester(self, row: list, headers: list) -> int:
        """Return the semester number from a semester-like column, defaulting to 1."""
        return self._extract_int(row, headers, ('семестр', 'semester', 'сем'), 1)

    def _extract_credits(self, row: list, headers: list) -> int:
        """Return the credit count from a credit-like column, defaulting to 0."""
        return self._extract_int(row, headers, ('кредит', 'credit', 'зет', 'з.е.'), 0)

    def _extract_hours(self, row: list, headers: list) -> int:
        """Return the hour count from an hour-like column, defaulting to 0."""
        return self._extract_int(row, headers, ('час', 'hour', 'ауд'), 0)

    def _extract_type(self, row: list, headers: list) -> str:
        """Return ``'required'`` or ``'elective'`` from a type-like column.

        Defaults to ``'required'`` when no matching cell is recognised.
        """
        for cell in self._matching_cells(row, headers, ('тип', 'type', 'вид')):
            text = cell.lower()
            if any(word in text for word in ('обязательная', 'required', 'обяз')):
                return 'required'
            if any(word in text for word in ('по выбору', 'elective', 'выбор')):
                return 'elective'

        return 'required'

    def _parse_text_fallback(self, page, page_num: int, program_id: str,
                             pdf_path: str = '') -> List[Dict]:
        """Heuristically read courses from the page text when tables gave nothing.

        A line containing "семестр" updates the current semester (when it
        contains a number) and is skipped; every other line longer than ten
        characters that is not purely digits becomes a course.
        """
        courses: List[Dict] = []

        try:
            text = page.extract_text()
        except Exception as e:
            print(f'Ошибка fallback парсинга страницы {page_num}: {e}')
            return courses

        if not text:
            return courses

        source_pdf = os.path.basename(pdf_path)
        current_semester = 1

        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue

            if 'семестр' in line.lower():
                match = re.search(r'\d+', line)
                if match:
                    current_semester = int(match.group())
                continue

            if len(line) > 10 and not line.isdigit():
                courses.append({
                    'id': self._make_course_id(program_id, page_num, line),
                    'program_id': program_id,
                    'semester': current_semester,
                    'name': line,
                    'credits': 0,
                    'hours': 0,
                    'type': 'required',
                    # Was basename(program_id), which recorded the program
                    # id instead of the source file.
                    'source_pdf': source_pdf,
                    'source_page': page_num
                })

        return courses
|
| 233 |
+
|
| 234 |
+
def main():
    """Download a sample PDF and print how many courses were parsed from it."""
    test_url = 'https://example.com/test.pdf'
    test_filename = 'test.pdf'

    pdf_parser = PDFParser()
    local_path = pdf_parser.download_pdf(test_url, test_filename)
    if not local_path:
        # Download failed; there is nothing to parse.
        return

    courses = pdf_parser.parse_pdf(local_path, 'test_program')
    print(f'Найдено курсов: {len(courses)}')


if __name__ == '__main__':
    main()
|
templates/index.html
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="ru">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>ITMO Магистратура - Чат-бот</title>
|
| 7 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 8 |
+
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
|
| 9 |
+
<style>
|
| 10 |
+
body {
|
| 11 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 12 |
+
min-height: 100vh;
|
| 13 |
+
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
| 14 |
+
}
|
| 15 |
+
.main-container {
|
| 16 |
+
background: rgba(255, 255, 255, 0.95);
|
| 17 |
+
border-radius: 20px;
|
| 18 |
+
box-shadow: 0 20px 40px rgba(0,0,0,0.1);
|
| 19 |
+
margin: 20px auto;
|
| 20 |
+
max-width: 1200px;
|
| 21 |
+
}
|
| 22 |
+
.chat-container {
|
| 23 |
+
height: 400px;
|
| 24 |
+
overflow-y: auto;
|
| 25 |
+
border: 1px solid #dee2e6;
|
| 26 |
+
border-radius: 10px;
|
| 27 |
+
padding: 15px;
|
| 28 |
+
background: #f8f9fa;
|
| 29 |
+
}
|
| 30 |
+
.message {
|
| 31 |
+
margin-bottom: 15px;
|
| 32 |
+
padding: 10px 15px;
|
| 33 |
+
border-radius: 15px;
|
| 34 |
+
max-width: 80%;
|
| 35 |
+
}
|
| 36 |
+
.user-message {
|
| 37 |
+
background: #007bff;
|
| 38 |
+
color: white;
|
| 39 |
+
margin-left: auto;
|
| 40 |
+
}
|
| 41 |
+
.bot-message {
|
| 42 |
+
background: #e9ecef;
|
| 43 |
+
color: #333;
|
| 44 |
+
}
|
| 45 |
+
.loading {
|
| 46 |
+
display: none;
|
| 47 |
+
text-align: center;
|
| 48 |
+
padding: 20px;
|
| 49 |
+
}
|
| 50 |
+
.spinner-border-sm {
|
| 51 |
+
width: 1rem;
|
| 52 |
+
height: 1rem;
|
| 53 |
+
}
|
| 54 |
+
.card {
|
| 55 |
+
border: none;
|
| 56 |
+
box-shadow: 0 5px 15px rgba(0,0,0,0.08);
|
| 57 |
+
border-radius: 15px;
|
| 58 |
+
}
|
| 59 |
+
.btn-primary {
|
| 60 |
+
background: linear-gradient(45deg, #667eea, #764ba2);
|
| 61 |
+
border: none;
|
| 62 |
+
border-radius: 25px;
|
| 63 |
+
padding: 10px 25px;
|
| 64 |
+
}
|
| 65 |
+
.btn-secondary {
|
| 66 |
+
background: linear-gradient(45deg, #6c757d, #495057);
|
| 67 |
+
border: none;
|
| 68 |
+
border-radius: 25px;
|
| 69 |
+
padding: 10px 25px;
|
| 70 |
+
}
|
| 71 |
+
.form-control, .form-select {
|
| 72 |
+
border-radius: 10px;
|
| 73 |
+
border: 2px solid #e9ecef;
|
| 74 |
+
}
|
| 75 |
+
.form-control:focus, .form-select:focus {
|
| 76 |
+
border-color: #667eea;
|
| 77 |
+
box-shadow: 0 0 0 0.2rem rgba(102, 126, 234, 0.25);
|
| 78 |
+
}
|
| 79 |
+
</style>
|
| 80 |
+
</head>
|
| 81 |
+
<body>
|
| 82 |
+
<div class="container-fluid">
|
| 83 |
+
<div class="main-container p-4">
|
| 84 |
+
<!-- Заголовок -->
|
| 85 |
+
<div class="text-center mb-4">
|
| 86 |
+
<h1 class="display-4 text-primary">
|
| 87 |
+
<i class="fas fa-robot"></i> ITMO Магистратура - Чат-бот
|
| 88 |
+
</h1>
|
| 89 |
+
<p class="lead text-muted">Задавайте вопросы о программах ИИ и AI Product, получайте персональные рекомендации по курсам</p>
|
| 90 |
+
<div class="row justify-content-center">
|
| 91 |
+
<div class="col-md-3">
|
| 92 |
+
<div class="card text-center">
|
| 93 |
+
<div class="card-body">
|
| 94 |
+
<i class="fas fa-graduation-cap fa-2x text-primary"></i>
|
| 95 |
+
<h5 class="card-title">{{ courses_count }}</h5>
|
| 96 |
+
<p class="card-text">Курсов загружено</p>
|
| 97 |
+
</div>
|
| 98 |
+
</div>
|
| 99 |
+
</div>
|
| 100 |
+
</div>
|
| 101 |
+
</div>
|
| 102 |
+
|
| 103 |
+
<div class="row">
|
| 104 |
+
<!-- Чат -->
|
| 105 |
+
<div class="col-lg-8">
|
| 106 |
+
<div class="card">
|
| 107 |
+
<div class="card-header bg-primary text-white">
|
| 108 |
+
<h5 class="mb-0"><i class="fas fa-comments"></i> Чат с ботом</h5>
|
| 109 |
+
</div>
|
| 110 |
+
<div class="card-body">
|
| 111 |
+
<div class="chat-container" id="chatContainer">
|
| 112 |
+
<div class="message bot-message">
|
| 113 |
+
<strong>Бот:</strong> Привет! Я помогу тебе узнать больше о магистерских программах ITMO. Задавай вопросы о курсах, программах и получай персональные рекомендации!
|
| 114 |
+
</div>
|
| 115 |
+
</div>
|
| 116 |
+
|
| 117 |
+
<div class="loading" id="loading">
|
| 118 |
+
<div class="spinner-border text-primary" role="status">
|
| 119 |
+
<span class="visually-hidden">Загрузка...</span>
|
| 120 |
+
</div>
|
| 121 |
+
<p class="mt-2">Бот думает...</p>
|
| 122 |
+
</div>
|
| 123 |
+
|
| 124 |
+
<div class="input-group mt-3">
|
| 125 |
+
<input type="text" class="form-control" id="messageInput"
|
| 126 |
+
placeholder="Спрашивайте о дисциплинах, программах, учебных планах...">
|
| 127 |
+
<button class="btn btn-primary" type="button" id="sendBtn">
|
| 128 |
+
<i class="fas fa-paper-plane"></i> Отправить
|
| 129 |
+
</button>
|
| 130 |
+
<button class="btn btn-outline-secondary" type="button" id="clearBtn">
|
| 131 |
+
<i class="fas fa-trash"></i> Очистить
|
| 132 |
+
</button>
|
| 133 |
+
</div>
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
<!-- Рекомендации -->
|
| 139 |
+
<div class="col-lg-4">
|
| 140 |
+
<div class="card">
|
| 141 |
+
<div class="card-header bg-success text-white">
|
| 142 |
+
<h5 class="mb-0"><i class="fas fa-user-graduate"></i> Профиль для рекомендаций</h5>
|
| 143 |
+
</div>
|
| 144 |
+
<div class="card-body">
|
| 145 |
+
<form id="recommendationsForm">
|
| 146 |
+
<div class="mb-3">
|
| 147 |
+
<label class="form-label">Опыт программирования (0-5)</label>
|
| 148 |
+
<input type="range" class="form-range" id="programmingExp" min="0" max="5" value="2">
|
| 149 |
+
<div class="d-flex justify-content-between">
|
| 150 |
+
<small>Нет опыта</small>
|
| 151 |
+
<small>Эксперт</small>
|
| 152 |
+
</div>
|
| 153 |
+
</div>
|
| 154 |
+
|
| 155 |
+
<div class="mb-3">
|
| 156 |
+
<label class="form-label">Уровень математики (0-4)</label>
|
| 157 |
+
<input type="range" class="form-range" id="mathLevel" min="0" max="4" value="2">
|
| 158 |
+
<div class="d-flex justify-content-between">
|
| 159 |
+
<small>Базовый</small>
|
| 160 |
+
<small>Продвинутый</small>
|
| 161 |
+
</div>
|
| 162 |
+
</div>
|
| 163 |
+
|
| 164 |
+
<div class="mb-3">
|
| 165 |
+
<label class="form-label">Интересы</label>
|
| 166 |
+
<div class="row">
|
| 167 |
+
<div class="col-6">
|
| 168 |
+
<div class="form-check">
|
| 169 |
+
<input class="form-check-input" type="checkbox" value="ml" id="ml" checked>
|
| 170 |
+
<label class="form-check-label" for="ml">ML</label>
|
| 171 |
+
</div>
|
| 172 |
+
<div class="form-check">
|
| 173 |
+
<input class="form-check-input" type="checkbox" value="dl" id="dl">
|
| 174 |
+
<label class="form-check-label" for="dl">DL</label>
|
| 175 |
+
</div>
|
| 176 |
+
<div class="form-check">
|
| 177 |
+
<input class="form-check-input" type="checkbox" value="nlp" id="nlp">
|
| 178 |
+
<label class="form-check-label" for="nlp">NLP</label>
|
| 179 |
+
</div>
|
| 180 |
+
<div class="form-check">
|
| 181 |
+
<input class="form-check-input" type="checkbox" value="cv" id="cv">
|
| 182 |
+
<label class="form-check-label" for="cv">CV</label>
|
| 183 |
+
</div>
|
| 184 |
+
<div class="form-check">
|
| 185 |
+
<input class="form-check-input" type="checkbox" value="product" id="product">
|
| 186 |
+
<label class="form-check-label" for="product">Product</label>
|
| 187 |
+
</div>
|
| 188 |
+
<div class="form-check">
|
| 189 |
+
<input class="form-check-input" type="checkbox" value="business" id="business">
|
| 190 |
+
<label class="form-check-label" for="business">Business</label>
|
| 191 |
+
</div>
|
| 192 |
+
</div>
|
| 193 |
+
<div class="col-6">
|
| 194 |
+
<div class="form-check">
|
| 195 |
+
<input class="form-check-input" type="checkbox" value="research" id="research">
|
| 196 |
+
<label class="form-check-label" for="research">Research</label>
|
| 197 |
+
</div>
|
| 198 |
+
<div class="form-check">
|
| 199 |
+
<input class="form-check-input" type="checkbox" value="data" id="data">
|
| 200 |
+
<label class="form-check-label" for="data">Data</label>
|
| 201 |
+
</div>
|
| 202 |
+
<div class="form-check">
|
| 203 |
+
<input class="form-check-input" type="checkbox" value="systems" id="systems">
|
| 204 |
+
<label class="form-check-label" for="systems">Systems</label>
|
| 205 |
+
</div>
|
| 206 |
+
<div class="form-check">
|
| 207 |
+
<input class="form-check-input" type="checkbox" value="python" id="python">
|
| 208 |
+
<label class="form-check-label" for="python">Python</label>
|
| 209 |
+
</div>
|
| 210 |
+
<div class="form-check">
|
| 211 |
+
<input class="form-check-input" type="checkbox" value="math" id="math">
|
| 212 |
+
<label class="form-check-label" for="math">Math</label>
|
| 213 |
+
</div>
|
| 214 |
+
</div>
|
| 215 |
+
</div>
|
| 216 |
+
</div>
|
| 217 |
+
|
| 218 |
+
<div class="mb-3">
|
| 219 |
+
<label class="form-label">Целевой семестр</label>
|
| 220 |
+
<select class="form-select" id="semester">
|
| 221 |
+
<option value="">Выберите семестр</option>
|
| 222 |
+
<option value="1">1 семестр</option>
|
| 223 |
+
<option value="2">2 семестр</option>
|
| 224 |
+
<option value="3">3 семестр</option>
|
| 225 |
+
<option value="4">4 семестр</option>
|
| 226 |
+
</select>
|
| 227 |
+
</div>
|
| 228 |
+
|
| 229 |
+
<button type="submit" class="btn btn-success w-100 mb-2">
|
| 230 |
+
<i class="fas fa-lightbulb"></i> Получить рекомендации
|
| 231 |
+
</button>
|
| 232 |
+
|
| 233 |
+
<button type="button" class="btn btn-secondary w-100" id="updateBtn">
|
| 234 |
+
<i class="fas fa-sync-alt"></i> Обновить данные
|
| 235 |
+
</button>
|
| 236 |
+
</form>
|
| 237 |
+
|
| 238 |
+
<div class="mt-3">
|
| 239 |
+
<textarea class="form-control" id="recommendationsOutput" rows="8"
|
| 240 |
+
placeholder="Здесь появятся рекомендации..." readonly></textarea>
|
| 241 |
+
</div>
|
| 242 |
+
</div>
|
| 243 |
+
</div>
|
| 244 |
+
</div>
|
| 245 |
+
</div>
|
| 246 |
+
</div>
|
| 247 |
+
</div>
|
| 248 |
+
|
| 249 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
|
| 250 |
+
<script>
|
| 251 |
+
// Chat helpers

/**
 * Append a message bubble to the chat and scroll it into view.
 *
 * @param {string} message - Text to show; rendered as plain text.
 * @param {boolean} [isUser=false] - true for the user's bubble, false for the bot's.
 */
function addMessage(message, isUser = false) {
    const chatContainer = document.getElementById('chatContainer');
    const messageDiv = document.createElement('div');
    messageDiv.className = `message ${isUser ? 'user-message' : 'bot-message'}`;
    // Text nodes collapse newlines by default; keep line breaks in replies.
    messageDiv.style.whiteSpace = 'pre-wrap';

    const author = document.createElement('strong');
    author.textContent = isUser ? 'Вы:' : 'Бот:';
    messageDiv.appendChild(author);

    // Insert the message as a text node, never via innerHTML: it comes from
    // user input or the server, and markup in it must not be interpreted.
    messageDiv.appendChild(document.createTextNode(` ${message}`));

    chatContainer.appendChild(messageDiv);
    chatContainer.scrollTop = chatContainer.scrollHeight;
}
|
| 260 |
+
|
| 261 |
+
/** Show the "bot is thinking" indicator. */
function showLoading() {
    const indicator = document.getElementById('loading');
    indicator.style.display = 'block';
}
|
| 264 |
+
|
| 265 |
+
/** Hide the "bot is thinking" indicator. */
function hideLoading() {
    const indicator = document.getElementById('loading');
    indicator.style.display = 'none';
}
|
| 268 |
+
|
| 269 |
+
// Send the typed message to the chat API and display the reply
async function sendMessage() {
    const inputField = document.getElementById('messageInput');
    const text = inputField.value.trim();

    // Nothing to send for an empty or whitespace-only input.
    if (!text) {
        return;
    }

    addMessage(text, true);
    inputField.value = '';
    showLoading();

    try {
        const reply = await fetch('/api/chat', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ message: text })
        });
        const payload = await reply.json();

        addMessage(reply.ok ? payload.response : `Ошибка: ${payload.error}`);
    } catch (err) {
        addMessage(`Ошибка соединения: ${err.message}`);
    } finally {
        hideLoading();
    }
}
|
| 302 |
+
|
| 303 |
+
// Request course recommendations for the profile entered in the form
async function getRecommendations() {
    const readValue = id => document.getElementById(id).value;
    const programmingExp = readValue('programmingExp');
    const mathLevel = readValue('mathLevel');
    const semester = readValue('semester');

    // A target semester is mandatory.
    if (!semester) {
        alert('Пожалуйста, выберите семестр');
        return;
    }

    // Values of every checked checkbox on the page.
    const interests = Array.from(
        document.querySelectorAll('input[type="checkbox"]:checked'),
        checkbox => checkbox.value
    );

    const output = document.getElementById('recommendationsOutput');
    output.value = 'Генерируем рекомендации...';

    try {
        const response = await fetch('/api/recommendations', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                programming_exp: parseInt(programmingExp),
                math_level: parseInt(mathLevel),
                interests: interests,
                semester: semester
            })
        });
        const data = await response.json();

        output.value = response.ok ? data.response : `Ошибка: ${data.error}`;
    } catch (error) {
        output.value = `Ошибка соединения: ${error.message}`;
    }
}
|
| 347 |
+
|
| 348 |
+
// Ask the server to refresh its course data
async function updateData() {
    const statusBox = document.getElementById('recommendationsOutput');
    statusBox.value = 'Обновляем данные...';

    try {
        const response = await fetch('/api/update', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' }
        });
        const data = await response.json();

        if (!response.ok) {
            statusBox.value = `Ошибка: ${data.error}`;
            return;
        }

        statusBox.value = data.message;
        // Reload the page so the course counter is refreshed.
        location.reload();
    } catch (error) {
        statusBox.value = `Ошибка соединения: ${error.message}`;
    }
}
|
| 373 |
+
|
| 374 |
+
// Clear the chat
/**
 * Replace the chat contents with the single initial bot greeting.
 * The markup below is a fixed string; no user input is interpolated.
 */
function clearChat() {
    const chatContainer = document.getElementById('chatContainer');
    chatContainer.innerHTML = `
        <div class="message bot-message">
            <strong>Бот:</strong> Привет! Я помогу тебе узнать больше о магистерских программах ITMO. Задавай вопросы о курсах, программах и получай персональные рекомендации!
        </div>
    `;
}
|
| 383 |
+
|
| 384 |
+
// Обработчики событий
|
| 385 |
+
// Wire the page controls to their handlers. Registration order is kept as-is.

// Chat: send on button click or when Enter is pressed in the input field.
document.getElementById('sendBtn').addEventListener('click', sendMessage);
document.getElementById('messageInput').addEventListener('keypress', (event) => {
    if (event.key !== 'Enter') {
        return;
    }
    sendMessage();
});

// Chat: reset the conversation to the greeting message.
document.getElementById('clearBtn').addEventListener('click', clearChat);

// Recommendations: handle the form in JavaScript instead of a page submit.
document.getElementById('recommendationsForm').addEventListener('submit', (event) => {
    event.preventDefault();
    getRecommendations();
});

// Data refresh button.
document.getElementById('updateBtn').addEventListener('click', updateData);
|
| 397 |
+
</script>
|
| 398 |
+
</body>
|
| 399 |
+
</html>
|
tests/test_filter.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 4 |
+
|
| 5 |
+
from knowledge_base import KnowledgeBase
|
| 6 |
+
|
| 7 |
+
def test_itmo_query_filter():
    """Check that ``KnowledgeBase.is_itmo_query`` classifies sample queries.

    Each case pairs a query string with the expected verdict. Every case is
    printed with a pass/fail mark, and the test fails at the end if any
    verdict differs from its expectation.

    Raises:
        AssertionError: If at least one query is classified differently from
            its expected value.
    """
    kb = KnowledgeBase()

    test_cases = [
        ('Какие дисциплины по NLP в 1 семестре программы ИИ?', True),
        ('Расскажи о программе AI Product', True),
        ('Сколько кредитов за курс машинного обучения?', True),
        ('Какая погода в Санкт-Петербурге?', False),
        ('Как приготовить борщ?', False),
        ('Расскажи о программе ИИ в ITMO', True),
        ('Какие курсы по глубокому обучению?', True),
        ('Как добраться до метро?', False),
        ('Учебный план магистратуры', True),
        ('Дисциплины по компьютерному зрению', True),
    ]

    print('Тестирование фильтра релевантности...')

    mismatches = []
    for query, expected in test_cases:
        result = kb.is_itmo_query(query)
        status = '✓' if result == expected else '✗'
        print(f'{status} "{query}" -> {result} (ожидалось {expected})')
        if result != expected:
            mismatches.append((query, result, expected))

    print('\nТест завершен')

    # Without this assertion the function only prints, so a test runner would
    # report success even when the filter misclassifies queries. Asserting
    # after the loop keeps the full per-case report visible on failure.
    assert not mismatches, (
        f'{len(mismatches)} misclassified queries '
        f'(query, got, expected): {mismatches}'
    )


if __name__ == '__main__':
    test_itmo_query_filter()
|
tests/test_recommend.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 4 |
+
|
| 5 |
+
from knowledge_base import KnowledgeBase
|
| 6 |
+
|
| 7 |
+
def test_recommendations():
    """Print the top recommendations for three sample student profiles.

    For every profile, ``KnowledgeBase.recommend`` is called with the profile
    dict. The number of recommendations and the name and semester of up to
    the first three are printed, or a "not found" line when the result is
    empty. Nothing is asserted; the output is meant for manual inspection.
    """
    kb = KnowledgeBase()

    # (label, programming_experience, math_level, interests, semester)
    profile_rows = (
        ('ML профиль', 4, 3, ['ml', 'dl', 'nlp'], 1),
        ('Product профиль', 2, 1, ['product', 'business'], 2),
        ('Research профиль', 3, 4, ['research', 'math', 'stats'], 3),
    )

    print('Тестирование системы рекомендаций...')

    for label, programming_experience, math_level, interests, semester in profile_rows:
        profile = {
            'programming_experience': programming_experience,
            'math_level': math_level,
            'interests': interests,
            'semester': semester,
        }

        print(f'\n{label}:')
        recommendations = kb.recommend(profile)

        if not recommendations:
            print('Рекомендации не найдены')
            continue

        print(f'Найдено рекомендаций: {len(recommendations)}')
        top_three = recommendations[:3]
        for position, rec in enumerate(top_three, start=1):
            print(f'  {position}. {rec["name"]} ({rec["semester"]} семестр)')

    print('\nТест завершен')


if __name__ == '__main__':
    test_recommendations()
|