Spaces:
Sleeping
Sleeping
Pygmales commited on
Commit ·
2b7b752
1
Parent(s): 682e2a1
loaded project
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +17 -0
- .gitignore +69 -0
- Dockerfile +37 -0
- config.py +186 -0
- main.py +163 -0
- requirements.txt +39 -0
- src/__init__.py +0 -0
- src/apps/__init__.py +0 -0
- src/apps/chat/__init__.py +0 -0
- src/apps/chat/app.py +324 -0
- src/apps/dbapp/app.py +44 -0
- src/apps/dbapp/backup.py +191 -0
- src/apps/dbapp/collections.py +8 -0
- src/apps/dbapp/config.py +350 -0
- src/apps/dbapp/framebase.py +15 -0
- src/apps/dbapp/imports.py +244 -0
- src/apps/dbapp/mainframe.py +8 -0
- src/apps/dbapp/query.py +108 -0
- src/apps/dbapp/utilclasses.py +38 -0
- src/cache/__init__.py +0 -0
- src/cache/cache.py +75 -0
- src/cache/cache_base.py +19 -0
- src/cache/cache_metrics.py +28 -0
- src/cache/cache_strategies.py +88 -0
- src/cache/utils.py +5 -0
- src/config/__init__.py +39 -0
- src/config/configs.py +249 -0
- src/const/agent_response_constants.py +209 -0
- src/const/cc_whitelist.py +3 -0
- src/const/data_consent_constants.py +60 -0
- src/const/page_blacklist.py +5 -0
- src/const/page_priority.py +137 -0
- src/database/__init__.py +0 -0
- src/database/docker-compose-cache.yml +27 -0
- src/database/docker-compose.yml +29 -0
- src/database/redisservice.py +53 -0
- src/database/weavservice.py +851 -0
- src/notification/__init__.py +0 -0
- src/notification/notification_center.py +148 -0
- src/pipeline/__init__.py +0 -0
- src/pipeline/pipeline.py +212 -0
- src/pipeline/processors.py +303 -0
- src/pipeline/utilclasses.py +3 -0
- src/pipeline/utils/__init__.py +3 -0
- src/pipeline/utils/serializer.py +58 -0
- src/pipeline/utils/strategies_processor.py +74 -0
- src/pipeline/utils/utilclasses.py +13 -0
- src/rag/__init__.py +0 -0
- src/rag/agent_chain.py +1022 -0
- src/rag/input_handler.py +147 -0
.dockerignore
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
.env
|
| 4 |
+
__pycache__
|
| 5 |
+
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
venv/
|
| 8 |
+
logs/
|
| 9 |
+
data/database/backups/
|
| 10 |
+
tests/
|
| 11 |
+
docs/
|
| 12 |
+
README.md
|
| 13 |
+
README_UPDATES.md
|
| 14 |
+
*.md
|
| 15 |
+
.github/
|
| 16 |
+
.pytest_cache/
|
| 17 |
+
htmlcov/
|
.gitignore
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Virtual environment
|
| 7 |
+
.env
|
| 8 |
+
.venv/
|
| 9 |
+
env/
|
| 10 |
+
venv/
|
| 11 |
+
ENV/
|
| 12 |
+
env.bak/
|
| 13 |
+
venv.bak/
|
| 14 |
+
|
| 15 |
+
# Environment variables
|
| 16 |
+
.env
|
| 17 |
+
|
| 18 |
+
# VS Code settings
|
| 19 |
+
.vscode/
|
| 20 |
+
|
| 21 |
+
# MacOS system files
|
| 22 |
+
.DS_Store
|
| 23 |
+
|
| 24 |
+
# Jupyter Notebook checkpoints
|
| 25 |
+
.ipynb_checkpoints/
|
| 26 |
+
|
| 27 |
+
# Logs
|
| 28 |
+
*.log
|
| 29 |
+
|
| 30 |
+
# Cache and temp files
|
| 31 |
+
*.tmp
|
| 32 |
+
*.swp
|
| 33 |
+
*.bak
|
| 34 |
+
.cache/
|
| 35 |
+
*.sqlite3
|
| 36 |
+
*.db
|
| 37 |
+
|
| 38 |
+
# Data files
|
| 39 |
+
*.pdf
|
| 40 |
+
*.json
|
| 41 |
+
*.jsonl
|
| 42 |
+
|
| 43 |
+
# Output folders
|
| 44 |
+
dist/
|
| 45 |
+
build/
|
| 46 |
+
*.egg-info/
|
| 47 |
+
|
| 48 |
+
# Output data
|
| 49 |
+
data/
|
| 50 |
+
|
| 51 |
+
# Pyright config may differ from one platform to another
|
| 52 |
+
pyrightconfig.json
|
| 53 |
+
|
| 54 |
+
# Pycharm
|
| 55 |
+
.idea/
|
| 56 |
+
|
| 57 |
+
# OS junk
|
| 58 |
+
.Trashes.env
|
| 59 |
+
.env
|
| 60 |
+
.env
|
| 61 |
+
|
| 62 |
+
#idk
|
| 63 |
+
--source-branch
|
| 64 |
+
--source-repo
|
| 65 |
+
/.gradio/certificate.pem
|
| 66 |
+
|
| 67 |
+
#feedback I just uploaded into the same file to check for accuracy
|
| 68 |
+
chatbot emba x.docx
|
| 69 |
+
IEBMA Test Cards 1_2.docx
|
Dockerfile
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ============================== Initial Building =============================
|
| 2 |
+
FROM python:3.11.14-slim-bookworm AS builder
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# CPU-only PyTorch
|
| 7 |
+
RUN pip install --no-cache-dir torch torchvision torchaudio \
|
| 8 |
+
--index-url https://download.pytorch.org/whl/cpu
|
| 9 |
+
|
| 10 |
+
# Python dependencies
|
| 11 |
+
COPY requirements.txt .
|
| 12 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 13 |
+
# ============================== Size Reduction ===============================
|
| 14 |
+
FROM python:3.11.14-slim-bookworm
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Only necessary dependencies from builder
|
| 19 |
+
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
|
| 20 |
+
COPY --from=builder /usr/local/bin /usr/local/bin
|
| 21 |
+
|
| 22 |
+
# System dependencies for runtime
|
| 23 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 24 |
+
libmagic1 \
|
| 25 |
+
poppler-utils \
|
| 26 |
+
curl \
|
| 27 |
+
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
| 28 |
+
|
| 29 |
+
# ============================ Final Compilation ==============================
|
| 30 |
+
COPY . .
|
| 31 |
+
|
| 32 |
+
EXPOSE 7860
|
| 33 |
+
|
| 34 |
+
HEALTHCHECK --interval=60s --timeout=10s --retries=3 \
|
| 35 |
+
CMD curl -f http://localhost:7860/health || exit 1
|
| 36 |
+
|
| 37 |
+
CMD ["python", "main.py", "--app", "de"]
|
config.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration settings for the Executive Education RAG Chatbot.
|
| 3 |
+
PLEASE CONSIDER READING THE 'docs/configuration_system_documentation.md' TO PROPERLY USE THE NEW CONFIGURATION SYSTEM.
|
| 4 |
+
"""
|
| 5 |
+
# ========================================= General Configuration ===========================================
|
| 6 |
+
|
| 7 |
+
# A list of ISO 639 language codes. Defines a list of languages in which
|
| 8 |
+
# the application can operate. Defaults to ['en', 'de'].
|
| 9 |
+
AVAILABLE_LANGUAGES = ['en', 'de']
|
| 10 |
+
|
| 11 |
+
# A string representing a path (relative to the project root or absolute) to the directory
|
| 12 |
+
# where the data output files such as scraping or document processing outputs will be stored.
|
| 13 |
+
DATA_PATH = 'data'
|
| 14 |
+
|
| 15 |
+
# A string representing a path (relative to the project root or absolute) to the directory
|
| 16 |
+
# where the log files will be stored.
|
| 17 |
+
LOGS_PATH = 'logs'
|
| 18 |
+
|
| 19 |
+
# =================================== Conversation State Configuration ======================================
|
| 20 |
+
|
| 21 |
+
# A boolean; either True or False. Enables the collection of user preferences
|
| 22 |
+
# during conversation to avoid repetitive questions. Defaults to True.
|
| 23 |
+
TRACK_USER_PROFILE = True
|
| 24 |
+
|
| 25 |
+
# An integer. Defines the amount of user messages after which the language
|
| 26 |
+
# of the conversation will be locked. If set to 0, the language will not be locked.
|
| 27 |
+
LOCK_LANGUAGE_AFTER_N_MESSAGES = 3
|
| 28 |
+
|
| 29 |
+
# An integer. Sets the maximum amount of conversation turns as the sum of user queries
|
| 30 |
+
# and agent responses. The conversation ends after the maximum turns amount is reached.
|
| 31 |
+
MAX_CONVERSATION_TURNS = 20
|
| 32 |
+
|
| 33 |
+
# ============================================ LLM Configuration ============================================
|
| 34 |
+
|
| 35 |
+
# A string, either 'openai', 'groq', 'open_router' or 'ollama' (local).
|
| 36 |
+
# Defines the main model provider for the application.
|
| 37 |
+
LLM_PROVIDER = 'openai'
|
| 38 |
+
|
| 39 |
+
# A string. Defines the model that will be used by the application agents.
|
| 40 |
+
OPENAI_MODEL = 'gpt-5.1'
|
| 41 |
+
# GROQ_MODEL =
|
| 42 |
+
# OLLAMA_MODEL =
|
| 43 |
+
# OPEN_ROUTER_MODEL =
|
| 44 |
+
|
| 45 |
+
# ==================================== Weaviate Database Configuration ======================================
|
| 46 |
+
|
| 47 |
+
# A boolean; either True or False.
|
| 48 |
+
# Defines whether the database is set as a local instance (via Docker container),
|
| 49 |
+
# or as a cloud service. More information on https://docs.weaviate.io/weaviate.
|
| 50 |
+
WEAVIATE_IS_LOCAL = False
|
| 51 |
+
|
| 52 |
+
# A string. Defines the name of the collections stored in the database.
|
| 53 |
+
# For each available language a new collection will be created
|
| 54 |
+
# with set name <WEAVIATE_COLLECTION_BASENAME>_<LANGUAGE>.
|
| 55 |
+
WEAVIATE_COLLECTION_BASENAME = 'hsg_rag_content'
|
| 56 |
+
|
| 57 |
+
# A string; either 'manual', 'filesystem' (local instance), 's3' (AWS).
|
| 58 |
+
# Defines the service for storing the database backups.
|
| 59 |
+
# More information on https://docs.weaviate.io/deploy/configuration/backups.
|
| 60 |
+
WEAVIATE_BACKUP_METHOD = 'manual'
|
| 61 |
+
|
| 62 |
+
# A string representing a path in the system where backups will be stored
|
| 63 |
+
# only if WEAVIATE_BACKUP_METHOD is set to 'manual'.
|
| 64 |
+
BACKUPS_PATH = 'data/database/backups'
|
| 65 |
+
|
| 66 |
+
# A string representing a system path where collection properties will be stored.
|
| 67 |
+
PROPERTIES_PATH = 'data/database'
|
| 68 |
+
|
| 69 |
+
# A string representing a system path where property strategies will be stored.
|
| 70 |
+
# More information on property strategies in the documentation.
|
| 71 |
+
STRATEGIES_PATH = 'data/database/strategies'
|
| 72 |
+
|
| 73 |
+
# An integer. Defines a connection timeout to the cloud weaviate service (in seconds).
|
| 74 |
+
# Defaults to 90.
|
| 75 |
+
WEAVIATE_INIT_TIMEOUT = 90
|
| 76 |
+
|
| 77 |
+
# An integer. Defines the query response time limit upon querying the database (in seconds).
|
| 78 |
+
# Defaults to 60.
|
| 79 |
+
WEAVIATE_QUERY_TIMEOUT = 60
|
| 80 |
+
|
| 81 |
+
# An integer. Defines the chunk insertion time limit when importing new chunks to database (in seconds).
|
| 82 |
+
# Defaults to 600
|
| 83 |
+
WEAVIATE_INSERT_TIMEOUT = 600
|
| 84 |
+
|
| 85 |
+
# ========================================== Cache Configuration ============================================
|
| 86 |
+
|
| 87 |
+
# A string; either 'local', 'cloud' (Redis) or 'dict'. Defaults to 'cloud'.
|
| 88 |
+
# Sets the default cache mode. More information on cache modes in documentation.
|
| 89 |
+
CACHE_MODE = 'cloud'
|
| 90 |
+
|
| 91 |
+
# An integer. Sets the reset time (time to live) in seconds for the cache storage.
|
| 92 |
+
# The cache storage will be cleared upon reset time exceedance.
|
| 93 |
+
# Defaults to 86400 seconds (24 hours).
|
| 94 |
+
CACHE_TTL = 86400
|
| 95 |
+
|
| 96 |
+
# An integer. Maximum amount of cached messages that will be held in the cache storage.
|
| 97 |
+
# Defaults to 1000.
|
| 98 |
+
CACHE_MAX_SIZE = 1000
|
| 99 |
+
|
| 100 |
+
# A string. Defines the IP address to access the local cache storage. Defaults to 'localhost'.
|
| 101 |
+
CACHE_LOCAL_HOST = 'localhost'
|
| 102 |
+
|
| 103 |
+
# An integer. Defines the port for accessing the local cache storage. Defaults to 6379.
|
| 104 |
+
CACHE_LOCAL_PORT = 6379
|
| 105 |
+
|
| 106 |
+
# ===================================== Data Processing Configuration =======================================
|
| 107 |
+
|
| 108 |
+
# A string representing the name of an embedding model for embedding generation.
|
| 109 |
+
# The parameter MAX_TOKENS must match this model's maximum token amount.
|
| 110 |
+
EMBEDDING_MODEL = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
|
| 111 |
+
|
| 112 |
+
# A float in range from 0 to 1. Sets the threshold for english language in the language detector.
|
| 113 |
+
# If the language detection certainty is lower than the threshold, the English language will be returned.
|
| 114 |
+
LANG_AMBIGUITY_THRESHOLD = 0.6
|
| 115 |
+
|
| 116 |
+
# An integer. Defines the maximum amount of tokens per single chunk.
|
| 117 |
+
MAX_TOKENS = 512
|
| 118 |
+
|
| 119 |
+
# An integer. Defines the amount of overlapping tokens between chunks to keep the context.
|
| 120 |
+
CHUNK_OVERLAP = 100
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# An integer representing seconds. Defines the maximum waiting time for the target server
|
| 124 |
+
# responses during the scraping procedures.
|
| 125 |
+
SCRAPING_TIMEOUT = 30
|
| 126 |
+
|
| 127 |
+
# An integer. Defines the maximum amount of additional tries that will be performed
|
| 128 |
+
# if the initial request to the server failed.
|
| 129 |
+
SCRAPING_MAX_RETRIES = 3
|
| 130 |
+
|
| 131 |
+
# An integer representing seconds. Defines the waiting interval between two server calls.
|
| 132 |
+
# This value might be overwritten by the delay set by the server.
|
| 133 |
+
SCRAPING_CRAWL_DELAY = 1
|
| 134 |
+
|
| 135 |
+
# A number (integer or float). Defines the backoff base value for retries with exponential backoff.
|
| 136 |
+
# The higher the number, the longer the waiting interval between subsequent retries.
|
| 137 |
+
SCRAPING_BACKOFF_RATE = 1.25
|
| 138 |
+
|
| 139 |
+
# A list of string URLs. Defines the starting points for the website scraping.
|
| 140 |
+
SCRAPING_TARGET_URLS = [
|
| 141 |
+
# 'https://emba.unisg.ch/', # EMBA HSG root
|
| 142 |
+
'https://embax.ch/', # emba X root
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
# Scraping Priority Interval in days
|
| 146 |
+
SCRAPING_PRIO_INTERVAL = {
|
| 147 |
+
"high": 1,
|
| 148 |
+
"medium": 7,
|
| 149 |
+
"low": 30
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
# ======================================== Agent Chain Configuration ========================================
|
| 153 |
+
|
| 154 |
+
# A boolean; either True or False. Activates the response quality evaluation procedure
|
| 155 |
+
# for agentic responses. Defaults to True.
|
| 156 |
+
ENABLE_EVALUATE_RESPONSE_QUALITY = True
|
| 157 |
+
|
| 158 |
+
# A float in range from 0 to 1. Sets the threshold value for the quality evaluation.
|
| 159 |
+
# The fallback mechanism will be activated if the quality of the agentic response
|
| 160 |
+
# is lower than the confidence threshold.
|
| 161 |
+
CONFIDENCE_THRESHOLD = 0.6
|
| 162 |
+
|
| 163 |
+
# An integer. Defines the amount of chunks that should be retrieved from the database
|
| 164 |
+
# upon querying by subagents during conversation. Defaults to 4.
|
| 165 |
+
TOP_K_RETRIEVAL = 4
|
| 166 |
+
|
| 167 |
+
# An integer. Sets the amount of model invocation retries after which the fallback model
|
| 168 |
+
# will be invoked. Defaults to 3.
|
| 169 |
+
MODEL_MAX_RETRIES = 3
|
| 170 |
+
|
| 171 |
+
# An integer. Sets the maximum amount of words in the response from the lead agent.
|
| 172 |
+
MAX_RESPONSE_WORDS_LEAD = 100
|
| 173 |
+
|
| 174 |
+
# An integer. Sets the maximum amount of words in the response for subagents.
|
| 175 |
+
MAX_RESPONSE_WORDS_SUBAGENT = 200
|
| 176 |
+
|
| 177 |
+
# A boolean; either True or False. If response chunking is enabled, long responses
|
| 178 |
+
# from the lead agent will be split and returned through multiple conversation turns.
|
| 179 |
+
ENABLE_RESPONSE_CHUNKING = True
|
| 180 |
+
|
| 181 |
+
# ========================================== Notification Configuration =====================================
|
| 182 |
+
|
| 183 |
+
NOTIFY_ENABLE_EMAIL_ALERTS= True
|
| 184 |
+
NOTIFY_ENABLE_SLACK_ALERTS = True
|
| 185 |
+
|
| 186 |
+
# ===========================================================================================================
|
main.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main entry point for the Executive Education RAG Chatbot.
|
| 3 |
+
"""
|
| 4 |
+
import argparse
|
| 5 |
+
import langsmith
|
| 6 |
+
from langsmith import traceable
|
| 7 |
+
from src.utils.logging import init_logging, get_logger
|
| 8 |
+
from config import AVAILABLE_LANGUAGES
|
| 9 |
+
from src.cache.cache import Cache
|
| 10 |
+
from src.config import config
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Initialize logging
|
| 14 |
+
def logging_startup():
    """Initialise logging and return the logger named 'main_module'."""
    init_logging()
    main_logger = get_logger('main_module')
    return main_logger
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def run_scraper() -> None:
    """Run the website scraping step of the import pipeline.

    Takes no arguments. Logging is initialised first; an info message is
    logged before and after ``ImportPipeline.scrape_website`` is invoked.
    """
    from src.pipeline.pipeline import ImportPipeline

    log = logging_startup()

    log.info("Running scraper...")
    pipeline = ImportPipeline()
    pipeline.scrape_website()
    log.info("Scraping completed.")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def run_importer(sources: list[str]) -> None:
    """Import the given sources through the data import pipeline.

    Args:
        sources: The sources handed unchanged to
            ``ImportPipeline.import_many_documents``.
    """
    from src.pipeline.pipeline import ImportPipeline

    log = logging_startup()

    log.info("Running data import pipeline...")
    pipeline = ImportPipeline()
    pipeline.import_many_documents(sources)
    log.info("Data processing completed.")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def run_weaviate_command(command: str, backup_id: str = None):
    """Run commands to manipulate the database contents.

    Args:
        command: The database action to perform. Handled values are
            'backup', 'restore', 'delete', 'init', 'redo' and 'checkhealth'.
            'redo' deletes and then re-creates the collections; 'init' and
            'redo' both finish with a health check.
        backup_id: Identifier of the backup to restore. Required when
            ``command`` is 'restore'; ignored by every other command.

    Returns:
        None. A 'restore' request without ``backup_id`` is logged as an
        error and aborted before any database service is created.
    """
    from src.database.weavservice import WeaviateService
    logger = logging_startup()

    logger.info(f"Running database command {command}")
    if command == 'restore' and not backup_id:
        logger.error("Backup ID is required to initalize the restore process.")
        # Abort here: falling through would call _restore_backup(None).
        return

    service = WeaviateService()
    if command == 'backup':
        service._create_backup()

    if command == 'restore':
        service._restore_backup(backup_id)

    # 'redo' is delete + init, so it participates in both steps below.
    if command in ('delete', 'redo'):
        service._delete_collections()

    if command in ('init', 'redo'):
        service._create_collections()

    if command in ('checkhealth', 'init', 'redo'):
        service._checkhealth()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def clear_cache():
    """Clear the cache returned by ``Cache.get_cache()``, if there is one."""
    active_cache = Cache.get_cache()
    if not active_cache:
        # Nothing to clear when no cache object is returned.
        return
    active_cache.clear_cache()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def run_application(lang: str, cache_mode, cache) -> None:
    """Configure the cache and start the chatbot web application.

    Args:
        lang: Language code passed to ``ChatbotApplication`` as ``language``.
        cache_mode: First argument forwarded to ``Cache.configure``.
        cache: Second argument forwarded to ``Cache.configure``.
    """
    from src.apps.chat.app import ChatbotApplication

    log = logging_startup()

    # The cache is configured before the application object is built.
    Cache.configure(cache_mode, cache)

    log.info("Starting chatbot web application...")
    ChatbotApplication(language=lang).run()
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def run_dbapp() -> None:
    """Start the database management application."""
    from src.apps.dbapp.app import DatabaseApplication

    log = logging_startup()
    log.info("Starting database application...")
    DatabaseApplication().run()
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def parse_args():
    """Build the command-line interface and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` produced by ``ArgumentParser.parse_args``.
    """
    cli = argparse.ArgumentParser(
        description="University of St. Gallen Executive Education RAG Chatbot",
    )

    # (flag, keyword arguments) pairs, registered in this exact order so the
    # generated --help output keeps its layout.
    option_specs = [
        ("--scrape", dict(
            action="store_true",
            help="Scrapes the data from the HSG website and imports it into the database",
        )),
        ("--imports", dict(
            nargs="+",
            help="Runs the data importing pipeline for the provided files",
        )),
        ("--weaviate", dict(
            type=str,
            choices=['init', 'delete', 'redo', 'checkhealth', 'backup', 'restore'],
            help="Runs different database actions",
        )),
        ("--backup-id", dict(
            type=str,
            help="Required when calling the --weaviate restore command!",
        )),
        ("--cache-mode", dict(
            type=str,
            choices=['local', 'cloud', 'dict'],
            default=config.cache.CACHE_MODE,
            help="Defines whether to use the local or cloud Redis database or the special python dict as cache",
        )),
        # NOTE: store_false — args.cache is True unless --cache is passed,
        # in which case it becomes False.
        ("--cache", dict(
            action="store_false",
            help="(De-)activates the caching mechanism",
        )),
        ("--clear-cache", dict(
            action="store_true",
            help="Clears the cache",
        )),
        ("--cli", dict(
            action="store_true",
            help="Run the chatbot CLI",
        )),
        ("--app", dict(
            type=str,
            choices=AVAILABLE_LANGUAGES,
            help="Run the chatbot web application",
        )),
        ("--dbapp", dict(
            action="store_true",
            help="Run the database management application",
        )),
    ]
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def main():
    """Main entry point for the application.

    Parses the command line and runs the requested components in a fixed
    order: scraping, importing, database command, cache clearing, chatbot
    web application, database application. When no action is requested,
    the chatbot web application is started by default.
    """
    args = parse_args()

    # ``--cache`` is a store_false switch (True unless passed), so it is a
    # modifier, not an action. Counting it made this check always truthy on a
    # bare invocation, leaving the default branch unreachable. ``--clear-cache``
    # is an action and is listed so that it still runs on its own.
    requested_actions = (
        args.scrape,
        args.imports,
        args.weaviate,
        args.clear_cache,
        args.cli,
        args.app,
        args.dbapp,
    )
    if not any(requested_actions):
        # If no action is requested, run the chatbot by default.
        # run_application() requires a language; 'de' is used as the default.
        run_application(lang="de", cache_mode=args.cache_mode, cache=args.cache)
        return

    # Set by any step that changes the database contents, so the cache is
    # cleared afterwards.
    must_clear_cache = False

    # Run the specified components
    if args.scrape:
        must_clear_cache = True
        run_scraper()

    if args.imports:
        must_clear_cache = True
        run_importer(args.imports)

    if args.weaviate:
        if args.weaviate in ["init", "redo", "restore"]:
            must_clear_cache = True
        run_weaviate_command(command=args.weaviate, backup_id=args.backup_id)

    if args.clear_cache or must_clear_cache:
        clear_cache()

    # NOTE(review): --cli is parsed but no handler for it is visible here.

    if args.app:
        run_application(lang=args.app, cache_mode=args.cache_mode, cache=args.cache)

    if args.dbapp:
        run_dbapp()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
if __name__ == "__main__":
|
| 163 |
+
main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
langchain>=1.0.2
|
| 3 |
+
langchain-core>=1.0.1
|
| 4 |
+
langchain-deepseek>=1.0.0
|
| 5 |
+
langchain-groq>=1.0.0
|
| 6 |
+
langchain-ollama>=0.3.10
|
| 7 |
+
langchain-openai>=1.0.1
|
| 8 |
+
langsmith>=0.4.0
|
| 9 |
+
|
| 10 |
+
requests>=2.31.0
|
| 11 |
+
openai>=1.3.0
|
| 12 |
+
python-dotenv>=1.0.0
|
| 13 |
+
colorama>=0.4.6
|
| 14 |
+
|
| 15 |
+
# Language detection
|
| 16 |
+
langdetect>=1.0.9
|
| 17 |
+
|
| 18 |
+
# Transformers for tokenization
|
| 19 |
+
transformers>=4.34.0
|
| 20 |
+
|
| 21 |
+
# Web applications
|
| 22 |
+
gradio>=5.49.1
|
| 23 |
+
|
| 24 |
+
# Processing pipeline
|
| 25 |
+
docling>=2.55.0
|
| 26 |
+
ultimate-sitemap-parser>=1.8.0
|
| 27 |
+
beautifulsoup4>=4.14.3
|
| 28 |
+
fake-useragent>=1.5.1
|
| 29 |
+
|
| 30 |
+
# Weaviate Vector DB
|
| 31 |
+
weaviate-client>=4.16.9
|
| 32 |
+
PyYAML>=6.0
|
| 33 |
+
|
| 34 |
+
# Cache
|
| 35 |
+
cachetools>=5.0.0
|
| 36 |
+
redis>=4.5.5
|
| 37 |
+
|
| 38 |
+
# Scheduling
|
| 39 |
+
apscheduler
|
src/__init__.py
ADDED
|
File without changes
|
src/apps/__init__.py
ADDED
|
File without changes
|
src/apps/chat/__init__.py
ADDED
|
File without changes
|
src/apps/chat/app.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from fastapi import FastAPI
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
from src.const.agent_response_constants import *
|
| 8 |
+
from src.const.data_consent_constants import *
|
| 9 |
+
from src.rag.agent_chain import ExecutiveAgentChain
|
| 10 |
+
from src.utils.logging import get_logger, ConsentLogger
|
| 11 |
+
|
| 12 |
+
logger = get_logger("chatbot_app")
|
| 13 |
+
|
| 14 |
+
def init_fastapi_app(language):
    """Create the FastAPI application exposing the ``/health`` endpoint.

    Args:
        language: Value passed to ``WeaviateService().ping`` on every
            health check.

    Returns:
        The ``FastAPI`` instance with the health route registered.
    """
    api = FastAPI()

    @api.get('/health')
    def healthcheck():
        """Ping Weaviate and report the result as a JSON response.

        Responds with 200 and status 'ok' when the ping status is 'OK',
        503 and 'degraded' when the ping reports another status, and 503
        and 'down' when the check itself raises.
        """
        from src.database.weavservice import WeaviateService
        from fastapi.responses import JSONResponse

        http_status = 200
        # Start from the healthy payload; failure paths overwrite it below.
        payload = {
            'timestamp': datetime.now().isoformat(),
            'status': 'ok',
            'weaviate': True,
        }
        try:
            ping_result = WeaviateService().ping(language)
            if ping_result['status'] != 'OK':
                http_status = 503
                payload.update(
                    status='degraded',
                    weaviate=False,
                    error=str(ping_result['error']),
                )
        except Exception as exc:
            # Any failure while pinging is reported as 'down' rather than
            # propagated out of the endpoint.
            http_status = 503
            payload.update(status='down', weaviate=False, error=str(exc))

        return JSONResponse(status_code=http_status, content=payload)

    return api
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class ChatbotApplication:
|
| 54 |
+
def __init__(self, language: str = "de") -> None:
|
| 55 |
+
self._fastapi_app = init_fastapi_app(language)
|
| 56 |
+
self._gradio_app = gr.Blocks()
|
| 57 |
+
self._app = gr.mount_gradio_app(self._fastapi_app, self._gradio_app, path='/')
|
| 58 |
+
self._language = language
|
| 59 |
+
self._consentLogger = ConsentLogger()
|
| 60 |
+
|
| 61 |
+
with self._gradio_app:
|
| 62 |
+
agent_state = gr.State(None)
|
| 63 |
+
lang_state = gr.State(language)
|
| 64 |
+
consent_state = gr.State(False)
|
| 65 |
+
session_id_state = gr.State(str(uuid.uuid4())) # for consent logging later
|
| 66 |
+
|
| 67 |
+
with gr.Row():
|
| 68 |
+
lang_selector = gr.Radio(
|
| 69 |
+
choices=["Deutsch", "English"],
|
| 70 |
+
value="English" if language == "en" else "Deutsch",
|
| 71 |
+
label="Selected Language",
|
| 72 |
+
interactive=True,
|
| 73 |
+
)
|
| 74 |
+
reset_button = gr.Button("Reset Conversation", visible=False)
|
| 75 |
+
|
| 76 |
+
# ---- Consent Screen (Page 1) ----
|
| 77 |
+
with gr.Column(visible=True) as consent_screen:
|
| 78 |
+
data_policy = gr.Markdown(PRIVACY_NOTICE[language])
|
| 79 |
+
with gr.Row():
|
| 80 |
+
decline_btn = gr.Button(DECLINE[language])
|
| 81 |
+
accept_btn = gr.Button(ACCEPT[language])
|
| 82 |
+
|
| 83 |
+
decline_info = gr.Markdown("", visible=False)
|
| 84 |
+
|
| 85 |
+
# ---- Chat Screen (Page 2) ----
|
| 86 |
+
with gr.Column(visible=False) as chat_screen:
|
| 87 |
+
chat = gr.ChatInterface(
|
| 88 |
+
fn=lambda msg, history, agent: self._chat(
|
| 89 |
+
message=msg, history=history, agent=agent
|
| 90 |
+
),
|
| 91 |
+
additional_inputs=[agent_state],
|
| 92 |
+
title="Executive Education Adviser",
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
with gr.Row():
|
| 96 |
+
withdraw_button = gr.Button(WITHDRAW_TEXT[language], visible=False, variant="stop")
|
| 97 |
+
|
| 98 |
+
def create_session_id() -> str:
|
| 99 |
+
return str(uuid.uuid4())
|
| 100 |
+
|
| 101 |
+
def initialize_agent(lang: str, session_id: str):
|
| 102 |
+
agent = ExecutiveAgentChain(language=lang, session_id=session_id)
|
| 103 |
+
greeting = agent.generate_greeting()
|
| 104 |
+
|
| 105 |
+
disclaimer_html = get_disclaimer_widget(lang)
|
| 106 |
+
|
| 107 |
+
full_content = f"{disclaimer_html}{greeting}"
|
| 108 |
+
|
| 109 |
+
return agent, [{"role": "assistant", "content": full_content}]
|
| 110 |
+
|
| 111 |
+
def label_to_lang_code(label: str) -> str:
|
| 112 |
+
return "en" if label == "English" else "de"
|
| 113 |
+
|
| 114 |
+
# Language change: before consent => only update consent UI text.
|
| 115 |
+
# After consent: keep chat running (or optionally re-init agent on language change).
|
| 116 |
+
def on_language_change(
|
| 117 |
+
language_label: str,
|
| 118 |
+
consent_given: bool,
|
| 119 |
+
agent,
|
| 120 |
+
session_id: str,
|
| 121 |
+
):
|
| 122 |
+
lang_code = label_to_lang_code(language_label)
|
| 123 |
+
|
| 124 |
+
# Before consent: update consent screen text to selected language
|
| 125 |
+
if not consent_given:
|
| 126 |
+
return (
|
| 127 |
+
lang_code,
|
| 128 |
+
gr.update(value=PRIVACY_NOTICE[lang_code]),
|
| 129 |
+
gr.update(value=DECLINE[lang_code]),
|
| 130 |
+
gr.update(value=ACCEPT[lang_code]),
|
| 131 |
+
gr.update(visible=False, value=""),
|
| 132 |
+
None, # agent_state stays None
|
| 133 |
+
None, # chat stays as it is
|
| 134 |
+
gr.update(value=WITHDRAW_TEXT[lang_code], visible=False),
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
# After consent
|
| 138 |
+
new_agent, greeting = initialize_agent(lang_code, session_id=session_id)
|
| 139 |
+
return (
|
| 140 |
+
lang_code,
|
| 141 |
+
gr.update(value=PRIVACY_NOTICE[lang_code]),
|
| 142 |
+
gr.update(value=DECLINE[lang_code]),
|
| 143 |
+
gr.update(value=ACCEPT[lang_code]),
|
| 144 |
+
gr.update(visible=False, value=""),
|
| 145 |
+
new_agent,
|
| 146 |
+
greeting,
|
| 147 |
+
gr.update(value=WITHDRAW_TEXT[lang_code], visible=True),
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
def on_accept(lang: str, session_id: str):
|
| 151 |
+
agent, greeting = initialize_agent(lang, session_id=session_id)
|
| 152 |
+
self._consentLogger.log(session_id, "accepted", policy_version="1.0")
|
| 153 |
+
self._language = lang
|
| 154 |
+
return (
|
| 155 |
+
gr.update(visible=False), # consent_screen hide
|
| 156 |
+
gr.update(visible=True), # chat_screen show
|
| 157 |
+
True, # consent_state
|
| 158 |
+
agent, # agent_state
|
| 159 |
+
greeting, # chat initial history
|
| 160 |
+
gr.update(visible=False, value=""), # decline_info hide
|
| 161 |
+
gr.update(visible=True), # show reset_button
|
| 162 |
+
gr.update(value=WITHDRAW_TEXT[lang], visible=True),
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
def on_decline(lang: str, session_id: str):
|
| 166 |
+
self._language = lang
|
| 167 |
+
self._consentLogger.log(session_id, "declined", policy_version="1.0")
|
| 168 |
+
return (
|
| 169 |
+
gr.update(visible=True), # consent_screen stays
|
| 170 |
+
gr.update(visible=False), # chat_screen stays hidden
|
| 171 |
+
False, # consent_state
|
| 172 |
+
None, # agent_state
|
| 173 |
+
[], # chat history empty
|
| 174 |
+
gr.update(visible=True, value=DECLINE_MESSAGE[lang]),
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
def on_reset_chat(lang: str, session_id: str):
|
| 178 |
+
agent, greeting = initialize_agent(lang, session_id=session_id)
|
| 179 |
+
self._language = lang
|
| 180 |
+
return (
|
| 181 |
+
agent,
|
| 182 |
+
greeting,
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
def on_withdraw(lang: str, agent, session_id: str):
|
| 186 |
+
self._consentLogger.log(session_id, "withdrawn", policy_version="1.0")
|
| 187 |
+
|
| 188 |
+
# 1) wipe server-side
|
| 189 |
+
if agent is not None:
|
| 190 |
+
try:
|
| 191 |
+
agent.wipe_session_data()
|
| 192 |
+
logger.info("wipe_session_data executed")
|
| 193 |
+
except Exception as e:
|
| 194 |
+
logger.error(f"wipe_session_data failed: {e}", exc_info=True)
|
| 195 |
+
|
| 196 |
+
# 2) lock chat again (back to consent screen)
|
| 197 |
+
new_session_id = create_session_id()
|
| 198 |
+
return (
|
| 199 |
+
gr.update(visible=True), # consent_screen
|
| 200 |
+
gr.update(value=PRIVACY_NOTICE[lang]), # data_policy
|
| 201 |
+
gr.update(value=DECLINE[lang]), # decline_btn
|
| 202 |
+
gr.update(value=ACCEPT[lang]), # accept_btn
|
| 203 |
+
gr.update(visible=False), # chat_screen
|
| 204 |
+
gr.update(visible=True, value=WITHDRAW_CONFIRMATION_MESSAGE[lang]), # decline_info
|
| 205 |
+
False, # consent_state
|
| 206 |
+
None, # agent_state
|
| 207 |
+
[], # chat.chatbot_value (history)
|
| 208 |
+
gr.update(visible=False), # reset_button
|
| 209 |
+
gr.update(visible=False), # withdraw_button
|
| 210 |
+
new_session_id, # session_id_state
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
# Language switch updates consent UI if consent not given
|
| 214 |
+
lang_selector.change(
|
| 215 |
+
fn=on_language_change,
|
| 216 |
+
inputs=[lang_selector, consent_state, agent_state, session_id_state],
|
| 217 |
+
outputs=[lang_state,
|
| 218 |
+
data_policy,
|
| 219 |
+
decline_btn,
|
| 220 |
+
accept_btn,
|
| 221 |
+
decline_info,
|
| 222 |
+
agent_state,
|
| 223 |
+
chat.chatbot_value,
|
| 224 |
+
withdraw_button,
|
| 225 |
+
],
|
| 226 |
+
queue=True,
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
# Accept/Decline data consent
|
| 230 |
+
accept_btn.click(
|
| 231 |
+
fn=on_accept,
|
| 232 |
+
inputs=[lang_state, session_id_state],
|
| 233 |
+
outputs=[
|
| 234 |
+
consent_screen,
|
| 235 |
+
chat_screen,
|
| 236 |
+
consent_state,
|
| 237 |
+
agent_state,
|
| 238 |
+
chat.chatbot_value,
|
| 239 |
+
decline_info,
|
| 240 |
+
reset_button,
|
| 241 |
+
withdraw_button,
|
| 242 |
+
],
|
| 243 |
+
queue=True,
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
decline_btn.click(
|
| 247 |
+
fn=on_decline,
|
| 248 |
+
inputs=[lang_state, session_id_state],
|
| 249 |
+
outputs=[consent_screen, chat_screen, consent_state, agent_state, chat.chatbot_value, decline_info],
|
| 250 |
+
queue=True,
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
# Reset
|
| 254 |
+
reset_button.click(
|
| 255 |
+
fn=on_reset_chat,
|
| 256 |
+
inputs=[lang_state, session_id_state],
|
| 257 |
+
outputs=[
|
| 258 |
+
agent_state,
|
| 259 |
+
chat.chatbot_value,
|
| 260 |
+
],
|
| 261 |
+
queue=True,
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
# Withdraw consent
|
| 265 |
+
withdraw_button.click(
|
| 266 |
+
fn=on_withdraw,
|
| 267 |
+
inputs=[lang_state, agent_state, session_id_state],
|
| 268 |
+
outputs=[
|
| 269 |
+
consent_screen,
|
| 270 |
+
data_policy,
|
| 271 |
+
decline_btn,
|
| 272 |
+
accept_btn,
|
| 273 |
+
chat_screen,
|
| 274 |
+
decline_info,
|
| 275 |
+
consent_state,
|
| 276 |
+
agent_state,
|
| 277 |
+
chat.chatbot_value,
|
| 278 |
+
reset_button,
|
| 279 |
+
withdraw_button,
|
| 280 |
+
session_id_state,
|
| 281 |
+
],
|
| 282 |
+
queue=True,
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
@property
def app(self) -> gr.Blocks:
    """Return the application object held in ``self._app``.

    Gives external runners (e.g., HF Spaces) read access to the app
    without reaching into the private attribute.
    """
    application = self._app
    return application
|
| 290 |
+
|
| 291 |
+
def _chat(self, message: str, history: list[dict], agent: ExecutiveAgentChain):
    """Answer one user message using the agent passed in by the chat interface.

    Args:
        message: Text the user typed.
        history: Conversation so far as supplied by the chat interface;
            not read by this method.
        agent: Agent to query, or ``None`` when no agent has been created.

    Returns:
        A list of chat answers: the agent's reply, optionally followed by a
        booking widget (``gr.HTML``); or a single apology string when the
        agent is missing or the query raised.
    """
    if agent is None:
        logger.error("Agent not initialized")
        return ["I apologize, but the chatbot is not properly initialized."]

    answers = []
    try:
        logger.info(f"Processing user query: {message[:100]}...")
        response = agent.query(message)
        answers.append(response.response)

        # Keep the language in a local: ``self._language`` is also written by
        # the consent/reset handlers, so reading it back here could pick up a
        # value set by another callback in between.
        language = response.language
        self._language = language

        if response.show_booking_widget:
            html_code = get_booking_widget(language=language, programs=response.relevant_programs)
            answers.append(gr.HTML(value=html_code))
    except Exception as e:
        # UI boundary: log the full traceback and show a generic apology.
        logger.error(f"Error processing query: {e}", exc_info=True)
        error_message = (
            "I apologize, but I encountered an error processing your request. "
            "Please try rephrasing your question or contact our admissions team for assistance."
        )
        answers.append(error_message)

    return answers
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def run(self, host: str = '0.0.0.0', port: int = 7860):
    """Serve ``self._app`` with uvicorn.

    Args:
        host: Interface to bind to. Defaults to the previously hard-coded
            ``'0.0.0.0'``.
        port: TCP port to listen on. Defaults to the previously hard-coded
            ``7860``.
    """
    # Imported lazily so uvicorn is only required when the app is served
    # through this method.
    import uvicorn

    uvicorn.run(
        self._app,
        host=host,
        port=port,
        log_config=None,
    )
|
src/apps/dbapp/app.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tkinter import *
|
| 2 |
+
from tkinter import ttk
|
| 3 |
+
from src.database.weavservice import WeaviateService
|
| 4 |
+
|
| 5 |
+
from src.apps.dbapp.mainframe import MainFrame
|
| 6 |
+
from src.apps.dbapp.query import QueryFrame
|
| 7 |
+
from src.apps.dbapp.imports import ImportFrame
|
| 8 |
+
from src.apps.dbapp.backup import BackupsFrame
|
| 9 |
+
from src.apps.dbapp.collections import CollectionsFrame
|
| 10 |
+
from src.apps.dbapp.config import SchemaConfigurationFrame
|
| 11 |
+
|
| 12 |
+
from src.utils.logging import get_logger
|
| 13 |
+
|
| 14 |
+
logger = get_logger("db_inter ")
|
| 15 |
+
|
| 16 |
+
class DatabaseApplication:
    """Tk window that hosts the database tools as tabs of a notebook."""

    def __init__(self) -> None:
        """Create the root window, the Weaviate service and all notebook tabs."""
        self._root = Tk()
        self._service = WeaviateService()

        self._root.title("Database Interface")
        self._root.geometry("810x500")

        notebook = ttk.Notebook(self._root)
        notebook.pack(fill=BOTH, expand=True)

        # (frame class, tab caption) in display order.
        tab_specs = (
            (MainFrame, 'Main'),
            (ImportFrame, 'Import'),
            (SchemaConfigurationFrame, 'Schemas'),
            (CollectionsFrame, 'Collections'),
            (QueryFrame, 'Query'),
            (BackupsFrame, 'Backups'),
        )

        # Build every frame first, then attach them, so the order of
        # construction and of ``notebook.add`` calls stays as listed.
        built_tabs = [
            (frame_cls(notebook, self._service).init(), caption)
            for frame_cls, caption in tab_specs
        ]
        for frame, caption in built_tabs:
            notebook.add(frame, text=caption)

        logger.info("Application initialization finished")

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self._root.mainloop()
|
src/apps/dbapp/backup.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, shutil
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
|
| 4 |
+
from tkinter import *
|
| 5 |
+
from tkinter import ttk
|
| 6 |
+
from src.database.weavservice import WeaviateService
|
| 7 |
+
from src.apps.dbapp.framebase import CustomFrameBase
|
| 8 |
+
from src.apps.dbapp.utilclasses import BackupData
|
| 9 |
+
from src.config import config
|
| 10 |
+
|
| 11 |
+
def _load_backup_files():
    """Return one ``BackupData`` per entry found in the configured backup directory.

    The directory is created first if it does not exist yet.
    """
    os.makedirs(config.weaviate.BACKUP_PATH, exist_ok=True)
    return [BackupData(entry) for entry in os.listdir(config.weaviate.BACKUP_PATH)]
|
| 19 |
+
|
| 20 |
+
class BackupsFrame(CustomFrameBase):
    """Notebook tab listing stored backups with create / restore / delete actions."""

    def __init__(self, parent, service: WeaviateService):
        """Store parent and service via the base class and load the backup list."""
        super().__init__(parent, service)
        self._backups = _load_backup_files()

    def init(self) -> ttk.Frame:
        """Build the tab's widgets and return the frame that contains them."""
        # Re-read the directory so the view reflects the state at build time.
        self._backups = _load_backup_files()

        main_frame = ttk.Frame(self._parent)
        main_frame.pack(fill=BOTH, expand=True)

        tree_frame = ttk.Frame(main_frame)
        tree_frame.pack(fill=BOTH, expand=True, padx=10, pady=10)

        label_frame = ttk.Frame(main_frame)
        label_frame.pack(fill=X, expand=True, padx=10, pady=10)

        button_frame = ttk.Frame(main_frame)
        button_frame.pack(fill=X, padx=10, pady=10)

        # Direction the NEXT click on the date heading will sort in.
        date_reverse_sort = True
        columns = ('date', 'size')

        info_label = ttk.Label(label_frame, text="", padding=8)

        def _print_label(msg, backc, forc):
            # update_idletasks repaints the label right away, so progress
            # messages show up before a long-running call starts.
            info_label.configure(text=msg, foreground=forc, background=backc)
            info_label.update_idletasks()

        def print_failure(msg: str):
            _print_label(msg, "#FFCDD2", "#B71C1C")

        def print_info(msg: str):
            _print_label(msg, "#cdedff", "#1c31b7")

        def print_success(msg: str):
            _print_label(msg, "#d7ffcd", "#4db71c")

        tree = ttk.Treeview(
            tree_frame,
            columns=columns,
            show='tree headings',
            selectmode='browse',
        )

        def sort_by_date():
            """Reorder top-level rows by their 'date' cell and flip the direction."""
            nonlocal date_reverse_sort

            data = []
            for p in tree.get_children(""):
                raw = tree.set(p, 'date')
                # Tag each key so parsed and unparsed dates are never compared
                # with each other (datetime vs. str raises TypeError).
                try:
                    key = (1, datetime.strptime(raw, "%d.%m.%Y %H:%M:%S"))
                except (TypeError, ValueError):
                    key = (0, str(raw))
                data.append((key, p))

            data.sort(reverse=date_reverse_sort)
            date_reverse_sort = not date_reverse_sort

            for index, (_, p) in enumerate(data):
                tree.move(p, "", index)

            tree.heading(
                'date',
                text='Created at ' + ('▾' if date_reverse_sort else '▴'),
                command=lambda: sort_by_date()
            )

        tree.heading('#0', text='Backup ID')
        tree.heading('date', text='Created at ▾', command=lambda: sort_by_date())
        tree.heading('size', text='Embeddings amount')

        tree.column("#0", width=100)
        tree.column("date", width=60)
        tree.column("size", width=30)

        def insert_backup(backup):
            """Add a backup row plus one child row per collection it contains."""
            bk = backup.to_treeformat()
            parent = tree.insert('', 0 if not date_reverse_sort else END,
                text=bk['id'],
                values=bk['date']
            )
            for collection in bk['collections']:
                tree.insert(parent, END,
                    text=collection['name'],
                    values=collection['size'],
                )

        for backup in self._backups:
            insert_backup(backup)
        # NOTE(review): assumed to run once after all rows are inserted (the
        # source's indentation is lost) — confirm against the original file.
        sort_by_date()

        def create_backup():
            print_info("Creating new backup...")
            try:
                backup_id = self._service._create_backup()
                backup = BackupData(backup_id)
            except Exception as exc:
                # Show the failure in the UI, then re-raise so Tk's default
                # callback error reporting still prints the traceback.
                print_failure(f"Failed to create backup: {exc}")
                raise

            self._backups.append(backup)
            insert_backup(backup)
            print_success(f"Successfully created new backup {backup._backup_id}!")

        def restore_backup():
            selected = tree.selection()
            if not selected:
                return
            backup = tree.item(selected[0])

            print_info(f"Restoring backup {backup['text']}...")
            try:
                self._service._restore_backup('backup_' + backup['text'])
            except Exception as exc:
                print_failure(f"Failed to restore backup {backup['text']}: {exc}")
                raise
            print_success(f"Successfully restored backup {backup['text']}!")

        def delete_backup():
            selected = tree.selection()
            if not selected:
                return
            item_id = selected[0]
            backup = tree.item(item_id)

            backup_path = os.path.join(config.weaviate.BACKUP_PATH, 'backup_' + backup['text'])
            shutil.rmtree(backup_path, ignore_errors=True)

            # rmtree swallows errors; only report success if the directory is gone.
            if os.path.exists(backup_path):
                print_failure(f"Could not delete backup {backup['text']}.")
                return

            tree.delete(item_id)
            print_success(f"Deleted backup {backup['text']}.")

        create_bkp_btn = ttk.Button(
            button_frame,
            text="Create Backup",
            command=create_backup
        )

        restore_bkp_btn = ttk.Button(
            button_frame,
            text="Restore Backup",
            command=restore_backup,
            state=['disabled']
        )

        delete_bkp_btn = ttk.Button(
            button_frame,
            text="Delete Backup",
            command=delete_backup,
            state=['disabled']
        )

        def on_item_selection(event):
            """Enable restore/delete only while a top-level (backup) row is selected."""
            selected = tree.selection()
            if not selected:
                restore_bkp_btn.state(['disabled'])
                delete_bkp_btn.state(['disabled'])
                return

            item_id = selected[0]
            is_parent = tree.parent(item_id) == ''
            restore_bkp_btn.state(['!disabled' if is_parent else 'disabled'])
            delete_bkp_btn.state(['!disabled' if is_parent else 'disabled'])

        tree.bind("<<TreeviewSelect>>", on_item_selection)

        scrollbar = ttk.Scrollbar(tree_frame, orient="vertical", command=tree.yview)
        tree.configure(yscrollcommand=scrollbar.set)

        info_label.pack()

        tree.pack(side=LEFT, fill=BOTH, expand=True)
        scrollbar.pack(side=RIGHT, fill=Y)

        create_bkp_btn.pack(side=LEFT, padx=5)
        restore_bkp_btn.pack(side=RIGHT, padx=5)
        delete_bkp_btn.pack(side=RIGHT, padx=5)

        return main_frame
|
src/apps/dbapp/collections.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tkinter import *
|
| 2 |
+
from tkinter import ttk
|
| 3 |
+
from src.apps.dbapp.framebase import CustomFrameBase
|
| 4 |
+
from src.database.weavservice import WeaviateService
|
| 5 |
+
|
| 6 |
+
class CollectionsFrame(CustomFrameBase):
    """Notebook tab for collections; adds nothing beyond the base frame here."""

    def __init__(self, parent, service: WeaviateService) -> None:
        """Forward ``parent`` and ``service`` unchanged to the base class."""
        super().__init__(parent, service)
|
src/apps/dbapp/config.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
|
| 3 |
+
from tkinter import *
|
| 4 |
+
from tkinter import ttk
|
| 5 |
+
from src.apps.dbapp.framebase import CustomFrameBase
|
| 6 |
+
from src.utils.stratutils.generator import generate_strategy
|
| 7 |
+
from src.database.weavservice import WeaviateService
|
| 8 |
+
from src.config import config
|
| 9 |
+
|
| 10 |
+
def _dump_schema(schema):
    """Write ``schema`` as indented JSON to ``properties.json`` in the properties directory.

    The directory is created when missing; values JSON cannot encode are
    written via ``str``.
    """
    target_dir = config.weaviate.PROPERTIES_PATH
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, 'properties.json'), 'w', encoding='utf-8') as out_file:
        json.dump(schema, out_file, indent=2, default=str)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class SchemaConfigurationFrame(CustomFrameBase):
|
| 18 |
+
def __init__(self, parent, service: WeaviateService) -> None:
    """Initialise the base frame, then load the schema and its strategies."""
    super().__init__(parent, service)
    # Order matters: _load_strategies iterates over self._schema.
    self._schema = self._load_schema_data()
    self._strategies = self._load_strategies()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _load_strategies(self) -> dict:
    """Return the strategy source text for every schema property.

    For each property in ``self._schema`` the file ``strat_<name>.py`` in the
    strategies directory is read; when it does not exist yet, a strategy is
    generated with ``generate_strategy`` and written to that file first.

    Returns:
        Mapping of property name to strategy source text.
    """
    strategies_dir = config.weaviate.STRATEGIES_PATH
    os.makedirs(strategies_dir, exist_ok=True)
    existing_files = set(os.listdir(strategies_dir))
    strategies = {}

    for name, prop in self._schema.items():
        strategy_file = f"strat_{name}.py"
        file_path = os.path.join(strategies_dir, strategy_file)

        if strategy_file in existing_files:
            # Read with the same encoding the files are written with; the
            # platform default is not UTF-8 everywhere.
            with open(file_path, encoding='utf-8') as f:
                strategy_content = f.read()
        else:
            strategy_content = generate_strategy(name, prop)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(strategy_content)

        strategies[name] = strategy_content

    return strategies
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _save_strategy(self, name, strategy) -> None:
    """Remember ``strategy`` for property ``name`` and write it to ``strat_<name>.py``."""
    target_dir = config.weaviate.STRATEGIES_PATH
    os.makedirs(target_dir, exist_ok=True)
    self._strategies[name] = strategy

    destination = os.path.join(target_dir, f"strat_{name}.py")
    with open(destination, 'w', encoding='utf-8') as out_file:
        out_file.write(strategy)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _load_schema_data(self) -> dict:
    """Build a per-property summary of the first schema entry and dump it to disk.

    Returns:
        Mapping of property name to a dict with the keys ``description``,
        ``data_type``, ``filterable``, ``searchable`` and
        ``skip_vectorization``; empty when the service reports no schema.
    """
    collections = self._service._extract_data()['schema']
    if not collections:
        return {}

    # Only the first entry of the schema list is inspected.
    schema_data = {
        prop['name']: {
            'description': prop.get('description', ''),
            'data_type': prop['dataType'][0],
            'filterable': prop['indexFilterable'],
            'searchable': prop['indexSearchable'],
            'skip_vectorization': prop['moduleConfig']['text2vec-huggingface']['skip'],
        }
        for prop in collections[0]['properties']
    }

    _dump_schema(schema_data)
    return schema_data
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _update_schema_property(self, old_name: str, new_name: str, prop: dict) -> None:
    """Replace the entry ``old_name`` with ``prop`` stored under ``new_name`` and persist.

    Raises:
        KeyError: if ``old_name`` is not in the schema.
    """
    self._schema.pop(old_name)
    self._schema[new_name] = prop
    _dump_schema(self._schema)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _add_schema_property(self, name, prop: dict) -> None:
    """Store ``prop`` under ``name`` in the schema and persist the schema."""
    self._schema.update({name: prop})
    _dump_schema(self._schema)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _delete_schema_property(self, name) -> None:
    """Remove ``name`` from the schema and persist the schema.

    Raises:
        KeyError: if ``name`` is not in the schema.
    """
    self._schema.pop(name)
    _dump_schema(self._schema)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def init(self) -> ttk.Frame:
    """Build the schema tab (add button plus scrollable property table) and return its frame."""
    main_frame = ttk.Frame(self._parent)
    main_frame.pack(fill=BOTH, expand=True)

    schema_frame = ttk.Frame(main_frame)
    schema_frame.pack(fill=BOTH, expand=True)

    def refresh_table():
        # Drop every widget of the old table before drawing the new one.
        for child in scrollable_frame.winfo_children():
            child.destroy()
        self._build_table(scrollable_frame, refresh_table)

    def open_add_dialog():
        self._add_property(refresh_table)

    add_button = ttk.Button(schema_frame, text='Add property', command=open_add_dialog)
    add_button.pack(anchor=NW, padx=5, pady=5)

    canvas = Canvas(schema_frame)
    scrollbar = ttk.Scrollbar(schema_frame, orient="vertical", command=canvas.yview)
    scrollable_frame = ttk.Frame(canvas)

    def sync_scroll_region(_event):
        # Keep the scrollable area equal to the bounding box of the content.
        canvas.configure(scrollregion=canvas.bbox("all"))

    scrollable_frame.bind("<Configure>", sync_scroll_region)
    canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
    canvas.configure(yscrollcommand=scrollbar.set)
    canvas.pack(side=LEFT, fill=BOTH, expand=True)
    scrollbar.pack(side=RIGHT, fill=Y)

    refresh_table()
    return main_frame
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _build_table(self, parent_frame, refresh_callback):
    """Draw one header row and one row per schema property inside ``parent_frame``.

    Each property row shows five value cells followed by Edit, Delete and
    Strategy buttons; ``refresh_callback`` is passed on to the edit and
    delete dialogs.
    """
    style = ttk.Style()
    style.configure('Header.TLabel', font=('Helvetica', 10, 'bold'), background='#e0e0e0')
    style.configure('EvenRow.TLabel', background='#f0f0f0')
    style.configure('OddRow.TLabel', background='white')

    table_frame = ttk.Frame(parent_frame)
    table_frame.pack(fill=X, padx=5, pady=5)

    for column in range(5):
        table_frame.grid_columnconfigure(column, minsize=100, weight=1)

    header_texts = ('Name', 'Data Type', 'Filterable', 'Searchable', 'Skip Vectorize')
    for column, caption in enumerate(header_texts):
        ttk.Label(
            table_frame, text=caption, borderwidth=1, relief=SOLID,
            anchor='center', style='Header.TLabel',
        ).grid(row=0, column=column, sticky='ew')

    def yes_no(flag):
        return 'Yes' if flag else 'No'

    for row, (name, prop) in enumerate(self._schema.items(), start=1):
        row_style = 'OddRow.TLabel' if row % 2 else 'EvenRow.TLabel'

        cell_texts = (
            name,
            prop['data_type'].upper(),
            yes_no(prop['filterable']),
            yes_no(prop['searchable']),
            yes_no(prop['skip_vectorization']),
        )
        cells = [ttk.Label(table_frame, text=text, style=row_style) for text in cell_texts]

        # Defaults bind the current name/prop to each callback.
        actions = [
            ttk.Button(table_frame, text='Edit',
                       command=lambda n=name, p=prop: self._edit_property(n, p, refresh_callback)),
            ttk.Button(table_frame, text='Delete',
                       command=lambda n=name: self._delete_property(n, refresh_callback)),
            ttk.Button(table_frame, text='Strategy',
                       command=lambda n=name: self._handle_strategy(n)),
        ]

        for column, cell in enumerate(cells):
            # Only the first three cells get extra internal padding.
            if column < 3:
                cell.grid(row=row, column=column, sticky='ew', ipadx=25)
            else:
                cell.grid(row=row, column=column, sticky='ew')

        for column, button in enumerate(actions, start=len(cells)):
            button.grid(row=row, column=column, sticky='ew')
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _handle_strategy(self, n):
    """Open an editor dialog for the strategy source of property ``n``.

    Pressing "Save" stores the stripped editor text through
    ``_save_strategy`` and closes the dialog.
    """
    # Properties added or renamed after start-up are in self._schema but have
    # no entry in self._strategies yet; generate a strategy for them instead
    # of failing with KeyError. It is only stored once the user saves.
    strategy = self._strategies.get(n)
    if strategy is None:
        strategy = generate_strategy(n, self._schema[n])

    dialog = Toplevel()
    dialog.title(f"Property {n} strategy")
    dialog.geometry("700x400")

    field_frame = ttk.Frame(dialog)
    field_frame.pack(fill=BOTH, expand=True, padx=10, pady=10)

    scrollbar = Scrollbar(field_frame, orient=VERTICAL)
    scrollbar.pack(side=RIGHT, fill=Y)

    edit_field = Text(field_frame, width=80, height=15, wrap=WORD, yscrollcommand=scrollbar.set)
    edit_field.insert(END, strategy)
    edit_field.pack(side=LEFT, fill=BOTH, expand=True)

    scrollbar.config(command=edit_field.yview)

    def commit():
        new_strategy = edit_field.get("1.0", END).strip()
        self._save_strategy(n, new_strategy)
        dialog.destroy()

    ttk.Button(dialog, text="Save", command=commit).pack(side=BOTTOM, anchor=S, pady=10)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def _delete_property(self, name, refresh_callback):
    """Ask for confirmation in a modal dialog and delete property ``name`` when confirmed.

    On confirmation the property is removed from the schema,
    ``refresh_callback`` is invoked and the dialog is closed.
    """
    msg = f"Do you want to delete property '{name}'?"

    dialog = Toplevel()
    dialog.title('Warning!')
    # Width grows with the length of the question.
    dialog.geometry(f"{len(msg)*5+120}x50")
    dialog.grab_set()

    prompt = ttk.Label(dialog, text=msg)
    prompt.pack()

    def submit():
        self._delete_schema_property(name)
        refresh_callback()
        dialog.destroy()

    button_frame = ttk.Frame(dialog)
    button_frame.pack(fill=X, expand=True)

    confirm_button = ttk.Button(button_frame, text='Delete', command=submit)
    confirm_button.pack(side=LEFT, padx=15)
    cancel_button = ttk.Button(button_frame, text='Cancel', command=dialog.destroy)
    cancel_button.pack(side=RIGHT, padx=15)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def _add_property(self, refresh_callback):
    """Open a modal dialog for creating a new schema property.

    On "Save" the property is validated (non-blank, unique name), added to
    the schema, ``refresh_callback`` is invoked and the dialog is closed.
    """
    dialog = Toplevel()
    dialog.title("New property")
    dialog.geometry("280x300")
    dialog.grab_set()

    texts_frame = ttk.Frame(dialog)
    texts_frame.pack(fill=X, expand=True)

    ttk.Label(texts_frame, text="Name:").grid(row=0, column=0, padx=5, pady=5, sticky='e')
    name_entry = ttk.Entry(texts_frame)
    name_entry.grid(row=0, column=1, padx=5, pady=5, sticky='w')

    ttk.Label(texts_frame, text="Description:").grid(row=1, column=0, padx=5, pady=5, sticky='e')
    desc_entry = ttk.Entry(texts_frame)
    desc_entry.insert(0, '')
    desc_entry.grid(row=1, column=1, padx=5, pady=5, sticky='w')

    ttk.Label(texts_frame, text="Data Type:").grid(row=2, column=0, padx=5, pady=5, sticky='e')
    type_var = StringVar(value='text')
    type_combo = ttk.Combobox(texts_frame, textvariable=type_var,
        values=["text", "int", "number", "boolean", "date", "text[]", "int[]", "number[]", "boolean[]", "date[]", "object"]
    )
    type_combo.grid(row=2, column=1, padx=5, pady=5, sticky='w')

    checks_frame = ttk.Frame(dialog)
    checks_frame.pack(fill=X, expand=True)

    filterable_var = BooleanVar(value=True)
    searchable_var = BooleanVar(value=True)
    skip_vec_var = BooleanVar(value=False)

    ttk.Checkbutton(checks_frame, text="Filterable ", variable=filterable_var).pack(anchor=W, padx=15)
    ttk.Checkbutton(checks_frame, text="Searchable ", variable=searchable_var).pack(anchor=W, padx=15)
    ttk.Checkbutton(checks_frame, text="Skip Vectorization", variable=skip_vec_var).pack(anchor=W, padx=15)

    def submit():
        # Strip first so a whitespace-only entry counts as missing and
        # surrounding blanks never end up in the property name.
        name = name_entry.get().strip()
        if not name:
            self._show_messagebox("Parameter 'name' is required!")
            return
        if name in self._schema:
            self._show_messagebox(f"Property with name '{name}' already exists!")
            return

        prop = {
            'description': desc_entry.get().strip(),
            'data_type': type_var.get(),
            'filterable': filterable_var.get(),
            'searchable': searchable_var.get(),
            'skip_vectorization': skip_vec_var.get(),
        }

        self._add_schema_property(name, prop)
        refresh_callback()
        dialog.destroy()

    buttons_frame = ttk.Frame(dialog)
    buttons_frame.pack(fill=X, expand=True)

    ttk.Button(buttons_frame, text="Save", command=submit).pack(side=LEFT, padx=15)
    ttk.Button(buttons_frame, text="Cancel", command=dialog.destroy).pack(side=RIGHT, padx=15)
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def _edit_property(self, name: str, prop: dict, refresh_callback):
    """Open a dialog pre-filled with an existing property and save the edits.

    On "Save" the (possibly changed) name is validated and the new values are
    handed to ``self._update_schema_property(name, new_name, updated_prop)``.

    Args:
        name: Current name of the property being edited.
        prop: Current property values; must contain 'data_type',
            'filterable', 'searchable' and 'skip_vectorization'
            ('description' is optional).
        refresh_callback: Zero-argument callable invoked after the update
            and before the dialog is destroyed.
    """
    dialog = Toplevel()
    dialog.title(f"Edit Property: {name}")
    dialog.geometry("280x300")
    dialog.grab_set()

    # --- text inputs: name, description, data type ---
    texts_frame = ttk.Frame(dialog)
    texts_frame.pack(fill=X, expand=True)

    ttk.Label(texts_frame, text="Name:").grid(row=0, column=0, padx=5, pady=5, sticky='e')
    name_entry = ttk.Entry(texts_frame)
    name_entry.insert(0, name)
    name_entry.grid(row=0, column=1, padx=5, pady=5, sticky='w')

    ttk.Label(texts_frame, text="Description:").grid(row=1, column=0, padx=5, pady=5, sticky='e')
    desc_entry = ttk.Entry(texts_frame)
    desc_entry.insert(0, prop.get('description', ''))
    desc_entry.grid(row=1, column=1, padx=5, pady=5, sticky='w')

    ttk.Label(texts_frame, text="Data Type:").grid(row=2, column=0, padx=5, pady=5, sticky='e')
    type_var = StringVar(value=prop['data_type'])
    type_combo = ttk.Combobox(
        texts_frame,
        textvariable=type_var,
        values=["text", "int", "number", "boolean", "date", "text[]", "int[]", "number[]", "boolean[]", "date[]", "object"],
    )
    type_combo.grid(row=2, column=1, padx=5, pady=5, sticky='w')

    # --- boolean flags ---
    checks_frame = ttk.Frame(dialog)
    checks_frame.pack(fill=X, expand=True)

    filterable_var = BooleanVar(value=prop['filterable'])
    searchable_var = BooleanVar(value=prop['searchable'])
    skip_vec_var = BooleanVar(value=prop['skip_vectorization'])

    ttk.Checkbutton(checks_frame, text="Filterable        ", variable=filterable_var).pack(anchor=W, padx=15)
    ttk.Checkbutton(checks_frame, text="Searchable        ", variable=searchable_var).pack(anchor=W, padx=15)
    ttk.Checkbutton(checks_frame, text="Skip Vectorization", variable=skip_vec_var).pack(anchor=W, padx=15)

    def submit():
        new_name = name_entry.get().strip()
        if not new_name:
            self._show_messagebox("Parameter 'name' is required!")
            return
        # A rename must not land on a different, already existing property;
        # keeping the current name is of course allowed.
        if new_name != name and new_name in self._schema.keys():
            self._show_messagebox(f"Property with name '{new_name}' already exists!")
            return

        updated_prop = {
            'description': desc_entry.get().strip(),
            'data_type': type_var.get(),
            'filterable': filterable_var.get(),
            'searchable': searchable_var.get(),
            'skip_vectorization': skip_vec_var.get(),
        }

        self._update_schema_property(name, new_name, updated_prop)
        refresh_callback()
        dialog.destroy()

    # --- Save / Cancel ---
    buttons_frame = ttk.Frame(dialog)
    buttons_frame.pack(fill=X, expand=True)

    ttk.Button(buttons_frame, text="Save", command=submit).pack(side=LEFT, padx=15)
    ttk.Button(buttons_frame, text="Cancel", command=dialog.destroy).pack(side=RIGHT, padx=15)
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
@staticmethod
def _show_messagebox(msg):
    """Show a small 'Warning!' window containing *msg* and an OK button.

    The window grabs input; pressing OK destroys it.
    """
    popup = Toplevel()
    popup.title('Warning!')

    # Width scales with the message length (5 units per character plus a
    # fixed 120); the height is always 50.
    popup_width = len(msg) * 5 + 120
    popup.geometry(f"{popup_width}x50")
    popup.grab_set()

    message_label = ttk.Label(popup, text=msg)
    message_label.pack()

    ok_button = ttk.Button(popup, text='OK', command=popup.destroy)
    ok_button.pack(padx=15)
|
src/apps/dbapp/framebase.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tkinter import *
|
| 2 |
+
from tkinter import ttk
|
| 3 |
+
from src.database.weavservice import WeaviateService
|
| 4 |
+
|
| 5 |
+
class CustomFrameBase:
    """Base class for the app's frames; keeps the parent widget and the service."""

    def __init__(self, parent, service: WeaviateService) -> None:
        # Both references are stored for use by subclasses.
        self._parent, self._service = parent, service

    def init(self) -> ttk.Frame:
        """Create a frame inside the stored parent, pack it and return it."""
        frame = ttk.Frame(self._parent)
        frame.pack()
        return frame
|
src/apps/dbapp/imports.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import threading
|
| 3 |
+
from tkinter import *
|
| 4 |
+
from tkinter import ttk
|
| 5 |
+
from tkinter import filedialog
|
| 6 |
+
from queue import Queue
|
| 7 |
+
|
| 8 |
+
from .framebase import CustomFrameBase
|
| 9 |
+
|
| 10 |
+
from src.pipeline.pipeline import ImportPipeline
|
| 11 |
+
from src.pipeline.utils import ProcessingResult
|
| 12 |
+
|
| 13 |
+
from src.database.weavservice import WeaviateService
|
| 14 |
+
from src.utils.lang import get_language_name
|
| 15 |
+
from src.config import config
|
| 16 |
+
|
| 17 |
+
class ImportFrame(CustomFrameBase):
    """Frame for selecting local files and URLs and running ImportPipeline on them."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent, service)
        # Maps a file's base name (the text shown in the tree view) to its path.
        self._import_paths = dict()

    def init(self) -> ttk.Frame:
        """Build the frame: file list and controls on the left, URL box on the right.

        Returns:
            The newly created and packed main frame.
        """
        main_frame = ttk.Frame(self._parent)
        main_frame.pack(fill=BOTH, expand=True)

        # ====================== Helper functions ======================
        def update_treeview():
            # Rebuild the tree view from scratch out of self._import_paths.
            for item in self.files_treeview.get_children():
                self.files_treeview.delete(item)
            for filename in self._import_paths:
                self.files_treeview.insert("", 0, text=filename)

        def open_file_dialog():
            filepaths = filedialog.askopenfilenames(
                title="Select files to import",
                filetypes=(("PDF", "*.pdf"), ("Text files", "*.txt"), ("All files", "*.*"))
            )
            for path in filepaths:
                # Keyed by base name: a later file with the same name replaces
                # an earlier one from a different directory.
                filename = os.path.basename(path)
                self._import_paths[filename] = path
            update_treeview()

        def remove_files():
            selection = self.files_treeview.selection()
            if not selection:
                return
            for item in selection:
                filename = self.files_treeview.item(item)["text"]
                self._import_paths.pop(filename, None)
            update_treeview()

        def change_button_state(state):
            add_button.config(state=state)
            remove_button.config(state=state)
            import_button.config(state=state)

        # Configure grid for 50/50 split
        main_frame.grid_rowconfigure(0, weight=1)
        main_frame.grid_columnconfigure(0, weight=1)
        main_frame.grid_columnconfigure(1, weight=1)

        # ====================== LEFT SIDE ======================
        left_frame = ttk.Frame(main_frame)
        left_frame.grid(row=0, column=0, sticky='nsew', padx=(10, 5), pady=10)

        # Button row for add/remove
        btn_row = ttk.Frame(left_frame)
        btn_row.pack(fill=X, pady=(0, 8))

        add_button = ttk.Button(btn_row, text="Add files", command=open_file_dialog)
        add_button.pack(side=LEFT, padx=8)

        remove_button = ttk.Button(btn_row, text="Remove files", command=remove_files)
        remove_button.pack(side=LEFT, padx=8)

        # Controls row for checkbox and import button
        controls_row = ttk.Frame(left_frame)
        controls_row.pack(fill=X, pady=(0, 8))

        import_button = ttk.Button(
            controls_row,
            text="Begin Import",
            command=lambda: self._import_callback(change_button_state)
        )
        import_button.pack(side=LEFT, padx=10)

        self.reset_cd_var = BooleanVar(value=False)
        reset_cb = ttk.Checkbutton(
            controls_row,
            text="Reset database",
            variable=self.reset_cd_var
        )
        reset_cb.pack(side=LEFT, padx=8, pady=6)

        # Files treeview
        self.files_treeview = ttk.Treeview(
            left_frame,
            columns=[],
            show="tree headings",
            selectmode="extended",
            height=18
        )
        self.files_treeview.heading("#0", text="File name")
        self.files_treeview.column("#0", width=260)
        self.files_treeview.pack(fill=BOTH, expand=True, pady=8)

        # ====================== RIGHT SIDE ======================
        right_frame = ttk.Frame(main_frame)
        right_frame.grid(row=0, column=1, sticky='nsew', padx=(5, 10), pady=10)

        ttk.Label(right_frame, text="Enter URLs (one per line):").pack(anchor=W, padx=5, pady=(0, 6))

        self.url_text = Text(right_frame, width=28, height=22, undo=True, wrap="word", font=("Segoe UI", 10))
        self.url_text.pack(side=LEFT, fill=BOTH, expand=True, padx=5, pady=5)

        # Pre-fill the box with the configured scraping targets.
        self.url_text.insert(END, '\n'.join(config.get('SCRAPING_TARGET_URLS')))

        # Scrollbar
        scrollbar = ttk.Scrollbar(right_frame, orient="vertical", command=self.url_text.yview)
        scrollbar.pack(side=RIGHT, fill=Y)
        self.url_text.config(yscrollcommand=scrollbar.set)

        return main_frame

    def _deduplication_callback(self, source: str, amount: int):
        """Ask the user whether duplicated chunks should be re-imported.

        The dialog is scheduled through ``self._parent.after`` and this method
        then blocks on a queue until the user has answered.
        NOTE(review): this presumably must be called from a thread other than
        the one running the Tk main loop, otherwise the blocking ``get()``
        would keep the scheduled dialog from ever appearing - confirm.

        Args:
            source: Name of the source the duplicates were found for.
            amount: Number of duplicated chunks.

        Returns:
            True if the user chose "Reimport", False for "Dispose" or when the
            window is closed.
        """
        result_queue = Queue()

        def show_dialog():
            dialog = Toplevel()
            dialog.title("Duplicated content!")
            dialog.bell()

            wrap_width = 360

            info_label = ttk.Label(
                dialog,
                text=f'{amount} duplicated chunks found in database for {source}!',
                wraplength=wrap_width,
                justify=LEFT
            )
            info_label2 = ttk.Label(
                dialog,
                text='Would you like to reimport them with updated properties?',
                wraplength=wrap_width,
                justify=LEFT
            )

            info_label.pack(fill=X, anchor=W, padx=15, pady=15)
            info_label2.pack(fill=X, anchor=W, padx=15, pady=15)

            def reimport_callback():
                result_queue.put(True)
                dialog.destroy()

            def dispose_callback():
                result_queue.put(False)
                dialog.destroy()

            reimport_button = ttk.Button(dialog, text='Reimport', command=reimport_callback)
            dispose_button = ttk.Button(dialog, text='Dispose', command=dispose_callback)

            reimport_button.pack(side=LEFT, padx=15, pady=15)
            dispose_button.pack(side=RIGHT, padx=15, pady=15)

            # Size the window to its content plus a 20-unit margin.
            dialog.update_idletasks()
            width = dialog.winfo_reqwidth() + 20
            height = dialog.winfo_reqheight() + 20
            dialog.geometry(f"{width}x{height}")

            # Closing the window counts as "Dispose" so the waiting caller is
            # always released.
            dialog.protocol("WM_DELETE_WINDOW", dispose_callback)

            dialog.wait_visibility()
            dialog.grab_set()

        self._parent.after(0, show_dialog)
        return result_queue.get()

    def _import_callback(self, button_state_callback):
        """Open the import status dialog and run the import in a worker thread.

        Args:
            button_state_callback: Callable taking a Tk state (DISABLED or
                NORMAL); used to lock the frame's buttons for the duration of
                the import.
        """
        dialog = Toplevel()
        dialog.title("Import status")
        dialog.geometry("600x400")

        current_import_label = ttk.Label(dialog, text='Initiating the import pipeline...')
        current_import_label.pack(side=TOP, padx=15, pady=15)

        progress_bar = ttk.Progressbar(dialog, length=200, value=0, maximum=100)
        progress_bar.pack(side=TOP, padx=15, pady=15)

        chunks_treeview = ttk.Treeview(
            dialog,
            columns=['chunks', 'lang'],
            show='tree headings',
            selectmode='extended',
        )
        chunks_treeview.heading('#0', text='File name')
        chunks_treeview.heading('chunks', text='Collected chunks')
        chunks_treeview.heading('lang', text='Language')

        chunks_treeview.column('#0', width=100)
        chunks_treeview.column('chunks', width=60)
        chunks_treeview.column('lang', width=40)

        chunks_treeview.pack(side=TOP, fill=X, padx=15, pady=15, expand=True)

        def logging_callback(
            msg: str,
            progress: int,
            result: ProcessingResult = None,
            failed: bool = False,
        ):
            # Reflect pipeline progress in the status dialog.
            current_import_label.config(text=msg)
            progress_bar.config(value=progress)

            if result:
                chunks_treeview.insert(
                    '',
                    index=0,
                    text=result.source,
                    values=(
                        'Failure!' if failed else len(result.chunks),
                        get_language_name(result.lang)
                    )
                )

        config.dbapp['logging_callback'] = logging_callback

        # Snapshot all widget-owned input here, before the worker starts, so
        # the worker works on plain Python values.
        filepaths = list(self._import_paths.values())
        # One URL per line; blank lines and surrounding whitespace are dropped
        # so an empty box yields no URLs instead of a single empty string.
        urls = [
            line.strip()
            for line in self.url_text.get('1.0', END).splitlines()
            if line.strip()
        ]
        reset_collections = self.reset_cd_var.get()

        # Lock the buttons before the thread starts so a second click cannot
        # launch a concurrent import.
        button_state_callback(DISABLED)

        def import_task():
            try:
                ImportPipeline(
                    logging_callback=logging_callback,
                    deduplication_callback=self._deduplication_callback,
                ).import_all(
                    paths=filepaths,
                    urls=urls,
                    reset_collections=reset_collections
                )
                dialog.bell()
            finally:
                # Always re-enable the buttons, even if the import raised.
                button_state_callback(NORMAL)

        import_thread = threading.Thread(target=import_task)
        import_thread.start()
|
src/apps/dbapp/mainframe.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tkinter import *
|
| 2 |
+
from tkinter import ttk
|
| 3 |
+
from src.apps.dbapp.framebase import CustomFrameBase
|
| 4 |
+
from src.database.weavservice import WeaviateService
|
| 5 |
+
|
| 6 |
+
class MainFrame(CustomFrameBase):
    """Frame that adds nothing of its own; construction is forwarded to the base class."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent=parent, service=service)
|
src/apps/dbapp/query.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tkinter import *
|
| 2 |
+
from tkinter import ttk
|
| 3 |
+
from src.apps.dbapp.framebase import CustomFrameBase
|
| 4 |
+
from src.database.weavservice import WeaviateService
|
| 5 |
+
|
| 6 |
+
class QueryFrame(CustomFrameBase):
    """Frame with a query input row and a read-only text area showing the results."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent, service)

    def init(self) -> ttk.Frame:
        """Create all widgets of the query view and return the containing frame."""
        root_frame = ttk.Frame(self._parent)
        root_frame.pack(fill=BOTH, expand=True)

        # ---- input row: filters button, language switch, entry, send ----
        top_row = ttk.Frame(root_frame)
        top_row.pack(fill=X, padx=10, pady=(5, 10))

        self.language_var = StringVar(value="de")

        self.filters_button = ttk.Button(top_row, text="Filters...", command=self.open_filters)
        self.filters_button.pack(side=LEFT, padx=(0, 10))

        language_row = ttk.Frame(top_row)
        language_row.pack(side=LEFT, padx=(0, 15))

        english_choice = ttk.Radiobutton(language_row, text="EN", variable=self.language_var, value="en")
        english_choice.pack(side=LEFT, padx=(0, 8))

        german_choice = ttk.Radiobutton(language_row, text="DE", variable=self.language_var, value="de")
        german_choice.pack(side=LEFT)

        self.query_entry = ttk.Entry(top_row)
        self.query_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 10))

        self.send_button = ttk.Button(top_row, text="Send", command=self.send_query)
        self.send_button.pack(side=RIGHT)

        # Pressing Enter in the entry behaves like clicking "Send".
        self.query_entry.bind("<Return>", lambda _: self.send_query())

        # ---- results area with vertical scrollbar ----
        output_frame = ttk.Frame(root_frame)
        output_frame.pack(fill=BOTH, expand=True, padx=10, pady=(10, 5))

        self.results_text = Text(output_frame, wrap=WORD, font=("TkDefaultFont", 10))
        vertical_bar = ttk.Scrollbar(output_frame, orient=VERTICAL, command=self.results_text.yview)
        self.results_text.configure(yscrollcommand=vertical_bar.set)

        self.results_text.pack(side=LEFT, fill=BOTH, expand=True)
        vertical_bar.pack(side=RIGHT, fill=Y)

        # The text widget is kept disabled except while it is being written to.
        self.results_text.config(state=NORMAL)
        self.results_text.insert(END, "Enter your query below and click Send (or press Enter) to see results.\n")
        self.results_text.config(state=DISABLED)

        return root_frame

    def send_query(self):
        """Run the entered query through the service and display the formatted hits.

        Does nothing for an empty entry. Any exception raised while querying
        or formatting is shown in the results area instead of propagating.
        """
        question = self.query_entry.get().strip()
        if not question:
            return

        self.query_entry.delete(0, END)

        def render(position, obj):
            # One text section per returned object.
            props = obj.properties
            return (
                "\n"
                f"---------------------- Result {position} ----------------------\n"
                f"SOURCE: {props['source']}\n"
                f"INSERTION DATE: {props['date']}\n"
                f"RELEVANT PROGRAMS: {', '.join(props['programs'])}\n"
                "\n"
                "CONTENT:\n"
                f"{props['body']}\n"
                "\n"
                "VECTOR:\n"
                f"{obj.vector}\n"
            )

        try:
            response, _ = self._service.query(
                lang=self.language_var.get(),
                query=question,
            )
            sections = [render(position, obj) for position, obj in enumerate(response.objects, start=1)]
            report = f"Query: {question}\n" + ''.join(sections)
            self.display_result(report)
        except Exception as e:
            self.display_result(f"Error:\n{str(e)}")

    def display_result(self, result_text: str):
        """Replace the content of the results area with *result_text* and scroll to the top."""
        output = self.results_text
        output.config(state=NORMAL)
        output.delete("1.0", END)
        output.insert(END, result_text + "\n")
        output.config(state=DISABLED)
        output.see("1.0")

    def open_filters(self):
        """Open the (currently empty) "Query Filters" dialog."""
        filters_window = Toplevel(self._parent)
        filters_window.title("Query Filters")
        filters_window.geometry("400x300")
        filters_window.grab_set()
|
src/apps/dbapp/utilclasses.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from src.config import config
|
| 4 |
+
|
| 5 |
+
class BackupData:
    """Summary of one backup directory: its id, creation date and collections."""

    def __init__(self, backup_id: str) -> None:
        """Read 'data.json' and 'objects.json' from the backup directory, if present.

        Args:
            backup_id: Name of the directory below ``config.weaviate.BACKUP_PATH``.
        """
        self._backup_id = backup_id
        self._creation_date = ""
        self._collections = []

        backup_dir = os.path.join(config.weaviate.BACKUP_PATH, backup_id)
        present = os.listdir(backup_dir)

        if 'data.json' in present:
            with open(os.path.join(backup_dir, 'data.json')) as meta_file:
                meta = json.load(meta_file)

            # Stored as ISO 8601; shown as day.month.year hours:minutes:seconds.
            created = datetime.fromisoformat(meta['creation_date'])
            self._creation_date = created.strftime("%d.%m.%Y %H:%M:%S")

        if 'objects.json' in present:
            with open(os.path.join(backup_dir, 'objects.json')) as objects_file:
                per_collection = json.load(objects_file)

            # One entry per collection: lower-cased name and number of objects.
            for collection_name, items in per_collection.items():
                entry = {
                    'name': collection_name.lower(),
                    'size': ('', len(items)),
                }
                self._collections.append(entry)

    def to_treeformat(self):
        """Return the backup as a dict with the keys 'id', 'date' and 'collections'."""
        short_id = self._backup_id.replace('backup_', '')
        date_columns = (self._creation_date, '')
        return {
            'id': short_id,
            'date': date_columns,
            'collections': self._collections,
        }
|
src/cache/__init__.py
ADDED
|
File without changes
|
src/cache/cache.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from threading import Lock
|
| 2 |
+
from src.cache.cache_metrics import CacheMetrics
|
| 3 |
+
from src.cache.cache_strategies import RedisCache, LocalCache
|
| 4 |
+
|
| 5 |
+
from src.utils.logging import get_logger
|
| 6 |
+
from src.config import config
|
| 7 |
+
|
| 8 |
+
logger = get_logger("cache ")
|
| 9 |
+
|
| 10 |
+
class Cache:
    """Holder of the single, lazily created cache backend shared by the process."""

    _instance = None
    _settings = None
    _lock = Lock()
    _cache_metrics = None

    @staticmethod
    def configure(mode: str, cache: bool):
        """Record the cache mode and the enabled flag used by get_cache().

        Args:
            mode: 'cloud', 'local' or 'dict'; anything else falls back to the
                dict cache when the backend is created.
            cache: Whether caching is enabled; also written to
                ``config.cache.ENABLED``.
        """
        logger.info(f"Cache configured with parameters: mode={mode}, cache={cache}")
        config.cache.ENABLED = cache
        Cache._settings = dict(mode=mode, enabled=cache)

    @staticmethod
    def get_cache():
        """Return the shared cache backend, creating it on first use.

        Returns:
            The cache backend, or None when caching is disabled.
        """
        # Fast path: a backend already exists, no locking needed.
        if Cache._instance is not None:
            return Cache._instance

        with Cache._lock:
            # Checked again under the lock: another thread may have created
            # the backend while this one was waiting.
            if Cache._instance is not None:
                return Cache._instance

            settings = Cache._settings or {"mode": 'local', "enabled": True}

            if not settings.get("enabled", True):
                Cache._instance = None
                return None

            if Cache._cache_metrics is None:
                Cache._cache_metrics = CacheMetrics()
            metrics = Cache._cache_metrics

            mode = settings.get("mode", 'local')

            # Resolve the Redis connection data for the Redis-backed modes.
            redis_target = None
            if mode == 'cloud':
                redis_target = (
                    config.cache.CLOUD_HOST,
                    config.cache.CLOUD_PORT,
                    config.cache.CLOUD_PASS,
                )
            elif mode == 'local':
                redis_target = (
                    config.cache.LOCAL_HOST,
                    config.cache.LOCAL_PORT,
                    config.cache.LOCAL_PASS,
                )

            if redis_target is None:
                # 'dict' was requested explicitly; every other value is unknown.
                if mode != 'dict':
                    logger.error("FALLBACK to dict cache. Unknown cache mode")
                Cache._instance = LocalCache(metrics=metrics)
                return Cache._instance

            host, port, password = redis_target
            candidate = RedisCache(
                host=host,
                port=port,
                password=password,
                mode=mode,
                metrics=metrics
            )

            if candidate.client is None:
                logger.error("FALLBACK to dict cache. Redis connection failed")
                Cache._instance = LocalCache(metrics=metrics)
            else:
                Cache._instance = candidate

            return Cache._instance
|
src/cache/cache_base.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from typing import Any
|
| 3 |
+
|
| 4 |
+
class CacheStrategy(ABC):
    """Abstract interface that every cache backend (Local or Redis) implements."""

    @abstractmethod
    def set(self, key: str, value: Any, language: str, session_id: str):
        """Store *value* under the given key, language and session id."""

    @abstractmethod
    def get(self, key: str, language: str, session_id: str):
        """Look up the value stored for the given key, language and session id."""

    @abstractmethod
    def clear_cache(self):
        """Remove the cached entries."""
|
src/cache/cache_metrics.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from threading import Lock
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class CacheStatistics:
    """Counters describing how the cache has been used."""

    # Number of lookups that found a value.
    hits: int
    # Number of lookups that found nothing.
    misses: int
    # hits / (hits + misses); 0.0 while there have been no lookups.
    hits_ratio: float


class CacheMetrics:
    """Hit/miss bookkeeping whose updates are serialised by a lock."""

    def __init__(self) -> None:
        self.cache_stats = CacheStatistics(hits=0, misses=0, hits_ratio=0.0)
        self._lock = Lock()

    def increment_hit(self):
        """Count one cache hit and refresh the hit ratio."""
        with self._lock:
            self.cache_stats.hits += 1
            self._calc_hit_ratio()

    def increment_miss(self):
        """Count one cache miss and refresh the hit ratio."""
        with self._lock:
            self.cache_stats.misses += 1
            self._calc_hit_ratio()

    def _calc_hit_ratio(self):
        """Recompute hits_ratio from the current counters (callers hold the lock)."""
        stats = self.cache_stats
        lookups = stats.hits + stats.misses
        if lookups:
            stats.hits_ratio = stats.hits / lookups
        else:
            stats.hits_ratio = 0.0
|
src/cache/cache_strategies.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Any
|
| 3 |
+
from cachetools import TTLCache
|
| 4 |
+
|
| 5 |
+
from .utils import get_cache_key
|
| 6 |
+
from src.cache.cache_base import CacheStrategy
|
| 7 |
+
from src.database.redisservice import RedisService
|
| 8 |
+
from src.utils.logging import get_logger
|
| 9 |
+
from src.config import config
|
| 10 |
+
|
| 11 |
+
logger = get_logger('cache_strat')
|
| 12 |
+
|
| 13 |
+
class RedisCache(CacheStrategy):
    """Cache strategy that keeps JSON-encoded values in Redis.

    Every operation is a silent no-op when no client is available, and
    exceptions raised while talking to Redis are logged, never propagated.
    """

    def __init__(self, host, port, password, mode, metrics):
        self.client = RedisService(host, port, password, mode).get_client()
        self.metrics = metrics

    def set(self, key: str, value: Any, language: str, session_id: str):
        """Serialise *value* to JSON and store it with the configured TTL."""
        if not self.client:
            return

        try:
            payload = json.dumps(value)
            redis_key = get_cache_key(key, language, session_id)
            self.client.set(redis_key, payload, ex=config.cache.TTL_CACHE)
            logger.info(f"Cached response with key {redis_key[:20]}... to Redis")
        except Exception as e:
            logger.error(f"Could not write to Redis: {e}")

    def get(self, key: str, language: str, session_id: str):
        """Return the decoded cached value, or None on a miss, error or missing client."""
        if not self.client:
            return None

        try:
            redis_key = get_cache_key(key, language, session_id)
            raw = self.client.get(redis_key)

            if raw is None:
                self.metrics.increment_miss()
                logger.debug(f"Cache statistics: Missed cache {self.metrics.cache_stats.misses} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
                return None

            self.metrics.increment_hit()
            logger.info(f"Found cached data with key {redis_key}")
            logger.debug(f"Cache statistics: Hit cache {self.metrics.cache_stats.hits} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
            return json.loads(raw)
        except Exception as e:
            logger.error(f"Could not read from Redis: {e}")
            return None

    def clear_cache(self):
        """Flush the Redis database the client is connected to."""
        if not self.client:
            return

        try:
            self.client.flushdb()
            logger.info("Redis Cache cleared.")
        except Exception as e:
            logger.error(f"Could not clear Redis cache: {e}")
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class LocalCache(CacheStrategy):
    """Cache strategy backed by an in-process TTLCache."""

    def __init__(self, metrics):
        self.metrics = metrics
        self.cache = TTLCache(
            maxsize=config.cache.MAX_SIZE_CACHE,
            ttl=config.cache.TTL_CACHE,
        )

    def set(self, key: str, value: Any, language: str, session_id: str):
        """Store *value* under the normalised cache key."""
        self.cache[get_cache_key(key, language, session_id)] = value
        logger.info("Response cached")

    def get(self, key: str, language: str, session_id: str):
        """Return the cached value or None, updating the hit/miss metrics."""
        cached = self.cache.get(get_cache_key(key, language, session_id), None)

        if cached is None:
            self.metrics.increment_miss()
            logger.debug(f"Cache statistics: Missed cache {self.metrics.cache_stats.misses} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
        else:
            self.metrics.increment_hit()
            logger.debug(f"Cache statistics: Hit cache {self.metrics.cache_stats.hits} times, ratio[{self.metrics.cache_stats.hits_ratio}]")

        return cached

    def clear_cache(self):
        """Drop every entry from the local cache."""
        self.cache.clear()
        logger.info("Local Cache cleared.")
|
src/cache/utils.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def get_cache_key(key: str, language: str, session_id: str) -> str:
    """Build the namespaced cache key for a query.

    The query is lower-cased and every character outside ``a-z`` / ``0-9``
    is removed. That includes whitespace, punctuation and non-ASCII
    letters, so queries differing only in such characters share a key.

    Returns:
        ``"cache:<session_id>:<language>:<normalized query>"``.
    """
    lowered = key.lower()
    slug = re.compile(r'[^a-z0-9]').sub('', lowered)
    return f"cache:{session_id}:{language}:{slug}"
|
src/config/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.config.configs import *
|
| 2 |
+
from functools import lru_cache
|
| 3 |
+
from typing import Any
|
| 4 |
+
import config as c
|
| 5 |
+
|
| 6 |
+
class AppConfig:
    """Single access point that bundles all configuration sections.

    Each section is instantiated once, at class-definition time, and is
    reachable as an attribute (``config.cache``, ``config.paths``, ...).
    """

    # ===================== INITIALIZE YOUR SUBCONFIGS HERE =====================

    convstate: ConversationStateConfig = ConversationStateConfig()
    processing: ProcessingConfig = ProcessingConfig()
    weaviate: WeaviateConfig = WeaviateConfig()
    scraping: ScrapingConfig = ScrapingConfig()
    chain: ChainConfig = ChainConfig()
    cache: CacheConfig = CacheConfig()
    paths: PathsConfig = PathsConfig()
    dbapp: DatabaseAppConfig = DatabaseAppConfig()
    llm: LLMProviderConfig = LLMProviderConfig()

    # ===========================================================================

    def get(self, key: str, default: Any = None) -> Any:
        """Look up an arbitrary parameter on the ``config.py`` module.

        Args:
            key: Attribute name to read from ``config.py``.
            default: Value returned when the attribute does not exist.
                ``None`` counts as "no default supplied".

        Returns:
            The attribute value, or ``default`` when the attribute is absent.

        Raises:
            AttributeError: The attribute is absent and ``default`` is ``None``.
        """
        missing = object()  # private marker: distinguishes "absent" from a stored None
        found = getattr(c, key, missing)
        if found is not missing:
            return found
        if default is None:
            raise AttributeError(f"Config parameter '{key}' is not defined!")
        return default
|
| 34 |
+
|
| 35 |
+
@lru_cache(maxsize=1)
def get_config() -> AppConfig:
    """Return the shared ``AppConfig`` instance.

    ``lru_cache`` memoizes this zero-argument call: the first invocation
    builds the object and later calls hand back that same instance.
    """
    app_config = AppConfig()
    return app_config
|
| 38 |
+
|
| 39 |
+
config = get_config()
|
src/config/configs.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
import config, os
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
def _get(param: str, default=None, type_=None):
    """Resolve a configuration parameter.

    Lookup order:
        1. attribute ``param`` of the project's ``config`` module,
        2. environment variable ``param``,
        3. ``default``.

    Args:
        param: Name of the parameter / environment variable.
        default: Value used when neither source defines the parameter.
        type_: Optional callable used to cast the resolved value. When it
            is omitted and the value comes from the environment (always a
            string), the type of a non-``None`` ``default`` is used so the
            caller receives the same type the default would have had.

    Returns:
        The resolved (and possibly cast) value, or ``None`` when nothing
        is defined and no default was given.

    Raises:
        ValueError: The value cannot be cast to the requested type.
    """
    # Read the attribute WITHOUT the default: handing ``default`` to getattr
    # here would make the environment lookup below unreachable whenever a
    # default is supplied.
    value = getattr(config, param, None)

    if value is None:
        value = os.getenv(param)
        if value is not None and not type_ and default is not None:
            type_ = type(default)

    if value is None:
        value = default

    if value is None or not type_:
        return value

    if type_ is bool and isinstance(value, str):
        # bool("false") is True, so textual flags are parsed explicitly.
        flag = value.strip().lower()
        if flag in ("1", "true", "yes", "on"):
            return True
        if flag in ("0", "false", "no", "off", ""):
            return False
        raise ValueError(f"Failed to cast '{param}' value '{value}' to bool")

    try:
        return type_(value)
    except (ValueError, TypeError) as err:
        raise ValueError(f"Failed to cast '{param}' value '{value}' to {type_.__name__}") from err
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class ConfigBase:
    """Base class for configuration sections.

    Besides the class attributes a section declares, it offers a small
    dict-style store: ``section[key]`` reads an ad-hoc parameter (``None``
    when absent) and ``section[key] = value`` writes one.
    """

    # Ad-hoc parameters stored directly on ConfigBase itself.
    PARAMS: dict = dict()

    def __init_subclass__(cls, **kwargs):
        """Give every section its own ``PARAMS`` dict.

        Without this, all subclasses would mutate the single dict defined
        on ``ConfigBase`` and an item set on one section would show up in
        every other section.
        """
        super().__init_subclass__(**kwargs)
        # Respect a PARAMS dict the subclass declared explicitly.
        if 'PARAMS' not in cls.__dict__:
            cls.PARAMS = {}

    @classmethod
    def __getitem__(cls, key):
        """Return the stored parameter ``key``, or ``None`` if it is not set."""
        return cls.PARAMS.get(key, None)

    @classmethod
    def __setitem__(cls, key, value):
        """Store ``value`` under ``key`` for this section."""
        cls.PARAMS[key] = value
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class DatabaseAppConfig(ConfigBase):
    """Configuration section for the database app.

    Declares no attributes of its own; it only provides what it inherits
    from ``ConfigBase``.
    """
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class PathsConfig(ConfigBase):
    """Filesystem locations used by the project.

    ``DATA`` and ``LOGS`` come from the ``DATA_PATH`` / ``LOGS_PATH``
    parameters; every ``*_OUTPUT`` entry is a sub-directory of ``DATA``.
    """

    DATA: str = _get('DATA_PATH')
    LOGS: str = _get('LOGS_PATH')

    # Output directories, all directly below the data root.
    # NOTE: os.path.join raises TypeError if DATA_PATH resolves to None.
    URLS_OUTPUT: str = os.path.join(DATA, 'urls')
    CHUNKS_OUTPUT: str = os.path.join(DATA, 'chunks')
    TEMP_CHUNKS_OUTPUT: str = os.path.join(DATA, 'temp_chunks')
    SCRAPING_OUTPUT: str = os.path.join(DATA, 'scraping')
    RAW_TEXT_OUTPUT: str = os.path.join(DATA, 'raw_text')
    RAW_HTML_OUTPUT: str = os.path.join(DATA, 'raw_html')
    METADATA_OUTPUT: str = os.path.join(DATA, 'metadata')
    EXTRACTED_TEXT_OUTPUT: str = os.path.join(DATA, 'extracted_text')
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class ScrapingConfig(ConfigBase):
    """Settings for the scraping stage, each with its fallback value."""

    # NOTE(review): the key is 'SCRAPING_SCRAPING_TIMEOUT' (prefix doubled);
    # confirm it matches the name used in config.py.
    TIMEOUT: int = _get('SCRAPING_SCRAPING_TIMEOUT', default=30)
    MAX_RETRIES: int = _get('SCRAPING_MAX_RETRIES', default=3)
    CRAWL_DELAY: int = _get('SCRAPING_CRAWL_DELAY', default=1)
    BACKOFF_RATE: int = _get('SCRAPING_BACKOFF_RATE', default=2)
    # NOTE(review): annotated as int but defaults to None; verify the
    # intended type against the consumer of this value.
    TARGET_URLS: int = _get('SCRAPING_TARGET_URLS')
    INTERVALS: dict = _get('SCRAPING_PRIO_INTERVAL', default={})
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class ConversationStateConfig(ConfigBase):
    """Conversation-state settings; each is ``None`` when not configured."""

    TRACK_USER_PROFILE = _get("TRACK_USER_PROFILE", default=None)
    LOCK_LANGUAGE_AFTER_N_MESSAGES = _get("LOCK_LANGUAGE_AFTER_N_MESSAGES", default=None)
    MAX_CONVERSATION_TURNS = _get("MAX_CONVERSATION_TURNS", default=None)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class ProcessingConfig(ConfigBase):
    """Text-processing settings; each is ``None`` when not configured."""

    LANG_AMBIGUITY_THRESHOLD: float = _get("LANG_AMBIGUITY_THRESHOLD", default=None)
    # NOTE(review): annotated as float, which looks unlikely for a model
    # identifier; confirm the intended type.
    EMBEDDING_MODEL: float = _get("EMBEDDING_MODEL", default=None)
    MAX_TOKENS: int = _get("MAX_TOKENS", default=None)
    CHUNK_OVERLAP: int = _get("CHUNK_OVERLAP", default=None)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class ChainConfig(ConfigBase):
    """Settings for the agent chain: feature switches, thresholds and limits."""

    # Feature switches (enabled unless configured otherwise).
    ENABLE_RESPONSE_CHUNKING: bool = _get('ENABLE_RESPONSE_CHUNKING', default=True)
    EVALUATE_RESPONSE_QUALITY: bool = _get('ENABLE_EVALUATE_RESPONSE_QUALITY', default=True)
    CONFIDENCE_THRESHOLD: float = _get('CONFIDENCE_THRESHOLD')

    # Retrieval / generation limits.
    TOP_K_RETRIEVAL: int = _get('TOP_K_RETRIEVAL', default=4)
    MAX_RETRIES: int = _get('MODEL_MAX_RETRIES', default=3)
    MAX_RESPONSE_WORDS_LEAD: int = _get('MAX_RESPONSE_WORDS_LEAD', default=100)
    MAX_RESPONSE_WORDS_SUBAGENT: int = _get('MAX_RESPONSE_WORDS_SUBAGENT', default=200)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class CacheConfig(ConfigBase):
    """Settings for the response cache (mode, Redis endpoints, limits)."""

    ENABLED: bool = _get('CACHE_ENABLED', default=False)
    CACHE_MODE: Literal['local', 'cloud', 'dict'] = _get('CACHE_MODE')

    # Local Redis endpoint.
    LOCAL_HOST: str = _get('CACHE_LOCAL_HOST', default='localhost')
    LOCAL_PORT: int = _get('CACHE_LOCAL_PORT', default=6379)
    LOCAL_PASS: str = _get('CACHE_LOCAL_PASSWORD', default='')

    # Cloud Redis endpoint; the port is cast to int when it is defined.
    CLOUD_HOST: str = _get('REDIS_CLOUD_HOST')
    CLOUD_PORT: int = _get('REDIS_CLOUD_PORT', type_=int)
    CLOUD_PASS: str = _get('REDIS_CLOUD_PASSWORD')

    # Limits handed to the cache implementation.
    TTL_CACHE: int = _get('CACHE_TTL', default=86400)
    MAX_SIZE_CACHE: int = _get('CACHE_MAX_SIZE', default=1000)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class WeaviateConfig(ConfigBase):
    """Settings for the Weaviate database: connection, backups and timeouts."""

    LOCAL_DATABASE: bool = _get('WEAVIATE_IS_LOCAL')
    WEAVIATE_COLLECTION_BASENAME: str = _get('WEAVIATE_COLLECTION_BASENAME')

    # Supported backup methods and the one currently selected.
    BACKUP_METHODS: list[str] = [
        'manual',
        'filesystem',
        's3',
    ]
    BACKUP_METHOD: Literal['manual', 'filesystem', 's3'] = _get('WEAVIATE_BACKUP_METHOD')

    # Paths read from the project configuration.
    BACKUP_PATH: str = _get('BACKUPS_PATH')
    PROPERTIES_PATH: str = _get('PROPERTIES_PATH')
    STRATEGIES_PATH: str = _get('STRATEGIES_PATH')

    # Cluster address and credentials.
    CLUSTER_URL: str = _get('WEAVIATE_CLUSTER_URL')
    WEAVIATE_API_KEY: str = _get('WEAVIATE_API_KEY')
    HUGGING_FACE_API_KEY: str = _get('HUGGING_FACE_API_KEY')

    # Timeouts with their fallback values.
    INIT_TIMEOUT: int = _get('WEAVIATE_INIT_TIMEOUT', default=90)
    QUERY_TIMEOUT: int = _get('WEAVIATE_QUERY_TIMEOUT', default=60)
    INSERT_TIMEOUT: int = _get('WEAVIATE_INSERT_TIMEOUT', default=600)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
#TODO: Clean this configuration (outdated)
|
| 124 |
+
class LLMProvider:
|
| 125 |
+
def __init__(self, base: str, sub: str | None = None) -> None:
|
| 126 |
+
self.base = base
|
| 127 |
+
self.sub = sub
|
| 128 |
+
self.name = f"{base}:{sub}" if sub else base
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def with_sub(self, sub: str | None = None) -> str:
|
| 132 |
+
return LLMProvider(self.base, sub)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class LLMProviderConfig:
|
| 136 |
+
AVAIABLE_PROVIDERS: list[str] = [
|
| 137 |
+
'groq',
|
| 138 |
+
'ollama',
|
| 139 |
+
'openai',
|
| 140 |
+
'open_router',
|
| 141 |
+
]
|
| 142 |
+
AVAILABLE_SUBPROVIDERS: dict = {
|
| 143 |
+
'groq': [],
|
| 144 |
+
'open_router': [
|
| 145 |
+
'openai',
|
| 146 |
+
'deepseek',
|
| 147 |
+
'meituan'
|
| 148 |
+
'alibaba' # For tongyi models
|
| 149 |
+
'nvidia',
|
| 150 |
+
],
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
LLM_PROVIDER: LLMProvider = LLMProvider('openai')
|
| 154 |
+
|
| 155 |
+
# -------------------- Some predefined models for available providers ----------------------
|
| 156 |
+
|
| 157 |
+
# Groq settings
|
| 158 |
+
GROQ_API_KEY: str = os.getenv("GROQ_API_KEY")
|
| 159 |
+
GROQ_MODEL: str = "mixtral-8x7b-32768"
|
| 160 |
+
|
| 161 |
+
# Open Router settings
|
| 162 |
+
OPEN_ROUTER_API_KEY: str = os.getenv("OPEN_ROUTER_API_KEY")
|
| 163 |
+
OPEN_ROUTER_MODEL: str = "meituan/longcat-flash-chat:free"
|
| 164 |
+
OPEN_ROUTER_BASE_URL: str = "https://openrouter.ai/api/v1"
|
| 165 |
+
|
| 166 |
+
# OpenAI settings
|
| 167 |
+
OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY")
|
| 168 |
+
OPENAI_MODEL: str = "gpt-5.1"
|
| 169 |
+
|
| 170 |
+
# The gpt-oss:20b model is preferable but takes much more space
|
| 171 |
+
# Set to False if you only have the llama3.2 installed
|
| 172 |
+
GPT_OSS_ENABLED: bool = False
|
| 173 |
+
# Local/Ollama settings
|
| 174 |
+
OLLAMA_BASE_URL: str = "http://localhost:11434"
|
| 175 |
+
OLLAMA_MODEL: str = "gpt-oss:20b" if GPT_OSS_ENABLED else "llama3.2"
|
| 176 |
+
|
| 177 |
+
# ----------------------------------------------------------------------------------------
|
| 178 |
+
|
| 179 |
+
@classmethod
|
| 180 |
+
def get_fallback_models(cls, provider: LLMProvider | None = None) -> list[str]:
|
| 181 |
+
provider = provider or cls.LLM_PROVIDER
|
| 182 |
+
match provider.base:
|
| 183 |
+
case 'openai':
|
| 184 |
+
return {
|
| 185 |
+
provider: fallback_model
|
| 186 |
+
for fallback_model in [
|
| 187 |
+
'gpt-5-mini',
|
| 188 |
+
'gpt-5-nano',
|
| 189 |
+
]
|
| 190 |
+
}
|
| 191 |
+
case 'open_router':
|
| 192 |
+
return {
|
| 193 |
+
provider.with_sub('openai'): "gpt-oss-20b",
|
| 194 |
+
provider.with_sub('openai'): "gpt-oss-120b",
|
| 195 |
+
provider.with_sub('alibaba'): "alibaba/tongyi-deepresearch-30b-a3b:free",
|
| 196 |
+
provider: "openrouter/polaris-alpha",
|
| 197 |
+
# Currently unusable because has no tool support
|
| 198 |
+
#provider.with_sub('deepseek'): "deepseek/deepseek-chat-v3.1:free",
|
| 199 |
+
}
|
| 200 |
+
case _:
|
| 201 |
+
return {}
|
| 202 |
+
|
| 203 |
+
@classmethod
|
| 204 |
+
def get_reasoning_support(cls, provider: LLMProvider | None = None) -> bool:
|
| 205 |
+
provider = provider or cls.LLM_PROVIDER
|
| 206 |
+
return {
|
| 207 |
+
"groq": True,
|
| 208 |
+
"openai": True,
|
| 209 |
+
"open_router": True,
|
| 210 |
+
}.get(provider.base, False)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
@classmethod
|
| 214 |
+
def get_default_model(cls, provider: LLMProvider | None = None) -> str:
|
| 215 |
+
provider = provider or cls.LLM_PROVIDER
|
| 216 |
+
return {
|
| 217 |
+
"groq": cls.GROQ_MODEL,
|
| 218 |
+
"openai": cls.OPENAI_MODEL,
|
| 219 |
+
"ollama": cls.OLLAMA_MODEL,
|
| 220 |
+
"open_router": cls.OPEN_ROUTER_MODEL,
|
| 221 |
+
}.get(provider.base)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
@classmethod
|
| 225 |
+
def get_api_key(cls, provider: LLMProvider | None = None) -> str:
|
| 226 |
+
provider = provider or cls.LLM_PROVIDER
|
| 227 |
+
return {
|
| 228 |
+
"groq": cls.GROQ_API_KEY,
|
| 229 |
+
"openai": cls.OPENAI_API_KEY,
|
| 230 |
+
"open_router": cls.OPEN_ROUTER_API_KEY,
|
| 231 |
+
}.get(provider.base)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
class NotificationCenterConfig(ConfigBase):
    """Settings for outgoing notifications (e-mail via SMTP, Slack webhook)."""

    ENABLE_EMAIL_ALERTS: bool = _get('NOTIFY_ENABLE_EMAIL_ALERTS', True, bool)

    SMTP_HOST: str = _get("NOTIFY_SMTP_HOST")
    SMTP_PORT: int = _get("NOTIFY_SMTP_PORT", 587, type_=int)

    SMTP_USER: str = _get("NOTIFY_SMTP_USER")
    SMTP_PASSWORD: str = _get("NOTIFY_SMTP_PASSWORD")

    # The raw value may be a string ("true", "1", ...) or a real bool when
    # config.py defines it; str() makes the flag parsing work for both
    # instead of raising AttributeError on a bool.
    SMTP_USE_TLS: bool = str(_get("NOTIFY_SMTP_USE_TLS", "True")).strip().lower() in ("1", "true", "yes", "on")

    FROM_EMAIL: str = _get("NOTIFY_FROM_EMAIL")
    TO_EMAIL: str = _get("NOTIFY_TO_EMAIL")

    ENABLE_SLACK_ALERTS: bool = _get('NOTIFY_ENABLE_SLACK_ALERTS', False, bool)
    SLACK_WEBHOOK_URL: str = _get("NOTIFY_SLACK_WEBHOOK_URL")
|
src/const/agent_response_constants.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" Constants for Gradio app """
|
| 2 |
+
|
| 3 |
+
GREETING_MESSAGES = {
|
| 4 |
+
"en": [
|
| 5 |
+
"Hello and welcome. I am your Executive Education Advisor for the HSG Executive MBA programmes (**IEMBA**, **emba X**, and **EMBA**). How may I support your MBA planning today?",
|
| 6 |
+
"Hello and welcome. I am your Executive Education Advisor for the University of St.Gallen Executive MBA programmes (**IEMBA**, **emba X**, and **EMBA**). How may I assist you with your programme search?",
|
| 7 |
+
"Hello and welcome. I am here to help you explore the University of St.Gallen Executive MBA programmes (**EMBA**, **IEMBA**, and **emba X**). What would you like to discuss today?",
|
| 8 |
+
"Hello and welcome. I am your Executive Education Advisor for the University of St.Gallen’s Executive MBA programmes, and I am here to help you assess fit across **EMBA**, **IEMBA**, and **emba X**.",
|
| 9 |
+
"Hello and welcome. I am here to support you with questions about the University of St.Gallen Executive MBA programmes and to help you evaluate the **EMBA**, **IEMBA**, and **emba X** options.",
|
| 10 |
+
],
|
| 11 |
+
"de": [
|
| 12 |
+
"Guten Tag. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme und unterstütze Sie gerne bei Fragen zu **EMBA**, **IEMBA** und **emba X**.",
|
| 13 |
+
"Guten Tag. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme (**EMBA**, **IEMBA**, **emba X**). Ich unterstütze Sie bei Programmwahl, Ablauf und Zulassungsfragen.",
|
| 14 |
+
"Guten Tag und herzlich willkommen. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme und unterstütze Sie gerne bei Fragen zu **EMBA**, **IEMBA** und **emba X**.",
|
| 15 |
+
"Guten Tag. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme (**EMBA**, **IEMBA**, **emba X**) und unterstütze Sie gerne bei der Einschätzung der passenden Option.",
|
| 16 |
+
"Guten Tag. Ich unterstütze Sie gerne bei Fragen zu den HSG Executive MBA Programmen und helfe Ihnen, die Optionen **EMBA**, **IEMBA** und **emba X** einzuordnen.",
|
| 17 |
+
]
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
QUERY_EXCEPTION_MESSAGE = {
|
| 21 |
+
"en": "I'm sorry, I cannot provide a helpful response right now. Please contact tech support or try again later.",
|
| 22 |
+
"de": "Es tut mir leid, ich kann im Moment keine hilfreiche Antwort geben. Bitte wenden Sie sich an den technischen Support oder versuchen Sie es später erneut.",
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
NOT_VALID_QUERY_MESSAGE = {
|
| 26 |
+
"en": "I didn't quite understand that. Could you please rephrase your question?",
|
| 27 |
+
"de": "Das habe ich nicht ganz verstanden. Könnten Sie Ihre Frage bitte anders formulieren?",
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
CONFIDENCE_FALLBACK_MESSAGE = {
|
| 31 |
+
"en": (
|
| 32 |
+
"I am sorry, but I could not find sufficiently reliable information in my records to answer that question with confidence. "
|
| 33 |
+
"Could you please rephrase your question?\n\n"
|
| 34 |
+
"If you would like a personal consultation, I can also help you with appointment booking."
|
| 35 |
+
),
|
| 36 |
+
"de": (
|
| 37 |
+
"Es tut mir leid, aber ich konnte in meinen Unterlagen keine Informationen finden, "
|
| 38 |
+
"die zu Ihrer Anfrage passen, sodass ich sie nicht mit ausreichender Sicherheit beantworten kann. "
|
| 39 |
+
"Könnten Sie Ihre Frage bitte umformulieren?\n\n"
|
| 40 |
+
"Wenn Sie ein persönliches Beratungsgespräch wünschen, kann ich Ihnen auch bei der Terminbuchung helfen."
|
| 41 |
+
),
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
LANGUAGE_FALLBACK_MESSAGE = {
|
| 45 |
+
"en": (
|
| 46 |
+
"I am sorry, I can only reply in English or German. "
|
| 47 |
+
"Would you like to continue our conversation in English?"
|
| 48 |
+
),
|
| 49 |
+
"de": (
|
| 50 |
+
"Es tut mir leid, ich kann nur auf Englisch oder Deutsch antworten. "
|
| 51 |
+
"Möchten Sie unser Gespräch auf Deutsch fortführen?"
|
| 52 |
+
),
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
CONVERSATION_END_MESSAGE = {
|
| 56 |
+
"en": (
|
| 57 |
+
"This conversation has reached its maximum length. "
|
| 58 |
+
"To make sure you receive the best possible support, "
|
| 59 |
+
"please continue with a personal consultation.\n\n"
|
| 60 |
+
"If you would like to see appointment options with an admissions advisor, please ask me to show them. "
|
| 61 |
+
"Thank you for your understanding."
|
| 62 |
+
),
|
| 63 |
+
"de": (
|
| 64 |
+
"Dieses Gespräch hat die maximale Länge erreicht. "
|
| 65 |
+
"Damit Sie bestmöglich unterstützt werden, bitten wir Sie, "
|
| 66 |
+
"das Anliegen in einem persönlichen Beratungsgespräch fortzusetzen.\n\n"
|
| 67 |
+
"Wenn Sie Terminoptionen mit der Studienberatung sehen möchten, sagen Sie mir bitte kurz Bescheid. "
|
| 68 |
+
"Vielen Dank für Ihr Verständnis."
|
| 69 |
+
),
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
ADMISSIONS_TEAM_CONTACT = {
|
| 73 |
+
"en": {
|
| 74 |
+
"email": "emba@unisg.ch",
|
| 75 |
+
"phone": "+41 71 224 27 02",
|
| 76 |
+
},
|
| 77 |
+
"de": {
|
| 78 |
+
"email": "emba@unisg.ch",
|
| 79 |
+
"phone": "+41 71 224 27 02",
|
| 80 |
+
},
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
ADVISOR_CONTACTS = [
|
| 84 |
+
{
|
| 85 |
+
"name": "Cyra von Müller (EMBA)",
|
| 86 |
+
"program": "emba",
|
| 87 |
+
"email": "cyra.vonmueller@unisg.ch",
|
| 88 |
+
"phone": "+41 71 224 27 12",
|
| 89 |
+
"url": "https://calendly.com/cyra-vonmueller/beratungsgespraech-emba-hsg",
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"name": "Kristin Fuchs (IEMBA)",
|
| 93 |
+
"program": "iemba",
|
| 94 |
+
"email": "kristin.fuchs@unisg.ch",
|
| 95 |
+
"phone": "+41 71 224 75 46",
|
| 96 |
+
"url": "https://calendly.com/kristin-fuchs-unisg/iemba-online-personal-consultation",
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"name": "Teyuna Giger (emba X)",
|
| 100 |
+
"program": "emba_x",
|
| 101 |
+
"email": "teyuna.giger@unisg.ch",
|
| 102 |
+
"phone": "+41 71 224 77 65",
|
| 103 |
+
"url": "https://calendly.com/teyuna-giger-unisg",
|
| 104 |
+
},
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def get_admissions_contact_text(language: str = "en") -> str:
    """Return one sentence with the admissions team's e-mail and phone.

    Both the sentence template and the contact record fall back to the
    English entry when ``language`` has no entry of its own.
    """
    sentences = {
        "en": "You can reach the Executive MBA admissions team at {email} or {phone}.",
        "de": "Sie erreichen das Executive-MBA-Zulassungsteam unter {email} oder {phone}.",
    }
    details = ADMISSIONS_TEAM_CONTACT.get(language, ADMISSIONS_TEAM_CONTACT["en"])
    sentence = sentences.get(language, sentences["en"])
    email, phone = details["email"], details["phone"]
    return sentence.format(email=email, phone=phone)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def get_booking_widget(language: str="en", programs: list[str]=None):
    """Build the HTML for the consultation booking widget.

    The widget is a container with a header and sub-title followed by one
    collapsible ``<details>`` card per entry of ``ADVISOR_CONTACTS`` whose
    ``program`` is in ``programs``. Each card shows the advisor's e-mail
    and phone links and embeds the advisor's booking page in an iframe.

    Args:
        language: Language of the labels; unknown values use English.
        programs: Program keys to include. ``None`` or ``[]`` selects
            all three programs.

    Returns:
        The widget markup as a single string.
    """
    wanted = ["emba", "iemba", "emba_x"] if (programs is None or programs == []) else programs

    labels = {
        "en": {
            "header": "Book a Consultation",
            "sub": "Select an advisor to view available appointment slots and contact details:",
            "email": "Email",
            "phone": "Phone",
        },
        "de": {
            "header": "Termin vereinbaren",
            "sub": "Wählen Sie einen Berater, um verfügbare Termine und Kontaktdaten zu sehen:",
            "email": "E-Mail",
            "phone": "Telefon",
        }
    }
    txt = labels.get(language, labels["en"])
    header_label, sub_label = txt["header"], txt["sub"]
    email_label, phone_label = txt["email"], txt["phone"]

    # Query string appended to every advisor's booking URL.
    embed_query = "?hide_gdpr_banner=1&embed_type=Inline&embed_domain=1"

    # Collect the markup pieces and join once at the end.
    fragments = [
        "\n"
        '<div style="width: 100%; min-width: 100%; box-sizing: border-box; background-color: #f9fafb; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-top: 10px; font-family: sans-serif;">\n'
        f'<h3 style="margin: 0 0 10px 0; color: #111827; font-size: 1.2em;">{header_label}</h3>\n'
        f'<p style="margin: 0 0 20px 0; color: #6b7280; font-size: 1em;">{sub_label}</p>\n'
    ]

    for advisor in ADVISOR_CONTACTS:
        if advisor["program"] not in wanted:
            continue

        name = advisor["name"]
        email = advisor["email"]
        phone = advisor["phone"]
        dial_number = phone.replace(' ', '')  # tel: links carry the number without spaces
        booking_url = advisor["url"]

        fragments.append(
            "\n"
            '<details style="margin-bottom: 12px; border: 1px solid #d1d5db; border-radius: 8px; background: white; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">\n'
            '<summary style="cursor: pointer; padding: 16px 20px; background-color: #ffffff; font-weight: 600; color: #374151; font-size: 1.05em; list-style: none; transition: background 0.2s;">\n'
            f'{name}\n'
            '</summary>\n'
            '<div style="padding: 16px 20px 0 20px; border-top: 1px solid #e5e7eb;">\n'
            f'<p style="margin: 0 0 6px 0; color: #374151;"><strong>{email_label}:</strong> <a href="mailto:{email}" style="color: #1d4ed8; text-decoration: none;">{email}</a></p>\n'
            f'<p style="margin: 0 0 16px 0; color: #374151;"><strong>{phone_label}:</strong> <a href="tel:{dial_number}" style="color: #1d4ed8; text-decoration: none;">{phone}</a></p>\n'
            '</div>\n'
            '<div style="padding: 0; border-top: 1px solid #e5e7eb;">\n'
            f'<iframe src="{booking_url}{embed_query}" width="100%" height="650px" frameborder="0" style="display: block;"></iframe>\n'
            '</div>\n'
            '</details>\n'
        )

    fragments.append("</div>")
    return "".join(fragments)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def get_disclaimer_widget(language: str = "en"):
    """Build the HTML for the yellow warning disclaimer box.

    Args:
        language: Language of the title and body; unknown values use English.

    Returns:
        The markup as a single string: a flex container holding a warning
        icon (inline SVG) next to the disclaimer title and body text.
    """
    disclaimers = {
        "en": {
            "title": "Disclaimer",
            "body": "Assessments provided by this advisor are non-binding and based on limited information. Please consult our program directors for final admission or credit evaluations."
        },
        "de": {
            "title": "Haftungsausschluss",
            "body": "Die Einschätzungen dieses Beraters sind unverbindlich und basieren auf begrenzten Informationen. Bitte wenden Sie sich für endgültige Zulassungs- oder Anrechnungsfragen an die Programmleitung."
        }
    }
    wording = disclaimers[language] if language in disclaimers else disclaimers["en"]
    title, body = wording["title"], wording["body"]

    # Yellow colour scheme
    background = "#fffbeb"  # Light yellow
    border = "#f59e0b"      # Amber/Yellow border
    icon = "#d97706"        # Darker amber for the icon
    text = "#92400e"        # Dark brown/yellow for readability

    rows = [
        "",
        f'<div style="display: flex; align-items: flex-start; background-color: {background}; border: 1px solid {border}; border-radius: 8px; padding: 16px; margin-bottom: 20px; font-family: sans-serif;">',
        '<div style="margin-right: 12px; margin-top: 2px;">',
        f'<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="{icon}" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">',
        '<path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"/><line x1="12" y1="9" x2="12" y2="13"/><line x1="12" y1="17" x2="12.01" y2="17"/>',
        '</svg>',
        '</div>',
        '<div>',
        f'<strong style="display: block; color: {text}; margin-bottom: 4px; font-size: 0.95em;">{title}</strong>',
        f'<p style="margin: 0; color: {text}; font-size: 0.85em; line-height: 1.4;">',
        f'{body}',
        '</p>',
        '</div>',
        '</div>',
        "",
    ]
    return "\n".join(rows)
|
src/const/cc_whitelist.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
REPETITION_WHITELIST = [
|
| 2 |
+
'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'januar', 'februar', 'märz', 'mai', 'juni', 'juli', 'oktober', 'dezember', 'total', 'iemba', 'emba', 'emba x', 'programme', 'program',
|
| 3 |
+
]
|
src/const/data_consent_constants.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PRIVACY_NOTICE = {
|
| 2 |
+
"de": """
|
| 3 |
+
### Datenschutzhinweis
|
| 4 |
+
|
| 5 |
+
Wir verwenden Ihre Angaben, um Sie zu **Executive MBA Programmen der Universität St.Gallen** zu beraten.
|
| 6 |
+
Dabei verarbeiten wir insbesondere:
|
| 7 |
+
|
| 8 |
+
- Ihre Gesprächsinhalte und Anfragen
|
| 9 |
+
- Kontaktdaten (Name, E-Mail) bei Terminbuchung
|
| 10 |
+
- Informationen zu Ihrer Berufserfahrung und Ausbildung
|
| 11 |
+
|
| 12 |
+
Ihre Daten werden **ausschließlich für die Studienberatung** verwendet und **nicht an Dritte weitergegeben**.
|
| 13 |
+
Sie können Ihre Einwilligung **jederzeit widerrufen**.
|
| 14 |
+
|
| 15 |
+
[Weitere Informationen zur Datenschutzerklärung](https://www.unisg.ch/en/data-protection-declaration/)
|
| 16 |
+
""",
|
| 17 |
+
|
| 18 |
+
"en": """
|
| 19 |
+
### Privacy Notice
|
| 20 |
+
|
| 21 |
+
We use your information to advise you on **Executive MBA programmes at the University of St.Gallen**.
|
| 22 |
+
We process in particular:
|
| 23 |
+
|
| 24 |
+
- Your conversation content and inquiries
|
| 25 |
+
- Contact details (name, email) for appointment booking
|
| 26 |
+
- Information about your professional experience and education
|
| 27 |
+
|
| 28 |
+
Your data is used **solely for study advisory purposes** and **is not shared with third parties**.
|
| 29 |
+
You may **withdraw your consent at any time**.
|
| 30 |
+
|
| 31 |
+
[More information in the Privacy Policy](https://www.unisg.ch/en/data-protection-declaration/)
|
| 32 |
+
"""
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
ACCEPT = {
|
| 36 |
+
"de": "Zustimmen",
|
| 37 |
+
"en": "Accept"
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
DECLINE = {
|
| 41 |
+
"de": "Ablehnen",
|
| 42 |
+
"en": "Decline"
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
DECLINE_MESSAGE = {
|
| 46 |
+
"de": "Ohne Ihre Einwilligung können wir Sie leider nicht beraten. "
|
| 47 |
+
"Bitte kontaktieren Sie uns direkt unter emba@unisg.ch.",
|
| 48 |
+
"en": "Without your consent, we cannot provide advice. "
|
| 49 |
+
"Please contact us directly at emba@unisg.ch.",
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
WITHDRAW_CONFIRMATION_MESSAGE = {
|
| 53 |
+
"de": "Ihre Einwilligung wurde widerrufen. Ihre Session-Daten wurden gelöscht. Ohne Einwilligung können wir Sie leider nicht beraten.",
|
| 54 |
+
"en": "Your consent has been withdrawn. Your session data has been deleted. Without consent, we cannot continue advising you."
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
WITHDRAW_TEXT = {
|
| 58 |
+
"de": "Einwilligung widerrufen",
|
| 59 |
+
"en": "Withdraw Consent"
|
| 60 |
+
}
|
src/const/page_blacklist.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
PAGE_BLACKLIST = [
|
| 2 |
+
'cookie', 'cookies', 'privacy', 'datenschutz', 'popup', 'download',
|
| 3 |
+
'cookie-policy', 'privacy-policy', 'cookie-and-privacy-policy',
|
| 4 |
+
'data-protection', 'impressum', 'legal', 'terms', 'agb', 'imprint'
|
| 5 |
+
]
|
src/const/page_priority.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Keywords used to rank pages by priority ('high' / 'medium' / 'low').
# Each list holds the English keywords followed by their German counterparts.
#
# Every entry - including the last one of a language section - must end with a
# comma: adjacent string literals are concatenated by Python, which would
# silently merge the last EN keyword with the first DE keyword.
PAGE_PRIORITY_KEYWORDS = {
    'high': [
        # -------------------------------------------- EN --------------------------------------------
        'overview', 'about', 'introduction', 'summary', 'home', 'general information', 'welcome',
        'admissions', 'admission', 'apply', 'application', 'how to apply', 'enrollment', 'prospective students', 'entrance',
        'costs', 'tuition', 'fees', 'expenses', 'financial information', 'funding', 'scholarships',
        'curriculum', 'courses', 'program', 'programmes', 'degree structure', 'modules', 'syllabus',
        'eligibility', 'admission requirements', 'entry requirements', 'qualifications', 'prerequisites', 'criteria',
        'deadlines', 'application deadlines', 'key dates', 'timeline', 'due dates', 'important dates',

        # -------------------------------------------- DE --------------------------------------------
        'übersicht', 'überblick', 'einführung', 'zusammenfassung', 'allgemeines', 'willkommen',
        'zulassung', 'zulassungen', 'bewerbung', 'bewerbungen', 'wie bewerben', 'einschreibung', 'potenzielle studenten', 'aufnahme',
        'kosten', 'studiengebühren', 'gebühren', 'ausgaben', 'finanzielle informationen', 'finanzierung', 'stipendien',
        'studienplan', 'lehrplan', 'curriculum', 'modulhandbuch', 'studiengangsstruktur', 'module', 'lehrstoff',
        'voraussetzungen', 'zulassungsvoraussetzungen', 'eintrittsvoraussetzungen', 'qualifikationen', 'vorkenntnisse', 'kriterien',
        'fristen', 'bewerbungsfristen', 'schlüsseltermine', 'zeitplan', 'fälligkeitsdaten', 'wichtige daten',
    ],
    'medium': [
        # -------------------------------------------- EN --------------------------------------------
        'faculty', 'faculties', 'staff', 'professors', 'departments', 'team', 'instructors', 'lecturers',
        'alumni', 'graduates', 'former students', 'success stories', 'alumnae',

        # -------------------------------------------- DE --------------------------------------------
        'fakultät', 'fakultäten', 'personal', 'professoren', 'dozenten', 'abteilungen', 'team', 'lehrkräfte',
        'alumni', 'absolventen', 'ehemalige studenten', 'erfolgsgeschichten',
    ],
    'low': [
        # -------------------------------------------- EN --------------------------------------------
        'news', 'press', 'blog', 'updates', 'articles', 'announcements',
        'events', 'calendar', 'activities', 'conferences', 'workshops', 'seminars',

        # -------------------------------------------- DE --------------------------------------------
        'nachrichten', 'presse', 'blog', 'aktualisierungen', 'artikel', 'ankündigungen',
        'veranstaltungen', 'kalender', 'aktivitäten', 'konferenzen', 'workshops', 'seminare',
    ],
}
|
| 38 |
+
|
| 39 |
+
# Topic name -> set of keywords (English and German) associated with that topic.
# Sets are used, so keywords shared by both languages (e.g. 'format') collapse
# into a single entry.
CHUNK_TOPIC_KEYWORDS = {
    'admissions': {
        # ----------------------- EN -----------------------
        'admissions', 'application', 'apply', 'application process',
        'deadline', 'deadlines', 'selection', 'assessment',
        'interview', 'admissions committee', 'application form',
        'submit', 'submission', 'enrollment',

        # ----------------------- DE -----------------------
        'zulassung', 'bewerbung', 'bewerben',
        'bewerbungsprozess', 'frist', 'fristen',
        'auswahlverfahren', 'aufnahmeverfahren',
        'assessment', 'interview', 'aufnahmegespräch',
        'zulassungskomitee', 'einschreibung', 'immatrikulation',
        'einreichen',
    },

    'costs': {
        # ----------------------- EN -----------------------
        'tuition', 'tuition fee', 'fees', 'costs', 'expenses',
        'payment', 'payment plan', 'installment', 'installments',
        'deposit', 'price', 'total cost',
        'funding', 'financing', 'loan', 'loans',
        'scholarship', 'scholarships', 'budget',

        # ----------------------- DE -----------------------
        'studiengebühren', 'gebühren', 'kosten', 'ausgaben',
        'zahlung', 'zahlungsplan', 'rate', 'raten',
        'anzahlung', 'preis', 'gesamtkosten',
        'finanzierung', 'kredit', 'kredite',
        'stipendium', 'stipendien', 'budget',
    },

    'curriculum': {
        # ----------------------- EN -----------------------
        'curriculum', 'program', 'programme', 'content',
        'module', 'modules', 'course', 'courses',
        'structure', 'format', 'timeline', 'schedule',
        'duration', 'ects', 'credits',
        'training', 'coaching', 'workshop', 'workshops',
        'project', 'projects', 'leadership', 'development',
        'learning', 'electives',

        # ----------------------- DE -----------------------
        'curriculum', 'programm', 'studium', 'inhalt',
        'modul', 'module', 'kurs', 'kurse',
        'struktur', 'format', 'zeitplan', 'ablauf',
        'dauer', 'ects', 'leistungspunkte',
        'training', 'coaching', 'workshop', 'workshops',
        'projekt', 'projekte', 'führung', 'entwicklung',
        'lernen', 'wahlfächer',
    },

    'eligibility': {
        # ----------------------- EN -----------------------
        'eligibility', 'requirements', 'prerequisites',
        'admission requirements', 'criteria',
        'qualification', 'qualifications',
        'work experience', 'leadership experience',
        'degree', 'academic degree',
        'language requirement', 'fluency',

        # ----------------------- DE -----------------------
        'voraussetzungen', 'zulassungsvoraussetzungen',
        'anforderungen', 'kriterien',
        'qualifikation', 'qualifikationen',
        'berufserfahrung', 'führungserfahrung',
        'abschluss', 'studienabschluss',
        'sprachkenntnisse', 'sprachvoraussetzungen',
    },

    'alumni': {
        # ----------------------- EN -----------------------
        'alumni', 'alumni network',
        'graduates', 'community',
        'career service', 'mentoring',

        # ----------------------- DE -----------------------
        'alumni', 'alumni-netzwerk',
        'absolventen', 'gemeinschaft',
        'karriereservice', 'mentoring',
    },

    'general': {
        # ----------------------- EN -----------------------
        'overview', 'introduction', 'summary',
        'highlights', 'benefits', 'advantages',
        'experience', 'journey',
        'programme details', 'program details',
        'location', 'format', 'language',

        # ----------------------- DE -----------------------
        'überblick', 'einführung', 'zusammenfassung',
        'highlights', 'vorteile',
        'erfahrung', 'reise',
        'programmdetails', 'standort',
        'format', 'sprache',
    },
}
|
src/database/__init__.py
ADDED
|
File without changes
|
src/database/docker-compose-cache.yml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
redis:
|
| 5 |
+
image: redis:alpine
|
| 6 |
+
container_name: hsg_redis_cache
|
| 7 |
+
ports:
|
| 8 |
+
- "6379:6379"
|
| 9 |
+
command: >
|
| 10 |
+
redis-server
|
| 11 |
+
--requirepass "${REDIS_PASSWORD}"
|
| 12 |
+
--save 60 1
|
| 13 |
+
--loglevel warning
|
| 14 |
+
--maxmemory 200mb
|
| 15 |
+
--maxmemory-policy allkeys-lru
|
| 16 |
+
volumes:
|
| 17 |
+
- redis_data:/data
|
| 18 |
+
restart: unless-stopped
|
| 19 |
+
|
| 20 |
+
healthcheck:
|
| 21 |
+
test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"]
|
| 22 |
+
interval: 5s
|
| 23 |
+
timeout: 3s
|
| 24 |
+
retries: 5
|
| 25 |
+
|
| 26 |
+
volumes:
|
| 27 |
+
redis_data:
|
src/database/docker-compose.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.4'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
weaviate:
|
| 5 |
+
image: semitechnologies/weaviate:1.33.0
|
| 6 |
+
restart: on-failure:0
|
| 7 |
+
ports:
|
| 8 |
+
- "8080:8080"
|
| 9 |
+
- "50051:50051"
|
| 10 |
+
environment:
|
| 11 |
+
QUERY_DEFAULTS_LIMIT: 25
|
| 12 |
+
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
|
| 13 |
+
PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
|
| 14 |
+
ENABLE_API_BASED_MODULES: 'true'
|
| 15 |
+
ENABLE_MODULES: 'text2vec-transformers'
|
| 16 |
+
TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
|
| 17 |
+
CLUSTER_HOSTNAME: 'node1'
|
| 18 |
+
volumes:
|
| 19 |
+
- weaviate_data:/var/lib/weaviate
|
| 20 |
+
|
| 21 |
+
t2v-transformers:
|
| 22 |
+
image: semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2
|
| 23 |
+
restart: on-failure:0
|
| 24 |
+
ports:
|
| 25 |
+
- "8081:8080"
|
| 26 |
+
|
| 27 |
+
volumes:
|
| 28 |
+
weaviate_data:
|
| 29 |
+
|
src/database/redisservice.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import redis
|
| 2 |
+
from threading import Lock
|
| 3 |
+
from src.utils.logging import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger("redis_service")
|
| 6 |
+
|
| 7 |
+
class RedisService:
    """
    Singleton wrapper around a single Redis client.

    The first construction stores the connection parameters and tries to
    connect; later constructions return the same instance and skip
    re-initialization. A failed connection attempt is logged and leaves the
    service without a client instead of raising.
    """

    _instance = None
    _init_lock = Lock()

    def __new__(cls, host, port, password, mode):
        # Unlocked check first; the instance is re-checked under the lock before creation
        if cls._instance is not None:
            return cls._instance

        with cls._init_lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, host, port, password, mode):
        # __init__ runs on every RedisService(...) call; only the first one does any work
        if getattr(self, '_initialized', False):
            return

        self.mode = mode
        self._host = host
        self._port = port
        self._password = password
        self._client = None

        self._connect()

        self._initialized = True

    def _connect(self):
        """Create the client and verify it with a ping; on any failure leave the client unset."""
        options = dict(
            host=self._host,
            port=self._port,
            password=self._password,
            decode_responses=True,
            socket_connect_timeout=2,
            socket_timeout=2,
        )
        try:
            logger.info(f"Connecting to Redis at {self._host}:{self._port}...")
            self._client = redis.Redis(**options)
            self._client.ping()
            logger.info(f"Successfully connected to Redis! {self.mode}")
        except Exception as e:
            # Deliberate best effort: the failure is logged and the service stays client-less
            logger.error(f"Redis connection failed: {e}")
            self._client = None

    def get_client(self):
        """Return the Redis client, or None when the connection attempt failed."""
        return self._client

    def is_connected(self) -> bool:
        """Return True when a client object is held (no live check is performed)."""
        return self._client is not None
|
src/database/weavservice.py
ADDED
|
@@ -0,0 +1,851 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import reduce
|
| 2 |
+
import weaviate as wvt
|
| 3 |
+
import datetime, os
|
| 4 |
+
from threading import Lock
|
| 5 |
+
|
| 6 |
+
from time import perf_counter, sleep
|
| 7 |
+
from weaviate.classes.config import Configure, Property, DataType
|
| 8 |
+
from weaviate.collections.classes.grpc import MetadataQuery
|
| 9 |
+
from weaviate.collections.collection import Collection
|
| 10 |
+
from weaviate.classes.init import AdditionalConfig, Timeout
|
| 11 |
+
from weaviate.classes.query import Filter
|
| 12 |
+
from weaviate.config import AdditionalConfig
|
| 13 |
+
|
| 14 |
+
from ..utils.logging import get_logger
|
| 15 |
+
from ..config import config
|
| 16 |
+
|
| 17 |
+
logger = get_logger("weaviate_service")
|
| 18 |
+
|
| 19 |
+
def _get_collection_name(lang):
    """Return the name of the collection for language ``lang`` (configured base name + '_' + lang)."""
    return f'{config.weaviate.WEAVIATE_COLLECTION_BASENAME}_{lang}'


# Names of all language-specific collections, one per configured language
_collection_names = [_get_collection_name(lang) for lang in config.get('AVAILABLE_LANGUAGES')]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _default_properties() -> list[Property]:
    """Return the built-in collection schema as a fresh list of Property objects."""
    schema = (
        ('body', DataType.TEXT),
        ('chunk_id', DataType.TEXT),
        ('document_id', DataType.TEXT),
        ('programs', DataType.TEXT_ARRAY),
        ('source', DataType.TEXT),
        ('date', DataType.DATE),
    )
    return [Property(name=prop_name, data_type=prop_type) for prop_name, prop_type in schema]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class WeaviateService:
|
| 35 |
+
"""
|
| 36 |
+
Provides an interface for interacting with the Weaviate vector database.
|
| 37 |
+
Handles initialization, data import, and hybrid queries.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
_instance = None
|
| 41 |
+
_init_lock = Lock()
|
| 42 |
+
|
| 43 |
+
def __new__(cls):
    """Return the shared service instance, creating it on the first call."""
    # Unlocked check first; the instance is re-checked under the lock before creation
    if cls._instance is not None:
        return cls._instance

    with cls._init_lock:
        if cls._instance is None:
            cls._instance = super().__new__(cls)
    return cls._instance
|
| 49 |
+
|
| 50 |
+
def __init__(self) -> None:
    """
    Set up the service state and open the first client connection.

    Runs its body only once per instance: later ``WeaviateService()`` calls
    return early because ``_initialized`` is already set.

    Raises:
        Exception: whatever ``_init_client`` raised when the initial connection failed.
    """
    if hasattr(self, '_initialized'):
        return

    if config.weaviate.LOCAL_DATABASE:
        self._connection_type = 'local'
    else:
        self._connection_type = 'cloud'

    self._client = None
    self._client_lock = Lock()

    # Idle tracking: a client unused for longer than the timeout is replaced
    # by _init_client, so long pauses in a conversation do not leave a dead connection
    self._idle_timeout = 25 * 60
    self._last_query_time = perf_counter()

    # Marked before connecting, so a failed first connection is not retried by __init__ itself
    self._initialized = True

    logger.info("Initializing Weaviate service...")
    try:
        self._init_client()
    except Exception as e:
        logger.error(f"Failed to initialize Weaviate service: {e}")
        raise
    else:
        logger.info("Weaviate service initialized successfully")
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _init_client(self) -> wvt.WeaviateClient:
    """
    Return a connected Weaviate client, (re)connecting when necessary.

    An existing client is reused while the time since the last recorded query
    is below ``self._idle_timeout``. Otherwise it is closed (best effort) and
    a new connection is opened: local when ``config.weaviate.LOCAL_DATABASE``
    is set, Weaviate Cloud otherwise. A cloud connection is followed by a
    warm-up ``collections.exists`` call; a warm-up failure is only logged.

    Returns:
        The connected client instance.

    Raises:
        Exception: the error of the last attempt when all 3 connection attempts fail.
    """
    if self._client is not None:
        idle_for = perf_counter() - self._last_query_time
        if idle_for < self._idle_timeout:
            return self._client

        # Idle for too long: the connection may already be closed, so replace the client
        logger.warning("Client has been idling for too long. Reconnecting to prevent server-side closure...")
        try:
            self._client.close()
        except Exception:
            # Best effort: the old client is discarded either way
            pass

        self._client = None

    with self._client_lock:
        # Another caller may have connected while this one waited for the lock
        if self._client:
            return self._client

        last_exception: Exception = None
        for attempt in range(3):
            try:
                if config.weaviate.LOCAL_DATABASE:
                    self._client = wvt.connect_to_local()
                else:
                    timeouts = Timeout(
                        init=config.weaviate.INIT_TIMEOUT,
                        query=config.weaviate.QUERY_TIMEOUT,
                        insert=config.weaviate.INSERT_TIMEOUT,
                    )
                    self._client = wvt.connect_to_weaviate_cloud(
                        cluster_url=config.weaviate.CLUSTER_URL,
                        auth_credentials=config.weaviate.WEAVIATE_API_KEY,
                        additional_config=AdditionalConfig(
                            timeout=timeouts,
                            skip_init_checks=False,
                        ),
                        headers={
                            "X-HuggingFace-Api-Key": config.weaviate.HUGGING_FACE_API_KEY,
                        },
                    )

                    # Warm-up call (cloud only); its failure does not fail the connection
                    logger.info("Running warm-up query to initialize server...")
                    try:
                        first_lang = config.get('AVAILABLE_LANGUAGES')[0]
                        self._client.collections.exists(_get_collection_name(first_lang))
                        logger.info("Warm-up finished - server is ready!")
                    except Exception as warmup_err:
                        logger.warning(f"Warm-up query failed (non-critical): {warmup_err}")

                break
            except Exception as e:
                last_exception = e
                logger.warning(f"Failed to establish connection on try {attempt}: {e}")
                sleep(1)
        else:
            # Every attempt failed
            logger.error("Failed to establish connection after 3 retries!")
            raise last_exception

        logger.info(f"Successully connected to the {self._connection_type} weaviate database")
        self._last_query_time = perf_counter()
        return self._client
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _select_collection(self, lang: str) -> tuple[Collection, str]:
    """
    Resolve the collection that stores documents for the given language.

    Args:
        lang (str): Language code; must be listed in the configured AVAILABLE_LANGUAGES.

    Returns:
        tuple: ``(collection, collection_name)``; ``(None, '')`` when the
        language is not configured (the problem is logged, nothing is raised).

    Raises:
        Exception: whatever ``_init_client`` raises when no connection can be established.
    """
    supported = config.get('AVAILABLE_LANGUAGES')
    if lang not in supported:
        logger.error(f"No collection for language '{lang}' was found in the database")
        return None, ''

    collection_name = _get_collection_name(lang)
    logger.debug(f"Using collection {collection_name}")

    collection = self._init_client().collections.use(collection_name)
    return collection, collection_name
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def batch_import(self, data_rows: list, lang: str) -> list:
    """
    Perform a batch import of multiple objects into a language collection.

    Args:
        data_rows (list): List of dictionaries representing the data rows to import.
        lang (str): Language code of the collection to import into.

    Returns:
        list[dict]: One entry per row whose ``add_object`` call raised, with the keys
        'index', 'chunk_id' (None when the row has none) and 'error'. An empty
        list is also returned when no collection exists for ``lang``.

    Raises:
        Exception: any error raised by the batch itself is re-raised; when its
        message mentions 'connection' the cached client is dropped first so
        the next call reconnects.
    """
    collection, collection_name = self._select_collection(lang)
    if collection is None:
        logger.error("No working collection selected!")
        return []

    import_errors = []
    logger.info(f"Batch importing {len(data_rows)} rows into {collection_name}")

    try:
        with self._client_lock:
            with collection.batch.fixed_size(batch_size=100, concurrent_requests=2) as batch:
                for idx, data_row in enumerate(data_rows):
                    try:
                        batch.add_object(properties=data_row)
                    except Exception as e:
                        # Look the id up defensively: a malformed row must be recorded as
                        # a failure, not abort the whole import with a KeyError
                        chunk_id = data_row.get('chunk_id') if isinstance(data_row, dict) else None
                        import_errors.append({'index': idx, 'chunk_id': chunk_id, 'error': str(e)})

                    # Periodic progress report on errors counted by the batch
                    if idx % 20 == 0 and idx > 0 and batch.number_errors > 0:
                        logger.info(f"Failed imports at index {idx}: {batch.number_errors}")

        # NOTE(review): only exceptions raised by add_object are collected here; objects
        # rejected later by the server are presumably exposed through the batch's
        # failed-object list - confirm and consider merging them into the result.
        self._last_query_time = perf_counter()
        logger.info(f"Batch import finished. Total errors: {len(import_errors)}")

    except Exception as e:
        if 'connection' in str(e).lower():
            logger.error(f"Connection error during batch import: {e}")
            self._client = None
        raise

    return import_errors
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
@staticmethod
def _create_property_filter(prop, values) -> Filter:
    """
    Build the filter for one supported property.

    Args:
        prop: Property name; only 'programs' and 'source' are supported.
        values: Value(s) to match. For 'source' a list matches any of its
            entries and any other value is matched by equality.

    Returns:
        The filter, or None for an unsupported property name.
    """
    if prop == 'programs':
        return Filter.by_property('programs').contains_any(values)

    if prop == 'source':
        by_source = Filter.by_property('source')
        if isinstance(values, list):
            return by_source.contains_any(values)
        return by_source.equal(values)

    return None
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def delete_chunks(self, lang: str, property_filters: dict[str, object] | None = None) -> int:
    """
    Delete all chunks from the specified collection that match given property filters.

    Connection-type errors (message contains 'reset', 'closed', 'grpc' or
    'unavailable') are retried once; before the retry the cached client is
    dropped so the next attempt opens a fresh connection instead of reusing
    the broken one.

    Args:
        lang (str): Language collection to use.
        property_filters (dict[str, object]): Key-value pairs for filtering. Unsupported
            property names are ignored; the remaining filters are combined with logical AND.

    Returns:
        int: Number of deleted objects, -1 when the client does not report a
        count, 0 when no collection exists for ``lang``.

    Raises:
        Exception: non-connection errors immediately, connection errors after the last retry.
    """
    max_retries = 2
    connection_markers = ('reset', 'closed', 'grpc', 'unavailable')

    # Combine all supported property filters into a single AND-ed filter
    combined = None
    if property_filters:
        candidates = (self._create_property_filter(prop, values)
                      for prop, values in property_filters.items())
        usable = [f for f in candidates if f is not None]
        if usable:
            combined = reduce(lambda f1, f2: f1 & f2, usable)

    # NOTE(review): when no usable filter remains, delete_many is called with where=None
    # - confirm how the client treats that before relying on it.
    for attempt in range(1, max_retries + 1):
        try:
            collection, collection_name = self._select_collection(lang)
            if collection is None:
                logger.error("No working collection selected!")
                return 0

            logger.info(f"Deleting chunks from {collection_name} with filters={property_filters}")

            with self._client_lock:
                result = collection.data.delete_many(where=combined)

            self._last_query_time = perf_counter()

            deleted = getattr(result, "objects_deleted", None)
            if deleted is None:
                logger.info("Deletion executed (count not returned by client)")
                return -1

            logger.info(f"Deleted {deleted} objects")
            return deleted

        except Exception as e:
            message = str(e).lower()
            if not any(marker in message for marker in connection_markers):
                raise

            logger.warning(f"Connection error during deletion: {e}. Retrying...")
            if attempt == max_retries:
                raise

            # Drop the broken client; otherwise _init_client would hand the same one back
            with self._client_lock:
                stale, self._client = self._client, None
            if stale is not None:
                try:
                    stale.close()
                except Exception as close_err:
                    logger.debug(f"Ignoring error while closing stale client: {close_err}")
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def ping(self, lang: str) -> dict:
    """
    Health check: run a hybrid query against the collection of the given language.

    Args:
        lang (str): Language code of the collection to probe.

    Returns:
        dict: ``{'status': 'OK'}`` on success, otherwise
        ``{'status': 'ERROR', 'error': <exception>}``. Never raises.
    """
    try:
        collection, _ = self._select_collection(lang)
        if collection is None:
            # Report an unsupported language explicitly instead of an AttributeError on None
            raise ValueError(f"No collection available for language '{lang}'")

        with self._client_lock:
            collection.query.hybrid("health check query")
        return { 'status': 'OK' }
    except Exception as e:
        return { 'status': 'ERROR', 'error': e }
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def query(self, query: str, lang: str, property_filters: dict[str, object] | None = None, limit: int = 5) -> tuple:
    """
    Execute a hybrid semantic and keyword query against a language collection.

    Connection-type errors (message contains 'reset', 'closed', 'grpc' or
    'unavailable') are retried once; before the retry the cached client is
    dropped so the next attempt opens a fresh connection instead of reusing
    the broken one.

    Args:
        query (str): The query string.
        lang (str): Language code of the collection to query.
        property_filters (dict[str, object]): Key-value pairs for metadata filtering. Only
            the properties supported by ``_create_property_filter`` are applied;
            multiple filters are combined using logical AND.
        limit (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        tuple: ``(response, elapsed_seconds)``; ``([], 0)`` when no collection
        exists for ``lang``.

    Raises:
        Exception: non-connection errors immediately, connection errors after the last retry.
    """
    max_retries = 2
    connection_markers = ('reset', 'closed', 'grpc', 'unavailable')

    # Combine all supported property filters into a single AND-ed filter
    combined = None
    if property_filters:
        candidates = (self._create_property_filter(prop, values)
                      for prop, values in property_filters.items())
        usable = [f for f in candidates if f is not None]
        if usable:
            combined = reduce(lambda f1, f2: f1 & f2, usable)

    for attempt in range(1, max_retries + 1):
        try:
            collection, collection_name = self._select_collection(lang)
            if collection is None:
                logger.error("No working collection selected upon starting of the querying!")
                return [], 0

            logger.info(f"Querying collection {collection_name}")
            query_start_time = perf_counter()

            with self._client_lock:
                resp = collection.query.hybrid(
                    query=query,
                    filters=combined,
                    limit=limit,
                    return_metadata=MetadataQuery.full()
                )

            elapsed = perf_counter() - query_start_time
            self._last_query_time = perf_counter()
            logger.info(f"Querying retrieved {len(resp.objects)} objects in {elapsed:3.2f} seconds")

            return (resp, elapsed)

        except Exception as e:
            message = str(e).lower()
            if not any(marker in message for marker in connection_markers):
                # Probably not a server issue
                raise

            logger.warning(f"Connection error detected: {e}. Retrying...")
            if attempt == max_retries:
                raise

            # Drop the broken client; otherwise _init_client would hand the same one back
            with self._client_lock:
                stale, self._client = self._client, None
            if stale is not None:
                try:
                    stale.close()
                except Exception as close_err:
                    logger.debug(f"Ignoring error while closing stale client: {close_err}")
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def _load_properties(self) -> list[Property]:
    """
    Build the property definitions used when creating Weaviate collections.

    Reads 'properties.yaml' from config.weaviate.PROPERTIES_PATH. Falls back to
    _default_properties() when the file is missing, when PyYAML is not installed,
    or when the file parses to an empty value.

    Returns:
        list[Property]: One Property per top-level entry of the YAML mapping.

    Raises:
        Exception: Re-raised (after logging) when the file cannot be read or parsed,
            or when an entry's 'data_type' is not accepted by DataType.
    """
    properties_file = os.path.join(config.weaviate.PROPERTIES_PATH, 'properties.yaml')
    if not os.path.exists(properties_file):
        logger.warning(
            f"Optional file 'properties.yaml' is missing on path: {properties_file}. "
            "Falling back to built-in default properties."
        )
        return _default_properties()

    try:
        import yaml

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(properties_file, 'r', encoding='utf-8') as stream:
            properties = yaml.safe_load(stream)
    except ModuleNotFoundError:
        logger.warning(
            "PyYAML is not installed. Falling back to built-in default properties "
            "for Weaviate collection creation."
        )
        return _default_properties()
    except Exception as e:
        logger.error(f"Failed to load properties from path {properties_file}: {e}")
        raise

    if not properties:
        logger.warning("properties.yaml is empty. Falling back to built-in default properties.")
        return _default_properties()

    final_properties = []
    for name, params in properties.items():
        # An entry written as `name:` with no body is parsed as None.
        params = params or {}
        # Bound before the try so the error log below can never hit an unbound name.
        data_type = ''
        try:
            data_type = params.get('data_type', '')
            dtype = DataType(data_type)
        except Exception:
            logger.error(f"Nonexistent datatype {data_type}")
            raise

        final_properties.append(Property(
            name=name,
            data_type=dtype,
            index_filterable=params.get('filterable', True),
            index_searchable=params.get('searchable', True),
            skip_vectorization=params.get('skip_vectorization', False),
        ))

    return final_properties
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def _create_collections(self):
    """
    Create one collection per name in _collection_names.

    Properties come from _load_properties(). The vectorizer is
    text2vec_transformers when config.weaviate.LOCAL_DATABASE is set, otherwise
    text2vec_huggingface with config.processing.EMBEDDING_MODEL. A failure for a
    single collection is logged and the remaining collections are still attempted.

    Raises:
        Exception: Any error outside the per-collection loop (e.g. client
            initialisation) is logged, the cached client is dropped and the error
            is re-raised.
    """
    properties = self._load_properties()
    try:
        client = self._init_client()
        logger.info('Attempting collections creation...')

        if config.weaviate.LOCAL_DATABASE:
            vector_config = Configure.Vectors.text2vec_transformers()
        else:
            vector_config = Configure.Vectors.text2vec_huggingface(
                name='hsg_rag_embeddings',
                source_properties=['body'],
                model=config.processing.EMBEDDING_MODEL,
            )

        created = 0
        with self._client_lock:
            for collection_name in _collection_names:
                try:
                    client.collections.create(
                        name=collection_name,
                        properties=properties,
                        vector_config=vector_config,
                    )
                    logger.info(f"Created collection {collection_name}")
                    created += 1
                except Exception as e:
                    logger.error(f"Failed to create collection '{collection_name}': {e}")

        self._last_query_time = perf_counter()

        if created != len(_collection_names):
            logger.warning(f"Only {created}/{len(_collection_names)} collections created")
        else:
            logger.info('All collections successfully instantiated')

    except Exception as e:
        logger.error(f"Collections creation failed: {e}")
        self._client = None
        raise e
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
def _delete_collections(self):
    """
    Delete every collection listed in _collection_names that currently exists.

    Missing collections are reported with a warning; a failure for one collection
    is logged and does not stop the loop.

    Raises:
        Exception: Any error outside the per-collection loop is logged, the cached
            client is dropped and the error is re-raised.
    """
    try:
        client = self._init_client()
        logger.info("Initiating deletion of stored collections...")

        removed = 0
        with self._client_lock:
            for collection_name in _collection_names:
                try:
                    if not client.collections.exists(collection_name):
                        logger.warning(f"Collection {collection_name} does not exist")
                        continue
                    client.collections.delete(collection_name)
                    logger.info(f"Deleted collection {collection_name}")
                    removed += 1
                except Exception as e:
                    logger.error(f"Failed to delete collection {collection_name}: {e}")

        self._last_query_time = perf_counter()
        logger.info(f"Deleted {removed}/{len(_collection_names)} collections")

    except Exception as e:
        logger.error(f"Collections deletion failed: {e}")
        self._client = None
        raise e
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
def _reset_collections(self):
    """Drop all collections and create them again (delete, then create)."""
    self._delete_collections()
    self._create_collections()
|
| 494 |
+
|
| 495 |
+
|
| 496 |
+
def _collect_chunk_ids(self) -> list:
    """
    Collect the 'chunk_id' property of every object in every collection.

    Returns:
        list: The chunk ids in iteration order (the previous `-> dict`
            annotation did not match what is actually returned).

    Raises:
        Exception: Any error while listing or iterating is logged and re-raised;
            this includes a KeyError for an object without a 'chunk_id' property.
    """
    client = self._init_client()
    try:
        ids = []
        with self._client_lock:
            for c in client.collections.list_all(simple=False):
                coll = client.collections.get(c)
                for obj in coll.iterator():
                    ids.append(obj.properties['chunk_id'])
        return ids
    except Exception as e:
        logger.error(f"Failed to collect chunk ids: {e}")
        raise
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def _extract_data(self) -> dict:
    """
    Dump the configuration and all objects (including vectors) of every collection.

    Returns:
        dict: {'schema': [config dict per collection],
               'objects': {collection key: [{'uuid', 'properties', 'vector'}, ...]}}

    Raises:
        Exception: Any error during extraction is logged and re-raised.
    """
    client = self._init_client()
    try:
        schema, objects = [], {}
        with self._client_lock:
            for key in client.collections.list_all(simple=False):
                collection = client.collections.get(key)
                schema.append(collection.config.get().to_dict())
                objects[key] = [
                    {
                        "uuid": item.uuid,
                        "properties": item.properties,
                        "vector": item.vector,
                    }
                    for item in collection.iterator(include_vector=True)
                ]

        return {'schema': schema, 'objects': objects}
    except Exception as e:
        logger.error(f"Failed to extract data from database: {e}")
        raise e
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
def _create_backup(self) -> str:
|
| 540 |
+
"""
|
| 541 |
+
Create a backup of the current database state and stores it under selected backup provider.
|
| 542 |
+
|
| 543 |
+
Returns: backup id of the created backup.
|
| 544 |
+
"""
|
| 545 |
+
try:
|
| 546 |
+
if not config.weaviate.BACKUP_METHOD:
|
| 547 |
+
raise ValueError('Backup method is not selected!')
|
| 548 |
+
if config.weaviate.BACKUP_METHOD not in config.weaviate.BACKUP_METHODS:
|
| 549 |
+
raise ValueError(f"Selected backup method 'config.weaviate.BACKUP_METHODS' is not supported!")
|
| 550 |
+
if not config.weaviate.BACKUP_PATH:
|
| 551 |
+
raise ValueError("Backup directory is not set!")
|
| 552 |
+
os.makedirs(config.weaviate.BACKUP_PATH, exist_ok=True)
|
| 553 |
+
|
| 554 |
+
backup_id = f"backup_{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}"
|
| 555 |
+
logger.info(f"Initiating backup creation for {self._connection_type} database...")
|
| 556 |
+
|
| 557 |
+
match config.weaviate.BACKUP_METHOD:
|
| 558 |
+
case 'manual':
|
| 559 |
+
import json
|
| 560 |
+
|
| 561 |
+
backup_path = os.path.join(config.weaviate.BACKUP_PATH, backup_id)
|
| 562 |
+
os.makedirs(backup_path)
|
| 563 |
+
|
| 564 |
+
db_data = self._extract_data()
|
| 565 |
+
data_backup = {
|
| 566 |
+
'creation_date': datetime.datetime.now().isoformat(),
|
| 567 |
+
}
|
| 568 |
+
|
| 569 |
+
schema_backup_path = os.path.join(backup_path, 'schema.json')
|
| 570 |
+
with open(schema_backup_path, 'w', encoding='utf-8') as f:
|
| 571 |
+
json.dump(db_data['schema'], f, indent=2, default=str)
|
| 572 |
+
|
| 573 |
+
objects_backup_path = os.path.join(backup_path, 'objects.json')
|
| 574 |
+
with open(objects_backup_path, 'w', encoding='utf-8') as f:
|
| 575 |
+
json.dump(db_data['objects'], f, indent=2, default=str)
|
| 576 |
+
|
| 577 |
+
data_backup_path = os.path.join(backup_path, 'data.json')
|
| 578 |
+
with open(data_backup_path, 'w', encoding='utf-8') as f:
|
| 579 |
+
json.dump(data_backup, f, indent=2, default=str)
|
| 580 |
+
|
| 581 |
+
case 's3':
|
| 582 |
+
client = self._init_client()
|
| 583 |
+
with self._client_lock:
|
| 584 |
+
client.backup.create(
|
| 585 |
+
backup_id=backup_id,
|
| 586 |
+
backend="s3",
|
| 587 |
+
include_collections=_collection_names,
|
| 588 |
+
wait_for_completion=True,
|
| 589 |
+
)
|
| 590 |
+
case _:
|
| 591 |
+
raise NotImplementedError()
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
self._last_query_time = perf_counter()
|
| 595 |
+
logger.info(f"Backup '{backup_id}' created successfully")
|
| 596 |
+
|
| 597 |
+
return backup_id
|
| 598 |
+
except Exception as e:
|
| 599 |
+
logger.error(f"Backup creation failed: {e}")
|
| 600 |
+
raise e
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
def _restore_backup(self, backup_id: str):
|
| 604 |
+
"""
|
| 605 |
+
Restore the database state from a backup.
|
| 606 |
+
|
| 607 |
+
Restores specified collections from backup.
|
| 608 |
+
|
| 609 |
+
Args:
|
| 610 |
+
backup_id: ID of the backup to restore from
|
| 611 |
+
|
| 612 |
+
Raises:
|
| 613 |
+
Exception if backup restoration fails
|
| 614 |
+
"""
|
| 615 |
+
self._delete_collections()
|
| 616 |
+
|
| 617 |
+
try:
|
| 618 |
+
if not config.weaviate.BACKUP_METHOD:
|
| 619 |
+
raise ValueError('Backup method is not selected!')
|
| 620 |
+
if config.weaviate.BACKUP_METHOD not in config.weaviate.BACKUP_METHODS:
|
| 621 |
+
raise ValueError(f"Selected backup method 'config.weaviate.BACKUP_METHODS' is not supported!")
|
| 622 |
+
if not config.weaviate.BACKUP_PATH:
|
| 623 |
+
raise ValueError("Backup directory is not set!")
|
| 624 |
+
os.makedirs(config.weaviate.BACKUP_PATH, exist_ok=True)
|
| 625 |
+
|
| 626 |
+
backup_path = os.path.join(config.weaviate.BACKUP_PATH, backup_id)
|
| 627 |
+
if not os.path.exists(backup_path):
|
| 628 |
+
raise RuntimeError(f"Directory for backup 'backup_id' does not exist in the backup directory!")
|
| 629 |
+
schema_backup_path = os.path.join(backup_path, 'schema.json')
|
| 630 |
+
if not os.path.exists(schema_backup_path):
|
| 631 |
+
raise RuntimeError(f"Schema backup is missing in the backup directory!")
|
| 632 |
+
objects_backup_path = os.path.join(backup_path, 'objects.json')
|
| 633 |
+
if not os.path.exists(objects_backup_path):
|
| 634 |
+
raise RuntimeError(f"Objects backup is missing in the backup directory!")
|
| 635 |
+
|
| 636 |
+
client = self._init_client()
|
| 637 |
+
logger.info(f"Initiating restoration from backup '{backup_id}' for {self._connection_type} database...")
|
| 638 |
+
|
| 639 |
+
with self._client_lock:
|
| 640 |
+
match config.weaviate.BACKUP_METHOD:
|
| 641 |
+
case 'manual':
|
| 642 |
+
import json
|
| 643 |
+
|
| 644 |
+
with open(schema_backup_path) as f:
|
| 645 |
+
schemas = json.load(f)
|
| 646 |
+
for cfg in schemas:
|
| 647 |
+
client.collections.create_from_dict(cfg)
|
| 648 |
+
|
| 649 |
+
with open(objects_backup_path) as f:
|
| 650 |
+
data = json.load(f)
|
| 651 |
+
for name, objs in data.items():
|
| 652 |
+
logger.info(f"Restoring collection '{name}' with {len(objs)} objects...")
|
| 653 |
+
coll = client.collections.get(name)
|
| 654 |
+
|
| 655 |
+
with coll.batch.dynamic() as batch:
|
| 656 |
+
for o in objs:
|
| 657 |
+
o['properties']['date'] = o['properties']['date'] \
|
| 658 |
+
.replace(" ", "T").replace("+00:00", "Z")
|
| 659 |
+
batch.add_object(
|
| 660 |
+
uuid=o["uuid"],
|
| 661 |
+
properties=o["properties"],
|
| 662 |
+
vector=o["vector"]
|
| 663 |
+
)
|
| 664 |
+
logger.info(f"Collection '{name}' restored successfully")
|
| 665 |
+
case 's3':
|
| 666 |
+
client.backup.restore(
|
| 667 |
+
backup_id=backup_id,
|
| 668 |
+
backend="s3",
|
| 669 |
+
wait_for_completion=True,
|
| 670 |
+
roles_restore="all",
|
| 671 |
+
users_restore="all",
|
| 672 |
+
)
|
| 673 |
+
case _:
|
| 674 |
+
raise NotImplementedError()
|
| 675 |
+
|
| 676 |
+
self._last_query_time = perf_counter()
|
| 677 |
+
logger.info(f"Backup '{backup_id}' restored successfully")
|
| 678 |
+
|
| 679 |
+
except Exception as e:
|
| 680 |
+
error_msg = str(e).lower()
|
| 681 |
+
if 'connection' in error_msg:
|
| 682 |
+
logger.error(f"Connection error during backup restore: {e}. Will reconnect on next operation.")
|
| 683 |
+
self._client = None
|
| 684 |
+
logger.error(f"Backup restoration failed: {e}")
|
| 685 |
+
raise e
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
def _checkhealth(self) -> bool:
    """
    Check connectivity and collection presence of the Weaviate database.

    Steps:
        - ask the client whether it is connected (failure ends the check),
        - log version/module metadata (a metadata failure is only a warning),
        - verify that every name in _collection_names exists.

    Returns:
        True if the client is connected and all expected collections exist,
        False otherwise (including when an unexpected error occurs).
    """
    try:
        client = self._init_client()

        # Basic connectivity
        with self._client_lock:
            is_connected = client.is_connected()

        if is_connected:
            connection_status = "✓ OK"
        else:
            connection_status = "✗ ERROR"
        logger.info(f"Connection to {self._connection_type} database: {connection_status}")

        if not is_connected:
            logger.error("Database connection check failed")
            return False

        # Metadata is informational only
        try:
            with self._client_lock:
                metainfo = client.get_meta()

            modules = metainfo.get('modules', {})
            modules_list = list(modules) if isinstance(modules, dict) else modules
            if modules_list:
                modules_str = ', '.join(str(m) for m in modules_list)
            else:
                modules_str = 'None'

            # Keep the log line short
            if len(modules_str) > 50:
                modules_str = modules_str[:47] + '...'

            details = []
            if config.weaviate.LOCAL_DATABASE:
                details.append(f"HOSTNAME={metainfo.get('hostname', 'unknown')}")
            details.append(f"VERSION={metainfo.get('version', 'unknown')}")
            details.append(f"MODULES={modules_str}")
            logger.info("Database metadata: " + ", ".join(details))

        except Exception as e:
            logger.warning(f"Could not retrieve database metadata: {e}")

        # Expected collections
        all_collections_exist = True
        with self._client_lock:
            for collection_name in _collection_names:
                try:
                    exists = client.collections.exists(collection_name)
                    status = "✓ OK" if exists else "✗ MISSING"
                    logger.info(f"Collection '{collection_name}': {status}")
                    all_collections_exist = all_collections_exist and exists
                except Exception as e:
                    logger.error(f"Error checking collection '{collection_name}': {e}")
                    all_collections_exist = False

        self._last_query_time = perf_counter()

        # The connection is known to be up here; only the collections decide.
        if not all_collections_exist:
            logger.warning("✗ Database health check FAILED - Some issues detected")
            return False

        logger.info("✓ Database health check PASSED - All systems operational")
        return True

    except Exception as e:
        error_msg = str(e).lower()
        if 'connection' in error_msg:
            logger.error(f"Connection error during health check: {e}. Will reconnect on next operation.")
            self._client = None
        logger.error(f"Health check failed: {e}")
        return False
|
| 783 |
+
|
| 784 |
+
|
| 785 |
+
def parse_arguments():
    """
    Parse the command-line options of the Weaviate management utility.

    All options belong to one mutually exclusive group, so at most one action
    can be requested per invocation.

    Returns:
        argparse.Namespace: Parsed command-line arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Weaviate database management utility')
    actions = parser.add_mutually_exclusive_group()

    # (short flag, long flag, help text) for the plain on/off switches
    switches = (
        ('-dc', '--delete_collections', 'Delete all collections from the database'),
        ('-cc', '--create_collections', 'Initialize collections for different language contents'),
        ('-rc', '--redo_collections', 'Delete and recreate all collections'),
        ('-ch', '--checkhealth', 'Check database connection and collection existence'),
        ('-cb', '--create_backup', 'Create a backup of the current database state'),
    )
    for short_flag, long_flag, help_text in switches:
        actions.add_argument(short_flag, long_flag, action='store_true', help=help_text)

    actions.add_argument(
        '-rb', '--restore_backup',
        type=str,
        metavar='BACKUP_ID',
        help='Restore database from a backup (provide backup_id)',
    )

    return parser.parse_args()
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
if __name__ == "__main__":
    # Command-line entry point: run the single action selected by the flags.
    args = parse_arguments()
    service = WeaviateService()

    if args.create_backup:
        service._create_backup()

    if args.restore_backup:
        service._restore_backup(args.restore_backup)

    # --redo_collections is a delete followed by a create.
    if args.delete_collections or args.redo_collections:
        service._delete_collections()

    if args.create_collections or args.redo_collections:
        service._create_collections()

    # Creating collections is always followed by a health check.
    if args.checkhealth or args.create_collections or args.redo_collections:
        service._checkhealth()
|
src/notification/__init__.py
ADDED
|
File without changes
|
src/notification/notification_center.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
import mimetypes
|
| 3 |
+
import os
|
| 4 |
+
import smtplib
|
| 5 |
+
from email.message import EmailMessage
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
|
| 9 |
+
from ..config import NotificationCenterConfig as NC
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# "all" is accepted by NotificationCenter.send_notification, so it belongs in the alias.
Channel = Literal["email", "slack", "all"]
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EmailNotifier:
|
| 16 |
+
def __init__(self):
|
| 17 |
+
self.enabled = NC.ENABLE_EMAIL_ALERTS
|
| 18 |
+
self.smtp_host = NC.SMTP_HOST
|
| 19 |
+
self.smtp_port = NC.SMTP_PORT
|
| 20 |
+
self.smtp_user = NC.SMTP_USER
|
| 21 |
+
self.smtp_password = NC.SMTP_PASSWORD
|
| 22 |
+
self.smtp_use_tls = NC.SMTP_USE_TLS
|
| 23 |
+
self.from_email = NC.FROM_EMAIL
|
| 24 |
+
self.to_emails = self._parse_recipients(NC.TO_EMAIL)
|
| 25 |
+
|
| 26 |
+
if self.enabled:
|
| 27 |
+
self._validate()
|
| 28 |
+
|
| 29 |
+
@staticmethod
|
| 30 |
+
def _parse_recipients(value: str | None) -> list[str]:
|
| 31 |
+
if not value:
|
| 32 |
+
return []
|
| 33 |
+
return [email.strip() for email in value.split(",") if email.strip()]
|
| 34 |
+
|
| 35 |
+
def _validate(self) -> None:
|
| 36 |
+
missing = []
|
| 37 |
+
|
| 38 |
+
if not self.smtp_host:
|
| 39 |
+
missing.append("NOTIFY_SMTP_HOST")
|
| 40 |
+
if not self.smtp_user:
|
| 41 |
+
missing.append("NOTIFY_SMTP_USER")
|
| 42 |
+
if not self.smtp_password:
|
| 43 |
+
missing.append("NOTIFY_SMTP_PASSWORD")
|
| 44 |
+
if not self.from_email:
|
| 45 |
+
missing.append("NOTIFY_FROM_EMAIL")
|
| 46 |
+
if not self.to_emails:
|
| 47 |
+
missing.append("NOTIFY_TO_EMAIL")
|
| 48 |
+
|
| 49 |
+
if missing:
|
| 50 |
+
raise ValueError(f"Missing notification email config: {', '.join(missing)}")
|
| 51 |
+
|
| 52 |
+
def send(
|
| 53 |
+
self,
|
| 54 |
+
subject: str,
|
| 55 |
+
body: str,
|
| 56 |
+
attachments: str | list[str] | None = None,
|
| 57 |
+
) -> None:
|
| 58 |
+
if not self.enabled:
|
| 59 |
+
return
|
| 60 |
+
|
| 61 |
+
if isinstance(attachments, str):
|
| 62 |
+
attachments = [attachments]
|
| 63 |
+
|
| 64 |
+
msg = EmailMessage()
|
| 65 |
+
msg["Subject"] = subject
|
| 66 |
+
msg["From"] = self.from_email
|
| 67 |
+
msg["To"] = ", ".join(self.to_emails)
|
| 68 |
+
msg.set_content(body)
|
| 69 |
+
|
| 70 |
+
if attachments:
|
| 71 |
+
for file_path in attachments:
|
| 72 |
+
if not file_path or not os.path.isfile(file_path):
|
| 73 |
+
continue
|
| 74 |
+
|
| 75 |
+
mime_type, _ = mimetypes.guess_type(file_path)
|
| 76 |
+
mime_type = mime_type or "application/octet-stream"
|
| 77 |
+
maintype, subtype = mime_type.split("/", 1)
|
| 78 |
+
|
| 79 |
+
with open(file_path, "rb") as f:
|
| 80 |
+
msg.add_attachment(
|
| 81 |
+
f.read(),
|
| 82 |
+
maintype=maintype,
|
| 83 |
+
subtype=subtype,
|
| 84 |
+
filename=os.path.basename(file_path),
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
with smtplib.SMTP(self.smtp_host, self.smtp_port, timeout=20) as server:
|
| 88 |
+
if self.smtp_use_tls:
|
| 89 |
+
server.starttls()
|
| 90 |
+
server.login(self.smtp_user, self.smtp_password)
|
| 91 |
+
server.send_message(msg)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class SlackNotifier:
    """Post alert messages to a Slack incoming webhook."""

    def __init__(self):
        """Read the Slack settings from the config and validate them if alerts are enabled."""
        self.enabled = NC.ENABLE_SLACK_ALERTS
        self.webhook_url = NC.SLACK_WEBHOOK_URL

        if self.enabled:
            self._validate()

    def _validate(self) -> None:
        """Raise ValueError when no webhook URL is configured."""
        if self.webhook_url:
            return
        raise ValueError("Missing notification slack config: NOTIFY_SLACK_WEBHOOK_URL")

    def send(self, subject: str, body: str) -> None:
        """
        Post "*subject*" followed by the body to the webhook; a no-op when disabled.

        Raises:
            requests.HTTPError: From raise_for_status() on an error response.
            RuntimeError: If the response status is anything other than 200.
        """
        if not self.enabled:
            return

        payload = {"text": f"*{subject}*\n{body}"}
        response = requests.post(self.webhook_url, json=payload, timeout=10)

        response.raise_for_status()

        if response.status_code == 200:
            return
        raise RuntimeError(
            f"Slack notification failed: {response.status_code} {response.text}"
        )
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class NotificationCenter:
    """Single entry point that routes a notification to e-mail, Slack, or both."""

    def __init__(self):
        """Create one notifier per supported channel."""
        self.email = EmailNotifier()
        self.slack = SlackNotifier()

    def send_notification(
        self,
        subject: str,
        body: str,
        channel: Channel = "email",
        attachments: str | list[str] | None = None,
    ) -> None:
        """
        Deliver a notification over the requested channel.

        Args:
            subject: Notification title.
            body: Notification text.
            channel: "email", "slack" or "all" (e-mail first, then Slack).
            attachments: File path(s) forwarded to the e-mail notifier only.

        Raises:
            ValueError: If the channel is not one of the values above.
        """
        use_email = channel in ("all", "email")
        use_slack = channel in ("all", "slack")

        if not (use_email or use_slack):
            raise ValueError(f"Unknown notification channel: {channel}")

        if use_email:
            self.email.send(subject, body, attachments)
        if use_slack:
            self.slack.send(subject, body)
|
src/pipeline/__init__.py
ADDED
|
File without changes
|
src/pipeline/pipeline.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .utils import *
|
| 2 |
+
from .processors import *
|
| 3 |
+
from ..scraping.scraper import Scraper
|
| 4 |
+
|
| 5 |
+
from ..database.weavservice import WeaviateService
|
| 6 |
+
from ..utils.logging import get_logger
|
| 7 |
+
from ..config import config
|
| 8 |
+
|
| 9 |
+
# Named loggers for this module; implogger is the one ImportPipeline writes to.
pipelogger = get_logger("pipeline_module")
implogger = get_logger("import_pipeline")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ImportPipeline:
|
| 14 |
+
"""
|
| 15 |
+
Main pipeline class responsible for importing website and local documents
|
| 16 |
+
into the database with deduplication and language-based organization.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
def __init__(
    self,
    logging_callback = None,
    deduplication_callback = None,
) -> None:
    """
    Set up the import pipeline.

    Creates the document processor and the Weaviate service, then retrieves the
    chunk ids already stored in the database (presumably consumed by the
    deduplication step -- confirm against _deduplicate).

    Args:
        logging_callback (callable, optional): Progress callback. The module's
            placeholder is used when none (or a falsy value) is given.
        deduplication_callback (callable, optional): Callback for deduplication
            decisions. The module's placeholder is used when none is given.
    """
    self._logging_callback = (
        logging_callback if logging_callback else logging_callback_placeholder
    )
    self._deduplication_callback = (
        deduplication_callback if deduplication_callback else deduplication_callback_placeholder
    )
    self._docprocessor = DocumentProcessor()
    self._service = WeaviateService()
    self._ids = self._service._collect_chunk_ids()

    implogger.info('Import pipeline initialization finished!')
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def import_from_scraper(self, scraper_chunks: dict[str, dict]) -> None:
    """
    Replace stored chunks with freshly scraped ones, language by language.

    For every language with at least one chunk, the chunks already stored for the
    same 'source' values are deleted and the new chunks are batch-imported.

    Args:
        scraper_chunks: Mapping of language to the chunks scraped for it; each
            chunk is read with .get('source', '').
    """
    for lang, chunks in scraper_chunks.items():
        if chunks:
            # Distinct sources touched by this scrape; a missing 'source' counts as ''.
            sources = list({chunk.get('source', '') for chunk in chunks})
            self._service.delete_chunks(lang, property_filters={'source': sources})
            self._service.batch_import(data_rows=chunks, lang=lang)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def scrape_website(self, target_urls: list[str] | None = None, scrape_all: bool = False) -> None:
    """
    Scrape the given targets and import whatever chunks they yield.

    Args:
        target_urls: URLs to scrape. When empty or None, config.scraping.TARGET_URLS
            is used; empty entries are ignored.
        scrape_all: Forwarded to the Scraper constructor.
    """
    candidates = target_urls or config.scraping.TARGET_URLS or []
    target_urls = [url for url in candidates if url]
    if not target_urls:
        implogger.warning("No target URLs configured for scraping.")
        return

    scraper = Scraper(scrape_all=scrape_all)
    for target_url in target_urls:
        self._logging_callback(f"Scraping target {target_url}...", 0)
        scraped_chunks = scraper.scrape_target(target_url)

        if scraped_chunks:
            self._logging_callback(f"Importing scraped chunks from {target_url}...", 90)
            self.import_from_scraper(scraped_chunks)
            self._logging_callback(f"Finished scraping import for {target_url}.", 100)
        else:
            self._logging_callback(f"No importable chunks scraped from {target_url}.", 100)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def import_many_documents(self, sources: list[str]) -> None:
    """Import several local documents; a thin wrapper around import_all(paths=...)."""
    self.import_all(paths=sources)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _import_urls_via_scraper(self, urls: list[str], scrape_all: bool = True) -> None:
    """
    Scrape each URL and import the resulting chunks.

    Args:
        urls: URLs to scrape; None and empty entries are ignored.
        scrape_all: Forwarded to the Scraper constructor.
    """
    pending = [url for url in (urls or []) if url]
    if not pending:
        return

    scraper = Scraper(scrape_all=scrape_all)
    for url in pending:
        self._logging_callback(f"Scraping URL {url}...", 0)
        scraped_chunks = scraper.scrape_target(url)

        if scraped_chunks:
            self._logging_callback(f"Importing scraped chunks from {url}...", 90)
            self.import_from_scraper(scraped_chunks)
            self._logging_callback(f"Stored scraped chunks for {url}.", 100)
        else:
            # Report the failure and carry on with the next URL.
            self._logging_callback(f"Failed to scrape URL {url}!", 100, failed=True)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def import_all(
|
| 96 |
+
self,
|
| 97 |
+
paths: list[str] = None,
|
| 98 |
+
urls: list[str] = None,
|
| 99 |
+
reset_collections: bool = False,
|
| 100 |
+
) -> None:
|
| 101 |
+
"""
|
| 102 |
+
Import documents from local paths and/or URLs into the database.
|
| 103 |
+
|
| 104 |
+
Processes the provided paths and URLs using the appropriate processors,
|
| 105 |
+
combines chunks by language, optionally resets database collections,
|
| 106 |
+
and performs batch imports.
|
| 107 |
+
|
| 108 |
+
Args:
|
| 109 |
+
paths (list[str], optional): List of local file paths to process. Defaults to None.
|
| 110 |
+
urls (list[str], optional): List of website URLs to process. Defaults to None.
|
| 111 |
+
reset_collections (bool, optional): If True, reset the database collections before importing.
|
| 112 |
+
Defaults to False.
|
| 113 |
+
"""
|
| 114 |
+
chunks = self._pipeline(paths, self._docprocessor, reset_collections)
|
| 115 |
+
|
| 116 |
+
if reset_collections:
|
| 117 |
+
self._logging_callback('Resetting database collections...', 60)
|
| 118 |
+
self._service._reset_collections()
|
| 119 |
+
|
| 120 |
+
self._logging_callback('Importing document chunks to database...', 90)
|
| 121 |
+
for lang, ch in chunks.items():
|
| 122 |
+
self._service.batch_import(data_rows=ch, lang=lang)
|
| 123 |
+
|
| 124 |
+
self._import_urls_via_scraper(urls, scrape_all=True)
|
| 125 |
+
|
| 126 |
+
self._logging_callback(
|
| 127 |
+
f'Successfully imported {sum([len(ch) for ch in chunks.values()])} document chunks!',
|
| 128 |
+
100
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _pipeline(
    self,
    sources: list[str],
    processor: ProcessorBase,
    reset_collections: bool,
) -> dict:
    """
    Internal pipeline to process a list of sources using a given processor.

    Handles processing, deduplication (if not resetting), and organizes unique chunks by language.
    A source that yields no chunks, or whose detected language is not one of the
    configured languages, is reported as failed and skipped so that the remaining
    sources are still processed.
    If no new unique data is found, logs a warning and returns empty chunks.

    Args:
        sources (list[str]): List of sources (paths or URLs) to process. None and
            empty strings are ignored.
        processor (ProcessorBase): The processor instance to use for handling sources.
        reset_collections (bool): If True, skip deduplication.

    Returns:
        dict: A dictionary mapping languages to lists of unique chunk dictionaries.
    """
    unique_chunks = {lang: [] for lang in config.get('AVAILABLE_LANGUAGES')}

    sources = [s for s in (sources or []) if s != ""]
    if not sources:
        return unique_chunks

    for source in sources:
        self._logging_callback(f'Starting pipeline for {source}...', 0)
        result = processor.process(source)

        if not result.chunks:
            implogger.error(f"Failed to process {source}!")
            self._logging_callback(f"Failed to process {source}!", 100, result, failed=True)
            continue

        # Checked before deduplication so that no duplicates are deleted for a
        # source that cannot be stored anyway; previously this case raised
        # KeyError and aborted the whole run.
        if result.lang not in unique_chunks:
            implogger.error(f"Failed to process {source}: unsupported language '{result.lang}'")
            self._logging_callback(f"Failed to process {source}!", 100, result, failed=True)
            continue

        if not reset_collections:
            self._deduplicate(result)

        self._logging_callback(f'Storing chunks for {source}...', 100, result)
        unique_chunks[result.lang].extend(result.chunks)

    if not any(unique_chunks.values()):
        self._logging_callback('No new data could be extracted from these sources!', 100)
        implogger.warning("File(s) provided for the insertion do not contain any unique information.")

    return unique_chunks
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def _deduplicate(self, result: ProcessingResult) -> ProcessingResult:
    """
    Drop chunks whose IDs are already known to the database.

    Each chunk's 'chunk_id' is looked up in self._ids. When at least one
    duplicate is found, the deduplication callback is asked (with the source
    name and the number of duplicates) whether the existing entries should be
    replaced. If it agrees, the stored duplicates are deleted and the result
    keeps all of its chunks; otherwise only the chunks with unknown IDs are kept.

    Args:
        result (ProcessingResult): The processing result containing document chunks.
            Its 'chunks' attribute is updated in place.

    Returns:
        ProcessingResult: The same result object that was passed in.
    """
    self._logging_callback('Performing deduplication...', 80)

    known_ids = [entry['chunk_id'] for entry in result.chunks if entry['chunk_id'] in self._ids]
    fresh_chunks = [entry for entry in result.chunks if entry['chunk_id'] not in self._ids]

    implogger.info(f"Found {len(known_ids)} already existing IDs in {len(result.chunks)} collected chunks")

    reimport = False
    if known_ids:
        implogger.info("Duplicates found! Calling deduplication callback...")
        reimport = bool(self._deduplication_callback(result.source, len(known_ids)))
        if reimport:
            implogger.info('Duplicated chunks will be reimported as new...')
            self._service._delete_by_id(known_ids)

    if not reimport:
        result.chunks = fresh_chunks
    return result
|
src/pipeline/processors.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import defaultdict
|
| 2 |
+
import os, re
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from transformers import AutoTokenizer
|
| 6 |
+
|
| 7 |
+
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
|
| 8 |
+
from docling.datamodel.pipeline_options import PdfPipelineOptions, LayoutOptions
|
| 9 |
+
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
|
| 10 |
+
from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
|
| 11 |
+
from docling.chunking import HybridChunker
|
| 12 |
+
from docling_core.types.doc.document import DoclingDocument, TableItem
|
| 13 |
+
|
| 14 |
+
from .utils import *
|
| 15 |
+
|
| 16 |
+
from ..utils.lang import detect_language
|
| 17 |
+
from ..utils.logging import get_logger
|
| 18 |
+
from ..config import config
|
| 19 |
+
|
| 20 |
+
weblogger = get_logger("website_processor")
|
| 21 |
+
datalogger = get_logger("data_processor")
|
| 22 |
+
|
| 23 |
+
class ProcessorBase:
    """Shared conversion, chunking and chunk-preparation logic for source processors."""

    def __init__(self) -> None:
        """
        Initialize the base processor with document conversion and chunking tools.

        Sets up the PDF pipeline options, document converter, tokenizer, and chunker,
        and loads strategies for chunk preparation. The progress callback is read
        from ``config.dbapp['logging_callback']``; a no-op placeholder is used when
        that entry is falsy.
        """
        pipeline_options = PdfPipelineOptions(
            do_ocr=False,
            generate_page_images=False,

            do_layout_analysis=True,
            do_table_structure=True,
            do_cell_matching=True,

            layout_options=LayoutOptions(
                create_orphan_clusters=True,
                keep_empty_clusters=False,
                skip_cell_assignment=False,
            ),
        )
        self._converter: DocumentConverter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            },
        )
        tokenizer = AutoTokenizer.from_pretrained(config.processing.EMBEDDING_MODEL)
        self._chunker = HybridChunker(
            tokenizer=HuggingFaceTokenizer(
                tokenizer=tokenizer,
                max_tokens=config.processing.MAX_TOKENS
            ),
            serializer_provider=EnhansedSerializerProvider(),
            max_tokens=config.processing.MAX_TOKENS,
            merge_peers=True
        )
        self.strategies_processor = StrategiesProcessor()
        self._logging_callback = config.dbapp['logging_callback'] or logging_callback_placeholder

    def process(self, source: Path | str | None = None) -> ProcessingResult:
        """
        Abstract method to be implemented by subclasses for processing sources.

        The optional ``source`` parameter mirrors the signature subclasses are
        called with, so calling the base method always ends in
        NotImplementedError rather than a TypeError.

        Args:
            source (Path | str, optional): The source a subclass would process.

        Raises:
            NotImplementedError: If not overridden in a subclass.
        """
        raise NotImplementedError("This method is not implemented in ProcessorBase")

    def convert_to_txt(self, document: DoclingDocument) -> str:
        """
        Flatten the document body into plain text.

        Tables are exported to a DataFrame and rendered with ``to_string``;
        every other item that has non-empty ``text`` contributes its stripped
        text. The pieces are joined with blank lines.

        Args:
            document (DoclingDocument): The document to flatten.

        Returns:
            str: The plain-text rendering of the document body.
        """
        plain_text = []
        for node, _ in document.iterate_items(root=document.body, with_groups=False):
            if isinstance(node, TableItem):
                table = node.export_to_dataframe(document)
                plain_text.append(table.to_string(index=False, na_rep=''))
            elif hasattr(node, 'text') and node.text:
                plain_text.append(node.text.strip())
        return '\n\n'.join(plain_text)

    def _prepare_chunks(self, document_name: str, document_content: str, chunks: list[str]) -> list[dict]:
        """
        Prepare chunks by applying strategies to generate properties for each chunk.

        Args:
            document_name (str): The name or identifier of the document.
            document_content (str): The full content of the document.
            chunks (list[str]): List of text chunks to prepare.

        Returns:
            list[dict]: List of dictionaries, each mapping a strategy name to the
            value that strategy produced for the chunk.
        """
        # The set of strategies does not change while preparing, so read it once.
        strategy_names = list(self.strategies_processor.list_strategies())
        return [
            {
                prop: self.strategies_processor.apply_strategy(
                    strategy_name=prop,
                    arguments=StrategyArguments(document_name, document_content, chunk),
                )
                for prop in strategy_names
            }
            for chunk in chunks
        ]

    def _clean_content(self, document_content: str) -> str:
        """
        Clean the document content by removing garbage symbols and normalizing whitespace.

        Handles specific replacements for punctuation, symbols, and line breaks.
        The substitutions are order-dependent and are applied exactly as listed.

        Args:
            document_content (str): The raw document content to clean.

        Returns:
            str: The cleaned document content.
        """
        cleaned = re.sub(r'\s+/\s+', '/', document_content)
        cleaned = re.sub(r'\s+\.\s+', '.', cleaned)
        # NOTE(review): this turns every comma followed by whitespace into a
        # period - confirm that this is intended.
        cleaned = re.sub(r',\s+', '.', cleaned)
        cleaned = re.sub(r'\s+\|\s+', ' ', cleaned)
        cleaned = re.sub(r'\/\s+', '/', cleaned)
        cleaned = re.sub(r'\s+/', '/', cleaned)
        cleaned = re.sub(r'\s+\.', '.', cleaned)
        # NOTE(review): the next two substitutions keep only the first number
        # and drop the second group - confirm.
        cleaned = re.sub(r'(\d+)\s*,\s*(\d{4})', r'\1', cleaned)
        cleaned = re.sub(r'(\d+)\s*/\s*(\d+)', r'\1', cleaned)
        # NOTE(review): the replacement equals the match, so this is a no-op as written.
        cleaned = re.sub(r'\.(\d{4})', r'.\1', cleaned)

        # NOTE(review): both sides of each pair look identical here; presumably
        # the left-hand side was a decomposed umlaut - verify the literals.
        cleaned = cleaned.replace('ä', 'ä').replace('ö', 'ö').replace('ü', 'ü')

        cleaned = re.sub(r'\n\s*\n+', '\n\n', cleaned)
        cleaned = re.sub(r' +', ' ', cleaned)

        return cleaned

    def _extract_document_content(self, document: DoclingDocument) -> str:
        """
        Extract and compile text content from the document into a single string.

        Organizes text items by page, sorts them by position, and joins them
        while handling line breaks and spacing. Items without provenance are
        ignored because they cannot be positioned.

        Args:
            document (DoclingDocument): The document object to extract content from.

        Returns:
            str: The cleaned, compiled text content.
        """
        line_threshold = 15       # vertical gap that starts a new line group
        paragraph_threshold = 50  # vertical gap that additionally inserts a blank entry

        page_texts = defaultdict(list)
        for text_item in document.texts:
            stripped = text_item.text.strip()
            if not stripped or not text_item.prov:
                continue

            prov = text_item.prov[0]
            if not prov:
                continue
            bbox = prov.bbox
            page_texts[prov.page_no].append({
                'text': stripped,
                'top': bbox.t,
                'left': bbox.l,
                'bottom': bbox.b,
            })

        full_page_texts = []
        for page_number in sorted(page_texts):
            # Larger 'top' first, then left to right.
            # assumes larger t means higher on the page - TODO confirm
            ordered_items = sorted(
                page_texts[page_number],
                key=lambda entry: (-entry['top'], entry['left']),
            )

            content = []
            last_bottom = None

            for item in ordered_items:
                if last_bottom is not None:
                    gap = last_bottom - item['bottom']
                    if gap > line_threshold:
                        if content:
                            full_page_texts.append(' '.join(content))
                            content = []
                        if gap > paragraph_threshold:
                            full_page_texts.append("")

                content.append(item['text'])
                last_bottom = item['bottom']

            if content:
                full_page_texts.append(' '.join(content))

        return self._clean_content('\n\n'.join(full_page_texts))

    def _collect_chunks(self, document: DoclingDocument) -> list[str]:
        """
        Collect contextualized chunks from the document using the chunker.

        Args:
            document (DoclingDocument): The document to chunk.

        Returns:
            list[str]: List of enriched text chunks.
        """
        return [
            self._chunker.contextualize(chunk=base_chunk)
            for base_chunk in self._chunker.chunk(dl_doc=document)
        ]

    def _collect_chunks_fallback(self, document_content: str, overlap: int = 50) -> list[str]:
        """
        Fallback method to chunk the document content manually using tokenization.

        Splits the content into overlapping token windows of at most
        ``self._chunker.max_tokens`` tokens and stops once a window reaches the
        end of the text, so no redundant tail windows are produced.

        Args:
            document_content (str): The full content extracted from document.
            overlap (int, optional): Number of tokens shared by consecutive
                windows. Values outside ``[0, max_tokens)`` are adjusted so the
                window always advances. Defaults to 50.

        Returns:
            list[str]: List of text chunks.

        Raises:
            ValueError: If the chunker's token limit is smaller than 1.
        """
        tokenizer_wrapper = self._chunker.tokenizer
        tokenizer = getattr(tokenizer_wrapper, 'tokenizer', tokenizer_wrapper)

        chunk_size = self._chunker.max_tokens
        if chunk_size < 1:
            raise ValueError(f"Cannot chunk with a token limit of {chunk_size}")

        # An overlap of a full window or more would give a zero or negative
        # step; fall back to half a window in that case.
        overlap = max(0, overlap)
        if overlap >= chunk_size:
            overlap = chunk_size // 2
        step = chunk_size - overlap

        tokens = tokenizer.encode(document_content)

        collected_chunks = []
        for start in range(0, len(tokens), step):
            window = tokens[start:start + chunk_size]
            collected_chunks.append(
                tokenizer.decode(
                    window,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )
            )
            # Every later window would be a suffix of this one.
            if start + chunk_size >= len(tokens):
                break

        return collected_chunks
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
class DocumentProcessor(ProcessorBase):
    """Processor for documents stored on the local file system."""

    def process(self, source: Path | str) -> ProcessingResult:
        """
        Process a single local document, converting it to text, chunking, and preparing for import.

        Handles document conversion, chunk collection (with a re-chunking fallback
        when the chunker yields at most one chunk), chunk preparation, and
        language detection.

        Args:
            source (Path | str): Path to the document to process.

        Returns:
            ProcessingResult: The result containing chunks, source name, and detected language.
            If the source is not an existing file, or conversion/chunking fails,
            the result has ``chunks=None`` and an empty ``lang`` so the caller
            can report the failure and continue with other sources.
        """
        # isfile() is False for missing paths as well, so it covers both checks.
        if not os.path.isfile(source):
            datalogger.error(f"Failed to initiate processing pipeline for source {source}: file does not exist")
            return ProcessingResult(source=source, chunks=None, lang='')

        document_name = os.path.basename(source)
        datalogger.info(f"Initiating processing pipeline for source {document_name}")

        try:
            self._logging_callback(f'Converting source {document_name}...', 20)
            document = self._converter.convert(source).document

            self._logging_callback(f'Collecting chunks from {document_name}...', 40)
            collected_chunks = self._collect_chunks(document)
            document_content = MarkdownDocSerializer(doc=document).serialize().text

            if len(collected_chunks) <= 1:  # Document content manual extraction
                fallback_content = self._extract_document_content(document)
                if fallback_content.strip():
                    fallback_document = self._converter.convert_string(
                        content=fallback_content,
                        format=InputFormat.MD
                    ).document
                    fallback_chunks = self._collect_chunks(fallback_document)
                    # Only switch to the fallback when it produced something;
                    # otherwise keep whatever the chunker found.
                    if fallback_chunks:
                        document_content = fallback_content
                        collected_chunks = fallback_chunks
        except Exception as exc:
            # Boundary of the per-document pipeline: one broken document must
            # not abort a batch import, so the failure is logged and reported
            # through an empty result.
            datalogger.error(f"Failed to process source {document_name}: {exc}")
            return ProcessingResult(source=document_name, chunks=None, lang='')

        self._logging_callback(f'Preparing chunks for {document_name} for importing...', 60)
        prepared_chunks = self._prepare_chunks(document_name, document_content, collected_chunks)

        datalogger.info(f"Successfully collected {len(prepared_chunks)} chunks from {document_name}")

        return ProcessingResult(
            chunks=prepared_chunks,
            source=document_name,
            lang=detect_language(document_content),
        )
|
src/pipeline/utilclasses.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
|
src/pipeline/utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .strategies_processor import StrategyArguments, StrategiesProcessor
|
| 2 |
+
from .serializer import EnhansedSerializerProvider
|
| 3 |
+
from .utilclasses import *
|
src/pipeline/utils/serializer.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer, ChunkingSerializerProvider
|
| 2 |
+
from docling_core.transforms.serializer.base import BaseTableSerializer, SerializationResult
|
| 3 |
+
from docling_core.transforms.serializer.common import create_ser_result
|
| 4 |
+
from docling_core.types.doc.document import RichTableCell
|
| 5 |
+
|
| 6 |
+
class EnhancedTableSerializer(BaseTableSerializer):
    """Serialize a table as a nested bullet list keyed by each row's first cell."""

    def serialize(self, *, item, doc_serializer, doc, **kwargs) -> SerializationResult:
        """
        Render the table as text.

        The first non-blank row is treated as the header row. Every following
        row with at least two cells and a non-empty first cell becomes a
        ``- <first cell>:`` line followed by one `` - <header>: <value>`` line
        per non-empty remaining cell.

        Args:
            item: The table item to serialize.
            doc_serializer: The document serializer, used for excluded refs and
                for serializing rich cells.
            doc: The document the table belongs to.
            **kwargs: Passed through to the document serializer.

        Returns:
            SerializationResult: The rendered text; empty text when the table is
            excluded, has no grid, or contains only blank rows.
        """
        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
            return create_ser_result(text='')

        grid = item.data.grid
        if not grid:
            return create_ser_result(text='')

        row_cells = []
        for row in grid:
            clean_row = []
            for cell in row:
                if isinstance(cell, RichTableCell):
                    ser = doc_serializer.serialize(item=cell.ref.resolve(doc), **kwargs)
                    clean_row.append(ser.text.strip())
                else:
                    clean_row.append((cell.text or "").strip())
            # Rows consisting only of blank cells carry no information.
            if any(clean_row):
                row_cells.append(clean_row)

        # A grid whose rows are all blank leaves nothing to use as a header;
        # indexing row_cells[0] would raise IndexError.
        if not row_cells:
            return create_ser_result(text='')

        headers = row_cells[0]
        data_rows = row_cells[1:]

        lines = []

        for row in data_rows:
            if len(row) < 2 or not row[0].strip():
                continue

            main_key = row[0].strip().replace('\n', ' ')
            lines.append(f'- {main_key}:')

            for i in range(1, len(row)):
                value = row[i].strip().replace('\n', ' ')
                if not value:
                    continue
                # Rows may be longer than the header row; such cells get an empty label.
                sub_header = headers[i].strip().replace('\n', ' ') if i < len(headers) else ""
                lines.append(f' - {sub_header}: {value}')

            lines.append("")

        final_text = "\n".join(lines).rstrip()
        return create_ser_result(text=final_text, span_source=item)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class EnhansedSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs EnhancedTableSerializer into chunking."""

    def get_serializer(self, doc):
        """Return a ChunkingDocSerializer for ``doc`` that uses EnhancedTableSerializer for tables."""
        table_serializer = EnhancedTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=table_serializer)
|
src/pipeline/utils/strategies_processor.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, re, importlib.util
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
from src.config import config
|
| 5 |
+
from src.utils.logging import get_logger
|
| 6 |
+
|
| 7 |
+
logger = get_logger('pipeline.strats')
|
| 8 |
+
|
| 9 |
+
@dataclass
|
| 10 |
+
class StrategyArguments:
|
| 11 |
+
name: str = None
|
| 12 |
+
content: str = None
|
| 13 |
+
chunk: str = None
|
| 14 |
+
|
| 15 |
+
class StrategiesProcessor:
    """Loads strategy modules from the strategies directory and applies them by name."""

    def __init__(self) -> None:
        """Ensure the strategies directory exists and load every strategy found in it."""
        os.makedirs(config.weaviate.STRATEGIES_PATH, exist_ok=True)

        self._strategies: dict = self._load_strategies()

    def list_strategies(self) -> list[str]:
        """Return the names of all loaded strategies as a list."""
        return list(self._strategies)

    def apply_strategy(self, strategy_name: str, arguments: StrategyArguments | dict):
        """
        Run the named strategy.

        Args:
            strategy_name (str): Name of a loaded strategy.
            arguments (StrategyArguments | dict): Either a StrategyArguments
                instance, or a mapping read through the keys 'document_name',
                'document_content' and 'chunk'.

        Returns:
            Whatever the strategy's run() function returns.

        Raises:
            ValueError: If no strategy with that name is loaded.
            RuntimeError: If the strategy fails; the original exception is chained.
        """
        if strategy_name not in self._strategies:
            raise ValueError(f"Cannot apply strategy '{strategy_name}': strategy not found!")

        strategy = self._strategies[strategy_name]
        try:
            if isinstance(arguments, StrategyArguments):
                return strategy.run(arguments.name, arguments.content, arguments.chunk)
            return strategy.run(
                arguments.get('document_name', ""),
                arguments.get('document_content', ""),
                arguments.get('chunk', None)
            )
        except Exception as e:
            raise RuntimeError(f"Cannot apply strategy '{strategy_name}': {e}") from e

    def _load_strategies(self) -> dict:
        """
        Import every ``strat_<name>.py`` file in the strategies directory.

        Files are visited in sorted order so the resulting mapping is
        deterministic. Modules without a ``run`` attribute are skipped with a
        warning.

        Returns:
            dict: Mapping of strategy name to the loaded module.
        """
        # NOTE: every matching file is executed as Python code; the strategies
        # directory must only contain trusted files.
        loaded_strategies = {}
        strategies_path = config.weaviate.STRATEGIES_PATH
        for strat_file in sorted(os.listdir(strategies_path)):
            strat_name = self._extract_strategy_name(strat_file)
            if not strat_name:
                continue

            strat_path = os.path.join(strategies_path, strat_file)
            # A directory that happens to match the naming pattern cannot be executed.
            if not os.path.isfile(strat_path):
                continue

            spec = importlib.util.spec_from_file_location(
                name=strat_name,
                location=strat_path
            )
            if spec is None or spec.loader is None:
                logger.warning(f"Found strategy '{strat_name}' could not be loaded!")
                continue

            strategy = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(strategy)

            if not hasattr(strategy, 'run'):
                logger.warning(f"Found strategy '{strat_name}' has no valid run() function!")
                continue

            loaded_strategies[strat_name] = strategy

        logger.info(f"Loaded {len(loaded_strategies)} strategies")
        return loaded_strategies

    def _extract_strategy_name(self, strat_file: str) -> str | None:
        """Return the ``<name>`` part of ``strat_<name>.py``, or None if the file name does not match."""
        match = re.fullmatch(r'^strat_(.*)\.py$', strat_file)
        return match.group(1) if match else None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
src/pipeline/utils/utilclasses.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
def logging_callback_placeholder(*_, **__):
    """
    No-op stand-in used when no logging callback is configured.

    Accepts any positional and keyword arguments, because callers pass
    keyword arguments such as ``failed=True``.
    """
    pass
|
| 5 |
+
|
| 6 |
+
def deduplication_callback_placeholder(*_ignored) -> bool:
    """Stand-in deduplication callback: ignores its arguments and always answers False."""
    answer = False
    return answer
|
| 8 |
+
|
| 9 |
+
@dataclass
|
| 10 |
+
class ProcessingResult:
|
| 11 |
+
chunks: list[dict]
|
| 12 |
+
source: str
|
| 13 |
+
lang: str
|
src/rag/__init__.py
ADDED
|
File without changes
|
src/rag/agent_chain.py
ADDED
|
@@ -0,0 +1,1022 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.runnables import RunnableConfig
|
| 2 |
+
from langsmith import traceable
|
| 3 |
+
from langchain.tools import tool
|
| 4 |
+
from langchain.agents import create_agent
|
| 5 |
+
from langchain_core.messages import (
|
| 6 |
+
HumanMessage,
|
| 7 |
+
AIMessage,
|
| 8 |
+
SystemMessage,
|
| 9 |
+
)
|
| 10 |
+
from langchain.agents.middleware import ModelFallbackMiddleware
|
| 11 |
+
from langchain.agents.structured_output import ProviderStrategy
|
| 12 |
+
|
| 13 |
+
import uuid
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
import re
|
| 17 |
+
import random
|
| 18 |
+
import glob
|
| 19 |
+
from datetime import datetime
|
| 20 |
+
|
| 21 |
+
from src.database.weavservice import WeaviateService
|
| 22 |
+
|
| 23 |
+
from src.rag.utilclasses import *
|
| 24 |
+
from src.const.agent_response_constants import *
|
| 25 |
+
from src.rag.middleware import AgentChainMiddleware as chainmdw
|
| 26 |
+
from src.rag.prompts import PromptConfigurator as promptconf
|
| 27 |
+
from src.rag.models import ModelConfigurator as modelconf
|
| 28 |
+
from src.rag.input_handler import InputHandler
|
| 29 |
+
from src.rag.response_formatter import ResponseFormatter
|
| 30 |
+
from src.rag.scope_guardian import ScopeGuardian
|
| 31 |
+
# from src.rag.quality_score_handler import QualityEvaluationResult, QualityScoreHandler
|
| 32 |
+
from src.rag.language_detection import LanguageDetector
|
| 33 |
+
|
| 34 |
+
from src.utils.logging import get_logger
|
| 35 |
+
from src.utils.lang import get_language_name
|
| 36 |
+
from src.config import config
|
| 37 |
+
|
| 38 |
+
from ..cache.cache import Cache
|
| 39 |
+
|
| 40 |
+
chain_logger = get_logger('agent_chain')
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class ExecutiveAgentChain:
|
| 44 |
+
def __init__(self, language: str = 'en', session_id: str | None = None) -> None:
    """
    Create an agent chain bound to a single chat session.

    Args:
        language: Language code the session starts in. It is stored both as
            the initial language and as the current (stored) language.
        session_id: Optional identifier supplied by the caller. When it is
            missing or empty, a random UUID4 string is generated instead.
    """
    self._initial_language = language
    self._stored_language = language

    # Collaborators: database service, agents, history buffer and cache.
    self._dbservice = WeaviateService()
    self._agents, self._config = self._init_agents()
    self._conversation_history = []
    self._cache = Cache.get_cache()

    # Confidence scoring is intentionally disabled here because the extra
    # model call adds latency and has not been reliable enough to justify it.
    # if config.chain.EVALUATE_RESPONSE_QUALITY:
    #     self._quality_handler = QualityScoreHandler()
    self._language_detector = LanguageDetector()

    # One identifier serves as both user id and session id for this chain.
    session_key = session_id or str(uuid.uuid4())
    self._user_id = session_key

    # Profile fields start out empty and are filled in as the conversation goes on.
    self._conversation_state: ConversationState = dict(
        session_id=session_key,
        user_id=session_key,
        user_language=None,
        user_name=None,
        experience_years=None,
        leadership_years=None,
        field=None,
        interest=None,
        qualification_level=None,
        program_interest=[],
        suggested_program=None,
        handover_requested=None,
        topics_discussed=[],
        preferences_known=False,
    )

    # Counters for scope violations (used for escalation).
    self._scope_violation_counts: dict[str, int] = {}
    self._aggressive_violation_count = 0

    chain_logger.info(f"Initialized new Agent Chain for language '{language}' with user_id: {self._user_id}")
|
| 84 |
+
|
| 85 |
+
def _retrieve_context(self, query: str, program: str, language: str = None):
    """
    Send the query to the vector database to retrieve additional information about the program.

    Args:
        query: Keywords depicting information you want to retrieve in the primary language.
        program: Name of the program (either 'emba', 'iemba' or 'emba x') for which the information is requested.
        language: Optional parameter (either 'en' for English language or 'de' for German language). This parameter selects the language of the database to query from. The input query must be written in the same language as the selected language. Use this parameter only if there's not enough information in your main language.
    """
    # NOTE: `_init_agents` registers this method as a tool with
    # `parse_docstring=True`, so the docstring above is part of the tool
    # definition and must not be reworded casually.

    # Anything other than 'en'/'de' falls back to the chain's initial language.
    lang = self._initial_language
    if language in ('en', 'de'):
        lang = language

    # Errors raised by the database service propagate to the caller unchanged.
    response, _unused = self._dbservice.query(
        query=query,
        lang=lang,
        limit=config.get('TOP_K_RETRIEVAL'),
        property_filters={
            'programs': [program],
        },
    )

    # Concatenate the 'body' property of every returned object, blank-line separated.
    bodies = [doc.properties.get('body', '') for doc in response.objects]
    return '\n\n'.join(bodies)
|
| 108 |
+
|
| 109 |
+
def _call_emba_agent(self, query: str) -> str:
    """
    Invokes the EMBA support agent to retrieve more detailed information about the EMBA program.

    Args:
        query: Query to the EMBA support agent. Provide collected user data in the query if possible.
    """
    # NOTE: `_init_agents` registers this method as a tool with
    # `parse_docstring=True`, so the docstring above is part of the tool
    # definition.
    try:
        structured_response = self._query(
            agent=self._agents['emba'],
            messages=[HumanMessage(query)],
            thread_id=f"emba_{hash(query)}",
        )
        return structured_response.response
    except Exception as e:
        chain_logger.error(f"EMBA Agent error: {e}")
        # Chain the original error explicitly so the root cause is kept as
        # __cause__ of the generic error surfaced to the caller.
        raise RuntimeError("Unable to retrieve EMBA information at this time.") from e
|
| 126 |
+
|
| 127 |
+
def _call_iemba_agent(self, query: str) -> str:
    """
    Invokes the IEMBA support agent to retrieve more detailed information about the IEMBA program.

    Args:
        query: Query to the IEMBA support agent. Provide collected user data in the query if possible.
    """
    # NOTE: `_init_agents` registers this method as a tool with
    # `parse_docstring=True`, so the docstring above is part of the tool
    # definition.
    try:
        structured_response = self._query(
            agent=self._agents['iemba'],
            messages=[HumanMessage(query)],
            # Prefix the thread id with this agent's own key so the same query
            # sent to different programme agents never shares a thread id.
            thread_id=f"iemba_{hash(query)}",
        )
        return structured_response.response
    except Exception as e:
        chain_logger.error(f"IEMBA Agent error: {e}")
        # Chain the original error explicitly so the root cause is kept as
        # __cause__ of the generic error surfaced to the caller.
        raise RuntimeError("Unable to retrieve IEMBA information at this time.") from e
|
| 144 |
+
|
| 145 |
+
def _call_embax_agent(self, query: str) -> str:
    """
    Invokes the emba X support agent to retrieve more detailed information about the emba X program.

    Args:
        query: Query to the emba X support agent. Provide collected user data in the query if possible.
    """
    # NOTE: `_init_agents` registers this method as a tool with
    # `parse_docstring=True`, so the docstring above is part of the tool
    # definition.
    try:
        structured_response = self._query(
            agent=self._agents['embax'],
            messages=[HumanMessage(query)],
            # Prefix the thread id with this agent's own key so the same query
            # sent to different programme agents never shares a thread id.
            thread_id=f"embax_{hash(query)}",
        )
        return structured_response.response
    except Exception as e:
        chain_logger.error(f"emba X Agent error: {e}")
        # Chain the original error explicitly so the root cause is kept as
        # __cause__ of the generic error surfaced to the caller.
        raise RuntimeError("Unable to retrieve emba X information at this time.") from e
|
| 162 |
+
|
| 163 |
+
def _init_agents(self):
    """
    Create the lead agent and one sub-agent per programme.

    The lead agent receives the three `call_*_agent` tools and a structured
    response format; each sub-agent ('emba', 'iemba', 'embax') receives only
    the `retrieve_context` tool. All agents share one fallback middleware
    instance.

    Returns:
        A `(agents, config)` tuple: `agents` maps 'lead', 'emba', 'iemba' and
        'embax' to the created agents, and `config` is a runnable config
        whose `thread_id` is 0.
    """
    runnable_config: RunnableConfig = {'configurable': {'thread_id': 0}}
    model_fallbacks = ModelFallbackMiddleware(*modelconf.get_fallback_models())

    def build_tool(tool_name, method):
        # parse_docstring=True: the wrapped method's docstring is parsed for
        # the tool definition.
        return tool(
            name_or_callable=tool_name,
            runnable=method,
            return_direct=False,
            parse_docstring=True,
        )

    retrieval_tool = build_tool('retrieve_context', self._retrieve_context)
    delegation_tools = [
        build_tool('call_emba_agent', self._call_emba_agent),
        build_tool('call_iemba_agent', self._call_iemba_agent),
        build_tool('call_embax_agent', self._call_embax_agent),
    ]

    lead_agent = create_agent(
        name="lead_agent",
        model=modelconf.get_main_agent_model(),
        tools=delegation_tools,
        state_schema=LeadInformationState,
        system_prompt=promptconf.get_configured_agent_prompt('lead', language=self._initial_language),
        # Lead agent: wrappers first, fallback last.
        middleware=[
            chainmdw.get_tool_wrapper(),
            chainmdw.get_model_wrapper(),
            model_fallbacks,
        ],
        context_schema=AgentContext,
        response_format=ProviderStrategy(
            StructuredAgentResponse
        ),
    )

    agents = {'lead': lead_agent}
    for program_key in ('emba', 'iemba', 'embax'):
        agents[program_key] = create_agent(
            name=f"{program_key}_agent",
            model=modelconf.get_subagent_model(),
            tools=[retrieval_tool],
            state_schema=LeadInformationState,
            system_prompt=promptconf.get_configured_agent_prompt(program_key, language=self._initial_language),
            # Sub-agents: fallback first, then the wrappers.
            middleware=[
                model_fallbacks,
                chainmdw.get_tool_wrapper(),
                chainmdw.get_model_wrapper(),
            ],
            context_schema=AgentContext,
        )

    return agents, runnable_config
|
| 229 |
+
|
| 230 |
+
def _extract_experience_years(self, conversation: str) -> int | None:
|
| 231 |
+
"""Extract years of professional experience from conversation text."""
|
| 232 |
+
# Look for patterns like "10 years", "5 years experience", etc.
|
| 233 |
+
patterns = [
|
| 234 |
+
r'(\d+)\s*years?\s*(?:of\s*)?(?:experience|work)',
|
| 235 |
+
r'(\d+)\s*years?\s*in\s*(?:the\s*)?(?:field|industry)',
|
| 236 |
+
r'working\s*for\s*(\d+)\s*years?',
|
| 237 |
+
r'(\d+)\s*Jahre\s*(?:Erfahrung|Berufserfahrung)', # German
|
| 238 |
+
]
|
| 239 |
+
for pattern in patterns:
|
| 240 |
+
match = re.search(pattern, conversation, re.IGNORECASE)
|
| 241 |
+
if match:
|
| 242 |
+
return int(match.group(1))
|
| 243 |
+
return None
|
| 244 |
+
|
| 245 |
+
def _extract_leadership_years(self, conversation: str) -> int | None:
|
| 246 |
+
"""Extract years of leadership experience from conversation text."""
|
| 247 |
+
patterns = [
|
| 248 |
+
r'(\d+)\s*years?\s*(?:of\s*)?(?:leadership|management|managing)',
|
| 249 |
+
r'(?:lead|led|manage|managed)\s*(?:for\s*)?(\d+)\s*years?',
|
| 250 |
+
r'(\d+)\s*Jahre\s*(?:Führungserfahrung|Führung)', # German
|
| 251 |
+
]
|
| 252 |
+
for pattern in patterns:
|
| 253 |
+
match = re.search(pattern, conversation, re.IGNORECASE)
|
| 254 |
+
if match:
|
| 255 |
+
return int(match.group(1))
|
| 256 |
+
return None
|
| 257 |
+
|
| 258 |
+
def _extract_field(self, conversation: str) -> str | None:
|
| 259 |
+
"""Extract professional field/industry from conversation text."""
|
| 260 |
+
# Common fields mentioned in executive education
|
| 261 |
+
fields = [
|
| 262 |
+
'finance', 'banking', 'technology', 'tech', 'IT', 'healthcare',
|
| 263 |
+
'consulting', 'manufacturing', 'retail', 'marketing', 'sales',
|
| 264 |
+
'engineering', 'pharma', 'telecommunications', 'energy',
|
| 265 |
+
'Finanzwesen', 'Technologie', 'Gesundheitswesen', 'Beratung' # German
|
| 266 |
+
]
|
| 267 |
+
conversation_lower = conversation.lower()
|
| 268 |
+
for field in fields:
|
| 269 |
+
if field.lower() in conversation_lower:
|
| 270 |
+
return field.capitalize()
|
| 271 |
+
return None
|
| 272 |
+
|
| 273 |
+
def _extract_interest(self, conversation: str) -> str | None:
|
| 274 |
+
"""Extract content interests from conversation text."""
|
| 275 |
+
# Look for interest indicators
|
| 276 |
+
interests = [
|
| 277 |
+
'strategy', 'innovation', 'leadership', 'digital transformation',
|
| 278 |
+
'finance', 'operations', 'marketing', 'entrepreneurship',
|
| 279 |
+
'social impact', 'technology', 'management',
|
| 280 |
+
'Strategie', 'Innovation', 'Führung', 'Digitalisierung' # German
|
| 281 |
+
]
|
| 282 |
+
conversation_lower = conversation.lower()
|
| 283 |
+
found_interests = [interest for interest in interests
|
| 284 |
+
if interest.lower() in conversation_lower]
|
| 285 |
+
return ', '.join(found_interests) if found_interests else None
|
| 286 |
+
|
| 287 |
+
def _extract_name(self, conversation: str) -> str | None:
|
| 288 |
+
"""Extract user's name from conversation text."""
|
| 289 |
+
patterns = [
|
| 290 |
+
r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
|
| 291 |
+
r"(?:this is|it's)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
|
| 292 |
+
r"(?:ich heiße|mein Name ist|ich bin)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # German
|
| 293 |
+
]
|
| 294 |
+
for pattern in patterns:
|
| 295 |
+
match = re.search(pattern, conversation, re.IGNORECASE)
|
| 296 |
+
if match:
|
| 297 |
+
name = match.group(1).strip()
|
| 298 |
+
# Filter out common words that might be误ly matched
|
| 299 |
+
excluded = ['interested', 'looking', 'working', 'searching', 'asking']
|
| 300 |
+
if name.lower() not in excluded:
|
| 301 |
+
return name
|
| 302 |
+
return None
|
| 303 |
+
|
| 304 |
+
def _detect_handover_request(self, conversation: str) -> bool:
|
| 305 |
+
"""Detect if user requested appointment, callback, or contact."""
|
| 306 |
+
# Keywords indicating handover request
|
| 307 |
+
handover_keywords = [
|
| 308 |
+
'appointment', 'call me', 'contact me', 'schedule', 'meeting',
|
| 309 |
+
'callback', 'reach out', 'follow up', 'get in touch', 'speak with',
|
| 310 |
+
'talk to', 'consultation', 'discuss with', 'meet with',
|
| 311 |
+
'Termin', 'Rückruf', 'kontaktieren', 'Gespräch', 'anrufen', # German
|
| 312 |
+
'zurückrufen', 'Beratung', 'treffen'
|
| 313 |
+
]
|
| 314 |
+
conversation_lower = conversation.lower()
|
| 315 |
+
return any(keyword.lower() in conversation_lower for keyword in handover_keywords)
|
| 316 |
+
|
| 317 |
+
def _previous_response_offered_booking(self) -> bool:
|
| 318 |
+
"""Return True if the latest assistant turn offered booking as a next step."""
|
| 319 |
+
booking_offer_terms = [
|
| 320 |
+
"appointment slots",
|
| 321 |
+
"book an appointment",
|
| 322 |
+
"book a consultation",
|
| 323 |
+
"appointment booking",
|
| 324 |
+
"show you available appointments",
|
| 325 |
+
"show appointment options",
|
| 326 |
+
"terminbuchung",
|
| 327 |
+
"termin buchen",
|
| 328 |
+
"termine anzeigen",
|
| 329 |
+
"verfügbare termine",
|
| 330 |
+
"beratungstermin",
|
| 331 |
+
]
|
| 332 |
+
|
| 333 |
+
for message in reversed(self._conversation_history):
|
| 334 |
+
if not isinstance(message, AIMessage):
|
| 335 |
+
continue
|
| 336 |
+
content = getattr(message, "content", "") or getattr(message, "text", "")
|
| 337 |
+
if isinstance(content, list):
|
| 338 |
+
content = " ".join(str(part) for part in content)
|
| 339 |
+
content_lower = str(content).lower()
|
| 340 |
+
return any(term in content_lower for term in booking_offer_terms)
|
| 341 |
+
|
| 342 |
+
return False
|
| 343 |
+
|
| 344 |
+
def _get_latest_ai_message_content(self, skip_latest: bool = False) -> str:
|
| 345 |
+
"""Return the latest assistant message content from conversation history."""
|
| 346 |
+
ai_messages_seen = 0
|
| 347 |
+
|
| 348 |
+
for message in reversed(self._conversation_history):
|
| 349 |
+
if not isinstance(message, AIMessage):
|
| 350 |
+
continue
|
| 351 |
+
|
| 352 |
+
ai_messages_seen += 1
|
| 353 |
+
if skip_latest and ai_messages_seen == 1:
|
| 354 |
+
continue
|
| 355 |
+
|
| 356 |
+
content = getattr(message, "content", "") or getattr(message, "text", "")
|
| 357 |
+
if isinstance(content, list):
|
| 358 |
+
return " ".join(str(part) for part in content)
|
| 359 |
+
return str(content)
|
| 360 |
+
|
| 361 |
+
return ""
|
| 362 |
+
|
| 363 |
+
def _is_booking_preference_follow_up(self, query: str) -> bool:
|
| 364 |
+
"""Detect short follow-up answers that continue an active booking flow."""
|
| 365 |
+
query_lower = query.lower().strip()
|
| 366 |
+
if not query_lower:
|
| 367 |
+
return False
|
| 368 |
+
|
| 369 |
+
preference_terms = [
|
| 370 |
+
"online",
|
| 371 |
+
"on-site",
|
| 372 |
+
"onsite",
|
| 373 |
+
"in person",
|
| 374 |
+
"in-person",
|
| 375 |
+
"st.gallen",
|
| 376 |
+
"st. gallen",
|
| 377 |
+
"morning",
|
| 378 |
+
"mornings",
|
| 379 |
+
"afternoon",
|
| 380 |
+
"afternoons",
|
| 381 |
+
"evening",
|
| 382 |
+
"beginning of the week",
|
| 383 |
+
"start of the week",
|
| 384 |
+
"end of the week",
|
| 385 |
+
"monday",
|
| 386 |
+
"tuesday",
|
| 387 |
+
"wednesday",
|
| 388 |
+
"thursday",
|
| 389 |
+
"friday",
|
| 390 |
+
"morgens",
|
| 391 |
+
"vormittag",
|
| 392 |
+
"vormittags",
|
| 393 |
+
"nachmittag",
|
| 394 |
+
"nachmittags",
|
| 395 |
+
"abends",
|
| 396 |
+
"wochenanfang",
|
| 397 |
+
"anfang der woche",
|
| 398 |
+
"ende der woche",
|
| 399 |
+
"montag",
|
| 400 |
+
"dienstag",
|
| 401 |
+
"mittwoch",
|
| 402 |
+
"donnerstag",
|
| 403 |
+
"freitag",
|
| 404 |
+
"vor ort",
|
| 405 |
+
"vor-ort",
|
| 406 |
+
"persönlich",
|
| 407 |
+
"persoenlich",
|
| 408 |
+
"hybrid",
|
| 409 |
+
]
|
| 410 |
+
|
| 411 |
+
if any(term in query_lower for term in preference_terms):
|
| 412 |
+
return True
|
| 413 |
+
|
| 414 |
+
return False
|
| 415 |
+
|
| 416 |
+
def _previous_response_requested_booking_preferences(self) -> bool:
|
| 417 |
+
"""Return True when the previous assistant turn asked clarifying booking questions."""
|
| 418 |
+
content_lower = self._get_latest_ai_message_content().lower()
|
| 419 |
+
if not content_lower:
|
| 420 |
+
return False
|
| 421 |
+
|
| 422 |
+
booking_context_terms = [
|
| 423 |
+
"appointment options",
|
| 424 |
+
"available appointments",
|
| 425 |
+
"available slots",
|
| 426 |
+
"appointment slots",
|
| 427 |
+
"online-terminoptionen",
|
| 428 |
+
"terminoptionen",
|
| 429 |
+
"verfügbare slots",
|
| 430 |
+
"verfügbare termine",
|
| 431 |
+
"beratungsgespräch",
|
| 432 |
+
"beratung",
|
| 433 |
+
]
|
| 434 |
+
clarification_terms = [
|
| 435 |
+
"do you prefer",
|
| 436 |
+
"would you prefer",
|
| 437 |
+
"which programme",
|
| 438 |
+
"which program",
|
| 439 |
+
"one short question",
|
| 440 |
+
"final question",
|
| 441 |
+
"when i know this",
|
| 442 |
+
"bitte noch kurz",
|
| 443 |
+
"eine kurze rückfrage",
|
| 444 |
+
"eine kurze letzte frage",
|
| 445 |
+
"bevorzugen sie",
|
| 446 |
+
"haben sie eine tagespräferenz",
|
| 447 |
+
"sobald ich das weiss",
|
| 448 |
+
"damit die slots besser passen",
|
| 449 |
+
]
|
| 450 |
+
|
| 451 |
+
return (
|
| 452 |
+
any(term in content_lower for term in booking_context_terms)
|
| 453 |
+
and any(term in content_lower for term in clarification_terms)
|
| 454 |
+
)
|
| 455 |
+
|
| 456 |
+
def _response_commits_to_showing_booking_widget(self, response: str) -> bool:
|
| 457 |
+
"""Detect when the assistant says booking options are being shown now."""
|
| 458 |
+
response_lower = response.lower()
|
| 459 |
+
|
| 460 |
+
positive_terms = [
|
| 461 |
+
"i can show you",
|
| 462 |
+
"contact details and available appointment slots are shown below",
|
| 463 |
+
"appointment options are shown below",
|
| 464 |
+
"available slots are shown below",
|
| 465 |
+
"i can now show you",
|
| 466 |
+
"ich kann ihnen nun",
|
| 467 |
+
"ich kann ihnen jetzt",
|
| 468 |
+
"unten werden ihnen",
|
| 469 |
+
"unten finden sie",
|
| 470 |
+
"unten sehen sie",
|
| 471 |
+
"terminoptionen anzeigen",
|
| 472 |
+
"verfügbaren slots",
|
| 473 |
+
"verfügbaren termine",
|
| 474 |
+
]
|
| 475 |
+
defer_terms = [
|
| 476 |
+
"if you would like",
|
| 477 |
+
"if you later wish",
|
| 478 |
+
"you can ask me",
|
| 479 |
+
"if that would be helpful",
|
| 480 |
+
"sobald ich das weiss",
|
| 481 |
+
"wenn ich das weiss",
|
| 482 |
+
"damit die slots besser passen",
|
| 483 |
+
"bitte noch kurz",
|
| 484 |
+
"eine kurze rückfrage",
|
| 485 |
+
"eine kurze letzte frage",
|
| 486 |
+
"bevorzugen sie",
|
| 487 |
+
"have you got a preference",
|
| 488 |
+
"do you prefer",
|
| 489 |
+
"would you prefer",
|
| 490 |
+
"which programme",
|
| 491 |
+
"which program",
|
| 492 |
+
]
|
| 493 |
+
|
| 494 |
+
return (
|
| 495 |
+
any(term in response_lower for term in positive_terms)
|
| 496 |
+
and not any(term in response_lower for term in defer_terms)
|
| 497 |
+
)
|
| 498 |
+
def _is_explicit_booking_intent(self, query: str) -> bool:
|
| 499 |
+
"""Detect whether the user is actively asking to book or accepting a booking offer."""
|
| 500 |
+
query_lower = query.lower()
|
| 501 |
+
direct_booking_terms = [
|
| 502 |
+
"book",
|
| 503 |
+
"schedule",
|
| 504 |
+
"appointment",
|
| 505 |
+
"consultation",
|
| 506 |
+
"need a consultation",
|
| 507 |
+
"personal consultation",
|
| 508 |
+
"speak with",
|
| 509 |
+
"talk to an advisor",
|
| 510 |
+
"talk to admissions",
|
| 511 |
+
"connect me",
|
| 512 |
+
"show me available",
|
| 513 |
+
"show appointment",
|
| 514 |
+
"available slots",
|
| 515 |
+
"termin",
|
| 516 |
+
"termin buchen",
|
| 517 |
+
"termin vereinbaren",
|
| 518 |
+
"beratungstermin",
|
| 519 |
+
"beratungsgespräch",
|
| 520 |
+
"ich brauche eine beratung",
|
| 521 |
+
"ich möchte eine beratung",
|
| 522 |
+
"ich will eine beratung",
|
| 523 |
+
"beratung für",
|
| 524 |
+
"persönliche beratung",
|
| 525 |
+
"persoenliche beratung",
|
| 526 |
+
"mit jemandem sprechen",
|
| 527 |
+
"mit admissions sprechen",
|
| 528 |
+
"mit der zulassung sprechen",
|
| 529 |
+
"termine anzeigen",
|
| 530 |
+
"verfügbare termine",
|
| 531 |
+
]
|
| 532 |
+
rejection_terms = [
|
| 533 |
+
"do not want",
|
| 534 |
+
"don't want",
|
| 535 |
+
"no appointment",
|
| 536 |
+
"not book",
|
| 537 |
+
"not schedule",
|
| 538 |
+
"no thanks",
|
| 539 |
+
"no thank you",
|
| 540 |
+
"kein termin",
|
| 541 |
+
"keinen termin",
|
| 542 |
+
"keine beratung",
|
| 543 |
+
"nicht buchen",
|
| 544 |
+
"nicht vereinbaren",
|
| 545 |
+
"nein danke",
|
| 546 |
+
]
|
| 547 |
+
acceptance_terms = [
|
| 548 |
+
"yes",
|
| 549 |
+
"yes please",
|
| 550 |
+
"please do",
|
| 551 |
+
"that would be helpful",
|
| 552 |
+
"show me",
|
| 553 |
+
"ja",
|
| 554 |
+
"ja bitte",
|
| 555 |
+
"gerne",
|
| 556 |
+
"bitte",
|
| 557 |
+
"mach das",
|
| 558 |
+
"zeige",
|
| 559 |
+
]
|
| 560 |
+
|
| 561 |
+
def contains_term(term: str) -> bool:
|
| 562 |
+
if term in {"yes", "ja", "bitte"}:
|
| 563 |
+
return re.search(rf"\b{re.escape(term)}\b", query_lower) is not None
|
| 564 |
+
return term in query_lower
|
| 565 |
+
|
| 566 |
+
if any(contains_term(term) for term in rejection_terms):
|
| 567 |
+
return False
|
| 568 |
+
|
| 569 |
+
if any(contains_term(term) for term in direct_booking_terms):
|
| 570 |
+
return True
|
| 571 |
+
|
| 572 |
+
return (
|
| 573 |
+
self._previous_response_offered_booking()
|
| 574 |
+
and any(contains_term(term) for term in acceptance_terms)
|
| 575 |
+
)
|
| 576 |
+
|
| 577 |
+
def _determine_suggested_program(self) -> str | None:
|
| 578 |
+
"""Determine recommended program based on user profile."""
|
| 579 |
+
state = self._conversation_state
|
| 580 |
+
|
| 581 |
+
# If program interest was explicitly mentioned
|
| 582 |
+
if state['program_interest']:
|
| 583 |
+
return state['program_interest'][0]
|
| 584 |
+
|
| 585 |
+
# Make recommendation based on profile
|
| 586 |
+
experience = state.get('experience_years', 0) or 0
|
| 587 |
+
leadership = state.get('leadership_years', 0) or 0
|
| 588 |
+
|
| 589 |
+
# EMBA: 5+ years experience, 2+ years leadership
|
| 590 |
+
if experience >= 5 and leadership >= 2:
|
| 591 |
+
return 'EMBA'
|
| 592 |
+
# IEMBA: International focus, 3+ years experience
|
| 593 |
+
elif experience >= 3:
|
| 594 |
+
return 'IEMBA'
|
| 595 |
+
# EMBA X: Digital/Innovation focus
|
| 596 |
+
elif state.get('interest') and any(kw in state.get('interest', '').lower()
|
| 597 |
+
for kw in ['digital', 'innovation', 'technology']):
|
| 598 |
+
return 'emba X'
|
| 599 |
+
|
| 600 |
+
return None
|
| 601 |
+
|
| 602 |
+
def _update_conversation_state(self, user_query: str, agent_response: str) -> None:
    """
    Update the tracked user profile from the latest exchange.

    Does nothing when `config.convstate.TRACK_USER_PROFILE` is falsy.
    Profile fields (experience, leadership, field, interest, name) are
    extracted from the combined query + response text and are only filled
    while still empty. Handover requests are detected from the user query
    alone. Programme mentions are appended to `program_interest`, and a
    suggested programme is stored once one can be determined.

    Args:
        user_query: The user's message for this turn.
        agent_response: The assistant's reply for this turn.
    """
    if not config.convstate.TRACK_USER_PROFILE:
        return

    state = self._conversation_state

    # Combine query and response for analysis
    conversation_text = f"{user_query} {agent_response}"

    # (state key, extractor, log label) - each field is filled at most once.
    extractors = (
        ('experience_years', self._extract_experience_years, "Extracted experience years"),
        ('leadership_years', self._extract_leadership_years, "Extracted leadership years"),
        ('field', self._extract_field, "Extracted field"),
        ('interest', self._extract_interest, "Extracted interest"),
        ('user_name', self._extract_name, "Extracted name"),
    )
    for key, extract, log_label in extractors:
        if state.get(key):
            continue
        value = extract(conversation_text)
        if value:
            state[key] = value
            chain_logger.info(f"{log_label}: {value}")

    # Detect handover request from the user only; assistant soft offers should not count.
    if self._detect_handover_request(user_query):
        state['handover_requested'] = True
        chain_logger.info("Handover request detected")

    # Check for program mentions. Whole-token patterns are required: with a
    # plain substring test every "IEMBA" or "EMBA X" mention is also counted
    # as 'EMBA', which is then listed first and skews the suggested program.
    program_patterns = {
        'EMBA': r'\bemba\b(?![\s-]?x\b)',
        'IEMBA': r'\biemba\b',
        'EMBA X': r'\bemba[\s-]?x\b',
    }
    for program, pattern in program_patterns.items():
        mentioned = re.search(pattern, conversation_text, re.IGNORECASE) is not None
        if mentioned and program not in state['program_interest']:
            state['program_interest'].append(program)

    # Update suggested program (kept once it has been set)
    suggested = self._determine_suggested_program()
    if suggested and not state.get('suggested_program'):
        state['suggested_program'] = suggested
        chain_logger.info(f"Suggested program: {suggested}")
|
| 659 |
+
|
| 660 |
+
def _log_user_profile(self) -> None:
    """
    Write a JSON snapshot of the tracked user profile to disk.

    Does nothing when `config.convstate.TRACK_USER_PROFILE` is falsy. The
    file is written to `logs/user_profiles/profile_<user_id>_<timestamp>.json`.
    Any exception raised along the way is logged and swallowed, so profile
    logging never interrupts the caller.
    """
    if not config.convstate.TRACK_USER_PROFILE:
        return

    try:
        # Make sure the target directory exists.
        target_dir = os.path.join('logs', 'user_profiles')
        os.makedirs(target_dir, exist_ok=True)

        state = self._conversation_state
        snapshot = {
            'session_id': state['session_id'],
            'user_id': state['user_id'],
            'name': state.get('user_name'),
            'timestamp': datetime.now().isoformat(),
            'experience_years': state.get('experience_years'),
            'leadership_years': state.get('leadership_years'),
            'field': state.get('field'),
            'interest': state.get('interest'),
            'suggested_program': state.get('suggested_program'),
            'handover': state.get('handover_requested'),
            'user_language': state.get('user_language'),
            'program_interest': state.get('program_interest', []),
        }

        # The file name carries the user id plus a second-resolution timestamp.
        file_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        target_path = os.path.join(target_dir, f'profile_{self._user_id}_{file_stamp}.json')

        with open(target_path, 'w', encoding='utf-8') as handle:
            json.dump(snapshot, handle, indent=2, ensure_ascii=False)

        chain_logger.info(f"User profile logged to {target_path}")

    except Exception as e:
        chain_logger.error(f"Failed to log user profile: {e}")
|
| 698 |
+
|
| 699 |
+
def wipe_session_data(self) -> None:
    """
    Delete in-memory session data and on-disk profile files (GDPR withdrawal).

    In memory, the conversation history, all profile fields of the
    conversation state and the scope-violation counters are reset; the
    `session_id` / `user_id` entries of the state are left untouched.
    On disk, every `logs/user_profiles/profile_<user_id>_*.json` file is
    removed. Deletion failures are logged and do not stop the remaining
    files from being processed.
    """

    # --- 1) In-memory wipe ---
    self._conversation_history = []
    self._conversation_state.update({
        'user_language': None,
        'user_name': None,
        'experience_years': None,
        'leadership_years': None,
        'field': None,
        'interest': None,
        'qualification_level': None,
        'program_interest': [],
        'suggested_program': None,
        'handover_requested': None,
        'topics_discussed': [],
        'preferences_known': False
    })
    self._scope_violation_counts = {}
    self._aggressive_violation_count = 0

    # --- 2) On-disk wipe (delete profile_<user_id>_*.json) ---
    if not self._user_id:
        chain_logger.warning("wipe_session_data called without user_id – skipping file deletion")
        return

    # The user id may come from a caller-supplied session id. Escape it so
    # glob metacharacters ('*', '?', '[') in the id are matched literally and
    # cannot widen the pattern to other users' files.
    pattern = os.path.join(
        "logs",
        "user_profiles",
        f"profile_{glob.escape(str(self._user_id))}_*.json"
    )

    for path in glob.glob(pattern):
        try:
            os.remove(path)
            chain_logger.info(f"Deleted profile file: {path}")
        except OSError as e:
            chain_logger.error(f"Failed to delete {path}: {e}")
|
| 738 |
+
|
| 739 |
+
def generate_greeting(self) -> str:
    """Return a randomly chosen greeting for the currently stored language."""
    candidates = GREETING_MESSAGES[self._stored_language]
    return random.choice(candidates)
|
| 742 |
+
|
| 743 |
+
@traceable
def query(self, query: str) -> LeadAgentQueryResponse:
    """
    Phase 1: validation, language detection, scope check and cache lookup.

    The agent is only invoked (via ``_query_lead``) once all of those gates
    are passed. Early returns are produced when:

    * the conversation history reached ``MAX_CONVERSATION_TURNS``,
    * the input is empty/invalid,
    * the detected language is neither ``'de'`` nor ``'en'``,
    * the query is out of scope (a redirect or escalation message is
      returned and recorded in the history),
    * a cached answer exists for this query, language and session.

    Args:
        query: Raw user input.

    Returns:
        The response object for this turn.
    """
    # Remember fallback language
    current_language = self._stored_language

    # 1. Conversation length limit
    # NOTE(review): this compares the number of stored messages (human and
    # AI) with a "turns" setting — confirm the constant accounts for that.
    if len(self._conversation_history) >= config.convstate.MAX_CONVERSATION_TURNS:
        return LeadAgentQueryResponse(
            response=CONVERSATION_END_MESSAGE[current_language],
            language=current_language,
            max_turns_reached=True,
            relevant_programs=[],
            processed_query=query,
        )

    # 2. Input Processing
    processed_query, is_valid = InputHandler.process_input(
        query,
        [msg for msg in self._conversation_history if isinstance(msg, (HumanMessage, AIMessage))]
    )

    if not is_valid or not processed_query:
        chain_logger.warning(f"Invalid input received: '{query}'")
        return LeadAgentQueryResponse(
            response=NOT_VALID_QUERY_MESSAGE[self._stored_language],
            language=current_language,
            processed_query=query,
        )

    # Log check
    if processed_query != query:
        chain_logger.info(f"Interpreted input '{query}' as '{processed_query}'")

    # 3. Language Detection
    # First: Check for explicit language switch request (overrides lock)
    explicit_switch = self._language_detector.detect_explicit_switch_request(processed_query)
    if explicit_switch:
        self._stored_language = explicit_switch
        current_language = explicit_switch
        self._conversation_state['user_language'] = explicit_switch
    elif self._language_detector.is_language_neutral_program_reference(processed_query):
        chain_logger.info(
            f"Skipping language re-detection for language-neutral programme reference: '{processed_query}'"
        )
        current_language = self._stored_language
    else:
        # Count user messages in conversation history
        user_message_count = len([m for m in self._conversation_history if isinstance(m, HumanMessage)])

        # Lock language after N user messages (allows language switch early in conversation)
        lang_lock_n = config.convstate.LOCK_LANGUAGE_AFTER_N_MESSAGES
        if lang_lock_n > 0 and user_message_count >= lang_lock_n:
            chain_logger.info(f"Language locked to '{self._stored_language}' (after {user_message_count} messages)")
            current_language = self._stored_language
        else:
            detected_language = self._language_detector.detect_language(processed_query)
            # The detected code is recorded even when it is rejected below.
            self._conversation_state['user_language'] = detected_language

            # Language validation
            if detected_language in ['de', 'en']:
                self._stored_language = detected_language
                current_language = detected_language
            else:
                chain_logger.info("Invalid language detected.")
                return LeadAgentQueryResponse(
                    response=LANGUAGE_FALLBACK_MESSAGE[current_language],
                    language=current_language,
                    processed_query=processed_query,
                )

    # 4. Scope Check
    scope_type = ScopeGuardian.check_scope(processed_query, current_language)

    if scope_type != 'on_topic':
        chain_logger.info(f"Out-of-scope query detected: {scope_type}")
        # Aggressive messages use a dedicated counter; all other violation
        # types are counted per type.
        if scope_type == 'aggressive':
            self._aggressive_violation_count += 1
            attempt_count = self._aggressive_violation_count
        else:
            self._scope_violation_counts[scope_type] = self._scope_violation_counts.get(scope_type, 0) + 1
            attempt_count = self._scope_violation_counts[scope_type]

        should_escalate, escalation_type = ScopeGuardian.should_escalate(
            processed_query, scope_type, attempt_count
        )

        if should_escalate:
            redirect_msg = ScopeGuardian.get_escalation_message(escalation_type, current_language)
        else:
            redirect_msg = ScopeGuardian.get_redirect_message(scope_type, current_language)

        self._conversation_history.append(HumanMessage(processed_query))
        self._conversation_history.append(AIMessage(redirect_msg))

        return LeadAgentQueryResponse(
            response=redirect_msg,
            language=current_language,
            processed_query=processed_query,
            appointment_requested=False,
            show_booking_widget=False,
        )

    # 5. Check if cached data already exists for this session.
    # The processed query is the cache key so that the entry describes the
    # text the agent actually answered (normalised / interpreted input).
    # NOTE(review): a cache hit does not append anything to the conversation
    # history — confirm that this is intended.
    if config.cache.ENABLED:
        cached_data = self._cache.get(processed_query, current_language, self._user_id)
        if cached_data and isinstance(cached_data, dict):
            return LeadAgentQueryResponse(
                response=cached_data["response"],
                language=current_language,
                processed_query=processed_query,
                appointment_requested=cached_data.get("appointment_requested", False),
                show_booking_widget=cached_data.get("show_booking_widget", False),
                relevant_programs=cached_data.get("relevant_programs", []),
            )

    # 6. Preprocessing is finished - the agent has to answer the query.
    # Pass the validated/interpreted text, not the raw input: every gate
    # above was evaluated on processed_query.
    response = self._query_lead(processed_query)

    if config.cache.ENABLED and response.should_cache:
        self._cache.set(
            key=processed_query,
            value={
                "response": response.response,
                "appointment_requested": response.appointment_requested,
                "show_booking_widget": response.show_booking_widget,
                "relevant_programs": response.relevant_programs,
            },
            language=current_language,
            session_id=self._user_id,
        )

    return response
|
| 878 |
+
|
| 879 |
+
|
| 880 |
+
def _query_lead(self, preprocessed_query: str) -> LeadAgentQueryResponse:
    """
    Phase 2: run the lead agent on an already validated query.

    Appends the user message and the formatted agent answer to the
    conversation history, optionally updates/logs the user profile, and
    derives the booking flags that are returned to the caller.

    Args:
        preprocessed_query: Query text produced by the preprocessing phase.

    Returns:
        Response object carrying the formatted answer, the booking flags and
        the caching decision.
    """
    # An on-topic query clears the per-type scope-violation counters.
    self._scope_violation_counts = {}

    reply_language = self._stored_language

    # Booking signals are evaluated BEFORE the history is extended.
    explicit_booking_intent = self._is_explicit_booking_intent(preprocessed_query)
    booking_preference_follow_up = (
        self._conversation_state.get('handover_requested') is True
        and self._previous_response_requested_booking_preferences()
        and self._is_booking_preference_follow_up(preprocessed_query)
    )

    # 1. Record the user turn.
    self._conversation_history.append(HumanMessage(preprocessed_query))

    # 2. Language instruction; sent with this call only, never stored.
    language_instruction = SystemMessage(f"Respond in {get_language_name(reply_language)} language.")
    outgoing_messages = self._conversation_history + [language_instruction]

    # 3. Invoke the lead agent.
    agent_result = self._query(
        agent=self._agents['lead'],
        messages=outgoing_messages,
    )
    raw_answer = agent_result.response
    chain_logger.info(f"Is answer context dependent: {agent_result.is_context_dependent}")
    chain_logger.info(f"Appointment Requested: {agent_result.appointment_requested}")
    chain_logger.info(f"Show Booking Widget: {agent_result.show_booking_widget}")
    chain_logger.info(f"Relevant Programs: {agent_result.relevant_programs}")

    # 4. Format the answer.
    if not config.chain.ENABLE_RESPONSE_CHUNKING:
        answer = ResponseFormatter.remove_tables(raw_answer)
    else:
        answer = ResponseFormatter.format_response(
            raw_answer, agent_type='lead', enable_chunking=True, language=reply_language
        )
    answer = ResponseFormatter.clean_response(answer)

    # NOTE: response-quality evaluation (config.chain.EVALUATE_RESPONSE_QUALITY
    # with a CONFIDENCE_THRESHOLD fallback message) is currently disabled, so
    # the confidence fallback never triggers.
    confidence_fallback = False

    # 5. Record the agent turn.
    self._conversation_history.append(AIMessage(answer))

    # 6. Profiling: refresh the conversation state and persist the profile on
    # every fifth user message or whenever a programme suggestion exists.
    if config.convstate.TRACK_USER_PROFILE:
        self._update_conversation_state(preprocessed_query, answer)

        human_turns = sum(1 for m in self._conversation_history if isinstance(m, HumanMessage))
        if human_turns % 5 == 0 or self._conversation_state.get('suggested_program'):
            self._log_user_profile()

    answer = ResponseFormatter.format_name_of_university(answer, language=reply_language)

    # Proactive booking offer: the lead model signalled the booking widget
    # AND a programme match exists, either from the profile-based assessment
    # (suggested_program) or from the model's relevant_programs. This lets
    # the widget appear without an explicit booking request from the user.
    has_programme_match = (
        self._conversation_state.get('suggested_program') is not None
        or bool(agent_result.relevant_programs)
    )
    proactive_booking_offer = has_programme_match and agent_result.show_booking_widget

    booking_flow_requested = (
        explicit_booking_intent
        or booking_preference_follow_up
        or proactive_booking_offer
    )
    appointment_requested = bool(booking_flow_requested)
    # The widget needs an active booking flow plus a widget signal from the
    # model or from the answer text itself.
    show_booking_widget = bool(
        booking_flow_requested
        and (
            agent_result.show_booking_widget
            or self._response_commits_to_showing_booking_widget(answer)
        )
    )

    user_led_booking = explicit_booking_intent or booking_preference_follow_up
    if proactive_booking_offer and not user_led_booking:
        chain_logger.info(
            "Proactive booking offer triggered "
            f"(suggested_program={self._conversation_state.get('suggested_program')}, "
            f"relevant_programs={agent_result.relevant_programs})"
        )
    elif agent_result.appointment_requested and not booking_flow_requested:
        chain_logger.info("Suppressed booking state because no programme match or booking intent was detected.")
    elif booking_preference_follow_up and show_booking_widget:
        chain_logger.info("Continuing active booking flow and showing booking widget for a preference follow-up.")

    # Fallbacks, booking turns and context-dependent answers are not cached.
    cacheable = not (
        confidence_fallback
        or appointment_requested
        or agent_result.is_context_dependent
    )

    return LeadAgentQueryResponse(
        response=answer,
        language=reply_language,
        confidence_fallback=confidence_fallback,
        should_cache=cacheable,
        processed_query=preprocessed_query,
        appointment_requested=appointment_requested,
        show_booking_widget=show_booking_widget,
        relevant_programs=agent_result.relevant_programs,
    )
|
| 999 |
+
|
| 1000 |
+
def _query(self, agent, messages: list, thread_id: str = None) -> StructuredAgentResponse:
    """
    Invoke *agent* with *messages* and return its structured response.

    Args:
        agent: Agent object exposing ``invoke`` and ``name``.
        messages: Messages passed to the agent under the ``"messages"`` key.
        thread_id: Optional thread identifier stored in the run config;
            ``0`` is used when it is missing or empty.

    Returns:
        The agent's ``structured_response`` when present. Otherwise a
        ``StructuredAgentResponse`` built from the text of the last returned
        message. On any exception the failure is logged and a response
        carrying the generic error message for the stored language is
        returned instead of raising.
    """
    try:
        # ``copy()`` is shallow, so the nested 'configurable' mapping must be
        # rebuilt; writing the thread id into it directly would mutate the
        # mapping shared through self._config.
        run_config = self._config.copy()
        run_config['configurable'] = {
            **(run_config.get('configurable') or {}),
            'thread_id': thread_id or 0,
        }

        result = agent.invoke(
            {"messages": messages},
            config=run_config,
            context=AgentContext(agent_name=agent.name),
        )

        response = result.get('structured_response')
        if response is None:
            # Build the fallback only when it is needed, so an unusable
            # message list cannot break a valid structured response.
            response = StructuredAgentResponse(
                response=result['messages'][-1].text,
            )
        return response
    except Exception as e:
        # Some client errors carry a ``body`` mapping with a 'message' entry;
        # the body may also be missing, None or not a mapping.
        body = getattr(e, 'body', None)
        if isinstance(body, dict) and 'message' in body:
            error_msg = body['message']
        else:
            error_msg = str(e)
        chain_logger.error(f"Failed to invoke the agent: {error_msg}")
        return StructuredAgentResponse(
            response=QUERY_EXCEPTION_MESSAGE[self._stored_language],
        )
|
src/rag/input_handler.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Input handler for processing and validating user messages.
|
| 3 |
+
Handles numeric inputs, validation, and interpretation.
|
| 4 |
+
"""
|
| 5 |
+
import re
|
| 6 |
+
from src.rag.utilclasses import ConversationState
|
| 7 |
+
from src.utils.logging import get_logger
|
| 8 |
+
|
| 9 |
+
logger = get_logger("input_handler")
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class InputHandler:
    """Handles input validation and interpretation"""

    # A standalone integer or decimal number.
    _NUMERIC_PATTERN = re.compile(r'\d+(\.\d+)?')

    # Age keywords are matched as whole words: as plain substrings "age" also
    # hits "management"/"language", "old" hits "hold" and "alter" hits
    # "alternative".
    _AGE_PATTERN = re.compile(r'\b(?:age|old|alter|jahre alt)\b')

    # These are matched as substrings on purpose so that compounds and
    # inflections ("Berufserfahrung", "Jahren", "Arbeitsjahre") are caught.
    _EXPERIENCE_KEYWORDS = ("experience", "years", "worked", "arbeits", "erfahrung", "jahre")
    _QUALIFICATION_KEYWORDS = ("qualification", "degree", "bachelor", "master", "qualifikation")

    # Meaning of a bare number when the bot asked about qualifications.
    _QUALIFICATION_LEVELS = {
        "1": "I have a Bachelor's degree",
        "2": "I have a Master's degree",
        "3": "I have an MBA",
        "4": "I have a doctorate/PhD",
    }

    @staticmethod
    def validate_and_normalize(message: str) -> str:
        """
        Normalize and validate user input.

        Args:
            message: Raw user input

        Returns:
            The message without surrounding whitespace, or an empty string
            when the input is empty or whitespace only.
        """
        if not message:
            return ""
        return message.strip()

    @staticmethod
    def is_numeric_input(message: str) -> bool:
        """
        Check if message is a standalone number.

        Args:
            message: User input

        Returns:
            True if message is just a number (digits, optionally followed by
            a decimal part), ignoring surrounding whitespace
        """
        if not message:
            return False
        return InputHandler._NUMERIC_PATTERN.fullmatch(message.strip()) is not None

    @staticmethod
    def _content_to_text(content) -> str:
        """Return message content as lower-case text (handles str, block lists and None)."""
        if isinstance(content, str):
            return content.lower()
        if isinstance(content, list):
            # Content may be a list of strings and/or {"text": ...} blocks.
            parts = []
            for part in content:
                if isinstance(part, str):
                    parts.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    parts.append(part["text"])
            return " ".join(parts).lower()
        return ""

    @staticmethod
    def _last_assistant_text(conversation_history: list) -> str:
        """Return the lower-cased text of the most recent assistant message, or ""."""
        ai_message_type = None
        for msg in reversed(conversation_history or []):
            # Dictionary format (for backward compatibility)
            if isinstance(msg, dict):
                if msg.get("role") == "assistant":
                    return InputHandler._content_to_text(msg.get("content", ""))
                continue

            # LangChain message objects; imported lazily, on first use.
            if ai_message_type is None:
                from langchain_core.messages import AIMessage
                ai_message_type = AIMessage
            if isinstance(msg, ai_message_type):
                return InputHandler._content_to_text(getattr(msg, "content", ""))
        return ""

    @staticmethod
    def interpret_numeric_input(
        message: str,
        conversation_history: list
    ) -> str:
        """
        Interpret standalone numeric input based on conversation context.

        The most recent assistant message decides the interpretation:
        age keywords first (so "years old" / "Jahre alt" is not taken for
        work experience), then experience keywords, then qualification
        keywords. Without any match the number is read as years of work
        experience.

        Note: the returned sentence is always English, whatever the language
        of the conversation.

        Args:
            message: Numeric input (e.g., "5")
            conversation_history: Recent conversation messages (LangChain
                message objects or ``{"role", "content"}`` dictionaries)

        Returns:
            Interpreted message (e.g., "I have 5 years of work experience")
        """
        number = message.strip()
        recent_context = InputHandler._last_assistant_text(conversation_history)

        if InputHandler._AGE_PATTERN.search(recent_context):
            logger.info(f"Interpreting numeric input '{number}' as age")
            return f"I am {number} years old"

        if any(keyword in recent_context for keyword in InputHandler._EXPERIENCE_KEYWORDS):
            logger.info(f"Interpreting numeric input '{number}' as years of experience")
            return f"I have {number} years of work experience"

        if any(keyword in recent_context for keyword in InputHandler._QUALIFICATION_KEYWORDS):
            logger.info(f"Interpreting numeric input '{number}' as qualification level")
            # Interpret as degree type; unknown numbers are passed through.
            return InputHandler._QUALIFICATION_LEVELS.get(
                number, f"My qualification level is {number}"
            )

        # Default: assume years of experience (most common)
        logger.info(f"Interpreting numeric input '{number}' as years of experience (default)")
        return f"I have {number} years of work experience"

    @staticmethod
    def process_input(
        message: str,
        conversation_history: list
    ) -> tuple[str, bool]:
        """
        Process user input with validation and interpretation.

        Args:
            message: Raw user input
            conversation_history: Recent messages for context

        Returns:
            Tuple of (processed_message, is_valid). Empty input yields
            ``("", False)``; a bare number is replaced by its interpreted
            sentence; anything else is returned stripped.
        """
        normalized = InputHandler.validate_and_normalize(message)
        if not normalized:
            return "", False

        if InputHandler.is_numeric_input(normalized):
            interpreted = InputHandler.interpret_numeric_input(
                normalized,
                conversation_history
            )
            return interpreted, True

        return normalized, True
|
| 147 |
+
|