Pygmales commited on
Commit
2b7b752
·
1 Parent(s): 682e2a1

loaded project

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +17 -0
  2. .gitignore +69 -0
  3. Dockerfile +37 -0
  4. config.py +186 -0
  5. main.py +163 -0
  6. requirements.txt +39 -0
  7. src/__init__.py +0 -0
  8. src/apps/__init__.py +0 -0
  9. src/apps/chat/__init__.py +0 -0
  10. src/apps/chat/app.py +324 -0
  11. src/apps/dbapp/app.py +44 -0
  12. src/apps/dbapp/backup.py +191 -0
  13. src/apps/dbapp/collections.py +8 -0
  14. src/apps/dbapp/config.py +350 -0
  15. src/apps/dbapp/framebase.py +15 -0
  16. src/apps/dbapp/imports.py +244 -0
  17. src/apps/dbapp/mainframe.py +8 -0
  18. src/apps/dbapp/query.py +108 -0
  19. src/apps/dbapp/utilclasses.py +38 -0
  20. src/cache/__init__.py +0 -0
  21. src/cache/cache.py +75 -0
  22. src/cache/cache_base.py +19 -0
  23. src/cache/cache_metrics.py +28 -0
  24. src/cache/cache_strategies.py +88 -0
  25. src/cache/utils.py +5 -0
  26. src/config/__init__.py +39 -0
  27. src/config/configs.py +249 -0
  28. src/const/agent_response_constants.py +209 -0
  29. src/const/cc_whitelist.py +3 -0
  30. src/const/data_consent_constants.py +60 -0
  31. src/const/page_blacklist.py +5 -0
  32. src/const/page_priority.py +137 -0
  33. src/database/__init__.py +0 -0
  34. src/database/docker-compose-cache.yml +27 -0
  35. src/database/docker-compose.yml +29 -0
  36. src/database/redisservice.py +53 -0
  37. src/database/weavservice.py +851 -0
  38. src/notification/__init__.py +0 -0
  39. src/notification/notification_center.py +148 -0
  40. src/pipeline/__init__.py +0 -0
  41. src/pipeline/pipeline.py +212 -0
  42. src/pipeline/processors.py +303 -0
  43. src/pipeline/utilclasses.py +3 -0
  44. src/pipeline/utils/__init__.py +3 -0
  45. src/pipeline/utils/serializer.py +58 -0
  46. src/pipeline/utils/strategies_processor.py +74 -0
  47. src/pipeline/utils/utilclasses.py +13 -0
  48. src/rag/__init__.py +0 -0
  49. src/rag/agent_chain.py +1022 -0
  50. src/rag/input_handler.py +147 -0
.dockerignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ .env
4
+ __pycache__
5
+ *.pyc
6
+ *.pyo
7
+ venv/
8
+ logs/
9
+ data/database/backups/
10
+ tests/
11
+ docs/
12
+ README.md
13
+ README_UPDATES.md
14
+ *.md
15
+ .github/
16
+ .pytest_cache/
17
+ htmlcov/
.gitignore ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environment
7
+ .env
8
+ .venv/
9
+ env/
10
+ venv/
11
+ ENV/
12
+ env.bak/
13
+ venv.bak/
14
+
15
+ # Environment variables
16
+ .env
17
+
18
+ # VS Code settings
19
+ .vscode/
20
+
21
+ # MacOS system files
22
+ .DS_Store
23
+
24
+ # Jupyter Notebook checkpoints
25
+ .ipynb_checkpoints/
26
+
27
+ # Logs
28
+ *.log
29
+
30
+ # Cache and temp files
31
+ *.tmp
32
+ *.swp
33
+ *.bak
34
+ .cache/
35
+ *.sqlite3
36
+ *.db
37
+
38
+ # Data files
39
+ *.pdf
40
+ *.json
41
+ *.jsonl
42
+
43
+ # Output folders
44
+ dist/
45
+ build/
46
+ *.egg-info/
47
+
48
+ # Output data
49
+ data/
50
+
51
+ # Pyright config may differ from one platform to another
52
+ pyrightconfig.json
53
+
54
+ # Pycharm
55
+ .idea/
56
+
57
+ # OS junk
58
+ .Trashes.env
59
+ .env
60
+ .env
61
+
62
+ #idk
63
+ --source-branch
64
+ --source-repo
65
+ /.gradio/certificate.pem
66
+
67
+ #feedback I just uploaded into the same file to check for accuracy
68
+ chatbot emba x.docx
69
+ IEBMA Test Cards 1_2.docx
Dockerfile ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================== Initial Building =============================
2
+ FROM python:3.11.14-slim-bookworm AS builder
3
+
4
+ WORKDIR /app
5
+
6
+ # CPU-only PyTorch
7
+ RUN pip install --no-cache-dir torch torchvision torchaudio \
8
+ --index-url https://download.pytorch.org/whl/cpu
9
+
10
+ # Python dependencies
11
+ COPY requirements.txt .
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+ # ============================== Size Reduction ===============================
14
+ FROM python:3.11.14-slim-bookworm
15
+
16
+ WORKDIR /app
17
+
18
+ # Only necessary dependencies from builder
19
+ COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
20
+ COPY --from=builder /usr/local/bin /usr/local/bin
21
+
22
+ # System dependencies for runtime
23
+ RUN apt-get update && apt-get install -y --no-install-recommends \
24
+ libmagic1 \
25
+ poppler-utils \
26
+ curl \
27
+ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
28
+
29
+ # ============================ Final Compilation ==============================
30
+ COPY . .
31
+
32
+ EXPOSE 7860
33
+
34
+ HEALTHCHECK --interval=60s --timeout=10s --retries=3 \
35
+ CMD curl -f http://localhost:7860/health || exit 1
36
+
37
+ CMD ["python", "main.py", "--app", "de"]
config.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings for the Executive Education RAG Chatbot.
3
+ PLEASE CONSIDER READING THE 'docs/configuration_system_documentation.md' TO PROPERLY USE THE NEW CONFIGURATION SYSTEM.
4
+ """
5
+ # ========================================= General Configuration ===========================================
6
+
7
+ # A list of ISO 639 language codes. Defines a list of languages in which
8
+ # the application can operate. Defaults to ['en', 'de'].
9
+ AVAILABLE_LANGUAGES = ['en', 'de']
10
+
11
+ # A string representing a path (relative to the project root or absolute) to the directory
12
+ # where the data output files such as scraping or document processing outputs will be stored.
13
+ DATA_PATH = 'data'
14
+
15
+ # A string representing a path (relative to the project root or absolute) to the directory
16
+ # where the logging files will be stored.
17
+ LOGS_PATH = 'logs'
18
+
19
+ # =================================== Conversation State Configuration ======================================
20
+
21
+ # A boolean; either True or False. Enables the collection of user preferences
22
+ # during conversation to avoid repetitive questions. Defaults to True.
23
+ TRACK_USER_PROFILE = True
24
+
25
+ # An integer. Defines the amount of user messages after which the language
26
+ # of the conversation will be locked. If set to 0, the language will not be locked.
27
+ LOCK_LANGUAGE_AFTER_N_MESSAGES = 3
28
+
29
+ # An integer. Sets the maximum amount of conversation turns as the sum of user queries
30
+ # and agent responses. The conversation ends after the maximum turns amount is reached.
31
+ MAX_CONVERSATION_TURNS = 20
32
+
33
+ # ============================================ LLM Configuration ============================================
34
+
35
+ # A string, either 'openai', 'groq', 'open_router' or 'ollama' (local).
36
+ # Defines the main model provider for the application.
37
+ LLM_PROVIDER = 'openai'
38
+
39
+ # A string. Defines the model that will be used by the application agents.
40
+ OPENAI_MODEL = 'gpt-5.1'
41
+ # GROQ_MODEL =
42
+ # OLLAMA_MODEL =
43
+ # OPEN_ROUTER_MODEL =
44
+
45
+ # ==================================== Weaviate Database Configuration ======================================
46
+
47
+ # A boolean; either True or False.
48
+ # Defines whether the database is set as a local instance (via Docker container),
49
+ # or as a cloud service. More information on https://docs.weaviate.io/weaviate.
50
+ WEAVIATE_IS_LOCAL = False
51
+
52
+ # A string. Defines the base name of the collections stored in the database.
53
+ # For each available language a new collection will be created
54
+ # with set name <WEAVIATE_COLLECTION_BASENAME>_<LANGUAGE>.
55
+ WEAVIATE_COLLECTION_BASENAME = 'hsg_rag_content'
56
+
57
+ # A string; either 'manual', 'filesystem' (local instance), 's3' (AWS).
58
+ # Defines the service for storing the database backups.
59
+ # More information on https://docs.weaviate.io/deploy/configuration/backups.
60
+ WEAVIATE_BACKUP_METHOD = 'manual'
61
+
62
+ # A string representing a path in the system where backups will be stored
63
+ # only if WEAVIATE_BACKUP_METHOD is set to 'manual'.
64
+ BACKUPS_PATH = 'data/database/backups'
65
+
66
+ # A string representing a system path where collection properties will be stored.
67
+ PROPERTIES_PATH = 'data/database'
68
+
69
+ # A string representing a system path where property strategies will be stored.
70
+ # More information on property strategies in the documentation.
71
+ STRATEGIES_PATH = 'data/database/strategies'
72
+
73
+ # An integer. Defines a connection timeout to the cloud weaviate service (in seconds).
74
+ # Defaults to 90.
75
+ WEAVIATE_INIT_TIMEOUT = 90
76
+
77
+ # An integer. Defines the query response time limit upon querying the database (in seconds).
78
+ # Defaults to 60.
79
+ WEAVIATE_QUERY_TIMEOUT = 60
80
+
81
+ # An integer. Defines the chunk insertion time limit when importing new chunks to database (in seconds).
82
+ # Defaults to 600
83
+ WEAVIATE_INSERT_TIMEOUT = 600
84
+
85
+ # ========================================== Cache Configuration ============================================
86
+
87
+ # A string; either 'local', 'cloud' (Redis) or 'dict'. Defaults to 'cloud'.
88
+ # Sets the default cache mode. More information on cache modes in documentation.
89
+ CACHE_MODE = 'cloud'
90
+
91
+ # An integer. Sets the reset time (time to live) in seconds for the cache storage.
92
+ # The cache storage will be cleared upon reset time exceedance.
93
+ # Defaults to 86400 seconds (24 hours).
94
+ CACHE_TTL = 86400
95
+
96
+ # An integer. Maximum amount of cached messages that will be held in the cache storage.
97
+ # Defaults to 1000.
98
+ CACHE_MAX_SIZE = 1000
99
+
100
+ # A string. Defines the host name or IP address used to access the local cache storage. Defaults to 'localhost'.
101
+ CACHE_LOCAL_HOST = 'localhost'
102
+
103
+ # An integer. Defines the port for accessing the local cache storage. Defaults to 6379.
104
+ CACHE_LOCAL_PORT = 6379
105
+
106
+ # ===================================== Data Processing Configuration =======================================
107
+
108
+ # A string representing the name of an embedding model for embedding generation.
109
+ # The parameter MAX_TOKENS must match this model's maximum token amount.
110
+ EMBEDDING_MODEL = 'sentence-transformers/multi-qa-mpnet-base-dot-v1'
111
+
112
+ # A float in range from 0 to 1. Sets the threshold for english language in the language detector.
113
+ # If the language detection certainty is lower than the threshold, the English language will be returned.
114
+ LANG_AMBIGUITY_THRESHOLD = 0.6
115
+
116
+ # An integer. Defines the maximum number of tokens per chunk.
117
+ MAX_TOKENS = 512
118
+
119
+ # An integer. Defines the amount of overlapping tokens between chunks to keep the context.
120
+ CHUNK_OVERLAP = 100
121
+
122
+
123
+ # An integer representing seconds. Defines the maximum waiting time for the target server
124
+ # responses during the scraping procedures.
125
+ SCRAPING_TIMEOUT = 30
126
+
127
+ # An integer. Defines the maximum amount of additional tries that will be performed
128
+ # if the initial request to the server failed.
129
+ SCRAPING_MAX_RETRIES = 3
130
+
131
+ # An integer representing seconds. Defines the waiting interval between two server calls.
132
+ # This value might be overwritten by the delay set by the server.
133
+ SCRAPING_CRAWL_DELAY = 1
134
+
135
+ # A float. Defines the backoff base value for retries with exponential backoff.
136
+ # The higher the number, the longer the waiting interval between subsequent retries.
137
+ SCRAPING_BACKOFF_RATE = 1.25
138
+
139
+ # A list of string URLs. Defines the starting points for the website scraping.
140
+ SCRAPING_TARGET_URLS = [
141
+ # 'https://emba.unisg.ch/', # EMBA HSG root
142
+ 'https://embax.ch/', # emba X root
143
+ ]
144
+
145
+ # Scraping Priority Interval in days
146
+ SCRAPING_PRIO_INTERVAL = {
147
+ "high": 1,
148
+ "medium": 7,
149
+ "low": 30
150
+ }
151
+
152
+ # ======================================== Agent Chain Configuration ========================================
153
+
154
+ # A boolean; either True or False. Activates the response quality evaluation procedure
155
+ # for agentic responses. Defaults to True.
156
+ ENABLE_EVALUATE_RESPONSE_QUALITY = True
157
+
158
+ # A float in range from 0 to 1. Sets the threshold value for the quality evaluation.
159
+ # The fallback mechanism will be activated if the quality of the agentic response
160
+ # is lower than the confidence threshold.
161
+ CONFIDENCE_THRESHOLD = 0.6
162
+
163
+ # An integer. Defines the amount of chunks that should be retrieved from the database
164
+ # upon querying by subagents during conversation. Defaults to 4.
165
+ TOP_K_RETRIEVAL = 4
166
+
167
+ # An integer. Sets the amount of model invocation retries after which the fallback model
168
+ # will be invoked. Defaults to 3.
169
+ MODEL_MAX_RETRIES = 3
170
+
171
+ # An integer. Sets the maximum amount of words in the response from the lead agent.
172
+ MAX_RESPONSE_WORDS_LEAD = 100
173
+
174
+ # An integer. Sets the maximum amount of words in the response for subagents.
175
+ MAX_RESPONSE_WORDS_SUBAGENT = 200
176
+
177
+ # A boolean; either True or False. If response chunking is enabled, long responses
178
+ # from the lead agent will be split and returned through multiple conversation turns.
179
+ ENABLE_RESPONSE_CHUNKING = True
180
+
181
+ # ========================================== Notification Configuration =====================================
182
+
183
+ NOTIFY_ENABLE_EMAIL_ALERTS= True
184
+ NOTIFY_ENABLE_SLACK_ALERTS = True
185
+
186
+ # ===========================================================================================================
main.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main entry point for the Executive Education RAG Chatbot.
3
+ """
4
+ import argparse
5
+ import langsmith
6
+ from langsmith import traceable
7
+ from src.utils.logging import init_logging, get_logger
8
+ from config import AVAILABLE_LANGUAGES
9
+ from src.cache.cache import Cache
10
+ from src.config import config
11
+
12
+
13
+ # Initialize logging
14
def logging_startup():
    """Initialise logging and return the logger named 'main_module'."""
    init_logging()
    main_logger = get_logger('main_module')
    return main_logger
17
+
18
+
19
def run_scraper() -> None:
    """Scrape the website through the import pipeline.

    Takes no arguments. Start and completion are reported through the
    logger returned by ``logging_startup``.
    """
    # Imported lazily so the pipeline is only loaded when scraping is requested.
    from src.pipeline.pipeline import ImportPipeline

    log = logging_startup()
    log.info("Running scraper...")
    pipeline = ImportPipeline()
    pipeline.scrape_website()
    log.info("Scraping completed.")
32
+
33
+
34
def run_importer(sources: list[str]) -> None:
    """Feed the given sources through the document import pipeline.

    Args:
        sources: Entries passed unchanged to
            ``ImportPipeline.import_many_documents``.
    """
    # Imported lazily so the pipeline is only loaded when an import is requested.
    from src.pipeline.pipeline import ImportPipeline

    log = logging_startup()
    log.info("Running data import pipeline...")
    importer = ImportPipeline()
    importer.import_many_documents(sources)
    log.info("Data processing completed.")
42
+
43
+
44
def run_weaviate_command(command: str, backup_id: str = None):
    """Run a maintenance command against the Weaviate database.

    Commands map to service calls as follows:

    * ``backup``      -> create a backup
    * ``restore``     -> restore the backup identified by ``backup_id``
    * ``delete``      -> delete the collections
    * ``init``        -> create the collections, then run the health check
    * ``redo``        -> delete, re-create, then run the health check
    * ``checkhealth`` -> run the health check only

    Any other value creates the service but performs no action.

    Args:
        command: Name of the command to run (see list above).
        backup_id: Identifier of the backup to restore. Required for
            ``restore``; ignored by every other command.
    """
    from src.database.weavservice import WeaviateService
    logger = logging_startup()

    logger.info(f"Running database command {command}")
    if command == 'restore' and not backup_id:
        logger.error("Backup ID is required to initalize the restore process.")
        # Abort here: without an id there is nothing to restore, and the
        # service must not be asked to restore ``None``.
        return

    service = WeaviateService()
    if command == 'backup':
        service._create_backup()

    if command == 'restore':
        service._restore_backup(backup_id)

    # 'redo' is delete + init, so it participates in both of the next two steps.
    if command in ('delete', 'redo'):
        service._delete_collections()

    if command in ('init', 'redo'):
        service._create_collections()

    if command in ('checkhealth', 'init', 'redo'):
        service._checkhealth()
68
+
69
+
70
def clear_cache():
    """Clear the cache returned by ``Cache.get_cache()``; do nothing if it is falsy."""
    active_cache = Cache.get_cache()
    if not active_cache:
        return
    active_cache.clear_cache()
74
+
75
+
76
def run_application(lang: str, cache_mode, cache) -> None:
    """Configure the cache and start the chatbot web application.

    Args:
        lang: Language code passed to ``ChatbotApplication``.
        cache_mode: First argument forwarded to ``Cache.configure``.
        cache: Second argument forwarded to ``Cache.configure``.
    """
    # Imported lazily so the web stack is only loaded when the app is started.
    from src.apps.chat.app import ChatbotApplication

    log = logging_startup()
    Cache.configure(cache_mode, cache)

    log.info("Starting chatbot web application...")
    ChatbotApplication(language=lang).run()
86
+
87
+
88
def run_dbapp() -> None:
    """Start the database management application."""
    # Imported lazily so the GUI modules are only loaded when requested.
    from src.apps.dbapp.app import DatabaseApplication

    log = logging_startup()
    log.info("Starting database application...")
    DatabaseApplication().run()
95
+
96
+
97
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Note that ``--cache`` is a ``store_false`` flag: the parsed ``cache``
    attribute is True by default and becomes False when the flag is given.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(description="University of St. Gallen Executive Education RAG Chatbot")
    option = cli.add_argument

    # Data acquisition
    option("--scrape", action="store_true",
           help="Scrapes the data from the HSG website and imports it into the database")
    option("--imports", nargs="+", help="Runs the data importing pipeline for the provided files")

    # Database maintenance
    option("--weaviate", type=str, choices=['init', 'delete', 'redo', 'checkhealth', 'backup', 'restore'],
           help="Runs different database actions")
    option("--backup-id", type=str, help="Required when calling the --weaviate restore command!")

    # Caching
    option("--cache-mode", type=str, choices=['local', 'cloud', 'dict'], default=config.cache.CACHE_MODE,
           help="Defines whether to use the local or cloud Redis database or the special python dict as cache")
    option("--cache", action="store_false", help="(De-)activates the caching mechanism")
    option("--clear-cache", action="store_true",
           help="Clears the cache")

    # Front ends
    option("--cli", action="store_true", help="Run the chatbot CLI")
    option("--app", type=str, choices=AVAILABLE_LANGUAGES, help="Run the chatbot web application")
    option("--dbapp", action="store_true", help="Run the database management application")

    return cli.parse_args()
123
+
124
+
125
def main():
    """Dispatch the parsed command-line arguments to the matching runners.

    With no action flag the chatbot web application is started. Otherwise
    the requested actions run in this order: scrape, import, database
    command, cache clearing, chatbot app, database app. The cache is
    cleared when ``--clear-cache`` is given or when an earlier step changed
    the stored data (scrape, import, or the ``init``/``redo``/``restore``
    database commands).

    ``--cli`` is parsed but has no handler in this function.
    """
    args = parse_args()

    # Set by any step that changes the stored data, so stale cached answers
    # are dropped afterwards.
    must_clear_cache = False

    # Only flags that request an action count here. ``args.cache`` is a
    # store_false switch that is True by default, so including it would make
    # this check always report "something was requested".
    requested_actions = [
        args.scrape,
        args.imports,
        args.weaviate,
        args.cli,
        args.clear_cache,
        args.app,
        args.dbapp,
    ]
    if not any(requested_actions):
        # No action requested: run the chatbot by default. ``lang`` is a
        # required parameter of run_application; "de" matches the default of
        # ChatbotApplication.
        run_application(lang="de", cache_mode=args.cache_mode, cache=args.cache)
        return

    # Run the specified components
    if args.scrape:
        must_clear_cache = True
        run_scraper()

    if args.imports:
        must_clear_cache = True
        run_importer(args.imports)

    if args.weaviate:
        if args.weaviate in ["init", "redo", "restore"]:
            must_clear_cache = True
        run_weaviate_command(command=args.weaviate, backup_id=args.backup_id)

    if args.clear_cache or must_clear_cache:
        clear_cache()

    if args.app:
        run_application(lang=args.app, cache_mode=args.cache_mode, cache=args.cache)

    if args.dbapp:
        run_dbapp()
160
+
161
+
162
+ if __name__ == "__main__":
163
+ main()
requirements.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ langchain>=1.0.2
3
+ langchain-core>=1.0.1
4
+ langchain-deepseek>=1.0.0
5
+ langchain-groq>=1.0.0
6
+ langchain-ollama>=0.3.10
7
+ langchain-openai>=1.0.1
8
+ langsmith>=0.4.0
9
+
10
+ requests>=2.31.0
11
+ openai>=1.3.0
12
+ python-dotenv>=1.0.0
13
+ colorama>=0.4.6
14
+
15
+ # Language detection
16
+ langdetect>=1.0.9
17
+
18
+ # Transformers for tokenization
19
+ transformers>=4.34.0
20
+
21
+ # Web applications
22
+ gradio>=5.49.1
23
+
24
+ # Processing pipeline
25
+ docling>=2.55.0
26
+ ultimate-sitemap-parser>=1.8.0
27
+ beautifulsoup4>=4.14.3
28
+ fake-useragent>=1.5.1
29
+
30
+ # Weaviate Vector DB
31
+ weaviate-client>=4.16.9
32
+ PyYAML>=6.0
33
+
34
+ # Cache
35
+ cachetools>=5.0.0
36
+ redis>=4.5.5
37
+
38
+ # Scheduling
39
+ apscheduler
src/__init__.py ADDED
File without changes
src/apps/__init__.py ADDED
File without changes
src/apps/chat/__init__.py ADDED
File without changes
src/apps/chat/app.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ import gradio as gr
3
+ from fastapi import FastAPI
4
+ from datetime import datetime
5
+
6
+
7
+ from src.const.agent_response_constants import *
8
+ from src.const.data_consent_constants import *
9
+ from src.rag.agent_chain import ExecutiveAgentChain
10
+ from src.utils.logging import get_logger, ConsentLogger
11
+
12
+ logger = get_logger("chatbot_app")
13
+
14
def init_fastapi_app(language):
    """Create a FastAPI app exposing a ``GET /health`` endpoint.

    The endpoint pings Weaviate for ``language`` and answers with a JSON
    body containing a timestamp, a ``status`` string and a ``weaviate``
    flag. It returns HTTP 200 when the ping status is ``'OK'`` and HTTP 503
    with an ``error`` entry otherwise ('degraded' for a non-OK ping, 'down'
    when an exception was raised).

    Args:
        language: Value passed to ``WeaviateService().ping``.

    Returns:
        The configured ``FastAPI`` instance.
    """
    fastapi_app = FastAPI()

    @fastapi_app.get('/health')
    def healthcheck():
        from src.database.weavservice import WeaviateService
        from fastapi.responses import JSONResponse

        http_status = 200
        payload = {'timestamp': datetime.now().isoformat()}
        try:
            # Start optimistic; downgraded below if the ping disagrees.
            payload.update({
                'status': 'ok',
                'weaviate': True,
            })
            ping_result = WeaviateService().ping(language)
            if ping_result['status'] != 'OK':
                http_status = 503
                payload.update({
                    'status': 'degraded',
                    'weaviate': False,
                    'error': str(ping_result['error']),
                })
        except Exception as exc:
            http_status = 503
            payload.update({
                'status': 'down',
                'weaviate': False,
                'error': str(exc),
            })

        return JSONResponse(
            status_code=http_status,
            content=payload,
        )

    return fastapi_app
51
+
52
+
53
+ class ChatbotApplication:
54
+ def __init__(self, language: str = "de") -> None:
55
+ self._fastapi_app = init_fastapi_app(language)
56
+ self._gradio_app = gr.Blocks()
57
+ self._app = gr.mount_gradio_app(self._fastapi_app, self._gradio_app, path='/')
58
+ self._language = language
59
+ self._consentLogger = ConsentLogger()
60
+
61
+ with self._gradio_app:
62
+ agent_state = gr.State(None)
63
+ lang_state = gr.State(language)
64
+ consent_state = gr.State(False)
65
+ session_id_state = gr.State(str(uuid.uuid4())) # for consent logging later
66
+
67
+ with gr.Row():
68
+ lang_selector = gr.Radio(
69
+ choices=["Deutsch", "English"],
70
+ value="English" if language == "en" else "Deutsch",
71
+ label="Selected Language",
72
+ interactive=True,
73
+ )
74
+ reset_button = gr.Button("Reset Conversation", visible=False)
75
+
76
+ # ---- Consent Screen (Page 1) ----
77
+ with gr.Column(visible=True) as consent_screen:
78
+ data_policy = gr.Markdown(PRIVACY_NOTICE[language])
79
+ with gr.Row():
80
+ decline_btn = gr.Button(DECLINE[language])
81
+ accept_btn = gr.Button(ACCEPT[language])
82
+
83
+ decline_info = gr.Markdown("", visible=False)
84
+
85
+ # ---- Chat Screen (Page 2) ----
86
+ with gr.Column(visible=False) as chat_screen:
87
+ chat = gr.ChatInterface(
88
+ fn=lambda msg, history, agent: self._chat(
89
+ message=msg, history=history, agent=agent
90
+ ),
91
+ additional_inputs=[agent_state],
92
+ title="Executive Education Adviser",
93
+ )
94
+
95
+ with gr.Row():
96
+ withdraw_button = gr.Button(WITHDRAW_TEXT[language], visible=False, variant="stop")
97
+
98
+ def create_session_id() -> str:
99
+ return str(uuid.uuid4())
100
+
101
+ def initialize_agent(lang: str, session_id: str):
102
+ agent = ExecutiveAgentChain(language=lang, session_id=session_id)
103
+ greeting = agent.generate_greeting()
104
+
105
+ disclaimer_html = get_disclaimer_widget(lang)
106
+
107
+ full_content = f"{disclaimer_html}{greeting}"
108
+
109
+ return agent, [{"role": "assistant", "content": full_content}]
110
+
111
+ def label_to_lang_code(label: str) -> str:
112
+ return "en" if label == "English" else "de"
113
+
114
+ # Language change: before consent => only update consent UI text.
115
+ # After consent: keep chat running (or optionally re-init agent on language change).
116
+ def on_language_change(
117
+ language_label: str,
118
+ consent_given: bool,
119
+ agent,
120
+ session_id: str,
121
+ ):
122
+ lang_code = label_to_lang_code(language_label)
123
+
124
+ # Before consent: update consent screen text to selected language
125
+ if not consent_given:
126
+ return (
127
+ lang_code,
128
+ gr.update(value=PRIVACY_NOTICE[lang_code]),
129
+ gr.update(value=DECLINE[lang_code]),
130
+ gr.update(value=ACCEPT[lang_code]),
131
+ gr.update(visible=False, value=""),
132
+ None, # agent_state stays None
133
+ None, # chat stays as it is
134
+ gr.update(value=WITHDRAW_TEXT[lang_code], visible=False),
135
+ )
136
+
137
+ # After consent
138
+ new_agent, greeting = initialize_agent(lang_code, session_id=session_id)
139
+ return (
140
+ lang_code,
141
+ gr.update(value=PRIVACY_NOTICE[lang_code]),
142
+ gr.update(value=DECLINE[lang_code]),
143
+ gr.update(value=ACCEPT[lang_code]),
144
+ gr.update(visible=False, value=""),
145
+ new_agent,
146
+ greeting,
147
+ gr.update(value=WITHDRAW_TEXT[lang_code], visible=True),
148
+ )
149
+
150
+ def on_accept(lang: str, session_id: str):
151
+ agent, greeting = initialize_agent(lang, session_id=session_id)
152
+ self._consentLogger.log(session_id, "accepted", policy_version="1.0")
153
+ self._language = lang
154
+ return (
155
+ gr.update(visible=False), # consent_screen hide
156
+ gr.update(visible=True), # chat_screen show
157
+ True, # consent_state
158
+ agent, # agent_state
159
+ greeting, # chat initial history
160
+ gr.update(visible=False, value=""), # decline_info hide
161
+ gr.update(visible=True), # show reset_button
162
+ gr.update(value=WITHDRAW_TEXT[lang], visible=True),
163
+ )
164
+
165
+ def on_decline(lang: str, session_id: str):
166
+ self._language = lang
167
+ self._consentLogger.log(session_id, "declined", policy_version="1.0")
168
+ return (
169
+ gr.update(visible=True), # consent_screen stays
170
+ gr.update(visible=False), # chat_screen stays hidden
171
+ False, # consent_state
172
+ None, # agent_state
173
+ [], # chat history empty
174
+ gr.update(visible=True, value=DECLINE_MESSAGE[lang]),
175
+ )
176
+
177
+ def on_reset_chat(lang: str, session_id: str):
178
+ agent, greeting = initialize_agent(lang, session_id=session_id)
179
+ self._language = lang
180
+ return (
181
+ agent,
182
+ greeting,
183
+ )
184
+
185
+ def on_withdraw(lang: str, agent, session_id: str):
186
+ self._consentLogger.log(session_id, "withdrawn", policy_version="1.0")
187
+
188
+ # 1) wipe server-side
189
+ if agent is not None:
190
+ try:
191
+ agent.wipe_session_data()
192
+ logger.info("wipe_session_data executed")
193
+ except Exception as e:
194
+ logger.error(f"wipe_session_data failed: {e}", exc_info=True)
195
+
196
+ # 2) lock chat again (back to consent screen)
197
+ new_session_id = create_session_id()
198
+ return (
199
+ gr.update(visible=True), # consent_screen
200
+ gr.update(value=PRIVACY_NOTICE[lang]), # data_policy
201
+ gr.update(value=DECLINE[lang]), # decline_btn
202
+ gr.update(value=ACCEPT[lang]), # accept_btn
203
+ gr.update(visible=False), # chat_screen
204
+ gr.update(visible=True, value=WITHDRAW_CONFIRMATION_MESSAGE[lang]), # decline_info
205
+ False, # consent_state
206
+ None, # agent_state
207
+ [], # chat.chatbot_value (history)
208
+ gr.update(visible=False), # reset_button
209
+ gr.update(visible=False), # withdraw_button
210
+ new_session_id, # session_id_state
211
+ )
212
+
213
+ # Language switch updates consent UI if consent not given
214
+ lang_selector.change(
215
+ fn=on_language_change,
216
+ inputs=[lang_selector, consent_state, agent_state, session_id_state],
217
+ outputs=[lang_state,
218
+ data_policy,
219
+ decline_btn,
220
+ accept_btn,
221
+ decline_info,
222
+ agent_state,
223
+ chat.chatbot_value,
224
+ withdraw_button,
225
+ ],
226
+ queue=True,
227
+ )
228
+
229
+ # Accept/Decline data consent
230
+ accept_btn.click(
231
+ fn=on_accept,
232
+ inputs=[lang_state, session_id_state],
233
+ outputs=[
234
+ consent_screen,
235
+ chat_screen,
236
+ consent_state,
237
+ agent_state,
238
+ chat.chatbot_value,
239
+ decline_info,
240
+ reset_button,
241
+ withdraw_button,
242
+ ],
243
+ queue=True,
244
+ )
245
+
246
+ decline_btn.click(
247
+ fn=on_decline,
248
+ inputs=[lang_state, session_id_state],
249
+ outputs=[consent_screen, chat_screen, consent_state, agent_state, chat.chatbot_value, decline_info],
250
+ queue=True,
251
+ )
252
+
253
+ # Reset
254
+ reset_button.click(
255
+ fn=on_reset_chat,
256
+ inputs=[lang_state, session_id_state],
257
+ outputs=[
258
+ agent_state,
259
+ chat.chatbot_value,
260
+ ],
261
+ queue=True,
262
+ )
263
+
264
+ # Withdraw consent
265
+ withdraw_button.click(
266
+ fn=on_withdraw,
267
+ inputs=[lang_state, agent_state, session_id_state],
268
+ outputs=[
269
+ consent_screen,
270
+ data_policy,
271
+ decline_btn,
272
+ accept_btn,
273
+ chat_screen,
274
+ decline_info,
275
+ consent_state,
276
+ agent_state,
277
+ chat.chatbot_value,
278
+ reset_button,
279
+ withdraw_button,
280
+ session_id_state,
281
+ ],
282
+ queue=True,
283
+ )
284
+
285
+
286
+ @property
287
+ def app(self) -> gr.Blocks:
288
+ """Expose underlying Gradio Blocks for external runners (e.g., HF Spaces)."""
289
+ return self._app
290
+
291
+ def _chat(self, message: str, history: list[dict], agent: ExecutiveAgentChain):
292
+ if agent is None:
293
+ logger.error("Agent not initialized")
294
+ return ["I apologize, but the chatbot is not properly initialized."]
295
+
296
+ answers = []
297
+ try:
298
+ logger.info(f"Processing user query: {message[:100]}...")
299
+ response = agent.query(message)
300
+ answers.append(response.response)
301
+ self._language = response.language
302
+
303
+ if response.show_booking_widget:
304
+ html_code = get_booking_widget(language=self._language, programs=response.relevant_programs)
305
+ answers.append(gr.HTML(value=html_code))
306
+ except Exception as e:
307
+ logger.error(f"Error processing query: {e}", exc_info=True)
308
+ error_message = (
309
+ "I apologize, but I encountered an error processing your request. "
310
+ "Please try rephrasing your question or contact our admissions team for assistance."
311
+ )
312
+ answers.append(error_message)
313
+
314
+ return answers
315
+
316
+
317
+ def run(self):
318
+ import uvicorn
319
+ uvicorn.run(
320
+ self._app,
321
+ host='0.0.0.0',
322
+ port=7860,
323
+ log_config=None
324
+ )
src/apps/dbapp/app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tkinter import *
2
+ from tkinter import ttk
3
+ from src.database.weavservice import WeaviateService
4
+
5
+ from src.apps.dbapp.mainframe import MainFrame
6
+ from src.apps.dbapp.query import QueryFrame
7
+ from src.apps.dbapp.imports import ImportFrame
8
+ from src.apps.dbapp.backup import BackupsFrame
9
+ from src.apps.dbapp.collections import CollectionsFrame
10
+ from src.apps.dbapp.config import SchemaConfigurationFrame
11
+
12
+ from src.utils.logging import get_logger
13
+
14
+ logger = get_logger("db_inter ")
15
+
16
class DatabaseApplication:
    """Tk desktop window that exposes the database tools as notebook tabs."""

    def __init__(self) -> None:
        """Create the root window, the Weaviate service and one tab per tool frame."""
        self._root = Tk()
        self._service = WeaviateService()

        self._root.title("Database Interface")
        self._root.geometry("810x500")

        notebook = ttk.Notebook(self._root)
        notebook.pack(fill=BOTH, expand=True)

        # (tab label, frame class) in display order.
        tab_specs = (
            ('Main', MainFrame),
            ('Import', ImportFrame),
            ('Schemas', SchemaConfigurationFrame),
            ('Collections', CollectionsFrame),
            ('Query', QueryFrame),
            ('Backups', BackupsFrame),
        )

        # Build every frame first and only then register the tabs.
        built_tabs = [
            (label, frame_cls(notebook, self._service).init())
            for label, frame_cls in tab_specs
        ]
        for label, frame in built_tabs:
            notebook.add(frame, text=label)

        logger.info("Application initialization finished")

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self._root.mainloop()
src/apps/dbapp/backup.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, shutil
2
+ from datetime import datetime
3
+
4
+ from tkinter import *
5
+ from tkinter import ttk
6
+ from src.database.weavservice import WeaviateService
7
+ from src.apps.dbapp.framebase import CustomFrameBase
8
+ from src.apps.dbapp.utilclasses import BackupData
9
+ from src.config import config
10
+
11
def _load_backup_files():
    """Return one ``BackupData`` per entry found in the configured backup directory.

    The directory is created first when it does not exist yet, so a fresh
    installation yields an empty list instead of an error.
    """
    backup_dir = config.weaviate.BACKUP_PATH
    os.makedirs(backup_dir, exist_ok=True)
    return [BackupData(entry) for entry in os.listdir(backup_dir)]
19
+
20
class BackupsFrame(CustomFrameBase):
    """Notebook tab that lists stored backups and offers create / restore / delete."""

    def __init__(self, parent, service: WeaviateService):
        super().__init__(parent, service)
        self._backups = _load_backup_files()

    def init(self) -> ttk.Frame:
        """Build the backups tab and return its root frame.

        The tab holds a tree of backups (children are the collections of a
        backup), a coloured status label and the three action buttons.
        Restore and delete are only enabled while a top-level row is selected.
        """
        # Re-read the directory so the tab shows what is on disk at build time.
        self._backups = _load_backup_files()

        main_frame = ttk.Frame(self._parent)
        main_frame.pack(fill=BOTH, expand=True)

        tree_frame = ttk.Frame(main_frame)
        tree_frame.pack(fill=BOTH, expand=True, padx=10, pady=10)

        label_frame = ttk.Frame(main_frame)
        label_frame.pack(fill=X, expand=True, padx=10, pady=10)

        button_frame = ttk.Frame(main_frame)
        button_frame.pack(fill=X, padx=10, pady=10)

        # Direction the NEXT click on the date heading will sort in.
        date_reverse_sort = True
        columns = ('date', 'size')

        info_label = ttk.Label(label_frame, text="", padding=8)

        def _print_label(msg, backc, forc):
            info_label.configure(text=msg, foreground=forc, background=backc)
            # Force a redraw so the message is visible during a blocking service call.
            info_label.update_idletasks()

        def print_failure(msg: str):
            _print_label(msg, "#FFCDD2", "#B71C1C")

        def print_info(msg: str):
            _print_label(msg, "#cdedff", "#1c31b7")

        def print_success(msg: str):
            _print_label(msg, "#d7ffcd", "#4db71c")

        tree = ttk.Treeview(
            tree_frame,
            columns=columns,
            show='tree headings',
            selectmode='browse',
        )

        def _date_sort_key(entry):
            # Group by type first: a datetime and an unparsed string cannot be
            # compared with each other and would raise TypeError in sort().
            value, item_id = entry
            return (isinstance(value, datetime), value, item_id)

        def sort_by_date():
            nonlocal date_reverse_sort

            data = []
            for p in tree.get_children(""):
                value = tree.set(p, 'date')
                try:
                    value = datetime.strptime(value, "%d.%m.%Y %H:%M:%S")
                except (ValueError, TypeError):
                    # Keep the raw text; such rows are sorted among themselves.
                    pass
                data.append((value, p))

            data.sort(key=_date_sort_key, reverse=date_reverse_sort)
            date_reverse_sort = not date_reverse_sort

            for index, (_, p) in enumerate(data):
                tree.move(p, "", index)

            tree.heading(
                'date',
                text='Created at ' + ('▾' if date_reverse_sort else '▴'),
                command=sort_by_date,
            )

        tree.heading('#0', text='Backup ID')
        tree.heading('date', text='Created at ▾', command=sort_by_date)
        tree.heading('size', text='Embeddings amount')

        tree.column("#0", width=100)
        tree.column("date", width=60)
        tree.column("size", width=30)

        def insert_backup(backup):
            bk = backup.to_treeformat()
            # NOTE(review): `values` is passed through unchanged. If bk['date'] /
            # collection['size'] are scalars rather than sequences, Tk treats a
            # string as a whitespace-separated list and a single value lands in
            # the first ('date') column — confirm against BackupData.to_treeformat.
            parent = tree.insert(
                '',
                0 if not date_reverse_sort else END,
                text=bk['id'],
                values=bk['date'],
            )
            for collection in bk['collections']:
                tree.insert(
                    parent,
                    END,
                    text=collection['name'],
                    values=collection['size'],
                )

        for backup in self._backups:
            insert_backup(backup)
        sort_by_date()

        def create_backup():
            print_info("Creating new backup...")
            try:
                backup_id = self._service._create_backup()
                backup = BackupData(backup_id)
            except Exception as exc:
                # UI boundary: show the failure instead of leaving the
                # "Creating..." message on screen forever.
                print_failure(f"Failed to create backup: {exc}")
                return

            self._backups.append(backup)
            insert_backup(backup)
            print_success(f"Successfully created new backup {backup._backup_id}!")

        def restore_backup():
            selected = tree.selection()
            if not selected:
                return
            backup = tree.item(selected[0])

            print_info(f"Restoring backup {backup['text']}...")
            try:
                self._service._restore_backup('backup_' + backup['text'])
            except Exception as exc:
                print_failure(f"Failed to restore backup {backup['text']}: {exc}")
                return
            print_success(f"Successfully restored backup {backup['text']}!")

        def delete_backup():
            selected = tree.selection()
            if not selected:
                return
            item_id = selected[0]
            backup = tree.item(item_id)

            backup_path = os.path.join(config.weaviate.BACKUP_PATH, 'backup_' + backup['text'])
            shutil.rmtree(backup_path, ignore_errors=True)
            if os.path.isdir(backup_path):
                # rmtree swallowed an error; do not claim success or drop the row.
                print_failure(f"Could not delete backup {backup['text']}.")
                return

            tree.delete(item_id)
            print_success(f"Deleted backup {backup['text']}.")

        create_bkp_btn = ttk.Button(
            button_frame,
            text="Create Backup",
            command=create_backup,
        )

        restore_bkp_btn = ttk.Button(
            button_frame,
            text="Restore Backup",
            command=restore_backup,
            state=['disabled'],
        )

        delete_bkp_btn = ttk.Button(
            button_frame,
            text="Delete Backup",
            command=delete_backup,
            state=['disabled'],
        )

        def on_item_selection(event):
            selected = tree.selection()
            if not selected:
                restore_bkp_btn.state(['disabled'])
                delete_bkp_btn.state(['disabled'])
                return

            # Only top-level rows are backups; child rows are collections.
            is_parent = tree.parent(selected[0]) == ''
            new_state = ['!disabled' if is_parent else 'disabled']
            restore_bkp_btn.state(new_state)
            delete_bkp_btn.state(new_state)

        tree.bind("<<TreeviewSelect>>", on_item_selection)

        scrollbar = ttk.Scrollbar(tree_frame, orient="vertical", command=tree.yview)
        tree.configure(yscrollcommand=scrollbar.set)

        info_label.pack()

        tree.pack(side=LEFT, fill=BOTH, expand=True)
        scrollbar.pack(side=RIGHT, fill=Y)

        create_bkp_btn.pack(side=LEFT, padx=5)
        restore_bkp_btn.pack(side=RIGHT, padx=5)
        delete_bkp_btn.pack(side=RIGHT, padx=5)

        return main_frame
src/apps/dbapp/collections.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from tkinter import *
2
+ from tkinter import ttk
3
+ from src.apps.dbapp.framebase import CustomFrameBase
4
+ from src.database.weavservice import WeaviateService
5
+
6
class CollectionsFrame(CustomFrameBase):
    """'Collections' tab; adds nothing of its own and uses the base frame as-is."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent=parent, service=service)
src/apps/dbapp/config.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+
3
+ from tkinter import *
4
+ from tkinter import ttk
5
+ from src.apps.dbapp.framebase import CustomFrameBase
6
+ from src.utils.stratutils.generator import generate_strategy
7
+ from src.database.weavservice import WeaviateService
8
+ from src.config import config
9
+
10
def _dump_schema(schema):
    """Persist *schema* as indented JSON in ``properties.json``.

    The file lives in ``config.weaviate.PROPERTIES_PATH``, which is created on
    demand. Values json cannot encode natively are written via ``str()``.
    """
    out_dir = config.weaviate.PROPERTIES_PATH
    os.makedirs(out_dir, exist_ok=True)

    out_path = os.path.join(out_dir, 'properties.json')
    with open(out_path, mode='w', encoding='utf-8') as handle:
        json.dump(schema, handle, indent=2, default=str)
15
+
16
+
17
class SchemaConfigurationFrame(CustomFrameBase):
    """Notebook tab for viewing and editing the property schema and per-property strategies.

    The schema is kept as ``{property name: settings dict}`` and written to
    ``properties.json`` after every change; each property also has a strategy
    source file ``strat_<name>.py`` in ``config.weaviate.STRATEGIES_PATH``.
    """

    # Data types offered by the add/edit dialogs.
    _DATA_TYPES = ["text", "int", "number", "boolean", "date", "text[]", "int[]", "number[]", "boolean[]", "date[]", "object"]

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent, service)
        self._schema = self._load_schema_data()
        self._strategies = self._load_strategies()

    def _load_strategies(self) -> dict:
        """Return ``{property name: strategy source}``, generating missing strategy files."""
        os.makedirs(config.weaviate.STRATEGIES_PATH, exist_ok=True)
        loaded_strats = set(os.listdir(config.weaviate.STRATEGIES_PATH))
        strategies = {}

        for name, prop in self._schema.items():
            strategy_file = f"strat_{name}.py"
            file_path = os.path.join(config.weaviate.STRATEGIES_PATH, strategy_file)

            if strategy_file not in loaded_strats:
                strategy_content = generate_strategy(name, prop)
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(strategy_content)
            else:
                # Read with the same encoding the files are written in.
                with open(file_path, encoding='utf-8') as f:
                    strategy_content = f.read()

            strategies[name] = strategy_content

        return strategies

    def _save_strategy(self, name, strategy) -> None:
        """Store *strategy* for property *name* in memory and in its strategy file."""
        os.makedirs(config.weaviate.STRATEGIES_PATH, exist_ok=True)
        self._strategies[name] = strategy

        file_path = os.path.join(config.weaviate.STRATEGIES_PATH, f"strat_{name}.py")
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(strategy)

    def _load_schema_data(self) -> dict:
        """Read the properties of the first schema entry from the service and dump them to disk.

        Returns an empty dict (and writes nothing) when the service reports no schema.
        """
        schema_data = {}

        schema = self._service._extract_data()['schema']
        if not schema:
            return schema_data

        for prop in schema[0]['properties']:
            schema_data[prop['name']] = {
                'description': prop.get('description', ''),
                'data_type': prop['dataType'][0],
                'filterable': prop['indexFilterable'],
                'searchable': prop['indexSearchable'],
                'skip_vectorization': prop['moduleConfig']['text2vec-huggingface']['skip'],
            }

        _dump_schema(schema_data)

        return schema_data

    def _update_schema_property(self, old_name: str, new_name: str, prop: dict) -> None:
        """Replace property *old_name* with *prop* stored under *new_name* and persist."""
        del self._schema[old_name]
        self._schema[new_name] = prop
        _dump_schema(self._schema)

    def _add_schema_property(self, name, prop: dict) -> None:
        """Add (or overwrite) property *name* and persist the schema."""
        self._schema[name] = prop
        _dump_schema(self._schema)

    def _delete_schema_property(self, name) -> None:
        """Remove property *name* and persist the schema."""
        del self._schema[name]
        _dump_schema(self._schema)

    def init(self) -> ttk.Frame:
        """Build the tab: an 'Add property' button above a scrollable property table."""
        main_frame = ttk.Frame(self._parent)
        main_frame.pack(fill=BOTH, expand=True)

        schema_frame = ttk.Frame(main_frame)
        schema_frame.pack(fill=BOTH, expand=True)

        # refresh_table is defined below; the lambda resolves it at click time.
        add_button = ttk.Button(schema_frame, text='Add property',
                                command=lambda: self._add_property(refresh_table))
        add_button.pack(anchor=NW, padx=5, pady=5)

        canvas = Canvas(schema_frame)
        scrollbar = ttk.Scrollbar(schema_frame, orient="vertical", command=canvas.yview)
        scrollable_frame = ttk.Frame(canvas)

        # Keep the scroll region in sync with the table's size.
        scrollable_frame.bind("<Configure>", lambda _: canvas.configure(scrollregion=canvas.bbox("all")))
        canvas.create_window((0, 0), window=scrollable_frame, anchor="nw")
        canvas.configure(yscrollcommand=scrollbar.set)
        canvas.pack(side=LEFT, fill=BOTH, expand=True)
        scrollbar.pack(side=RIGHT, fill=Y)

        def refresh_table():
            for widget in scrollable_frame.winfo_children():
                widget.destroy()

            self._build_table(scrollable_frame, refresh_table)

        refresh_table()
        return main_frame

    def _build_table(self, parent_frame, refresh_callback):
        """Render one header row plus one row (labels and action buttons) per schema property."""
        style = ttk.Style()
        style.configure('Header.TLabel', font=('Helvetica', 10, 'bold'), background='#e0e0e0')
        style.configure('EvenRow.TLabel', background='#f0f0f0')
        style.configure('OddRow.TLabel', background='white')

        table_frame = ttk.Frame(parent_frame)
        table_frame.pack(fill=X, padx=5, pady=5)

        for i in range(5):
            table_frame.grid_columnconfigure(i, minsize=100, weight=1)

        headers = ['Name', 'Data Type', 'Filterable', 'Searchable', 'Skip Vectorize']
        for col, text in enumerate(headers):
            label = ttk.Label(table_frame, text=text, borderwidth=1, relief=SOLID, anchor='center', style='Header.TLabel')
            label.grid(row=0, column=col, sticky='ew')

        for idx, (name, prop) in enumerate(self._schema.items(), start=1):
            row_style = 'EvenRow.TLabel' if idx % 2 == 0 else 'OddRow.TLabel'

            row_name_label = ttk.Label(table_frame, text=name, style=row_style)
            row_type_label = ttk.Label(table_frame, text=prop['data_type'].upper(), style=row_style)
            row_filterable_label = ttk.Label(table_frame, text='Yes' if prop['filterable'] else 'No', style=row_style)
            row_searchable_label = ttk.Label(table_frame, text='Yes' if prop['searchable'] else 'No', style=row_style)
            row_vectorize_label = ttk.Label(table_frame, text='Yes' if prop['skip_vectorization'] else 'No', style=row_style)

            # Default arguments bind the current row's values to each callback.
            row_edit_button = ttk.Button(table_frame, text='Edit',
                                         command=lambda n=name, p=prop: self._edit_property(n, p, refresh_callback))
            row_delete_button = ttk.Button(table_frame, text='Delete',
                                           command=lambda n=name: self._delete_property(n, refresh_callback))
            row_strategy_button = ttk.Button(table_frame, text='Strategy',
                                             command=lambda n=name: self._handle_strategy(n))

            row_name_label.grid(row=idx, column=0, sticky='ew', ipadx=25)
            row_type_label.grid(row=idx, column=1, sticky='ew', ipadx=25)
            row_filterable_label.grid(row=idx, column=2, sticky='ew', ipadx=25)
            row_searchable_label.grid(row=idx, column=3, sticky='ew')
            row_vectorize_label.grid(row=idx, column=4, sticky='ew')
            row_edit_button.grid(row=idx, column=5, sticky='ew')
            row_delete_button.grid(row=idx, column=6, sticky='ew')
            row_strategy_button.grid(row=idx, column=7, sticky='ew')

    def _handle_strategy(self, n):
        """Open an editor for the strategy source of property *n*; 'Save' writes it back."""
        strategy = self._strategies.get(n)
        if strategy is None:
            # Properties added or renamed after startup have no strategy yet;
            # generate and store one the same way _load_strategies does.
            strategy = generate_strategy(n, self._schema[n])
            self._save_strategy(n, strategy)

        dialog = Toplevel()
        dialog.title(f"Property {n} strategy")
        dialog.geometry("700x400")

        field_frame = ttk.Frame(dialog)
        field_frame.pack(fill=BOTH, expand=True, padx=10, pady=10)

        scrollbar = Scrollbar(field_frame, orient=VERTICAL)
        scrollbar.pack(side=RIGHT, fill=Y)

        edit_field = Text(field_frame, width=80, height=15, wrap=WORD, yscrollcommand=scrollbar.set)
        edit_field.insert(END, strategy)
        edit_field.pack(side=LEFT, fill=BOTH, expand=True)

        scrollbar.config(command=edit_field.yview)

        def commit():
            new_strategy = edit_field.get("1.0", END).strip()
            self._save_strategy(n, new_strategy)
            dialog.destroy()

        ttk.Button(dialog, text="Save", command=commit).pack(side=BOTTOM, anchor=S, pady=10)

    def _delete_property(self, name, refresh_callback):
        """Ask for confirmation, then delete property *name* and redraw the table."""
        msg = f"Do you want to delete property '{name}'?"
        dialog = Toplevel()
        dialog.title('Warning!')
        # Width is a rough estimate derived from the message length.
        dialog.geometry(f"{len(msg)*5+120}x50")
        dialog.grab_set()

        ttk.Label(dialog, text=msg).pack()

        def submit():
            self._delete_schema_property(name)
            refresh_callback()
            dialog.destroy()

        button_frame = ttk.Frame(dialog)
        button_frame.pack(fill=X, expand=True)

        ttk.Button(button_frame, text='Delete', command=submit).pack(side=LEFT, padx=15)
        ttk.Button(button_frame, text='Cancel', command=dialog.destroy).pack(side=RIGHT, padx=15)

    def _open_property_dialog(self, title, refresh_callback, original_name=None, prop=None):
        """Show the shared add/edit form.

        Args:
            title: Window title.
            refresh_callback: Called after a successful save to redraw the table.
            original_name: Name of the property being edited, or ``None`` to add a new one.
            prop: Current settings used to pre-fill the form; ``None`` uses the
                defaults for a new property.
        """
        if prop is None:
            prop = {
                'description': '',
                'data_type': 'text',
                'filterable': True,
                'searchable': True,
                'skip_vectorization': False,
            }

        dialog = Toplevel()
        dialog.title(title)
        dialog.geometry("280x300")
        dialog.grab_set()

        texts_frame = ttk.Frame(dialog)
        texts_frame.pack(fill=X, expand=True)

        ttk.Label(texts_frame, text="Name:").grid(row=0, column=0, padx=5, pady=5, sticky='e')
        name_entry = ttk.Entry(texts_frame)
        if original_name is not None:
            name_entry.insert(0, original_name)
        name_entry.grid(row=0, column=1, padx=5, pady=5, sticky='w')

        ttk.Label(texts_frame, text="Description:").grid(row=1, column=0, padx=5, pady=5, sticky='e')
        desc_entry = ttk.Entry(texts_frame)
        desc_entry.insert(0, prop.get('description', ''))
        desc_entry.grid(row=1, column=1, padx=5, pady=5, sticky='w')

        ttk.Label(texts_frame, text="Data Type:").grid(row=2, column=0, padx=5, pady=5, sticky='e')
        type_var = StringVar(value=prop['data_type'])
        type_combo = ttk.Combobox(texts_frame, textvariable=type_var, values=self._DATA_TYPES)
        type_combo.grid(row=2, column=1, padx=5, pady=5, sticky='w')

        checks_frame = ttk.Frame(dialog)
        checks_frame.pack(fill=X, expand=True)

        filterable_var = BooleanVar(value=prop['filterable'])
        searchable_var = BooleanVar(value=prop['searchable'])
        skip_vec_var = BooleanVar(value=prop['skip_vectorization'])

        ttk.Checkbutton(checks_frame, text="Filterable ", variable=filterable_var).pack(anchor=W, padx=15)
        ttk.Checkbutton(checks_frame, text="Searchable ", variable=searchable_var).pack(anchor=W, padx=15)
        ttk.Checkbutton(checks_frame, text="Skip Vectorization", variable=skip_vec_var).pack(anchor=W, padx=15)

        def submit():
            new_name = name_entry.get().strip()
            if not new_name:
                self._show_messagebox("Parameter 'name' is required!")
                return
            # Adding, or renaming onto another property's name, must not
            # silently overwrite that property.
            if new_name != original_name and new_name in self._schema:
                self._show_messagebox(f"Property with name '{new_name}' already exists!")
                return

            new_prop = {
                'description': desc_entry.get().strip(),
                'data_type': type_var.get(),
                'filterable': filterable_var.get(),
                'searchable': searchable_var.get(),
                'skip_vectorization': skip_vec_var.get(),
            }

            if original_name is None:
                self._add_schema_property(new_name, new_prop)
            else:
                self._update_schema_property(original_name, new_name, new_prop)
            refresh_callback()
            dialog.destroy()

        buttons_frame = ttk.Frame(dialog)
        buttons_frame.pack(fill=X, expand=True)

        ttk.Button(buttons_frame, text="Save", command=submit).pack(side=LEFT, padx=15)
        ttk.Button(buttons_frame, text="Cancel", command=dialog.destroy).pack(side=RIGHT, padx=15)

    def _add_property(self, refresh_callback):
        """Open the form for a new property; *refresh_callback* runs after saving."""
        self._open_property_dialog("New property", refresh_callback)

    def _edit_property(self, name: str, prop: dict, refresh_callback):
        """Open the form pre-filled with property *name*; *refresh_callback* runs after saving."""
        self._open_property_dialog(f"Edit Property: {name}", refresh_callback,
                                   original_name=name, prop=prop)

    @staticmethod
    def _show_messagebox(msg):
        """Show a small modal warning window containing *msg* and an OK button."""
        dialog = Toplevel()
        dialog.title('Warning!')
        dialog.geometry(f"{len(msg)*5+120}x50")
        dialog.grab_set()

        ttk.Label(dialog, text=msg).pack()
        ttk.Button(dialog, text='OK', command=dialog.destroy).pack(padx=15)
src/apps/dbapp/framebase.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tkinter import *
2
+ from tkinter import ttk
3
+ from src.database.weavservice import WeaviateService
4
+
5
class CustomFrameBase:
    """Common base of the notebook tabs: remembers the parent widget and the service."""

    def __init__(self, parent, service: WeaviateService) -> None:
        self._parent, self._service = parent, service

    def init(self) -> ttk.Frame:
        """Create an empty frame under the parent, pack it and return it.

        Subclasses override this to build their actual tab content.
        """
        frame = ttk.Frame(self._parent)
        frame.pack()
        return frame
src/apps/dbapp/imports.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import threading
3
+ from tkinter import *
4
+ from tkinter import ttk
5
+ from tkinter import filedialog
6
+ from queue import Queue
7
+
8
+ from .framebase import CustomFrameBase
9
+
10
+ from src.pipeline.pipeline import ImportPipeline
11
+ from src.pipeline.utils import ProcessingResult
12
+
13
+ from src.database.weavservice import WeaviateService
14
+ from src.utils.lang import get_language_name
15
+ from src.config import config
16
+
17
class ImportFrame(CustomFrameBase):
    """Notebook tab for importing local files and URLs through ``ImportPipeline``."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent, service)
        # Maps file base name -> full path of the files queued for import.
        # NOTE(review): two files with the same base name from different
        # directories overwrite each other here — confirm this is intended.
        self._import_paths = dict()

    def init(self) -> ttk.Frame:
        """Build the tab: file list with controls on the left, URL box on the right."""
        main_frame = ttk.Frame(self._parent)
        main_frame.pack(fill=BOTH, expand=True)

        # ====================== Helper functions ======================
        def update_treeview():
            for item in self.files_treeview.get_children():
                self.files_treeview.delete(item)
            for filename in self._import_paths:
                self.files_treeview.insert("", 0, text=filename)

        def open_file_dialog():
            filepaths = filedialog.askopenfilenames(
                title="Select files to import",
                filetypes=(("PDF", "*.pdf"), ("Text files", "*.txt"), ("All files", "*.*"))
            )
            for path in filepaths:
                self._import_paths[os.path.basename(path)] = path
            update_treeview()

        def remove_files():
            selection = self.files_treeview.selection()
            if not selection:
                return
            for item in selection:
                filename = self.files_treeview.item(item)["text"]
                self._import_paths.pop(filename, None)
            update_treeview()

        def change_button_state(state):
            add_button.config(state=state)
            remove_button.config(state=state)
            import_button.config(state=state)

        # Configure grid for 50/50 split
        main_frame.grid_rowconfigure(0, weight=1)
        main_frame.grid_columnconfigure(0, weight=1)
        main_frame.grid_columnconfigure(1, weight=1)

        # ====================== LEFT SIDE ======================
        left_frame = ttk.Frame(main_frame)
        left_frame.grid(row=0, column=0, sticky='nsew', padx=(10, 5), pady=10)

        # Button row for add/remove
        btn_row = ttk.Frame(left_frame)
        btn_row.pack(fill=X, pady=(0, 8))

        add_button = ttk.Button(btn_row, text="Add files", command=open_file_dialog)
        add_button.pack(side=LEFT, padx=8)

        remove_button = ttk.Button(btn_row, text="Remove files", command=remove_files)
        remove_button.pack(side=LEFT, padx=8)

        # Controls row for checkbox and import button
        controls_row = ttk.Frame(left_frame)
        controls_row.pack(fill=X, pady=(0, 8))

        import_button = ttk.Button(
            controls_row,
            text="Begin Import",
            command=lambda: self._import_callback(change_button_state)
        )
        import_button.pack(side=LEFT, padx=10)

        self.reset_cd_var = BooleanVar(value=False)
        reset_cb = ttk.Checkbutton(
            controls_row,
            text="Reset database",
            variable=self.reset_cd_var
        )
        reset_cb.pack(side=LEFT, padx=8, pady=6)

        # Files treeview
        self.files_treeview = ttk.Treeview(
            left_frame,
            columns=[],
            show="tree headings",
            selectmode="extended",
            height=18
        )
        self.files_treeview.heading("#0", text="File name")
        self.files_treeview.column("#0", width=260)
        self.files_treeview.pack(fill=BOTH, expand=True, pady=8)

        # ====================== RIGHT SIDE ======================
        right_frame = ttk.Frame(main_frame)
        right_frame.grid(row=0, column=1, sticky='nsew', padx=(5, 10), pady=10)

        ttk.Label(right_frame, text="Enter URLs (one per line):").pack(anchor=W, padx=5, pady=(0, 6))

        self.url_text = Text(right_frame, width=28, height=22, undo=True, wrap="word", font=("Segoe UI", 10))
        self.url_text.pack(side=LEFT, fill=BOTH, expand=True, padx=5, pady=5)

        # Pre-fill with the configured targets; tolerate a missing/empty setting.
        self.url_text.insert(END, '\n'.join(config.get('SCRAPING_TARGET_URLS') or []))

        # Scrollbar
        scrollbar = ttk.Scrollbar(right_frame, orient="vertical", command=self.url_text.yview)
        scrollbar.pack(side=RIGHT, fill=Y)
        self.url_text.config(yscrollcommand=scrollbar.set)

        return main_frame

    def _deduplication_callback(self, source: str, amount: int):
        """Ask the user whether duplicated chunks of *source* should be re-imported.

        The dialog is scheduled on the Tk event loop and this call blocks on a
        queue until the user answers, so it must be called from a thread other
        than the one running the main loop (calling it there would block the
        loop before the dialog can appear).

        Returns:
            ``True`` for 'Reimport', ``False`` for 'Dispose' or closing the window.
        """
        result_queue = Queue()

        def show_dialog():
            dialog = Toplevel()
            dialog.title("Duplicated content!")
            dialog.bell()

            wrap_width = 360

            info_label = ttk.Label(
                dialog,
                text=f'{amount} duplicated chunks found in database for {source}!',
                wraplength=wrap_width,
                justify=LEFT
            )
            info_label2 = ttk.Label(
                dialog,
                text='Would you like to reimport them with updated properties?',
                wraplength=wrap_width,
                justify=LEFT
            )

            info_label.pack(fill=X, anchor=W, padx=15, pady=15)
            info_label2.pack(fill=X, anchor=W, padx=15, pady=15)

            def reimport_callback():
                result_queue.put(True)
                dialog.destroy()

            def dispose_callback():
                result_queue.put(False)
                dialog.destroy()

            reimport_button = ttk.Button(dialog, text='Reimport', command=reimport_callback)
            dispose_button = ttk.Button(dialog, text='Dispose', command=dispose_callback)

            reimport_button.pack(side=LEFT, padx=15, pady=15)
            dispose_button.pack(side=RIGHT, padx=15, pady=15)

            # Size the window to its content plus a small margin.
            dialog.update_idletasks()
            width = dialog.winfo_reqwidth() + 20
            height = dialog.winfo_reqheight() + 20
            dialog.geometry(f"{width}x{height}")

            # Closing the window counts as 'Dispose' so the waiting thread is released.
            dialog.protocol("WM_DELETE_WINDOW", dispose_callback)

            dialog.wait_visibility()
            dialog.grab_set()

        self._parent.after(0, show_dialog)
        return result_queue.get()

    def _import_callback(self, button_state_callback):
        """Open the status window and run the import pipeline in a background thread.

        Args:
            button_state_callback: Called with ``DISABLED`` before the import
                starts and with ``NORMAL`` once it has finished or failed.
        """
        dialog = Toplevel()
        dialog.title("Import status")
        dialog.geometry("600x400")

        current_import_label = ttk.Label(dialog, text='Initiating the import pipeline...')
        current_import_label.pack(side=TOP, padx=15, pady=15)

        progress_bar = ttk.Progressbar(dialog, length=200, value=0, maximum=100)
        progress_bar.pack(side=TOP, padx=15, pady=15)

        chunks_treeview = ttk.Treeview(
            dialog,
            columns=['chunks', 'lang'],
            show='tree headings',
            selectmode='extended',
        )
        chunks_treeview.heading('#0', text='File name')
        chunks_treeview.heading('chunks', text='Collected chunks')
        chunks_treeview.heading('lang', text='Language')

        chunks_treeview.column('#0', width=100)
        chunks_treeview.column('chunks', width=60)
        chunks_treeview.column('lang', width=40)

        chunks_treeview.pack(side=TOP, fill=X, padx=15, pady=15, expand=True)

        def apply_progress(msg, progress, result, failed):
            # Runs on the Tk event loop; skip if the status window was closed.
            if not dialog.winfo_exists():
                return
            current_import_label.config(text=msg)
            progress_bar.config(value=progress)

            if result:
                chunks_treeview.insert(
                    '',
                    index=0,
                    text=result.source,
                    values=(
                        'Failure!' if failed else len(result.chunks),
                        get_language_name(result.lang)
                    )
                )

        def logging_callback(
            msg: str,
            progress: int,
            result: ProcessingResult = None,
            failed: bool = False,
        ):
            # The pipeline calls this from the worker thread; hand the widget
            # update to the Tk event loop, the same way _deduplication_callback does.
            self._parent.after(0, apply_progress, msg, progress, result, failed)

        config.dbapp['logging_callback'] = logging_callback

        # Read all widget state here, on the Tk thread, before the worker starts.
        filepaths = list(self._import_paths.values())
        raw_urls = self.url_text.get('1.0', END)
        # Drop blank lines so an empty box yields [] rather than [''].
        urls = [line.strip() for line in raw_urls.split('\n') if line.strip()]
        reset_collections = self.reset_cd_var.get()

        button_state_callback(DISABLED)

        def finish(succeeded):
            if succeeded and dialog.winfo_exists():
                dialog.bell()
            button_state_callback(NORMAL)

        def import_task():
            succeeded = False
            try:
                ImportPipeline(
                    logging_callback=logging_callback,
                    deduplication_callback=self._deduplication_callback,
                ).import_all(
                    paths=filepaths,
                    urls=urls,
                    reset_collections=reset_collections
                )
                succeeded = True
            finally:
                # Re-enable the buttons on the Tk thread even when the import raised.
                self._parent.after(0, finish, succeeded)

        import_thread = threading.Thread(target=import_task)
        import_thread.start()
src/apps/dbapp/mainframe.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from tkinter import *
2
+ from tkinter import ttk
3
+ from src.apps.dbapp.framebase import CustomFrameBase
4
+ from src.database.weavservice import WeaviateService
5
+
6
class MainFrame(CustomFrameBase):
    """'Main' tab; adds nothing of its own and uses the base frame as-is."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent=parent, service=service)
src/apps/dbapp/query.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tkinter import *
2
+ from tkinter import ttk
3
+ from src.apps.dbapp.framebase import CustomFrameBase
4
+ from src.database.weavservice import WeaviateService
5
+
6
class QueryFrame(CustomFrameBase):
    """Frame with a query entry, a language switch and a read-only results pane."""

    def __init__(self, parent, service: WeaviateService) -> None:
        super().__init__(parent, service)

    def init(self) -> ttk.Frame:
        """Create and pack every widget under ``self._parent`` and return the root frame."""
        root = ttk.Frame(self._parent)
        root.pack(fill=BOTH, expand=True)

        # Top row: filters button, language radios, query entry, send button.
        toolbar = ttk.Frame(root)
        toolbar.pack(fill=X, padx=10, pady=(5, 10))

        self.language_var = StringVar(value="de")

        self.filters_button = ttk.Button(toolbar, text="Filters...", command=self.open_filters)
        self.filters_button.pack(side=LEFT, padx=(0, 10))

        language_box = ttk.Frame(toolbar)
        language_box.pack(side=LEFT, padx=(0, 15))

        # (caption, value stored in language_var, pack options) in display order.
        radio_specs = (
            ("EN", "en", {"side": LEFT, "padx": (0, 8)}),
            ("DE", "de", {"side": LEFT}),
        )
        for caption, code, pack_options in radio_specs:
            radio = ttk.Radiobutton(
                language_box,
                text=caption,
                variable=self.language_var,
                value=code,
            )
            radio.pack(**pack_options)

        self.query_entry = ttk.Entry(toolbar)
        self.query_entry.pack(side=LEFT, fill=X, expand=True, padx=(0, 10))

        self.send_button = ttk.Button(toolbar, text="Send", command=self.send_query)
        self.send_button.pack(side=RIGHT)

        # Pressing Return in the entry behaves like clicking "Send".
        self.query_entry.bind("<Return>", lambda _: self.send_query())

        # Bottom area: scrollable text widget holding the results.
        results_area = ttk.Frame(root)
        results_area.pack(fill=BOTH, expand=True, padx=10, pady=(10, 5))

        self.results_text = Text(results_area, wrap=WORD, font=("TkDefaultFont", 10))
        scrollbar = ttk.Scrollbar(results_area, orient=VERTICAL, command=self.results_text.yview)
        self.results_text.configure(yscrollcommand=scrollbar.set)

        self.results_text.pack(side=LEFT, fill=BOTH, expand=True)
        scrollbar.pack(side=RIGHT, fill=Y)

        # The widget is kept DISABLED; it is enabled only while text is written.
        self.results_text.config(state=NORMAL)
        self.results_text.insert(END, "Enter your query below and click Send (or press Enter) to see results.\n")
        self.results_text.config(state=DISABLED)

        return root

    def send_query(self):
        """Run the entered query against the service and display the formatted hits.

        Blank input is ignored. Any exception raised while querying or
        formatting is shown in the results pane instead of propagating.
        """
        query_text = self.query_entry.get().strip()
        if not query_text:
            return

        self.query_entry.delete(0, END)

        try:
            response, _ = self._service.query(
                lang=self.language_var.get(),
                query=query_text,
            )
            sections = []
            for idx, obj in enumerate(response.objects, start=1):
                sections.append(f"""
---------------------- Result {idx} ----------------------
SOURCE: {obj.properties['source']}
INSERTION DATE: {obj.properties['date']}
RELEVANT PROGRAMS: {', '.join(obj.properties['programs'])}

CONTENT:
{obj.properties['body']}

VECTOR:
{obj.vector}
""")
            joined_sections = ''.join(sections)
            result_str = f"Query: {query_text}\n{joined_sections}"

            self.display_result(result_str)
        except Exception as e:
            self.display_result(f"Error:\n{str(e)}")

    def display_result(self, result_text: str):
        """Replace the contents of the results pane with ``result_text`` and scroll to the top."""
        pane = self.results_text
        pane.config(state=NORMAL)
        pane.delete(1.0, END)
        pane.insert(END, result_text + "\n")
        pane.config(state=DISABLED)
        pane.see(1.0)

    def open_filters(self):
        """Open the (currently empty) "Query Filters" dialog with an input grab."""
        dialog = Toplevel(self._parent)
        dialog.title("Query Filters")
        dialog.geometry("400x300")
        dialog.grab_set()
src/apps/dbapp/utilclasses.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from datetime import datetime
3
+ from src.config import config
4
+
5
class BackupData:
    """Summary of one backup directory, read from its JSON files on construction."""

    def __init__(self, backup_id: str) -> None:
        """Load ``data.json`` and ``objects.json`` from the backup folder when present.

        The folder is ``config.weaviate.BACKUP_PATH/<backup_id>``; listing it
        raises if the directory does not exist.
        """
        self._backup_id = backup_id
        self._creation_date = ""
        self._collections = []

        backup_dir = os.path.join(config.weaviate.BACKUP_PATH, backup_id)
        present = os.listdir(backup_dir)

        if 'data.json' in present:
            with open(os.path.join(backup_dir, 'data.json')) as f:
                meta = json.load(f)

            # Stored as an ISO timestamp; re-rendered as dd.mm.yyyy hh:mm:ss.
            created = datetime.fromisoformat(meta['creation_date'])
            self._creation_date = created.strftime("%d.%m.%Y %H:%M:%S")

        if 'objects.json' in present:
            with open(os.path.join(backup_dir, 'objects.json')) as f:
                per_collection = json.load(f)
                for name, objs in per_collection.items():
                    entry = {
                        'name': name.lower(),
                        'size': ('', len(objs)),
                    }
                    self._collections.append(entry)

    def to_treeformat(self):
        """Return the backup as a dict with ``id``, ``date`` and ``collections`` keys."""
        short_id = self._backup_id.replace('backup_', '')
        return {
            'id': short_id,
            'date': (self._creation_date, ''),
            'collections': self._collections,
        }
src/cache/__init__.py ADDED
File without changes
src/cache/cache.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from threading import Lock
2
+ from src.cache.cache_metrics import CacheMetrics
3
+ from src.cache.cache_strategies import RedisCache, LocalCache
4
+
5
+ from src.utils.logging import get_logger
6
+ from src.config import config
7
+
8
+ logger = get_logger("cache ")
9
+
10
class Cache:
    """Lazily built, process-wide cache backend selected by ``configure``."""

    _instance = None
    _settings = None
    _lock = Lock()
    _cache_metrics = None

    @staticmethod
    def configure(mode: str, cache: bool):
        """Store the backend mode and the enabled flag read by ``get_cache``."""
        logger.info(f"Cache configured with parameters: mode={mode}, cache={cache}")
        config.cache.ENABLED = cache
        Cache._settings = dict(mode=mode, enabled=cache)

    @staticmethod
    def get_cache():
        """Return the shared cache backend, creating it on first use.

        Returns ``None`` when caching is disabled. Modes ``'cloud'`` and
        ``'local'`` build a ``RedisCache``; ``'dict'``, an unknown mode, or a
        Redis backend without a client yield a ``LocalCache``.
        """
        if Cache._instance is not None:
            return Cache._instance

        with Cache._lock:
            # Another thread may have built the instance while we waited.
            if Cache._instance is not None:
                return Cache._instance

            settings = Cache._settings or {"mode": 'local', "enabled": True}

            if not settings.get("enabled", True):
                Cache._instance = None
                return None

            if Cache._cache_metrics is None:
                Cache._cache_metrics = CacheMetrics()

            mode = settings.get("mode", 'local')

            if mode == 'cloud' or mode == 'local':
                if mode == 'cloud':
                    host = config.cache.CLOUD_HOST
                    port = config.cache.CLOUD_PORT
                    password = config.cache.CLOUD_PASS
                else:
                    host = config.cache.LOCAL_HOST
                    port = config.cache.LOCAL_PORT
                    password = config.cache.LOCAL_PASS

                redis_backend = RedisCache(
                    host=host,
                    port=port,
                    password=password,
                    mode=mode,
                    metrics=Cache._cache_metrics,
                )
                if redis_backend.client is None:
                    logger.error("FALLBACK to dict cache. Redis connection failed")
                    Cache._instance = LocalCache(metrics=Cache._cache_metrics)
                else:
                    Cache._instance = redis_backend
                return Cache._instance

            if mode != 'dict':
                logger.error("FALLBACK to dict cache. Unknown cache mode")

            Cache._instance = LocalCache(metrics=Cache._cache_metrics)
            return Cache._instance
src/cache/cache_base.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any
3
+
4
class CacheStrategy(ABC):
    """Interface that every cache backend (local or Redis) has to implement."""

    @abstractmethod
    def set(self, key: str, value: Any, language: str, session_id: str):
        """Store ``value`` under ``key`` for the given language and session."""

    @abstractmethod
    def get(self, key: str, language: str, session_id: str):
        """Look up the entry stored for ``key``, language and session."""

    @abstractmethod
    def clear_cache(self):
        """Remove the cached entries held by this backend."""
src/cache/cache_metrics.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from threading import Lock
3
+
4
+
5
@dataclass
class CacheStatistics:
    """Hit and miss counters together with the derived hit ratio."""

    hits: int
    misses: int
    hits_ratio: float


class CacheMetrics:
    """Counts cache hits and misses; each increment runs under a lock."""

    def __init__(self) -> None:
        self.cache_stats = CacheStatistics(hits=0, misses=0, hits_ratio=0.0)
        self._lock = Lock()

    def increment_hit(self):
        """Record one hit and refresh the ratio."""
        with self._lock:
            stats = self.cache_stats
            stats.hits = stats.hits + 1
            self._calc_hit_ratio()

    def increment_miss(self):
        """Record one miss and refresh the ratio."""
        with self._lock:
            stats = self.cache_stats
            stats.misses = stats.misses + 1
            self._calc_hit_ratio()

    def _calc_hit_ratio(self):
        """Set ``hits_ratio`` to hits / (hits + misses), or 0.0 before any lookup."""
        stats = self.cache_stats
        lookups = stats.hits + stats.misses
        if lookups:
            stats.hits_ratio = stats.hits / lookups
        else:
            stats.hits_ratio = 0.0
src/cache/cache_strategies.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Any
3
+ from cachetools import TTLCache
4
+
5
+ from .utils import get_cache_key
6
+ from src.cache.cache_base import CacheStrategy
7
+ from src.database.redisservice import RedisService
8
+ from src.utils.logging import get_logger
9
+ from src.config import config
10
+
11
+ logger = get_logger('cache_strat')
12
+
13
class RedisCache(CacheStrategy):
    """Cache backend that stores JSON-encoded values through a Redis client."""

    def __init__(self, host, port, password, mode, metrics):
        # ``client`` may be falsy; every method below then becomes a no-op.
        self.client = RedisService(host, port, password, mode).get_client()
        self.metrics = metrics

    def set(self, key: str, value: Any, language: str, session_id: str):
        """JSON-encode ``value`` and store it with the configured TTL; errors are logged."""
        if not self.client:
            return

        try:
            payload = json.dumps(value)
            cache_key = get_cache_key(key, language, session_id)
            self.client.set(cache_key, payload, ex=config.cache.TTL_CACHE)
            logger.info(f"Cached response with key {cache_key[:20]}... to Redis")
        except Exception as e:
            logger.error(f"Could not write to Redis: {e}")

    def get(self, key: str, language: str, session_id: str):
        """Return the decoded cached value, or ``None`` on a miss, an error or no client."""
        if not self.client:
            return None

        try:
            cache_key = get_cache_key(key, language, session_id)
            raw = self.client.get(cache_key)
            if raw is None:
                self.metrics.increment_miss()
                logger.debug(f"Cache statistics: Missed cache {self.metrics.cache_stats.misses} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
                return None

            self.metrics.increment_hit()
            logger.info(f"Found cached data with key {cache_key}")
            logger.debug(f"Cache statistics: Hit cache {self.metrics.cache_stats.hits} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
            return json.loads(raw)
        except Exception as e:
            logger.error(f"Could not read from Redis: {e}")
            return None

    def clear_cache(self):
        """Flush the Redis database behind the client; errors are logged."""
        if not self.client:
            return

        try:
            self.client.flushdb()
            logger.info("Redis Cache cleared.")
        except Exception as e:
            logger.error(f"Could not clear Redis cache: {e}")
60
+
61
+
62
class LocalCache(CacheStrategy):
    """In-process cache backed by a ``TTLCache`` sized from the cache config."""

    def __init__(self, metrics):
        self.cache = TTLCache(
            maxsize=config.cache.MAX_SIZE_CACHE,
            ttl=config.cache.TTL_CACHE,
        )
        self.metrics = metrics

    def set(self, key: str, value: Any, language: str, session_id: str):
        """Store ``value`` under the normalized cache key."""
        self.cache[get_cache_key(key, language, session_id)] = value
        logger.info("Response cached")

    def get(self, key: str, language: str, session_id: str):
        """Return the cached value or ``None``; a stored ``None`` counts as a miss."""
        cached = self.cache.get(get_cache_key(key, language, session_id), None)
        if cached is None:
            self.metrics.increment_miss()
            logger.debug(f"Cache statistics: Missed cache {self.metrics.cache_stats.misses} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
        else:
            self.metrics.increment_hit()
            logger.debug(f"Cache statistics: Hit cache {self.metrics.cache_stats.hits} times, ratio[{self.metrics.cache_stats.hits_ratio}]")
        return cached

    def clear_cache(self):
        """Drop every entry from the in-process cache."""
        self.cache.clear()
        logger.info("Local Cache cleared.")
src/cache/utils.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import re
2
+
3
def get_cache_key(key: str, language: str, session_id: str) -> str:
    """Build the cache key for a query within a session and language.

    The query is NFC-normalised, lower-cased and stripped of everything that
    is not a letter or digit, so punctuation, spacing and case do not create
    separate cache entries.

    Unlike an ASCII-only filter, non-ASCII letters are kept: German queries
    such as "schön" and "schon" must not map to the same key. Keys for pure
    ASCII input are unchanged.

    Args:
        key: The raw query text.
        language: Language code placed in the key.
        session_id: Session identifier placed in the key.

    Returns:
        A string of the form ``cache:<session_id>:<language>:<normalized query>``.
    """
    import unicodedata  # local import so this function does not depend on other edits

    # NFC first, so a decomposed "o" + combining diaeresis equals a precomposed "ö".
    folded = unicodedata.normalize("NFC", key).lower()
    # \W is Unicode-aware for str patterns; "_" is removed explicitly because \w keeps it.
    normalized_key = re.sub(r'[\W_]', '', folded)
    return f"cache:{session_id}:{language}:{normalized_key}"
src/config/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.config.configs import *
2
+ from functools import lru_cache
3
+ from typing import Any
4
+ import config as c
5
+
6
class AppConfig:
    """Aggregates the sub-configurations and exposes raw ``config.py`` values."""

    # ===================== INITIALIZE YOUR SUBCONFIGS HERE =====================

    convstate:  ConversationStateConfig = ConversationStateConfig()
    processing: ProcessingConfig = ProcessingConfig()
    weaviate:   WeaviateConfig = WeaviateConfig()
    scraping:   ScrapingConfig = ScrapingConfig()
    chain:      ChainConfig = ChainConfig()
    cache:      CacheConfig = CacheConfig()
    paths:      PathsConfig = PathsConfig()
    dbapp:      DatabaseAppConfig = DatabaseAppConfig()
    llm:        LLMProviderConfig = LLMProviderConfig()

    # ===========================================================================

    def get(self, key: str, default: Any = None) -> Any:
        """
        Retrieves an extra parameter from config.py by name.

        Returns ``default`` when the parameter is missing and ``default`` is
        not ``None``.

        Raises:
            AttributeError if not found and no default provided.
        """
        try:
            value = getattr(c, key)
        except AttributeError:
            if default is None:
                raise AttributeError(f"Config parameter '{key}' is not defined!")
            return default
        return value
34
+
35
@lru_cache(maxsize=1)
def get_config() -> AppConfig:
    """Return the application config; the cached instance is reused on later calls."""
    app_config = AppConfig()
    return app_config
38
+
39
+ config = get_config()
src/config/configs.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+ from dotenv import load_dotenv
3
+
4
+ import config, os
5
+
6
+ load_dotenv()
7
+
8
def _get(param: str, default=None, type_=None):
    """Resolve a configuration value from ``config.py`` or the environment.

    The value is read from the ``config`` module (falling back to ``default``
    when the attribute is missing). Only if that result is ``None`` is the
    environment variable of the same name consulted.

    NOTE(review): because of this order, the environment is never consulted
    when a non-None ``default`` is supplied and ``config`` lacks the
    attribute. Confirm this precedence is intended.

    Args:
        param: Attribute / environment variable name.
        default: Value used when nothing else is found.
        type_: Optional callable used to cast the resolved value.

    Returns:
        The resolved (and optionally cast) value, or ``default``.

    Raises:
        ValueError: If the value cannot be cast to ``type_``.
    """
    value = getattr(config, param, default)

    if value is None:
        value = os.getenv(param)

    if value is None:
        return default

    if not type_:
        return value

    # bool("false") is True, so textual flags must be parsed explicitly.
    if type_ is bool and isinstance(value, str):
        flag = value.strip().lower()
        if flag in ("1", "true", "yes", "on"):
            return True
        if flag in ("0", "false", "no", "off", ""):
            return False
        raise ValueError(f"Failed to cast '{param}' value '{value}' to {type_.__name__}")

    try:
        return type_(value)
    except (ValueError, TypeError) as err:
        raise ValueError(f"Failed to cast '{param}' value '{value}' to {type_.__name__}") from err
23
+
24
+
25
class ConfigBase:
    """Base class for config sections with a dict-like bag of runtime parameters.

    ``section[key]`` and ``section[key] = value`` read and write the
    class-level ``PARAMS`` dict, so all instances of one section class share
    the same values. Each subclass gets its own ``PARAMS``; previously a
    single dict on ``ConfigBase`` was shared by every section, so a value
    written through one section was visible through all others.
    """

    PARAMS: dict = dict()

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Keep a PARAMS the subclass declared itself; otherwise create a fresh one.
        if 'PARAMS' not in cls.__dict__:
            cls.PARAMS = dict()

    @classmethod
    def __getitem__(cls, key):
        """Return the stored value for ``key`` or ``None`` when it is absent."""
        return cls.PARAMS.get(key, None)

    @classmethod
    def __setitem__(cls, key, value):
        """Store ``value`` under ``key`` for this section class."""
        cls.PARAMS[key] = value
35
+
36
+
37
class DatabaseAppConfig(ConfigBase):
    """Section without static settings; it only offers the item access inherited from ConfigBase."""
39
+
40
+
41
class PathsConfig(ConfigBase):
    """Filesystem locations; every ``*_OUTPUT`` path is a sub-folder of ``DATA``."""

    DATA: str = _get('DATA_PATH')
    LOGS: str = _get('LOGS_PATH')

    # Sub-folders are derived from the DATA value resolved above.
    URLS_OUTPUT:           str = os.path.join(DATA, 'urls')
    CHUNKS_OUTPUT:         str = os.path.join(DATA, 'chunks')
    TEMP_CHUNKS_OUTPUT:    str = os.path.join(DATA, 'temp_chunks')
    SCRAPING_OUTPUT:       str = os.path.join(DATA, 'scraping')
    RAW_TEXT_OUTPUT:       str = os.path.join(DATA, 'raw_text')
    RAW_HTML_OUTPUT:       str = os.path.join(DATA, 'raw_html')
    METADATA_OUTPUT:       str = os.path.join(DATA, 'metadata')
    EXTRACTED_TEXT_OUTPUT: str = os.path.join(DATA, 'extracted_text')
52
+
53
+
54
class ScrapingConfig(ConfigBase):
    """Scraper settings; each value falls back to the default given here."""

    # NOTE(review): the doubled "SCRAPING_SCRAPING_" prefix looks like a typo;
    # verify the parameter name against config.py.
    TIMEOUT:      int = _get('SCRAPING_SCRAPING_TIMEOUT', 30)
    MAX_RETRIES:  int = _get('SCRAPING_MAX_RETRIES', 3)
    CRAWL_DELAY:  int = _get('SCRAPING_CRAWL_DELAY', 1)
    BACKOFF_RATE: int = _get('SCRAPING_BACKOFF_RATE', 2)
    # NOTE(review): annotated as int but defaults to None; confirm the intended type.
    TARGET_URLS:  int = _get('SCRAPING_TARGET_URLS', None)
    INTERVALS:    dict = _get('SCRAPING_PRIO_INTERVAL', dict())
61
+
62
+
63
class ConversationStateConfig(ConfigBase):
    """Conversation-state settings; all three have no default and may be ``None``."""

    TRACK_USER_PROFILE             = _get('TRACK_USER_PROFILE')
    LOCK_LANGUAGE_AFTER_N_MESSAGES = _get('LOCK_LANGUAGE_AFTER_N_MESSAGES')
    MAX_CONVERSATION_TURNS         = _get('MAX_CONVERSATION_TURNS')
67
+
68
+
69
class ProcessingConfig(ConfigBase):
    """Document-processing settings; none has a default, so each may be ``None``."""

    LANG_AMBIGUITY_THRESHOLD: float = _get('LANG_AMBIGUITY_THRESHOLD')
    # Was annotated as float; presumably a model name, verify against config.py.
    EMBEDDING_MODEL: str = _get('EMBEDDING_MODEL')
    MAX_TOKENS: int = _get('MAX_TOKENS')
    CHUNK_OVERLAP: int = _get('CHUNK_OVERLAP')
74
+
75
+
76
class ChainConfig(ConfigBase):
    """Settings for the response chain."""

    # Feature switches (both on by default).
    ENABLE_RESPONSE_CHUNKING:  bool = _get('ENABLE_RESPONSE_CHUNKING', True)
    EVALUATE_RESPONSE_QUALITY: bool = _get('ENABLE_EVALUATE_RESPONSE_QUALITY', True)
    CONFIDENCE_THRESHOLD:      float = _get('CONFIDENCE_THRESHOLD')

    # Numeric limits with their defaults.
    TOP_K_RETRIEVAL:             int = _get('TOP_K_RETRIEVAL', 4)
    MAX_RETRIES:                 int = _get('MODEL_MAX_RETRIES', 3)
    MAX_RESPONSE_WORDS_LEAD:     int = _get('MAX_RESPONSE_WORDS_LEAD', 100)
    MAX_RESPONSE_WORDS_SUBAGENT: int = _get('MAX_RESPONSE_WORDS_SUBAGENT', 200)
85
+
86
+
87
class CacheConfig(ConfigBase):
    """Cache settings: enable flag, mode, Redis endpoints, TTL and size limit."""

    ENABLED:    bool = _get('CACHE_ENABLED', False)
    CACHE_MODE: Literal['local', 'cloud', 'dict'] = _get('CACHE_MODE')

    # Local Redis endpoint.
    LOCAL_HOST: str = _get('CACHE_LOCAL_HOST', 'localhost')
    LOCAL_PORT: int = _get('CACHE_LOCAL_PORT', 6379)
    LOCAL_PASS: str = _get('CACHE_LOCAL_PASSWORD', '')

    # Cloud Redis endpoint; the port is cast to int when a value is found.
    CLOUD_HOST: str = _get('REDIS_CLOUD_HOST')
    CLOUD_PORT: int = _get('REDIS_CLOUD_PORT', type_=int)
    CLOUD_PASS: str = _get('REDIS_CLOUD_PASSWORD')

    TTL_CACHE:      int = _get('CACHE_TTL', 86400)
    MAX_SIZE_CACHE: int = _get('CACHE_MAX_SIZE', 1000)
101
+
102
+
103
class WeaviateConfig(ConfigBase):
    """Weaviate settings: deployment, backups, file paths, credentials and timeouts."""

    LOCAL_DATABASE:               bool = _get('WEAVIATE_IS_LOCAL')
    WEAVIATE_COLLECTION_BASENAME: str = _get('WEAVIATE_COLLECTION_BASENAME')

    # Backup methods listed here, and the configured one.
    BACKUP_METHODS: list[str] = ['manual', 'filesystem', 's3']
    BACKUP_METHOD:  Literal['manual', 'filesystem', 's3'] = _get('WEAVIATE_BACKUP_METHOD')

    BACKUP_PATH:     str = _get('BACKUPS_PATH')
    PROPERTIES_PATH: str = _get('PROPERTIES_PATH')
    STRATEGIES_PATH: str = _get('STRATEGIES_PATH')

    CLUSTER_URL:          str = _get('WEAVIATE_CLUSTER_URL')
    WEAVIATE_API_KEY:     str = _get('WEAVIATE_API_KEY')
    HUGGING_FACE_API_KEY: str = _get('HUGGING_FACE_API_KEY')

    # Timeouts with their defaults.
    INIT_TIMEOUT:   int = _get('WEAVIATE_INIT_TIMEOUT', 90)
    QUERY_TIMEOUT:  int = _get('WEAVIATE_QUERY_TIMEOUT', 60)
    INSERT_TIMEOUT: int = _get('WEAVIATE_INSERT_TIMEOUT', 600)
121
+
122
+
123
+ #TODO: Clean this configuration (outdated)
124
+ class LLMProvider:
125
+ def __init__(self, base: str, sub: str | None = None) -> None:
126
+ self.base = base
127
+ self.sub = sub
128
+ self.name = f"{base}:{sub}" if sub else base
129
+
130
+
131
+ def with_sub(self, sub: str | None = None) -> str:
132
+ return LLMProvider(self.base, sub)
133
+
134
+
135
+ class LLMProviderConfig:
136
+ AVAIABLE_PROVIDERS: list[str] = [
137
+ 'groq',
138
+ 'ollama',
139
+ 'openai',
140
+ 'open_router',
141
+ ]
142
+ AVAILABLE_SUBPROVIDERS: dict = {
143
+ 'groq': [],
144
+ 'open_router': [
145
+ 'openai',
146
+ 'deepseek',
147
+ 'meituan'
148
+ 'alibaba' # For tongyi models
149
+ 'nvidia',
150
+ ],
151
+ }
152
+
153
+ LLM_PROVIDER: LLMProvider = LLMProvider('openai')
154
+
155
+ # -------------------- Some predefined models for available providers ----------------------
156
+
157
+ # Groq settings
158
+ GROQ_API_KEY: str = os.getenv("GROQ_API_KEY")
159
+ GROQ_MODEL: str = "mixtral-8x7b-32768"
160
+
161
+ # Open Router settings
162
+ OPEN_ROUTER_API_KEY: str = os.getenv("OPEN_ROUTER_API_KEY")
163
+ OPEN_ROUTER_MODEL: str = "meituan/longcat-flash-chat:free"
164
+ OPEN_ROUTER_BASE_URL: str = "https://openrouter.ai/api/v1"
165
+
166
+ # OpenAI settings
167
+ OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY")
168
+ OPENAI_MODEL: str = "gpt-5.1"
169
+
170
+ # The gpt-oss:20b model is preferable but takes much more space
171
+ # Set to False if you only have the llama3.2 installed
172
+ GPT_OSS_ENABLED: bool = False
173
+ # Local/Ollama settings
174
+ OLLAMA_BASE_URL: str = "http://localhost:11434"
175
+ OLLAMA_MODEL: str = "gpt-oss:20b" if GPT_OSS_ENABLED else "llama3.2"
176
+
177
+ # ----------------------------------------------------------------------------------------
178
+
179
+ @classmethod
180
+ def get_fallback_models(cls, provider: LLMProvider | None = None) -> list[str]:
181
+ provider = provider or cls.LLM_PROVIDER
182
+ match provider.base:
183
+ case 'openai':
184
+ return {
185
+ provider: fallback_model
186
+ for fallback_model in [
187
+ 'gpt-5-mini',
188
+ 'gpt-5-nano',
189
+ ]
190
+ }
191
+ case 'open_router':
192
+ return {
193
+ provider.with_sub('openai'): "gpt-oss-20b",
194
+ provider.with_sub('openai'): "gpt-oss-120b",
195
+ provider.with_sub('alibaba'): "alibaba/tongyi-deepresearch-30b-a3b:free",
196
+ provider: "openrouter/polaris-alpha",
197
+ # Currently unusable because has no tool support
198
+ #provider.with_sub('deepseek'): "deepseek/deepseek-chat-v3.1:free",
199
+ }
200
+ case _:
201
+ return {}
202
+
203
+ @classmethod
204
+ def get_reasoning_support(cls, provider: LLMProvider | None = None) -> bool:
205
+ provider = provider or cls.LLM_PROVIDER
206
+ return {
207
+ "groq": True,
208
+ "openai": True,
209
+ "open_router": True,
210
+ }.get(provider.base, False)
211
+
212
+
213
+ @classmethod
214
+ def get_default_model(cls, provider: LLMProvider | None = None) -> str:
215
+ provider = provider or cls.LLM_PROVIDER
216
+ return {
217
+ "groq": cls.GROQ_MODEL,
218
+ "openai": cls.OPENAI_MODEL,
219
+ "ollama": cls.OLLAMA_MODEL,
220
+ "open_router": cls.OPEN_ROUTER_MODEL,
221
+ }.get(provider.base)
222
+
223
+
224
+ @classmethod
225
+ def get_api_key(cls, provider: LLMProvider | None = None) -> str:
226
+ provider = provider or cls.LLM_PROVIDER
227
+ return {
228
+ "groq": cls.GROQ_API_KEY,
229
+ "openai": cls.OPENAI_API_KEY,
230
+ "open_router": cls.OPEN_ROUTER_API_KEY,
231
+ }.get(provider.base)
232
+
233
+
234
class NotificationCenterConfig(ConfigBase):
    """Settings for e-mail (SMTP) and Slack alerts."""

    ENABLE_EMAIL_ALERTS: bool = _get('NOTIFY_ENABLE_EMAIL_ALERTS', True, bool)

    SMTP_HOST: str = _get("NOTIFY_SMTP_HOST")
    SMTP_PORT: int = _get("NOTIFY_SMTP_PORT", 587, type_=int)

    SMTP_USER: str = _get("NOTIFY_SMTP_USER")
    SMTP_PASSWORD: str = _get("NOTIFY_SMTP_PASSWORD")

    # The value may be a real bool from config.py or a string; str() makes
    # both work, where calling .lower() on a bool raised AttributeError.
    SMTP_USE_TLS: bool = str(_get("NOTIFY_SMTP_USE_TLS", "True")).strip().lower() in ("1", "true", "yes", "on")

    FROM_EMAIL: str = _get("NOTIFY_FROM_EMAIL")
    TO_EMAIL: str = _get("NOTIFY_TO_EMAIL")

    ENABLE_SLACK_ALERTS: bool = _get('NOTIFY_ENABLE_SLACK_ALERTS', False, bool)
    SLACK_WEBHOOK_URL: str = _get("NOTIFY_SLACK_WEBHOOK_URL")
src/const/agent_response_constants.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ Constants for Gradio app """
2
+
3
+ GREETING_MESSAGES = {
4
+ "en": [
5
+ "Hello and welcome. I am your Executive Education Advisor for the HSG Executive MBA programmes (**IEMBA**, **emba X**, and **EMBA**). How may I support your MBA planning today?",
6
+ "Hello and welcome. I am your Executive Education Advisor for the University of St.Gallen Executive MBA programmes (**IEMBA**, **emba X**, and **EMBA**). How may I assist you with your programme search?",
7
+ "Hello and welcome. I am here to help you explore the University of St.Gallen Executive MBA programmes (**EMBA**, **IEMBA**, and **emba X**). What would you like to discuss today?",
8
+ "Hello and welcome. I am your Executive Education Advisor for the University of St.Gallen’s Executive MBA programmes, and I am here to help you assess fit across **EMBA**, **IEMBA**, and **emba X**.",
9
+ "Hello and welcome. I am here to support you with questions about the University of St.Gallen Executive MBA programmes and to help you evaluate the **EMBA**, **IEMBA**, and **emba X** options.",
10
+ ],
11
+ "de": [
12
+ "Guten Tag. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme und unterstütze Sie gerne bei Fragen zu **EMBA**, **IEMBA** und **emba X**.",
13
+ "Guten Tag. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme (**EMBA**, **IEMBA**, **emba X**). Ich unterstütze Sie bei Programmwahl, Ablauf und Zulassungsfragen.",
14
+ "Guten Tag und herzlich willkommen. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme und unterstütze Sie gerne bei Fragen zu **EMBA**, **IEMBA** und **emba X**.",
15
+ "Guten Tag. Ich bin Ihr Executive-Education-Berater für die HSG Executive MBA Programme (**EMBA**, **IEMBA**, **emba X**) und unterstütze Sie gerne bei der Einschätzung der passenden Option.",
16
+ "Guten Tag. Ich unterstütze Sie gerne bei Fragen zu den HSG Executive MBA Programmen und helfe Ihnen, die Optionen **EMBA**, **IEMBA** und **emba X** einzuordnen.",
17
+ ]
18
+ }
19
+
20
+ QUERY_EXCEPTION_MESSAGE = {
21
+ "en": "I'm sorry, I cannot provide a helpful response right now. Please contact tech support or try again later.",
22
+ "de": "Es tut mir leid, ich kann im Moment keine hilfreiche Antwort geben. Bitte wenden Sie sich an den technischen Support oder versuchen Sie es später erneut.",
23
+ }
24
+
25
+ NOT_VALID_QUERY_MESSAGE = {
26
+ "en": "I didn't quite understand that. Could you please rephrase your question?",
27
+ "de": "Das habe ich nicht ganz verstanden. Könnten Sie Ihre Frage bitte anders formulieren?",
28
+ }
29
+
30
+ CONFIDENCE_FALLBACK_MESSAGE = {
31
+ "en": (
32
+ "I am sorry, but I could not find sufficiently reliable information in my records to answer that question with confidence. "
33
+ "Could you please rephrase your question?\n\n"
34
+ "If you would like a personal consultation, I can also help you with appointment booking."
35
+ ),
36
+ "de": (
37
+ "Es tut mir leid, aber ich konnte in meinen Unterlagen keine Informationen finden, "
38
+ "die zu Ihrer Anfrage passen, sodass ich sie nicht mit ausreichender Sicherheit beantworten kann. "
39
+ "Könnten Sie Ihre Frage bitte umformulieren?\n\n"
40
+ "Wenn Sie ein persönliches Beratungsgespräch wünschen, kann ich Ihnen auch bei der Terminbuchung helfen."
41
+ ),
42
+ }
43
+
44
+ LANGUAGE_FALLBACK_MESSAGE = {
45
+ "en": (
46
+ "I am sorry, I can only reply in English or German. "
47
+ "Would you like to continue our conversation in English?"
48
+ ),
49
+ "de": (
50
+ "Es tut mir leid, ich kann nur auf Englisch oder Deutsch antworten. "
51
+ "Möchten Sie unser Gespräch auf Deutsch fortführen?"
52
+ ),
53
+ }
54
+
55
+ CONVERSATION_END_MESSAGE = {
56
+ "en": (
57
+ "This conversation has reached its maximum length. "
58
+ "To make sure you receive the best possible support, "
59
+ "please continue with a personal consultation.\n\n"
60
+ "If you would like to see appointment options with an admissions advisor, please ask me to show them. "
61
+ "Thank you for your understanding."
62
+ ),
63
+ "de": (
64
+ "Dieses Gespräch hat die maximale Länge erreicht. "
65
+ "Damit Sie bestmöglich unterstützt werden, bitten wir Sie, "
66
+ "das Anliegen in einem persönlichen Beratungsgespräch fortzusetzen.\n\n"
67
+ "Wenn Sie Terminoptionen mit der Studienberatung sehen möchten, sagen Sie mir bitte kurz Bescheid. "
68
+ "Vielen Dank für Ihr Verständnis."
69
+ ),
70
+ }
71
+
72
+ ADMISSIONS_TEAM_CONTACT = {
73
+ "en": {
74
+ "email": "emba@unisg.ch",
75
+ "phone": "+41 71 224 27 02",
76
+ },
77
+ "de": {
78
+ "email": "emba@unisg.ch",
79
+ "phone": "+41 71 224 27 02",
80
+ },
81
+ }
82
+
83
+ ADVISOR_CONTACTS = [
84
+ {
85
+ "name": "Cyra von Müller (EMBA)",
86
+ "program": "emba",
87
+ "email": "cyra.vonmueller@unisg.ch",
88
+ "phone": "+41 71 224 27 12",
89
+ "url": "https://calendly.com/cyra-vonmueller/beratungsgespraech-emba-hsg",
90
+ },
91
+ {
92
+ "name": "Kristin Fuchs (IEMBA)",
93
+ "program": "iemba",
94
+ "email": "kristin.fuchs@unisg.ch",
95
+ "phone": "+41 71 224 75 46",
96
+ "url": "https://calendly.com/kristin-fuchs-unisg/iemba-online-personal-consultation",
97
+ },
98
+ {
99
+ "name": "Teyuna Giger (emba X)",
100
+ "program": "emba_x",
101
+ "email": "teyuna.giger@unisg.ch",
102
+ "phone": "+41 71 224 77 65",
103
+ "url": "https://calendly.com/teyuna-giger-unisg",
104
+ },
105
+ ]
106
+
107
+
108
def get_admissions_contact_text(language: str = "en") -> str:
    """Return a one-sentence admissions contact line; unknown languages use English."""
    sentences = {
        "en": "You can reach the Executive MBA admissions team at {email} or {phone}.",
        "de": "Sie erreichen das Executive-MBA-Zulassungsteam unter {email} oder {phone}.",
    }
    contact = ADMISSIONS_TEAM_CONTACT.get(language, ADMISSIONS_TEAM_CONTACT["en"])
    try:
        sentence = sentences[language]
    except KeyError:
        sentence = sentences["en"]
    return sentence.format(email=contact["email"], phone=contact["phone"])
116
+
117
+
118
def get_booking_widget(language: str="en", programs: list[str]=None):
    """
    Returns an HTML string representing a Booking Widget.

    One collapsible section is emitted for every advisor whose program is in
    ``programs``; ``None`` or an empty list selects all three programs.
    Unknown languages use the English labels.
    """

    if programs is None or programs == []:
        programs = ["emba", "iemba", "emba_x"]

    labels = {
        "en": {
            "header": "Book a Consultation",
            "sub": "Select an advisor to view available appointment slots and contact details:",
            "email": "Email",
            "phone": "Phone",
        },
        "de": {
            "header": "Termin vereinbaren",
            "sub": "Wählen Sie einen Berater, um verfügbare Termine und Kontaktdaten zu sehen:",
            "email": "E-Mail",
            "phone": "Telefon",
        }
    }
    txt = labels.get(language, labels["en"])

    # Query string appended to every advisor's calendar URL.
    base_params = "?hide_gdpr_banner=1&embed_type=Inline&embed_domain=1"

    # Fragments are collected in order and joined once at the end.
    parts = [f"""
<div style="width: 100%; min-width: 100%; box-sizing: border-box; background-color: #f9fafb; border: 1px solid #e5e7eb; border-radius: 12px; padding: 20px; margin-top: 10px; font-family: sans-serif;">
<h3 style="margin: 0 0 10px 0; color: #111827; font-size: 1.2em;">{txt['header']}</h3>
<p style="margin: 0 0 20px 0; color: #6b7280; font-size: 1em;">{txt['sub']}</p>
"""]

    for advisor in ADVISOR_CONTACTS:
        if advisor["program"] not in programs:
            continue
        parts.append(f"""
<details style="margin-bottom: 12px; border: 1px solid #d1d5db; border-radius: 8px; background: white; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1);">
<summary style="cursor: pointer; padding: 16px 20px; background-color: #ffffff; font-weight: 600; color: #374151; font-size: 1.05em; list-style: none; transition: background 0.2s;">
{advisor['name']}
</summary>
<div style="padding: 16px 20px 0 20px; border-top: 1px solid #e5e7eb;">
<p style="margin: 0 0 6px 0; color: #374151;"><strong>{txt['email']}:</strong> <a href="mailto:{advisor['email']}" style="color: #1d4ed8; text-decoration: none;">{advisor['email']}</a></p>
<p style="margin: 0 0 16px 0; color: #374151;"><strong>{txt['phone']}:</strong> <a href="tel:{advisor['phone'].replace(' ', '')}" style="color: #1d4ed8; text-decoration: none;">{advisor['phone']}</a></p>
</div>
<div style="padding: 0; border-top: 1px solid #e5e7eb;">
<iframe src="{advisor['url']}{base_params}" width="100%" height="650px" frameborder="0" style="display: block;"></iframe>
</div>
</details>
""")

    parts.append("</div>")
    return "".join(parts)
169
+
170
+
171
def get_disclaimer_widget(language: str = "en"):
    """
    Returns an HTML string representing a warning disclaimer.

    Unknown languages fall back to the English text.
    """
    texts = {
        "en": {
            "title": "Disclaimer",
            "body": "Assessments provided by this advisor are non-binding and based on limited information. Please consult our program directors for final admission or credit evaluations."
        },
        "de": {
            "title": "Haftungsausschluss",
            "body": "Die Einschätzungen dieses Beraters sind unverbindlich und basieren auf begrenzten Informationen. Bitte wenden Sie sich für endgültige Zulassungs- oder Anrechnungsfragen an die Programmleitung."
        }
    }
    chosen = texts.get(language, texts["en"])

    # Yellow styling constants
    palette = {
        "bg_color": "#fffbeb",      # Light yellow
        "border_color": "#f59e0b",  # Amber/Yellow border
        "icon_color": "#d97706",    # Darker amber for the icon
        "text_color": "#92400e",    # Dark brown/yellow for readability
    }

    # The markup contains no literal braces, so str.format placeholders are safe.
    template = """
<div style="display: flex; align-items: flex-start; background-color: {bg_color}; border: 1px solid {border_color}; border-radius: 8px; padding: 16px; margin-bottom: 20px; font-family: sans-serif;">
<div style="margin-right: 12px; margin-top: 2px;">
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="{icon_color}" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"/><line x1="12" y1="9" x2="12" y2="13"/><line x1="12" y1="17" x2="12.01" y2="17"/>
</svg>
</div>
<div>
<strong style="display: block; color: {text_color}; margin-bottom: 4px; font-size: 0.95em;">{title}</strong>
<p style="margin: 0; color: {text_color}; font-size: 0.85em; line-height: 1.4;">
{body}
</p>
</div>
</div>
"""
    return template.format(title=chosen["title"], body=chosen["body"], **palette)
src/const/cc_whitelist.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Lower-case terms collected under the name REPETITION_WHITELIST, grouped by kind.
REPETITION_WHITELIST = [
    # English month names
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    # German month names that are spelled differently from the English ones
    'januar', 'februar', 'märz', 'mai', 'juni', 'juli', 'oktober', 'dezember',
    # Programme names and generic terms
    'total', 'iemba', 'emba', 'emba x', 'programme', 'program',
]
src/const/data_consent_constants.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Localized texts for the data-consent dialog. Every constant below is a dict
# keyed by language code ("de" / "en").

# Markdown body of the privacy notice.
PRIVACY_NOTICE = {
    "de": """
### Datenschutzhinweis

Wir verwenden Ihre Angaben, um Sie zu **Executive MBA Programmen der Universität St.Gallen** zu beraten.
Dabei verarbeiten wir insbesondere:

- Ihre Gesprächsinhalte und Anfragen
- Kontaktdaten (Name, E-Mail) bei Terminbuchung
- Informationen zu Ihrer Berufserfahrung und Ausbildung

Ihre Daten werden **ausschließlich für die Studienberatung** verwendet und **nicht an Dritte weitergegeben**.
Sie können Ihre Einwilligung **jederzeit widerrufen**.

[Weitere Informationen zur Datenschutzerklärung](https://www.unisg.ch/en/data-protection-declaration/)
""",

    "en": """
### Privacy Notice

We use your information to advise you on **Executive MBA programmes at the University of St.Gallen**.
We process in particular:

- Your conversation content and inquiries
- Contact details (name, email) for appointment booking
- Information about your professional experience and education

Your data is used **solely for study advisory purposes** and **is not shared with third parties**.
You may **withdraw your consent at any time**.

[More information in the Privacy Policy](https://www.unisg.ch/en/data-protection-declaration/)
"""
}

# Label for accepting the notice.
ACCEPT = {
    "de": "Zustimmen",
    "en": "Accept"
}

# Label for declining the notice.
DECLINE = {
    "de": "Ablehnen",
    "en": "Decline"
}

# Message for a declined consent; each value is one string assembled from two
# adjacent literals.
DECLINE_MESSAGE = {
    "de": "Ohne Ihre Einwilligung können wir Sie leider nicht beraten. "
          "Bitte kontaktieren Sie uns direkt unter emba@unisg.ch.",
    "en": "Without your consent, we cannot provide advice. "
          "Please contact us directly at emba@unisg.ch.",
}

# Confirmation text for a withdrawn consent.
WITHDRAW_CONFIRMATION_MESSAGE = {
    "de": "Ihre Einwilligung wurde widerrufen. Ihre Session-Daten wurden gelöscht. Ohne Einwilligung können wir Sie leider nicht beraten.",
    "en": "Your consent has been withdrawn. Your session data has been deleted. Without consent, we cannot continue advising you."
}

# Label for withdrawing a previously given consent.
WITHDRAW_TEXT = {
    "de": "Einwilligung widerrufen",
    "en": "Withdraw Consent"
}
src/const/page_blacklist.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Lower-case keywords collected under the name PAGE_BLACKLIST, grouped by theme.
PAGE_BLACKLIST = [
    # Cookie banners, privacy pages, pop-ups and downloads
    'cookie',
    'cookies',
    'privacy',
    'datenschutz',
    'popup',
    'download',
    # Combined policy slugs
    'cookie-policy',
    'privacy-policy',
    'cookie-and-privacy-policy',
    # Legal boilerplate
    'data-protection',
    'impressum',
    'legal',
    'terms',
    'agb',
    'imprint',
]
src/const/page_priority.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Keywords grouped into three priority tiers ('high', 'medium', 'low'), each
# holding English and German terms.
#
# Every line ends with a comma on purpose: adjacent string literals are
# concatenated by Python, so a missing comma between the EN and DE sections
# silently fuses two keywords into one (e.g. 'important datesübersicht').
PAGE_PRIORITY_KEYWORDS = {
    'high': [
        # -------------------------------------------- EN --------------------------------------------
        'overview', 'about', 'introduction', 'summary', 'home', 'general information', 'welcome',
        'admissions', 'admission', 'apply', 'application', 'how to apply', 'enrollment', 'prospective students', 'entrance',
        'costs', 'tuition', 'fees', 'expenses', 'financial information', 'funding', 'scholarships',
        'curriculum', 'courses', 'program', 'programmes', 'degree structure', 'modules', 'syllabus',
        'eligibility', 'admission requirements', 'entry requirements', 'qualifications', 'prerequisites', 'criteria',
        'deadlines', 'application deadlines', 'key dates', 'timeline', 'due dates', 'important dates',

        # -------------------------------------------- DE --------------------------------------------
        'übersicht', 'überblick', 'einführung', 'zusammenfassung', 'allgemeines', 'willkommen',
        'zulassung', 'zulassungen', 'bewerbung', 'bewerbungen', 'wie bewerben', 'einschreibung', 'potenzielle studenten', 'aufnahme',
        'kosten', 'studiengebühren', 'gebühren', 'ausgaben', 'finanzielle informationen', 'finanzierung', 'stipendien',
        'studienplan', 'lehrplan', 'curriculum', 'modulhandbuch', 'studiengangsstruktur', 'module', 'lehrstoff',
        'voraussetzungen', 'zulassungsvoraussetzungen', 'eintrittsvoraussetzungen', 'qualifikationen', 'vorkenntnisse', 'kriterien',
        'fristen', 'bewerbungsfristen', 'schlüsseltermine', 'zeitplan', 'fälligkeitsdaten', 'wichtige daten',
    ],
    'medium': [
        # -------------------------------------------- EN --------------------------------------------
        'faculty', 'faculties', 'staff', 'professors', 'departments', 'team', 'instructors', 'lecturers',
        'alumni', 'graduates', 'former students', 'success stories', 'alumnae',

        # -------------------------------------------- DE --------------------------------------------
        'fakultät', 'fakultäten', 'personal', 'professoren', 'dozenten', 'abteilungen', 'team', 'lehrkräfte',
        'alumni', 'absolventen', 'ehemalige studenten', 'erfolgsgeschichten',
    ],
    'low': [
        # -------------------------------------------- EN --------------------------------------------
        'news', 'press', 'blog', 'updates', 'articles', 'announcements',
        'events', 'calendar', 'activities', 'conferences', 'workshops', 'seminars',

        # -------------------------------------------- DE --------------------------------------------
        'nachrichten', 'presse', 'blog', 'aktualisierungen', 'artikel', 'ankündigungen',
        'veranstaltungen', 'kalender', 'aktivitäten', 'konferenzen', 'workshops', 'seminare',
    ],
}
38
+
39
# Topic label -> set of lower-case English and German keywords for that topic.
# The values are sets, so keywords listed in both language sections
# (e.g. 'assessment', 'budget') are stored once.
CHUNK_TOPIC_KEYWORDS = {
    'admissions': {
        # ----------------------- EN -----------------------
        'admissions', 'application', 'apply', 'application process',
        'deadline', 'deadlines', 'selection', 'assessment',
        'interview', 'admissions committee', 'application form',
        'submit', 'submission', 'enrollment',

        # ----------------------- DE -----------------------
        'zulassung', 'bewerbung', 'bewerben',
        'bewerbungsprozess', 'frist', 'fristen',
        'auswahlverfahren', 'aufnahmeverfahren',
        'assessment', 'interview', 'aufnahmegespräch',
        'zulassungskomitee', 'einschreibung', 'immatrikulation',
        'einreichen'
    },

    'costs': {
        # ----------------------- EN -----------------------
        'tuition', 'tuition fee', 'fees', 'costs', 'expenses',
        'payment', 'payment plan', 'installment', 'installments',
        'deposit', 'price', 'total cost',
        'funding', 'financing', 'loan', 'loans',
        'scholarship', 'scholarships', 'budget',

        # ----------------------- DE -----------------------
        'studiengebühren', 'gebühren', 'kosten', 'ausgaben',
        'zahlung', 'zahlungsplan', 'rate', 'raten',
        'anzahlung', 'preis', 'gesamtkosten',
        'finanzierung', 'kredit', 'kredite',
        'stipendium', 'stipendien', 'budget'
    },

    'curriculum': {
        # ----------------------- EN -----------------------
        'curriculum', 'program', 'programme', 'content',
        'module', 'modules', 'course', 'courses',
        'structure', 'format', 'timeline', 'schedule',
        'duration', 'ects', 'credits',
        'training', 'coaching', 'workshop', 'workshops',
        'project', 'projects', 'leadership', 'development',
        'learning', 'electives',

        # ----------------------- DE -----------------------
        'curriculum', 'programm', 'studium', 'inhalt',
        'modul', 'module', 'kurs', 'kurse',
        'struktur', 'format', 'zeitplan', 'ablauf',
        'dauer', 'ects', 'leistungspunkte',
        'training', 'coaching', 'workshop', 'workshops',
        'projekt', 'projekte', 'führung', 'entwicklung',
        'lernen', 'wahlfächer'
    },

    'eligibility': {
        # ----------------------- EN -----------------------
        'eligibility', 'requirements', 'prerequisites',
        'admission requirements', 'criteria',
        'qualification', 'qualifications',
        'work experience', 'leadership experience',
        'degree', 'academic degree',
        'language requirement', 'fluency',

        # ----------------------- DE -----------------------
        'voraussetzungen', 'zulassungsvoraussetzungen',
        'anforderungen', 'kriterien',
        'qualifikation', 'qualifikationen',
        'berufserfahrung', 'führungserfahrung',
        'abschluss', 'studienabschluss',
        'sprachkenntnisse', 'sprachvoraussetzungen'
    },

    'alumni': {
        # ----------------------- EN -----------------------
        'alumni', 'alumni network',
        'graduates', 'community',
        'career service', 'mentoring',

        # ----------------------- DE -----------------------
        'alumni', 'alumni-netzwerk',
        'absolventen', 'gemeinschaft',
        'karriereservice', 'mentoring'
    },

    'general': {
        # ----------------------- EN -----------------------
        'overview', 'introduction', 'summary',
        'highlights', 'benefits', 'advantages',
        'experience', 'journey',
        'programme details', 'program details',
        'location', 'format', 'language',

        # ----------------------- DE -----------------------
        'überblick', 'einführung', 'zusammenfassung',
        'highlights', 'vorteile',
        'erfahrung', 'reise',
        'programmdetails', 'standort',
        # 'sprache' is the German counterpart of 'language' above
        # (the original entry was misspelled 'sprche').
        'format', 'sprache'
    },
}
src/database/__init__.py ADDED
File without changes
src/database/docker-compose-cache.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose file for a single password-protected Redis container
# (container name: hsg_redis_cache).
version: '3.8'

services:
  redis:
    image: redis:alpine
    container_name: hsg_redis_cache
    ports:
      - "6379:6379"
    # ${REDIS_PASSWORD} is interpolated by Compose; memory is capped at 200mb
    # with the allkeys-lru eviction policy.
    command: >
      redis-server
      --requirepass "${REDIS_PASSWORD}"
      --save 60 1
      --loglevel warning
      --maxmemory 200mb
      --maxmemory-policy allkeys-lru
    volumes:
      - redis_data:/data
    restart: unless-stopped

    # Authenticated PING every 5s.
    healthcheck:
      test: ["CMD", "redis-cli", "-a", "${REDIS_PASSWORD}", "ping"]
      interval: 5s
      timeout: 3s
      retries: 5

# Named volume mounted at /data in the redis container.
volumes:
  redis_data:
src/database/docker-compose.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose file for a local Weaviate instance together with the
# text2vec-transformers inference container it is configured to call.
version: '3.4'

services:
  weaviate:
    image: semitechnologies/weaviate:1.33.0
    restart: on-failure:0
    ports:
      - "8080:8080"
      - "50051:50051"
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      # Anonymous access is enabled: no API key is configured for this instance.
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      ENABLE_API_BASED_MODULES: 'true'
      ENABLE_MODULES: 'text2vec-transformers'
      # Points at the t2v-transformers service defined below.
      TRANSFORMERS_INFERENCE_API: 'http://t2v-transformers:8080'
      CLUSTER_HOSTNAME: 'node1'
    volumes:
      - weaviate_data:/var/lib/weaviate

  t2v-transformers:
    image: semitechnologies/transformers-inference:sentence-transformers-all-MiniLM-L6-v2
    restart: on-failure:0
    # Published on host port 8081 because 8080 is taken by weaviate.
    ports:
      - "8081:8080"

# Named volume mounted at PERSISTENCE_DATA_PATH in the weaviate container.
volumes:
  weaviate_data:
29
+
src/database/redisservice.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import redis
2
+ from threading import Lock
3
+ from src.utils.logging import get_logger
4
+
5
+ logger = get_logger("redis_service")
6
+
7
class RedisService:
    """
    Singleton wrapper around one `redis.Redis` client.

    The first construction stores the connection settings and tries to
    connect; every later construction returns the same object and ignores
    its arguments. A failed connection attempt is logged and leaves the
    service without a client (see `is_connected`).
    """

    _instance = None
    _init_lock = Lock()

    def __new__(cls, host, port, password, mode):
        # Fast path: the singleton already exists.
        if cls._instance is not None:
            return cls._instance

        with cls._init_lock:
            # Re-check under the lock before creating the instance.
            if cls._instance is None:
                cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, host, port, password, mode):
        # __init__ runs on every RedisService(...) call; do the work only once.
        if getattr(self, '_initialized', False):
            return

        self._client = None
        self._host, self._port, self._password = host, port, password
        self.mode = mode

        self._connect()

        self._initialized = True

    def _connect(self):
        """Create the client and verify it with PING; on any error keep no client."""
        try:
            logger.info(f"Connecting to Redis at {self._host}:{self._port}...")
            options = {
                'host': self._host,
                'port': self._port,
                'password': self._password,
                'decode_responses': True,
                'socket_connect_timeout': 2,
                'socket_timeout': 2,
            }
            self._client = redis.Redis(**options)
            self._client.ping()
            logger.info(f"Successfully connected to Redis! {self.mode}")
        except Exception as e:
            logger.error(f"Redis connection failed: {e}")
            self._client = None

    def get_client(self):
        """Return the underlying client, or None when the connection attempt failed."""
        return self._client

    def is_connected(self) -> bool:
        """Report whether a client object is held (no live check is performed)."""
        return self._client is not None
src/database/weavservice.py ADDED
@@ -0,0 +1,851 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import reduce
2
+ import weaviate as wvt
3
+ import datetime, os
4
+ from threading import Lock
5
+
6
+ from time import perf_counter, sleep
7
+ from weaviate.classes.config import Configure, Property, DataType
8
+ from weaviate.collections.classes.grpc import MetadataQuery
9
+ from weaviate.collections.collection import Collection
10
+ from weaviate.classes.init import AdditionalConfig, Timeout
11
+ from weaviate.classes.query import Filter
12
+ from weaviate.config import AdditionalConfig
13
+
14
+ from ..utils.logging import get_logger
15
+ from ..config import config
16
+
17
+ logger = get_logger("weaviate_service")
18
+
19
def _get_collection_name(lang):
    """Return the collection name for `lang`: the configured base name plus '_<lang>'."""
    return f'{config.weaviate.WEAVIATE_COLLECTION_BASENAME}_{lang}'


# Collection names for every configured language, resolved once at import time.
_collection_names = list(map(_get_collection_name, config.get('AVAILABLE_LANGUAGES')))
21
+
22
+
23
def _default_properties() -> list[Property]:
    """Build the built-in property schema used when no properties file can be loaded."""
    schema = (
        ('body', DataType.TEXT),
        ('chunk_id', DataType.TEXT),
        ('document_id', DataType.TEXT),
        ('programs', DataType.TEXT_ARRAY),
        ('source', DataType.TEXT),
        ('date', DataType.DATE),
    )
    return [Property(name=field, data_type=kind) for field, kind in schema]
32
+
33
+
34
+ class WeaviateService:
35
+ """
36
+ Provides an interface for interacting with the Weaviate vector database.
37
+ Handles initialization, data import, and hybrid queries.
38
+ """
39
+
40
+ _instance = None
41
+ _init_lock = Lock()
42
+
43
def __new__(cls):
    """Return the shared instance, creating it on first use."""
    # Fast path: the singleton already exists.
    if cls._instance is not None:
        return cls._instance

    with cls._init_lock:
        # Re-check under the lock before creating the instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
    return cls._instance
49
+
50
def __init__(self) -> None:
    """
    Set up the service state and open the first connection.

    Runs its body only once per instance: later calls return immediately
    because `_initialized` is already set.

    Raises:
        Exception: whatever `_init_client` raises when no connection can be
            established (the error is logged first).
    """
    if hasattr(self, '_initialized'):
        return

    if config.weaviate.LOCAL_DATABASE:
        self._connection_type = 'local'
    else:
        self._connection_type = 'cloud'
    self._client = None
    self._client_lock = Lock()

    # Bookkeeping used by _init_client to reconnect a client that has been
    # idle for longer than the timeout (25 minutes).
    self._last_query_time = perf_counter()
    self._idle_timeout = 25 * 60
    self._initialized = True

    # First connection attempt
    logger.info("Initializing Weaviate service...")
    try:
        self._init_client()
    except Exception as e:
        logger.error(f"Failed to initialize Weaviate service: {e}")
        raise e
    else:
        logger.info("Weaviate service initialized successfully")
75
+
76
+
77
def _init_client(self) -> wvt.WeaviateClient:
    """
    Return a connected Weaviate client, creating or re-creating it when needed.

    An existing client is reused while the time since the last recorded query
    is below `_idle_timeout`; otherwise it is closed (errors ignored) and a new
    connection is opened. For the cloud connection a warm-up call
    (`collections.exists`) is issued after connecting; its failure is only logged.

    Returns:
        The connected Weaviate client instance.

    Raises:
        Exception: the error of the last failed attempt, after 3 failed
            connection attempts (one second apart).
    """
    # Returns the client if it hasn't been idling for too long
    if self._client is not None:
        time_since_query = perf_counter() - self._last_query_time
        if time_since_query < self._idle_timeout:
            return self._client

        # The connection might be closed, so the client has to be reconnected
        logger.warning(f"Client has been idling for too long. Reconnecting to prevent server-side closure...")
        try:
            self._client.close()
        except Exception as _:
            # Best effort: the old connection may already be gone.
            pass

        self._client = None

    # Client initialization
    with self._client_lock:
        # Another caller may have connected while this one waited for the lock.
        if self._client:
            return self._client

        retries = 0
        last_exception: Exception = None
        while retries < 3:
            try:
                if config.weaviate.LOCAL_DATABASE:
                    # Local connection: no warm-up call is made.
                    self._client = wvt.connect_to_local()
                    break

                self._client = wvt.connect_to_weaviate_cloud(
                    cluster_url=config.weaviate.CLUSTER_URL,
                    auth_credentials=config.weaviate.WEAVIATE_API_KEY,
                    additional_config=AdditionalConfig(
                        timeout=Timeout(
                            init=config.weaviate.INIT_TIMEOUT,
                            query=config.weaviate.QUERY_TIMEOUT,
                            insert=config.weaviate.INSERT_TIMEOUT,
                        ),
                        skip_init_checks=False,
                    ),
                    headers={
                        "X-HuggingFace-Api-Key": config.weaviate.HUGGING_FACE_API_KEY,
                    },
                )

                # Warm-up query
                logger.info("Running warm-up query to initialize server...")
                try:
                    collection = _get_collection_name(config.get('AVAILABLE_LANGUAGES')[0])
                    self._client.collections.exists(collection)
                    logger.info("Warm-up finished - server is ready!")
                except Exception as warmup_err:
                    logger.warning(f"Warm-up query failed (non-critical): {warmup_err}")

                break
            except Exception as e:
                # Remember the failure, wait one second and try again.
                last_exception = e
                logger.warning(f"Failed to establish connection on try {retries}: {e}")
                retries += 1
                sleep(1)

        # retries only reaches 3 when every attempt failed.
        if retries == 3:
            logger.error(f"Failed to establish connection after 3 retries!")
            raise last_exception

        logger.info(f"Successully connected to the {self._connection_type} weaviate database")
        # Restart the idle timer for the fresh connection.
        self._last_query_time = perf_counter()
        return self._client
155
+
156
+
157
def _select_collection(self, lang: str) -> tuple[Collection, str]:
    """
    Resolve the collection handle for a language.

    Args:
        lang (str): Language code; must be one of the configured
            'AVAILABLE_LANGUAGES'.

    Returns:
        tuple: (collection handle, collection name). For an unknown language
        the error is logged and (None, '') is returned instead of raising.
    """
    supported = config.get('AVAILABLE_LANGUAGES')
    if lang in supported:
        collection_name = _get_collection_name(lang)
        logger.debug(f"Using collection {collection_name}")

        # _init_client() reconnects transparently when the client was idle too long.
        handle = self._init_client().collections.use(collection_name)
        return handle, collection_name

    logger.error(f"No collection for language '{lang}' was found in the database")
    return None, ''
176
+
177
+
178
def batch_import(self, data_rows: list, lang: str) -> list:
    """
    Perform a batch import of multiple objects into the collection of a language.

    Args:
        data_rows (list): List of dictionaries representing the data rows to import.
            Each row is passed as the object's properties; its 'chunk_id' is
            read when an error entry is recorded.
        lang (str): Language whose collection receives the rows.

    Returns:
        list[dict]: One entry ({'index', 'chunk_id', 'error'}) per row whose
        `add_object` call raised. An empty list is also returned when no
        collection exists for `lang`.

    Raises:
        Exception: any error raised by the batch context is re-raised; when its
            message mentions a connection problem the cached client is dropped first.
    """
    collection, collection_name = self._select_collection(lang)
    if collection is None:
        logger.error("No working collection selected!")
        return []

    import_errors = []
    logger.info(f"Batch importing {len(data_rows)} rows into {collection_name}")

    try:
        with self._client_lock:
            with collection.batch.fixed_size(batch_size=100, concurrent_requests=2) as batch:
                for idx, data_row in enumerate(data_rows):
                    try:
                        batch.add_object(properties=data_row)
                    except Exception as e:
                        import_errors.append({'index': idx, 'chunk_id': data_row['chunk_id'], 'error': str(e)})

                    # Every 20 rows, report the batch's error counter.
                    # NOTE(review): these errors are only logged; they are not
                    # added to import_errors - confirm this is intended.
                    if idx % 20 == 0 and idx > 0:
                        if batch.number_errors > 0:
                            logger.info(f"Failed imports at index {idx}: {batch.number_errors}")

        self._last_query_time = perf_counter()
        logger.info(f"Batch import finished. Total errors: {len(import_errors)}")

    except Exception as e:
        if 'connection' in str(e).lower():
            logger.error(f"Connection error during batch import: {e}")
            # Drop the client so the next _init_client() call reconnects.
            self._client = None
        raise e

    return import_errors
223
+
224
+
225
@staticmethod
def _create_property_filter(prop, values) -> Filter:
    """
    Translate one (property, values) pair into a Weaviate filter.

    Supported properties:
        'programs': matches objects containing any of `values`.
        'source': `contains_any` when `values` is a list, `equal` otherwise.

    Returns:
        The filter, or None for any other property name.
    """
    if prop == 'programs':
        return Filter.by_property('programs').contains_any(values)

    if prop == 'source':
        source_filter = Filter.by_property('source')
        if isinstance(values, list):
            return source_filter.contains_any(values)
        return source_filter.equal(values)

    return None
235
+
236
+
237
def delete_chunks(self, lang: str, property_filters: dict[str, any] = None) -> int:
    """
    Delete all chunks from the specified collection that match given property filters.

    Args:
        lang (str): Language collection to use.
        property_filters (dict[str, any]): Key-value pairs for filtering. Only
            properties known to `_create_property_filter` are used; the
            resulting filters are combined with logical AND.

    Returns:
        int: Number of deleted objects, -1 when the client does not report a
        count, or 0 when no collection exists for `lang`.

    Raises:
        Exception: connection-type errors are retried once with a fresh
            client and re-raised if the retry fails too; all other errors
            are re-raised immediately.
    """
    retry_count = 0
    max_retries = 2

    # Build the combined filter once; None when nothing usable was supplied.
    # NOTE(review): `where=None` is then passed to delete_many - confirm how
    # the client treats a missing filter.
    filters = None
    if property_filters:
        parts = [self._create_property_filter(prop, values)
                 for prop, values in property_filters.items()]
        parts = [f for f in parts if f is not None]
        if parts:
            filters = reduce(lambda f1, f2: f1 & f2, parts)

    while retry_count < max_retries:
        try:
            collection, collection_name = self._select_collection(lang)
            if collection is None:
                logger.error("No working collection selected!")
                return 0

            logger.info(f"Deleting chunks from {collection_name} with filters={property_filters}")

            with self._client_lock:
                result = collection.data.delete_many(
                    where=filters
                )

            self._last_query_time = perf_counter()

            deleted = getattr(result, "objects_deleted", None)
            if deleted is None:
                logger.info("Deletion executed (count not returned by client)")
                return -1

            logger.info(f"Deleted {deleted} objects")
            return deleted

        except Exception as e:
            if any(err_type in str(e).lower() for err_type in ['reset', 'closed', 'grpc', 'unavailable']):
                retry_count += 1
                logger.warning(f"Connection error during deletion: {e}. Retrying...")
                if retry_count == max_retries:
                    raise e
                # Drop the broken client (as batch_import does). Otherwise
                # _init_client() would hand back the same non-idle client
                # and the retry would fail in exactly the same way.
                self._client = None
            else:
                raise e
289
+
290
+
291
def ping(self, lang: str) -> dict:
    """
    Run a throw-away hybrid query against the collection of `lang`.

    Returns:
        dict: {'status': 'OK'} on success, otherwise
        {'status': 'ERROR', 'error': <exception>}. Never raises.
    """
    try:
        target, _ = self._select_collection(lang)
        with self._client_lock:
            target.query.hybrid("health check query")
    except Exception as e:
        return {'status': 'ERROR', 'error': e}
    else:
        return {'status': 'OK'}
299
+
300
+
301
def query(self, query: str, lang: str, property_filters: dict[str] = None, limit: int = 5) -> tuple:
    """
    Execute a hybrid semantic and keyword query against the collection of a language.

    Args:
        query (str): The query string.
        lang (str): Language collection to use.
        property_filters (dict[str, any]): Key-value pairs for metadata filtering.
            Only properties known to `_create_property_filter` are used;
            multiple filters are combined using logical AND.
        limit (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        tuple: (query response, elapsed seconds), or ([], 0) when no
        collection exists for `lang`.

    Raises:
        Exception: connection-type errors are retried once with a fresh
            client and re-raised if the retry fails too; all other errors
            are re-raised immediately.
    """
    retry_count = 0
    max_retries = 2

    # Build the combined filter once; None when nothing usable was supplied.
    filters = None
    if property_filters:
        parts = [self._create_property_filter(prop, values)
                 for prop, values in property_filters.items()]
        parts = [f for f in parts if f is not None]
        if parts:
            filters = reduce(lambda f1, f2: f1 & f2, parts)

    while retry_count < max_retries:
        try:
            collection, collection_name = self._select_collection(lang)
            if collection is None:
                logger.error("No working collection selected upon starting of the querying!")
                return [], 0

            logger.info(f"Querying collection {collection_name}")
            query_start_time = perf_counter()

            with self._client_lock:
                resp = collection.query.hybrid(
                    query=query,
                    filters=filters,
                    limit=limit,
                    return_metadata=MetadataQuery.full()
                )
            elapsed = perf_counter() - query_start_time
            self._last_query_time = perf_counter()
            logger.info(f"Querying retrieved {len(resp.objects)} objects in {elapsed:3.2f} seconds")

            return (resp, elapsed)
        except Exception as e:
            if any(err_type in str(e).lower() for err_type in ['reset', 'closed', 'grpc', 'unavailable']):
                retry_count += 1
                logger.warning(f"Connection error detected: {e}. Retrying...")

                if retry_count == max_retries:
                    raise e
                # Drop the broken client (as batch_import does). Otherwise
                # _init_client() would hand back the same non-idle client
                # and the retry would fail in exactly the same way.
                self._client = None
            else:  # Probably not a server issue
                raise e
360
+
361
+
362
def _load_properties(self) -> list[Property]:
    """
    Load the collection property schema from 'properties.yaml'.

    The file is looked up in `config.weaviate.PROPERTIES_PATH`. Each top-level
    key is a property name; its mapping may define 'data_type', 'filterable',
    'searchable' and 'skip_vectorization'.

    Returns:
        list[Property]: Properties built from the file, or the built-in
        defaults when the file is missing or empty, or PyYAML is not installed.

    Raises:
        Exception: if the file cannot be read or parsed, or an entry has a
            missing or unknown data type (the error is logged first).
    """
    properties_file = os.path.join(config.weaviate.PROPERTIES_PATH, 'properties.yaml')
    if not os.path.exists(properties_file):
        logger.warning(
            f"Optional file 'properties.yaml' is missing on path: {properties_file}. "
            "Falling back to built-in default properties."
        )
        return _default_properties()

    try:
        # Imported lazily so PyYAML stays an optional dependency.
        import yaml

        # Explicit encoding: do not depend on the platform default.
        with open(properties_file, 'r', encoding='utf-8') as stream:
            properties = yaml.safe_load(stream)
    except ModuleNotFoundError:
        logger.warning(
            "PyYAML is not installed. Falling back to built-in default properties "
            "for Weaviate collection creation."
        )
        return _default_properties()
    except Exception as e:
        logger.error(f"Failed to load properties from path {properties_file}: {e}")
        raise e

    if not properties:
        logger.warning("properties.yaml is empty. Falling back to built-in default properties.")
        return _default_properties()

    final_properties = []
    for name, params in properties.items():
        # Reset per entry so the error log below never reads an unbound or
        # stale value when `params` is not a mapping.
        data_type = None
        try:
            data_type = params.get('data_type', '')
            dtype = DataType(data_type)
        except Exception as e:
            logger.error(f"Nonexistent datatype {data_type}")
            raise e

        final_properties.append(Property(
            name=name,
            data_type=dtype,
            index_filterable=params.get('filterable', True),
            index_searchable=params.get('searchable', True),
            skip_vectorization=params.get('skip_vectorization', False),
        ))

    return final_properties
409
+
410
+
411
def _create_collections(self):
    """
    Create one collection per configured language.

    The property schema comes from `_load_properties()`. The vectorizer is
    text2vec-transformers for a local database and text2vec-huggingface
    otherwise. A failure to create a single collection is logged and does
    not stop the remaining ones.

    Raises:
        Exception: any error outside the per-collection handling (for example
            a failed connection); the cached client is dropped before re-raising.
    """
    properties = self._load_properties()
    try:
        client = self._init_client()
        logger.info('Attempting collections creation...')

        vector_config = (
            Configure.Vectors.text2vec_transformers() if config.weaviate.LOCAL_DATABASE
            else Configure.Vectors.text2vec_huggingface(
                name='hsg_rag_embeddings',
                source_properties=['body'],
                model=config.processing.EMBEDDING_MODEL,
            )
        )

        successful_creations = 0

        with self._client_lock:
            for collection_name in _collection_names:
                try:
                    client.collections.create(
                        name=collection_name,
                        properties=properties,
                        vector_config=vector_config
                    )
                    logger.info(f"Created collection {collection_name}")
                    successful_creations += 1
                except Exception as e:
                    # Keep going: the remaining collections are still attempted.
                    logger.error(f"Failed to create collection '{collection_name}': {e}")

        self._last_query_time = perf_counter()

        if successful_creations == len(_collection_names):
            logger.info('All collections successfully instantiated')
        else:
            logger.warning(f"Only {successful_creations}/{len(_collection_names)} collections created")

    except Exception as e:
        logger.error(f"Collections creation failed: {e}")
        # Drop the client so the next _init_client() call reconnects.
        self._client = None
        raise e
457
+
458
+
459
def _delete_collections(self):
    """
    Delete the collections of all configured languages from the database.

    Collections that do not exist are skipped with a warning; a failure to
    delete a single collection is logged and does not stop the remaining ones.
    No hash file is touched by this method.

    Raises:
        Exception: any error outside the per-collection handling (for example
            a failed connection); the cached client is dropped before re-raising.
    """
    try:
        client = self._init_client()
        logger.info("Initiating deletion of stored collections...")

        deleted_count = 0
        with self._client_lock:
            for collection_name in _collection_names:
                try:
                    if client.collections.exists(collection_name):
                        client.collections.delete(collection_name)
                        logger.info(f"Deleted collection {collection_name}")
                        deleted_count += 1
                    else:
                        logger.warning(f"Collection {collection_name} does not exist")
                except Exception as e:
                    # Keep going: the remaining collections are still attempted.
                    logger.error(f"Failed to delete collection {collection_name}: {e}")

        self._last_query_time = perf_counter()
        logger.info(f"Deleted {deleted_count}/{len(_collection_names)} collections")

    except Exception as e:
        logger.error(f"Collections deletion failed: {e}")
        # Drop the client so the next _init_client() call reconnects.
        self._client = None
        raise e
489
+
490
+
491
def _reset_collections(self):
    """Delete the language collections, then create them again."""
    # Order matters: creation runs only after deletion has returned.
    self._delete_collections()
    self._create_collections()
494
+
495
+
496
+ def _collect_chunk_ids(self) -> dict:
497
+ client = self._init_client()
498
+ try:
499
+ ids = []
500
+ with self._client_lock:
501
+ for c in client.collections.list_all(simple=False):
502
+ coll = client.collections.get(c)
503
+ for obj in coll.iterator():
504
+ ids.append(obj.properties['chunk_id'])
505
+ return ids
506
+ except Exception as e:
507
+ logger.error(f"Failed to collect chunk ids: {e}")
508
+ raise e
509
+
510
+
511
+ def _extract_data(self) -> dict:
512
+ client = self._init_client()
513
+ try:
514
+ schema = []
515
+ objects = {}
516
+ with self._client_lock:
517
+ for c in client.collections.list_all(simple=False):
518
+ coll = client.collections.get(c)
519
+ cfg = coll.config.get().to_dict()
520
+ schema.append(cfg)
521
+
522
+ objects[c] = []
523
+ for obj in coll.iterator(include_vector=True):
524
+ objects[c].append({
525
+ "uuid": obj.uuid,
526
+ "properties": obj.properties,
527
+ "vector": obj.vector,
528
+ })
529
+
530
+ return {
531
+ 'schema': schema,
532
+ 'objects': objects,
533
+ }
534
+ except Exception as e:
535
+ logger.error(f"Failed to extract data from database: {e}")
536
+ raise e
537
+
538
+
539
+ def _create_backup(self) -> str:
540
+ """
541
+ Create a backup of the current database state and stores it under selected backup provider.
542
+
543
+ Returns: backup id of the created backup.
544
+ """
545
+ try:
546
+ if not config.weaviate.BACKUP_METHOD:
547
+ raise ValueError('Backup method is not selected!')
548
+ if config.weaviate.BACKUP_METHOD not in config.weaviate.BACKUP_METHODS:
549
+ raise ValueError(f"Selected backup method 'config.weaviate.BACKUP_METHODS' is not supported!")
550
+ if not config.weaviate.BACKUP_PATH:
551
+ raise ValueError("Backup directory is not set!")
552
+ os.makedirs(config.weaviate.BACKUP_PATH, exist_ok=True)
553
+
554
+ backup_id = f"backup_{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}"
555
+ logger.info(f"Initiating backup creation for {self._connection_type} database...")
556
+
557
+ match config.weaviate.BACKUP_METHOD:
558
+ case 'manual':
559
+ import json
560
+
561
+ backup_path = os.path.join(config.weaviate.BACKUP_PATH, backup_id)
562
+ os.makedirs(backup_path)
563
+
564
+ db_data = self._extract_data()
565
+ data_backup = {
566
+ 'creation_date': datetime.datetime.now().isoformat(),
567
+ }
568
+
569
+ schema_backup_path = os.path.join(backup_path, 'schema.json')
570
+ with open(schema_backup_path, 'w', encoding='utf-8') as f:
571
+ json.dump(db_data['schema'], f, indent=2, default=str)
572
+
573
+ objects_backup_path = os.path.join(backup_path, 'objects.json')
574
+ with open(objects_backup_path, 'w', encoding='utf-8') as f:
575
+ json.dump(db_data['objects'], f, indent=2, default=str)
576
+
577
+ data_backup_path = os.path.join(backup_path, 'data.json')
578
+ with open(data_backup_path, 'w', encoding='utf-8') as f:
579
+ json.dump(data_backup, f, indent=2, default=str)
580
+
581
+ case 's3':
582
+ client = self._init_client()
583
+ with self._client_lock:
584
+ client.backup.create(
585
+ backup_id=backup_id,
586
+ backend="s3",
587
+ include_collections=_collection_names,
588
+ wait_for_completion=True,
589
+ )
590
+ case _:
591
+ raise NotImplementedError()
592
+
593
+
594
+ self._last_query_time = perf_counter()
595
+ logger.info(f"Backup '{backup_id}' created successfully")
596
+
597
+ return backup_id
598
+ except Exception as e:
599
+ logger.error(f"Backup creation failed: {e}")
600
+ raise e
601
+
602
+
603
def _restore_backup(self, backup_id: str):
    """
    Restore the database state from a backup.

    The configuration and, for the ``'manual'`` method, the backup files are
    validated *before* the existing collections are deleted, so a wrong backup id
    or a misconfiguration no longer leaves the database empty. Local backup files
    are only required for ``'manual'``; an ``'s3'`` backup is restored through
    ``client.backup.restore``.

    Args:
        backup_id: ID of the backup to restore from.

    Raises:
        ValueError: if no backup method or path is configured, or the method is
            not listed in ``config.weaviate.BACKUP_METHODS``.
        NotImplementedError: if the method is listed but not handled here.
        RuntimeError: if a manual backup directory or one of its files is missing.
        Exception: any other failure is logged and re-raised.
    """
    try:
        method = config.weaviate.BACKUP_METHOD
        if not method:
            raise ValueError('Backup method is not selected!')
        if method not in config.weaviate.BACKUP_METHODS:
            raise ValueError(f"Selected backup method '{method}' is not supported!")
        if not config.weaviate.BACKUP_PATH:
            raise ValueError("Backup directory is not set!")
        os.makedirs(config.weaviate.BACKUP_PATH, exist_ok=True)

        # Reject unhandled methods up front, before any data is touched.
        if method not in ('manual', 's3'):
            raise NotImplementedError()

        schema_backup_path = objects_backup_path = None
        if method == 'manual':
            backup_path = os.path.join(config.weaviate.BACKUP_PATH, backup_id)
            if not os.path.exists(backup_path):
                raise RuntimeError(
                    f"Directory for backup '{backup_id}' does not exist in the backup directory!"
                )
            schema_backup_path = os.path.join(backup_path, 'schema.json')
            if not os.path.exists(schema_backup_path):
                raise RuntimeError("Schema backup is missing in the backup directory!")
            objects_backup_path = os.path.join(backup_path, 'objects.json')
            if not os.path.exists(objects_backup_path):
                raise RuntimeError("Objects backup is missing in the backup directory!")

        # Validation passed: only now drop the live collections. This call takes
        # the client lock itself, so it must stay outside the locked section below.
        self._delete_collections()

        client = self._init_client()
        logger.info(f"Initiating restoration from backup '{backup_id}' for {self._connection_type} database...")

        with self._client_lock:
            if method == 'manual':
                import json

                # Backups are written as UTF-8, so read them back the same way.
                with open(schema_backup_path, encoding='utf-8') as f:
                    schemas = json.load(f)
                for cfg in schemas:
                    client.collections.create_from_dict(cfg)

                with open(objects_backup_path, encoding='utf-8') as f:
                    data = json.load(f)
                for name, objs in data.items():
                    logger.info(f"Restoring collection '{name}' with {len(objs)} objects...")
                    coll = client.collections.get(name)

                    with coll.batch.dynamic() as batch:
                        for o in objs:
                            props = o['properties']
                            date = props.get('date')
                            # Dates were dumped via str(); rewrite the separator and
                            # UTC offset before re-import. Objects without a string
                            # date are imported unchanged.
                            if isinstance(date, str):
                                props['date'] = date.replace(" ", "T").replace("+00:00", "Z")
                            batch.add_object(
                                uuid=o["uuid"],
                                properties=props,
                                vector=o["vector"]
                            )
                    logger.info(f"Collection '{name}' restored successfully")
            else:
                client.backup.restore(
                    backup_id=backup_id,
                    backend="s3",
                    wait_for_completion=True,
                    roles_restore="all",
                    users_restore="all",
                )

        self._last_query_time = perf_counter()
        logger.info(f"Backup '{backup_id}' restored successfully")

    except Exception as e:
        error_msg = str(e).lower()
        if 'connection' in error_msg:
            logger.error(f"Connection error during backup restore: {e}. Will reconnect on next operation.")
            self._client = None
        logger.error(f"Backup restoration failed: {e}")
        raise
686
+
687
+
688
def _checkhealth(self) -> bool:
    """
    Run a connectivity and health check against the Weaviate database.

    Steps, in order:
        1. ask the client whether it is connected (fails fast when it is not);
        2. fetch and log the server metadata (version, modules, and the hostname
           when ``config.weaviate.LOCAL_DATABASE`` is set) - a failure here only
           produces a warning;
        3. verify that every name in ``_collection_names`` exists.

    Returns:
        True if the client is connected and all expected collections exist,
        False otherwise (including when the check itself raises).
    """
    try:
        client = self._init_client()

        # Step 1: basic connectivity.
        with self._client_lock:
            connected = client.is_connected()

        if connected:
            connection_status = "✓ OK"
        else:
            connection_status = "✗ ERROR"
        logger.info(f"Connection to {self._connection_type} database: {connection_status}")

        if not connected:
            logger.error("Database connection check failed")
            return False

        # Step 2: metadata (informational only).
        try:
            with self._client_lock:
                meta = client.get_meta()

            modules = meta.get('modules', {})
            module_names = list(modules.keys()) if isinstance(modules, dict) else modules
            if module_names:
                modules_str = ', '.join(str(m) for m in module_names)
            else:
                modules_str = 'None'

            # Keep the log line short.
            if len(modules_str) > 50:
                modules_str = modules_str[:47] + '...'

            fields = []
            if config.weaviate.LOCAL_DATABASE:
                fields.append(f"HOSTNAME={meta.get('hostname', 'unknown')}")
            fields.append(f"VERSION={meta.get('version', 'unknown')}")
            fields.append(f"MODULES={modules_str}")
            logger.info("Database metadata: " + ", ".join(fields))

        except Exception as e:
            logger.warning(f"Could not retrieve database metadata: {e}")

        # Step 3: expected collections.
        all_present = True
        with self._client_lock:
            for name in _collection_names:
                try:
                    exists = client.collections.exists(name)
                    status = "✓ OK" if exists else "✗ MISSING"
                    logger.info(f"Collection '{name}': {status}")
                except Exception as e:
                    logger.error(f"Error checking collection '{name}': {e}")
                    all_present = False
                else:
                    if not exists:
                        all_present = False

        self._last_query_time = perf_counter()

        # Connectivity was already confirmed above, so only the collections decide.
        if not all_present:
            logger.warning("✗ Database health check FAILED - Some issues detected")
            return False

        logger.info("✓ Database health check PASSED - All systems operational")
        return True

    except Exception as e:
        if 'connection' in str(e).lower():
            logger.error(f"Connection error during health check: {e}. Will reconnect on next operation.")
            self._client = None
        logger.error(f"Health check failed: {e}")
        return False
783
+
784
+
785
def parse_arguments():
    """
    Build the command-line interface of the database utility and parse ``sys.argv``.

    All options are mutually exclusive: five boolean switches plus
    ``--restore_backup``, which takes the backup id as its value.

    Returns:
        argparse.Namespace: Parsed command-line arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Weaviate database management utility'
    )
    actions = parser.add_mutually_exclusive_group()

    # (short flag, long flag, help text) for every boolean switch.
    switches = (
        ('-dc', "--delete_collections", 'Delete all collections from the database'),
        ('-cc', "--create_collections", 'Initialize collections for different language contents'),
        ('-rc', "--redo_collections", 'Delete and recreate all collections'),
        ('-ch', "--checkhealth", 'Check database connection and collection existence'),
        ('-cb', "--create_backup", 'Create a backup of the current database state'),
    )
    for short_flag, long_flag, help_text in switches:
        actions.add_argument(short_flag, long_flag, action='store_true', help=help_text)

    actions.add_argument(
        '-rb', "--restore_backup",
        type=str,
        metavar='BACKUP_ID',
        help='Restore database from a backup (provide backup_id)'
    )

    return parser.parse_args()
832
+
833
+
834
if __name__ == "__main__":
    # Command-line entry point: run the single action selected by the flags.
    args = parse_arguments()
    service = WeaviateService()

    if args.create_backup:
        service._create_backup()

    if args.restore_backup:
        service._restore_backup(args.restore_backup)

    # --redo_collections implies both a delete and a create.
    if args.delete_collections or args.redo_collections:
        service._delete_collections()

    if args.create_collections or args.redo_collections:
        service._create_collections()

    # A health check also follows every (re)creation of collections.
    if args.checkhealth or args.create_collections or args.redo_collections:
        service._checkhealth()
src/notification/__init__.py ADDED
File without changes
src/notification/notification_center.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+ import mimetypes
3
+ import os
4
+ import smtplib
5
+ from email.message import EmailMessage
6
+
7
+ import requests
8
+
9
+ from ..config import NotificationCenterConfig as NC
10
+
11
+
12
# Channels accepted by NotificationCenter.send_notification; "all" fans out to every notifier.
Channel = Literal["email", "slack", "all"]
13
+
14
+
15
+ class EmailNotifier:
16
+ def __init__(self):
17
+ self.enabled = NC.ENABLE_EMAIL_ALERTS
18
+ self.smtp_host = NC.SMTP_HOST
19
+ self.smtp_port = NC.SMTP_PORT
20
+ self.smtp_user = NC.SMTP_USER
21
+ self.smtp_password = NC.SMTP_PASSWORD
22
+ self.smtp_use_tls = NC.SMTP_USE_TLS
23
+ self.from_email = NC.FROM_EMAIL
24
+ self.to_emails = self._parse_recipients(NC.TO_EMAIL)
25
+
26
+ if self.enabled:
27
+ self._validate()
28
+
29
+ @staticmethod
30
+ def _parse_recipients(value: str | None) -> list[str]:
31
+ if not value:
32
+ return []
33
+ return [email.strip() for email in value.split(",") if email.strip()]
34
+
35
+ def _validate(self) -> None:
36
+ missing = []
37
+
38
+ if not self.smtp_host:
39
+ missing.append("NOTIFY_SMTP_HOST")
40
+ if not self.smtp_user:
41
+ missing.append("NOTIFY_SMTP_USER")
42
+ if not self.smtp_password:
43
+ missing.append("NOTIFY_SMTP_PASSWORD")
44
+ if not self.from_email:
45
+ missing.append("NOTIFY_FROM_EMAIL")
46
+ if not self.to_emails:
47
+ missing.append("NOTIFY_TO_EMAIL")
48
+
49
+ if missing:
50
+ raise ValueError(f"Missing notification email config: {', '.join(missing)}")
51
+
52
+ def send(
53
+ self,
54
+ subject: str,
55
+ body: str,
56
+ attachments: str | list[str] | None = None,
57
+ ) -> None:
58
+ if not self.enabled:
59
+ return
60
+
61
+ if isinstance(attachments, str):
62
+ attachments = [attachments]
63
+
64
+ msg = EmailMessage()
65
+ msg["Subject"] = subject
66
+ msg["From"] = self.from_email
67
+ msg["To"] = ", ".join(self.to_emails)
68
+ msg.set_content(body)
69
+
70
+ if attachments:
71
+ for file_path in attachments:
72
+ if not file_path or not os.path.isfile(file_path):
73
+ continue
74
+
75
+ mime_type, _ = mimetypes.guess_type(file_path)
76
+ mime_type = mime_type or "application/octet-stream"
77
+ maintype, subtype = mime_type.split("/", 1)
78
+
79
+ with open(file_path, "rb") as f:
80
+ msg.add_attachment(
81
+ f.read(),
82
+ maintype=maintype,
83
+ subtype=subtype,
84
+ filename=os.path.basename(file_path),
85
+ )
86
+
87
+ with smtplib.SMTP(self.smtp_host, self.smtp_port, timeout=20) as server:
88
+ if self.smtp_use_tls:
89
+ server.starttls()
90
+ server.login(self.smtp_user, self.smtp_password)
91
+ server.send_message(msg)
92
+
93
+
94
class SlackNotifier:
    """Posts notifications to a Slack incoming webhook."""

    def __init__(self):
        """Read the Slack settings from the config and validate them when Slack alerts are enabled."""
        self.enabled = NC.ENABLE_SLACK_ALERTS
        self.webhook_url = NC.SLACK_WEBHOOK_URL

        if self.enabled:
            self._validate()

    def _validate(self) -> None:
        """Raise ValueError when no webhook URL is configured."""
        if self.webhook_url:
            return
        raise ValueError("Missing notification slack config: NOTIFY_SLACK_WEBHOOK_URL")

    def send(self, subject: str, body: str) -> None:
        """
        Post ``subject`` (in bold) followed by ``body`` to the webhook; a no-op when disabled.

        Raises:
            requests.HTTPError: raised by ``raise_for_status`` for error responses.
            RuntimeError: if the response passed that check but its status is not 200.
        """
        if not self.enabled:
            return

        payload = {"text": f"*{subject}*\n{body}"}
        response = requests.post(self.webhook_url, json=payload, timeout=10)

        response.raise_for_status()

        if response.status_code == 200:
            return
        raise RuntimeError(
            f"Slack notification failed: {response.status_code} {response.text}"
        )
124
+
125
+
126
class NotificationCenter:
    """Facade that routes a notification to the e-mail notifier, the Slack notifier, or both."""

    def __init__(self):
        """Create one notifier per supported channel."""
        self.email = EmailNotifier()
        self.slack = SlackNotifier()

    def send_notification(
        self,
        subject: str,
        body: str,
        channel: Channel = "email",
        attachments: str | list[str] | None = None,
    ) -> None:
        """
        Deliver a notification through the selected channel.

        Args:
            subject: Subject / headline of the notification.
            body: Message text.
            channel: ``"email"``, ``"slack"`` or ``"all"`` (e-mail first, then Slack).
            attachments: File path(s) forwarded to the e-mail notifier only.

        Raises:
            ValueError: if ``channel`` is none of the supported values.
        """
        if channel not in ("all", "email", "slack"):
            raise ValueError(f"Unknown notification channel: {channel}")

        if channel in ("all", "email"):
            self.email.send(subject, body, attachments)
        if channel in ("all", "slack"):
            self.slack.send(subject, body)
src/pipeline/__init__.py ADDED
File without changes
src/pipeline/pipeline.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .utils import *
2
+ from .processors import *
3
+ from ..scraping.scraper import Scraper
4
+
5
+ from ..database.weavservice import WeaviateService
6
+ from ..utils.logging import get_logger
7
+ from ..config import config
8
+
9
+ pipelogger = get_logger("pipeline_module")
10
+ implogger = get_logger("import_pipeline")
11
+
12
+
13
class ImportPipeline:
    """
    Main pipeline class responsible for importing website and local documents
    into the database with deduplication and language-based organization.
    """

    def __init__(
        self,
        logging_callback = None,
        deduplication_callback = None,
    ) -> None:
        """
        Initialize the import pipeline with optional callbacks for logging and deduplication.

        This sets up the document processor and the database service, and receives the
        chunk IDs already stored in the database for deduplication purposes.

        Args:
            logging_callback (callable, optional): A callback function for logging progress.
                Defaults to a placeholder if not provided.
            deduplication_callback (callable, optional): A callback function for handling
                deduplication decisions. Defaults to a placeholder if not provided.
        """
        self._logging_callback = logging_callback or logging_callback_placeholder
        self._deduplication_callback = deduplication_callback or deduplication_callback_placeholder
        self._docprocessor = DocumentProcessor()
        self._service = WeaviateService()
        # Kept as a set: _deduplicate tests membership once per chunk.
        # Assumes chunk ids are hashable (e.g. strings) - TODO confirm.
        self._ids = set(self._service._collect_chunk_ids())

        implogger.info('Import pipeline initialization finished!')


    def import_from_scraper(self, scraper_chunks: dict[str, dict]) -> None:
        """
        Import scraped chunks, replacing what is stored for the same sources.

        For every language with chunks, the chunks currently stored under the same
        ``source`` values are deleted before the new ones are batch-imported.

        Args:
            scraper_chunks: Mapping of language code to the chunks scraped for it.
        """
        for lang, chunks in scraper_chunks.items():
            if not chunks:
                continue

            sources = list({chunk.get('source', '') for chunk in chunks})
            self._service.delete_chunks(lang, property_filters={'source': sources})
            self._service.batch_import(data_rows=chunks, lang=lang)


    def scrape_website(self, target_urls: list[str] | None = None, scrape_all: bool = False) -> None:
        """
        Scrape the given targets (or the configured ones) and import the result.

        Args:
            target_urls: URLs to scrape; falls back to ``config.scraping.TARGET_URLS``.
            scrape_all: Forwarded to the ``Scraper`` constructor.
        """
        target_urls = [url for url in (target_urls or config.scraping.TARGET_URLS or []) if url]
        if not target_urls:
            implogger.warning("No target URLs configured for scraping.")
            return

        scraper = Scraper(scrape_all=scrape_all)
        for target_url in target_urls:
            self._logging_callback(f"Scraping target {target_url}...", 0)
            scraped_chunks = scraper.scrape_target(target_url)
            if not scraped_chunks:
                self._logging_callback(f"No importable chunks scraped from {target_url}.", 100)
                continue

            self._logging_callback(f"Importing scraped chunks from {target_url}...", 90)
            self.import_from_scraper(scraped_chunks)
            self._logging_callback(f"Finished scraping import for {target_url}.", 100)


    def import_many_documents(self, sources: list[str]) -> None:
        """Import several local documents; shorthand for ``import_all(paths=sources)``."""
        self.import_all(paths=sources)


    def _import_urls_via_scraper(self, urls: list[str], scrape_all: bool = True) -> None:
        """
        Scrape each URL and import its chunks, reporting failures through the logging callback.

        Args:
            urls: URLs to scrape; empty entries are ignored and ``None`` is treated as empty.
            scrape_all: Forwarded to the ``Scraper`` constructor.
        """
        urls = [url for url in (urls or []) if url]
        if not urls:
            return

        scraper = Scraper(scrape_all=scrape_all)
        for url in urls:
            self._logging_callback(f"Scraping URL {url}...", 0)
            scraped_chunks = scraper.scrape_target(url)
            if not scraped_chunks:
                self._logging_callback(f"Failed to scrape URL {url}!", 100, failed=True)
                continue

            self._logging_callback(f"Importing scraped chunks from {url}...", 90)
            self.import_from_scraper(scraped_chunks)
            self._logging_callback(f"Stored scraped chunks for {url}.", 100)


    def import_all(
        self,
        paths: list[str] | None = None,
        urls: list[str] | None = None,
        reset_collections: bool = False,
    ) -> None:
        """
        Import documents from local paths and/or URLs into the database.

        Local paths go through the document processor and are grouped by language;
        URLs are handled by the scraper afterwards. Collections are optionally reset
        before the document chunks are imported.

        Args:
            paths (list[str], optional): List of local file paths to process. Defaults to None.
            urls (list[str], optional): List of website URLs to process. Defaults to None.
            reset_collections (bool, optional): If True, reset the database collections before importing.
                Defaults to False.
        """
        chunks = self._pipeline(paths, self._docprocessor, reset_collections)

        if reset_collections:
            self._logging_callback('Resetting database collections...', 60)
            self._service._reset_collections()

        self._logging_callback('Importing document chunks to database...', 90)
        for lang, ch in chunks.items():
            self._service.batch_import(data_rows=ch, lang=lang)

        self._import_urls_via_scraper(urls, scrape_all=True)

        # The reported total covers document chunks only, not scraped ones.
        self._logging_callback(
            f'Successfully imported {sum(len(ch) for ch in chunks.values())} document chunks!',
            100
        )


    def _pipeline(
        self,
        sources: list[str],
        processor: ProcessorBase,
        reset_collections: bool,
    ) -> dict:
        """
        Internal pipeline to process a list of sources using a given processor.

        Handles processing, deduplication (if not resetting), and organizes unique chunks by language.
        A source that cannot be processed, or whose detected language is not one of the
        available languages, is reported as failed and skipped instead of aborting the run.
        If no new unique data is found, logs a warning and returns empty chunks.

        Args:
            sources (list[str]): List of sources (paths or URLs) to process.
            processor (ProcessorBase): The processor instance to use for handling sources.
            reset_collections (bool): If True, skip deduplication.

        Returns:
            dict: A dictionary mapping languages to lists of unique chunk dictionaries.
        """
        unique_chunks = {lang: [] for lang in config.get('AVAILABLE_LANGUAGES')}

        sources = [s for s in (sources or []) if s != ""]
        if not sources:
            return unique_chunks

        for source in sources:
            self._logging_callback(f'Starting pipeline for {source}...', 0)
            result = processor.process(source)

            if not result.chunks:
                implogger.error(f"Failed to process {source}!")
                self._logging_callback(f"Failed to process {source}!", 100, result, failed=True)
                continue

            # Checked before deduplication so nothing is deleted for a source
            # that cannot be stored anyway.
            if result.lang not in unique_chunks:
                implogger.error(
                    f"Detected language '{result.lang}' of {source} is not among the available languages!"
                )
                self._logging_callback(f"Failed to process {source}!", 100, result, failed=True)
                continue

            if not reset_collections:
                self._deduplicate(result)

            self._logging_callback(f'Storing chunks for {source}...', 100, result)
            unique_chunks[result.lang].extend(result.chunks)

        if not any(unique_chunks.values()):
            self._logging_callback('No new data could be extracted from these sources!', 100)
            implogger.warning("File(s) provided for the insertion do not contain any unique information.")

        return unique_chunks


    def _deduplicate(self, result: ProcessingResult) -> ProcessingResult:
        """
        Remove duplicate chunks based on chunks that are already stored in the database.

        If duplicates are found, the deduplication callback decides what happens: when it
        returns a truthy value the stored duplicates are deleted and all chunks are kept for
        reimport; otherwise only the chunks with unknown IDs are kept.

        Args:
            result (ProcessingResult): The processing result containing document chunks.

        Returns:
            ProcessingResult: The same ``result`` object; its ``chunks`` are reduced to the
            unique ones unless the duplicates are being reimported.
        """
        self._logging_callback('Performing deduplication...', 80)
        unique_chunks = []
        duplicate_ids = []
        for chunk in result.chunks:
            chunk_id = chunk['chunk_id']
            if chunk_id in self._ids:
                duplicate_ids.append(chunk_id)
            else:
                unique_chunks.append(chunk)

        implogger.info(f"Found {len(duplicate_ids)} already existing IDs in {len(result.chunks)} collected chunks")
        if duplicate_ids:
            implogger.info("Duplicates found! Calling deduplication callback...")
            if self._deduplication_callback(result.source, len(duplicate_ids)):
                implogger.info('Duplicated chunks will be reimported as new...')
                self._service._delete_by_id(duplicate_ids)
                return result

        result.chunks = unique_chunks
        return result
src/pipeline/processors.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import os, re
3
+
4
+ from pathlib import Path
5
+ from transformers import AutoTokenizer
6
+
7
+ from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
8
+ from docling.datamodel.pipeline_options import PdfPipelineOptions, LayoutOptions
9
+ from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
10
+ from docling.document_converter import DocumentConverter, PdfFormatOption, InputFormat
11
+ from docling.chunking import HybridChunker
12
+ from docling_core.types.doc.document import DoclingDocument, TableItem
13
+
14
+ from .utils import *
15
+
16
+ from ..utils.lang import detect_language
17
+ from ..utils.logging import get_logger
18
+ from ..config import config
19
+
20
+ weblogger = get_logger("website_processor")
21
+ datalogger = get_logger("data_processor")
22
+
23
class ProcessorBase:
    """Shared document conversion, text extraction, cleaning and chunking logic for processors."""

    def __init__(self) -> None:
        """
        Initialize the base processor with document conversion and chunking tools.

        Sets up the PDF pipeline options, document converter, tokenizer, and chunker.
        Loads strategies for chunk preparation. The logging callback is read from
        ``config.dbapp['logging_callback']`` and falls back to a placeholder.
        """
        pipeline_options = PdfPipelineOptions(
            do_ocr = False,
            generate_page_images = False,

            do_layout_analysis = True,
            do_table_structure = True,
            do_cell_matching = True,

            layout_options=LayoutOptions(
                create_orphan_clusters = True,
                keep_empty_clusters = False,
                skip_cell_assignment = False,
            ),
        )
        self._converter: DocumentConverter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            },
        )
        tokenizer = AutoTokenizer.from_pretrained(config.processing.EMBEDDING_MODEL)
        self._chunker = HybridChunker(
            tokenizer=HuggingFaceTokenizer(
                tokenizer=tokenizer,
                max_tokens=config.processing.MAX_TOKENS
            ),
            serializer_provider=EnhansedSerializerProvider(),
            max_tokens=config.processing.MAX_TOKENS,
            merge_peers=True
        )
        self.strategies_processor = StrategiesProcessor()
        self._logging_callback = config.dbapp['logging_callback'] or logging_callback_placeholder


    def process(self, source: Path | str | None = None):
        """
        Abstract method to be implemented by subclasses for processing sources.

        Args:
            source (Path | str, optional): The source to process. Accepted here only so the
                base signature is compatible with the subclass overrides.

        Raises:
            NotImplementedError: If not overridden in a subclass.
        """
        raise NotImplementedError("This method is not implemented in ProcessorBase")


    def convert_to_txt(self, document: DoclingDocument) -> str:
        """
        Render the document body as plain text.

        Tables are exported to a dataframe and written with ``to_string``; every other
        item contributes its stripped ``text``. Items are separated by a blank line.

        Args:
            document (DoclingDocument): The document to render.

        Returns:
            str: The plain-text rendering.
        """
        plain_text = []
        for node, _ in document.iterate_items(root=document.body, with_groups=False):
            if isinstance(node, TableItem):
                df = node.export_to_dataframe(document)
                plain_text.append(df.to_string(index=False, na_rep=''))
            elif hasattr(node, 'text') and node.text:
                plain_text.append(node.text.strip())
        return '\n\n'.join(plain_text)


    def _prepare_chunks(self, document_name: str, document_content: str, chunks: list[str]) -> list[dict]:
        """
        Prepare chunks by applying strategies to generate properties for each chunk.

        Args:
            document_name (str): The name or identifier of the document.
            document_content (str): The full content of the document.
            chunks (list[str]): List of text chunks to prepare.

        Returns:
            list[dict]: List of dictionaries, each containing properties for a chunk
            (one key per registered strategy).
        """
        return [
            {
                prop: self.strategies_processor.apply_strategy(
                    strategy_name=prop,
                    arguments=StrategyArguments(document_name, document_content, chunk),
                )
                for prop in self.strategies_processor.list_strategies()
            }
            for chunk in chunks
        ]


    def _clean_content(self, document_content: str) -> str:
        """
        Clean the document content by removing garbage symbols and normalizing whitespace.

        Handles specific replacements for punctuation, symbols, and line breaks.

        Args:
            document_content (str): The raw document content to clean.

        Returns:
            str: The cleaned document content.
        """
        cleaned = re.sub(r'\s+/\s+', '/', document_content)
        cleaned = re.sub(r'\s+\.\s+', '.', cleaned)
        # NOTE(review): this turns "a, b" into "a.b" - looks unintended; confirm before changing.
        cleaned = re.sub(r',\s+', '.', cleaned)
        cleaned = re.sub(r'\s+\|\s+', ' ', cleaned)
        cleaned = re.sub(r'\/\s+', '/', cleaned)
        cleaned = re.sub(r'\s+/','/', cleaned)
        cleaned = re.sub(r'\s+\.', '.', cleaned)
        # NOTE(review): the next two substitutions drop the second number ("12, 2024" -> "12",
        # "3 / 4" -> "3"), and the one after them replaces a match with itself - confirm intent.
        cleaned = re.sub(r'(\d+)\s*,\s*(\d{4})', r'\1', cleaned)
        cleaned = re.sub(r'(\d+)\s*/\s*(\d+)', r'\1', cleaned)
        cleaned = re.sub(r'\.(\d{4})', r'.\1', cleaned)

        # Fold decomposed umlauts (base letter + combining diaeresis U+0308) into their
        # precomposed form. Written with escapes so the two forms cannot be confused in
        # an editor. Presumably this is what the visually identical literals meant - verify.
        cleaned = (
            cleaned.replace('a\u0308', '\u00e4')
            .replace('o\u0308', '\u00f6')
            .replace('u\u0308', '\u00fc')
        )

        cleaned = re.sub(r'\n\s*\n+', '\n\n', cleaned)
        cleaned = re.sub(r' +', ' ', cleaned)

        return cleaned


    def _extract_document_content(self, document: DoclingDocument) -> str:
        """
        Extract and compile text content from the document into a single string.

        Organizes text items by page, sorts them by position, and joins them
        while handling line breaks and spacing.

        Args:
            document (DoclingDocument): The document object to extract content from.

        Returns:
            str: The cleaned, compiled text content.
        """
        page_texts = defaultdict(list)
        for text_item in document.texts:
            if not text_item.text.strip():
                continue

            # Items without provenance carry no position and are left out.
            prov = text_item.prov[0] if text_item.prov else None
            if prov:
                bbox = prov.bbox
                page_texts[prov.page_no].append({
                    'text': text_item.text.strip(),
                    'top': bbox.t,
                    'left': bbox.l,
                    'bottom': bbox.b,
                })

        line_threshold = 15   # vertical gap that starts a new line
        block_threshold = 50  # vertical gap that additionally inserts an empty entry

        full_page_texts = []
        for page_number in sorted(page_texts.keys()):
            # Largest 'top' first, then left to right.
            text_items = sorted(
                page_texts[page_number],
                key=lambda text: (-text['top'], text['left']),
            )

            content = []
            last_bottom = None

            for item in text_items:
                if last_bottom is not None and (last_bottom - item['bottom'] > line_threshold):
                    if content:
                        full_page_texts.append(' '.join(content))
                        content = []

                    if last_bottom - item['bottom'] > block_threshold:
                        full_page_texts.append("")

                content.append(item['text'])
                last_bottom = item['bottom']

            if content:
                full_page_texts.append(' '.join(content))

        full_text = '\n\n'.join(full_page_texts)
        return self._clean_content(full_text)


    def _collect_chunks(self, document: DoclingDocument) -> list[str]:
        """
        Collect contextualized chunks from the document using the chunker.

        Args:
            document (DoclingDocument): The document to chunk.

        Returns:
            list[str]: List of enriched text chunks.
        """
        return [
            self._chunker.contextualize(chunk=base_chunk)
            for base_chunk in self._chunker.chunk(dl_doc=document)
        ]


    def _collect_chunks_fallback(self, document_content: str) -> list[str]:
        """
        Fallback method to chunk the document content manually using tokenization.

        Splits the content into overlapping chunks based on token limits. The overlap is
        50 tokens; when the chunk size does not exceed that, half the chunk size is used
        instead so the window always advances.

        Args:
            document_content (str): The full content extracted from document.

        Returns:
            list[str]: List of text chunks.

        Raises:
            ValueError: If the chunker's ``max_tokens`` is not positive.
        """
        tokenizer_wrapper = self._chunker.tokenizer
        # Unwrap the underlying tokenizer when the chunker holds a wrapper object.
        tokenizer = getattr(tokenizer_wrapper, 'tokenizer', tokenizer_wrapper)

        chunk_size = self._chunker.max_tokens
        if chunk_size <= 0:
            raise ValueError(f"Chunk size must be positive, got {chunk_size}")

        overlap = 50
        if chunk_size <= overlap:
            # A step of chunk_size - 50 would be zero or negative here.
            overlap = chunk_size // 2
        step = chunk_size - overlap

        tokens = tokenizer.encode(document_content)

        collected_chunks = []
        for start in range(0, len(tokens), step):
            chunk = tokenizer.decode(
                tokens[start:start + chunk_size],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            collected_chunks.append(chunk)

        return collected_chunks
256
+
257
+
258
class DocumentProcessor(ProcessorBase):
    """Processor for documents stored on the local file system."""

    def process(self, source: Path | str) -> ProcessingResult:
        """
        Convert one local document, split it into chunks and prepare them for import.

        The document is chunked with the hybrid chunker first. When that yields at most
        one chunk, the text is extracted manually, converted again as Markdown and
        re-chunked. The language is detected from the document content that was used.

        Args:
            source (Path | str): Path to the document to process.

        Returns:
            ProcessingResult: Prepared chunks, the document's base name and the detected
            language. When ``source`` is not an existing file, a result with
            ``chunks=None``, the original ``source`` and an empty language is returned.
        """
        # isfile() is False for missing paths as well as for directories.
        if not os.path.isfile(source):
            datalogger.error(f"Failed to initiate processing pipeline for source {source}: file does not exist")
            return ProcessingResult(source=source, chunks=None, lang='')

        name = os.path.basename(source)
        datalogger.info(f"Initiating processing pipeline for source {name}")
        self._logging_callback(f'Converting source {name}...', 20)
        converted = self._converter.convert(source)
        document = converted.document

        self._logging_callback(f'Collecting chunks from {name}...', 40)
        chunks = self._collect_chunks(document)
        content = MarkdownDocSerializer(doc=document).serialize().text

        if len(chunks) <= 1:
            # Chunking gave (almost) nothing: rebuild the text manually and retry.
            content = self._extract_document_content(document)
            reconverted = self._converter.convert_string(
                content=content,
                format=InputFormat.MD
            )
            document = reconverted.document
            chunks = self._collect_chunks(document)

        self._logging_callback(f'Preparing chunks for {name} for importing...', 60)
        prepared = self._prepare_chunks(name, content, chunks)

        datalogger.info(f"Successfully collected {len(prepared)} chunks from {name}")

        return ProcessingResult(
            chunks=prepared,
            source=name,
            lang=detect_language(content),
        )
src/pipeline/utilclasses.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+
2
+
3
+
src/pipeline/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .strategies_processor import StrategyArguments, StrategiesProcessor
2
+ from .serializer import EnhansedSerializerProvider
3
+ from .utilclasses import *
src/pipeline/utils/serializer.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer, ChunkingSerializerProvider
2
+ from docling_core.transforms.serializer.base import BaseTableSerializer, SerializationResult
3
+ from docling_core.transforms.serializer.common import create_ser_result
4
+ from docling_core.types.doc.document import RichTableCell
5
+
6
class EnhancedTableSerializer(BaseTableSerializer):
    """Serialize a table as a nested bullet list keyed by its first column.

    The first non-blank row of the grid is treated as the header row. Every
    following row becomes a top-level bullet built from its first cell, and
    each remaining non-empty cell becomes a sub-bullet labelled with the
    header of its column.
    """

    def serialize(self, *, item, doc_serializer, doc, **kwargs) -> SerializationResult:
        """Render ``item`` (a table) as bullet-list text.

        Args:
            item: Table item exposing ``self_ref`` and ``data.grid``.
            doc_serializer: Document serializer used for excluded refs and for
                serializing rich table cells.
            doc: Document used to resolve rich-cell references.
            **kwargs: Forwarded to the document serializer.

        Returns:
            SerializationResult: Empty text when the table is excluded, has no
            grid, or contains only blank rows; otherwise the bullet list.
        """
        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
            return create_ser_result(text='')

        grid = item.data.grid
        if not grid:
            return create_ser_result(text='')

        row_cells = []
        for row in grid:
            clean_row = []
            for cell in row:
                if isinstance(cell, RichTableCell):
                    ser = doc_serializer.serialize(item=cell.ref.resolve(doc), **kwargs)
                    clean_row.append(ser.text.strip())
                else:
                    clean_row.append((cell.text or "").strip())
            # Rows whose cells are all empty carry no information.
            if any(clean_row):
                row_cells.append(clean_row)

        # A grid made only of blank rows used to raise IndexError on
        # row_cells[0]; treat it like an empty table instead.
        if not row_cells:
            return create_ser_result(text='')

        headers, data_rows = row_cells[0], row_cells[1:]

        lines = []
        for row in data_rows:
            # A row needs a key cell plus at least one value column.
            if len(row) < 2 or not row[0].strip():
                continue

            main_key = row[0].strip().replace('\n', ' ')
            lines.append(f'- {main_key}:')

            for i, cell_text in enumerate(row[1:], start=1):
                value = cell_text.strip().replace('\n', ' ')
                if not value:
                    continue
                # Rows may be wider than the header row; such cells get an
                # empty label.
                sub_header = headers[i].strip().replace('\n', ' ') if i < len(headers) else ''
                lines.append(f' - {sub_header}: {value}')

            lines.append("")

        final_text = "\n".join(lines).rstrip()
        return create_ser_result(text=final_text, span_source=item)
51
+
52
+
53
class EnhansedSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs ``EnhancedTableSerializer`` into chunking."""

    def get_serializer(self, doc):
        """Return a ``ChunkingDocSerializer`` for ``doc`` using the enhanced table serializer."""
        table_serializer = EnhancedTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=table_serializer)
src/pipeline/utils/strategies_processor.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, importlib.util
2
+ from dataclasses import dataclass
3
+
4
+ from src.config import config
5
+ from src.utils.logging import get_logger
6
+
7
+ logger = get_logger('pipeline.strats')
8
+
9
+ @dataclass
10
+ class StrategyArguments:
11
+ name: str = None
12
+ content: str = None
13
+ chunk: str = None
14
+
15
+ class StrategiesProcessor:
16
+ def __init__(self) -> None:
17
+ os.makedirs(config.weaviate.STRATEGIES_PATH, exist_ok=True)
18
+
19
+ self._strategies: dict = self._load_strategies()
20
+
21
+ def list_strategies(self) -> list[str]:
22
+ return self._strategies.keys()
23
+
24
+ def apply_strategy(self, strategy_name: str, arguments: StrategyArguments | dict):
25
+ if strategy_name not in self._strategies.keys():
26
+ raise ValueError(f"Cannot apply strategy '{strategy_name}': strategy not found!")
27
+
28
+ try:
29
+ strategy = self._strategies[strategy_name]
30
+ run_result = None
31
+ if isinstance(arguments, StrategyArguments):
32
+ run_result = strategy.run(arguments.name, arguments.content, arguments.chunk)
33
+ else:
34
+ run_result = strategy.run(
35
+ arguments.get('document_name', ""),
36
+ arguments.get('document_content', ""),
37
+ arguments.get('chunk', None)
38
+ )
39
+ return run_result
40
+ except Exception as e:
41
+ raise RuntimeError(f"Cannot apply strategy '{strategy_name}': {e}")
42
+
43
+
44
+ def _load_strategies(self) -> dict:
45
+ loaded_strategies = dict()
46
+ for strat_file in os.listdir(config.weaviate.STRATEGIES_PATH):
47
+ strat_name = self._extract_strategy_name(strat_file)
48
+ if not strat_name: continue
49
+
50
+ strat_path = os.path.join(config.weaviate.STRATEGIES_PATH, strat_file)
51
+
52
+ spec = importlib.util.spec_from_file_location(
53
+ name=strat_name,
54
+ location=strat_path
55
+ )
56
+ strategy = importlib.util.module_from_spec(spec)
57
+ spec.loader.exec_module(strategy)
58
+
59
+ if not hasattr(strategy, 'run'):
60
+ logger.warning(f"Found strategy '{strat_name}' has no valid run() function!")
61
+ continue
62
+
63
+ loaded_strategies[strat_name] = strategy
64
+
65
+ logger.info(f"Loaded {len(loaded_strategies.keys())} strategies")
66
+ return loaded_strategies
67
+
68
+
69
+ def _extract_strategy_name(self, strat_file: str) -> str:
70
+ match = re.fullmatch(r'^strat_(.*)\.py$', strat_file)
71
+ return match.group(1) if match else None
72
+
73
+
74
+
src/pipeline/utils/utilclasses.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
def logging_callback_placeholder(*_unused):
    """No-op logging callback: accepts any positional arguments and ignores them."""
    return None
5
+
6
def deduplication_callback_placeholder(*_unused) -> bool:
    """Deduplication callback stand-in: ignores its arguments and always returns False."""
    is_duplicate = False
    return is_duplicate
8
+
9
+ @dataclass
10
+ class ProcessingResult:
11
+ chunks: list[dict]
12
+ source: str
13
+ lang: str
src/rag/__init__.py ADDED
File without changes
src/rag/agent_chain.py ADDED
@@ -0,0 +1,1022 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.runnables import RunnableConfig
2
+ from langsmith import traceable
3
+ from langchain.tools import tool
4
+ from langchain.agents import create_agent
5
+ from langchain_core.messages import (
6
+ HumanMessage,
7
+ AIMessage,
8
+ SystemMessage,
9
+ )
10
+ from langchain.agents.middleware import ModelFallbackMiddleware
11
+ from langchain.agents.structured_output import ProviderStrategy
12
+
13
+ import uuid
14
+ import json
15
+ import os
16
+ import re
17
+ import random
18
+ import glob
19
+ from datetime import datetime
20
+
21
+ from src.database.weavservice import WeaviateService
22
+
23
+ from src.rag.utilclasses import *
24
+ from src.const.agent_response_constants import *
25
+ from src.rag.middleware import AgentChainMiddleware as chainmdw
26
+ from src.rag.prompts import PromptConfigurator as promptconf
27
+ from src.rag.models import ModelConfigurator as modelconf
28
+ from src.rag.input_handler import InputHandler
29
+ from src.rag.response_formatter import ResponseFormatter
30
+ from src.rag.scope_guardian import ScopeGuardian
31
+ # from src.rag.quality_score_handler import QualityEvaluationResult, QualityScoreHandler
32
+ from src.rag.language_detection import LanguageDetector
33
+
34
+ from src.utils.logging import get_logger
35
+ from src.utils.lang import get_language_name
36
+ from src.config import config
37
+
38
+ from ..cache.cache import Cache
39
+
40
+ chain_logger = get_logger('agent_chain')
41
+
42
+
43
+ class ExecutiveAgentChain:
44
def __init__(self, language: str = 'en', session_id: str | None = None) -> None:
    """Set up services, agents and per-session state for one conversation.

    Args:
        language: Language code stored as both the initial and the current
            conversation language.
        session_id: Identifier to reuse for this session; a random UUID is
            generated when it is missing or empty.
    """
    self._initial_language = self._stored_language = language
    self._dbservice = WeaviateService()
    self._agents, self._config = self._init_agents()
    self._conversation_history = []
    self._cache = Cache.get_cache()

    # Response-quality scoring (QualityScoreHandler) stays switched off: the
    # extra model call adds latency and has not been reliable enough to
    # justify it.
    self._language_detector = LanguageDetector()

    # One identifier doubles as the session id and the user id.
    session_key = session_id or str(uuid.uuid4())
    self._user_id = session_key

    # Conversation state, including the user profile collected along the way.
    profile: ConversationState = {'session_id': session_key, 'user_id': session_key}
    profile.update(dict.fromkeys((
        'user_language',
        'user_name',
        'experience_years',
        'leadership_years',
        'field',
        'interest',
        'qualification_level',
    )))
    profile['program_interest'] = []
    profile['suggested_program'] = None
    profile['handover_requested'] = None
    profile['topics_discussed'] = []
    profile['preferences_known'] = False
    self._conversation_state = profile

    # Scope-violation counters used for escalation.
    self._scope_violation_counts: dict[str, int] = {}
    self._aggressive_violation_count = 0

    chain_logger.info(f"Initialized new Agent Chain for language '{language}' with user_id: {self._user_id}")
84
+
85
def _retrieve_context(self, query: str, program: str, language: str = None):
    """
    Send the query to the vector database to retrieve additional information about the program.

    Args:
        query: Keywords depicting information you want to retrieve in the primary language.
        program: Name of the program (either 'emba', 'iemba' or 'emba x') for which the information is requested.
        language: Optional parameter (either 'en' for English language or 'de' for German language). This parameter selects the language of the database to query from. The input query must be written in the same language as the selected language. Use this parameter only if there's not enough information in your main language.
    """
    # NOTE: this function is registered as a tool with parse_docstring=True,
    # so the docstring text and the signature above are part of the tool
    # definition and are deliberately left unchanged.

    # Anything other than 'en'/'de' falls back to the chain's initial language.
    lang = language if language in ['en', 'de'] else self._initial_language

    # The former `try/except Exception as e: raise e` wrapper only re-raised;
    # errors now propagate directly with an unmodified traceback.
    response, _ = self._dbservice.query(
        query=query,
        lang=lang,
        limit=config.get('TOP_K_RETRIEVAL'),
        property_filters={
            'programs': [program],
        },
    )
    # Join the 'body' property of every hit into a single context string.
    return '\n\n'.join(doc.properties.get('body', '') for doc in response.objects)
108
+
109
def _call_emba_agent(self, query: str) -> str:
    """
    Invokes the EMBA support agent to retrieve more detailed information about the EMBA program.

    Args:
        query: Query to the EMBA support agent. Provide collected user data in the query if possible.
    """
    # NOTE: registered as a tool with parse_docstring=True; the docstring
    # above is part of the tool definition and is kept verbatim.
    try:
        structured_response = self._query(
            agent=self._agents['emba'],
            messages=[HumanMessage(query)],
            thread_id=f"emba_{hash(query)}",
        )
        return structured_response.response
    except Exception as e:
        chain_logger.error(f"EMBA Agent error: {e}")
        # Chain the cause so the underlying failure stays visible to callers.
        raise RuntimeError("Unable to retrieve EMBA information at this time.") from e
126
+
127
def _call_iemba_agent(self, query: str) -> str:
    """
    Invokes the IEMBA support agent to retrieve more detailed information about the IEMBA program.

    Args:
        query: Query to the IEMBA support agent. Provide collected user data in the query if possible.
    """
    # NOTE: registered as a tool with parse_docstring=True; the docstring
    # above is part of the tool definition and is kept verbatim.
    try:
        structured_response = self._query(
            agent=self._agents['iemba'],
            messages=[HumanMessage(query)],
            # Agent-specific prefix: the copy-pasted "emba_" prefix gave the
            # same thread id as the EMBA agent for an identical query.
            thread_id=f"iemba_{hash(query)}",
        )
        return structured_response.response
    except Exception as e:
        chain_logger.error(f"IEMBA Agent error: {e}")
        # Chain the cause so the underlying failure stays visible to callers.
        raise RuntimeError("Unable to retrieve IEMBA information at this time.") from e
144
+
145
def _call_embax_agent(self, query: str) -> str:
    """
    Invokes the emba X support agent to retrieve more detailed information about the emba X program.

    Args:
        query: Query to the emba X support agent. Provide collected user data in the query if possible.
    """
    # NOTE: registered as a tool with parse_docstring=True; the docstring
    # above is part of the tool definition and is kept verbatim.
    try:
        structured_response = self._query(
            agent=self._agents['embax'],
            messages=[HumanMessage(query)],
            # Agent-specific prefix: the copy-pasted "emba_" prefix gave the
            # same thread id as the EMBA agent for an identical query.
            thread_id=f"embax_{hash(query)}",
        )
        return structured_response.response
    except Exception as e:
        chain_logger.error(f"emba X Agent error: {e}")
        # Chain the cause so the underlying failure stays visible to callers.
        raise RuntimeError("Unable to retrieve emba X information at this time.") from e
162
+
163
def _init_agents(self):
    """Build the lead agent and the three programme sub-agents.

    Returns:
        tuple: ``(agents, runnable_config)`` where ``agents`` maps 'lead',
        'emba', 'iemba' and 'embax' to their agent objects and
        ``runnable_config`` is the RunnableConfig created here.
    """
    # Named runnable_config (not `config`) so the imported project `config`
    # object is not shadowed inside this method.
    runnable_config: RunnableConfig = {
        'configurable': {'thread_id': 0}
    }
    fallback_middleware = ModelFallbackMiddleware(
        *modelconf.get_fallback_models()
    )

    # Tool given to the sub-agents.
    tool_retrieve_context = tool(
        name_or_callable='retrieve_context',
        runnable=self._retrieve_context,
        return_direct=False,
        parse_docstring=True,
    )

    # Tools given to the lead agent: one per programme sub-agent.
    agent_call_targets = (
        ('call_emba_agent', self._call_emba_agent),
        ('call_iemba_agent', self._call_iemba_agent),
        ('call_embax_agent', self._call_embax_agent),
    )
    tools_agent_calling = [
        tool(
            name_or_callable=tool_name,
            runnable=target,
            return_direct=False,
            parse_docstring=True,
        )
        for tool_name, target in agent_call_targets
    ]

    agents = {
        'lead': create_agent(
            name="lead_agent",
            model=modelconf.get_main_agent_model(),
            tools=tools_agent_calling,
            state_schema=LeadInformationState,
            system_prompt=promptconf.get_configured_agent_prompt('lead', language=self._initial_language),
            middleware=[
                chainmdw.get_tool_wrapper(),
                chainmdw.get_model_wrapper(),
                fallback_middleware,
            ],
            context_schema=AgentContext,
            response_format=ProviderStrategy(
                StructuredAgentResponse
            ),
        ),
    }
    for agent in ['emba', 'iemba', 'embax']:
        agents[agent] = create_agent(
            name=f"{agent}_agent",
            model=modelconf.get_subagent_model(),
            tools=[tool_retrieve_context],
            state_schema=LeadInformationState,
            system_prompt=promptconf.get_configured_agent_prompt(agent, language=self._initial_language),
            # Middleware order intentionally left exactly as it was
            # (fallback first for sub-agents).
            middleware=[
                fallback_middleware,
                chainmdw.get_tool_wrapper(),
                chainmdw.get_model_wrapper(),
            ],
            context_schema=AgentContext,
        )
    return agents, runnable_config
229
+
230
+ def _extract_experience_years(self, conversation: str) -> int | None:
231
+ """Extract years of professional experience from conversation text."""
232
+ # Look for patterns like "10 years", "5 years experience", etc.
233
+ patterns = [
234
+ r'(\d+)\s*years?\s*(?:of\s*)?(?:experience|work)',
235
+ r'(\d+)\s*years?\s*in\s*(?:the\s*)?(?:field|industry)',
236
+ r'working\s*for\s*(\d+)\s*years?',
237
+ r'(\d+)\s*Jahre\s*(?:Erfahrung|Berufserfahrung)', # German
238
+ ]
239
+ for pattern in patterns:
240
+ match = re.search(pattern, conversation, re.IGNORECASE)
241
+ if match:
242
+ return int(match.group(1))
243
+ return None
244
+
245
+ def _extract_leadership_years(self, conversation: str) -> int | None:
246
+ """Extract years of leadership experience from conversation text."""
247
+ patterns = [
248
+ r'(\d+)\s*years?\s*(?:of\s*)?(?:leadership|management|managing)',
249
+ r'(?:lead|led|manage|managed)\s*(?:for\s*)?(\d+)\s*years?',
250
+ r'(\d+)\s*Jahre\s*(?:Führungserfahrung|Führung)', # German
251
+ ]
252
+ for pattern in patterns:
253
+ match = re.search(pattern, conversation, re.IGNORECASE)
254
+ if match:
255
+ return int(match.group(1))
256
+ return None
257
+
258
+ def _extract_field(self, conversation: str) -> str | None:
259
+ """Extract professional field/industry from conversation text."""
260
+ # Common fields mentioned in executive education
261
+ fields = [
262
+ 'finance', 'banking', 'technology', 'tech', 'IT', 'healthcare',
263
+ 'consulting', 'manufacturing', 'retail', 'marketing', 'sales',
264
+ 'engineering', 'pharma', 'telecommunications', 'energy',
265
+ 'Finanzwesen', 'Technologie', 'Gesundheitswesen', 'Beratung' # German
266
+ ]
267
+ conversation_lower = conversation.lower()
268
+ for field in fields:
269
+ if field.lower() in conversation_lower:
270
+ return field.capitalize()
271
+ return None
272
+
273
+ def _extract_interest(self, conversation: str) -> str | None:
274
+ """Extract content interests from conversation text."""
275
+ # Look for interest indicators
276
+ interests = [
277
+ 'strategy', 'innovation', 'leadership', 'digital transformation',
278
+ 'finance', 'operations', 'marketing', 'entrepreneurship',
279
+ 'social impact', 'technology', 'management',
280
+ 'Strategie', 'Innovation', 'Führung', 'Digitalisierung' # German
281
+ ]
282
+ conversation_lower = conversation.lower()
283
+ found_interests = [interest for interest in interests
284
+ if interest.lower() in conversation_lower]
285
+ return ', '.join(found_interests) if found_interests else None
286
+
287
+ def _extract_name(self, conversation: str) -> str | None:
288
+ """Extract user's name from conversation text."""
289
+ patterns = [
290
+ r"(?:my name is|i'm|i am|call me)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
291
+ r"(?:this is|it's)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)",
292
+ r"(?:ich heiße|mein Name ist|ich bin)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)", # German
293
+ ]
294
+ for pattern in patterns:
295
+ match = re.search(pattern, conversation, re.IGNORECASE)
296
+ if match:
297
+ name = match.group(1).strip()
298
+ # Filter out common words that might be误ly matched
299
+ excluded = ['interested', 'looking', 'working', 'searching', 'asking']
300
+ if name.lower() not in excluded:
301
+ return name
302
+ return None
303
+
304
+ def _detect_handover_request(self, conversation: str) -> bool:
305
+ """Detect if user requested appointment, callback, or contact."""
306
+ # Keywords indicating handover request
307
+ handover_keywords = [
308
+ 'appointment', 'call me', 'contact me', 'schedule', 'meeting',
309
+ 'callback', 'reach out', 'follow up', 'get in touch', 'speak with',
310
+ 'talk to', 'consultation', 'discuss with', 'meet with',
311
+ 'Termin', 'Rückruf', 'kontaktieren', 'Gespräch', 'anrufen', # German
312
+ 'zurückrufen', 'Beratung', 'treffen'
313
+ ]
314
+ conversation_lower = conversation.lower()
315
+ return any(keyword.lower() in conversation_lower for keyword in handover_keywords)
316
+
317
+ def _previous_response_offered_booking(self) -> bool:
318
+ """Return True if the latest assistant turn offered booking as a next step."""
319
+ booking_offer_terms = [
320
+ "appointment slots",
321
+ "book an appointment",
322
+ "book a consultation",
323
+ "appointment booking",
324
+ "show you available appointments",
325
+ "show appointment options",
326
+ "terminbuchung",
327
+ "termin buchen",
328
+ "termine anzeigen",
329
+ "verfügbare termine",
330
+ "beratungstermin",
331
+ ]
332
+
333
+ for message in reversed(self._conversation_history):
334
+ if not isinstance(message, AIMessage):
335
+ continue
336
+ content = getattr(message, "content", "") or getattr(message, "text", "")
337
+ if isinstance(content, list):
338
+ content = " ".join(str(part) for part in content)
339
+ content_lower = str(content).lower()
340
+ return any(term in content_lower for term in booking_offer_terms)
341
+
342
+ return False
343
+
344
+ def _get_latest_ai_message_content(self, skip_latest: bool = False) -> str:
345
+ """Return the latest assistant message content from conversation history."""
346
+ ai_messages_seen = 0
347
+
348
+ for message in reversed(self._conversation_history):
349
+ if not isinstance(message, AIMessage):
350
+ continue
351
+
352
+ ai_messages_seen += 1
353
+ if skip_latest and ai_messages_seen == 1:
354
+ continue
355
+
356
+ content = getattr(message, "content", "") or getattr(message, "text", "")
357
+ if isinstance(content, list):
358
+ return " ".join(str(part) for part in content)
359
+ return str(content)
360
+
361
+ return ""
362
+
363
+ def _is_booking_preference_follow_up(self, query: str) -> bool:
364
+ """Detect short follow-up answers that continue an active booking flow."""
365
+ query_lower = query.lower().strip()
366
+ if not query_lower:
367
+ return False
368
+
369
+ preference_terms = [
370
+ "online",
371
+ "on-site",
372
+ "onsite",
373
+ "in person",
374
+ "in-person",
375
+ "st.gallen",
376
+ "st. gallen",
377
+ "morning",
378
+ "mornings",
379
+ "afternoon",
380
+ "afternoons",
381
+ "evening",
382
+ "beginning of the week",
383
+ "start of the week",
384
+ "end of the week",
385
+ "monday",
386
+ "tuesday",
387
+ "wednesday",
388
+ "thursday",
389
+ "friday",
390
+ "morgens",
391
+ "vormittag",
392
+ "vormittags",
393
+ "nachmittag",
394
+ "nachmittags",
395
+ "abends",
396
+ "wochenanfang",
397
+ "anfang der woche",
398
+ "ende der woche",
399
+ "montag",
400
+ "dienstag",
401
+ "mittwoch",
402
+ "donnerstag",
403
+ "freitag",
404
+ "vor ort",
405
+ "vor-ort",
406
+ "persönlich",
407
+ "persoenlich",
408
+ "hybrid",
409
+ ]
410
+
411
+ if any(term in query_lower for term in preference_terms):
412
+ return True
413
+
414
+ return False
415
+
416
+ def _previous_response_requested_booking_preferences(self) -> bool:
417
+ """Return True when the previous assistant turn asked clarifying booking questions."""
418
+ content_lower = self._get_latest_ai_message_content().lower()
419
+ if not content_lower:
420
+ return False
421
+
422
+ booking_context_terms = [
423
+ "appointment options",
424
+ "available appointments",
425
+ "available slots",
426
+ "appointment slots",
427
+ "online-terminoptionen",
428
+ "terminoptionen",
429
+ "verfügbare slots",
430
+ "verfügbare termine",
431
+ "beratungsgespräch",
432
+ "beratung",
433
+ ]
434
+ clarification_terms = [
435
+ "do you prefer",
436
+ "would you prefer",
437
+ "which programme",
438
+ "which program",
439
+ "one short question",
440
+ "final question",
441
+ "when i know this",
442
+ "bitte noch kurz",
443
+ "eine kurze rückfrage",
444
+ "eine kurze letzte frage",
445
+ "bevorzugen sie",
446
+ "haben sie eine tagespräferenz",
447
+ "sobald ich das weiss",
448
+ "damit die slots besser passen",
449
+ ]
450
+
451
+ return (
452
+ any(term in content_lower for term in booking_context_terms)
453
+ and any(term in content_lower for term in clarification_terms)
454
+ )
455
+
456
+ def _response_commits_to_showing_booking_widget(self, response: str) -> bool:
457
+ """Detect when the assistant says booking options are being shown now."""
458
+ response_lower = response.lower()
459
+
460
+ positive_terms = [
461
+ "i can show you",
462
+ "contact details and available appointment slots are shown below",
463
+ "appointment options are shown below",
464
+ "available slots are shown below",
465
+ "i can now show you",
466
+ "ich kann ihnen nun",
467
+ "ich kann ihnen jetzt",
468
+ "unten werden ihnen",
469
+ "unten finden sie",
470
+ "unten sehen sie",
471
+ "terminoptionen anzeigen",
472
+ "verfügbaren slots",
473
+ "verfügbaren termine",
474
+ ]
475
+ defer_terms = [
476
+ "if you would like",
477
+ "if you later wish",
478
+ "you can ask me",
479
+ "if that would be helpful",
480
+ "sobald ich das weiss",
481
+ "wenn ich das weiss",
482
+ "damit die slots besser passen",
483
+ "bitte noch kurz",
484
+ "eine kurze rückfrage",
485
+ "eine kurze letzte frage",
486
+ "bevorzugen sie",
487
+ "have you got a preference",
488
+ "do you prefer",
489
+ "would you prefer",
490
+ "which programme",
491
+ "which program",
492
+ ]
493
+
494
+ return (
495
+ any(term in response_lower for term in positive_terms)
496
+ and not any(term in response_lower for term in defer_terms)
497
+ )
498
+ def _is_explicit_booking_intent(self, query: str) -> bool:
499
+ """Detect whether the user is actively asking to book or accepting a booking offer."""
500
+ query_lower = query.lower()
501
+ direct_booking_terms = [
502
+ "book",
503
+ "schedule",
504
+ "appointment",
505
+ "consultation",
506
+ "need a consultation",
507
+ "personal consultation",
508
+ "speak with",
509
+ "talk to an advisor",
510
+ "talk to admissions",
511
+ "connect me",
512
+ "show me available",
513
+ "show appointment",
514
+ "available slots",
515
+ "termin",
516
+ "termin buchen",
517
+ "termin vereinbaren",
518
+ "beratungstermin",
519
+ "beratungsgespräch",
520
+ "ich brauche eine beratung",
521
+ "ich möchte eine beratung",
522
+ "ich will eine beratung",
523
+ "beratung für",
524
+ "persönliche beratung",
525
+ "persoenliche beratung",
526
+ "mit jemandem sprechen",
527
+ "mit admissions sprechen",
528
+ "mit der zulassung sprechen",
529
+ "termine anzeigen",
530
+ "verfügbare termine",
531
+ ]
532
+ rejection_terms = [
533
+ "do not want",
534
+ "don't want",
535
+ "no appointment",
536
+ "not book",
537
+ "not schedule",
538
+ "no thanks",
539
+ "no thank you",
540
+ "kein termin",
541
+ "keinen termin",
542
+ "keine beratung",
543
+ "nicht buchen",
544
+ "nicht vereinbaren",
545
+ "nein danke",
546
+ ]
547
+ acceptance_terms = [
548
+ "yes",
549
+ "yes please",
550
+ "please do",
551
+ "that would be helpful",
552
+ "show me",
553
+ "ja",
554
+ "ja bitte",
555
+ "gerne",
556
+ "bitte",
557
+ "mach das",
558
+ "zeige",
559
+ ]
560
+
561
+ def contains_term(term: str) -> bool:
562
+ if term in {"yes", "ja", "bitte"}:
563
+ return re.search(rf"\b{re.escape(term)}\b", query_lower) is not None
564
+ return term in query_lower
565
+
566
+ if any(contains_term(term) for term in rejection_terms):
567
+ return False
568
+
569
+ if any(contains_term(term) for term in direct_booking_terms):
570
+ return True
571
+
572
+ return (
573
+ self._previous_response_offered_booking()
574
+ and any(contains_term(term) for term in acceptance_terms)
575
+ )
576
+
577
+ def _determine_suggested_program(self) -> str | None:
578
+ """Determine recommended program based on user profile."""
579
+ state = self._conversation_state
580
+
581
+ # If program interest was explicitly mentioned
582
+ if state['program_interest']:
583
+ return state['program_interest'][0]
584
+
585
+ # Make recommendation based on profile
586
+ experience = state.get('experience_years', 0) or 0
587
+ leadership = state.get('leadership_years', 0) or 0
588
+
589
+ # EMBA: 5+ years experience, 2+ years leadership
590
+ if experience >= 5 and leadership >= 2:
591
+ return 'EMBA'
592
+ # IEMBA: International focus, 3+ years experience
593
+ elif experience >= 3:
594
+ return 'IEMBA'
595
+ # EMBA X: Digital/Innovation focus
596
+ elif state.get('interest') and any(kw in state.get('interest', '').lower()
597
+ for kw in ['digital', 'innovation', 'technology']):
598
+ return 'emba X'
599
+
600
+ return None
601
+
602
def _update_conversation_state(self, user_query: str, agent_response: str) -> None:
    """Update the conversation state from the latest user/assistant exchange.

    Does nothing when ``config.convstate.TRACK_USER_PROFILE`` is falsy.
    Profile fields are filled only while still empty. Handover requests are
    detected in the user's text only. Programme mentions are detected in the
    combined text as whole terms, so "IEMBA" or "EMBA X" no longer also
    registers as "EMBA" (which, being first in the list, used to become the
    suggested programme).

    Args:
        user_query: The user's message.
        agent_response: The assistant's reply to that message.
    """
    if not config.convstate.TRACK_USER_PROFILE:
        return

    state = self._conversation_state

    # Combine query and response for analysis
    conversation_text = f"{user_query} {agent_response}"

    # (state key, extractor, label used in the log message)
    profile_extractors = (
        ('experience_years', self._extract_experience_years, 'experience years'),
        ('leadership_years', self._extract_leadership_years, 'leadership years'),
        ('field', self._extract_field, 'field'),
        ('interest', self._extract_interest, 'interest'),
        ('user_name', self._extract_name, 'name'),
    )
    for key, extract, label in profile_extractors:
        if state.get(key):
            continue  # already known; keep the first value found
        value = extract(conversation_text)
        if value:
            state[key] = value
            chain_logger.info(f"Extracted {label}: {value}")

    # Detect handover request from the user only; assistant soft offers should not count.
    if self._detect_handover_request(user_query):
        state['handover_requested'] = True
        chain_logger.info("Handover request detected")

    # Check for program mentions. Plain substring tests cannot tell 'emba'
    # apart from 'iemba' / 'emba x', hence the word-boundary patterns.
    program_patterns = (
        ('EMBA', r'\bemba\b(?![\s-]*x\b)'),
        ('IEMBA', r'\biemba\b'),
        ('EMBA X', r'\bemba[\s-]*x\b'),
    )
    for program, pattern in program_patterns:
        if program in state['program_interest']:
            continue
        if re.search(pattern, conversation_text, re.IGNORECASE):
            state['program_interest'].append(program)

    # Update suggested program
    suggested = self._determine_suggested_program()
    if suggested and not state.get('suggested_program'):
        state['suggested_program'] = suggested
        chain_logger.info(f"Suggested program: {suggested}")
659
+
660
def _log_user_profile(self) -> None:
    """Write the current user profile to a timestamped JSON file.

    Does nothing when ``config.convstate.TRACK_USER_PROFILE`` is falsy. Files
    go to ``logs/user_profiles`` as ``profile_<user_id>_<timestamp>.json``.
    Any failure is logged and swallowed.
    """
    if not config.convstate.TRACK_USER_PROFILE:
        return

    try:
        # Make sure the target directory exists.
        target_dir = os.path.join('logs', 'user_profiles')
        os.makedirs(target_dir, exist_ok=True)

        state = self._conversation_state
        profile_data = {
            'session_id': state['session_id'],
            'user_id': state['user_id'],
            'name': state.get('user_name'),
            'timestamp': datetime.now().isoformat(),
            'experience_years': state.get('experience_years'),
            'leadership_years': state.get('leadership_years'),
            'field': state.get('field'),
            'interest': state.get('interest'),
            'suggested_program': state.get('suggested_program'),
            'handover': state.get('handover_requested'),
            'user_language': state.get('user_language'),
            'program_interest': state.get('program_interest', []),
        }

        # File name carries the user id and a second-resolution timestamp.
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file = os.path.join(target_dir, f'profile_{self._user_id}_{stamp}.json')

        with open(log_file, 'w', encoding='utf-8') as handle:
            json.dump(profile_data, handle, indent=2, ensure_ascii=False)

        chain_logger.info(f"User profile logged to {log_file}")

    except Exception as e:
        chain_logger.error(f"Failed to log user profile: {e}")
698
+
699
def wipe_session_data(self) -> None:
    """Delete in-memory session data and on-disk profile files (GDPR withdrawal).

    Step 1 clears the conversation history, resets the profile-related
    keys of the conversation state and resets both violation counters.
    Keys not listed below (for example ``session_id``) are left untouched.

    Step 2 removes every ``logs/user_profiles/profile_<user_id>_*.json``
    file. It is skipped with a warning when no user id is set. Deletion
    failures are logged per file and do not abort the loop.
    """

    # --- 1) In-memory wipe ---
    self._conversation_history = []
    self._conversation_state.update({
        'user_language': None,
        'user_name': None,
        'experience_years': None,
        'leadership_years': None,
        'field': None,
        'interest': None,
        'qualification_level': None,
        'program_interest': [],
        'suggested_program': None,
        'handover_requested': None,
        'topics_discussed': [],
        'preferences_known': False
    })
    self._scope_violation_counts = {}
    self._aggressive_violation_count = 0

    # --- 2) On-disk wipe (delete profile_<user_id>_*.json) ---
    if not self._user_id:
        chain_logger.warning("wipe_session_data called without user_id – skipping file deletion")
        return

    # Escape the user id so glob metacharacters in it ('*', '?', '[') are
    # matched literally; otherwise the pattern could match, and delete,
    # profile files that belong to other users.
    escaped_user_id = glob.escape(str(self._user_id))
    pattern = os.path.join(
        "logs",
        "user_profiles",
        f"profile_{escaped_user_id}_*.json"
    )

    for path in glob.glob(pattern):
        try:
            os.remove(path)
            chain_logger.info(f"Deleted profile file: {path}")
        except OSError as e:
            chain_logger.error(f"Failed to delete {path}: {e}")
738
+
739
def generate_greeting(self) -> str:
    """Return a randomly chosen greeting in the currently stored language."""
    candidates = GREETING_MESSAGES[self._stored_language]
    return random.choice(candidates)
742
+
743
@traceable
def query(self, query: str) -> LeadAgentQueryResponse:
    """
    Phase 1: Validation, Scope-Check and language detection.
    Does not call the agent directly; a query that passes every check is
    handed to ``_query_lead``.

    Steps, in order:
      1. Stop when the history has reached ``MAX_CONVERSATION_TURNS``.
      2. Normalise/interpret the input via ``InputHandler.process_input``.
      3. Determine the response language (explicit switch, language-neutral
         input, language lock, or detection restricted to 'de'/'en').
      4. Scope check; out-of-scope queries are answered with a redirect or
         escalation message and recorded in the history.
      5. Return a cached answer when caching is enabled and one exists.
      6. Otherwise run the lead agent and cache the answer if allowed.

    Args:
        query: Raw user message.

    Returns:
        The response object for this turn.
    """
    # Remember fallback language
    current_language = self._stored_language

    if len(self._conversation_history) >= config.convstate.MAX_CONVERSATION_TURNS:
        return LeadAgentQueryResponse(
            response=CONVERSATION_END_MESSAGE[current_language],
            language=current_language,
            max_turns_reached=True,
            relevant_programs=[],
            processed_query=query,
        )

    # 2. Input Processing
    processed_query, is_valid = InputHandler.process_input(
        query,
        [msg for msg in self._conversation_history if isinstance(msg, (HumanMessage, AIMessage))]
    )

    if not is_valid or not processed_query:
        chain_logger.warning(f"Invalid input received: '{query}'")
        return LeadAgentQueryResponse(
            response=NOT_VALID_QUERY_MESSAGE[self._stored_language],
            language=current_language,
            processed_query=query,
        )

    # Log check
    if processed_query != query:
        chain_logger.info(f"Interpreted input '{query}' as '{processed_query}'")

    # 3. Language Detection
    # First: Check for explicit language switch request (overrides lock)
    explicit_switch = self._language_detector.detect_explicit_switch_request(processed_query)
    if explicit_switch:
        self._stored_language = explicit_switch
        current_language = explicit_switch
        self._conversation_state['user_language'] = explicit_switch
    elif InputHandler.is_numeric_input(query):
        # A bare number carries no language information. InputHandler has
        # replaced it with a synthetic *English* sentence, so running
        # detection on that sentence would wrongly flip the session
        # language to English. Keep the stored language instead.
        chain_logger.info(
            f"Skipping language re-detection for numeric input: '{query}'"
        )
        current_language = self._stored_language
    elif self._language_detector.is_language_neutral_program_reference(processed_query):
        chain_logger.info(
            f"Skipping language re-detection for language-neutral programme reference: '{processed_query}'"
        )
        current_language = self._stored_language
    else:
        # Count user messages in conversation history
        user_message_count = len([m for m in self._conversation_history if isinstance(m, HumanMessage)])

        # Lock language after N user messages (allows language switch early in conversation)
        lang_lock_n = config.convstate.LOCK_LANGUAGE_AFTER_N_MESSAGES
        if lang_lock_n > 0 and user_message_count >= lang_lock_n:
            chain_logger.info(f"Language locked to '{self._stored_language}' (after {user_message_count} messages)")
            current_language = self._stored_language
        else:
            detected_language = self._language_detector.detect_language(processed_query)
            self._conversation_state['user_language'] = detected_language

            # Language validation
            if detected_language in ['de', 'en']:
                self._stored_language = detected_language
                current_language = detected_language
            else:
                # Unsupported language: answer with the fallback message in
                # the previously stored language.
                chain_logger.info("Invalid language detected.")
                return LeadAgentQueryResponse(
                    response=LANGUAGE_FALLBACK_MESSAGE[current_language],
                    language=current_language,
                    processed_query=processed_query,
                )

    # 4. Scope Check
    scope_type = ScopeGuardian.check_scope(processed_query, current_language)

    if scope_type != 'on_topic':
        chain_logger.info(f"Out-of-scope query detected: {scope_type}")
        # Aggressive violations use their own counter; all other scope
        # types are counted per type.
        if scope_type == 'aggressive':
            self._aggressive_violation_count += 1
            attempt_count = self._aggressive_violation_count
        else:
            self._scope_violation_counts[scope_type] = self._scope_violation_counts.get(scope_type, 0) + 1
            attempt_count = self._scope_violation_counts[scope_type]

        should_escalate, escalation_type = ScopeGuardian.should_escalate(
            processed_query, scope_type, attempt_count
        )

        if should_escalate:
            redirect_msg = ScopeGuardian.get_escalation_message(escalation_type, current_language)
        else:
            redirect_msg = ScopeGuardian.get_redirect_message(scope_type, current_language)

        self._conversation_history.append(HumanMessage(processed_query))
        self._conversation_history.append(AIMessage(redirect_msg))

        return LeadAgentQueryResponse(
            response=redirect_msg,
            language=current_language,
            processed_query=processed_query,
            appointment_requested=False,
            show_booking_widget=False,
        )

    # 5. Check if cached data already exists for this session
    # NOTE: the cache is keyed on the raw query (unchanged behaviour).
    if config.cache.ENABLED:
        cached_data = self._cache.get(query, current_language, self._user_id)
        if cached_data and isinstance(cached_data, dict):
            return LeadAgentQueryResponse(
                response=cached_data["response"],
                language=current_language,
                processed_query=processed_query,
                appointment_requested=cached_data.get("appointment_requested", False),
                show_booking_widget=cached_data.get("show_booking_widget", False),
                relevant_programs=cached_data.get("relevant_programs", []),
            )

    # 6. Preprocessing is finished - the agent has to answer the query.
    # Pass the validated/interpreted query, not the raw input, so the
    # normalisation and numeric interpretation done above reach the agent.
    response = self._query_lead(processed_query)

    if config.cache.ENABLED and response.should_cache:
        self._cache.set(
            key=query,
            value={
                "response": response.response,
                "appointment_requested": response.appointment_requested,
                "show_booking_widget": response.show_booking_widget,
                "relevant_programs": response.relevant_programs,
            },
            language=current_language,
            session_id=self._user_id,
        )

    return response
878
+
879
+
880
def _query_lead(self, preprocessed_query: str) -> LeadAgentQueryResponse:
    """
    Phase 2: Execute agent.
    Takes the ALREADY validated query from the preprocessing phase.

    The method records the query and the formatted answer in the
    conversation history, optionally updates and logs the user profile,
    and decides whether the booking flow / booking widget is active.

    Args:
        preprocessed_query: Query that already passed validation.

    Returns:
        The response object for this turn, including booking flags and a
        ``should_cache`` hint.
    """
    # Reset scope-violation tracking
    self._scope_violation_counts = {}

    reply_language = self._stored_language

    # Booking signals are evaluated before the new user message is
    # appended to the history (same order as before).
    wants_booking = self._is_explicit_booking_intent(preprocessed_query)
    is_preference_follow_up = (
        self._conversation_state.get('handover_requested') is True
        and self._previous_response_requested_booking_preferences()
        and self._is_booking_preference_follow_up(preprocessed_query)
    )

    # 1. History Update
    self._conversation_history.append(HumanMessage(preprocessed_query))

    # 2. System instruction
    language_hint = SystemMessage(f"Respond in {get_language_name(reply_language)} language.")

    # 3. Agent Call (the language hint is sent but not stored in the history)
    agent_result = self._query(
        agent=self._agents['lead'],
        messages=[*self._conversation_history, language_hint],
    )
    raw_reply = agent_result.response
    chain_logger.info(f"Is answer context dependent: {agent_result.is_context_dependent}")
    chain_logger.info(f"Appointment Requested: {agent_result.appointment_requested}")
    chain_logger.info(f"Show Booking Widget: {agent_result.show_booking_widget}")
    chain_logger.info(f"Relevant Programs: {agent_result.relevant_programs}")

    # 4. Formatting
    if config.chain.ENABLE_RESPONSE_CHUNKING:
        reply_text = ResponseFormatter.format_response(
            raw_reply, agent_type='lead', enable_chunking=True, language=reply_language
        )
    else:
        reply_text = ResponseFormatter.remove_tables(raw_reply)
    reply_text = ResponseFormatter.clean_response(reply_text)

    # NOTE: the response-quality evaluation that used to set this flag is
    # currently disabled, so the confidence fallback never triggers here.
    confidence_fallback = False

    # Add to history
    self._conversation_history.append(AIMessage(reply_text))

    # 6. Profiling
    if config.convstate.TRACK_USER_PROFILE:
        self._update_conversation_state(preprocessed_query, reply_text)

        user_turns = sum(
            1 for entry in self._conversation_history if isinstance(entry, HumanMessage)
        )
        # Snapshot every fifth user message, or on every turn once a
        # programme has been suggested.
        if user_turns % 5 == 0 or self._conversation_state.get('suggested_program'):
            self._log_user_profile()

    reply_text = ResponseFormatter.format_name_of_university(reply_text, language=reply_language)

    # Proactive booking offer.
    # The widget may be shown without an explicit booking word from the
    # user when the lead model signals booking readiness AND a programme
    # match exists, either a suggested programme in the conversation
    # state or relevant programmes returned by the lead model.
    has_programme_match = (
        bool(agent_result.relevant_programs)
        or self._conversation_state.get('suggested_program') is not None
    )
    proactive_offer = has_programme_match and agent_result.show_booking_widget

    booking_flow_active = wants_booking or is_preference_follow_up or proactive_offer
    appointment_requested = bool(booking_flow_active)
    # Within an active booking flow the widget is shown when the model
    # asked for it or the answer text itself commits to showing it.
    show_booking_widget = bool(
        booking_flow_active and (
            agent_result.show_booking_widget
            or self._response_commits_to_showing_booking_widget(reply_text)
        )
    )

    if proactive_offer and not wants_booking and not is_preference_follow_up:
        chain_logger.info(
            "Proactive booking offer triggered "
            f"(suggested_program={self._conversation_state.get('suggested_program')}, "
            f"relevant_programs={agent_result.relevant_programs})"
        )
    elif agent_result.appointment_requested and not booking_flow_active:
        chain_logger.info("Suppressed booking state because no programme match or booking intent was detected.")
    elif is_preference_follow_up and show_booking_widget:
        chain_logger.info("Continuing active booking flow and showing booking widget for a preference follow-up.")

    # Fallback answers, booking turns and context-dependent answers must
    # not be cached.
    cacheable = not (
        confidence_fallback
        or appointment_requested
        or agent_result.is_context_dependent
    )

    return LeadAgentQueryResponse(
        response=reply_text,
        language=reply_language,
        confidence_fallback=confidence_fallback,
        should_cache=cacheable,
        processed_query=preprocessed_query,
        appointment_requested=appointment_requested,
        show_booking_widget=show_booking_widget,
        relevant_programs=agent_result.relevant_programs,
    )
999
+
1000
def _query(self, agent, messages: list, thread_id: str = None) -> StructuredAgentResponse:
    """Invoke ``agent`` with ``messages`` and return its structured response.

    Args:
        agent: Agent object exposing ``invoke`` and ``name``.
        messages: Messages passed to the agent under the "messages" key.
        thread_id: Thread id for the run configuration; ``0`` is used when
            it is missing or falsy.

    Returns:
        The agent's ``structured_response`` when present; otherwise a
        response built from the text of the last returned message. If
        anything fails, the error is logged and a response carrying the
        generic error message in the stored language is returned, so this
        method does not raise.
    """
    try:
        # ``copy()`` is shallow: build a fresh 'configurable' mapping so
        # that setting the thread id does not mutate the shared
        # ``self._config``. The local name also avoids shadowing the
        # module-level ``config`` object.
        run_config = self._config.copy()
        run_config['configurable'] = {
            **(run_config.get('configurable') or {}),
            'thread_id': thread_id or 0,
        }

        result = agent.invoke(
            {"messages": messages},
            config=run_config,
            context=AgentContext(agent_name=agent.name),
        )

        # Build the fallback lazily: it is only needed, and only able to
        # fail, when no structured response is available.
        structured = result.get('structured_response')
        if structured is None:
            structured = StructuredAgentResponse(
                response=result['messages'][-1].text,
            )
        return structured
    except Exception as e:
        # Some exceptions carry a ``body`` attribute; it is only used when
        # it is a mapping with a non-empty 'message', so the handler
        # itself cannot raise.
        body = getattr(e, 'body', None)
        api_message = body.get('message') if isinstance(body, dict) else None
        error_msg = api_message or str(e)
        chain_logger.error(f"Failed to invoke the agent: {error_msg}")
        return StructuredAgentResponse(
            response=QUERY_EXCEPTION_MESSAGE[self._stored_language],
        )
src/rag/input_handler.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Input handler for processing and validating user messages.
3
+ Handles numeric inputs, validation, and interpretation.
4
+ """
5
+ import re
6
+ from src.rag.utilclasses import ConversationState
7
+ from src.utils.logging import get_logger
8
+
9
+ logger = get_logger("input_handler")
10
+
11
+
12
class InputHandler:
    """Handles input validation and interpretation.

    All methods are static; the class is used as a namespace. A bare
    numeric reply such as "5" is expanded into a full (English) sentence
    based on the keywords found in the assistant's most recent message.
    """

    # Substring keywords: matching inside longer words is intended here so
    # that compounds such as "Berufserfahrung" are recognised.
    _EXPERIENCE_KEYWORDS = ("experience", "years", "worked", "arbeits", "erfahrung", "jahre")
    _QUALIFICATION_KEYWORDS = ("qualification", "degree", "bachelor", "master", "qualifikation")

    # Age keywords are short and occur inside many unrelated words
    # ("management", "hold", "alternatively"), so they are matched as whole
    # words ("age", "old") or as a word ending ("Alter", "Mindestalter").
    _AGE_PATTERN = re.compile(r"\b(?:age|old)\b|alter\b|jahre alt")

    # A standalone non-negative number with an optional decimal part.
    _NUMBER_PATTERN = re.compile(r"\d+(\.\d+)?")

    _QUALIFICATION_LEVELS = {
        "1": "I have a Bachelor's degree",
        "2": "I have a Master's degree",
        "3": "I have an MBA",
        "4": "I have a doctorate/PhD",
    }

    @staticmethod
    def validate_and_normalize(message: str) -> str:
        """
        Normalize and validate user input.

        Args:
            message: Raw user input

        Returns:
            The message without surrounding whitespace, or "" when the
            input is empty, ``None`` or whitespace only.
        """
        if not message:
            return ""
        return message.strip()

    @staticmethod
    def is_numeric_input(message: str) -> bool:
        """
        Check if message is a standalone number.

        Args:
            message: User input

        Returns:
            True if the message, ignoring surrounding whitespace, consists
            only of digits with an optional decimal part (e.g. "5", "2.5").
        """
        return InputHandler._NUMBER_PATTERN.fullmatch(message.strip()) is not None

    @staticmethod
    def _last_assistant_text(conversation_history: list) -> str:
        """Return the lower-cased text of the most recent assistant message.

        Returns "" when there is no assistant message or its content is not
        a plain string (for example a list of content blocks or ``None``).
        """
        if not conversation_history:
            return ""

        # Import here to avoid circular dependency
        from langchain_core.messages import AIMessage

        for msg in reversed(conversation_history):
            if isinstance(msg, AIMessage):
                # Handle LangChain message objects
                content = getattr(msg, "content", "")
            elif isinstance(msg, dict) and msg.get("role") == "assistant":
                # Handle dictionary format (for backward compatibility)
                content = msg.get("content", "")
            else:
                continue
            return content.lower() if isinstance(content, str) else ""
        return ""

    @staticmethod
    def _classify_context(recent_context: str) -> str:
        """Classify what a bare number most likely answers.

        Args:
            recent_context: Lower-cased text of the last assistant message.

        Returns:
            One of "experience", "age", "qualification" or "default".
            Experience keywords take precedence over age keywords, which
            take precedence over qualification keywords.
        """
        if any(keyword in recent_context for keyword in InputHandler._EXPERIENCE_KEYWORDS):
            return "experience"
        if InputHandler._AGE_PATTERN.search(recent_context):
            return "age"
        if any(keyword in recent_context for keyword in InputHandler._QUALIFICATION_KEYWORDS):
            return "qualification"
        return "default"

    @staticmethod
    def interpret_numeric_input(
        message: str,
        conversation_history: list
    ) -> str:
        """
        Interpret standalone numeric input based on conversation context.

        Args:
            message: Numeric input (e.g., "5")
            conversation_history: Recent conversation messages (LangChain
                message objects or role/content dictionaries)

        Returns:
            Interpreted message (e.g., "I have 5 years of work experience").
            The sentence is always English, whatever the conversation
            language is. Without a recognisable context the number is
            treated as years of work experience.
        """
        number = message.strip()
        recent_context = InputHandler._last_assistant_text(conversation_history)
        topic = InputHandler._classify_context(recent_context)

        if topic == "experience":
            logger.info(f"Interpreting numeric input '{number}' as years of experience")
            return f"I have {number} years of work experience"

        if topic == "age":
            logger.info(f"Interpreting numeric input '{number}' as age")
            return f"I am {number} years old"

        if topic == "qualification":
            logger.info(f"Interpreting numeric input '{number}' as qualification level")
            # Known option numbers map to a degree; anything else is passed
            # through as a generic level.
            return InputHandler._QUALIFICATION_LEVELS.get(
                number, f"My qualification level is {number}"
            )

        # Default: assume years of experience (most common)
        logger.info(f"Interpreting numeric input '{number}' as years of experience (default)")
        return f"I have {number} years of work experience"

    @staticmethod
    def process_input(
        message: str,
        conversation_history: list
    ) -> tuple[str, bool]:
        """
        Process user input with validation and interpretation.

        Args:
            message: Raw user input
            conversation_history: Recent messages for context

        Returns:
            Tuple of (processed_message, is_valid). Empty input yields
            ("", False); numeric input is replaced by its interpretation;
            any other input is returned stripped.
        """
        normalized = InputHandler.validate_and_normalize(message)
        if not normalized:
            return "", False

        if InputHandler.is_numeric_input(normalized):
            interpreted = InputHandler.interpret_numeric_input(
                normalized,
                conversation_history
            )
            return interpreted, True

        return normalized, True
147
+