“shubhamdhamal” committed on
Commit
7644eac
·
1 Parent(s): d25847c

Deploy Flask app with Docker

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +22 -0
  2. Dockerfile +47 -0
  3. README.md +19 -6
  4. backend/Dockerfile +20 -0
  5. backend/Procfile +1 -0
  6. backend/__init__.py +4 -0
  7. backend/app.py +44 -0
  8. backend/requirements.txt +7 -0
  9. backend/routes.py +228 -0
  10. clear_cache.py +56 -0
  11. config.py +37 -0
  12. fix_colors.py +154 -0
  13. fix_learning_path_indentation.py +86 -0
  14. init_db.py +21 -0
  15. init_render_db.py +87 -0
  16. initialize_db.py +149 -0
  17. migrations/README +1 -0
  18. migrations/add_chatbot_tables.py +129 -0
  19. migrations/add_conversation_memory.sql +19 -0
  20. migrations/add_resource_progress.py +39 -0
  21. migrations/alembic.ini +50 -0
  22. migrations/env.py +113 -0
  23. migrations/script.py.mako +24 -0
  24. migrations/versions/12d5dfb6fd16_sync_users_table.py +111 -0
  25. migrations/versions/39d22a91999a_initial_migration.py +75 -0
  26. migrations/versions/6b20f44f6a00_make_oauth_user_id_nullable.py +36 -0
  27. migrations/versions/9f32f1920608_add_oauth_table_for_flask_dance.py +46 -0
  28. migrations/versions/a1b2c3d4e5f6_add_progress_tracking_tables.py +86 -0
  29. minimal_test.py +38 -0
  30. requirements.txt +76 -0
  31. run.py +79 -0
  32. run_flask.py +23 -0
  33. setup.py +28 -0
  34. src/__init__.py +1 -0
  35. src/agent.py +577 -0
  36. src/agents/__init__.py +9 -0
  37. src/agents/base_agent.py +234 -0
  38. src/agents/research_agent.py +323 -0
  39. src/agents/teaching_agent.py +356 -0
  40. src/data/bm25_retriever.py +173 -0
  41. src/data/document_store.py +973 -0
  42. src/data/resources.py +202 -0
  43. src/data/skills_database.py +999 -0
  44. src/data/vector_store.py +173 -0
  45. src/direct_openai.py +107 -0
  46. src/learning_path.py +916 -0
  47. src/ml/context_compressor.py +182 -0
  48. src/ml/embeddings.py +130 -0
  49. src/ml/job_market.py +177 -0
  50. src/ml/model_orchestrator.py +1187 -0
.gitignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Environment
7
+ .env
8
+ .venv
9
+ venv/
10
+ ENV/
11
+
12
+ # IDE
13
+ .vscode/
14
+ .idea/
15
+
16
+ # Cache
17
+ cache/
18
+ *.cache
19
+
20
+ # Local data
21
+ *.sqlite3
22
+ *.db
Dockerfile ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces Dockerfile
2
+ # Reference: https://huggingface.co/docs/hub/spaces-sdks-docker
3
+
4
+ FROM python:3.11-slim
5
+
6
+ # Set working directory
7
+ WORKDIR /app
8
+
9
+ # Install system dependencies
10
+ RUN apt-get update && apt-get install -y \
11
+ gcc \
12
+ g++ \
13
+ libmagic1 \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # Create a non-root user (required by Hugging Face Spaces)
17
+ RUN useradd -m -u 1000 user
18
+ USER user
19
+ ENV HOME=/home/user \
20
+ PATH=/home/user/.local/bin:$PATH
21
+
22
+ # Set working directory for user
23
+ WORKDIR $HOME/app
24
+
25
+ # Copy requirements first for caching
26
+ COPY --chown=user requirements.txt .
27
+
28
+ # Install Python dependencies
29
+ RUN pip install --no-cache-dir --upgrade pip && \
30
+ pip install --no-cache-dir -r requirements.txt
31
+
32
+ # Copy application code
33
+ COPY --chown=user . .
34
+
35
+ # Create necessary directories with proper permissions
36
+ RUN mkdir -p vector_db cache learning_paths
37
+
38
+ # Hugging Face Spaces requires port 7860
39
+ EXPOSE 7860
40
+
41
+ # Set environment variables for Hugging Face
42
+ ENV PORT=7860
43
+ ENV FLASK_ENV=production
44
+ ENV PYTHONUNBUFFERED=1
45
+
46
+ # Run the Flask app with gunicorn
47
+ CMD ["gunicorn", "run:app", "--bind", "0.0.0.0:7860", "--workers", "2", "--timeout", "120"]
README.md CHANGED
@@ -1,12 +1,25 @@
1
  ---
2
- title: Ai Learning Path Generator
3
- emoji: 📊
4
- colorFrom: red
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
- short_description: LLM Based ai-learning-path-generator
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AI Learning Path Generator
3
+ emoji: 🎓
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ app_port: 7860
10
+ short_description: LLM Based AI Learning Path Generator
11
  ---
12
 
13
+ # AI Learning Path Generator
14
+
15
+ Generate personalized AI-powered learning paths for any topic.
16
+
17
+ ## Features
18
+ - 🎯 Personalized learning path generation
19
+ - 🤖 AI-powered content curation
20
+ - 📚 Structured curriculum creation
21
+
22
+ ## Setup
23
+ Configure these secrets in your Space settings:
24
+ - `OPENAI_API_KEY` - Your OpenAI API key
25
+ - `SECRET_KEY` - Flask secret key
backend/Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Copy backend requirements
6
+ COPY backend/requirements.txt /app/backend/requirements.txt
7
+
8
+ # Install dependencies
9
+ RUN pip install --no-cache-dir -r backend/requirements.txt
10
+
11
+ # Copy backend code
12
+ COPY backend/ /app/backend/
13
+
14
+ # Copy .env if exists
15
+ COPY .env* /app/
16
+
17
+ EXPOSE 5000
18
+
19
+ # Use shell form so $PORT expands on Render
20
+ CMD ["sh", "-c", "gunicorn backend.app:app --bind 0.0.0.0:${PORT:-5000} --workers 2 --timeout 30"]
backend/Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: gunicorn backend.app:app --bind 0.0.0.0:$PORT --workers 2 --timeout 30
backend/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Backend API for AI Learning Path Generator
3
+ Lightweight Flask API that queues tasks and returns status
4
+ """
backend/app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified Flask + React Application
3
+ Serves React frontend at root, Flask API routes, and OAuth
4
+ """
5
+ from backend.routes import api_bp
6
+ from web_app import create_app
7
+ import os
8
+ from flask import jsonify
9
+ from flask_cors import CORS
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ # Create the main app using the existing web_app factory (includes DB, OAuth, routes)
16
+ app = create_app()
17
+
18
+ # Register the lightweight API blueprint for RQ task orchestration under /api
19
+ app.register_blueprint(api_bp, url_prefix='/api')
20
+
21
+ # Enable CORS for the React frontend, mobile app, and allow cookies for auth
22
+ frontend_origin = os.getenv('FRONTEND_ORIGIN', 'http://localhost:3000')
23
+ allowed_origins = [
24
+ frontend_origin,
25
+ "http://localhost:3000",
26
+ "http://localhost:8081", # Expo mobile app
27
+ "http://127.0.0.1:8081",
28
+ "http://localhost:19006", # Expo web
29
+ ]
30
+ CORS(
31
+ app,
32
+ resources={r"/*": {"origins": allowed_origins}},
33
+ supports_credentials=True,
34
+ )
35
+
36
+
37
+ @app.route('/health')
38
+ def health():
39
+ return jsonify({"status": "healthy", "service": "api+web"}), 200
40
+
41
+
42
+ if __name__ == '__main__':
43
+ port = int(os.getenv('PORT', 5000))
44
+ app.run(host='0.0.0.0', port=port, debug=False)
backend/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Backend API requirements (lightweight)
2
+ Flask>=2.0.1
3
+ flask-cors>=4.0.0
4
+ python-dotenv==1.0.1
5
+ redis>=5.0.0
6
+ rq==1.16.1
7
+ gunicorn>=21.2.0
backend/routes.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API Routes for task management
3
+ """
4
+ import os
5
+ import uuid
6
+ import json
7
+ from flask import Blueprint, request, jsonify
8
+ from datetime import datetime
9
+
10
+ api_bp = Blueprint('rq_api', __name__)
11
+
12
# Redis connection - optional for local development
# Note: decode_responses=False is required for RQ (job results are pickled bytes, not strings)
redis_client = None
REDIS_URL = os.getenv('REDIS_URL')


def get_redis_client():
    """Lazily create and memoize the module-level Redis client.

    Returns the shared client on success, or None when the redis package is
    missing or the server is unreachable — callers fall back to synchronous
    execution in that case.
    """
    global redis_client
    if redis_client is not None:
        return redis_client

    try:
        import redis
        url = REDIS_URL
        if url and url.startswith(('redis://', 'rediss://')):
            # TLS URLs additionally disable certificate verification
            # (managed Redis providers often use self-signed certs).
            kwargs = {'decode_responses': False}
            if url.startswith('rediss://'):
                kwargs['ssl_cert_reqs'] = None
            redis_client = redis.from_url(url, **kwargs)
        else:
            redis_client = redis.Redis(
                host=os.getenv('REDIS_HOST', 'localhost'),
                port=int(os.getenv('REDIS_PORT', 6379)),
                db=int(os.getenv('REDIS_DB', 0)),
                decode_responses=False,
            )
        # Fail fast if the server is not actually reachable.
        redis_client.ping()
        return redis_client
    except Exception as e:
        print(f"Redis not available: {e}")
        redis_client = None
        return None
+
48
+
49
# In-memory storage for synchronous task results (for local dev without Redis)
sync_task_results = {}


@api_bp.route('/generate', methods=['POST'])
def generate_path():
    """
    Generate a learning path. Uses RQ queue if Redis is available,
    otherwise runs synchronously for local development.
    Returns the job ID immediately (async) or result directly (sync).

    Responses:
        202 - job queued (async); body carries task_id
        200 - generated synchronously; body carries task_id and result
        400 - missing/invalid JSON body or missing required field
        500 - generation failed
    """
    try:
        # silent=True makes a missing or malformed body yield None instead of
        # raising; the original code's `field not in data` would then raise
        # TypeError and surface as an opaque 500. Reject it explicitly.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # Validate required fields
        required_fields = ['topic', 'expertise_level',
                           'duration_weeks', 'time_commitment']
        for field in required_fields:
            if field not in data:
                return jsonify({"error": f"Missing required field: {field}"}), 400

        # Try to use Redis/RQ for async processing
        redis_conn = get_redis_client()
        if redis_conn:
            try:
                from rq import Queue
                q = Queue('learning-paths', connection=redis_conn)
                job = q.enqueue(
                    'worker.tasks.generate_learning_path_for_worker', data)

                return jsonify({
                    "task_id": job.id,
                    "status": "queued",
                    "message": "Learning path generation started"
                }), 202
            except Exception as rq_error:
                print(f"RQ error, falling back to sync: {rq_error}")

        # Fallback: Run synchronously for local development
        task_id = str(uuid.uuid4())
        sync_task_results[task_id] = {"status": "processing"}

        try:
            # Import lazily so the module loads even when the heavy
            # generation stack is unavailable.
            from src.learning_path import LearningPathGenerator

            generator = LearningPathGenerator()

            # Normalize goals: accept a list, a non-empty string, or nothing.
            goals_raw = data.get('goals')
            if isinstance(goals_raw, list):
                goals = goals_raw
            elif isinstance(goals_raw, str) and goals_raw.strip():
                goals = [goals_raw.strip()]
            else:
                goals = None

            learning_path = generator.generate_path(
                topic=data['topic'],
                expertise_level=data['expertise_level'],
                learning_style=None,
                time_commitment=data.get('time_commitment', '5-10 hours/week'),
                duration_weeks=int(data['duration_weeks']),
                goals=goals,
                ai_provider=data.get('ai_provider', 'openrouter'),
                ai_model=data.get('ai_model')
            )

            # Pydantic models expose .dict(); plain dicts pass through.
            result = learning_path.dict() if hasattr(
                learning_path, 'dict') else learning_path

            sync_task_results[task_id] = {
                "status": "finished",
                "result": result
            }

            return jsonify({
                "task_id": task_id,
                "status": "finished",
                "message": "Learning path generated successfully",
                "result": result
            }), 200

        except Exception as gen_error:
            sync_task_results[task_id] = {
                "status": "failed",
                "error": str(gen_error)
            }
            return jsonify({
                "task_id": task_id,
                "status": "failed",
                "error": str(gen_error)
            }), 500

    except Exception as e:
        return jsonify({"error": str(e)}), 500
+
146
+
147
@api_bp.route('/status/<task_id>', methods=['GET'])
def get_status(task_id):
    """
    Get the current status of a task (RQ job or sync task)
    """
    try:
        # Synchronous (no-Redis) tasks live in the in-process dict.
        record = sync_task_results.get(task_id)
        if record is not None:
            payload = {
                "task_id": task_id,
                "status": record["status"],
            }
            if record["status"] == "finished":
                payload["result"] = record.get("result")
            if record["status"] == "failed":
                payload["error"] = record.get("error")
            return jsonify(payload), 200

        # Otherwise look the job up in RQ, if Redis is reachable.
        conn = get_redis_client()
        if conn:
            from rq import Queue
            job = Queue('learning-paths', connection=conn).fetch_job(task_id)
            if job is None:
                return jsonify({"error": "Task not found"}), 404

            payload = {
                "task_id": job.id,
                "status": job.get_status(),
            }
            if job.is_finished:
                payload["result"] = job.result
            if job.is_failed:
                payload["error"] = str(job.exc_info)
            return jsonify(payload), 200

        return jsonify({"error": "Task not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
188
+
189
+
190
@api_bp.route('/result/<task_id>', methods=['GET'])
def get_result(task_id):
    """
    Get the final result of a task (RQ job or sync task)
    """
    try:
        # Synchronous (no-Redis) tasks live in the in-process dict.
        record = sync_task_results.get(task_id)
        if record is not None:
            status = record["status"]
            if status == "finished":
                return jsonify(record.get("result", {})), 200
            if status == "failed":
                return jsonify({"error": record.get("error")}), 500
            return jsonify({
                "error": "Task not yet complete",
                "status": status
            }), 202

        # Otherwise look the job up in RQ, if Redis is reachable.
        conn = get_redis_client()
        if conn:
            from rq import Queue
            job = Queue('learning-paths', connection=conn).fetch_job(task_id)
            if job is None:
                return jsonify({"error": "Task not found"}), 404
            if not job.is_finished:
                return jsonify({
                    "error": "Task not yet complete",
                    "status": job.get_status()
                }), 202
            return jsonify(job.result), 200

        return jsonify({"error": "Task not found"}), 404
    except Exception as e:
        return jsonify({"error": str(e)}), 500
clear_cache.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple script to clear the Redis cache.
3
+ Run this when you need to reset all cached learning paths.
4
+ """
5
+ import redis
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
+ REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
13
+ REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
14
+ REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', '').strip() # Strip whitespace
15
+ REDIS_DB = int(os.getenv('REDIS_DB', 0))
16
+
17
+ print(f"🔍 Connecting to Redis at {REDIS_HOST}:{REDIS_PORT} (password: {'set' if REDIS_PASSWORD else 'none'})")
18
+
19
+ try:
20
+ # Build Redis connection params
21
+ redis_params = {
22
+ 'host': REDIS_HOST,
23
+ 'port': REDIS_PORT,
24
+ 'db': REDIS_DB,
25
+ 'decode_responses': True
26
+ }
27
+ # Only add password if it's not empty
28
+ if REDIS_PASSWORD:
29
+ redis_params['password'] = REDIS_PASSWORD
30
+ print("🔐 Using password authentication")
31
+
32
+ redis_client = redis.Redis(**redis_params)
33
+
34
+ # Get all cache keys
35
+ path_keys = list(redis_client.scan_iter(match="path_cache:*"))
36
+ semantic_keys = list(redis_client.scan_iter(match="semantic_cache:*"))
37
+
38
+ total_keys = len(path_keys) + len(semantic_keys)
39
+
40
+ if total_keys == 0:
41
+ print("✅ Cache is already empty!")
42
+ else:
43
+ # Delete all cache keys
44
+ if path_keys:
45
+ redis_client.delete(*path_keys)
46
+ print(f"🗑️ Deleted {len(path_keys)} learning path cache entries")
47
+
48
+ if semantic_keys:
49
+ redis_client.delete(*semantic_keys)
50
+ print(f"🗑️ Deleted {len(semantic_keys)} semantic cache entries")
51
+
52
+ print(f"✅ Successfully cleared {total_keys} total cache entries!")
53
+
54
+ except Exception as e:
55
+ print(f"❌ Error clearing cache: {e}")
56
+ print("Make sure Redis is running and your .env file is configured correctly.")
config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

# Project root (the directory containing this file).
basedir = os.path.abspath(os.path.dirname(__file__))

# On Render the environment is injected by the platform; only read a local
# .env file during development.
if not os.environ.get('RENDER'):
    load_dotenv(os.path.join(basedir, '.env'))

# Make `flask db upgrade` work without an explicit FLASK_APP export.
os.environ.setdefault('FLASK_APP', 'run.py')


class Config:
    """Central Flask configuration object (read once at import time)."""

    SECRET_KEY = os.environ.get('FLASK_SECRET_KEY') or 'dev-secret-key-change-in-production-2024'
    SQLALCHEMY_DATABASE_URI = (
        os.environ.get('DATABASE_URL')
        or 'sqlite:///' + os.path.join(basedir, 'app.db')
    )
    SQLALCHEMY_TRACK_MODIFICATIONS = False

    # Session cookie behaviour
    SESSION_COOKIE_HTTPONLY = True
    SESSION_COOKIE_SAMESITE = 'Lax'  # default for local development
    PERMANENT_SESSION_LIFETIME = 7200  # seconds (2 hours)
    SESSION_REFRESH_EACH_REQUEST = True  # keep sessions alive while active
    SESSION_USE_SIGNER = True  # sign session cookies
    SESSION_COOKIE_NAME = 'learning_path_session'

    if os.environ.get('RENDER'):
        # Production: HTTPS-only cookies, and SameSite=None so the cookie
        # survives the cross-site OAuth redirect.
        SESSION_COOKIE_SECURE = True
        SESSION_COOKIE_SAMESITE = 'None'
        REMEMBER_COOKIE_SECURE = True
        REMEMBER_COOKIE_SAMESITE = 'None'
    else:
        # Local development over plain HTTP.
        SESSION_COOKIE_SECURE = False
        REMEMBER_COOKIE_SECURE = False

    LOG_TO_STDOUT = os.environ.get('LOG_TO_STDOUT')
fix_colors.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Color Fix Script for AI Learning Path Generator
4
+ Automatically replaces white backgrounds and bright colors with dark glassmorphic theme
5
+ """
6
+
7
+ import os
8
+ import shutil
9
+ from pathlib import Path
10
+
11
def backup_file(filepath):
    """Copy *filepath* to a sibling '<name>.backup' file and return that path."""
    duplicate = f"{filepath}.backup"
    # copy2 preserves metadata (mtime/permissions) along with the contents.
    shutil.copy2(filepath, duplicate)
    print(f"✅ Backup created: {duplicate}")
    return duplicate
17
+
18
def fix_colors(filepath):
    """Rewrite light-theme utility classes in *filepath* to the dark glassmorphic palette.

    Runs a fixed sequence of literal find/replace passes over the file
    (backgrounds, text colors, borders, two specific headings, Chart.js
    colors), writes the file back in place, and returns the total number
    of substitutions performed.
    """
    print(f"\n🎨 Fixing colors in: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as fh:
        markup = fh.read()

    total = 0

    # 1. Replace white backgrounds with glass-card
    surface_swaps = [
        ('bg-white rounded-xl shadow-xl', 'glass-card'),
        ('bg-white rounded-lg shadow-xl', 'glass-card'),
        ('bg-white rounded-lg shadow-md', 'glass-card'),
        ('bg-white rounded-lg shadow', 'glass-card'),
        ('bg-white p-4 rounded-lg shadow', 'glass-card p-4'),
        ('bg-white p-8 rounded-xl', 'glass-card p-8'),
        ('bg-gray-100', 'glass-card'),
        ('bg-gray-50', 'glass-card'),
        ('bg-gray-200', 'glass-card'),
    ]

    # 2. Replace text colors
    text_swaps = [
        ('text-gray-900', 'text-white'),
        ('text-gray-800', 'text-white'),
        ('text-gray-700', 'text-secondary'),
        ('text-gray-600', 'text-secondary'),
        ('text-gray-500', 'text-muted'),
        ('text-magenta', 'text-neon-purple'),
    ]

    border_swaps = [
        ('border-gray-200', 'border-transparent'),
        ('border-gray-300', 'border-glass'),
    ]

    # The three groups apply sequentially in this exact order, so a single
    # loop over the concatenated list is equivalent to the three originals.
    for before, after in surface_swaps + text_swaps + border_swaps:
        hits = markup.count(before)
        if hits > 0:
            markup = markup.replace(before, after)
            total += hits
            print(f" ✓ Replaced '{before}' → '{after}' ({hits} times)")

    # 3. Fix specific sections
    specific_fixes = [
        # Learning Journey title
        ('<h3 class="text-2xl font-bold text-white mb-6">Your Learning Journey</h3>',
         '<h3 class="text-2xl font-bold text-white mb-6">Your Learning <span class="text-neon-cyan">Journey</span></h3>'),

        # Milestones title (identical pair — skipped by the before != after guard)
        ('<h3 class="text-3xl font-bold text-white mb-8 text-center">Your Learning <span class="text-neon-purple">Milestones</span></h3>',
         '<h3 class="text-3xl font-bold text-white mb-8 text-center">Your Learning <span class="text-neon-purple">Milestones</span></h3>'),
    ]

    for before, after in specific_fixes:
        if before in markup and before != after:
            markup = markup.replace(before, after)
            total += 1
            print(f" ✓ Fixed specific section")

    # 4. Fix Chart.js colors (if present)
    chart_fixes = [
        # Pink to Neon Cyan
        ("'rgba(255, 99, 132, 0.5)'", "'rgba(74, 216, 255, 0.3)'"),
        ("'rgba(255, 99, 132, 1)'", "'rgba(74, 216, 255, 1)'"),

        # Yellow to Neon Purple
        ("'rgba(255, 206, 86, 1)'", "'rgba(179, 125, 255, 1)'"),
        ("'rgba(255, 206, 86, 0.5)'", "'rgba(179, 125, 255, 0.3)'"),
    ]

    for before, after in chart_fixes:
        if before in markup:
            markup = markup.replace(before, after)
            total += 1
            print(f" ✓ Fixed chart color")

    with open(filepath, 'w', encoding='utf-8') as fh:
        fh.write(markup)

    print(f"\n✅ Applied {total} color fixes to {filepath}")
    return total
118
+
119
def main():
    """Back up and recolor every known template file, then print a summary."""
    print("🎨 AI Learning Path Generator - Color Fix Script")
    print("=" * 60)

    # Templates known to still use the light palette.
    template_dir = Path("web_app/templates")
    targets = [
        template_dir / "result.html",
        template_dir / "index.html",
        template_dir / "dashboard.html",
    ]

    total_changes = 0

    for target in targets:
        if not target.exists():
            print(f"⚠️ File not found: {target}")
            continue
        # Always snapshot before mutating the file in place.
        backup_file(target)
        total_changes += fix_colors(target)

    print("\n" + "=" * 60)
    print(f"🎉 Color fix complete! Total changes: {total_changes}")
    print("\n📋 Next steps:")
    print("1. Review the changes in your IDE")
    print("2. Test the application")
    print("3. If issues occur, restore from .backup files")
    print("\n💡 Tip: Clear browser cache (Ctrl+Shift+R) to see changes")

if __name__ == "__main__":
    main()
fix_learning_path_indentation.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to fix the indentation in src/learning_path.py
3
+ This adds proper try-except structure for observability tracking.
4
+ """
5
+
6
+ import re
7
+
8
+ # Read the file
9
+ with open('src/learning_path.py', 'r', encoding='utf-8') as f:
10
+ content = f.read()
11
+
12
+ # Find the generate_path method and fix indentation
13
+ # The issue is that code after line 323 needs to be indented under the try block
14
+
15
+ # Pattern: Find from "relevant_docs = " to the end of generate_path method (before "def save_path")
16
+ # We need to indent everything between the try block and the except block
17
+
18
+ lines = content.split('\n')
19
+ fixed_lines = []
20
+ in_try_block = False
21
+ try_start_line = None
22
+ indent_needed = False
23
+
24
+ for i, line in enumerate(lines):
25
+ # Detect the start of the try block in generate_path
26
+ if 'try:' in line and i > 280 and i < 310: # Around line 300
27
+ in_try_block = True
28
+ try_start_line = i
29
+ fixed_lines.append(line)
30
+ continue
31
+
32
+ # Detect where indentation is missing (after the validation checks)
33
+ if in_try_block and line.strip().startswith('relevant_docs = '):
34
+ indent_needed = True
35
+
36
+ # Stop indenting at the except block or next method
37
+ if indent_needed and (line.strip().startswith('except Exception') or line.strip().startswith('def save_path')):
38
+ indent_needed = False
39
+ in_try_block = False
40
+
41
+ # Add the except block before this line if it's "def save_path"
42
+ if line.strip().startswith('def save_path'):
43
+ # Add proper except block
44
+ fixed_lines.append('')
45
+ fixed_lines.append(' except Exception as e:')
46
+ fixed_lines.append(' # Mark as failed')
47
+ fixed_lines.append(' error_message = str(e)')
48
+ fixed_lines.append(' ')
49
+ fixed_lines.append(' # Log failure metrics')
50
+ fixed_lines.append(' generation_time_ms = (time.time() - generation_start_time) * 1000')
51
+ fixed_lines.append(' self.obs_manager.log_metric("path_generation_success", 0.0, {')
52
+ fixed_lines.append(' "topic": topic,')
53
+ fixed_lines.append(' "expertise_level": expertise_level,')
54
+ fixed_lines.append(' "error": error_message,')
55
+ fixed_lines.append(' "duration_ms": generation_time_ms,')
56
+ fixed_lines.append(' "user_id": user_id')
57
+ fixed_lines.append(' })')
58
+ fixed_lines.append(' ')
59
+ fixed_lines.append(' self.obs_manager.log_event("path_generation_failed", {')
60
+ fixed_lines.append(' "topic": topic,')
61
+ fixed_lines.append(' "expertise_level": expertise_level,')
62
+ fixed_lines.append(' "error": error_message,')
63
+ fixed_lines.append(' "generation_time_ms": generation_time_ms,')
64
+ fixed_lines.append(' "user_id": user_id')
65
+ fixed_lines.append(' })')
66
+ fixed_lines.append(' ')
67
+ fixed_lines.append(' # Re-raise the exception')
68
+ fixed_lines.append(' raise')
69
+ fixed_lines.append('')
70
+
71
+ # Add indentation if needed
72
+ if indent_needed and line and not line.startswith(' '):
73
+ # Add 4 more spaces of indentation
74
+ if line.startswith(' '):
75
+ fixed_lines.append(' ' + line)
76
+ else:
77
+ fixed_lines.append(line)
78
+ else:
79
+ fixed_lines.append(line)
80
+
81
+ # Write back
82
+ with open('src/learning_path.py', 'w', encoding='utf-8') as f:
83
+ f.write('\n'.join(fixed_lines))
84
+
85
+ print("✅ Fixed indentation in src/learning_path.py")
86
+ print("⚠️ Please review the changes manually to ensure correctness")
init_db.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Initialize database tables for production deployment."""
import os
from web_app import create_app, db

# Build the application so its SQLAlchemy metadata is registered.
app = create_app()

with app.app_context():
    print("Creating database tables...")
    try:
        db.create_all()
        print("✅ Database tables created successfully!")
    except Exception as e:
        # Re-running against an initialized database raises "already exists"
        # errors; those are expected and treated as success.
        if "already exists" not in str(e).lower():
            print(f"❌ Error creating tables: {e}")
            raise
        print("⚠️ Some tables/constraints already exist - continuing...")
        print("✅ Database is ready!")
init_render_db.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Initialize PostgreSQL database on Render.
This script runs database migrations to create all required tables.
"""
import os
import sys
from flask_migrate import upgrade
from web_app import create_app, db

def init_database():
    """Create/upgrade the Render PostgreSQL schema and verify it.

    Steps: check DATABASE_URL is configured, test connectivity, run Alembic
    migrations (falling back to ``db.create_all()``), then verify the users
    table responds to a query. Exits the process with status 1 on any
    unrecoverable failure.
    """
    print("=" * 60)
    print("🔧 Initializing PostgreSQL Database on Render")
    print("=" * 60)

    # Check if DATABASE_URL is set
    database_url = os.environ.get('DATABASE_URL')
    if not database_url:
        print("❌ ERROR: DATABASE_URL environment variable not set!")
        print("Please configure PostgreSQL in Render dashboard.")
        sys.exit(1)

    # Only echo a prefix — the URL embeds credentials.
    print(f"✅ Database URL found: {database_url[:30]}...")

    # Create Flask app
    print("\n📦 Creating Flask application...")
    app = create_app()

    with app.app_context():
        print("\n🔍 Checking database connection...")
        try:
            # Test database connection. Use a context manager so the probe
            # connection is returned to the pool (the original leaked it).
            with db.engine.connect():
                pass
            print("✅ Database connection successful!")
        except Exception as e:
            print(f"❌ Database connection failed: {e}")
            sys.exit(1)

        print("\n🚀 Running database migrations...")
        try:
            # Run all migrations
            upgrade()
            print("✅ Database migrations completed successfully!")
        except Exception as e:
            print(f"⚠️ Migration warning: {e}")
            print("\nAttempting to create missing tables...")
            try:
                # Create tables if they don't exist (ignores existing ones)
                from sqlalchemy import inspect
                inspector = inspect(db.engine)
                existing_tables = inspector.get_table_names()
                print(f"📋 Existing tables: {', '.join(existing_tables)}")

                # Only create tables that don't exist
                db.create_all()
                print("✅ Database schema verified/updated!")
            except Exception as e2:
                # If it fails due to existing constraints, that's actually OK
                if "already exists" in str(e2).lower():
                    print("⚠️ Some tables/constraints already exist - this is OK!")
                    print("✅ Database schema is ready!")
                else:
                    print(f"❌ Failed to create tables: {e2}")
                    sys.exit(1)

        print("\n🔍 Verifying tables...")
        try:
            # Check if users table exists (any query touching it suffices)
            from web_app.models import User
            user_count = User.query.count()
            print(f"✅ Users table exists (current count: {user_count})")
        except Exception as e:
            print(f"❌ Users table verification failed: {e}")
            sys.exit(1)

    print("\n" + "=" * 60)
    print("✅ Database initialization complete!")
    print("=" * 60)
    print("\nYour database is ready to use. You can now:")
    print("1. Register new users")
    print("2. Login with Google OAuth")
    print("3. Create learning paths")
    print("\n")

if __name__ == "__main__":
    init_database()
initialize_db.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Initialize the vector database with sample educational resources.
3
+ This provides some starter content for the Learning Path Generator.
4
+ """
5
+ import os
6
+ import json
7
+ from pathlib import Path
8
+ from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+ # Ensure OPENAI API key is set
14
+ if not os.getenv("OPENAI_API_KEY"):
15
+ print("ERROR: OPENAI_API_KEY not set in environment variables")
16
+ print("Please update your .env file with your API key")
17
+ exit(1)
18
+
19
+ # Import after checking API key
20
+ from src.data.document_store import DocumentStore
21
+ from src.data.resources import ResourceManager
22
+ from langchain.schema.document import Document
23
+
24
def load_sample_resources():
    """Return the sample resource catalogue, seeding the JSON file on first run.

    If ``samples/sample_resources.json`` already exists it is parsed and
    returned; otherwise the file is created with a small starter set of
    resources and that starter set is returned.
    """
    resources_path = Path("samples/sample_resources.json")

    # Fast path: the catalogue already exists on disk — just load it.
    if resources_path.exists():
        with open(resources_path, "r") as f:
            return json.load(f)

    # First run: make sure the parent directory is present before writing.
    resources_path.parent.mkdir(exist_ok=True, parents=True)

    # Starter catalogue covering a few topics, formats and difficulties.
    sample_resources = [
        {
            "title": "Introduction to Machine Learning",
            "type": "course",
            "description": "A comprehensive beginner's course covering ML fundamentals",
            "difficulty": "beginner",
            "time_estimate": "10 hours",
            "url": "https://example.com/intro-ml",
            "topic": "machine learning",
            "learning_styles": ["visual", "reading"]
        },
        {
            "title": "Python for Data Science Handbook",
            "type": "book",
            "description": "Essential guide to using Python for data analysis and ML",
            "difficulty": "intermediate",
            "time_estimate": "20 hours",
            "url": "https://jakevdp.github.io/PythonDataScienceHandbook/",
            "topic": "python,data science",
            "learning_styles": ["reading"]
        },
        {
            "title": "Web Development Bootcamp",
            "type": "course",
            "description": "Full stack web development from scratch",
            "difficulty": "beginner",
            "time_estimate": "40 hours",
            "url": "https://example.com/web-dev-bootcamp",
            "topic": "web development",
            "learning_styles": ["visual", "kinesthetic"]
        },
        {
            "title": "Advanced JavaScript Patterns",
            "type": "video",
            "description": "Deep dive into advanced JS design patterns",
            "difficulty": "advanced",
            "time_estimate": "3 hours",
            "url": "https://example.com/js-patterns",
            "topic": "javascript",
            "learning_styles": ["visual", "auditory"]
        },
        {
            "title": "Spanish Learning Podcast",
            "type": "podcast",
            "description": "Learn Spanish through immersive audio lessons",
            "difficulty": "beginner",
            "time_estimate": "10 hours",
            "url": "https://example.com/spanish-podcast",
            "topic": "spanish,language learning",
            "learning_styles": ["auditory"]
        }
    ]

    with open(resources_path, "w") as f:
        json.dump(sample_resources, f, indent=2)

    print(f"Created sample resources file at {resources_path}")
    return sample_resources
95
+
96
def initialize_database():
    """Seed the vector store with the bundled sample resources and smoke-test search."""
    print("Initializing vector database...")

    # Vector-backed document store (requires a configured OPENAI_API_KEY).
    store = DocumentStore()

    # Catalogue entries from samples/sample_resources.json (created on demand).
    entries = load_sample_resources()

    docs = []
    for entry in entries:
        # Searchable text blob assembled from the resource fields.
        text = f"""
        Title: {entry['title']}
        Description: {entry['description']}
        Type: {entry['type']}
        Difficulty: {entry['difficulty']}
        Topics: {entry.get('topic', '')}
        """

        # Structured metadata stored alongside the embedding.
        meta = {
            "title": entry["title"],
            "type": entry["type"],
            "difficulty": entry["difficulty"],
            "url": entry["url"],
            "topic": entry.get("topic", "").split(",")
        }
        if "learning_styles" in entry:
            meta["learning_styles"] = entry["learning_styles"]

        docs.append(Document(page_content=text, metadata=meta))

    store.add_documents(docs)
    print(f"Added {len(docs)} sample resources to vector database")

    # Quick sanity check that retrieval works against the seeded data.
    print("\nTesting search functionality...")
    hits = store.search_documents("machine learning beginner", top_k=2)
    print(f"Found {len(hits)} results for 'machine learning beginner'")
    for hit in hits:
        print(f"- {hit.metadata.get('title')} (Relevance: {hit.metadata.get('relevance_score', 0):.2f})")

    print("\nDatabase initialization complete!")


if __name__ == "__main__":
    initialize_database()
migrations/README ADDED
@@ -0,0 +1 @@
 
 
1
+ Single-database configuration for Flask.
migrations/add_chatbot_tables.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Database Migration: Add Conversational Chatbot Tables
3
+
4
+ Run this to create the new tables for:
5
+ - ChatMessage (conversation history)
6
+ - PathModification (modification tracking)
7
+ - ConversationSession (session management)
8
+
9
+ Usage:
10
+ python -m migrations.add_chatbot_tables
11
+ """
12
+
13
+ import sys
14
+ import os
15
+
16
+ # Add project root to Python path
17
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
18
+ sys.path.insert(0, project_root)
19
+
20
def run_migration():
    """Create the chatbot tables via raw DDL.

    Builds a minimal throwaway Flask app (so heavy project modules are not
    imported) and issues ``CREATE TABLE IF NOT EXISTS`` statements, making
    the script safe to re-run. Exits with status 1 on any failure.
    """
    print("Initializing database migration...")

    try:
        # Import only what we need to avoid loading heavy dependencies
        from flask import Flask
        from flask_sqlalchemy import SQLAlchemy
        from dotenv import load_dotenv

        # Load environment variables
        env_path = os.path.join(project_root, '.env')
        load_dotenv(env_path)

        # Create minimal Flask app
        app = Flask(__name__)

        # Get database URL from environment
        database_url = os.getenv('DATABASE_URL', 'sqlite:///learning_path.db')

        # Fix SQLite path if needed: a relative sqlite:/// path would resolve
        # against the current working directory, so anchor it to the project
        # root (sqlite://// with four slashes is already absolute).
        if database_url.startswith('sqlite:///') and not database_url.startswith('sqlite:////'):
            db_path = database_url.replace('sqlite:///', '')
            if not os.path.isabs(db_path):
                db_path = os.path.join(project_root, db_path)
            database_url = f'sqlite:///{db_path}'

        app.config['SQLALCHEMY_DATABASE_URI'] = database_url
        app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

        # Initialize SQLAlchemy
        db = SQLAlchemy(app)

        # Define the models directly here to avoid import issues.
        # NOTE(review): the DDL below is SQLite-flavored (AUTOINCREMENT,
        # boolean defaults 0/1) — confirm before running against PostgreSQL.
        with app.app_context():
            print(f"Using database: {database_url}")
            print("\nCreating chatbot tables...")

            # Execute raw SQL to create tables
            db.session.execute(db.text("""
                CREATE TABLE IF NOT EXISTS conversation_sessions (
                    id VARCHAR(36) PRIMARY KEY,
                    user_id INTEGER NOT NULL,
                    learning_path_id VARCHAR(36),
                    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                    last_activity_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                    ended_at DATETIME,
                    summary TEXT,
                    message_count INTEGER DEFAULT 0,
                    total_tokens_used INTEGER DEFAULT 0,
                    is_active BOOLEAN DEFAULT 1,
                    FOREIGN KEY (user_id) REFERENCES users(id),
                    FOREIGN KEY (learning_path_id) REFERENCES user_learning_paths(id)
                )
            """))

            db.session.execute(db.text("""
                CREATE TABLE IF NOT EXISTS chat_messages (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id INTEGER NOT NULL,
                    learning_path_id VARCHAR(36),
                    message TEXT NOT NULL,
                    role VARCHAR(20) NOT NULL,
                    intent VARCHAR(50),
                    entities TEXT,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    tokens_used INTEGER DEFAULT 0,
                    response_time_ms INTEGER,
                    session_id VARCHAR(36),
                    FOREIGN KEY (user_id) REFERENCES users(id),
                    FOREIGN KEY (learning_path_id) REFERENCES user_learning_paths(id),
                    FOREIGN KEY (session_id) REFERENCES conversation_sessions(id)
                )
            """))

            db.session.execute(db.text("""
                CREATE TABLE IF NOT EXISTS path_modifications (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    learning_path_id VARCHAR(36) NOT NULL,
                    user_id INTEGER NOT NULL,
                    chat_message_id INTEGER,
                    modification_type VARCHAR(50) NOT NULL,
                    target_path VARCHAR(200),
                    change_description TEXT NOT NULL,
                    old_value TEXT,
                    new_value TEXT,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    is_reverted BOOLEAN DEFAULT 0,
                    FOREIGN KEY (learning_path_id) REFERENCES user_learning_paths(id),
                    FOREIGN KEY (user_id) REFERENCES users(id),
                    FOREIGN KEY (chat_message_id) REFERENCES chat_messages(id)
                )
            """))

            db.session.commit()

            print("✅ Successfully created chatbot tables:")
            print("   - conversation_sessions")
            print("   - chat_messages")
            print("   - path_modifications")
            print("\n🎉 Your database is ready for the enhanced chatbot!")

    except Exception as e:
        print(f"❌ Error creating tables: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    run_migration()
migrations/add_conversation_memory.sql ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
-- Migration: Add Conversation Memory to ChatMessage Model
-- Date: 2025-01-02
-- Description: Adds conversation_id and context fields for memory-enabled chatbot
--
-- NOTE(review): COMMENT ON is PostgreSQL-only syntax, and the ALTER TABLE /
-- CREATE INDEX statements are not guarded with IF NOT EXISTS, so this script
-- is not safe to re-run. Confirm the target database engine before applying.

-- Add conversation_id column (groups related messages)
ALTER TABLE chat_messages ADD COLUMN conversation_id VARCHAR(36);

-- Add context column (stores learning path context as JSON)
ALTER TABLE chat_messages ADD COLUMN context JSON;

-- Create index on conversation_id for fast queries
CREATE INDEX idx_chat_messages_conversation_id ON chat_messages(conversation_id);

-- Update existing records to use session_id as conversation_id (backward compatibility)
UPDATE chat_messages SET conversation_id = session_id WHERE session_id IS NOT NULL;

-- Add comments
COMMENT ON COLUMN chat_messages.conversation_id IS 'Groups related messages in a conversation';
COMMENT ON COLUMN chat_messages.context IS 'Stores learning path state, progress, and milestone data';
migrations/add_resource_progress.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Migration script to add ResourceProgress table for persistent resource tracking.
3
+ Run this after updating models.py
4
+ """
5
+ import sys
6
+ import os
7
+
8
+ # Add parent directory to path
9
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10
+
11
+ from web_app import create_app, db
12
+ from web_app.models import ResourceProgress
13
+
14
def migrate():
    """Create the resource_progress table via db.create_all() and report the schema."""
    app = create_app()

    with app.app_context():
        print("Creating resource_progress table...")

        # create_all only adds tables that are missing, so this is re-runnable.
        db.create_all()

        print("✅ ResourceProgress table created successfully!")
        print("\nTable structure:")
        for column_line in (
            "- id (Primary Key)",
            "- user_id (Foreign Key -> users.id)",
            "- learning_path_id (Foreign Key -> user_learning_paths.id)",
            "- milestone_index (Integer)",
            "- resource_index (Integer)",
            "- resource_url (String)",
            "- completed (Boolean)",
            "- completed_at (DateTime)",
            "- created_at (DateTime)",
            "- updated_at (DateTime)",
        ):
            print(column_line)
        print("\n✨ Users can now track resource completion persistently!")


if __name__ == "__main__":
    migrate()
migrations/alembic.ini ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A generic, single database configuration.
2
+
3
+ [alembic]
4
+ # template used to generate migration files
5
+ # file_template = %%(rev)s_%%(slug)s
6
+
7
+ # set to 'true' to run the environment during
8
+ # the 'revision' command, regardless of autogenerate
9
+ # revision_environment = false
10
+
11
+
12
+ # Logging configuration
13
+ [loggers]
14
+ keys = root,sqlalchemy,alembic,flask_migrate
15
+
16
+ [handlers]
17
+ keys = console
18
+
19
+ [formatters]
20
+ keys = generic
21
+
22
+ [logger_root]
23
+ level = WARN
24
+ handlers = console
25
+ qualname =
26
+
27
+ [logger_sqlalchemy]
28
+ level = WARN
29
+ handlers =
30
+ qualname = sqlalchemy.engine
31
+
32
+ [logger_alembic]
33
+ level = INFO
34
+ handlers =
35
+ qualname = alembic
36
+
37
+ [logger_flask_migrate]
38
+ level = INFO
39
+ handlers =
40
+ qualname = flask_migrate
41
+
42
+ [handler_console]
43
+ class = StreamHandler
44
+ args = (sys.stderr,)
45
+ level = NOTSET
46
+ formatter = generic
47
+
48
+ [formatter_generic]
49
+ format = %(levelname)-5.5s [%(name)s] %(message)s
50
+ datefmt = %H:%M:%S
migrations/env.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from logging.config import fileConfig

from flask import current_app

from alembic import context

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging: installs the handlers and
# formatters declared in alembic.ini.
fileConfig(config.config_file_name)
logger = logging.getLogger('alembic.env')
16
+
17
+
18
def get_engine():
    """Return the SQLAlchemy engine registered with the Flask-Migrate extension."""
    migrate_db = current_app.extensions['migrate'].db
    try:
        # Flask-SQLAlchemy < 3 (and Alchemical) expose get_engine().
        return migrate_db.get_engine()
    except (TypeError, AttributeError):
        # Flask-SQLAlchemy >= 3 exposes the engine as a plain attribute.
        return migrate_db.engine
25
+
26
+
27
def get_engine_url():
    """Return the database URL with '%' doubled so configparser accepts it."""
    url = get_engine().url
    try:
        rendered = url.render_as_string(hide_password=False)
    except AttributeError:
        # Older SQLAlchemy URL objects lack render_as_string().
        rendered = str(url)
    return rendered.replace('%', '%%')
33
+
34
+
35
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
# Point Alembic at the Flask app's live database URL ('%' is escaped for
# configparser by get_engine_url()).
config.set_main_option('sqlalchemy.url', get_engine_url())
# The Flask-SQLAlchemy db object registered by Flask-Migrate.
target_db = current_app.extensions['migrate'].db

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
46
+
47
+
48
def get_metadata():
    """Return the MetaData that autogenerate should compare against."""
    if not hasattr(target_db, 'metadatas'):
        return target_db.metadata
    # Flask-SQLAlchemy 3.x keeps per-bind metadata; None is the default bind.
    return target_db.metadatas[None]
52
+
53
+
54
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    Configures the context with just a URL rather than an Engine, so no
    DBAPI needs to be available; context.execute() emits the SQL to the
    script output instead of touching a database.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=get_metadata(),
        literal_binds=True,
    )

    with context.begin_transaction():
        context.run_migrations()
73
+
74
+
75
def run_migrations_online():
    """Run migrations in 'online' mode.

    Creates an Engine and binds a live connection to the Alembic context.
    """

    def process_revision_directives(context, revision, directives):
        # Prevent an auto-migration from being generated when autogenerate
        # detects no schema changes (avoids empty revision files).
        # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
        if getattr(config.cmd_opts, 'autogenerate', False):
            script = directives[0]
            if script.upgrade_ops.is_empty():
                directives[:] = []
                logger.info('No changes in schema detected.')

    conf_args = current_app.extensions['migrate'].configure_args
    # Install the callback only when the app has not supplied its own.
    if conf_args.get("process_revision_directives") is None:
        conf_args["process_revision_directives"] = process_revision_directives

    with get_engine().connect() as connection:
        context.configure(
            connection=connection,
            target_metadata=get_metadata(),
            **conf_args
        )

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
migrations/script.py.mako ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """${message}
2
+
3
+ Revision ID: ${up_revision}
4
+ Revises: ${down_revision | comma,n}
5
+ Create Date: ${create_date}
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+ ${imports if imports else ""}
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = ${repr(up_revision)}
14
+ down_revision = ${repr(down_revision)}
15
+ branch_labels = ${repr(branch_labels)}
16
+ depends_on = ${repr(depends_on)}
17
+
18
+
19
+ def upgrade():
20
+ ${upgrades if upgrades else "pass"}
21
+
22
+
23
+ def downgrade():
24
+ ${downgrades if downgrades else "pass"}
migrations/versions/12d5dfb6fd16_sync_users_table.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """sync users table
2
+
3
+ Revision ID: 12d5dfb6fd16
4
+ Revises: 39d22a91999a
5
+ Create Date: 2025-10-01 21:25:39.871657
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = '12d5dfb6fd16'
14
+ down_revision = '39d22a91999a'
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
+ def upgrade():
20
+ # ### commands auto generated by Alembic - please adjust! ###
21
+ op.create_table('chat_messages',
22
+ sa.Column('id', sa.Integer(), nullable=False),
23
+ sa.Column('user_id', sa.Integer(), nullable=False),
24
+ sa.Column('learning_path_id', sa.String(length=36), nullable=True),
25
+ sa.Column('message', sa.Text(), nullable=False),
26
+ sa.Column('role', sa.String(length=20), nullable=False),
27
+ sa.Column('intent', sa.String(length=50), nullable=True),
28
+ sa.Column('entities', sa.JSON(), nullable=True),
29
+ sa.Column('timestamp', sa.DateTime(), nullable=True),
30
+ sa.Column('tokens_used', sa.Integer(), nullable=True),
31
+ sa.Column('response_time_ms', sa.Integer(), nullable=True),
32
+ sa.Column('session_id', sa.String(length=36), nullable=True),
33
+ sa.ForeignKeyConstraint(['learning_path_id'], ['user_learning_paths.id'], ),
34
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
35
+ sa.PrimaryKeyConstraint('id')
36
+ )
37
+ with op.batch_alter_table('chat_messages', schema=None) as batch_op:
38
+ batch_op.create_index(batch_op.f('ix_chat_messages_session_id'), ['session_id'], unique=False)
39
+ batch_op.create_index(batch_op.f('ix_chat_messages_timestamp'), ['timestamp'], unique=False)
40
+
41
+ op.create_table('conversation_sessions',
42
+ sa.Column('id', sa.String(length=36), nullable=False),
43
+ sa.Column('user_id', sa.Integer(), nullable=False),
44
+ sa.Column('learning_path_id', sa.String(length=36), nullable=True),
45
+ sa.Column('started_at', sa.DateTime(), nullable=True),
46
+ sa.Column('last_activity_at', sa.DateTime(), nullable=True),
47
+ sa.Column('ended_at', sa.DateTime(), nullable=True),
48
+ sa.Column('summary', sa.Text(), nullable=True),
49
+ sa.Column('message_count', sa.Integer(), nullable=True),
50
+ sa.Column('total_tokens_used', sa.Integer(), nullable=True),
51
+ sa.Column('is_active', sa.Boolean(), nullable=True),
52
+ sa.ForeignKeyConstraint(['learning_path_id'], ['user_learning_paths.id'], ),
53
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
54
+ sa.PrimaryKeyConstraint('id')
55
+ )
56
+ with op.batch_alter_table('conversation_sessions', schema=None) as batch_op:
57
+ batch_op.create_index(batch_op.f('ix_conversation_sessions_started_at'), ['started_at'], unique=False)
58
+
59
+ op.create_table('path_modifications',
60
+ sa.Column('id', sa.Integer(), nullable=False),
61
+ sa.Column('learning_path_id', sa.String(length=36), nullable=False),
62
+ sa.Column('user_id', sa.Integer(), nullable=False),
63
+ sa.Column('chat_message_id', sa.Integer(), nullable=True),
64
+ sa.Column('modification_type', sa.String(length=50), nullable=False),
65
+ sa.Column('target_path', sa.String(length=200), nullable=True),
66
+ sa.Column('change_description', sa.Text(), nullable=False),
67
+ sa.Column('old_value', sa.JSON(), nullable=True),
68
+ sa.Column('new_value', sa.JSON(), nullable=True),
69
+ sa.Column('timestamp', sa.DateTime(), nullable=True),
70
+ sa.Column('is_reverted', sa.Boolean(), nullable=True),
71
+ sa.ForeignKeyConstraint(['chat_message_id'], ['chat_messages.id'], ),
72
+ sa.ForeignKeyConstraint(['learning_path_id'], ['user_learning_paths.id'], ),
73
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
74
+ sa.PrimaryKeyConstraint('id')
75
+ )
76
+ with op.batch_alter_table('path_modifications', schema=None) as batch_op:
77
+ batch_op.create_index(batch_op.f('ix_path_modifications_timestamp'), ['timestamp'], unique=False)
78
+
79
+ with op.batch_alter_table('users', schema=None) as batch_op:
80
+ batch_op.add_column(sa.Column('last_seen', sa.DateTime(), nullable=True))
81
+ batch_op.add_column(sa.Column('registration_source', sa.String(length=20), nullable=True))
82
+ batch_op.add_column(sa.Column('login_count', sa.Integer(), nullable=True))
83
+ batch_op.add_column(sa.Column('display_name', sa.String(length=100), nullable=True))
84
+ batch_op.add_column(sa.Column('bio', sa.Text(), nullable=True))
85
+
86
+ # ### end Alembic commands ###
87
+
88
+
89
+ def downgrade():
90
+ # ### commands auto generated by Alembic - please adjust! ###
91
+ with op.batch_alter_table('users', schema=None) as batch_op:
92
+ batch_op.drop_column('bio')
93
+ batch_op.drop_column('display_name')
94
+ batch_op.drop_column('login_count')
95
+ batch_op.drop_column('registration_source')
96
+ batch_op.drop_column('last_seen')
97
+
98
+ with op.batch_alter_table('path_modifications', schema=None) as batch_op:
99
+ batch_op.drop_index(batch_op.f('ix_path_modifications_timestamp'))
100
+
101
+ op.drop_table('path_modifications')
102
+ with op.batch_alter_table('conversation_sessions', schema=None) as batch_op:
103
+ batch_op.drop_index(batch_op.f('ix_conversation_sessions_started_at'))
104
+
105
+ op.drop_table('conversation_sessions')
106
+ with op.batch_alter_table('chat_messages', schema=None) as batch_op:
107
+ batch_op.drop_index(batch_op.f('ix_chat_messages_timestamp'))
108
+ batch_op.drop_index(batch_op.f('ix_chat_messages_session_id'))
109
+
110
+ op.drop_table('chat_messages')
111
+ # ### end Alembic commands ###
migrations/versions/39d22a91999a_initial_migration.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Initial migration
2
+
3
+ Revision ID: 39d22a91999a
4
+ Revises:
5
+ Create Date: 2025-06-03 11:10:55.881578
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = '39d22a91999a'
14
+ down_revision = None
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
+ def upgrade():
20
+ # ### commands auto generated by Alembic - please adjust! ###
21
+ op.create_table('users',
22
+ sa.Column('id', sa.Integer(), nullable=False),
23
+ sa.Column('username', sa.String(length=64), nullable=False),
24
+ sa.Column('email', sa.String(length=120), nullable=False),
25
+ sa.Column('password_hash', sa.String(length=256), nullable=True),
26
+ sa.Column('created_at', sa.DateTime(), nullable=True),
27
+ sa.PrimaryKeyConstraint('id')
28
+ )
29
+ with op.batch_alter_table('users', schema=None) as batch_op:
30
+ batch_op.create_index(batch_op.f('ix_users_email'), ['email'], unique=True)
31
+ batch_op.create_index(batch_op.f('ix_users_username'), ['username'], unique=True)
32
+
33
+ op.create_table('user_learning_paths',
34
+ sa.Column('id', sa.String(length=36), nullable=False),
35
+ sa.Column('user_id', sa.Integer(), nullable=False),
36
+ sa.Column('path_data_json', sa.JSON(), nullable=False),
37
+ sa.Column('title', sa.String(length=200), nullable=True),
38
+ sa.Column('topic', sa.String(length=100), nullable=True),
39
+ sa.Column('created_at', sa.DateTime(), nullable=True),
40
+ sa.Column('last_accessed_at', sa.DateTime(), nullable=True),
41
+ sa.Column('is_archived', sa.Boolean(), nullable=True),
42
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
43
+ sa.PrimaryKeyConstraint('id')
44
+ )
45
+ with op.batch_alter_table('user_learning_paths', schema=None) as batch_op:
46
+ batch_op.create_index(batch_op.f('ix_user_learning_paths_created_at'), ['created_at'], unique=False)
47
+
48
+ op.create_table('learning_progress',
49
+ sa.Column('id', sa.Integer(), nullable=False),
50
+ sa.Column('user_learning_path_id', sa.String(length=36), nullable=False),
51
+ sa.Column('milestone_identifier', sa.String(length=200), nullable=False),
52
+ sa.Column('status', sa.String(length=50), nullable=True),
53
+ sa.Column('started_at', sa.DateTime(), nullable=True),
54
+ sa.Column('completed_at', sa.DateTime(), nullable=True),
55
+ sa.Column('notes', sa.Text(), nullable=True),
56
+ sa.ForeignKeyConstraint(['user_learning_path_id'], ['user_learning_paths.id'], ),
57
+ sa.PrimaryKeyConstraint('id'),
58
+ sa.UniqueConstraint('user_learning_path_id', 'milestone_identifier', name='_user_path_milestone_uc')
59
+ )
60
+ # ### end Alembic commands ###
61
+
62
+
63
+ def downgrade():
64
+ # ### commands auto generated by Alembic - please adjust! ###
65
+ op.drop_table('learning_progress')
66
+ with op.batch_alter_table('user_learning_paths', schema=None) as batch_op:
67
+ batch_op.drop_index(batch_op.f('ix_user_learning_paths_created_at'))
68
+
69
+ op.drop_table('user_learning_paths')
70
+ with op.batch_alter_table('users', schema=None) as batch_op:
71
+ batch_op.drop_index(batch_op.f('ix_users_username'))
72
+ batch_op.drop_index(batch_op.f('ix_users_email'))
73
+
74
+ op.drop_table('users')
75
+ # ### end Alembic commands ###
migrations/versions/6b20f44f6a00_make_oauth_user_id_nullable.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Make OAuth user_id nullable
2
+
3
+ Revision ID: 6b20f44f6a00
4
+ Revises: 9f32f1920608
5
+ Create Date: 2025-10-05 13:02:17.393003
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = '6b20f44f6a00'
14
+ down_revision = '9f32f1920608'
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
+ def upgrade():
20
+ # ### commands auto generated by Alembic - please adjust! ###
21
+ with op.batch_alter_table('flask_dance_oauth', schema=None) as batch_op:
22
+ batch_op.alter_column('user_id',
23
+ existing_type=sa.INTEGER(),
24
+ nullable=True)
25
+
26
+ # ### end Alembic commands ###
27
+
28
+
29
+ def downgrade():
30
+ # ### commands auto generated by Alembic - please adjust! ###
31
+ with op.batch_alter_table('flask_dance_oauth', schema=None) as batch_op:
32
+ batch_op.alter_column('user_id',
33
+ existing_type=sa.INTEGER(),
34
+ nullable=False)
35
+
36
+ # ### end Alembic commands ###
migrations/versions/9f32f1920608_add_oauth_table_for_flask_dance.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Add OAuth table for Flask-Dance
2
+
3
+ Revision ID: 9f32f1920608
4
+ Revises: 12d5dfb6fd16
5
+ Create Date: 2025-10-05 12:30:20.870839
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+
11
+
12
+ # revision identifiers, used by Alembic.
13
+ revision = '9f32f1920608'
14
+ down_revision = '12d5dfb6fd16'
15
+ branch_labels = None
16
+ depends_on = None
17
+
18
+
19
+ def upgrade():
20
+ # ### commands auto generated by Alembic - please adjust! ###
21
+ op.create_table('flask_dance_oauth',
22
+ sa.Column('user_id', sa.Integer(), nullable=False),
23
+ sa.Column('id', sa.Integer(), nullable=False),
24
+ sa.Column('provider', sa.String(length=50), nullable=False),
25
+ sa.Column('created_at', sa.DateTime(), nullable=False),
26
+ sa.Column('token', sa.JSON(), nullable=False),
27
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
28
+ sa.PrimaryKeyConstraint('id')
29
+ )
30
+ with op.batch_alter_table('chat_messages', schema=None) as batch_op:
31
+ batch_op.add_column(sa.Column('conversation_id', sa.String(length=36), nullable=True))
32
+ batch_op.add_column(sa.Column('context', sa.JSON(), nullable=True))
33
+ batch_op.create_index(batch_op.f('ix_chat_messages_conversation_id'), ['conversation_id'], unique=False)
34
+
35
+ # ### end Alembic commands ###
36
+
37
+
38
+ def downgrade():
39
+ # ### commands auto generated by Alembic - please adjust! ###
40
+ with op.batch_alter_table('chat_messages', schema=None) as batch_op:
41
+ batch_op.drop_index(batch_op.f('ix_chat_messages_conversation_id'))
42
+ batch_op.drop_column('context')
43
+ batch_op.drop_column('conversation_id')
44
+
45
+ op.drop_table('flask_dance_oauth')
46
+ # ### end Alembic commands ###
migrations/versions/a1b2c3d4e5f6_add_progress_tracking_tables.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """add progress tracking tables
2
+
3
+ Revision ID: a1b2c3d4e5f6
4
+ Revises: 6b20f44f6a00
5
+ Create Date: 2025-10-14 01:30:00.000000
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+ from sqlalchemy import inspect
11
+
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision = 'a1b2c3d4e5f6'
15
+ down_revision = '6b20f44f6a00'
16
+ branch_labels = None
17
+ depends_on = None
18
+
19
+
20
+ def upgrade():
21
+ # ### commands auto generated by Alembic - please adjust! ###
22
+
23
+ # Get the database connection to check for existing columns
24
+ bind = op.get_bind()
25
+ inspector = inspect(bind)
26
+ columns = [col['name'] for col in inspector.get_columns('chat_messages')]
27
+
28
+ # Add missing fields to chat_messages table only if they don't exist
29
+ with op.batch_alter_table('chat_messages', schema=None) as batch_op:
30
+ if 'conversation_id' not in columns:
31
+ batch_op.add_column(sa.Column('conversation_id', sa.String(length=36), nullable=True))
32
+ batch_op.create_index(batch_op.f('ix_chat_messages_conversation_id'), ['conversation_id'], unique=False)
33
+
34
+ if 'context' not in columns:
35
+ batch_op.add_column(sa.Column('context', sa.JSON(), nullable=True))
36
+
37
+ # Create resource_progress table
38
+ op.create_table('resource_progress',
39
+ sa.Column('id', sa.Integer(), nullable=False),
40
+ sa.Column('user_id', sa.Integer(), nullable=False),
41
+ sa.Column('learning_path_id', sa.String(length=36), nullable=False),
42
+ sa.Column('milestone_index', sa.Integer(), nullable=False),
43
+ sa.Column('resource_index', sa.Integer(), nullable=False),
44
+ sa.Column('resource_url', sa.String(length=500), nullable=False),
45
+ sa.Column('completed', sa.Boolean(), nullable=True),
46
+ sa.Column('completed_at', sa.DateTime(), nullable=True),
47
+ sa.Column('created_at', sa.DateTime(), nullable=True),
48
+ sa.Column('updated_at', sa.DateTime(), nullable=True),
49
+ sa.ForeignKeyConstraint(['learning_path_id'], ['user_learning_paths.id'], ),
50
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
51
+ sa.PrimaryKeyConstraint('id'),
52
+ sa.UniqueConstraint('user_id', 'learning_path_id', 'milestone_index', 'resource_index',
53
+ name='_user_path_milestone_resource_uc')
54
+ )
55
+
56
+ # Create milestone_progress table
57
+ op.create_table('milestone_progress',
58
+ sa.Column('id', sa.Integer(), nullable=False),
59
+ sa.Column('user_id', sa.Integer(), nullable=False),
60
+ sa.Column('learning_path_id', sa.String(length=36), nullable=False),
61
+ sa.Column('milestone_index', sa.Integer(), nullable=False),
62
+ sa.Column('completed', sa.Boolean(), nullable=True),
63
+ sa.Column('completed_at', sa.DateTime(), nullable=True),
64
+ sa.Column('created_at', sa.DateTime(), nullable=True),
65
+ sa.Column('updated_at', sa.DateTime(), nullable=True),
66
+ sa.ForeignKeyConstraint(['learning_path_id'], ['user_learning_paths.id'], ),
67
+ sa.ForeignKeyConstraint(['user_id'], ['users.id'], ),
68
+ sa.PrimaryKeyConstraint('id'),
69
+ sa.UniqueConstraint('user_id', 'learning_path_id', 'milestone_index',
70
+ name='_user_path_milestone_uc')
71
+ )
72
+
73
+ # ### end Alembic commands ###
74
+
75
+
76
+ def downgrade():
77
+ # ### commands auto generated by Alembic - please adjust! ###
78
+ op.drop_table('milestone_progress')
79
+ op.drop_table('resource_progress')
80
+
81
+ # Remove added fields from chat_messages table
82
+ with op.batch_alter_table('chat_messages', schema=None) as batch_op:
83
+ batch_op.drop_index(batch_op.f('ix_chat_messages_conversation_id'))
84
+ batch_op.drop_column('context')
85
+ batch_op.drop_column('conversation_id')
86
+ # ### end Alembic commands ###
minimal_test.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Minimal environment sanity check: Python setup, env vars and key imports."""
import os
import sys
from dotenv import load_dotenv

# Add the project root to sys.path so project modules are importable
# when this script is run directly.
# (Fixed: os.path.join with a single argument was a no-op wrapper.)
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, PROJECT_ROOT)

# Load environment variables from .env, if present.
load_dotenv()

print("=== Starting Minimal Test ===")

# Test basic Python environment
print("Python version:", sys.version)
print("Current working directory:", os.getcwd())
print("Project root:", PROJECT_ROOT)

# Test environment variables (never print the full key — only a prefix).
api_key = os.getenv("OPENAI_API_KEY")
print("OPENAI_API_KEY exists:", bool(api_key))
if api_key:
    print("API key starts with:", api_key[:5] + "...")

# Test basic imports; each failure is reported but does not stop the script.
try:
    import pydantic
    print(f"Pydantic version: {pydantic.__version__}")
except ImportError as e:
    print(f"Pydantic import error: {e}")

try:
    from langchain_openai import OpenAI
    print("Successfully imported langchain_openai")
except ImportError as e:
    print(f"langchain_openai import error: {e}")

print("=== Test Completed ===")
requirements.txt ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ gunicorn>=21.2.0
3
+ python-dotenv==1.0.1
4
+ Flask>=2.0.1
5
+ Flask-Cors>=4.0.0
6
+ requests>=2.31.0
7
+
8
+ # Pydantic v1 - MUST be installed first and locked
9
+ pydantic==1.10.18
10
+ email-validator==2.1.0.post1
11
+
12
+ # LangChain - using older versions compatible with Pydantic v1
13
+ langchain==0.0.267
14
+ openai>=1.0.0 # Using new OpenAI API
15
+ tiktoken>=0.5.0 # Token counting for cost optimization
16
+ Flask-SQLAlchemy==3.1.1
17
+ psycopg2-binary>=2.9.9 # Postgres driver for production
18
+ Flask-Login==0.6.3
19
+ Flask-WTF==1.2.1
20
+ Flask-Migrate==4.0.7
21
+
22
+ # Document processing
23
+ unstructured==0.10.30 # Using base package without all-docs to avoid complex deps
24
+ pypandoc>=1.11
25
+ python-magic>=0.4.27; sys_platform != 'win32'
26
+ onnxruntime>=1.20.0 # Explicitly specify a compatible version
27
+
28
+ # RAG and embeddings
29
+ faiss-cpu>=1.7.4
30
+
31
+ # Vector database
32
+ chromadb==0.3.29 # Last version compatible with Pydantic v1
33
+
34
+ # ML & NLP
35
+ sentence-transformers>=2.2.2
36
+ scikit-learn>=1.2.2
37
+ numpy>=1.24.0
38
+ pandas>=2.0.0
39
+
40
+ Flask-Dance[google]==7.1.0
41
+
42
+ # Web UI (note: Flask-WTF is already pinned to 1.2.1 above, so the looser flask-wtf spec below is redundant)
43
+ flask-wtf>=1.0.0
44
+ WTForms>=3.1.0
45
+ Jinja2>=3.0.1
46
+ werkzeug>=2.0.1
47
+
48
+ # HTTP client for async resource validation
49
+ aiohttp>=3.9.0
50
+
51
+ # Caching
52
+ redis>=4.6.0,<5.0.0
53
+ rq==1.16.1
54
+ celery>=5.3.0
55
+
56
+ # Search & Reranking
57
+ rank-bm25>=0.2.2
58
+ cohere>=4.0.0 # Optional: for Cohere reranking
59
+
60
+ # Observability & Monitoring
61
+ langsmith<0.1.0,>=0.0.21 # LLM tracing and debugging (compatible with langchain 0.0.267)
62
+ wandb>=0.16.0 # Experiment tracking and metrics
63
+
64
+ # Development
65
+ pytest>=7.0.0
66
+ pytest-asyncio>=0.21.0 # For async test support
67
+
68
+ # Platform-specific dependencies
69
+ --find-links https://download.pytorch.org/whl/torch_stable.html
70
+
71
+ # Build tools
72
+ setuptools>=65.0.0
73
+
74
+ google-auth>=2.0.0
75
+ google-auth-httplib2>=0.2.0
76
+ google-auth-oauthlib>=0.4.6
run.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script handles the setup and execution of the web application.
3
+ """
4
+ from pathlib import Path
5
+ import shutil
6
+ from dotenv import load_dotenv
7
+ from web_app import create_app
8
+ from backend.routes import api_bp
9
+ import os
10
+ # Fix protobuf compatibility issue with transformers
11
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
12
+
13
+ print("--- run.py started ---")
14
+
15
+
16
+ # Load environment variables
17
+ env_path = Path('.env')
18
+ env_example_path = Path('.env.example')
19
+
20
+ # If .env doesn't exist, create it from example
21
+ if not env_path.exists() and env_example_path.exists():
22
+ shutil.copy(env_example_path, env_path)
23
+ print("Created .env file from .env.example. Please update your API keys before proceeding.")
24
+
25
+ # Load environment vars
26
+ load_dotenv()
27
+ print("--- dotenv loaded ---")
28
+
29
+ # Check if required API keys are set based on provider
30
+ provider = os.getenv("DEFAULT_PROVIDER", "openai").lower()
31
+ if provider == "openai" and not os.getenv("OPENAI_API_KEY"):
32
+ print("WARNING: OPENAI_API_KEY not found in environment variables.")
33
+ print("Please set your API key in the .env file before running the application.")
34
+ exit(1)
35
+ elif provider == "deepseek" and not os.getenv("DEEPSEEK_API_KEY"):
36
+ print("WARNING: DEEPSEEK_API_KEY not found in environment variables.")
37
+ print("Please set your API key in the .env file before running the application.")
38
+ exit(1)
39
+ elif provider == "openrouter":
40
+ print("✅ Using OpenRouter with free models (no API key required)")
41
+
42
+ # Create necessary directories
43
+ os.makedirs("vector_db", exist_ok=True)
44
+ os.makedirs("learning_paths", exist_ok=True)
45
+ print("--- API key checked and dirs created ---")
46
+
47
+ # Import and run Flask app
48
+
49
+ app = create_app()
50
+
51
+ # Register the API blueprint for RQ task orchestration under /api
52
+ app.register_blueprint(api_bp, url_prefix='/api')
53
+
54
+ print("--- Flask app created via factory ---")
55
+
56
# Pre-warm the model orchestrator to avoid cold start delays
def prewarm_models():
    """Pre-initialize models to avoid cold start on first request."""
    try:
        print("🔥 Pre-warming AI models (this may take a moment on first run)...")
        from src.ml.model_orchestrator import ModelOrchestrator
        # Constructing the orchestrator *is* the warm-up: the instance is
        # intentionally discarded after initialization loads the models.
        orchestrator = ModelOrchestrator()
        # Make a simple test call to ensure the model is fully loaded
        print("✅ AI models pre-warmed successfully!")
    except Exception as e:
        # Non-fatal: models will lazily initialize on the first request instead.
        print(f"⚠️ Model pre-warming failed (will initialize on first request): {e}")
67
+
68
+ if __name__ == "__main__":
69
+ port = int(os.getenv("PORT", 5000))
70
+ # Disable debug mode to prevent auto-reloading issues
71
+ debug = False
72
+
73
+ # Pre-warm models before starting server
74
+ prewarm_models()
75
+
76
+ print(f"Starting AI Learning Path Generator on port {port}")
77
+ print("Visit http://localhost:5000 in your browser")
78
+
79
+ app.run(host="0.0.0.0", port=port, debug=debug, use_reloader=False)
run_flask.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Development entry point: creates the Flask app and runs it with the
debugger enabled on port 5000."""
import os
import sys

# Add the project root to sys.path so `web_app` resolves when run directly.
# (Fixed: duplicate `import os` removed; single-arg os.path.join simplified.)
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, PROJECT_ROOT)

# Import the app factory
from web_app import create_app

# Set DEV_MODE=True in .env to bypass API key checks
DEV_MODE = os.environ.get('DEV_MODE', 'False').lower() == 'true'

if DEV_MODE:
    print("\033[93m⚠️ Running in DEV_MODE - API calls will be stubbed!\033[0m")
    os.environ['FLASK_ENV'] = 'development'

app = create_app()

if __name__ == "__main__":
    print("Starting Flask application...")
    app.run(debug=True, port=5000)
setup.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Setup script for the AI Learning Path Generator.
3
+ """
4
+ from setuptools import setup, find_packages
5
+
6
+ setup(
7
+ name="ai-learning-path-generator",
8
+ version="1.0.0",
9
+ description="An intelligent system that generates personalized learning paths using AI",
10
+ author="AI Learning Path Generator Team",
11
+ packages=find_packages(),
12
+ install_requires=[
13
+ "python-dotenv>=1.0.0",
14
+ "Flask>=2.0.1",
15
+ "langchain>=0.0.267",
16
+ "langchain-openai>=0.0.1",
17
+ "openai>=1.0.0",
18
+ "chromadb>=0.4.13",
19
+ "sentence-transformers>=2.2.2",
20
+ "scikit-learn>=1.2.2",
21
+ "numpy>=1.24.0",
22
+ "pandas>=2.0.0",
23
+ "flask-wtf>=1.0.0",
24
+ "Jinja2>=3.0.1",
25
+ "werkzeug>=2.0.1",
26
+ ],
27
+ python_requires=">=3.8",
28
+ )
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/agent.py ADDED
@@ -0,0 +1,577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI agent implementation for the Learning Path Generator.
3
+ Handles complex interactions and orchestrates the learning path generation process.
4
+ """
5
+ print("--- src/agent.py execution started ---")
6
+ from typing import Dict, List, Any, Optional, Tuple
7
+ import json
8
+ import datetime
9
+ from pathlib import Path
10
+ print("--- src/agent.py initial imports done ---")
11
+ from src.learning_path import LearningPathGenerator, LearningPath
12
+ print("--- src/agent.py learning_path imported ---")
13
+ from src.ml.model_orchestrator import ModelOrchestrator
14
+ from src.data.vector_store import VectorStore
15
+ from src.data.document_store import DocumentStore
16
+ from src.utils.config import (
17
+ LEARNING_STYLES,
18
+ EXPERTISE_LEVELS,
19
+ TIME_COMMITMENTS,
20
+ )
21
class LearningAgent:
    """
    AI agent that orchestrates the learning path generation process.

    Wraps a LearningPathGenerator, a ModelOrchestrator and RAG components
    (vector store + document store) behind a single ``process_request``
    entry point, and keeps lightweight per-session state (current path,
    user profile, interaction history, accumulated context).
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the learning agent with RAG capabilities.

        Args:
            api_key: Optional OpenAI API key
        """
        print("--- LearningAgent.__init__ started ---")
        self.api_key = api_key
        self.path_generator = LearningPathGenerator(api_key)
        self.model_orchestrator = ModelOrchestrator(api_key)
        self.document_store = DocumentStore()
        self.vector_store = VectorStore(api_key)
        print("--- LearningAgent.__init__: All components initialized ---")

        # Track agent state
        self.current_path = None       # most recently generated/modified path
        self.user_profile = {}         # preferences inferred from requests/context
        self.session_history = []      # interaction log, capped at 100 entries
        self.context = []              # accumulated RAG context strings
        self.goal = None               # current high-level goal
        self.planning_enabled = True   # run _plan_next_steps on each request

        # Load initial documents for RAG
        print("--- LearningAgent.__init__: Calling _load_initial_knowledge ---")
        self._load_initial_knowledge()
        print("--- LearningAgent.__init__ finished ---")

    def _load_initial_knowledge(self):
        """
        Load initial knowledge documents into the vector store, creating the
        vector_db directory structure and a seed document when needed.
        """
        print("--- LearningAgent._load_initial_knowledge started ---")
        # Create vector store directory if it doesn't exist
        vector_db_path = Path("vector_db")
        documents_dir = vector_db_path / "documents"

        if not vector_db_path.exists():
            vector_db_path.mkdir(parents=True)
            print(f"Created vector store directory at {vector_db_path}")

        if not documents_dir.exists():
            documents_dir.mkdir()
            print(f"Created documents directory at {documents_dir}")

        # Load documents if they exist; failures are non-fatal.
        if documents_dir.exists():
            try:
                print(f"Loading documents from {documents_dir}...")
                self.vector_store.load_documents(str(documents_dir))
                print(f"Documents loaded successfully from {documents_dir}.")
            except Exception as e:
                print(f"Warning: Failed to load documents: {str(e)}")
        else:
            print(f"Warning: Documents directory not found at {documents_dir}")

        # Initialize vector store if it doesn't exist (no FAISS index yet).
        if not (vector_db_path / "index.faiss").exists():
            try:
                # Create a dummy document to initialize the vector store
                with open(documents_dir / "dummy.txt", "w") as f:
                    f.write("This is a dummy document to initialize the vector store.")
                self.vector_store.load_documents(str(documents_dir))
                print("Vector store initialized successfully")
            except Exception as e:
                print(f"Warning: Failed to initialize vector store: {str(e)}")

    def process_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a user request and generate an appropriate response with RAG
        and agentic behavior.

        Dispatches on ``request["type"]`` to the matching handler
        (generate_path / modify_path / ask_question / get_resources).

        Args:
            request: Dictionary containing user request data

        Returns:
            Response dictionary with ``success``, ``message`` and ``data``.
        """
        # Get the AI provider from the request (if specified)
        ai_provider = request.get("ai_provider")

        # Create provider-specific instances if provider is specified;
        # otherwise reuse the shared components.
        if ai_provider:
            provider_orchestrator = ModelOrchestrator(self.api_key, provider=ai_provider)
            provider_path_generator = LearningPathGenerator(self.api_key)
            provider_path_generator.model_orchestrator = provider_orchestrator

            use_orchestrator = provider_orchestrator
            use_path_generator = provider_path_generator
        else:
            use_orchestrator = self.model_orchestrator
            use_path_generator = self.path_generator

        # Get relevant context using RAG
        query = request.get("query", "")
        if query:
            relevant_docs = self.vector_store.search(query)
            context = [doc["content"] for doc in relevant_docs]
            self.context.extend(context)

            # Update user profile with preferences from context
            self._update_user_profile(context)

        # Plan if planning is enabled
        if self.planning_enabled:
            self._plan_next_steps(request)

        # Process the request based on its type
        request_type = request.get("type", "generate_path")

        # Add accumulated context to the request so handlers can use it
        request["context"] = self.context

        if request_type == "generate_path":
            return self._handle_path_generation(request, use_path_generator)
        elif request_type == "modify_path":
            return self._handle_path_modification(request, use_path_generator)
        elif request_type == "ask_question":
            return self._handle_question(request, use_orchestrator)
        elif request_type == "get_resources":
            return self._handle_resource_request(request, use_orchestrator)
        else:
            return {
                "success": False,
                "message": f"Unknown request type: {request_type}",
                "data": None
            }

    def _handle_path_generation(self, request: Dict[str, Any], path_generator=None) -> Dict[str, Any]:
        """
        Handle a request to generate a new learning path with RAG and agentic
        behavior.

        Args:
            request: Dictionary with path generation parameters
            path_generator: Optional custom path generator

        Returns:
            Response with the generated path or error
        """
        try:
            # Extract request parameters
            topic = request.get("topic")
            expertise_level = request.get("expertise_level", "beginner")
            learning_style = request.get("learning_style", "visual")
            time_commitment = request.get("time_commitment", "moderate")
            goals = request.get("goals", [])
            additional_info = request.get("additional_info")

            # Validate required parameters
            if not topic:
                return {
                    "success": False,
                    "message": "Topic is required",
                    "data": None
                }

            # Use the provided path generator or fall back to the default
            current_generator = path_generator or self.path_generator

            # Get relevant context using RAG
            relevant_docs = self.vector_store.search(topic)
            context = [doc["content"] for doc in relevant_docs] if relevant_docs else []

            # Add any context from the request
            if request.get("context"):
                context.extend(request.get("context"))

            # Generate the learning path with context
            learning_path = current_generator.generate_path(
                topic=topic,
                expertise_level=expertise_level,
                learning_style=learning_style,
                time_commitment=time_commitment,
                goals=goals,
                additional_info=additional_info,
                context=context
            )

            # Save the generated path (on by default)
            if request.get("save_path", True):
                path_file = current_generator.save_path(learning_path)

            # Update agent state
            self.current_path = learning_path
            self.user_profile.update({
                "last_topic": topic,
                "expertise_level": expertise_level,
                "learning_style": learning_style,
                "time_commitment": time_commitment
            })

            # Log the interaction
            self._log_interaction("generate_path", request, {"path_id": learning_path.id})

            return {
                "success": True,
                "message": f"Successfully generated learning path for {topic}",
                "data": learning_path.dict()
            }

        except ValueError as e:
            return {
                "success": False,
                "message": str(e),
                "data": None
            }
        except Exception as e:
            return {
                "success": False,
                "message": f"Error generating learning path: {str(e)}",
                "data": None
            }

    def _handle_path_modification(self, request: Dict[str, Any], path_generator=None) -> Dict[str, Any]:
        """
        Handle a request to modify an existing learning path.

        Args:
            request: Dictionary with modification parameters
            path_generator: Optional custom path generator

        Returns:
            Response with the modified path or error
        """
        try:
            # Extract request parameters
            path_id = request.get("path_id")
            modifications = request.get("modifications", {})

            # Validate required parameters
            if not path_id:
                return {
                    "success": False,
                    "message": "Path ID is required",
                    "data": None
                }

            if not modifications:
                return {
                    "success": False,
                    "message": "No modifications specified",
                    "data": None
                }

            # Use the provided path generator or fall back to the default
            current_generator = path_generator or self.path_generator

            # Load the existing path
            learning_path = current_generator.load_path(path_id)
            if not learning_path:
                return {
                    "success": False,
                    "message": f"Learning path with ID {path_id} not found",
                    "data": None
                }

            # Apply modifications; id and created_at are immutable.
            path_data = learning_path.dict()
            for key, value in modifications.items():
                if key in path_data and key not in ["id", "created_at"]:
                    path_data[key] = value

            # Create a new path with the modifications
            modified_path = LearningPath(**path_data)

            # Save the modified path
            if request.get("save_path", True):
                path_file = current_generator.save_path(modified_path)

            # Update agent state
            self.current_path = modified_path

            # Log the interaction
            self._log_interaction("modify_path", request, {"path_id": modified_path.id})

            return {
                "success": True,
                "message": f"Successfully modified learning path {path_id}",
                "data": modified_path.dict()
            }

        except Exception as e:
            return {
                "success": False,
                "message": f"Error modifying learning path: {str(e)}",
                "data": None
            }

    def _handle_question(self, request: Dict[str, Any], orchestrator=None) -> Dict[str, Any]:
        """
        Handle a question with RAG and agentic behavior.

        Args:
            request: Dictionary containing question data
            orchestrator: Optional custom model orchestrator

        Returns:
            Response with the answer or error
        """
        try:
            # Extract request parameters
            question = request.get("question")

            # Handle context properly (could be a list or dict)
            context = request.get("context", [])
            path_id = None
            # Fixed: initialize learning_path up front instead of probing
            # `'learning_path' in locals()` further down.
            learning_path = None

            # If context is a dictionary, extract path_id
            if isinstance(context, dict):
                path_id = context.get("path_id")
            # If it's a list or other type, just use it as context data

            # Validate required parameters
            if not question:
                return {
                    "success": False,
                    "message": "Question is required",
                    "data": None
                }

            # Prepare context for the model
            context_data = []

            # Add learning path context if available
            if path_id:
                learning_path = self.path_generator.load_path(path_id)
                if learning_path:
                    context_data.append(f"Learning Path: {learning_path.title}")
                    context_data.append(f"Description: {learning_path.description}")
                    context_data.append(f"Topic: {learning_path.topic}")
                    context_data.append(f"Expertise Level: {learning_path.expertise_level}")

                    # Add milestone information
                    for i, milestone in enumerate(learning_path.milestones):
                        context_data.append(f"Milestone {i+1}: {milestone.title}")
                        context_data.append(f"  Description: {milestone.description}")

            # Search for relevant documents; prefer an explicit topic from
            # the context dict, else fall back to the loaded path's topic.
            topic = None
            if isinstance(context, dict):
                topic = context.get("topic")
            elif learning_path:
                topic = learning_path.topic
            if topic:
                docs = self.document_store.search_documents(
                    query=question,
                    filters={"topic": topic} if topic else None,
                    top_k=3
                )
                for doc in docs:
                    context_data.append(doc.page_content)

            # Get relevant context using RAG (best-effort).
            try:
                relevant_docs = self.vector_store.search(question)
                if relevant_docs:
                    for doc in relevant_docs:
                        context_data.append(doc["content"])
            except Exception as e:
                print(f"Warning: Error searching vector store: {str(e)}")

            # Use the provided model orchestrator or fall back to the default
            current_orchestrator = orchestrator or self.model_orchestrator

            # Generate the answer with RAG context
            answer = current_orchestrator.generate_answer(
                question=question,
                context=context_data if context_data else None
            )

            # Log the interaction
            self._log_interaction("ask_question", request, {"answer_length": len(answer)})

            return {
                "success": True,
                "message": "Successfully answered question",
                "data": {
                    "question": question,
                    "answer": answer
                }
            }

        except Exception as e:
            return {
                "success": False,
                "message": f"Error answering question: {str(e)}",
                "data": None
            }

    def _plan_next_steps(self, request: Dict[str, Any]) -> None:
        """
        Plan the next steps based on the current request and agent state.

        Sets a session goal if none exists and records topic/preference
        strings in the running context.

        Args:
            request: The current request being processed
        """
        request_type = request.get("type", "generate_path")
        topic = request.get("topic", "")

        # Set a goal if none exists
        if not self.goal:
            if request_type == "generate_path":
                self.goal = f"Create a comprehensive learning path for {topic}"
            elif request_type == "modify_path":
                self.goal = "Refine the learning path based on user feedback"
            elif request_type == "ask_question":
                self.goal = f"Answer the user's question about {topic}"
            else:
                self.goal = "Assist the user with their learning journey"

        # Update context with relevant information
        if topic and topic not in self.context:
            self.context.append(f"Current topic: {topic}")

        # Track user preferences
        expertise_level = request.get("expertise_level")
        if expertise_level:
            self.context.append(f"User expertise level: {expertise_level}")

        learning_style = request.get("learning_style")
        if learning_style:
            self.context.append(f"User learning style: {learning_style}")

    def _update_user_profile(self, context: List[str]) -> None:
        """
        Update the user profile based on context.

        Parses "key: value" strings produced by _plan_next_steps and stores
        the values under expertise_level / learning_style / interests.

        Args:
            context: List of context strings
        """
        # Extract preferences from context
        for item in context:
            if "expertise level" in item.lower():
                parts = item.split(":", 1)
                if len(parts) > 1:
                    self.user_profile["expertise_level"] = parts[1].strip()
            elif "learning style" in item.lower():
                parts = item.split(":", 1)
                if len(parts) > 1:
                    self.user_profile["learning_style"] = parts[1].strip()
            elif "topic" in item.lower():
                parts = item.split(":", 1)
                if len(parts) > 1:
                    self.user_profile["interests"] = self.user_profile.get("interests", []) + [parts[1].strip()]

    def _handle_resource_request(self, request: Dict[str, Any], orchestrator=None) -> Dict[str, Any]:
        """
        Handle a request for learning resources.

        Args:
            request: Dictionary with resource request parameters
            orchestrator: Optional custom model orchestrator

        Returns:
            Response with resources or error
        """
        try:
            # Extract request parameters
            topic = request.get("topic")
            learning_style = request.get("learning_style", "visual")
            expertise_level = request.get("expertise_level", "beginner")
            count = int(request.get("count", 5))

            # Validate required parameters
            if not topic:
                return {
                    "success": False,
                    "message": "Topic is required",
                    "data": None
                }

            # Use the provided model orchestrator or fall back to the default.
            # Fixed: this previously read the undefined name `model_orchestrator`
            # (the parameter is `orchestrator`), raising NameError on every call
            # that was then swallowed by the except below.
            current_orchestrator = orchestrator or self.model_orchestrator

            # Generate recommendations using the model orchestrator
            resources = current_orchestrator.generate_resource_recommendations(
                topic=topic,
                learning_style=learning_style,
                expertise_level=expertise_level,
                count=count
            )

            # Log the interaction
            self._log_interaction("get_resources", request, {"resource_count": len(resources)})

            return {
                "success": True,
                "message": f"Successfully found {len(resources)} resources for {topic}",
                "data": {
                    "topic": topic,
                    "resources": resources
                }
            }

        except Exception as e:
            return {
                "success": False,
                "message": f"Error finding resources: {str(e)}",
                "data": None
            }

    def _log_interaction(
        self,
        interaction_type: str,
        request: Dict[str, Any],
        result: Dict[str, Any]
    ) -> None:
        """
        Log an interaction with the agent.

        Args:
            interaction_type: Type of interaction
            request: The request data
            result: The result data
        """
        # Create an interaction log
        log_entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "type": interaction_type,
            "request": request,
            "result": result
        }

        # Add to session history
        self.session_history.append(log_entry)

        # Limit history size to the 100 most recent entries
        if len(self.session_history) > 100:
            self.session_history = self.session_history[-100:]

    def get_learning_styles(self) -> Dict[str, str]:
        """
        Get available learning styles.

        Returns:
            Dictionary of learning styles and descriptions
        """
        return LEARNING_STYLES

    def get_expertise_levels(self) -> Dict[str, str]:
        """
        Get available expertise levels.

        Returns:
            Dictionary of expertise levels and descriptions
        """
        return EXPERTISE_LEVELS

    def get_time_commitments(self) -> Dict[str, str]:
        """
        Get available time commitment options.

        Returns:
            Dictionary of time commitment options and descriptions
        """
        return TIME_COMMITMENTS
src/agents/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Autonomous Learning Agents module
3
+ Contains specialized agents for different learning tasks
4
+ """
5
+ from .base_agent import BaseAgent
6
+ from .research_agent import ResearchAgent
7
+ from .teaching_agent import TeachingAgent
8
+
9
+ __all__ = ['BaseAgent', 'ResearchAgent', 'TeachingAgent']
src/agents/base_agent.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base class for all autonomous learning agents
3
+ """
4
+ from typing import List, Dict, Any, Optional
5
+ import abc
6
+ from datetime import datetime
7
+ import json
8
+
9
+ from src.utils.config import OPENAI_API_KEY
10
+ from src.ml.model_orchestrator import ModelOrchestrator
11
+ from src.data.vector_store import VectorStore
12
+
13
+ class BaseAgent(abc.ABC):
14
+ """
15
+ Base class for all autonomous learning agents
16
+ Provides common functionality for all agents
17
+ """
18
+ def __init__(self, api_key: Optional[str] = None):
19
+ """
20
+ Initialize the base agent
21
+
22
+ Args:
23
+ api_key: Optional API key for language models
24
+ """
25
+ try:
26
+ self.api_key = api_key or OPENAI_API_KEY
27
+ if not self.api_key:
28
+ print("Warning: No API key provided. Some features may not work correctly.")
29
+
30
+ # Initialize model orchestrator
31
+ self.model_orchestrator = ModelOrchestrator(api_key=self.api_key)
32
+
33
+ # Initialize vector store with error handling
34
+ self.vector_store = VectorStore(api_key=self.api_key)
35
+ try:
36
+ # Try to load documents from the default directory
37
+ docs_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'vector_db', 'documents')
38
+ self.vector_store.load_documents(docs_dir)
39
+ except Exception as e:
40
+ print(f"Warning: Could not load documents: {str(e)}")
41
+ # Fall back to minimal vector store
42
+ self.vector_store._create_minimal_vector_store()
43
+
44
+ self.memory = []
45
+ self.goals = []
46
+ self.current_task = None
47
+ self.last_action = None
48
+
49
+ except Exception as e:
50
+ print(f"Error initializing agent: {str(e)}")
51
+ # Try to continue with minimal functionality
52
+ self.api_key = api_key or OPENAI_API_KEY
53
+ self.memory = []
54
+ self.goals = []
55
+ self.current_task = None
56
+ self.last_action = None
57
+
58
+ # Try to create a minimal vector store
59
+ try:
60
+ self.vector_store = VectorStore(api_key=self.api_key)
61
+ self.vector_store._create_minimal_vector_store()
62
+ except:
63
+ print("Warning: Could not initialize vector store. Some features may not work.")
64
+ self.vector_store = None
65
+
66
    @abc.abstractmethod
    def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute a specific task. Abstract: each concrete agent must implement
        its own task execution strategy.

        Args:
            task: Task description and parameters

        Returns:
            Task execution results
        """
        pass
78
+
79
+ def add_to_memory(self, content: str) -> None:
80
+ """
81
+ Add content to agent's memory
82
+
83
+ Args:
84
+ content: Content to remember
85
+ """
86
+ timestamp = datetime.now().isoformat()
87
+ memory_item = {
88
+ "timestamp": timestamp,
89
+ "content": content
90
+ }
91
+ self.memory.append(memory_item)
92
+
93
+ # Keep memory size manageable
94
+ if len(self.memory) > 100:
95
+ self.memory = self.memory[-100:]
96
+
97
+ def get_relevant_memory(self, query: str) -> List[Dict[str, Any]]:
98
+ """
99
+ Get relevant memories based on a query
100
+
101
+ Args:
102
+ query: Query to find relevant memories
103
+
104
+ Returns:
105
+ List of relevant memory items
106
+ """
107
+ if not self.memory:
108
+ return []
109
+
110
+ try:
111
+ # Convert memory to text format
112
+ memory_texts = [f"{item['timestamp']}: {item['content']}" for item in self.memory]
113
+
114
+ # If vector store is not available, do a simple text search
115
+ if not hasattr(self, 'vector_store') or self.vector_store is None:
116
+ # Simple text-based search as fallback
117
+ query = query.lower()
118
+ return [
119
+ item for item in self.memory
120
+ if query in item['content'].lower()
121
+ ][:5] # Limit to top 5 matches
122
+
123
+ # Use vector store to find most relevant memories
124
+ relevant_memories = self.vector_store.search(query, documents=memory_texts)
125
+
126
+ # Convert back to memory format
127
+ relevant_items = []
128
+ for memory in self.memory:
129
+ memory_text = f"{memory['timestamp']}: {memory['content']}"
130
+ if any(memory_text in item for item in relevant_memories):
131
+ relevant_items.append(memory)
132
+
133
+ return relevant_items
134
+
135
+ except Exception as e:
136
+ print(f"Error in get_relevant_memory: {str(e)}")
137
+ # Fallback to simple text search
138
+ query = query.lower()
139
+ return [
140
+ item for item in self.memory
141
+ if query in item['content'].lower()
142
+ ][:5] # Limit to top 5 matches
143
+
144
+ def plan_next_action(self, current_state: Dict[str, Any]) -> Dict[str, Any]:
145
+ """
146
+ Plan the next action based on current state
147
+
148
+ Args:
149
+ current_state: Current state information
150
+
151
+ Returns:
152
+ Planned action
153
+ """
154
+ # Get relevant memories
155
+ relevant_memories = self.get_relevant_memory("next action plan")
156
+
157
+ # Create planning prompt
158
+ memory_summary = "\n".join(item["content"] for item in relevant_memories)
159
+ prompt = f"""
160
+ You are a specialized learning agent. Plan your next action based on:
161
+
162
+ Current State:
163
+ {json.dumps(current_state, indent=2)}
164
+
165
+ Relevant Past Actions:
166
+ {memory_summary}
167
+
168
+ Goals:
169
+ {json.dumps(self.goals, indent=2)}
170
+
171
+ Propose a specific, actionable next step.
172
+ Format your response as JSON with these fields:
173
+ - action: string (what to do)
174
+ - parameters: object (any parameters needed)
175
+ - reason: string (why this action)
176
+ """
177
+
178
+ # Generate plan
179
+ plan = json.loads(self.model_orchestrator.generate_structured_response(
180
+ prompt=prompt,
181
+ output_schema="""
182
+ {
183
+ "action": "string",
184
+ "parameters": "object",
185
+ "reason": "string"
186
+ }
187
+ """
188
+ ))
189
+
190
+ # Store the plan
191
+ self.last_action = plan
192
+ self.add_to_memory(f"Planned action: {json.dumps(plan)}")
193
+
194
+ return plan
195
+
196
+ def self_improve(self) -> None:
197
+ """
198
+ Analyze past performance and improve agent's capabilities
199
+ """
200
+ # Analyze recent actions
201
+ recent_actions = self.memory[-10:]
202
+
203
+ # Get feedback on performance
204
+ prompt = f"""
205
+ Analyze these recent actions and suggest improvements:
206
+ {json.dumps(recent_actions, indent=2)}
207
+
208
+ Suggest specific improvements for:
209
+ 1. Task execution efficiency
210
+ 2. Memory management
211
+ 3. Goal achievement
212
+ 4. Resource utilization
213
+
214
+ Format your response as JSON with specific suggestions.
215
+ """
216
+
217
+ # Get improvement suggestions
218
+ improvements = json.loads(self.model_orchestrator.generate_structured_response(
219
+ prompt=prompt,
220
+ output_schema="""
221
+ {
222
+ "improvements": [
223
+ {
224
+ "area": "string",
225
+ "suggestion": "string",
226
+ "implementation": "string"
227
+ }
228
+ ]
229
+ }
230
+ """
231
+ ))
232
+
233
+ # Store improvements for future reference
234
+ self.add_to_memory(f"Self-improvement suggestions: {json.dumps(improvements)}")
src/agents/research_agent.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Research Agent for autonomous learning
3
+ Handles research tasks and knowledge acquisition
4
+ """
5
+ from typing import List, Dict, Any, Optional
6
+ from datetime import datetime
7
+ import json
8
+
9
+ from .base_agent import BaseAgent
10
+
11
class ResearchAgent(BaseAgent):
    """Specialized agent for conducting research and acquiring knowledge."""

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the research agent.

        Args:
            api_key: Optional API key forwarded to the base agent.
        """
        super().__init__(api_key)
        # Topics being tracked, a log of completed runs, and the active focus.
        self.research_topics = []
        self.research_history = []
        self.current_research_focus = None
20
+
21
+ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
22
+ """
23
+ Execute a research task
24
+
25
+ Args:
26
+ task: Task description and parameters
27
+
28
+ Returns:
29
+ Research results
30
+ """
31
+ task_type = task.get("type", "research")
32
+
33
+ if task_type == "research":
34
+ return self.conduct_research(task)
35
+ elif task_type == "update_knowledge":
36
+ return self.update_knowledge(task)
37
+ elif task_type == "analyze_trends":
38
+ return self.analyze_trends(task)
39
+ else:
40
+ return {
41
+ "success": False,
42
+ "message": f"Unknown task type: {task_type}"
43
+ }
44
+
45
+ def conduct_research(self, task: Dict[str, Any]) -> Dict[str, Any]:
46
+ """
47
+ Conduct research on a specific topic
48
+
49
+ Args:
50
+ task: Research task parameters
51
+
52
+ Returns:
53
+ Research findings
54
+ """
55
+ topic = task.get("topic")
56
+ depth = task.get("depth", "medium")
57
+ context = task.get("context", [])
58
+
59
+ if not topic:
60
+ return {
61
+ "success": False,
62
+ "message": "Topic is required for research"
63
+ }
64
+
65
+ # Create a more intelligent research prompt that can handle any type of query
66
+ context_str = '\n'.join(context) if context else ''
67
+
68
+ # Enhanced prompt with better instruction and flexibility for ANY topic
69
+ prompt = f"""
70
+ I want you to act as an expert AI educational assistant with comprehensive knowledge in all fields of study, technologies, skills, and courses. The user has requested information about: "{topic}"
71
+
72
+ This could be a request for:
73
+ 1. A learning path on this topic or skill
74
+ 2. Specific research or information on this subject
75
+ 3. How to accomplish a task or learn a technique
76
+ 4. Explanations or definitions of concepts
77
+ 5. Course recommendations for a particular field
78
+ 6. Career advice related to skills or technologies
79
+ 7. Comparisons between different technologies, methods, or approaches
80
+
81
+ YOUR GOAL: Provide the most helpful, accurate, and comprehensive information possible about ANY educational topic the user asks about. You should be able to address questions about programming languages, data science, machine learning, web development, mobile development, cloud computing, cybersecurity, design, business, humanities, sciences, mathematics, or any other educational subject.
82
+
83
+ If it seems like they want a learning path:
84
+ - Provide a step-by-step progression from basics to advanced
85
+ - Include estimated time commitments for each stage
86
+ - Recommend specific resources (books, courses, tutorials) for each step
87
+
88
+ If it seems like they want specific information:
89
+ - Provide detailed, technically accurate information
90
+ - Include practical applications and examples
91
+ - Balance theoretical knowledge with practical insights
92
+
93
+ Your response should be thorough, accurate, helpful for any level of expertise, and include both theoretical understanding and practical application.
94
+
95
+ Additional context:
96
+ {context_str}
97
+
98
+ Provide your findings in this JSON format:
99
+ {{
100
+ "summary": "A clear 2-3 paragraph summary answering the query directly, with specific details and actionable insights",
101
+ "key_concepts": ["List of 4-6 key concepts relevant to the query, with brief explanations"],
102
+ "learning_path": ["Detailed steps for learning this topic in a logical order, from beginner to advanced"],
103
+ "resources": ["Specific recommended resources including books, courses, tutorials, documentation, and communities"],
104
+ "code_examples": ["Relevant code examples or practical exercises that demonstrate key concepts"],
105
+ "advanced_topics": ["More advanced topics to explore after mastering basics, with brief explanations of why they matter"],
106
+ "career_applications": ["How these skills apply to real-world jobs and career paths"],
107
+ "curiosity_trails": ["A list of 3-5 intriguing follow-up questions or related sub-topics to explore further, designed to spark curiosity and deeper learning."]
108
+ }}
109
+
110
+ For the "curiosity_trails", think about what someone who has just learned the main topic might wonder next, or what fascinating related areas they could branch into.
111
+
112
+ Be extremely thorough, accurate, and helpful. Don't just provide general advice - give specific, actionable information that would genuinely help someone learn this topic or skill.
113
+ """
114
+
115
+ # Generate research findings with error handling
116
+ findings_json = self.model_orchestrator.generate_structured_response(
117
+ prompt=prompt,
118
+ output_schema="""
119
+ {
120
+ "summary": "string",
121
+ "key_concepts": ["string"],
122
+ "learning_path": ["string"],
123
+ "resources": ["string"],
124
+ "code_examples": ["string"],
125
+ "advanced_topics": ["string"],
126
+ "career_applications": ["string"],
127
+ "curiosity_trails": ["string"]
128
+ }
129
+ """
130
+ )
131
+ if not findings_json:
132
+ return {
133
+ "success": False,
134
+ "message": "AI provider did not return a valid response. Please try again later."
135
+ }
136
+ try:
137
+ findings = json.loads(findings_json)
138
+ except Exception as e:
139
+ return {
140
+ "success": False,
141
+ "message": f"Failed to parse AI response: {str(e)}",
142
+ "raw_response": findings_json
143
+ }
144
+
145
+ # Store findings
146
+ self.add_to_memory(f"Research findings on {topic}: {json.dumps(findings)}")
147
+ self.research_history.append({
148
+ "topic": topic,
149
+ "timestamp": datetime.now().isoformat(),
150
+ "depth": depth,
151
+ "findings": findings
152
+ })
153
+
154
+ return {
155
+ "success": True,
156
+ "findings": findings,
157
+ "message": f"Successfully completed research on {topic}"
158
+ }
159
+
160
+ def update_knowledge(self, task: Dict[str, Any]) -> Dict[str, Any]:
161
+ """
162
+ Update knowledge based on new information
163
+
164
+ Args:
165
+ task: Update task parameters
166
+
167
+ Returns:
168
+ Update results
169
+ """
170
+ new_info = task.get("new_information")
171
+ related_topics = task.get("related_topics", [])
172
+
173
+ if not new_info:
174
+ return {
175
+ "success": False,
176
+ "message": "New information is required for knowledge update"
177
+ }
178
+
179
+ # Analyze new information
180
+ prompt = f"""
181
+ Analyze this new information and update existing knowledge:
182
+ {new_info}
183
+ """
184
+
185
+ # Include related topics
186
+ related_topics = self._find_related_topics(new_info)
187
+ if related_topics:
188
+ related_topics_str = '\n'.join(related_topics)
189
+ prompt += f"\n\nRelated topics to consider:\n{related_topics_str}"
190
+
191
+ prompt += f"""
192
+ Identify:
193
+ 1. What new knowledge should be added
194
+ 2. What existing knowledge should be updated
195
+ 3. What knowledge should be deprecated
196
+ """
197
+
198
+ analysis = json.loads(self.model_orchestrator.generate_structured_response(
199
+ prompt=prompt,
200
+ output_schema="""
201
+ {
202
+ "new_knowledge": ["string"],
203
+ "updated_knowledge": ["string"],
204
+ "deprecated_knowledge": ["string"]
205
+ }
206
+ """
207
+ ))
208
+
209
+ # Update knowledge base
210
+ self.add_to_memory(f"Knowledge update: {json.dumps(analysis)}")
211
+
212
+ return {
213
+ "success": True,
214
+ "analysis": analysis,
215
+ "message": "Knowledge base updated successfully"
216
+ }
217
+
218
+ def analyze_trends(self, task: Dict[str, Any]) -> Dict[str, Any]:
219
+ """
220
+ Analyze trends in a specific area
221
+
222
+ Args:
223
+ task: Trend analysis parameters
224
+
225
+ Returns:
226
+ Trend analysis results
227
+ """
228
+ area = task.get("area")
229
+ timeframe = task.get("timeframe", "recent")
230
+ context = task.get("context", [])
231
+
232
+ if not area:
233
+ return {
234
+ "success": False,
235
+ "message": "Area is required for trend analysis"
236
+ }
237
+
238
+ # Create analysis prompt
239
+ prompt = f"""
240
+ Analyze current trends in: {area}
241
+
242
+ Timeframe: {timeframe}
243
+ """
244
+
245
+ # Add context if available
246
+ if context:
247
+ context_str = '\n'.join(context)
248
+ prompt += f"\n\nContext:\n{context_str}"
249
+
250
+ prompt += f"""
251
+ Provide analysis in JSON format with:
252
+ - Current trends
253
+ - Emerging patterns
254
+ - Predicted developments
255
+ - Impact assessment
256
+ """
257
+
258
+ # Generate trend analysis
259
+ analysis = json.loads(self.model_orchestrator.generate_structured_response(
260
+ prompt=prompt,
261
+ output_schema="""
262
+ {
263
+ "current_trends": ["string"],
264
+ "emerging_patterns": ["string"],
265
+ "predicted_developments": ["string"],
266
+ "impact": ["string"]
267
+ }
268
+ """
269
+ ))
270
+
271
+ # Store analysis
272
+ self.add_to_memory(f"Trend analysis for {area}: {json.dumps(analysis)}")
273
+
274
+ return {
275
+ "success": True,
276
+ "analysis": analysis,
277
+ "message": f"Successfully analyzed trends in {area}"
278
+ }
279
+
280
+ def plan_next_research(self) -> Dict[str, Any]:
281
+ """
282
+ Plan next research task based on current knowledge
283
+
284
+ Returns:
285
+ Next research plan
286
+ """
287
+ # Get current knowledge gaps
288
+ relevant_memories = self.get_relevant_memory("knowledge gaps")
289
+
290
+ # Create planning prompt
291
+ memory_summary = "\n".join(item["content"] for item in relevant_memories)
292
+ prompt = f"""
293
+ Based on current knowledge:
294
+ {memory_summary}
295
+
296
+ Identify:
297
+ 1. Most important knowledge gaps
298
+ 2. Areas requiring deeper research
299
+ 3. Emerging topics to explore
300
+
301
+ Propose next research task with:
302
+ - Topic
303
+ - Research depth
304
+ - Related topics
305
+ """
306
+
307
+ # Generate research plan
308
+ plan = json.loads(self.model_orchestrator.generate_structured_response(
309
+ prompt=prompt,
310
+ output_schema="""
311
+ {
312
+ "topic": "string",
313
+ "depth": "string",
314
+ "related_topics": ["string"],
315
+ "reason": "string"
316
+ }
317
+ """
318
+ ))
319
+
320
+ # Store plan
321
+ self.add_to_memory(f"Next research plan: {json.dumps(plan)}")
322
+
323
+ return plan
src/agents/teaching_agent.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Teaching Agent for autonomous learning
3
+ Handles teaching and learning path creation
4
+ """
5
+ from typing import List, Dict, Any, Optional
6
+ from datetime import datetime
7
+ import json
8
+
9
+ from .base_agent import BaseAgent
10
+ from .research_agent import ResearchAgent
11
+
12
class TeachingAgent(BaseAgent):
    """Specialized agent for teaching and learning path creation."""

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the teaching agent.

        Args:
            api_key: Optional API key forwarded to the base agent.
        """
        super().__init__(api_key)
        self.learning_paths = []          # created paths with metadata
        self.teaching_style = "adaptive"  # default pedagogy mode
        self.current_lesson = None        # lesson currently in progress
21
+
22
+ def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
23
+ """
24
+ Execute a teaching task
25
+
26
+ Args:
27
+ task: Task description and parameters
28
+
29
+ Returns:
30
+ Teaching results
31
+ """
32
+ task_type = task.get("type", "create_path")
33
+
34
+ if task_type == "create_path":
35
+ return self.create_learning_path(task)
36
+ elif task_type == "adapt_path":
37
+ return self.adapt_learning_path(task)
38
+ elif task_type == "generate_lesson":
39
+ return self.generate_lesson(task)
40
+ else:
41
+ return {
42
+ "success": False,
43
+ "message": f"Unknown task type: {task_type}"
44
+ }
45
+
46
+ def create_learning_path(self, task: Dict[str, Any]) -> Dict[str, Any]:
47
+ """
48
+ Create a personalized learning path
49
+
50
+ Args:
51
+ task: Learning path creation parameters
52
+
53
+ Returns:
54
+ Created learning path
55
+ """
56
+ topic = task.get("topic")
57
+ expertise_level = task.get("expertise_level", "beginner")
58
+ learning_style = task.get("learning_style", "visual")
59
+ time_commitment = task.get("time_commitment", "moderate")
60
+
61
+ if not topic:
62
+ return {
63
+ "success": False,
64
+ "message": "Topic is required for learning path creation"
65
+ }
66
+
67
+ # Get relevant research
68
+ research_result = {
69
+ "success": True,
70
+ "findings": ["Sample research finding 1", "Sample research finding 2"]
71
+ }
72
+
73
+ # Temporarily disabled actual research to fix circular import
74
+ # research_agent = ResearchAgent(self.api_key)
75
+ # research_result = research_agent.conduct_research({
76
+ # "topic": topic,
77
+ # "depth": "deep"
78
+ # })
79
+ #
80
+ # if not research_result["success"]:
81
+ # return research_result
82
+
83
+ # Create teaching prompt
84
+ prompt = f"""
85
+ Create a personalized learning path for: {topic}
86
+
87
+ User preferences:
88
+ - Expertise level: {expertise_level}
89
+ - Learning style: {learning_style}
90
+ - Time commitment: {time_commitment}
91
+
92
+ Research findings:
93
+ {json.dumps(research_result["findings"])}
94
+
95
+ Create a structured learning path with:
96
+ 1. Learning objectives
97
+ 2. Milestones
98
+ 3. Resources
99
+ 4. Assessment points
100
+ 5. Adaptation points
101
+ """
102
+
103
+ # Generate learning path
104
+ path = json.loads(self.model_orchestrator.generate_structured_response(
105
+ prompt=prompt,
106
+ output_schema="""
107
+ {
108
+ "title": "string",
109
+ "description": "string",
110
+ "objectives": ["string"],
111
+ "milestones": [
112
+ {
113
+ "title": "string",
114
+ "description": "string",
115
+ "resources": ["string"],
116
+ "assessment": "string",
117
+ "adaptation_points": ["string"]
118
+ }
119
+ ],
120
+ "total_duration": "string",
121
+ "prerequisites": ["string"]
122
+ }
123
+ """
124
+ ))
125
+
126
+ # Store learning path
127
+ self.learning_paths.append({
128
+ "path": path,
129
+ "created_at": datetime.now().isoformat(),
130
+ "topic": topic,
131
+ "expertise_level": expertise_level
132
+ })
133
+
134
+ # Add to memory
135
+ self.add_to_memory(f"Created learning path for {topic}: {json.dumps(path)}")
136
+
137
+ return {
138
+ "success": True,
139
+ "learning_path": path,
140
+ "message": f"Successfully created learning path for {topic}"
141
+ }
142
+
143
+ def adapt_learning_path(self, task: Dict[str, Any]) -> Dict[str, Any]:
144
+ """
145
+ Adapt an existing learning path based on user progress
146
+
147
+ Args:
148
+ task: Adaptation parameters
149
+
150
+ Returns:
151
+ Adapted learning path
152
+ """
153
+ path_id = task.get("path_id")
154
+ user_progress = task.get("user_progress")
155
+ feedback = task.get("feedback", [])
156
+
157
+ if not path_id or not user_progress:
158
+ return {
159
+ "success": False,
160
+ "message": "Path ID and user progress are required for adaptation"
161
+ }
162
+
163
+ # Find the learning path
164
+ path = None
165
+ for p in self.learning_paths:
166
+ if p.get("id") == path_id:
167
+ path = p["path"]
168
+ break
169
+
170
+ if not path:
171
+ return {
172
+ "success": False,
173
+ "message": f"Learning path with ID {path_id} not found"
174
+ }
175
+
176
+ # Prepare feedback string
177
+ feedback_str = '\n'.join(feedback) if feedback else 'No feedback provided'
178
+
179
+ # Create adaptation prompt
180
+ prompt = f"""
181
+ Adapt this learning path based on user progress and feedback:
182
+ {json.dumps(path)}
183
+
184
+ User progress:
185
+ {json.dumps(user_progress)}
186
+
187
+ Feedback:
188
+ {feedback_str}
189
+
190
+ Suggest specific adaptations for:
191
+ 1. Content difficulty
192
+ 2. Resource types
193
+ 3. Assessment methods
194
+ 4. Learning pace
195
+ """
196
+
197
+ # Generate adaptations
198
+ adaptations = json.loads(self.model_orchestrator.generate_structured_response(
199
+ prompt=prompt,
200
+ output_schema="""
201
+ {
202
+ "content_changes": ["string"],
203
+ "resource_changes": ["string"],
204
+ "assessment_changes": ["string"],
205
+ "pace_changes": ["string"]
206
+ }
207
+ """
208
+ ))
209
+
210
+ # Apply adaptations
211
+ for change in adaptations["content_changes"]:
212
+ self._apply_change(path, change)
213
+
214
+ # Store adaptation
215
+ self.add_to_memory(f"Adapted learning path {path_id}: {json.dumps(adaptations)}")
216
+
217
+ return {
218
+ "success": True,
219
+ "adaptations": adaptations,
220
+ "updated_path": path,
221
+ "message": f"Successfully adapted learning path {path_id}"
222
+ }
223
+
224
+ def generate_lesson(self, task: Dict[str, Any]) -> Dict[str, Any]:
225
+ """
226
+ Generate a specific lesson for a topic
227
+
228
+ Args:
229
+ task: Lesson generation parameters
230
+
231
+ Returns:
232
+ Generated lesson
233
+ """
234
+ topic = task.get("topic")
235
+ lesson_type = task.get("type", "introductory")
236
+ duration = task.get("duration", "60 minutes")
237
+
238
+ if not topic:
239
+ return {
240
+ "success": False,
241
+ "message": "Topic is required for lesson generation"
242
+ }
243
+
244
+ # Create lesson prompt
245
+ prompt = f"""
246
+ Generate a {lesson_type} lesson on: {topic}
247
+
248
+ Duration: {duration}
249
+
250
+ Include:
251
+ 1. Key concepts
252
+ 2. Practical examples
253
+ 3. Interactive elements
254
+ 4. Assessment questions
255
+ 5. Additional resources
256
+
257
+ Format as JSON with clear structure
258
+ """
259
+
260
+ # Generate lesson
261
+ lesson = json.loads(self.model_orchestrator.generate_structured_response(
262
+ prompt=prompt,
263
+ output_schema="""
264
+ {
265
+ "title": "string",
266
+ "description": "string",
267
+ "sections": [
268
+ {
269
+ "title": "string",
270
+ "content": "string",
271
+ "examples": ["string"],
272
+ "questions": ["string"]
273
+ }
274
+ ],
275
+ "interactive_elements": ["string"],
276
+ "resources": ["string"]
277
+ }
278
+ """
279
+ ))
280
+
281
+ # Add to memory
282
+ self.add_to_memory(f"Generated lesson for {topic}: {json.dumps(lesson)}")
283
+
284
+ return {
285
+ "success": True,
286
+ "lesson": lesson,
287
+ "message": f"Successfully generated lesson for {topic}"
288
+ }
289
+
290
+ def _apply_change(self, path: Dict[str, Any], change: str) -> None:
291
+ """
292
+ Apply a specific change to the learning path
293
+
294
+ Args:
295
+ path: Learning path to modify
296
+ change: Change description
297
+ """
298
+ # Parse change description
299
+ try:
300
+ change_type, details = change.split(":", 1)
301
+ details = details.strip()
302
+
303
+ if change_type == "difficulty":
304
+ self._adjust_difficulty(path, details)
305
+ elif change_type == "resources":
306
+ self._update_resources(path, details)
307
+ elif change_type == "assessment":
308
+ self._modify_assessment(path, details)
309
+ elif change_type == "pace":
310
+ self._adjust_pace(path, details)
311
+ except Exception as e:
312
+ self.add_to_memory(f"Failed to apply change: {str(e)}")
313
+
314
+ def _adjust_difficulty(self, path: Dict[str, Any], details: str) -> None:
315
+ """
316
+ Adjust content difficulty
317
+
318
+ Args:
319
+ path: Learning path
320
+ details: Difficulty adjustment details
321
+ """
322
+ # Implementation of difficulty adjustment
323
+ pass
324
+
325
+ def _update_resources(self, path: Dict[str, Any], details: str) -> None:
326
+ """
327
+ Update learning resources
328
+
329
+ Args:
330
+ path: Learning path
331
+ details: Resource update details
332
+ """
333
+ # Implementation of resource updates
334
+ pass
335
+
336
+ def _modify_assessment(self, path: Dict[str, Any], details: str) -> None:
337
+ """
338
+ Modify assessment methods
339
+
340
+ Args:
341
+ path: Learning path
342
+ details: Assessment modification details
343
+ """
344
+ # Implementation of assessment modifications
345
+ pass
346
+
347
+ def _adjust_pace(self, path: Dict[str, Any], details: str) -> None:
348
+ """
349
+ Adjust learning pace
350
+
351
+ Args:
352
+ path: Learning path
353
+ details: Pace adjustment details
354
+ """
355
+ # Implementation of pace adjustments
356
+ pass
src/data/bm25_retriever.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BM25 Retriever for keyword-based document search.
3
+
4
+ BM25 (Best Matching 25) is a probabilistic ranking function used for keyword-based
5
+ document retrieval. It's particularly effective for exact keyword matches that
6
+ semantic search might miss.
7
+ """
8
+ from typing import List, Dict, Any, Optional
9
+ import numpy as np
10
+ from rank_bm25 import BM25Okapi
11
+ from langchain.schema import Document
12
+
13
+
14
+ class BM25Retriever:
15
+ """
16
+ BM25-based keyword retriever for hybrid search.
17
+
18
+ BM25 uses term frequency (TF) and inverse document frequency (IDF) to rank
19
+ documents based on keyword relevance.
20
+ """
21
+
22
+ def __init__(self, k1: float = 1.5, b: float = 0.75):
23
+ """
24
+ Initialize BM25 retriever.
25
+
26
+ Args:
27
+ k1: Term frequency saturation parameter (default: 1.5)
28
+ Higher values give more weight to term frequency
29
+ b: Length normalization parameter (default: 0.75)
30
+ 0 = no normalization, 1 = full normalization
31
+ """
32
+ self.k1 = k1
33
+ self.b = b
34
+ self.bm25 = None
35
+ self.documents = []
36
+ self.tokenized_corpus = []
37
+
38
+ def index_documents(self, documents: List[Document]) -> None:
39
+ """
40
+ Index documents for BM25 search.
41
+
42
+ Args:
43
+ documents: List of Document objects to index
44
+ """
45
+ if not documents:
46
+ return
47
+
48
+ self.documents = documents
49
+
50
+ # Tokenize documents (simple whitespace tokenization)
51
+ self.tokenized_corpus = [
52
+ doc.page_content.lower().split()
53
+ for doc in documents
54
+ ]
55
+
56
+ # Create BM25 index
57
+ self.bm25 = BM25Okapi(self.tokenized_corpus, k1=self.k1, b=self.b)
58
+
59
+ print(f"✅ BM25 index created with {len(documents)} documents")
60
+
61
+ def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
62
+ """
63
+ Search documents using BM25 keyword matching.
64
+
65
+ Args:
66
+ query: Search query
67
+ top_k: Number of top results to return
68
+
69
+ Returns:
70
+ List of dictionaries with 'document' and 'score' keys
71
+ """
72
+ if not self.bm25 or not self.documents:
73
+ return []
74
+
75
+ # Tokenize query
76
+ tokenized_query = query.lower().split()
77
+
78
+ # Get BM25 scores
79
+ scores = self.bm25.get_scores(tokenized_query)
80
+
81
+ # Get top-k indices
82
+ top_indices = np.argsort(scores)[::-1][:top_k]
83
+
84
+ # Build results
85
+ results = []
86
+ for idx in top_indices:
87
+ if scores[idx] > 0: # Only include documents with non-zero scores
88
+ results.append({
89
+ 'document': self.documents[idx],
90
+ 'score': float(scores[idx]),
91
+ 'rank': len(results) + 1
92
+ })
93
+
94
+ return results
95
+
96
+ def get_stats(self) -> Dict[str, Any]:
97
+ """
98
+ Get statistics about the indexed corpus.
99
+
100
+ Returns:
101
+ Dictionary with corpus statistics
102
+ """
103
+ if not self.bm25:
104
+ return {"indexed": False}
105
+
106
+ return {
107
+ "indexed": True,
108
+ "document_count": len(self.documents),
109
+ "avg_doc_length": np.mean([len(doc) for doc in self.tokenized_corpus]),
110
+ "k1": self.k1,
111
+ "b": self.b
112
+ }
113
+
114
+
115
def reciprocal_rank_fusion(
    results_list: List[List[Dict[str, Any]]],
    k: int = 60
) -> List[Dict[str, Any]]:
    """
    Fuse several ranked result lists with Reciprocal Rank Fusion (RRF).

    RRF_score(d) = Σ over lists of 1 / (k + rank(d)). Documents that
    appear in multiple lists, and/or near the top of a list, score higher.

    Args:
        results_list: Result lists from different retrievers; each item is
            a dict carrying a 'document' and a 'rank' (or 'score') key.
        k: Smoothing constant preventing division by zero (default: 60).

    Returns:
        Fused results sorted by descending RRF score, re-ranked from 1.
    """
    scores: Dict[int, float] = {}
    by_key: Dict[int, Any] = {}

    for ranked in results_list:
        for hit in ranked:
            doc = hit['document']
            # Fall back to 'score' (then 1) when no explicit rank is given.
            position = hit.get('rank', hit.get('score', 1))

            # Deduplicate by content: identical text counts as one document.
            key = hash(doc.page_content)
            scores[key] = scores.get(key, 0.0) + 1.0 / (k + position)
            by_key.setdefault(key, doc)

    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

    return [
        {'document': by_key[key], 'score': score, 'rank': i + 1}
        for i, (key, score) in enumerate(ordered)
    ]
src/data/document_store.py ADDED
@@ -0,0 +1,973 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector database interface for the AI Learning Path Generator.
3
+ Handles document storage, retrieval, and semantic search.
4
+
5
+ Optimizations:
6
+ - Singleton pattern for connection pooling
7
+ - Batch operations for efficiency
8
+ - Query optimization and caching
9
+ - Relevance score filtering (>0.7)
10
+ - Performance logging
11
+ """
12
+ import os
13
+ import time
14
+ import hashlib
15
+ import sqlite3
16
+ import json
17
+ from typing import List, Dict, Any, Optional
18
+ from pathlib import Path
19
+ import threading
20
+
21
+ import chromadb
22
+ from chromadb.config import Settings
23
+ from chromadb.utils import embedding_functions
24
+ from langchain.schema import Document
25
+
26
+ from src.utils.config import (
27
+ VECTOR_DB_PATH,
28
+ OPENAI_API_KEY,
29
+ EMBEDDING_MODEL,
30
+ # Advanced RAG config
31
+ ENABLE_SEMANTIC_CACHE,
32
+ QUERY_REWRITE_ENABLED,
33
+ RERANK_ENABLED,
34
+ CONTEXTUAL_COMPRESSION_ENABLED,
35
+ USE_LOCAL_RERANKER,
36
+ COHERE_API_KEY,
37
+ COHERE_RERANK_MODEL,
38
+ LOCAL_RERANKER_MODEL,
39
+ QUERY_REWRITE_MODEL,
40
+ QUERY_REWRITE_MAX_TOKENS,
41
+ COMPRESSION_MODEL,
42
+ COMPRESSION_MAX_TOKENS,
43
+ RERANK_TOP_K,
44
+ HYBRID_TOP_K,
45
+ BM25_K1,
46
+ BM25_B,
47
+ REDIS_URL,
48
+ REDIS_HOST,
49
+ REDIS_PORT,
50
+ REDIS_PASSWORD,
51
+ REDIS_DB,
52
+ SEMANTIC_CACHE_TTL,
53
+ SEMANTIC_CACHE_THRESHOLD
54
+ )
55
+ from src.utils.cache import cache
56
+
57
+
58
# Singleton instance and lock for thread-safe initialization
_instance = None  # the sole DocumentStore instance, created lazily by __new__
_lock = threading.Lock()  # guards first-time creation in DocumentStore.__new__
61
+
62
+
63
class DocumentStore:
    """
    Enhanced document retrieval using ChromaDB vector database with connection pooling.

    Features:
    - Singleton pattern for connection reuse
    - Batch operations for efficiency
    - Query optimization and caching
    - Relevance score filtering (>0.7)
    - Performance logging
    """

    # Class-level client for connection pooling: shared by every instance
    # (there is only one, via the singleton) and reused across requests.
    _shared_client = None
    _shared_embedding_function = None

    def __new__(cls, db_path: Optional[str] = None):
        """Singleton pattern: ensure only one instance exists.

        Uses double-checked locking on the module-level ``_instance``:
        the unlocked check is a fast path, the locked re-check prevents
        two threads from both creating an instance.

        Args:
            db_path: Accepted for signature compatibility with __init__;
                not used here (a second call with a different path still
                returns the first instance).
        """
        global _instance
        if _instance is None:
            with _lock:
                if _instance is None:
                    _instance = super(DocumentStore, cls).__new__(cls)
                    # __init__ checks this flag so re-initialization of the
                    # shared instance is a no-op.
                    _instance._initialized = False
        return _instance
88
+
89
    def __init__(self, db_path: Optional[str] = None):
        """
        Initialize the document store with connection pooling.

        Runs fully only once per process: __new__ sets ``_initialized`` to
        False on the single instance and this method flips it to True, so
        every later ``DocumentStore()`` call returns immediately.

        Args:
            db_path: Optional path to the vector database
                (falls back to VECTOR_DB_PATH).

        Raises:
            Exception: re-raises any failure from ChromaDB client or
                embedding-function initialization.
        """
        # Skip if already initialized (singleton pattern)
        if self._initialized:
            return

        print(f"--- DocumentStore.__init__ started (db_path: {db_path or VECTOR_DB_PATH}) ---")
        self.db_path = db_path or VECTOR_DB_PATH

        # Performance tracking (reported by get_collection_stats)
        self.search_count = 0
        self.cache_hits = 0

        # Ensure the directory exists
        os.makedirs(self.db_path, exist_ok=True)
        print(f"--- DocumentStore.__init__: Ensured directory exists: {self.db_path} ---")

        # Initialize shared client (connection pooling); created once and
        # cached on the class so restarts of __init__ reuse it.
        if DocumentStore._shared_client is None:
            print("--- DocumentStore.__init__: Initializing shared chromadb.Client ---")
            try:
                # NOTE(review): chroma_db_impl="duckdb+parquet" is the legacy
                # Chroma settings style — confirm against the pinned chromadb
                # version in requirements.
                DocumentStore._shared_client = chromadb.Client(
                    Settings(
                        chroma_db_impl="duckdb+parquet",
                        persist_directory=self.db_path,
                        anonymized_telemetry=False,
                        allow_reset=True
                    )
                )
                print("✅ Shared ChromaDB client initialized (connection pooling active)")
            except Exception as e:
                print(f"⚠️ Failed to initialize ChromaDB client: {e}")
                raise

        self.client = DocumentStore._shared_client

        # Initialize shared embedding function (reuse across requests)
        if DocumentStore._shared_embedding_function is None:
            print(f"--- DocumentStore.__init__: Initializing custom embedding function ---")
            try:
                # Use free local embedding function if OpenAI API key not available
                if OPENAI_API_KEY:
                    # Create custom embedding function compatible with OpenAI v1.x
                    from openai import OpenAI

                    class CustomOpenAIEmbedding:
                        # Minimal callable wrapper so Chroma can embed with
                        # the OpenAI v1 client interface.
                        def __init__(self, api_key, model_name="text-embedding-ada-002"):
                            self.client = OpenAI(api_key=api_key)
                            self.model_name = model_name

                        def __call__(self, texts):
                            """Generate embeddings for a list of texts."""
                            # Chroma may pass a single string; normalize to a list.
                            if isinstance(texts, str):
                                texts = [texts]

                            response = self.client.embeddings.create(
                                input=texts,
                                model=self.model_name
                            )
                            return [item.embedding for item in response.data]

                    DocumentStore._shared_embedding_function = CustomOpenAIEmbedding(
                        api_key=OPENAI_API_KEY,
                        model_name=EMBEDDING_MODEL
                    )
                    print("✅ Shared embedding function initialized (OpenAI)")
                else:
                    # Use free sentence-transformers embedding (no API key needed)
                    print("Using free local embeddings (sentence-transformers)...")
                    DocumentStore._shared_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                        model_name="all-MiniLM-L6-v2"
                    )
                    print("✅ Shared embedding function initialized (Local SentenceTransformer)")
            except Exception as e:
                print(f"⚠️ Failed to initialize embedding function: {e}")
                raise

        self.embedding_function = DocumentStore._shared_embedding_function

        # Create or get the collections used throughout this class.
        print("--- DocumentStore.__init__: Getting/creating 'learning_resources' collection ---")
        self.resources_collection = self._initialize_collection(
            name="learning_resources",
            metadata={"description": "Educational resources and materials"}
        )
        print("--- DocumentStore.__init__: 'learning_resources' collection obtained ---")

        print("--- DocumentStore.__init__: Getting/creating 'learning_paths' collection ---")
        self.paths_collection = self._initialize_collection(
            name="learning_paths",
            metadata={"description": "Generated learning paths"}
        )
        print("--- DocumentStore.__init__: 'learning_paths' collection obtained ---")

        # Mark as initialized so subsequent constructions are no-ops.
        self._initialized = True
        print("--- DocumentStore.__init__ finished ---")
191
+
192
+ def add_document(
193
+ self,
194
+ content: str,
195
+ metadata: Dict[str, Any],
196
+ collection_name: str = "learning_resources",
197
+ document_id: Optional[str] = None
198
+ ) -> str:
199
+ """
200
+ Add a document to the vector database.
201
+
202
+ Args:
203
+ content: Document content
204
+ metadata: Document metadata
205
+ collection_name: Name of the collection to add to
206
+ document_id: Optional ID for the document
207
+
208
+ Returns:
209
+ ID of the added document
210
+ """
211
+ # Generate a document ID if not provided
212
+ doc_id = document_id or f"doc_{len(content) % 10000}_{hash(content) % 1000000}"
213
+
214
+ # Get the appropriate collection
215
+ collection = self._initialize_collection(name=collection_name)
216
+
217
+ # Add the document
218
+ collection.add(
219
+ documents=[content],
220
+ metadatas=[metadata],
221
+ ids=[doc_id]
222
+ )
223
+
224
+ return doc_id
225
+
226
+ def add_documents(
227
+ self,
228
+ documents: List[Document],
229
+ collection_name: str = "learning_resources"
230
+ ) -> List[str]:
231
+ """
232
+ Add multiple documents to the vector database.
233
+
234
+ Args:
235
+ documents: List of Document objects
236
+ collection_name: Name of the collection to add to
237
+
238
+ Returns:
239
+ List of document IDs
240
+ """
241
+ if not documents:
242
+ return []
243
+
244
+ # Get the appropriate collection
245
+ collection = self._initialize_collection(name=collection_name)
246
+
247
+ # Prepare document data
248
+ contents = [doc.page_content for doc in documents]
249
+ metadatas = [doc.metadata for doc in documents]
250
+ ids = [f"doc_{i}_{hash(doc.page_content) % 1000000}" for i, doc in enumerate(documents)]
251
+
252
+ # Add documents in batches (ChromaDB has limits)
253
+ batch_size = 100
254
+ for i in range(0, len(documents), batch_size):
255
+ batch_end = min(i + batch_size, len(documents))
256
+ collection.add(
257
+ documents=contents[i:batch_end],
258
+ metadatas=metadatas[i:batch_end],
259
+ ids=ids[i:batch_end]
260
+ )
261
+
262
+ return ids
263
+
264
    def search_documents(
        self,
        query: str,
        collection_name: str = "learning_resources",
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 5,
        offset: int = 0
    ) -> List[Document]:
        """
        Search for documents using semantic similarity with pagination.

        On a query failure this attempts a one-shot SQLite schema repair
        (see _try_repair_collection_schema) and retries once; any further
        failure yields an empty list rather than raising.

        Args:
            query: Search query
            collection_name: Collection to search in
            filters: Optional metadata filters (list values become $in clauses)
            top_k: Number of results to return (default: 5)
            offset: Number of results to skip for pagination (default: 0)

        Returns:
            List of relevant Document objects, each annotated with a
            'relevance_score' in its metadata; empty on failure.
        """
        # Get the collection
        try:
            collection = self._initialize_collection(name=collection_name)
        except Exception:
            # Collection doesn't exist (or could not be created/repaired)
            return []

        # Prepare filter if provided
        where = {}
        if filters:
            for key, value in filters.items():
                if isinstance(value, list):
                    # For list values, we need to use the $in operator
                    where[key] = {"$in": value}
                else:
                    where[key] = value

        # Execute the search (get more results for pagination)
        try:
            result = collection.query(
                query_texts=[query],
                n_results=top_k + offset,  # Get enough results for pagination
                where=where if where else None
            )
        except Exception as e:
            print(f"⚠️ Search failed: {e}")
            print(f"🔧 Attempting schema repair for error: {type(e).__name__}")
            # Try to repair schema and retry once
            if self._try_repair_collection_schema(e):
                print(f"🔄 Schema repaired, retrying query...")
                try:
                    result = collection.query(
                        query_texts=[query],
                        n_results=top_k + offset,
                        where=where if where else None
                    )
                    print(f"✅ Query retry successful after schema repair")
                except Exception as retry_error:
                    print(f"⚠️ Search retry failed: {retry_error}")
                    return []
            else:
                print(f"❌ Schema repair not applicable for this error")
                return []

        # Convert results to Document objects
        documents = []
        if result and result.get("documents"):
            # Apply offset for pagination (results were over-fetched above)
            start_idx = offset
            end_idx = offset + top_k

            for i in range(start_idx, min(end_idx, len(result["documents"][0]))):
                content = result["documents"][0][i]
                metadata = result["metadatas"][0][i] if result.get("metadatas") and result["metadatas"][0] else {}
                distance = result["distances"][0][i] if result.get("distances") and result["distances"][0] else 1.0

                # Add relevance score to metadata
                # NOTE(review): this mapping assumes distances lie in [0, 2]
                # (e.g. cosine distance) — confirm the collection's metric.
                metadata["relevance_score"] = 1.0 - (distance / 2.0)  # Convert distance to relevance (0-1)

                documents.append(Document(
                    page_content=content,
                    metadata=metadata
                ))

        return documents
350
+
351
    def hybrid_search(
        self,
        query: str,
        collection_name: str = "learning_resources",
        filters: Optional[Dict[str, Any]] = None,
        top_k: int = 5,
        min_relevance: float = 0.7,
        use_cache: bool = True
    ) -> List[Document]:
        """
        Perform optimized hybrid search with caching and relevance filtering.

        Combines semantic search with a naive term-overlap keyword pass over
        the filtered collection, de-duplicates by content hash, sorts by
        'relevance_score', filters by min_relevance and caches the outcome.

        Optimizations:
        - Query truncation to 500 chars
        - Stop word removal
        - Result caching (1 hour)
        - Relevance score filtering (>0.7)
        - Performance logging

        Args:
            query: Search query
            collection_name: Collection to search in
            filters: Optional metadata filters
            top_k: Number of results to return (default: 5)
            min_relevance: Minimum relevance score (default: 0.7)
            use_cache: Whether to use cached results (default: True)

        Returns:
            List of relevant Document objects
        """
        start_time = time.time()
        self.search_count += 1

        # Optimize query: truncate to 500 chars
        optimized_query = query[:500] if len(query) > 500 else query

        # Remove common stop words to focus on meaningful keywords
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        query_words = optimized_query.lower().split()
        filtered_words = [w for w in query_words if w not in stop_words]
        optimized_query = ' '.join(filtered_words) if filtered_words else optimized_query

        # Check cache first.  cache_key is only defined when use_cache is
        # True; the final cache.set below is gated on the same flag.
        if use_cache:
            cache_key = cache.cache_key(
                "hybrid_search",
                optimized_query,
                collection_name,
                str(filters),
                top_k,
                min_relevance
            )

            cached_results = cache.get(cache_key)
            if cached_results:
                self.cache_hits += 1
                elapsed = time.time() - start_time
                print(f"💰 Cache hit! Search completed in {elapsed*1000:.1f}ms (saved API call)")
                return cached_results

        # Perform semantic search
        semantic_results = self.search_documents(
            query=optimized_query,
            collection_name=collection_name,
            filters=filters,
            top_k=top_k * 2  # Get more results for reranking
        )

        # Prepare keyword results for simple matching
        keyword_docs = []
        try:
            # Get all documents matching the filters
            collection = self._initialize_collection(name=collection_name)

            # Prepare filter for keyword search (raw filters, no $in rewrite)
            where = {}
            if filters:
                where.update(filters)

            # Get documents matching the filter
            result = collection.get(where=where if where else None)

            if result and result.get("documents"):
                # Simple keyword matching against the ORIGINAL (unoptimized) query
                query_terms = set(query.lower().split())

                for i, content in enumerate(result["documents"]):
                    # Count matching terms in content
                    content_lower = content.lower()
                    match_count = sum(1 for term in query_terms if term in content_lower)

                    if match_count > 0:
                        metadata = result["metadatas"][i] if result.get("metadatas") else {}
                        # Score based on ratio of matching terms
                        metadata["relevance_score"] = match_count / len(query_terms)

                        keyword_docs.append(Document(
                            page_content=content,
                            metadata=metadata
                        ))
        except Exception:
            # Keyword search failed, continue with semantic results only
            pass

        # Combine results, removing duplicates (keyed on content hash;
        # semantic hits win over keyword hits for the same content)
        all_docs = {}

        # Add semantic results
        for doc in semantic_results:
            doc_key = hash(doc.page_content)
            all_docs[doc_key] = doc

        # Add keyword results that don't duplicate semantic results
        for doc in keyword_docs:
            doc_key = hash(doc.page_content)
            if doc_key not in all_docs:
                all_docs[doc_key] = doc

        # Sort by relevance score
        sorted_docs = sorted(
            all_docs.values(),
            key=lambda x: x.metadata.get("relevance_score", 0),
            reverse=True
        )

        # Filter by minimum relevance score
        filtered_docs = [
            doc for doc in sorted_docs
            if doc.metadata.get("relevance_score", 0) >= min_relevance
        ]

        # Take top_k results
        results = filtered_docs[:top_k]

        # Performance logging
        elapsed = time.time() - start_time
        print(f"🔍 Search completed in {elapsed*1000:.1f}ms - Found {len(results)}/{len(sorted_docs)} results (min_relevance={min_relevance})")

        # Cache the results for 1 hour (only non-empty result sets)
        if use_cache and results:
            cache.set(cache_key, results, ttl=3600)

        return results
494
+
495
+ def delete_document(
496
+ self,
497
+ document_id: str,
498
+ collection_name: str = "learning_resources"
499
+ ) -> bool:
500
+ """
501
+ Delete a document from the vector database.
502
+
503
+ Args:
504
+ document_id: ID of the document to delete
505
+ collection_name: Collection to delete from
506
+
507
+ Returns:
508
+ Success status
509
+ """
510
+ try:
511
+ collection = self._initialize_collection(name=collection_name)
512
+
513
+ collection.delete(ids=[document_id])
514
+ return True
515
+ except Exception:
516
+ return False
517
+
518
+ def clear_collection(self, collection_name: str) -> bool:
519
+ """
520
+ Clear all documents from a collection.
521
+
522
+ Args:
523
+ collection_name: Collection to clear
524
+
525
+ Returns:
526
+ Success status
527
+ """
528
+ try:
529
+ self.client.delete_collection(collection_name)
530
+ self._initialize_collection(name=collection_name)
531
+ return True
532
+ except Exception:
533
+ return False
534
+
535
    def add_documents_batch(
        self,
        documents: List[Document],
        collection_name: str = "learning_resources",
        batch_size: int = 100
    ) -> List[str]:
        """
        Add documents in batches to avoid memory issues.

        Unlike add_documents, this logs per-batch progress and throughput,
        and returns [] (not a partial id list) if any batch fails.

        Args:
            documents: List of Document objects
            collection_name: Collection to add to
            batch_size: Number of documents per batch (default: 100)

        Returns:
            List of document IDs, or [] on empty input or failure.
        """
        if not documents:
            return []
        print(f"📦 Adding {len(documents)} documents in batches of {batch_size}")
        start_time = time.time()

        try:
            collection = self._initialize_collection(name=collection_name)

            all_ids = []

            for i in range(0, len(documents), batch_size):
                batch_end = min(i + batch_size, len(documents))
                batch = documents[i:batch_end]

                # Prepare batch data; i+j is the document's global index, so
                # ids stay unique across batches for distinct content.
                contents = [doc.page_content for doc in batch]
                metadatas = [doc.metadata for doc in batch]
                ids = [f"doc_{i+j}_{hash(doc.page_content) % 1000000}" for j, doc in enumerate(batch)]

                # Add batch
                collection.add(
                    documents=contents,
                    metadatas=metadatas,
                    ids=ids
                )

                all_ids.extend(ids)
                print(f" ✅ Batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1} added ({len(batch)} docs)")

            elapsed = time.time() - start_time
            print(f"✅ Added {len(documents)} documents in {elapsed:.2f}s ({len(documents)/elapsed:.1f} docs/sec)")

            return all_ids

        except Exception as e:
            print(f"⚠️ Batch add failed: {e}")
            return []
589
+
590
    def get_collection_stats(self, collection_name: str = "learning_resources") -> Dict[str, Any]:
        """
        Get statistics about a collection.

        Size figures are estimated from a 10-document sample; search and
        cache counters come from this instance's hybrid_search bookkeeping.

        Args:
            collection_name: Collection to get stats for

        Returns:
            Dictionary with collection statistics, or {"error": ...} on failure.
        """
        try:
            collection = self._initialize_collection(name=collection_name)

            # Get collection count
            count = collection.count()

            # Get sample documents to estimate size
            sample = collection.get(limit=10)
            avg_doc_size = 0
            if sample and sample.get("documents"):
                total_size = sum(len(doc) for doc in sample["documents"])
                avg_doc_size = total_size / len(sample["documents"])

            return {
                "collection_name": collection_name,
                "document_count": count,
                "avg_document_size_bytes": avg_doc_size,
                "estimated_total_size_kb": (count * avg_doc_size) / 1024,
                "search_count": self.search_count,
                "cache_hits": self.cache_hits,
                "cache_hit_rate": f"{(self.cache_hits / self.search_count * 100):.1f}%" if self.search_count > 0 else "0%"
            }
        except Exception as e:
            print(f"⚠️ Failed to get collection stats: {e}")
            return {"error": str(e)}
625
+
626
+ def cleanup_old_embeddings(
627
+ self,
628
+ collection_name: str = "learning_resources",
629
+ days_old: int = 30
630
+ ) -> int:
631
+ """
632
+ Clean up old or unused embeddings to save space.
633
+
634
+ Args:
635
+ collection_name: Collection to clean up
636
+ days_old: Delete documents older than this many days
637
+
638
+ Returns:
639
+ Number of documents deleted
640
+ """
641
+ try:
642
+ collection = self._initialize_collection(name=collection_name)
643
+
644
+ # Get all documents
645
+ result = collection.get()
646
+
647
+ if not result or not result.get("metadatas"):
648
+ return 0
649
+
650
+ # Find old documents
651
+ import datetime
652
+ cutoff_time = time.time() - (days_old * 24 * 60 * 60)
653
+ old_ids = []
654
+
655
+ for i, metadata in enumerate(result["metadatas"]):
656
+ created_at = metadata.get("created_at", time.time())
657
+ if created_at < cutoff_time:
658
+ old_ids.append(result["ids"][i])
659
+
660
+ # Delete old documents
661
+ if old_ids:
662
+ collection.delete(ids=old_ids)
663
+ print(f"🗑️ Cleaned up {len(old_ids)} old documents from {collection_name}")
664
+
665
+ return len(old_ids)
666
+
667
+ except Exception as e:
668
+ print(f"⚠️ Cleanup failed: {e}")
669
+ return 0
670
+
671
+ def advanced_rag_search(
672
+ self,
673
+ query: str,
674
+ collection_name: str = "learning_resources",
675
+ filters: Optional[Dict[str, Any]] = None,
676
+ top_k: int = 5,
677
+ use_cache: bool = True
678
+ ) -> List[Document]:
679
+ """
680
+ Advanced RAG pipeline with all optimizations.
681
+
682
+ Pipeline:
683
+ 1. Semantic cache check (Redis)
684
+ 2. Query rewriting (LLM)
685
+ 3. Hybrid retrieval (BM25 + Semantic)
686
+ 4. Reciprocal rank fusion
687
+ 5. Reranking (Cohere/Cross-encoder)
688
+ 6. Contextual compression (LLM)
689
+
690
+ Args:
691
+ query: Search query
692
+ collection_name: Collection to search
693
+ filters: Optional metadata filters
694
+ top_k: Final number of results
695
+ use_cache: Whether to use semantic caching
696
+
697
+ Returns:
698
+ Optimized, relevant documents
699
+ """
700
+ print(f"\n🚀 Advanced RAG Pipeline Started")
701
+ print(f"Query: '{query}'")
702
+
703
+ # Step 1: Check semantic cache
704
+ cached_result = None
705
+ if ENABLE_SEMANTIC_CACHE and use_cache:
706
+ try:
707
+ from src.utils.semantic_cache import SemanticCache
708
+ cache_client = SemanticCache(
709
+ redis_url=REDIS_URL,
710
+ redis_host=REDIS_HOST,
711
+ redis_port=REDIS_PORT,
712
+ redis_password=REDIS_PASSWORD,
713
+ redis_db=REDIS_DB,
714
+ ttl=SEMANTIC_CACHE_TTL,
715
+ similarity_threshold=SEMANTIC_CACHE_THRESHOLD
716
+ )
717
+ cached_result = cache_client.get(query)
718
+ if cached_result:
719
+ print("💰 Cache hit! Returning cached results")
720
+ return cached_result
721
+ except Exception as e:
722
+ print(f"⚠️ Semantic cache check failed: {e}")
723
+
724
+ # Step 2: Query rewriting
725
+ original_query = query
726
+ if QUERY_REWRITE_ENABLED:
727
+ try:
728
+ from src.ml.query_rewriter import QueryRewriter
729
+ rewriter = QueryRewriter(
730
+ model=QUERY_REWRITE_MODEL,
731
+ max_tokens=QUERY_REWRITE_MAX_TOKENS
732
+ )
733
+ query = rewriter.rewrite_if_needed(query)
734
+ except Exception as e:
735
+ print(f"⚠️ Query rewriting failed: {e}")
736
+
737
+ # Step 3: Hybrid retrieval
738
+ try:
739
+ from src.data.bm25_retriever import BM25Retriever, reciprocal_rank_fusion
740
+
741
+ # Get all documents for BM25 indexing
742
+ try:
743
+ collection = self.client.get_collection(
744
+ name=collection_name,
745
+ embedding_function=self.embedding_function
746
+ )
747
+ all_docs_result = collection.get()
748
+
749
+ if all_docs_result and all_docs_result.get("documents"):
750
+ all_documents = [
751
+ Document(
752
+ page_content=doc,
753
+ metadata=all_docs_result["metadatas"][i] if all_docs_result.get("metadatas") else {}
754
+ )
755
+ for i, doc in enumerate(all_docs_result["documents"])
756
+ ]
757
+ else:
758
+ all_documents = []
759
+ except Exception:
760
+ all_documents = []
761
+
762
+ # BM25 search
763
+ bm25_results = []
764
+ if all_documents:
765
+ bm25 = BM25Retriever(k1=BM25_K1, b=BM25_B)
766
+ bm25.index_documents(all_documents)
767
+ bm25_results = bm25.search(query, top_k=HYBRID_TOP_K)
768
+
769
+ # Semantic search
770
+ semantic_docs = self.search_documents(
771
+ query=query,
772
+ collection_name=collection_name,
773
+ filters=filters,
774
+ top_k=HYBRID_TOP_K
775
+ )
776
+ semantic_results = [
777
+ {
778
+ 'document': doc,
779
+ 'score': doc.metadata.get('relevance_score', 0.5),
780
+ 'rank': i + 1
781
+ }
782
+ for i, doc in enumerate(semantic_docs)
783
+ ]
784
+
785
+ # Fusion
786
+ if bm25_results and semantic_results:
787
+ fused_results = reciprocal_rank_fusion([bm25_results, semantic_results])
788
+ print(f"🔀 Fused {len(bm25_results)} BM25 + {len(semantic_results)} semantic results")
789
+ elif bm25_results:
790
+ fused_results = bm25_results
791
+ else:
792
+ fused_results = semantic_results
793
+
794
+ # Extract documents from fused results
795
+ candidate_docs = [r['document'] for r in fused_results[:HYBRID_TOP_K]]
796
+
797
+ except Exception as e:
798
+ print(f"⚠️ Hybrid retrieval failed: {e}. Falling back to semantic only.")
799
+ candidate_docs = self.search_documents(
800
+ query=query,
801
+ collection_name=collection_name,
802
+ filters=filters,
803
+ top_k=HYBRID_TOP_K
804
+ )
805
+
806
+ # Step 4: Reranking
807
+ if RERANK_ENABLED and candidate_docs:
808
+ try:
809
+ from src.ml.reranker import Reranker
810
+ reranker = Reranker(
811
+ use_local=USE_LOCAL_RERANKER,
812
+ cohere_api_key=COHERE_API_KEY,
813
+ cohere_model=COHERE_RERANK_MODEL,
814
+ local_model=LOCAL_RERANKER_MODEL
815
+ )
816
+ reranked_results = reranker.rerank(query, candidate_docs, top_k=RERANK_TOP_K)
817
+ candidate_docs = [r['document'] for r in reranked_results]
818
+ except Exception as e:
819
+ print(f"⚠️ Reranking failed: {e}")
820
+ candidate_docs = candidate_docs[:RERANK_TOP_K]
821
+ else:
822
+ candidate_docs = candidate_docs[:top_k]
823
+
824
+ # Step 5: Contextual compression
825
+ final_docs = candidate_docs
826
+ if CONTEXTUAL_COMPRESSION_ENABLED and candidate_docs:
827
+ try:
828
+ from src.ml.context_compressor import ContextCompressor
829
+ compressor = ContextCompressor(
830
+ model=COMPRESSION_MODEL,
831
+ max_tokens=COMPRESSION_MAX_TOKENS
832
+ )
833
+ final_docs = compressor.compress(query, candidate_docs)
834
+ except Exception as e:
835
+ print(f"⚠️ Compression failed: {e}")
836
+
837
+ # Cache the results
838
+ if ENABLE_SEMANTIC_CACHE and use_cache and final_docs:
839
+ try:
840
+ cache_client.set(original_query, final_docs)
841
+ except Exception as e:
842
+ print(f"⚠️ Cache set failed: {e}")
843
+
844
+ print(f"✅ Advanced RAG Complete: {len(final_docs)} optimized documents\n")
845
+ return final_docs
846
+
847
+ def _initialize_collection(self, name: str, metadata: Optional[Dict[str, Any]] = None):
848
+ """Safely get or create a Chroma collection, repairing schema if needed."""
849
+ try:
850
+ return self.client.get_or_create_collection(
851
+ name=name,
852
+ embedding_function=self.embedding_function,
853
+ metadata=metadata
854
+ )
855
+ except Exception as exc:
856
+ if self._try_repair_collection_schema(exc):
857
+ return self.client.get_or_create_collection(
858
+ name=name,
859
+ embedding_function=self.embedding_function,
860
+ metadata=metadata
861
+ )
862
+ raise
863
+
864
+ def _try_repair_collection_schema(self, error: Exception) -> bool:
865
+ """Attempt to repair missing columns in any Chroma table."""
866
+ message = str(error)
867
+ missing_prefix = "no such column: "
868
+ if missing_prefix not in message:
869
+ return False
870
+
871
+ # Extract table name and column name from error message
872
+ # Format: "no such column: table_name.column_name"
873
+ try:
874
+ parts = message.split(missing_prefix, 1)[1].split()[0].strip('"`[]')
875
+ if '.' not in parts:
876
+ return False
877
+ table_name, column_name = parts.split('.', 1)
878
+ except (IndexError, ValueError):
879
+ return False
880
+
881
+ # Validate table and column names (only alphanumeric and underscore)
882
+ safe_table = ''.join(ch for ch in table_name if ch.isalnum() or ch == '_')
883
+ safe_column = ''.join(ch for ch in column_name if ch.isalnum() or ch == '_')
884
+ if safe_table != table_name or safe_column != column_name:
885
+ return False
886
+
887
+ db_file = Path(self.db_path) / "chroma.sqlite3"
888
+ if not db_file.exists():
889
+ return False
890
+
891
+ try:
892
+ with sqlite3.connect(str(db_file)) as conn:
893
+ conn.execute(f"ALTER TABLE {safe_table} ADD COLUMN {safe_column} TEXT")
894
+ conn.commit()
895
+ print(f"✅ Added missing '{safe_table}.{safe_column}' column to Chroma DB")
896
+ return True
897
+ except sqlite3.OperationalError as alter_err:
898
+ print(f"⚠️ Failed to add column {safe_table}.{safe_column}: {alter_err}")
899
+ return False
900
+
901
+ def get_cached_path(self, key: str) -> Optional[Dict[str, Any]]:
902
+ """Get a cached learning path from Redis."""
903
+ try:
904
+ import redis
905
+ # Use REDIS_URL if available and valid (for Upstash, Render, etc.)
906
+ if REDIS_URL and REDIS_URL.strip() and REDIS_URL.startswith(('redis://', 'rediss://', 'unix://')):
907
+ redis_client = redis.from_url(
908
+ REDIS_URL,
909
+ decode_responses=True,
910
+ ssl_cert_reqs=None
911
+ )
912
+ else:
913
+ # Build Redis connection params
914
+ redis_params = {
915
+ 'host': REDIS_HOST,
916
+ 'port': REDIS_PORT,
917
+ 'db': REDIS_DB,
918
+ 'decode_responses': True
919
+ }
920
+ # Only add password if it's not empty (strip whitespace)
921
+ password = (REDIS_PASSWORD or '').strip()
922
+ if password:
923
+ redis_params['password'] = password
924
+
925
+ redis_client = redis.Redis(**redis_params)
926
+
927
+ cached_data = redis_client.get(f"path_cache:{key}")
928
+ if cached_data:
929
+ return json.loads(cached_data)
930
+ return None
931
+ except Exception as e:
932
+ print(f"⚠️ Path cache GET failed: {e}")
933
+ return None
934
+
935
+ def cache_path(self, key: str, path: Dict[str, Any], ttl: int = 3600):
936
+ """Cache a learning path in Redis."""
937
+ try:
938
+ import redis
939
+ # Use REDIS_URL if available and valid (for Upstash, Render, etc.)
940
+ if REDIS_URL and REDIS_URL.strip() and REDIS_URL.startswith(('redis://', 'rediss://', 'unix://')):
941
+ redis_client = redis.from_url(
942
+ REDIS_URL,
943
+ decode_responses=True,
944
+ ssl_cert_reqs=None
945
+ )
946
+ else:
947
+ # Build Redis connection params
948
+ redis_params = {
949
+ 'host': REDIS_HOST,
950
+ 'port': REDIS_PORT,
951
+ 'db': REDIS_DB,
952
+ 'decode_responses': True
953
+ }
954
+ # Only add password if it's not empty (strip whitespace)
955
+ password = (REDIS_PASSWORD or '').strip()
956
+ if password:
957
+ redis_params['password'] = password
958
+
959
+ redis_client = redis.Redis(**redis_params)
960
+
961
+ redis_client.setex(f"path_cache:{key}", ttl, json.dumps(path))
962
+ print(f"💾 Cached learning path: {key[:8]}... (TTL: {ttl}s)")
963
+ except Exception as e:
964
+ print(f"⚠️ Path cache SET failed: {e}")
965
+
966
+ @classmethod
967
+ def shutdown(cls):
968
+ """Gracefully shutdown the shared client connection."""
969
+ if cls._shared_client is not None:
970
+ print("🔌 Shutting down ChromaDB connection...")
971
+ cls._shared_client = None
972
+ cls._shared_embedding_function = None
973
+ print("✅ Connection closed")
src/data/resources.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Educational resource handling for the AI Learning Path Generator.
3
+ Manages resource recommendation and categorization.
4
+ """
5
+ from typing import List, Dict, Any, Optional
6
+ import json
7
+ from pathlib import Path
8
+
9
+ from src.ml.model_orchestrator import ModelOrchestrator
10
+ from src.utils.helpers import difficulty_to_score
11
+ from src.utils.config import RESOURCE_TYPES, LEARNING_STYLES
12
+
13
class ResourceManager:
    """
    Manages educational resources and recommendations.

    Model-generated recommendations are memoized in-process per
    (topic, learning_style, expertise_level, resource_type) so repeated
    queries do not re-invoke the underlying model.
    """
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the resource manager.

        Args:
            api_key: Optional OpenAI API key, forwarded to the orchestrator
        """
        self.model_orchestrator = ModelOrchestrator(api_key)
        # Cache: "{topic}_{style}_{level}_{type}" -> list of resource dicts.
        self.cached_resources = {}

    def recommend_resources(
        self,
        topic: str,
        learning_style: str,
        expertise_level: str,
        count: int = 5,
        resource_type: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Recommend educational resources for a topic.

        Args:
            topic: The topic to find resources for
            learning_style: Preferred learning style
            expertise_level: User's expertise level
            count: Number of resources to recommend
            resource_type: Optional specific resource type

        Returns:
            List of resource recommendations (may be shorter than ``count``
            when a ``resource_type`` filter is applied)
        """
        cache_key = f"{topic}_{learning_style}_{expertise_level}_{resource_type}"

        # Serve from cache only when it can satisfy the requested count.
        # (Previously any hit was sliced, so an earlier small request
        # would permanently cap later, larger requests.)
        cached = self.cached_resources.get(cache_key)
        if cached and len(cached) >= count:
            return cached[:count]

        # Generate resources using the model
        resources = self.model_orchestrator.generate_resource_recommendations(
            topic=topic,
            learning_style=learning_style,
            expertise_level=expertise_level,
            count=count
        )

        # Filter by resource type if specified
        if resource_type and resources:
            resources = [r for r in resources if r.get("type") == resource_type]

        # Cache the (possibly filtered) results for subsequent calls
        self.cached_resources[cache_key] = resources

        return resources

    def categorize_by_learning_style(
        self,
        resources: List[Dict[str, Any]]
    ) -> Dict[str, List[Dict[str, Any]]]:
        """
        Categorize resources by most suitable learning style.

        Args:
            resources: List of resource dictionaries

        Returns:
            Dictionary of resources grouped by learning style
        """
        result = {style: [] for style in LEARNING_STYLES}

        for resource in resources:
            resource_type = resource.get("type", "article")

            # Find the learning style with the highest score for this type.
            # Strict '>' keeps the default when all scores are <= 0 and picks
            # the first style attaining the maximum otherwise.
            best_style = "reading"  # Default
            best_score = 0

            if resource_type in RESOURCE_TYPES:
                for style, score in RESOURCE_TYPES[resource_type].items():
                    if score > best_score:
                        best_score = score
                        best_style = style

            # setdefault guards against a best style (e.g. the "reading"
            # fallback) that is missing from LEARNING_STYLES.
            result.setdefault(best_style, []).append(resource)

        return result

    def load_curated_resources(
        self,
        file_path: str = "data/curated_resources.json"
    ) -> List[Dict[str, Any]]:
        """
        Load curated resources from a JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            List of resource dictionaries; empty list when the file is
            missing or contains invalid JSON
        """
        try:
            # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
            # Windows) would corrupt non-ASCII resource titles.
            with open(file_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return []

    def save_curated_resources(
        self,
        resources: List[Dict[str, Any]],
        file_path: str = "data/curated_resources.json"
    ) -> bool:
        """
        Save curated resources to a JSON file.

        Args:
            resources: List of resource dictionaries
            file_path: Path to save to

        Returns:
            Success status (False on any I/O or serialization error)
        """
        try:
            # Ensure directory exists
            Path(file_path).parent.mkdir(exist_ok=True, parents=True)

            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(resources, f, indent=2)
            return True
        except Exception:
            return False

    def analyze_difficulty(self, resource: Dict[str, Any]) -> float:
        """
        Analyze the difficulty level of a resource.

        Args:
            resource: Resource dictionary with description

        Returns:
            Difficulty score between 0 and 1 (0.5 when nothing to analyze)
        """
        # Prefer an explicit difficulty label when present
        if "difficulty" in resource:
            return difficulty_to_score(resource["difficulty"])

        # Otherwise let the model estimate from the description
        description = resource.get("description", "")
        if description:
            return self.model_orchestrator.analyze_difficulty(description)

        # Default to medium difficulty
        return 0.5

    def filter_by_difficulty(
        self,
        resources: List[Dict[str, Any]],
        max_difficulty: float = 1.0,
        min_difficulty: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Filter resources by difficulty level.

        Args:
            resources: List of resource dictionaries
            max_difficulty: Maximum difficulty score (0-1)
            min_difficulty: Minimum difficulty score (0-1)

        Returns:
            Filtered list of resources (inclusive bounds)
        """
        result = []

        for resource in resources:
            # Use a precomputed score when available; otherwise derive one
            # from the textual difficulty label (default: intermediate).
            if "difficulty_score" in resource:
                score = float(resource["difficulty_score"])
            else:
                score = difficulty_to_score(resource.get("difficulty", "intermediate"))

            if min_difficulty <= score <= max_difficulty:
                result.append(resource)

        return result
src/data/skills_database.py ADDED
@@ -0,0 +1,999 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comprehensive Skills Database
3
+ Contains salary ranges, curated resources, and market information for all supported skills.
4
+ Last Updated: 2025-01-05
5
+ """
6
+
7
+ SKILLS_DATABASE = {
8
+ # ===== CLOUD & DEVOPS =====
9
+ "AWS": {
10
+ "category": "Cloud & DevOps",
11
+ "salary_range": "$110,000 - $160,000",
12
+ "salary_min": 110000,
13
+ "salary_max": 160000,
14
+ "market_info": {
15
+ "demand": "Very High",
16
+ "growth_rate": "+30%",
17
+ "open_positions": "60,000+",
18
+ "top_employers": ["Amazon", "Netflix", "Airbnb", "Capital One", "GE", "NASA"],
19
+ "related_roles": ["Cloud Engineer", "Solutions Architect", "DevOps Engineer", "Cloud Consultant"]
20
+ },
21
+ "resources": {
22
+ "beginner": {
23
+ "youtube": ["freeCodeCamp.org", "AWS Online Tech Talks", "Simplilearn"],
24
+ "websites": ["Aws.amazon.com/training", "Aws.amazon.com/getting-started", "Coursera"]
25
+ },
26
+ "intermediate": {
27
+ "youtube": ["Adrian Cantrill", "Stephane Maarek", "A Cloud Guru"],
28
+ "websites": ["Re:Invent.aws", "Udemy", "Linux Academy"]
29
+ },
30
+ "advanced": {
31
+ "youtube": ["AWS re:Invent", "AWS Summits", "AWS This Week"],
32
+ "websites": ["AWS Well-Architected", "AWS Whitepapers", "AWS Architecture Center"]
33
+ }
34
+ }
35
+ },
36
+ # ===== DATA SCIENCE & AI =====
37
+ "Machine Learning": {
38
+ "category": "Data Science & AI",
39
+ "salary_range": "$100,000 - $180,000",
40
+ "salary_min": 100000,
41
+ "salary_max": 180000,
42
+ "market_info": {
43
+ "demand": "Very High",
44
+ "growth_rate": "+35%",
45
+ "open_positions": "50,000+",
46
+ "top_employers": ["Google", "Meta", "Amazon", "Microsoft", "OpenAI", "Tesla"],
47
+ "related_roles": ["ML Engineer", "Data Scientist", "AI Research Scientist", "MLOps Engineer"]
48
+ },
49
+ "resources": {
50
+ "beginner": {
51
+ "youtube": ["3Blue1Brown", "Sentdex", "freeCodeCamp.org"],
52
+ "websites": ["Coursera", "Kaggle", "Datacamp"]
53
+ },
54
+ "intermediate": {
55
+ "youtube": ["DeepLearningAI", "Andrej Karpathy", "StatQuest with Josh Starmer"],
56
+ "websites": ["Fast.ai", "MachineLearningMastery", "TowardsDataScience"]
57
+ },
58
+ "advanced": {
59
+ "youtube": ["Two Minute Papers", "Andrej Karpathy", "DeepLearningAI"],
60
+ "websites": ["ArXiv.org", "Papers with Code", "Distill.pub"]
61
+ }
62
+ }
63
+ },
64
+
65
+ "Deep Learning": {
66
+ "category": "Data Science & AI",
67
+ "salary_range": "$130,000 - $200,000",
68
+ "salary_min": 130000,
69
+ "salary_max": 200000,
70
+ "market_info": {
71
+ "demand": "Very High",
72
+ "growth_rate": "+40%",
73
+ "open_positions": "35,000+",
74
+ "top_employers": ["OpenAI", "Google DeepMind", "Meta AI", "NVIDIA", "Tesla", "Amazon"],
75
+ "related_roles": ["Deep Learning Engineer", "AI Researcher", "Computer Vision Engineer", "NLP Engineer"]
76
+ },
77
+ "resources": {
78
+ "beginner": {
79
+ "youtube": ["DeepLearningAI", "Sentdex", "3Blue1Brown"],
80
+ "websites": ["Coursera", "Fast.ai", "TensorFlow.org"]
81
+ },
82
+ "intermediate": {
83
+ "youtube": ["Andrej Karpathy", "Siraj Raval", "DeepLearningAI"],
84
+ "websites": ["PyTorch.org", "TowardsDataScience", "Papers with Code"]
85
+ },
86
+ "advanced": {
87
+ "youtube": ["Two Minute Papers", "Yannic Kilcher", "AI Coffee Break"],
88
+ "websites": ["ArXiv.org", "Distill.pub", "OpenAI Research"]
89
+ }
90
+ }
91
+ },
92
+
93
+ "Data Analysis": {
94
+ "category": "Data Science & AI",
95
+ "salary_range": "$90,000 - $130,000",
96
+ "salary_min": 90000,
97
+ "salary_max": 130000,
98
+ "market_info": {
99
+ "demand": "High",
100
+ "growth_rate": "+25%",
101
+ "open_positions": "80,000+",
102
+ "top_employers": ["Google", "Amazon", "Microsoft", "Meta", "Netflix", "Uber"],
103
+ "related_roles": ["Data Analyst", "Business Analyst", "Data Scientist", "Analytics Engineer"]
104
+ },
105
+ "resources": {
106
+ "beginner": {
107
+ "youtube": ["freeCodeCamp.org", "Data School", "Corey Schafer"],
108
+ "websites": ["Datacamp", "Kaggle", "Mode Analytics"]
109
+ },
110
+ "intermediate": {
111
+ "youtube": ["StatQuest with Josh Starmer", "Brandon Foltz", "Ken Jee"],
112
+ "websites": ["TowardsDataScience", "AnalyticsVidhya", "Tableau Public"]
113
+ },
114
+ "advanced": {
115
+ "youtube": ["StatQuest with Josh Starmer", "Data Science Dojo"],
116
+ "websites": ["KDnuggets", "Analytics Vidhya", "Kaggle Competitions"]
117
+ }
118
+ }
119
+ },
120
+
121
+ "Natural Language Processing": {
122
+ "category": "Data Science & AI",
123
+ "salary_range": "$100,000 - $150,000",
124
+ "salary_min": 100000,
125
+ "salary_max": 150000,
126
+ "market_info": {
127
+ "demand": "Very High",
128
+ "growth_rate": "+38%",
129
+ "open_positions": "25,000+",
130
+ "top_employers": ["OpenAI", "Google", "Meta", "Amazon", "Microsoft", "Anthropic"],
131
+ "related_roles": ["NLP Engineer", "Computational Linguist", "ML Engineer", "AI Researcher"]
132
+ },
133
+ "resources": {
134
+ "beginner": {
135
+ "youtube": ["Sentdex", "freeCodeCamp.org", "Krish Naik"],
136
+ "websites": ["Coursera", "NLTK.org", "Spacy.io"]
137
+ },
138
+ "intermediate": {
139
+ "youtube": ["DeepLearningAI", "Stanford NLP", "Jay Alammar"],
140
+ "websites": ["HuggingFace.co", "TowardsDataScience", "Papers with Code"]
141
+ },
142
+ "advanced": {
143
+ "youtube": ["Yannic Kilcher", "AI Coffee Break", "Stanford CS224N"],
144
+ "websites": ["ArXiv.org", "ACL Anthology", "OpenAI Research"]
145
+ }
146
+ }
147
+ },
148
+
149
+ "Computer Vision": {
150
+ "category": "Data Science & AI",
151
+ "salary_range": "$80,000 - $170,000",
152
+ "salary_min": 80000,
153
+ "salary_max": 170000,
154
+ "market_info": {
155
+ "demand": "Very High",
156
+ "growth_rate": "+36%",
157
+ "open_positions": "30,000+",
158
+ "top_employers": ["Tesla", "NVIDIA", "Meta", "Google", "Amazon", "Apple"],
159
+ "related_roles": ["Computer Vision Engineer", "ML Engineer", "Robotics Engineer", "AI Researcher"]
160
+ },
161
+ "resources": {
162
+ "beginner": {
163
+ "youtube": ["freeCodeCamp.org", "Sentdex", "OpenCV"],
164
+ "websites": ["OpenCV.org", "PyImageSearch", "Coursera"]
165
+ },
166
+ "intermediate": {
167
+ "youtube": ["DeepLearningAI", "Two Minute Papers", "First Principles of Computer Vision"],
168
+ "websites": ["PyTorch.org/vision", "TowardsDataScience", "Papers with Code"]
169
+ },
170
+ "advanced": {
171
+ "youtube": ["Yannic Kilcher", "AI Coffee Break", "CVPR Talks"],
172
+ "websites": ["ArXiv.org", "CVPR Conference", "ECCV Conference"]
173
+ }
174
+ }
175
+ },
176
+
177
+ "Data Engineering": {
178
+ "category": "Data Science & AI",
179
+ "salary_range": "$120,000 - $180,000",
180
+ "salary_min": 120000,
181
+ "salary_max": 180000,
182
+ "market_info": {
183
+ "demand": "Very High",
184
+ "growth_rate": "+30%",
185
+ "open_positions": "45,000+",
186
+ "top_employers": ["Amazon", "Google", "Microsoft", "Meta", "Uber", "Airbnb"],
187
+ "related_roles": ["Data Engineer", "Analytics Engineer", "ETL Developer", "Big Data Engineer"]
188
+ },
189
+ "resources": {
190
+ "beginner": {
191
+ "youtube": ["freeCodeCamp.org", "Corey Schafer", "Tech With Tim"],
192
+ "websites": ["Datacamp", "Coursera", "Mode Analytics"]
193
+ },
194
+ "intermediate": {
195
+ "youtube": ["Databricks", "Apache Spark", "Seattle Data Guy"],
196
+ "websites": ["Databricks.com", "Apache.org/Spark", "TowardsDataScience"]
197
+ },
198
+ "advanced": {
199
+ "youtube": ["Data Engineering Podcast", "Advancing Analytics"],
200
+ "websites": ["Databricks University", "Confluent.io", "DataEngineeringPodcast"]
201
+ }
202
+ }
203
+ },
204
+
205
+ "Big Data": {
206
+ "category": "Data Science & AI",
207
+ "salary_range": "$110,000 - $160,000",
208
+ "salary_min": 110000,
209
+ "salary_max": 160000,
210
+ "market_info": {
211
+ "demand": "High",
212
+ "growth_rate": "+28%",
213
+ "open_positions": "40,000+",
214
+ "top_employers": ["Amazon", "Google", "Microsoft", "IBM", "Oracle", "Cloudera"],
215
+ "related_roles": ["Big Data Engineer", "Data Architect", "Hadoop Developer", "Data Platform Engineer"]
216
+ },
217
+ "resources": {
218
+ "beginner": {
219
+ "youtube": ["Simplilearn", "Edureka", "freeCodeCamp.org"],
220
+ "websites": ["Hadoop.apache.org", "Cloudera.com", "Coursera"]
221
+ },
222
+ "intermediate": {
223
+ "youtube": ["Hadoop Illuminated", "Databricks", "Apache Spark"],
224
+ "websites": ["Apache.org/Spark", "Databricks.com", "KDnuggets"]
225
+ },
226
+ "advanced": {
227
+ "youtube": ["Data Engineering Podcast", "Confluent"],
228
+ "websites": ["Confluent.io", "Apache Kafka", "BigDataUniversity"]
229
+ }
230
+ }
231
+ },
232
+
233
+ "AI Ethics": {
234
+ "category": "Data Science & AI",
235
+ "salary_range": "$120,000 - $170,000",
236
+ "salary_min": 120000,
237
+ "salary_max": 170000,
238
+ "market_info": {
239
+ "demand": "Growing",
240
+ "growth_rate": "+45%",
241
+ "open_positions": "5,000+",
242
+ "top_employers": ["OpenAI", "Google", "Meta", "Microsoft", "Anthropic", "Partnership on AI"],
243
+ "related_roles": ["AI Ethics Researcher", "Responsible AI Lead", "AI Policy Analyst", "ML Fairness Engineer"]
244
+ },
245
+ "resources": {
246
+ "beginner": {
247
+ "youtube": ["TED-Ed", "Computerphile", "CrashCourse"],
248
+ "websites": ["AIethicsguidelines.global", "Coursera", "Ethics.ai"]
249
+ },
250
+ "intermediate": {
251
+ "youtube": ["DeepLearningAI", "Stanford HAI", "Montreal AI Ethics Institute"],
252
+ "websites": ["Futureoflife.org", "AIindex.stanford.edu", "Partnership on AI"]
253
+ },
254
+ "advanced": {
255
+ "youtube": ["Timnit Gebru", "Kate Crawford", "Joy Buolamwini"],
256
+ "websites": ["FAccT Conference", "AIES Conference", "ArXiv.org"]
257
+ }
258
+ }
259
+ },
260
+
261
+ # ===== WEB DEVELOPMENT =====
262
+ "Frontend (React, Vue, Angular)": {
263
+ "category": "Web Development",
264
+ "salary_range": "$100,000 - $140,000",
265
+ "salary_min": 100000,
266
+ "salary_max": 140000,
267
+ "market_info": {
268
+ "demand": "Very High",
269
+ "growth_rate": "+22%",
270
+ "open_positions": "100,000+",
271
+ "top_employers": ["Meta", "Google", "Amazon", "Netflix", "Airbnb", "Uber"],
272
+ "related_roles": ["Frontend Developer", "UI Engineer", "React Developer", "Web Developer"]
273
+ },
274
+ "resources": {
275
+ "beginner": {
276
+ "youtube": ["Traversy Media", "freeCodeCamp.org", "The Net Ninja"],
277
+ "websites": ["Reactjs.org", "MDN Web Docs", "FreeCodeCamp"]
278
+ },
279
+ "intermediate": {
280
+ "youtube": ["Academind", "Fireship", "Web Dev Simplified"],
281
+ "websites": ["Vuejs.org", "Angular.io", "FrontendMasters"]
282
+ },
283
+ "advanced": {
284
+ "youtube": ["Jack Herrington", "Theo - t3.gg", "UI.dev"],
285
+ "websites": ["React Advanced", "Patterns.dev", "Web.dev"]
286
+ }
287
+ }
288
+ },
289
+
290
+ "Backend (Node.js, Django, Flask)": {
291
+ "category": "Web Development",
292
+ "salary_range": "$110,000 - $150,000",
293
+ "salary_min": 110000,
294
+ "salary_max": 150000,
295
+ "market_info": {
296
+ "demand": "Very High",
297
+ "growth_rate": "+25%",
298
+ "open_positions": "90,000+",
299
+ "top_employers": ["Amazon", "Google", "Microsoft", "Meta", "Netflix", "Uber"],
300
+ "related_roles": ["Backend Developer", "API Developer", "Software Engineer", "Full Stack Developer"]
301
+ },
302
+ "resources": {
303
+ "beginner": {
304
+ "youtube": ["Traversy Media", "freeCodeCamp.org", "Corey Schafer"],
305
+ "websites": ["Nodejs.org", "Djangoproject.com", "Flask.palletsprojects.com"]
306
+ },
307
+ "intermediate": {
308
+ "youtube": ["Programming with Mosh", "The Net Ninja", "Tech With Tim"],
309
+ "websites": ["Expressjs.com", "FastAPI.tiangolo.com", "RealPython"]
310
+ },
311
+ "advanced": {
312
+ "youtube": ["Hussein Nasser", "CodeOpinion", "ArjanCodes"],
313
+ "websites": ["System Design Primer", "Microservices.io", "Martin Fowler"]
314
+ }
315
+ }
316
+ },
317
+
318
+ "Full Stack": {
319
+ "category": "Web Development",
320
+ "salary_range": "$110,000 - $170,000",
321
+ "salary_min": 110000,
322
+ "salary_max": 170000,
323
+ "market_info": {
324
+ "demand": "Very High",
325
+ "growth_rate": "+27%",
326
+ "open_positions": "120,000+",
327
+ "top_employers": ["Amazon", "Google", "Meta", "Microsoft", "Shopify", "Stripe"],
328
+ "related_roles": ["Full Stack Developer", "Software Engineer", "Web Developer", "Application Developer"]
329
+ },
330
+ "resources": {
331
+ "beginner": {
332
+ "youtube": ["freeCodeCamp.org", "Traversy Media", "The Net Ninja"],
333
+ "websites": ["Fullstackopen.com", "FreeCodeCamp", "Codecademy"]
334
+ },
335
+ "intermediate": {
336
+ "youtube": ["Academind", "Web Dev Simplified", "Fireship"],
337
+ "websites": ["Udemy", "Coursera", "Dev.to"]
338
+ },
339
+ "advanced": {
340
+ "youtube": ["Theo - t3.gg", "Jack Herrington", "Hussein Nasser"],
341
+ "websites": ["System Design", "Microservices Patterns", "Web.dev"]
342
+ }
343
+ }
344
+ },
345
+
346
+ "JavaScript": {
347
+ "category": "Web Development",
348
+ "salary_range": "$100,000 - $140,000",
349
+ "salary_min": 100000,
350
+ "salary_max": 140000,
351
+ "market_info": {
352
+ "demand": "Very High",
353
+ "growth_rate": "+20%",
354
+ "open_positions": "150,000+",
355
+ "top_employers": ["Google", "Meta", "Amazon", "Microsoft", "Netflix", "Airbnb"],
356
+ "related_roles": ["JavaScript Developer", "Frontend Developer", "Full Stack Developer", "Web Developer"]
357
+ },
358
+ "resources": {
359
+ "beginner": {
360
+ "youtube": ["Traversy Media", "freeCodeCamp.org", "Programming with Mosh"],
361
+ "websites": ["Javascript.info", "MDN Web Docs", "FreeCodeCamp"]
362
+ },
363
+ "intermediate": {
364
+ "youtube": ["The Net Ninja", "Web Dev Simplified", "Fireship"],
365
+ "websites": ["Eloquentjavascript.net", "JavaScript30", "Frontend Masters"]
366
+ },
367
+ "advanced": {
368
+ "youtube": ["Fun Fun Function", "MPJ", "Theo - t3.gg"],
369
+ "websites": ["You Don't Know JS", "JavaScript Weekly", "TC39 Proposals"]
370
+ }
371
+ }
372
+ },
373
+
374
+ "TypeScript": {
375
+ "category": "Web Development",
376
+ "salary_range": "$110,000 - $150,000",
377
+ "salary_min": 110000,
378
+ "salary_max": 150000,
379
+ "market_info": {
380
+ "demand": "Very High",
381
+ "growth_rate": "+35%",
382
+ "open_positions": "80,000+",
383
+ "top_employers": ["Microsoft", "Google", "Meta", "Amazon", "Airbnb", "Stripe"],
384
+ "related_roles": ["TypeScript Developer", "Frontend Engineer", "Full Stack Developer", "Software Engineer"]
385
+ },
386
+ "resources": {
387
+ "beginner": {
388
+ "youtube": ["Traversy Media", "freeCodeCamp.org", "The Net Ninja"],
389
+ "websites": ["Typescriptlang.org", "TypeScript Handbook", "Execute Program"]
390
+ },
391
+ "intermediate": {
392
+ "youtube": ["Academind", "Matt Pocock", "Jack Herrington"],
393
+ "websites": ["Basarat.gitbook.io/typescript", "Total TypeScript", "Frontend Masters"]
394
+ },
395
+ "advanced": {
396
+ "youtube": ["Matt Pocock", "Theo - t3.gg", "Jack Herrington"],
397
+ "websites": ["Type Challenges", "Advanced TypeScript", "TypeScript Deep Dive"]
398
+ }
399
+ }
400
+ },
401
+
402
+ "Web Performance": {
403
+ "category": "Web Development",
404
+ "salary_range": "$110,000 - $150,000",
405
+ "salary_min": 110000,
406
+ "salary_max": 150000,
407
+ "market_info": {
408
+ "demand": "High",
409
+ "growth_rate": "+30%",
410
+ "open_positions": "15,000+",
411
+ "top_employers": ["Google", "Meta", "Amazon", "Netflix", "Cloudflare", "Vercel"],
412
+ "related_roles": ["Performance Engineer", "Frontend Engineer", "Web Developer", "Site Reliability Engineer"]
413
+ },
414
+ "resources": {
415
+ "beginner": {
416
+ "youtube": ["Google Chrome Developers", "Web Dev Simplified", "Fireship"],
417
+ "websites": ["Web.dev", "MDN Performance", "PageSpeed Insights"]
418
+ },
419
+ "intermediate": {
420
+ "youtube": ["Harry Roberts", "Addy Osmani", "Paul Irish"],
421
+ "websites": ["Developers.google.com/web", "Smashingmagazine.com", "Perf.rocks"]
422
+ },
423
+ "advanced": {
424
+ "youtube": ["Chrome Dev Summit", "Performance.now()"],
425
+ "websites": ["WebPageTest", "Lighthouse CI", "Web Vitals"]
426
+ }
427
+ }
428
+ },
429
+
430
+ "Web Security": {
431
+ "category": "Web Development",
432
+ "salary_range": "$110,000 - $150,000",
433
+ "salary_min": 110000,
434
+ "salary_max": 150000,
435
+ "market_info": {
436
+ "demand": "Very High",
437
+ "growth_rate": "+32%",
438
+ "open_positions": "25,000+",
439
+ "top_employers": ["Google", "Meta", "Amazon", "Microsoft", "Cloudflare", "Auth0"],
440
+ "related_roles": ["Security Engineer", "Application Security Engineer", "Web Developer", "DevSecOps Engineer"]
441
+ },
442
+ "resources": {
443
+ "beginner": {
444
+ "youtube": ["freeCodeCamp.org", "Traversy Media", "OWASP"],
445
+ "websites": ["Owasp.org", "Web.dev/security", "MDN Security"]
446
+ },
447
+ "intermediate": {
448
+ "youtube": ["LiveOverflow", "The Cyber Mentor", "PwnFunction"],
449
+ "websites": ["Portswigger.net", "HackerOne", "BugBountyHunter"]
450
+ },
451
+ "advanced": {
452
+ "youtube": ["LiveOverflow", "IppSec", "John Hammond"],
453
+ "websites": ["OWASP Top 10", "Web Security Academy", "HackerOne Reports"]
454
+ }
455
+ }
456
+ },
457
+
458
+ "Progressive Web Apps": {
459
+ "category": "Web Development",
460
+ "salary_range": "$110,000 - $150,000",
461
+ "salary_min": 110000,
462
+ "salary_max": 150000,
463
+ "market_info": {
464
+ "demand": "Growing",
465
+ "growth_rate": "+28%",
466
+ "open_positions": "20,000+",
467
+ "top_employers": ["Google", "Microsoft", "Twitter", "Starbucks", "Uber", "Pinterest"],
468
+ "related_roles": ["PWA Developer", "Frontend Developer", "Mobile Web Developer", "Web Developer"]
469
+ },
470
+ "resources": {
471
+ "beginner": {
472
+ "youtube": ["Google Chrome Developers", "Traversy Media", "freeCodeCamp.org"],
473
+ "websites": ["Web.dev/progressive-web-apps", "PWA Builder", "MDN PWA"]
474
+ },
475
+ "intermediate": {
476
+ "youtube": ["Academind", "Maximilian Schwarzmüller", "Fireship"],
477
+ "websites": ["Developers.google.com/web/pwa", "Workboxjs.org", "PWA Stats"]
478
+ },
479
+ "advanced": {
480
+ "youtube": ["Chrome Dev Summit", "Google I/O"],
481
+ "websites": ["Service Worker Cookbook", "PWA Directory", "Web Capabilities"]
482
+ }
483
+ }
484
+ },
485
+
486
+ # ===== MOBILE DEVELOPMENT =====
487
+ "iOS Development": {
488
+ "category": "Mobile Development",
489
+ "salary_range": "$120,000 - $160,000",
490
+ "salary_min": 120000,
491
+ "salary_max": 160000,
492
+ "market_info": {
493
+ "demand": "High",
494
+ "growth_rate": "+18%",
495
+ "open_positions": "40,000+",
496
+ "top_employers": ["Apple", "Meta", "Amazon", "Uber", "Airbnb", "Netflix"],
497
+ "related_roles": ["iOS Engineer", "Swift Developer", "Mobile Developer", "App Developer"]
498
+ },
499
+ "resources": {
500
+ "beginner": {
501
+ "youtube": ["CodeWithChris", "Sean Allen", "iOS Academy"],
502
+ "websites": ["Developer.apple.com", "Hackingwithswift.com", "Raywenderlich.com"]
503
+ },
504
+ "intermediate": {
505
+ "youtube": ["Lets Build That App", "Kavsoft", "SwiftUI Lab"],
506
+ "websites": ["Swift.org", "Apple Developer Tutorials", "Udemy"]
507
+ },
508
+ "advanced": {
509
+ "youtube": ["WWDC Videos", "Point-Free", "Swift by Sundell"],
510
+ "websites": ["Swift Forums", "NSHipster", "objc.io"]
511
+ }
512
+ }
513
+ },
514
+
515
+ "Android Development": {
516
+ "category": "Mobile Development",
517
+ "salary_range": "$100,000 - $140,000",
518
+ "salary_min": 100000,
519
+ "salary_max": 140000,
520
+ "market_info": {
521
+ "demand": "High",
522
+ "growth_rate": "+20%",
523
+ "open_positions": "50,000+",
524
+ "top_employers": ["Google", "Meta", "Amazon", "Uber", "Netflix", "Spotify"],
525
+ "related_roles": ["Android Engineer", "Kotlin Developer", "Mobile Developer", "App Developer"]
526
+ },
527
+ "resources": {
528
+ "beginner": {
529
+ "youtube": ["Philipp Lackner", "freeCodeCamp.org", "Android Developers"],
530
+ "websites": ["Developer.android.com", "Kotlinlang.org", "Udacity"]
531
+ },
532
+ "intermediate": {
533
+ "youtube": ["Coding in Flow", "Reso Coder", "Stevdza-San"],
534
+ "websites": ["Raywenderlich.com", "Vogella", "Android Weekly"]
535
+ },
536
+ "advanced": {
537
+ "youtube": ["Android Developers", "Philipp Lackner Advanced", "Coding with Mitch"],
538
+ "websites": ["Android Dev Summit", "ProAndroidDev", "Android Arsenal"]
539
+ }
540
+ }
541
+ },
542
+
543
+ "React Native": {
544
+ "category": "Mobile Development",
545
+ "salary_range": "$100,000 - $150,000",
546
+ "salary_min": 100000,
547
+ "salary_max": 150000,
548
+ "market_info": {
549
+ "demand": "High",
550
+ "growth_rate": "+25%",
551
+ "open_positions": "35,000+",
552
+ "top_employers": ["Meta", "Microsoft", "Tesla", "Shopify", "Discord", "Coinbase"],
553
+ "related_roles": ["React Native Developer", "Mobile Developer", "Cross-Platform Developer", "JavaScript Developer"]
554
+ },
555
+ "resources": {
556
+ "beginner": {
557
+ "youtube": ["freeCodeCamp.org", "The Net Ninja", "Programming with Mosh"],
558
+ "websites": ["Reactnative.dev", "Expo.dev", "React Native School"]
559
+ },
560
+ "intermediate": {
561
+ "youtube": ["Academind", "Maximilian Schwarzmüller", "Not Just Dev"],
562
+ "websites": ["Udemy", "Coursera", "React Native Directory"]
563
+ },
564
+ "advanced": {
565
+ "youtube": ["William Candillon", "Catalin Miron", "Infinite Red"],
566
+ "websites": ["React Native EU", "Chain React", "React Native Radio"]
567
+ }
568
+ }
569
+ },
570
+
571
+ "Flutter": {
572
+ "category": "Mobile Development",
573
+ "salary_range": "$100,000 - $140,000",
574
+ "salary_min": 100000,
575
+ "salary_max": 140000,
576
+ "market_info": {
577
+ "demand": "Growing",
578
+ "growth_rate": "+30%",
579
+ "open_positions": "30,000+",
580
+ "top_employers": ["Google", "Alibaba", "BMW", "eBay", "Groupon", "Philips"],
581
+ "related_roles": ["Flutter Developer", "Mobile Developer", "Dart Developer", "Cross-Platform Developer"]
582
+ },
583
+ "resources": {
584
+ "beginner": {
585
+ "youtube": ["The Net Ninja", "freeCodeCamp.org", "Flutter"],
586
+ "websites": ["Flutter.dev", "Dart.dev", "Flutter Codelabs"]
587
+ },
588
+ "intermediate": {
589
+ "youtube": ["Reso Coder", "Academind", "Robert Brunhage"],
590
+ "websites": ["Udemy", "Coursera", "Flutter Awesome"]
591
+ },
592
+ "advanced": {
593
+ "youtube": ["Flutter Europe", "Filledstacks", "Reso Coder Advanced"],
594
+ "websites": ["Flutter Engage", "DartPad", "Pub.dev"]
595
+ }
596
+ }
597
+ },
598
+
599
+ "Mobile UI/UX": {
600
+ "category": "Mobile Development",
601
+ "salary_range": "$90,000 - $130,000",
602
+ "salary_min": 90000,
603
+ "salary_max": 130000,
604
+ "market_info": {
605
+ "demand": "High",
606
+ "growth_rate": "+22%",
607
+ "open_positions": "25,000+",
608
+ "top_employers": ["Apple", "Google", "Meta", "Airbnb", "Uber", "Netflix"],
609
+ "related_roles": ["Mobile UI Designer", "UX Designer", "Product Designer", "Interaction Designer"]
610
+ },
611
+ "resources": {
612
+ "beginner": {
613
+ "youtube": ["DesignCourse", "Flux Academy", "Jesse Showalter"],
614
+ "websites": ["Material.io", "Humaninterface.apple.com", "Uxdesign.cc"]
615
+ },
616
+ "intermediate": {
617
+ "youtube": ["ChunBuns", "Mizko", "Malewicz"],
618
+ "websites": ["Interaction-design.org", "Adobe.com/xd", "Figma.com"]
619
+ },
620
+ "advanced": {
621
+ "youtube": ["Config by Figma", "Apple Design Resources"],
622
+ "websites": ["WWDC Design Sessions", "Material Design Awards", "Mobbin"]
623
+ }
624
+ }
625
+ },
626
+
627
+ "Cross-Platform": {
628
+ "category": "Mobile Development",
629
+ "salary_range": "$100,000 - $140,000",
630
+ "salary_min": 100000,
631
+ "salary_max": 140000,
632
+ "market_info": {
633
+ "demand": "High",
634
+ "growth_rate": "+28%",
635
+ "open_positions": "40,000+",
636
+ "top_employers": ["Microsoft", "Google", "Meta", "Shopify", "Adobe", "SAP"],
637
+ "related_roles": ["Cross-Platform Developer", "Mobile Developer", "Hybrid App Developer", "Multi-Platform Engineer"]
638
+ },
639
+ "resources": {
640
+ "beginner": {
641
+ "youtube": ["freeCodeCamp.org", "Academind", "The Net Ninja"],
642
+ "websites": ["Reactnative.dev", "Flutter.dev", "Ionicframework.com"]
643
+ },
644
+ "intermediate": {
645
+ "youtube": ["Simon Grimm", "Fireship", "Traversy Media"],
646
+ "websites": ["Xamarin.com", "Capacitorjs.com", "Udemy"]
647
+ },
648
+ "advanced": {
649
+ "youtube": ["React Native EU", "Flutter Engage", "Ionic Conf"],
650
+ "websites": ["Native Script", "Kotlin Multiplatform", "Tauri"]
651
+ }
652
+ }
653
+ },
654
+
655
+ "Mobile Games": {
656
+ "category": "Mobile Development",
657
+ "salary_range": "$90,000 - $140,000",
658
+ "salary_min": 90000,
659
+ "salary_max": 140000,
660
+ "market_info": {
661
+ "demand": "Moderate",
662
+ "growth_rate": "+15%",
663
+ "open_positions": "20,000+",
664
+ "top_employers": ["King", "Supercell", "Rovio", "Zynga", "Electronic Arts", "Activision"],
665
+ "related_roles": ["Mobile Game Developer", "Unity Developer", "Game Programmer", "Gameplay Engineer"]
666
+ },
667
+ "resources": {
668
+ "beginner": {
669
+ "youtube": ["Brackeys", "Blackthornprod", "Unity"],
670
+ "websites": ["Unity.com/learn", "Gamedev.tv", "Itch.io"]
671
+ },
672
+ "intermediate": {
673
+ "youtube": ["Code Monkey", "Jonas Tyroller", "Dani"],
674
+ "websites": ["Udemy", "Coursera", "Gamedev.net"]
675
+ },
676
+ "advanced": {
677
+ "youtube": ["GDC", "Unite Conference", "Game Maker's Toolkit"],
678
+ "websites": ["Gamasutra", "Unity Asset Store", "Unreal Marketplace"]
679
+ }
680
+ }
681
+ },
682
+
683
+ "Mobile Security": {
684
+ "category": "Mobile Development",
685
+ "salary_range": "$100,000 - $150,000",
686
+ "salary_min": 100000,
687
+ "salary_max": 150000,
688
+ "market_info": {
689
+ "demand": "High",
690
+ "growth_rate": "+35%",
691
+ "open_positions": "15,000+",
692
+ "top_employers": ["Apple", "Google", "Meta", "Amazon", "Microsoft", "Zimperium"],
693
+ "related_roles": ["Mobile Security Engineer", "Application Security Engineer", "Security Researcher", "Penetration Tester"]
694
+ },
695
+ "resources": {
696
+ "beginner": {
697
+ "youtube": ["The Cyber Mentor", "NetworkChuck", "freeCodeCamp.org"],
698
+ "websites": ["Owasp.org/mobile", "Developer.android.com/security", "Developer.apple.com/security"]
699
+ },
700
+ "intermediate": {
701
+ "youtube": ["LiveOverflow", "John Hammond", "David Bombal"],
702
+ "websites": ["HackerOne", "Bugcrowd", "Mobile Security Testing Guide"]
703
+ },
704
+ "advanced": {
705
+ "youtube": ["Black Hat", "DEF CON", "OWASP Mobile"],
706
+ "websites": ["OWASP MSTG", "Mobile Security Framework", "Frida"]
707
+ }
708
+ }
709
+ },
710
+
711
+ # ===== EMERGING AI ROLES 2025 =====
712
+ "Prompt Engineering": {
713
+ "category": "Emerging AI Roles",
714
+ "salary_range": "$140,000 - $220,000",
715
+ "salary_min": 140000,
716
+ "salary_max": 220000,
717
+ "market_info": {
718
+ "demand": "Very High",
719
+ "growth_rate": "+60%",
720
+ "open_positions": "12,000+",
721
+ "top_employers": ["OpenAI", "Anthropic", "Google", "Microsoft", "Meta", "Startups"],
722
+ "related_roles": ["Prompt Engineer", "AI Product Manager", "LLM Specialist", "Conversational AI Designer"]
723
+ },
724
+ "description": "A specialist who crafts precise inputs (prompts) for generative AI models to optimize outputs, bridging human intent and AI capabilities.",
725
+ "key_responsibilities": [
726
+ "Designing and testing prompts for optimal AI outputs",
727
+ "Iterating on AI responses for accuracy and relevance",
728
+ "Collaborating with developers to refine models",
729
+ "Training teams on effective prompting techniques",
730
+ "A/B testing different prompt strategies"
731
+ ],
732
+ "resources": {
733
+ "beginner": {
734
+ "youtube": ["OpenAI", "AI Explained", "Matt Wolfe"],
735
+ "websites": ["Learn Prompting", "PromptingGuide.ai", "OpenAI Cookbook"]
736
+ },
737
+ "intermediate": {
738
+ "youtube": ["DeepLearningAI", "Prompt Engineering Guide", "AI Jason"],
739
+ "websites": ["Anthropic Docs", "LangChain Docs", "PromptBase"]
740
+ },
741
+ "advanced": {
742
+ "youtube": ["Andrej Karpathy", "Yannic Kilcher", "AI Coffee Break"],
743
+ "websites": ["ArXiv.org", "Papers with Code", "HuggingFace Research"]
744
+ }
745
+ }
746
+ },
747
+
748
+ "AI Ethics & Governance": {
749
+ "category": "Emerging AI Roles",
750
+ "salary_range": "$150,000 - $230,000",
751
+ "salary_min": 150000,
752
+ "salary_max": 230000,
753
+ "market_info": {
754
+ "demand": "Very High",
755
+ "growth_rate": "+55%",
756
+ "open_positions": "8,000+",
757
+ "top_employers": ["OpenAI", "Google", "Meta", "Microsoft", "Anthropic", "Partnership on AI"],
758
+ "related_roles": ["AI Ethics Officer", "Responsible AI Lead", "AI Policy Analyst", "AI Governance Specialist"]
759
+ },
760
+ "description": "An expert focused on ensuring AI systems are fair, transparent, and unbiased, addressing regulatory and societal concerns in AI deployment.",
761
+ "key_responsibilities": [
762
+ "Auditing AI systems for bias and fairness",
763
+ "Developing ethical guidelines and frameworks",
764
+ "Conducting AI impact assessments",
765
+ "Advising on compliance with AI regulations (EU AI Act, etc.)",
766
+ "Stakeholder communication on AI ethics"
767
+ ],
768
+ "resources": {
769
+ "beginner": {
770
+ "youtube": ["TED-Ed", "Computerphile", "CrashCourse AI Ethics"],
771
+ "websites": ["AI Ethics Guidelines", "Ethics.ai", "Partnership on AI"]
772
+ },
773
+ "intermediate": {
774
+ "youtube": ["Stanford HAI", "Montreal AI Ethics Institute", "DeepLearningAI"],
775
+ "websites": ["Futureoflife.org", "AI Index Stanford", "FAccT Conference"]
776
+ },
777
+ "advanced": {
778
+ "youtube": ["Timnit Gebru", "Kate Crawford", "Joy Buolamwini"],
779
+ "websites": ["FAccT Conference", "AIES Conference", "ArXiv AI Ethics"]
780
+ }
781
+ }
782
+ },
783
+
784
+ "AI Auditing": {
785
+ "category": "Emerging AI Roles",
786
+ "salary_range": "$130,000 - $200,000",
787
+ "salary_min": 130000,
788
+ "salary_max": 200000,
789
+ "market_info": {
790
+ "demand": "High",
791
+ "growth_rate": "+50%",
792
+ "open_positions": "6,000+",
793
+ "top_employers": ["Deloitte", "PwC", "KPMG", "EY", "Tech Companies", "Financial Institutions"],
794
+ "related_roles": ["AI Auditor", "ML Compliance Specialist", "AI Risk Analyst", "Algorithm Auditor"]
795
+ },
796
+ "description": "A role involving the inspection of AI systems for accuracy, security, and explainability, similar to financial auditing but for algorithms.",
797
+ "key_responsibilities": [
798
+ "Performing AI risk assessments",
799
+ "Documenting AI decision processes",
800
+ "Verifying model performance and accuracy",
801
+ "Reporting on vulnerabilities or errors",
802
+ "Ensuring regulatory compliance"
803
+ ],
804
+ "resources": {
805
+ "beginner": {
806
+ "youtube": ["AI Auditing Basics", "Computerphile", "freeCodeCamp.org"],
807
+ "websites": ["ISO AI Standards", "NIST AI Framework", "Coursera"]
808
+ },
809
+ "intermediate": {
810
+ "youtube": ["DeepLearningAI", "Stanford AI Audit", "AI Explained"],
811
+ "websites": ["SHAP Documentation", "LIME Tutorials", "TowardsDataScience"]
812
+ },
813
+ "advanced": {
814
+ "youtube": ["NeurIPS Talks", "ICML Tutorials", "AI Audit Research"],
815
+ "websites": ["ArXiv.org", "AI Audit Tools", "Explainable AI Research"]
816
+ }
817
+ }
818
+ },
819
+
820
+ "Generative AI Engineering": {
821
+ "category": "Emerging AI Roles",
822
+ "salary_range": "$160,000 - $250,000",
823
+ "salary_min": 160000,
824
+ "salary_max": 250000,
825
+ "market_info": {
826
+ "demand": "Very High",
827
+ "growth_rate": "+70%",
828
+ "open_positions": "15,000+",
829
+ "top_employers": ["OpenAI", "Stability AI", "Midjourney", "Google", "Meta", "Adobe"],
830
+ "related_roles": ["Generative AI Engineer", "GenAI Developer", "Diffusion Model Specialist", "Creative AI Engineer"]
831
+ },
832
+ "description": "A developer specializing in building and deploying generative models for content creation (text, images, video), fueled by tools like DALL-E and Stable Diffusion.",
833
+ "key_responsibilities": [
834
+ "Integrating generative AI into applications",
835
+ "Fine-tuning models for specific use cases",
836
+ "Optimizing for scalability and performance",
837
+ "Ensuring output quality and safety",
838
+ "Building APIs for generative models"
839
+ ],
840
+ "resources": {
841
+ "beginner": {
842
+ "youtube": ["Sentdex", "freeCodeCamp.org", "AI Explained"],
843
+ "websites": ["HuggingFace.co", "Stability AI Docs", "OpenAI Platform"]
844
+ },
845
+ "intermediate": {
846
+ "youtube": ["DeepLearningAI", "Andrej Karpathy", "Two Minute Papers"],
847
+ "websites": ["PyTorch.org", "TensorFlow.org", "Papers with Code"]
848
+ },
849
+ "advanced": {
850
+ "youtube": ["Yannic Kilcher", "AI Coffee Break", "CVPR Talks"],
851
+ "websites": ["ArXiv.org", "Distill.pub", "NeurIPS Papers"]
852
+ }
853
+ }
854
+ },
855
+
856
+ "Human-AI Collaboration": {
857
+ "category": "Emerging AI Roles",
858
+ "salary_range": "$120,000 - $190,000",
859
+ "salary_min": 120000,
860
+ "salary_max": 190000,
861
+ "market_info": {
862
+ "demand": "High",
863
+ "growth_rate": "+45%",
864
+ "open_positions": "7,000+",
865
+ "top_employers": ["Microsoft", "Google", "Salesforce", "Adobe", "Notion", "Figma"],
866
+ "related_roles": ["Human-AI Collaboration Specialist", "AI UX Designer", "Augmented Intelligence Designer", "AI Product Designer"]
867
+ },
868
+ "description": "A professional designing workflows where humans and AI augment each other, focusing on productivity tools and interface optimization.",
869
+ "key_responsibilities": [
870
+ "Creating collaborative AI interfaces",
871
+ "Training users on AI tools",
872
+ "Measuring human-AI performance metrics",
873
+ "Iterating on feedback loops",
874
+ "Designing AI-augmented workflows"
875
+ ],
876
+ "resources": {
877
+ "beginner": {
878
+ "youtube": ["DesignCourse", "Flux Academy", "Google Design"],
879
+ "websites": ["Interaction-design.org", "Nielsen Norman Group", "UX Collective"]
880
+ },
881
+ "intermediate": {
882
+ "youtube": ["Adobe MAX", "Figma Config", "Microsoft Design"],
883
+ "websites": ["Human-AI Interaction", "ACM CHI", "UX Research Methods"]
884
+ },
885
+ "advanced": {
886
+ "youtube": ["CHI Conference", "CSCW Talks", "HCI Research"],
887
+ "websites": ["ArXiv HCI", "ACM Digital Library", "Human-AI Research"]
888
+ }
889
+ }
890
+ },
891
+
892
+ "AI Agent Architecture": {
893
+ "category": "Emerging AI Roles",
894
+ "salary_range": "$170,000 - $260,000",
895
+ "salary_min": 170000,
896
+ "salary_max": 260000,
897
+ "market_info": {
898
+ "demand": "Very High",
899
+ "growth_rate": "+65%",
900
+ "open_positions": "10,000+",
901
+ "top_employers": ["OpenAI", "Anthropic", "Google DeepMind", "Salesforce", "Microsoft", "Startups"],
902
+ "related_roles": ["AI Agent Architect", "Agentic AI Engineer", "Multi-Agent Systems Developer", "Autonomous AI Engineer"]
903
+ },
904
+ "description": "An engineer who designs autonomous AI agents capable of multi-step tasks (planning, decision-making), rising with agentic AI advancements like Salesforce's Agentforce.",
905
+ "key_responsibilities": [
906
+ "Architecting agent frameworks and systems",
907
+ "Handling multi-agent coordination",
908
+ "Ensuring reliability and error-handling",
909
+ "Scaling for enterprise use",
910
+ "Implementing ethical AI safeguards"
911
+ ],
912
+ "resources": {
913
+ "beginner": {
914
+ "youtube": ["DeepLearningAI", "Sentdex", "AI Explained"],
915
+ "websites": ["LangChain Docs", "AutoGPT", "AgentGPT"]
916
+ },
917
+ "intermediate": {
918
+ "youtube": ["Andrej Karpathy", "Two Minute Papers", "AI Agent Tutorials"],
919
+ "websites": ["LangGraph", "CrewAI", "Multi-Agent Systems"]
920
+ },
921
+ "advanced": {
922
+ "youtube": ["Yannic Kilcher", "AI Coffee Break", "NeurIPS Talks"],
923
+ "websites": ["ArXiv.org", "Reinforcement Learning", "Agent Research Papers"]
924
+ }
925
+ }
926
+ },
927
+ }
928
+
929
+
930
def get_skill_info(skill_name: str, expertise_level: str = "intermediate") -> dict:
    """
    Get skill information including salary and resources filtered by expertise level.

    Args:
        skill_name: Name of the skill (case-insensitive)
        expertise_level: User's expertise level (beginner, intermediate, advanced);
            anything else falls back to "intermediate"

    Returns:
        Dictionary with skill information including filtered resources.
        If the skill is unknown, a generic default record is returned instead.
    """
    # Case-insensitive lookup of the canonical database key.
    wanted = skill_name.lower()
    skill_key = next(
        (key for key in SKILLS_DATABASE if key.lower() == wanted),
        None,
    )

    if not skill_key:
        # Return default data if skill not found
        return {
            "salary_range": "$80,000 - $150,000",
            "market_info": {
                "demand": "Moderate",
                "growth_rate": "+20%",
                "open_positions": "10,000+",
                "top_employers": ["Tech Companies", "Startups", "Enterprises"],
                "related_roles": ["Software Engineer", "Developer", "Technical Specialist"]
            },
            "resources": {
                "youtube": ["freeCodeCamp.org", "Traversy Media", "The Net Ninja"],
                "websites": ["Coursera", "Udemy", "FreeCodeCamp"]
            }
        }

    skill_data = SKILLS_DATABASE[skill_key].copy()

    # Normalize the expertise level; unknown values fall back to intermediate.
    expertise_level = expertise_level.lower()
    if expertise_level not in ("beginner", "intermediate", "advanced"):
        expertise_level = "intermediate"

    if "resources" in skill_data:
        levels = skill_data["resources"]
        if expertise_level in levels:
            selected = levels[expertise_level]
        else:
            # Fallback to intermediate if level not found
            selected = levels.get("intermediate", {
                "youtube": ["freeCodeCamp.org", "Traversy Media"],
                "websites": ["Coursera", "Udemy"]
            })
        # Copy the selected level dict: `.copy()` above is shallow, so without
        # this, callers mutating the result would corrupt SKILLS_DATABASE.
        skill_data["resources"] = dict(selected)

    return skill_data
983
+
984
+
985
def get_all_categories() -> list:
    """Return the sorted list of distinct skill categories in the database."""
    unique_categories = {entry["category"] for entry in SKILLS_DATABASE.values()}
    return sorted(unique_categories)
991
+
992
+
993
def get_skills_by_category(category: str) -> list:
    """Return the alphabetically sorted names of all skills in *category*."""
    return sorted(
        name
        for name, entry in SKILLS_DATABASE.items()
        if entry["category"] == category
    )
src/data/vector_store.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector store implementation for RAG capabilities.
3
+ """
4
+ from typing import List, Dict, Any, Optional
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from langchain.vectorstores import FAISS
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.document_loaders import DirectoryLoader
11
+
12
class VectorStore:
    """
    Manages vector storage for RAG capabilities.

    Wraps a FAISS index persisted under ``vector_db/`` together with an
    embedding backend: free HuggingFace sentence-transformers when that
    package is importable, otherwise OpenAI embeddings (requires an API
    key). When no index is available, :meth:`search` degrades to a plain
    case-insensitive substring match over caller-supplied documents.
    """
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the vector store.

        Args:
            api_key: Optional OpenAI API key (only needed as a fallback
                when HuggingFace embeddings are unavailable)

        Raises:
            ValueError: if HuggingFace embeddings cannot be imported and
                no OpenAI API key was provided.
        """
        self.api_key = api_key

        # Use free sentence-transformers embeddings (no API key needed)
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ Using free HuggingFace embeddings")
        except ImportError:
            # Fallback to OpenAI if HuggingFace not available
            if api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(api_key=api_key)
                print("✅ Using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace not available and no OpenAI API key provided")

        # Index files live in ./vector_db relative to the working directory.
        self.vector_store_path = Path("vector_db")
        self.vector_store_path.mkdir(exist_ok=True)
        # Populated lazily by load_documents(); None means "no index yet".
        self.vector_store = None

    def load_documents(self, directory: str = None) -> None:
        """
        Load documents from a directory and create embeddings.
        If no directory is provided, creates a minimal default vector store.

        Args:
            directory: Optional path to directory containing documents
        """
        try:
            # If no directory provided, create a minimal vector store
            if directory is None:
                self._create_minimal_vector_store()
                return

            # Check if directory exists
            if not os.path.exists(directory):
                print(f"Warning: Document directory {directory} not found. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return

            # Try to load documents
            loader = DirectoryLoader(directory)
            documents = loader.load()

            if not documents:
                print("Warning: No documents found in directory. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return

            # Process documents: chunk them so each embedding covers a
            # bounded span of text, with overlap to preserve context.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
            )

            texts = text_splitter.split_documents(documents)

            # Create or update vector store: append to an existing on-disk
            # index if one is present, otherwise build a fresh one.
            if os.path.exists(self.vector_store_path / "index.faiss"):
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
                self.vector_store.add_documents(texts)
            else:
                self.vector_store = FAISS.from_documents(
                    texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))

        except Exception as e:
            # Best-effort: any failure degrades to the minimal default store
            # rather than leaving the object without an index.
            print(f"Error loading documents: {str(e)}")
            self._create_minimal_vector_store()

    def _create_minimal_vector_store(self) -> None:
        """Create a minimal vector store with default content."""
        try:
            default_texts = [
                "This is a default document. The vector store was initialized with minimal content.",
                "You can add your own documents to the vector store by placing them in the vector_db/documents directory.",
                "The application will automatically load and index any text files found in that directory."
            ]

            # Prefer reloading an existing index over overwriting it with
            # the default texts.
            if os.path.exists(self.vector_store_path / "index.faiss"):
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
            else:
                self.vector_store = FAISS.from_texts(
                    default_texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))

        except Exception as e:
            print(f"Error creating minimal vector store: {str(e)}")
            # Create an empty FAISS index as a last resort (in-memory only;
            # note this is not persisted to disk).
            self.vector_store = FAISS.from_texts(
                ["Default document"],
                self.embeddings
            )

    def search(self, query: str, k: int = 4, documents: List[str] = None) -> List[Dict[str, Any]]:
        """
        Search for relevant documents based on query.

        Args:
            query: Search query
            k: Number of results to return
            documents: Optional list of documents to search through (fallback)

        Returns:
            List of relevant documents with scores, each shaped as
            ``{"content": str, "metadata": dict, "score": float}``.
            Returns an empty list when no index exists and no fallback
            documents were supplied.
        """
        # If vector store is not available, fall back to simple text search
        if not self.vector_store:
            if not documents:
                return []

            # Simple text-based search as fallback: case-insensitive
            # substring containment, fixed score of 1.0.
            query = query.lower()
            return [
                {"content": doc, "score": 1.0, "metadata": {}}
                for doc in documents
                if query in doc.lower()
            ][:k]

        try:
            results = self.vector_store.similarity_search_with_score(query, k=k)
            formatted_results = []
            for doc, score in results:
                formatted_results.append({
                    "content": doc.page_content,
                    "metadata": getattr(doc, 'metadata', {}),
                    # FAISS scores may be numpy scalars; coerce to float,
                    # defaulting to 0.0 for non-numeric score objects.
                    "score": float(score) if hasattr(score, '__float__') else 0.0
                })
            return formatted_results

        except Exception as e:
            print(f"Error in vector store search: {str(e)}")
            # Fall back to simple text search if available
            if documents:
                query = query.lower()
                return [
                    {"content": doc, "score": 1.0, "metadata": {}}
                    for doc in documents
                    if query in doc.lower()
                ][:k]
            return []
src/direct_openai.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Direct OpenAI API handler to bypass any potential middleware issues.
3
+ """
4
+ import os
5
+ import json
6
+ import requests
7
+ from typing import Dict, Any, List, Optional
8
+ from langsmith import traceable as langsmith_traceable
9
+
10
@langsmith_traceable(name="OpenAI_Direct_Call")
def generate_completion(
    prompt: str,
    system_message: str = "You are an expert educational AI assistant that specializes in creating personalized learning paths.",
    model: str = "gpt-3.5-turbo",
    temperature: float = 0.7,
    max_tokens: int = 1000,
    timeout: int = 120
) -> str:
    """
    Generate a completion using direct HTTP requests to OpenAI API.

    Args:
        prompt: The user prompt
        system_message: Optional system message
        model: The OpenAI model to use
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate
        timeout: Request timeout in seconds

    Returns:
        The generated text

    Raises:
        ValueError: if no API key can be found, the response has an
            unexpected shape, or the HTTP request fails.
    """
    # Get API key from environment or directly from file if needed
    api_key = os.environ.get("OPENAI_API_KEY")

    # Fallback to direct read if environment variable isn't working
    if not api_key or len(api_key) < 20:
        try:
            with open('.env', 'r') as f:
                for line in f:
                    if line.startswith('OPENAI_API_KEY='):
                        # Tolerate optional surrounding quotes, which are
                        # common in .env files.
                        api_key = line.strip().split('=', 1)[1].strip('"\'')
                        break
        except Exception as e:
            # Best-effort fallback only; a missing/unreadable .env is
            # reported below via the api_key check.
            print(f"Error reading API key from file: {e}")

    if not api_key:
        raise ValueError("OpenAI API key not found in environment variables or .env file")

    print(f"Using API key starting with: {api_key[:10]}...")

    # API endpoint
    url = "https://api.openai.com/v1/chat/completions"

    # Request headers
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Request payload
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens
    }

    print("Making direct API request to OpenAI...")

    # Make the request
    try:
        response = requests.post(
            url,
            headers=headers,
            json=payload,
            timeout=timeout
        )

        # Check if request was successful
        response.raise_for_status()

        # Parse response
        result = response.json()
        print("Received response from OpenAI API")

        # Extract and return the generated text
        if "choices" in result and len(result["choices"]) > 0:
            return result["choices"][0]["message"]["content"]
        else:
            raise ValueError(f"Unexpected API response: {json.dumps(result)}")

    except requests.exceptions.RequestException as e:
        print(f"API request failed: {str(e)}")
        if hasattr(e, "response") and e.response is not None:
            status_code = e.response.status_code
            try:
                error_data = e.response.json()
                error_message = f"Error code: {status_code} - {json.dumps(error_data)}"
            except ValueError:
                # Body was not valid JSON (ValueError covers
                # json.JSONDecodeError); fall back to the raw text.
                # NOTE: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.
                error_message = f"Error code: {status_code} - {e.response.text}"
        else:
            error_message = str(e)

        raise ValueError(f"OpenAI API request failed: {error_message}")
src/learning_path.py ADDED
@@ -0,0 +1,916 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Learning path generation logic for the AI Learning Path Generator.
3
+ This module handles the creation and management of personalized learning paths.
4
+ """
5
+ import datetime
6
+ import json
7
+ import os
8
+ import uuid
9
+ import hashlib
10
+ from pathlib import Path
11
+ import time
12
+ from typing import Any, Dict, List, Optional, Type
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+
15
+ from langchain.output_parsers import PydanticOutputParser
16
+ from pydantic import BaseModel, Field, ValidationError, validator
17
+
18
+ from src.data.document_store import DocumentStore
19
+ from src.data.skills_database import get_skill_info
20
+ from src.ml.model_orchestrator import ModelOrchestrator
21
+ from src.ml.job_market import get_job_market_stats
22
+ from src.utils.config import (
23
+ DEFAULT_REGION,
24
+ EXPERTISE_LEVELS,
25
+ LEARNING_STYLES,
26
+ TIME_COMMITMENTS,
27
+ )
28
+ from src.utils.helpers import (
29
+ calculate_study_schedule,
30
+ difficulty_to_score,
31
+ match_resources_to_learning_style,
32
+ )
33
+ from src.utils.observability import get_observability_manager, traceable
34
+ from src.utils.semantic_cache import SemanticCache
35
+ # Import for OpenAI-powered resource search
36
+ from src.ml.resource_search import search_resources
37
+
38
+
39
class ResourceItem(BaseModel):
    """A single learning resource.

    NOTE: the ``Field(description=...)`` strings are consumed at runtime
    (the module builds a PydanticOutputParser over these models), so they
    are behavior, not documentation.
    """

    # Resource kind label, e.g. "article", "video", "book"
    type: str = Field(description="Type of the resource (e.g., article, video, book)")
    # Link to the resource
    url: str = Field(description="URL of the resource")
    # One-line human-readable summary
    description: str = Field(description="Brief description of the resource")
45
+
46
+
47
class JobMarketData(BaseModel):
    """Job market data for a skill or role.

    All fields are optional with safe defaults so a path can still be
    produced when market stats are unavailable; ``error`` carries the
    failure reason in that case.
    """

    open_positions: Optional[str] = Field(
        description="Estimated number of open positions for this role/skill.",
        default="N/A",
    )
    trending_employers: Optional[List[str]] = Field(
        description="List of companies currently hiring for this role/skill.",
        default_factory=list,
    )
    average_salary: Optional[str] = Field(
        description="Estimated average salary range for this role/skill.", default="N/A"
    )
    related_roles: Optional[List[str]] = Field(
        description="Related job titles or roles for this skill/role.",
        default_factory=list,
    )
    demand_score: Optional[int] = Field(
        description="Demand score (0-100) for how hot this skill is right now", default=0
    )
    region: Optional[str] = Field(
        description="Region for which these stats apply", default=None
    )
    error: Optional[str] = Field(
        description="Error message if data could not be fetched.", default=None
    )
74
+
75
+
76
class Milestone(BaseModel):
    """A milestone in a learning path.

    A milestone is one module/week of study with its resources, the
    skills it teaches, and associated job-market stats.
    """

    title: str = Field(description="Short title for the milestone")
    description: str = Field(description="Detailed description of what will be learned")
    estimated_hours: int = Field(
        description="Estimated hours to complete this milestone"
    )
    resources: List[ResourceItem] = Field(description="Recommended learning resources")
    skills_gained: List[str] = Field(
        description="Skills gained after completing this milestone"
    )
    job_market_data: JobMarketData = Field(
        description="Job market data for the skills gained",
        default_factory=JobMarketData,
    )

    @validator("resources", pre=True, always=True)
    def check_resources_not_empty(cls, v):
        # Deliberately lenient: an empty resource list is replaced with a
        # placeholder instead of failing validation, so a partially
        # generated path still parses.
        if not v:
            # Instead of raising an error, provide a default resource
            return [
                ResourceItem(
                    type="article",
                    url="https://example.com/default-resource",
                    description="Default resource - Please explore additional materials for this milestone",
                )
            ]
        return v
105
+
106
+
107
class LearningPath(BaseModel):
    """Model representation of a learning path.

    Top-level output schema of the generator: metadata about the study
    plan plus its milestones. ``goals`` and ``milestones`` must be
    non-empty (enforced by validators below).
    """

    # Unique identifier; auto-generated per instance
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    title: str = Field(description="Title of the learning path")
    description: str = Field(description="Detailed description of the learning path")
    topic: str = Field(description="Main topic of study")
    expertise_level: str = Field(description="Starting expertise level")
    learning_style: str = Field(description="Preferred learning style")
    time_commitment: str = Field(description="Weekly time commitment")
    duration_weeks: Optional[int] = Field(
        description="Total duration in weeks", default=0
    )
    goals: List[str] = Field(description="Learning goals and objectives")
    milestones: List["Milestone"] = Field(description="Weekly or modular breakdown")
    schedule: Optional[Dict[str, Any]] = Field(
        default=None, description="The calculated study schedule"
    )
    prerequisites: List[str] = Field(description="Prerequisites for this path")
    total_hours: int = Field(description="Total estimated hours")
    # ISO-8601 creation timestamp, captured at instantiation time
    created_at: str = Field(default_factory=lambda: datetime.datetime.now().isoformat())
    job_market_data: JobMarketData = Field(
        description="Aggregated job market data for the main topic",
        default_factory=JobMarketData,
    )

    @validator("goals", pre=True, always=True)
    def check_goals_not_empty(cls, v):
        # Unlike Milestone.resources, goals are strict: fail fast rather
        # than silently substituting defaults.
        if not v:
            raise ValueError("Learning path goals list cannot be empty")
        # Ensure all goals are non-empty strings
        if not all(isinstance(goal, str) and goal.strip() for goal in v):
            raise ValueError("All goals must be non-empty strings")
        return v

    @validator("milestones", pre=True, always=True)
    def check_milestones_not_empty(cls, v):
        if not v:
            raise ValueError("Learning path milestones list cannot be empty")
        return v
147
+
148
+
149
+ class LearningPathGenerator:
150
+ """
151
+ Core class responsible for generating personalized learning paths.
152
+ """
153
+
154
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the learning path generator.

        Wires up the LLM orchestrator, document store, structured-output
        parser (targeting the LearningPath model), observability manager,
        and a Redis-backed semantic cache.

        Args:
            api_key: Optional OpenAI API key (if not provided in environment)
        """
        self.model_orchestrator = ModelOrchestrator(api_key)
        self.document_store = DocumentStore()
        # Parses raw LLM output into the LearningPath pydantic model.
        self.output_parser = PydanticOutputParser(pydantic_object=LearningPath)
        self.obs_manager = get_observability_manager()
        # Pass REDIS_URL from environment to SemanticCache
        self.semantic_cache = SemanticCache(redis_url=os.getenv('REDIS_URL'))
167
+
168
+ def fetch_job_market_data(
169
+ self,
170
+ skill_or_role: str,
171
+ region: Optional[str] = None,
172
+ expertise_level: str = "intermediate",
173
+ ) -> JobMarketData:
174
+ """
175
+ Fetch job market data for a given skill or role from the skills database.
176
+
177
+ Args:
178
+ skill_or_role: The skill or role to query job market data for.
179
+ region: The region to query job market data for (default is DEFAULT_REGION).
180
+ expertise_level: User's expertise level for resource filtering.
181
+
182
+ Returns:
183
+ A JobMarketData object containing job market statistics.
184
+ """
185
+ try:
186
+ # Get skill info from database (includes salary and market info)
187
+ skill_info = get_skill_info(skill_or_role, expertise_level)
188
+
189
+ # Extract market info
190
+ market_info = skill_info.get("market_info", {})
191
+
192
+ # Create JobMarketData object
193
+ return JobMarketData(
194
+ open_positions=market_info.get("open_positions", "10,000+"),
195
+ average_salary=skill_info.get("salary_range", "$80,000 - $150,000"),
196
+ trending_employers=market_info.get("top_employers", ["Tech Companies"]),
197
+ related_roles=market_info.get("related_roles", ["Software Engineer"]),
198
+ region=region or DEFAULT_REGION
199
+ )
200
+ except Exception as e:
201
+ # Fallback to default data
202
+ return JobMarketData(
203
+ open_positions="10,000+",
204
+ average_salary="$80,000 - $150,000",
205
+ trending_employers=["Tech Companies", "Startups", "Enterprises"],
206
+ related_roles=["Software Engineer", "Developer"],
207
+ region=region or DEFAULT_REGION,
208
+ error=str(e)
209
+ )
210
+
211
+ def fetch_related_roles(
212
+ self, skills: List[str], ai_provider: Optional[str] = None, ai_model: Optional[str] = None
213
+ ) -> List[str]:
214
+ """
215
+ Fetch related job roles for a given list of skills using an LLM.
216
+
217
+ Args:
218
+ skills: The list of skills to find related job roles for.
219
+ ai_provider: The AI provider to use (e.g., 'openai').
220
+ ai_model: The specific AI model to use.
221
+
222
+ Returns:
223
+ A list of related job role titles.
224
+ """
225
+ if not skills:
226
+ return []
227
+
228
+ skills_str = ", ".join(skills)
229
+ prompt = f"""
230
+ Based on the following skills: {skills_str}, what are some relevant job titles or roles that utilize these skills?
231
+ Please provide a list of job titles. Return the answer as a JSON array of strings.
232
+ For example: ["Data Scientist", "Machine Learning Engineer", "Business Analyst"]
233
+ """
234
+
235
+ # Select orchestrator based on provider/model overrides
236
+ orchestrator_to_use = self.model_orchestrator
237
+ if ai_provider or ai_model:
238
+ try:
239
+ override_provider = ai_provider or self.model_orchestrator.provider
240
+ orchestrator_to_use = ModelOrchestrator(provider=override_provider)
241
+ orchestrator_to_use.init_language_model(model_name=ai_model)
242
+ except Exception as init_error:
243
+ print(
244
+ f"⚠️ Falling back to default orchestrator for related roles: {init_error}"
245
+ )
246
+ orchestrator_to_use = self.model_orchestrator
247
+
248
+ try:
249
+ # Use the selected orchestrator to get the response
250
+ response_str = orchestrator_to_use.generate_response(
251
+ prompt,
252
+ use_cache=False,
253
+ )
254
+
255
+ # The response is expected to be a JSON string of a list
256
+ roles = json.loads(response_str)
257
+ if isinstance(roles, list):
258
+ return roles
259
+ return []
260
+ except json.JSONDecodeError:
261
+ # Fallback if the response is not valid JSON
262
+ # Attempt to parse a plain list from the string
263
+ if "[" in response_str and "]" in response_str:
264
+ try:
265
+ # Extract content between brackets and split by comma
266
+ roles_str = response_str[response_str.find('[')+1:response_str.rfind(']')]
267
+ return [role.strip().strip('"\'') for role in roles_str.split(',')]
268
+ except Exception:
269
+ return ["Could not parse roles"]
270
+ return ["Could not determine roles"]
271
+ except Exception as e:
272
+ print(f"An unexpected error occurred while fetching related roles: {e}")
273
+ return []
274
+
275
    def generate_path(
        self,
        topic: str,
        expertise_level: str,
        learning_style: str,
        time_commitment: str = "moderate",
        duration_weeks: Optional[int] = None,
        goals: List[str] = None,
        additional_info: Optional[str] = None,
        context: List[str] = None,
        ai_provider: Optional[str] = None,
        ai_model: Optional[str] = None,
        user_id: Optional[str] = None,  # For tracking in observability
    ) -> LearningPath:
        """
        Generate a personalized learning path based on user preferences.

        Args:
            topic: The main topic of study
            expertise_level: Starting level of expertise
            learning_style: Preferred learning style (None falls back to "visual")
            time_commitment: Weekly time commitment (None falls back to "moderate")
            duration_weeks: User-specified duration in weeks (overrides calculated duration)
            goals: List of learning goals
            additional_info: Any additional information or constraints
            context: Optional extra context strings appended to the LLM prompt
            ai_provider: Optional LLM provider override for this call
            ai_model: Optional LLM model override for this call
            user_id: Optional user ID for tracking

        Returns:
            A complete learning path object

        Raises:
            ValueError: For an invalid expertise level, learning style, or
                time commitment.
            RuntimeError: When the LLM fails to produce a schema-valid path
                after 3 attempts.
        """
        # --- High-Level Cache Check ---
        # Create a stable cache key by sorting and stringifying all inputs
        # NOTE(review): learning_style is NOT part of the cache key, so two
        # requests differing only in learning style share one cached path —
        # confirm this is intended.
        goals_str = json.dumps(sorted(goals) if goals else [])
        cache_key_data = {
            "topic": topic.lower().strip(),
            "expertise_level": expertise_level,
            "time_commitment": time_commitment,
            "duration_weeks": duration_weeks,
            "goals": goals_str,
            "additional_info": additional_info or ""
        }
        cache_key_str = json.dumps(cache_key_data, sort_keys=True).encode('utf-8')
        cache_key = hashlib.sha256(cache_key_str).hexdigest()

        cached_path = self.document_store.get_cached_path(cache_key)
        if cached_path:
            print(f"✅ Cache hit for learning path: {cache_key[:16]}... (topic: {topic})")
            # Ensure the cached data is a valid LearningPath object
            try:
                return LearningPath(**cached_path)
            except ValidationError as e:
                print(f"⚠️ Cached path validation failed, regenerating... Error: {e}")
        else:
            print(f"❌ Cache miss for learning path: {cache_key[:16]}... (topic: {topic})")
        # ---------------------------

        # Track generation time for observability
        generation_start_time = time.time()

        # Log the generation attempt
        self.obs_manager.log_event("path_generation_started", {
            "topic": topic,
            "expertise_level": expertise_level,
            "learning_style": learning_style,
            "time_commitment": time_commitment,
            "user_id": user_id
        })

        if goals is None:
            goals = [f"Master {topic}", f"Build practical skills in {topic}"]

        if expertise_level not in EXPERTISE_LEVELS:
            raise ValueError(
                f"Invalid expertise level. Choose from: {', '.join(EXPERTISE_LEVELS.keys())}"
            )

        # Allow None for learning_style and use a default
        if learning_style is None:
            learning_style = "visual"  # Default learning style
        elif learning_style not in LEARNING_STYLES:
            raise ValueError(
                f"Invalid learning style. Choose from: {', '.join(LEARNING_STYLES.keys())}"
            )

        # Allow None for time_commitment and use a default
        if time_commitment is None:
            time_commitment = "moderate"  # Default time commitment
        elif time_commitment not in TIME_COMMITMENTS:
            raise ValueError(
                f"Invalid time commitment. Choose from: {', '.join(TIME_COMMITMENTS.keys())}"
            )

        # RAG: retrieve supporting documents to ground the LLM prompt.
        relevant_docs = self.document_store.search_documents(
            query=topic, filters={"expertise_level": expertise_level}, top_k=10
        )

        hours_map = {"minimal": 2, "moderate": 5, "substantial": 8, "intensive": 15}
        hours_per_week = hours_map.get(time_commitment, 5)

        # Use user-specified duration if provided, otherwise calculate
        if duration_weeks and duration_weeks > 0:
            adjusted_duration = duration_weeks
            print(f"✅ Using user-specified duration: {adjusted_duration} weeks")
        else:
            base_duration = 8
            intensity_factor = {
                "minimal": 2.0,
                "moderate": 1.5,
                "substantial": 1.0,
                "intensive": 0.75,
            }
            complexity_factor = {
                "beginner": 1.0,
                "intermediate": 1.2,
                "advanced": 1.5,
                "expert": 2.0,
            }

            # Fewer weekly hours stretch the plan; harder levels lengthen it.
            adjusted_duration = int(
                base_duration
                * intensity_factor.get(time_commitment, 1.0)
                * complexity_factor.get(expertise_level, 1.0)
            )
            print(f"📊 Calculated duration: {adjusted_duration} weeks")

        # Calculate appropriate number of milestones based on duration
        # Rule: 1 milestone per 1-3 weeks
        if adjusted_duration <= 4:
            target_milestones = 3  # Short paths: 3 milestones
        elif adjusted_duration <= 8:
            target_milestones = 4  # Medium paths: 4 milestones
        elif adjusted_duration <= 12:
            target_milestones = 5  # Standard paths: 5 milestones
        elif adjusted_duration <= 20:
            target_milestones = 6  # Long paths: 6 milestones
        else:
            target_milestones = 7  # Very long paths: 7 milestones

        print(f"🎯 Target milestones for {adjusted_duration} weeks: {target_milestones}")

        # Build semantic cache query signature (captures the high-level intent)
        semantic_signature = json.dumps(
            {
                "topic": topic,
                "expertise_level": expertise_level,
                "time_commitment": time_commitment,
                "duration_weeks": adjusted_duration,
                "target_milestones": target_milestones,
                "goals": goals,
                "additional_info": additional_info,
            },
            sort_keys=True,
        )

        learning_path: Optional[LearningPath] = None
        parsed_successfully = False

        # --- Semantic Cache Check (pre-LLM) ---
        cached_semantic_path = self.semantic_cache.get(semantic_signature)
        if cached_semantic_path:
            try:
                learning_path = LearningPath(**cached_semantic_path)
                parsed_successfully = True
                print("✅ Semantic cache hit for learning path structure")
            except ValidationError as e:
                print(f"⚠️ Semantic cache entry invalid, regenerating. Error: {e}")
                cached_semantic_path = None
        else:
            print("❌ Semantic cache miss for learning path structure")
        # --------------------------------------

        # Few-Shot Prompting: Provide concrete examples to guide the AI
        # This dramatically improves output quality and consistency
        prompt_content = f"""Generate a detailed personalized learning path for the following:

Topic: {topic}
Expertise Level: {expertise_level} - {EXPERTISE_LEVELS[expertise_level]}
Learning Style: {learning_style} - {LEARNING_STYLES[learning_style]}
Time Commitment: {time_commitment} - {TIME_COMMITMENTS[time_commitment]}
Duration: {adjusted_duration} weeks
Target Milestones: {target_milestones} milestones
Learning Goals: {', '.join(goals)}
Additional Information: {additional_info or 'None provided'}

IMPORTANT:
1. Return ONLY valid JSON matching this exact structure.
2. Generate EXACTLY {target_milestones} milestones (no more, no less).
3. Set duration_weeks to EXACTLY {adjusted_duration}.
4. Distribute the milestones evenly across the {adjusted_duration} weeks.

=== EXAMPLE 1: Python Programming (Beginner) ===
{{
"title": "Complete Python Programming Journey",
"description": "A comprehensive learning path designed for absolute beginners to master Python programming through hands-on projects and real-world applications.",
"topic": "Python Programming",
"expertise_level": "beginner",
"learning_style": "visual",
"time_commitment": "moderate",
"duration_weeks": 8,
"goals": ["Master Python basics", "Build real projects", "Prepare for data science"],
"milestones": [
{{
"title": "Python Fundamentals",
"description": "Learn Python syntax, variables, data types, and basic operations",
"estimated_hours": 10,
"resources": [
{{"type": "video", "url": "https://example.com/python-basics", "description": "Python Basics Video Tutorial"}},
{{"type": "interactive", "url": "https://example.com/python-exercises", "description": "Interactive Python Exercises"}}
],
"skills_gained": ["Python syntax", "Data types", "Variables", "Basic operators"]
}},
{{
"title": "Control Flow and Functions",
"description": "Master if statements, loops, and creating reusable functions",
"estimated_hours": 12,
"resources": [
{{"type": "article", "url": "https://example.com/control-flow", "description": "Control Flow Guide"}},
{{"type": "video", "url": "https://example.com/functions", "description": "Functions Deep Dive"}}
],
"skills_gained": ["Conditional logic", "Loops", "Function creation", "Code organization"]
}}
],
"prerequisites": ["Basic computer skills", "Text editor familiarity"],
"total_hours": 40
}}

=== EXAMPLE 2: Machine Learning (Intermediate) ===
{{
"title": "Practical Machine Learning Mastery",
"description": "An intermediate-level path to master machine learning algorithms, model training, and deployment for real-world applications.",
"topic": "Machine Learning",
"expertise_level": "intermediate",
"learning_style": "hands-on",
"time_commitment": "substantial",
"duration_weeks": 12,
"goals": ["Build ML models", "Deploy to production", "Understand ML theory"],
"milestones": [
{{
"title": "Supervised Learning Fundamentals",
"description": "Master regression and classification algorithms with practical implementations",
"estimated_hours": 15,
"resources": [
{{"type": "course", "url": "https://example.com/supervised-learning", "description": "Supervised Learning Course"}},
{{"type": "project", "url": "https://example.com/ml-projects", "description": "Hands-on ML Projects"}}
],
"skills_gained": ["Linear regression", "Logistic regression", "Decision trees", "Model evaluation"]
}}
],
"prerequisites": ["Python programming", "Basic statistics", "Linear algebra basics"],
"total_hours": 60
}}

=== YOUR TASK ===
Now generate a similar learning path for:
Topic: {topic}
Expertise Level: {expertise_level}
Learning Style: {learning_style}
Time Commitment: {time_commitment}
Goals: {', '.join(goals)}

Requirements:
1. Include 3-7 milestones that represent major learning stages
2. Each milestone should have 2-4 resources tailored to the {learning_style} learning style
3. Estimate realistic hours for each milestone
4. List specific skills gained at each milestone
5. Include relevant prerequisites
6. Calculate total_hours as sum of all milestone hours

Return ONLY the JSON object, no markdown formatting or explanation.
"""

        prompt_with_context = prompt_content
        if context:
            context_text = "\n\nAdditional Context:\n" + "\n".join(context)
            prompt_with_context += context_text

        orchestrator_to_use = self.model_orchestrator
        if ai_provider:
            custom_orchestrator = ModelOrchestrator(provider=ai_provider)
            custom_orchestrator.init_language_model(model_name=ai_model)
            orchestrator_to_use = custom_orchestrator

        # Attempt up to 3 times to get a valid LearningPath JSON
        last_error: Optional[Exception] = None
        if not parsed_successfully:
            for attempt in range(3):
                if attempt > 0:
                    print(f"Retrying learning path generation (attempt {attempt+1}) due to previous validation failure…")
                response = orchestrator_to_use.generate_structured_response(
                    prompt=prompt_with_context,
                    output_schema=self.output_parser.get_format_instructions(),
                    relevant_documents=(
                        [doc.page_content for doc in relevant_docs] if relevant_docs else None
                    ),
                    temperature=0.6 + 0.1 * attempt,  # vary temperature slightly on retries
                )
                try:
                    learning_path = self.output_parser.parse(response)
                    parsed_successfully = True
                    # Store the successful structure for future semantic cache hits
                    self.semantic_cache.set(semantic_signature, learning_path.dict())
                    break
                except ValidationError as ve:
                    print("Validation failed when parsing AI response as LearningPath:", ve)
                    print("Offending response:\n", response)
                    last_error = ve
                    # Slightly tweak the prompt for the next attempt
                    prompt_with_context += (
                        "\n\nIMPORTANT: Your last response did NOT match the schema and was therefore rejected. "
                        "You MUST return a COMPLETE JSON object that follows the exact LearningPath schema with ALL required fields."
                    )
                except Exception as e:
                    print("Unexpected error while parsing AI response:", e)
                    print("Offending response:\n", response)
                    last_error = e
                    break  # Unexpected errors – don't retry further

        if not parsed_successfully:
            raise RuntimeError("LearningPath generation failed after 3 attempts") from last_error

        # Fetch job market data ONCE for the main topic (not per milestone)
        # This significantly speeds up generation time
        print(f"📊 Fetching job market data for main topic: {topic}")
        aggregated_job_market = self.fetch_job_market_data(topic, expertise_level=expertise_level)
        learning_path.job_market_data = aggregated_job_market

        # Fetch related roles once for the main topic
        all_skills = []
        for milestone in learning_path.milestones:
            if milestone.skills_gained:
                all_skills.extend(
                    milestone.skills_gained
                    if isinstance(milestone.skills_gained, list)
                    else [milestone.skills_gained]
                )

        if all_skills:
            related_roles = self.fetch_related_roles(
                all_skills[:5],  # Use top 5 skills only
                ai_provider=ai_provider,
                ai_model=ai_model,
            )
            aggregated_job_market.related_roles = related_roles

        # Share the aggregated job market snapshot with each milestone if needed downstream
        for milestone in learning_path.milestones:
            milestone.job_market_data = aggregated_job_market

        # Fetch resources for milestones IN PARALLEL (much faster!)
        print(f"🔍 Fetching resources for {len(learning_path.milestones)} milestones in parallel...")

        def fetch_milestone_resources(milestone_data):
            """Helper function to fetch resources for a single milestone"""
            milestone, index = milestone_data
            try:
                print(f" [{index}/{len(learning_path.milestones)}] Fetching resources for: {milestone.title}")

                # Get trusted sources from the skills database
                skill_info = get_skill_info(topic, expertise_level)
                trusted_sources = skill_info.get("resources", {})

                # Prepare the trusted sources dict for Perplexity
                perplexity_sources = None
                if trusted_sources:
                    perplexity_sources = {
                        'youtube': trusted_sources.get('youtube', []),
                        'websites': trusted_sources.get('websites', [])
                    }
                    print(f" 📚 Using curated sources:")
                    if perplexity_sources.get('youtube'):
                        print(f" YouTube: {', '.join(perplexity_sources['youtube'][:3])}{'...' if len(perplexity_sources['youtube']) > 3 else ''}")
                    if perplexity_sources.get('websites'):
                        print(f" Websites: {', '.join(perplexity_sources['websites'][:3])}{'...' if len(perplexity_sources['websites']) > 3 else ''}")
                else:
                    print(f" ⚠️ No curated sources found for '{topic}' - using general search")

                # Use Perplexity to search within trusted sources
                contextualized_query = f"{topic}: {milestone.title}"
                print(f" 🔍 Searching with Perplexity...")

                perplexity_results = search_resources(
                    contextualized_query,
                    k=5,  # Get more resources for better variety
                    trusted_sources=perplexity_sources
                )

                if perplexity_results and len(perplexity_results) > 0:
                    print(f" ✓ Found {len(perplexity_results)} specific resources from trusted sources")
                    return milestone, [ResourceItem(**r) for r in perplexity_results]
                else:
                    # Fallback to default resources if Perplexity fails
                    print(f" ⚠️ Perplexity search returned no results, using fallback")
                    return milestone, [
                        ResourceItem(
                            type="Video",
                            url=f"https://www.youtube.com/results?search_query={milestone.title.replace(' ', '+')}",
                            description=f"YouTube: {milestone.title}"
                        ),
                        ResourceItem(
                            type="Online Course",
                            url=f"https://www.coursera.org/search?query={milestone.title.replace(' ', '+')}",
                            description=f"Coursera: {milestone.title}"
                        )
                    ]

            except Exception as _err:
                print(f" ⚠️ Resource search failed for {milestone.title}: {_err}")
                # Return default resources
                return milestone, [
                    ResourceItem(
                        type="Video",
                        url=f"https://www.youtube.com/results?search_query={milestone.title.replace(' ', '+')}",
                        description=f"YouTube: {milestone.title}"
                    ),
                    ResourceItem(
                        type="Online Course",
                        url=f"https://www.coursera.org/search?query={milestone.title.replace(' ', '+')}",
                        description=f"Coursera: {milestone.title}"
                    )
                ]

        # Use ThreadPoolExecutor to fetch resources in parallel
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit all tasks
            milestone_data = [(m, i+1) for i, m in enumerate(learning_path.milestones)]
            future_to_milestone = {
                executor.submit(fetch_milestone_resources, data): data[0]
                for data in milestone_data
            }

            # Collect results as they complete
            for future in as_completed(future_to_milestone):
                milestone, resources = future.result()
                milestone.resources = resources

        print(f"✅ All resources fetched!")

        # Validate all resources to ensure they're accessible
        print(f"🔍 Validating resource URLs...")
        all_resources_to_validate = []
        for milestone in learning_path.milestones:
            for resource in milestone.resources:
                all_resources_to_validate.append({
                    'url': resource.url,
                    'title': resource.description,
                    'type': resource.type
                })

        # Run validation asynchronously
        try:
            from src.utils.resource_validator import ResourceValidator
            validator = ResourceValidator(cache_ttl_hours=24, max_retries=2)

            # Create event loop for async validation
            # NOTE(review): asyncio.get_event_loop() outside a running loop is
            # deprecated since Python 3.10 — consider always creating a new loop.
            import asyncio
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            validated_resources = loop.run_until_complete(
                validator.validate_resources(all_resources_to_validate)
            )

            # Update milestones with validation results and filter out invalid resources
            # validated_resources is parallel to all_resources_to_validate, so a
            # single running index walks both in step.
            resource_index = 0
            for milestone in learning_path.milestones:
                validated_milestone_resources = []
                for resource in milestone.resources:
                    if resource_index < len(validated_resources):
                        validation = validated_resources[resource_index].get('validation', {})

                        # Only keep resources with high confidence (valid or temporarily unavailable)
                        if validation.get('valid', False) or validation.get('confidence', 0) >= 0.5:
                            validated_milestone_resources.append(resource)
                            if not validation.get('valid'):
                                print(f" ⚠️ Keeping potentially valid resource: {resource.url[:50]}... (confidence: {validation.get('confidence')})")
                        else:
                            print(f" ❌ Filtered out invalid resource: {resource.url[:50]}... ({validation.get('error', 'unknown error')})")

                    resource_index += 1

                # Update milestone with validated resources
                milestone.resources = validated_milestone_resources

            # Get validation stats
            stats = validator.get_validation_stats()
            print(f"✅ Validation complete: {stats['valid_count']}/{stats['total_checked']} resources valid ({stats['success_rate']}%)")

        except Exception as e:
            print(f"⚠️ Resource validation failed: {e}")
            print(f" Continuing with unvalidated resources...")
            import traceback
            traceback.print_exc()

        # Ensure each milestone has resources after validation; perform general search fallback if needed
        for milestone in learning_path.milestones:
            try:
                if not milestone.resources or len(milestone.resources) == 0:
                    print(f" ⚠️ No valid resources after validation for: {milestone.title}. Running general search fallback...")
                    contextualized_query = f"{topic}: {milestone.title}"
                    general_results = search_resources(contextualized_query, k=5, trusted_sources=None)
                    if general_results:
                        milestone.resources = [ResourceItem(**r) for r in general_results[:3]]

                if not milestone.resources or len(milestone.resources) == 0:
                    print(f" ⚠️ General search returned no results. Adding search links for: {milestone.title}")
                    yt_q = milestone.title.replace(' ', '+')
                    g_q = milestone.title.replace(' ', '+')
                    milestone.resources = [
                        ResourceItem(
                            type="Video",
                            url=f"https://www.youtube.com/results?search_query={yt_q}",
                            description=f"YouTube: {milestone.title}"
                        ),
                        ResourceItem(
                            type="Web Search",
                            url=f"https://www.google.com/search?q={g_q}",
                            description=f"Google: {milestone.title}"
                        ),
                    ]

                if len(milestone.resources) < 2:
                    print(f" ℹ️ Topping up resources for: {milestone.title}")
                    contextualized_query = f"{topic}: {milestone.title}"
                    more_results = search_resources(contextualized_query, k=5, trusted_sources=None)
                    if more_results:
                        for r in more_results:
                            if len(milestone.resources) >= 3:
                                break
                            try:
                                milestone.resources.append(ResourceItem(**r))
                            except Exception:
                                continue
            except Exception as _e:
                print(f" ⚠️ Post-validation fallback failed for {milestone.title}: {_e}")

        # Weight the weekly schedule by each milestone's estimated effort.
        topic_weights = {
            milestone.title: milestone.estimated_hours
            for milestone in learning_path.milestones
        }

        schedule = calculate_study_schedule(
            weeks=adjusted_duration,
            hours_per_week=hours_per_week,
            topic_weights=topic_weights,
        )
        learning_path.schedule = schedule

        # Re-rank each milestone's resources to favor the user's learning style.
        for milestone in learning_path.milestones:
            milestone.resources = match_resources_to_learning_style(
                resources=milestone.resources, learning_style=learning_style
            )

        learning_path.total_hours = sum(
            m.estimated_hours for m in learning_path.milestones if m.estimated_hours
        )
        learning_path.duration_weeks = adjusted_duration
        learning_path.id = str(uuid.uuid4())

        # Mark as successful
        # NOTE(review): 'success' is never read after this point — leftover flag?
        success = True

        # Log success metrics
        generation_time_ms = (time.time() - generation_start_time) * 1000
        self.obs_manager.log_metric("path_generation_success", 1.0, {
            "topic": topic,
            "expertise_level": expertise_level,
            "duration_ms": generation_time_ms,
            "milestone_count": len(learning_path.milestones),
            "user_id": user_id
        })

        self.obs_manager.log_event("path_generation_completed", {
            "topic": topic,
            "expertise_level": expertise_level,
            "milestone_count": len(learning_path.milestones),
            "total_hours": learning_path.total_hours,
            "duration_weeks": learning_path.duration_weeks,
            "generation_time_ms": generation_time_ms,
            "user_id": user_id
        })

        # --- Cache the final result ---
        self.document_store.cache_path(cache_key, learning_path.dict())
        # ---------------------------

        return learning_path
864
+
865
+ def save_path(
866
+ self, learning_path: LearningPath, output_dir: str = "learning_paths"
867
+ ) -> str:
868
+ """
869
+ Save a learning path to file.
870
+
871
+ Args:
872
+ learning_path (LearningPath): The learning path to save.
873
+ output_dir (str, optional): Directory to save the path. Defaults to "learning_paths".
874
+
875
+ Returns:
876
+ str: Path to the saved file.
877
+ """
878
+ path_dir = Path(output_dir)
879
+ path_dir.mkdir(exist_ok=True, parents=True)
880
+
881
+ safe_topic = learning_path.topic.lower().replace(" ", "_")[:30]
882
+ filename = f"{safe_topic}_{learning_path.id[:8]}.json"
883
+ file_path = path_dir / filename
884
+
885
+ with open(file_path, "w") as f:
886
+ f.write(json.dumps(learning_path.dict(), indent=2))
887
+
888
+ return str(file_path)
889
+
890
+ def load_path(
891
+ self, path_id: str, input_dir: str = "learning_paths"
892
+ ) -> Optional[LearningPath]:
893
+ """
894
+ Load a learning path from file by ID.
895
+
896
+ Args:
897
+ path_id (str): ID of the learning path to load.
898
+ input_dir (str, optional): Directory to search for the path. Defaults to "learning_paths".
899
+
900
+ Returns:
901
+ Optional[LearningPath]: The loaded learning path or None if not found.
902
+ """
903
+ path_dir = Path(input_dir)
904
+ if not path_dir.exists():
905
+ return None
906
+
907
+ for file_path in path_dir.glob(f"*_{path_id[:8]}.json"):
908
+ try:
909
+ with open(file_path, "r") as f:
910
+ path_data = json.load(f)
911
+ if path_data.get("id", "").startswith(path_id):
912
+ return LearningPath(**path_data)
913
+ except Exception:
914
+ continue
915
+
916
+ return None
src/ml/context_compressor.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contextual compression module for reducing token usage in RAG.
3
+
4
+ Contextual compression uses an LLM to extract only the most relevant sentences
5
+ from retrieved documents, significantly reducing token count and cost.
6
+ """
7
+ import os
8
+ from typing import List, Optional
9
+ from langchain.schema import Document
10
+ from openai import OpenAI
11
+
12
+
13
class ContextCompressor:
    """
    LLM-based contextual compressor for RAG optimization.

    Takes retrieved documents and extracts only the sentences that are
    directly relevant to the user's query, reducing tokens by 40-60%.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: str = "gpt-3.5-turbo",
        max_tokens: int = 500
    ):
        """
        Initialize context compressor.

        Args:
            api_key: OpenAI API key (falls back to the OPENAI_API_KEY env var)
            model: Model to use for compression
            max_tokens: Maximum tokens per compressed chunk
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.max_tokens = max_tokens
        # Without a key the compressor degrades to a pass-through (compress()
        # returns its input untouched).
        self.client = OpenAI(api_key=self.api_key) if self.api_key else None
        if self.client:
            print(f"✅ Context compressor initialized (model: {model})")
        else:
            print("❌ OPENAI_API_KEY not set. Compression disabled.")

    def compress(
        self,
        query: str,
        documents: List[Document]
    ) -> List[Document]:
        """
        Compress documents by extracting only the content relevant to the query.

        Args:
            query: Original search query
            documents: List of documents to compress

        Returns:
            Compressed documents; originals are kept when compression is
            disabled, a document is already short, or an API call fails.
        """
        if not self.client or not documents:
            return documents

        compressed_docs = []
        total_original_tokens = 0
        total_compressed_tokens = 0

        for doc in documents:
            # Rough token estimate: ~4 characters per token.
            original_tokens = len(doc.page_content) // 4
            total_original_tokens += original_tokens

            # A very short document is not worth an extra LLM round-trip.
            if original_tokens < 100:
                compressed_docs.append(doc)
                total_compressed_tokens += original_tokens
                continue

            try:
                condensed = self._compress_single(query, doc.page_content)
                compressed_docs.append(
                    Document(
                        page_content=condensed,
                        metadata={
                            **doc.metadata,
                            'compressed': True,
                            'original_length': len(doc.page_content),
                            'compressed_length': len(condensed)
                        }
                    )
                )
                total_compressed_tokens += len(condensed) // 4
            except Exception as e:
                print(f"⚠️ Compression failed for document: {e}")
                # Fail open: keep the original document.
                compressed_docs.append(doc)
                total_compressed_tokens += original_tokens

        # Report the aggregate token savings.
        if total_original_tokens > 0:
            savings_pct = ((total_original_tokens - total_compressed_tokens) / total_original_tokens) * 100
            print(f"📉 Compressed {total_original_tokens} → {total_compressed_tokens} tokens ({savings_pct:.1f}% reduction)")

        return compressed_docs

    def _compress_single(self, query: str, content: str) -> str:
        """
        Compress one document's text via a single chat-completion call.

        Args:
            query: Search query
            content: Document content

        Returns:
            Compressed content, or the original text when the model output is
            too short or the API call fails.
        """
        prompt = f"""You are a text compression expert. Extract only the sentences from the following text that are directly relevant to answering this query:

Query: "{query}"

Text:
{content}

Instructions:
1. Extract ONLY sentences that directly answer or relate to the query
2. Preserve the original wording - do not paraphrase
3. Remove redundant or tangential information
4. Keep the extracted sentences in their original order
5. If multiple sentences are relevant, separate them with a space

Relevant sentences:"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that extracts relevant information."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,  # low temperature keeps extraction consistent
                max_tokens=self.max_tokens
            )
            extracted = response.choices[0].message.content.strip()
        except Exception as e:
            print(f"⚠️ Single document compression failed: {e}")
            return content

        # If compression produced empty or very short text, keep the original.
        return content if len(extracted) < 50 else extracted

    def compress_batch(
        self,
        query: str,
        documents: List[Document],
        batch_size: int = 3
    ) -> List[Document]:
        """
        Compress documents in batches for efficiency.

        Args:
            query: Search query
            documents: Documents to compress
            batch_size: Number of documents to compress per API call
                (currently unused — see TODO below)

        Returns:
            Compressed documents
        """
        # For now, delegate to per-document compression.
        # TODO: Implement true batching for better efficiency
        return self.compress(query, documents)
src/ml/embeddings.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector embedding utilities for the AI Learning Path Generator.
3
+ Handles text vectorization for semantic search.
4
+ """
5
+ from typing import List, Dict, Any, Optional, Union
6
+ import numpy as np
7
+
8
+ # Import from langchain (old version compatible with Pydantic v1)
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain.schema import Document
11
+
12
+ from src.utils.config import OPENAI_API_KEY, EMBEDDING_MODEL
13
+
14
class EmbeddingService:
    """
    Service for generating and managing text embeddings.

    Prefers the free local HuggingFace model ("all-MiniLM-L6-v2"); falls back
    to the OpenAI embedding API when the HuggingFace integration is not
    importable.
    """
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the embedding service.

        Args:
            api_key: Optional OpenAI API key (defaults to OPENAI_API_KEY).

        Raises:
            ValueError: If HuggingFace embeddings are unavailable and no
                OpenAI API key is provided.
        """
        self.api_key = api_key or OPENAI_API_KEY

        # Try to use free HuggingFace embeddings first, fallback to OpenAI
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ EmbeddingService using free HuggingFace embeddings")
        except ImportError:
            if self.api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(
                    api_key=self.api_key,
                    model=EMBEDDING_MODEL
                )
                print("✅ EmbeddingService using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace embeddings not available and no OpenAI API key provided")

        # Text splitter used by chunk_text(); overlap keeps context across chunks.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100,
            length_function=len,
        )

    def embed_text(self, text: str) -> List[float]:
        """
        Generate embedding vector for a text string.

        Args:
            text: The text to embed

        Returns:
            Embedding vector as a list of floats

        Raises:
            ValueError: If the underlying embedding backend fails.
        """
        try:
            return self.embeddings.embed_query(text)
        except Exception as e:
            # Chain the original cause for easier debugging.
            raise ValueError(f"Failed to generate embedding: {str(e)}") from e

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for multiple texts.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors

        Raises:
            ValueError: If the underlying embedding backend fails.
        """
        try:
            return self.embeddings.embed_documents(texts)
        except Exception as e:
            raise ValueError(f"Failed to generate document embeddings: {str(e)}") from e

    def chunk_text(
        self,
        text: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        """
        Split text into chunks for embedding.

        Args:
            text: The text to split
            metadata: Optional metadata to add to each chunk

        Returns:
            List of Document objects with text chunks
        """
        # Wrap the text in a Document so metadata propagates to every chunk.
        doc = Document(page_content=text, metadata=metadata or {})

        # Split into chunks using the splitter configured in __init__.
        chunks = self.text_splitter.split_documents([doc])

        return chunks

    def calculate_similarity(
        self,
        embedding1: List[float],
        embedding2: List[float]
    ) -> float:
        """
        Calculate cosine similarity between two embeddings.

        Args:
            embedding1: First embedding vector
            embedding2: Second embedding vector

        Returns:
            Cosine similarity in [-1, 1] (1 = same direction, 0 = orthogonal).
            Returns 0.0 when either vector has zero magnitude.
        """
        # Convert to numpy arrays
        vec1 = np.array(embedding1)
        vec2 = np.array(embedding2)

        # Calculate cosine similarity
        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)

        # Fix: return a float (previously int 0) so the return type is consistent.
        if norm1 == 0 or norm2 == 0:
            return 0.0  # Handle zero vectors

        return float(dot_product / (norm1 * norm2))
src/ml/job_market.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers to fetch real-time job-market data using Perplexity API.
2
+
3
+ The function `get_job_market_stats` queries Perplexity (online search model)
4
+ with a carefully crafted prompt asking for a JSON-only response containing:
5
+ - open_positions: string (e.g. "15,000+")
6
+ - average_salary: string (e.g. "$110,000 - $150,000")
7
+ - trending_employers: array[str] of 3 employer names
8
+
9
+ Perplexity provides real-time web search results, making it perfect for
10
+ current job market data. Falls back to OpenAI if Perplexity is unavailable.
11
+
12
+ If the API or JSON parsing fails, we return a static fallback so the UI
13
+ still renders a snapshot.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import os
18
+ import json
19
+ import logging
20
+ from typing import Dict, Any
21
+
22
+ from openai import OpenAI
23
+
24
# Initialize clients
# NOTE(review): these module-level handles are never assigned or read in this
# module — clients are constructed per call inside _call_perplexity /
# _call_openai. Confirm nothing imports them before removing.
openai_client = None
perplexity_client = None

# Static snapshot returned when both providers fail (or for "__fallback__"),
# so the UI always has something to render.
_DEFAULT_SNAPSHOT: Dict[str, Any] = {
    "open_positions": "5,000+",
    "average_salary": "$120,000 - $160,000",
    "trending_employers": ["Big Tech Co", "Innovative Startup", "Data Insights Inc"],
}

# Prompt sent to the LLM; {topic} is filled in by get_job_market_stats.
# Asks for bare JSON so _extract_json can parse the reply.
PROMPT_TEMPLATE = (
    "Search the web for current US job market data for '{topic}' roles. "
    "Provide real-time statistics from job boards like LinkedIn, Indeed, Glassdoor. "
    "Return ONLY valid JSON (no markdown, no code blocks) with keys: "
    "open_positions (string like '15,000+'), "
    "average_salary (string like '$110,000 - $150,000'), "
    "trending_employers (array of 3 real company names currently hiring)."
)
42
+
43
+
44
def _call_perplexity(prompt: str, timeout: int = 45) -> str:
    """Call Perplexity API for real-time web search results."""
    api_key = os.getenv("PERPLEXITY_API_KEY")
    if not api_key:
        raise RuntimeError("PERPLEXITY_API_KEY env var not set")

    # Perplexity exposes an OpenAI-compatible endpoint, so the OpenAI SDK
    # works unchanged once pointed at their base URL.
    perplexity = OpenAI(
        api_key=api_key,
        base_url="https://api.perplexity.ai"
    )

    chat_messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that searches the web for current job market data. Always return valid JSON."
        },
        {"role": "user", "content": prompt},
    ]

    # "sonar-pro" is Perplexity's online (live web search) model.
    result = perplexity.chat.completions.create(
        model="sonar-pro",
        messages=chat_messages,
        temperature=0.2,
        max_tokens=500,
        timeout=timeout,
    )
    return result.choices[0].message.content
72
+
73
+
74
def _call_openai(prompt: str, timeout: int = 45) -> str:
    """Fallback to OpenAI if Perplexity is unavailable."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY env var not set")

    # Model name comes from the environment, defaulting to a cheap model.
    chosen_model = os.getenv("DEFAULT_MODEL", "gpt-4o-mini")

    openai_api = OpenAI(api_key=api_key)

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant providing job market estimates."},
        {"role": "user", "content": prompt},
    ]

    result = openai_api.chat.completions.create(
        model=chosen_model,
        messages=chat_messages,
        temperature=0.2,
        max_tokens=300,
        timeout=timeout,
    )
    return result.choices[0].message.content
98
+
99
+
100
+ def _extract_json(text: str) -> Dict[str, Any]:
101
+ """Extract JSON from response, handling markdown code blocks."""
102
+ # Remove markdown code blocks if present
103
+ if "```" in text:
104
+ parts = text.split("```")
105
+ for part in parts:
106
+ if part.strip().startswith("json"):
107
+ text = part[4:].strip()
108
+ elif part.strip() and not part.strip().startswith("```"):
109
+ text = part.strip()
110
+
111
+ # Try direct parse
112
+ try:
113
+ return json.loads(text)
114
+ except json.JSONDecodeError:
115
+ # Try to find JSON object in text
116
+ start = text.find("{")
117
+ end = text.rfind("}") + 1
118
+ if start >= 0 and end > start:
119
+ try:
120
+ return json.loads(text[start:end])
121
+ except json.JSONDecodeError:
122
+ pass
123
+
124
+ raise ValueError("Unable to parse JSON from API response")
125
+
126
+
127
def get_job_market_stats(topic: str) -> Dict[str, Any]:
    """Return real-time job-market stats using Perplexity (with OpenAI fallback).

    Tries Perplexity first for real-time web search results.
    Falls back to OpenAI if Perplexity unavailable.
    Returns default snapshot on any failure.
    """
    # Sentinel topic used by callers that explicitly want the static snapshot.
    if topic == "__fallback__":
        return _DEFAULT_SNAPSHOT.copy()

    required_keys = ("open_positions", "average_salary", "trending_employers")
    search_prompt = PROMPT_TEMPLATE.format(topic=topic)

    # First choice: Perplexity, which performs a live web search.
    perplexity_key = os.getenv("PERPLEXITY_API_KEY")
    print(f"DEBUG: Perplexity API key present: {bool(perplexity_key)}")

    if perplexity_key:
        try:
            print(f"DEBUG: Attempting Perplexity search for '{topic}'...")
            logging.info(f"Fetching job market data for '{topic}' using Perplexity (real-time search)...")
            response_text = _call_perplexity(search_prompt)
            print(f"DEBUG: Perplexity raw response: {response_text[:200]}...")
            stats = _extract_json(response_text)
            print(f"DEBUG: Perplexity parsed data: {stats}")

            # Reject partial answers so the caller never sees a broken snapshot.
            if not all(key in stats for key in required_keys):
                raise ValueError("Missing required keys in Perplexity response")

            print(f"✅ Successfully fetched real-time job data via Perplexity")
            logging.info(f"✅ Successfully fetched real-time job data via Perplexity")
            return stats
        except Exception as err:
            print(f"ERROR: Perplexity failed: {err}")
            logging.warning(f"Perplexity job-market fetch failed: {err}. Falling back to OpenAI...")

    # Second choice: OpenAI (no live search, but a reasonable estimate).
    try:
        logging.info(f"Fetching job market data for '{topic}' using OpenAI...")
        response_text = _call_openai(search_prompt)
        stats = _extract_json(response_text)

        # Same shape validation as the Perplexity branch.
        if not all(key in stats for key in required_keys):
            raise ValueError("Missing required keys in OpenAI response")

        logging.info(f"✅ Successfully fetched job data via OpenAI")
        return stats
    except Exception as err:
        # Last resort: static snapshot so the UI still renders something.
        logging.warning(f"OpenAI job-market fetch failed: {err}. Using default snapshot.")
        return _DEFAULT_SNAPSHOT.copy()
src/ml/model_orchestrator.py ADDED
@@ -0,0 +1,1187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model orchestrator for the AI Learning Path Generator.
3
+ Handles interactions with language models and embeddings.
4
+ """
5
+ from langchain.prompts import PromptTemplate, ChatPromptTemplate
6
+ from src.utils.observability import get_observability_manager, estimate_cost
7
+ from src.utils.cache import cache, cached
8
+ from src.utils.helpers import optimize_prompt, count_tokens, estimate_api_cost
9
+ from src.utils.config import (
10
+ OPENAI_API_KEY,
11
+ DEEPSEEK_API_KEY, # Kept for legacy compatibility
12
+ OPENROUTER_API_KEY, # OpenRouter support
13
+ DEFAULT_PROVIDER,
14
+ DEFAULT_MODEL,
15
+ OPENROUTER_FREE_MODEL, # Free model from OpenRouter
16
+ MAX_TOKENS,
17
+ TEMPERATURE
18
+ )
19
+ from langchain.chains import LLMChain
20
+ from typing import List, Dict, Any, Optional, Union, TypeVar, Type
21
+ import json
22
+ import os
23
+
24
+ # Using Pydantic v1
25
+ import pydantic
26
+ from pydantic import BaseModel as PydanticBaseModel
27
+
28
+ # Import from langchain (older version compatible with Pydantic v1)
29
+ from langchain.llms import OpenAI
30
+ from langchain.chat_models import ChatOpenAI
31
+
32
+ # For type hints
33
+ T = TypeVar('T', bound='BaseModel')
34
+
35
+
36
class BaseModel(PydanticBaseModel):
    """Base model using Pydantic v1."""
    class Config:
        # Allow non-pydantic field types (e.g. LangChain objects) without
        # Pydantic raising on schema generation.
        arbitrary_types_allowed = True
40
+
41
+ # We'll use only OpenAI for now to make the application work
42
+ # Both providers will default to using OpenAI
43
+
44
+
45
+ # Import token optimization utilities for cost savings
46
+
47
+ # Import caching utilities to avoid repeated API calls
48
+
49
+ # Import observability utilities for LLM monitoring
50
+
51
+
52
+ class ModelOrchestrator:
53
+ """
54
+ Manages AI model interactions with RAG capabilities.
55
+ """
56
+
57
+ def __init__(self, api_key: Optional[str] = None, provider: Optional[str] = None):
58
+ print("--- ModelOrchestrator.__init__ started ---")
59
+ """
60
+ Initialize the model orchestrator with RAG capabilities.
61
+
62
+ Args:
63
+ api_key: Optional API key (if not provided, will use from environment)
64
+ provider: Optional provider name ('openai', 'openrouter', or 'deepseek')
65
+ """
66
+ self.provider = provider.lower() if provider else DEFAULT_PROVIDER
67
+ self.context = []
68
+ self.goal = None
69
+ self.planning_enabled = True
70
+ self.memory = []
71
+
72
+ # Set up API key based on selected provider
73
+ if self.provider == 'openai':
74
+ self.api_key = api_key or OPENAI_API_KEY
75
+ if not self.api_key:
76
+ raise ValueError(
77
+ "OpenAI API key is required. Please provide it or set the OPENAI_API_KEY environment variable.")
78
+
79
+ print(
80
+ "--- ModelOrchestrator.__init__: Preparing to initialize ChatOpenAI ---")
81
+ print(
82
+ f"--- ModelOrchestrator.__init__: API Key: {str(self.api_key)[:15]}..., Model: {DEFAULT_MODEL}, Temp: {TEMPERATURE}, Max Tokens: {MAX_TOKENS} ---")
83
+ # self.llm = ChatOpenAI(
84
+ # api_key=self.api_key,
85
+ # model_name=DEFAULT_MODEL,
86
+ # temperature=TEMPERATURE,
87
+ # max_tokens=MAX_TOKENS
88
+ # )
89
+ print("--- ModelOrchestrator.__init__: ChatOpenAI initialization SKIPPED ---")
90
+
91
+ print(
92
+ "--- ModelOrchestrator.__init__: Preparing to initialize OpenAI (base_llm) ---")
93
+ # self.base_llm = OpenAI(
94
+ # api_key=self.api_key,
95
+ # model_name=DEFAULT_MODEL,
96
+ # temperature=TEMPERATURE,
97
+ # max_tokens=MAX_TOKENS
98
+ # )
99
+ print(
100
+ "--- ModelOrchestrator.__init__: OpenAI (base_llm) initialization SKIPPED ---")
101
+ elif self.provider == 'deepseek':
102
+ self.api_key = api_key or DEEPSEEK_API_KEY
103
+ if not self.api_key:
104
+ raise ValueError(
105
+ "DeepSeek API key is required. Please provide it or set the DEEPSEEK_API_KEY environment variable.")
106
+ print("--- ModelOrchestrator.__init__: DeepSeek provider selected, client initialization SKIPPED for now ---")
107
+ elif self.provider == 'openrouter':
108
+ self.api_key = api_key or OPENROUTER_API_KEY
109
+ if not self.api_key:
110
+ raise ValueError(
111
+ "OpenRouter API key is required. Please provide it or set the OPENROUTER_API_KEY environment variable.")
112
+ print(
113
+ "--- ModelOrchestrator.__init__: OpenRouter provider selected (free models available) ---")
114
+ # Only OpenAI, OpenRouter and DeepSeek providers are supported now
115
+ # (OpenAI is the primary and recommended provider)
116
+ else:
117
+ raise ValueError(
118
+ f"Unsupported provider: {self.provider}. Use 'openai', 'openrouter', or 'deepseek'.")
119
+
120
+ # Track current model name
121
+ self.model_name = DEFAULT_MODEL
122
+
123
+ # Initialize observability manager
124
+ self.obs_manager = get_observability_manager()
125
+
126
+ # Override default model if DeepSeek provider is selected
127
+ if self.provider == 'deepseek':
128
+ # Allow environment variable override but default to the official DeepSeek chat model
129
+ self.model_name = os.getenv("DEEPSEEK_MODEL", "deepseek-chat")
130
+ print(
131
+ f"--- ModelOrchestrator.__init__: DeepSeek provider detected, using model: {self.model_name} ---")
132
+
133
+ # Initialize the language model based on provider
134
+ print("--- ModelOrchestrator.__init__: Calling init_language_model ---")
135
+ self.init_language_model()
136
+ print("--- ModelOrchestrator.__init__ finished (LLM initialized) ---")
137
+
138
+ def init_language_model(self, model_name: Optional[str] = None, temperature: Optional[float] = None):
139
+ print(
140
+ f"--- ModelOrchestrator.init_language_model started (provider: {self.provider}, model: {model_name or self.model_name}) ---")
141
+ """
142
+ Initialize or switch the language model.
143
+
144
+ Args:
145
+ model_name: Name of the model to use
146
+ temperature: Temperature setting for the model
147
+ """
148
+ # Update model name if provided
149
+ if model_name:
150
+ self.model_name = model_name
151
+
152
+ temp = temperature if temperature is not None else TEMPERATURE
153
+
154
+ # Initialize based on provider
155
+ try:
156
+ if self.provider == 'openai':
157
+ print(
158
+ f"--- ModelOrchestrator.init_language_model: Initializing ChatOpenAI for {self.provider} ---")
159
+ self.llm = ChatOpenAI(
160
+ openai_api_key=self.api_key,
161
+ model=self.model_name,
162
+ temperature=temp,
163
+ max_tokens=MAX_TOKENS,
164
+ )
165
+ print(
166
+ f"--- ModelOrchestrator.init_language_model: ChatOpenAI for {self.provider} initialized ---")
167
+ elif self.provider == 'openrouter':
168
+ print(
169
+ f"--- ModelOrchestrator.init_language_model: Initializing ChatOpenAI for OpenRouter ---")
170
+ # Use OpenRouter free model for this provider
171
+ model_to_use = OPENROUTER_FREE_MODEL
172
+ self.model_name = model_to_use # Update model name
173
+ # OpenRouter uses OpenAI-compatible API with different endpoint
174
+ self.llm = ChatOpenAI(
175
+ openai_api_key=self.api_key,
176
+ openai_api_base="https://openrouter.ai/api/v1",
177
+ model=model_to_use,
178
+ temperature=temp,
179
+ max_tokens=MAX_TOKENS,
180
+ )
181
+ print(
182
+ f"--- ModelOrchestrator.init_language_model: ChatOpenAI for OpenRouter initialized with model: {model_to_use} ---")
183
+ elif self.provider == 'deepseek':
184
+ print(
185
+ f"--- ModelOrchestrator.init_language_model: Initializing ChatOpenAI for {self.provider} ---")
186
+ # DeepSeek uses OpenAI-compatible API
187
+ self.llm = ChatOpenAI(
188
+ openai_api_key=self.api_key,
189
+ openai_api_base="https://api.deepseek.com/v1",
190
+ model=self.model_name,
191
+ temperature=temp,
192
+ max_tokens=MAX_TOKENS,
193
+ )
194
+ print(
195
+ f"--- ModelOrchestrator.init_language_model: ChatOpenAI for DeepSeek initialized ---")
196
+ except Exception as e:
197
+ print(f"Error initializing language model: {str(e)}")
198
+ raise
199
+
200
+ def switch_provider(self, provider: str, api_key: Optional[str] = None, model_name: Optional[str] = None):
201
+ """
202
+ Switch between AI providers.
203
+
204
+ Args:
205
+ provider: The provider to switch to ('openai' or 'deepseek')
206
+ api_key: Optional API key for the provider
207
+ model_name: Optional model name to use
208
+
209
+ Returns:
210
+ str: Status message indicating the provider and model in use
211
+ """
212
+ try:
213
+ self.provider = provider.lower()
214
+
215
+ # Update API key if provided
216
+ if api_key:
217
+ self.api_key = api_key
218
+ elif self.provider == 'openai':
219
+ self.api_key = OPENAI_API_KEY
220
+ elif self.provider == 'deepseek':
221
+ self.api_key = DEEPSEEK_API_KEY
222
+ # OpenAI is the primary provider now
223
+ else:
224
+ raise ValueError(
225
+ f"Unsupported provider: {provider}. Use 'openai' or 'deepseek'.")
226
+
227
+ # Update model name if provided
228
+ if model_name:
229
+ self.model_name = model_name
230
+
231
+ # Re-initialize the language model
232
+ self.init_language_model()
233
+
234
+ return f"Switched to {self.provider} provider with model {self.model_name}"
235
+
236
+ except Exception as e:
237
+ error_msg = f"Error switching to provider {provider}: {str(e)}"
238
+ print(error_msg)
239
+ # Try to fallback to a working provider
240
+ if self.provider != 'openai':
241
+ print("Falling back to OpenAI provider")
242
+ return self.switch_provider('openai', OPENAI_API_KEY, model_name or DEFAULT_MODEL)
243
+ raise ValueError(error_msg) from e
244
+
245
    def generate_response(
        self,
        prompt: str,
        relevant_documents: Optional[List[str]] = None,
        temperature: Optional[float] = None,
        use_cache: bool = True  # NEW: Enable caching by default
    ) -> str:
        """
        Generate a text response from the language model.

        Flow: cache lookup -> prompt optimization -> direct OpenAI call ->
        observability logging -> cache store.

        Args:
            prompt: The prompt for the model
            relevant_documents: Optional list of relevant documents to add context
            temperature: Optional override for model temperature
            use_cache: Whether to use cached responses (default: True)

        Returns:
            The generated response as a string

        Raises:
            ValueError: Wrapping any failure from the underlying API call.
        """
        # Check cache first to save money! 💰
        if use_cache:
            # NOTE(review): `temperature or TEMPERATURE` treats an explicit
            # temperature of 0/0.0 as "unset" in the cache key — confirm intended.
            cache_key = cache.cache_key(
                "response",
                prompt[:200],  # First 200 chars of prompt
                str(relevant_documents)[:100] if relevant_documents else "",
                self.model_name,
                temperature or TEMPERATURE
            )

            cached_response = cache.get(cache_key)
            if cached_response:
                print("💰 Using cached response - $0.00 cost!")
                return cached_response

        # Optimize prompt to reduce token usage and save money! 💰
        full_prompt = optimize_prompt(
            prompt, relevant_documents, max_tokens=4000)

        # Log token count and estimated cost for monitoring
        input_token_count = count_tokens(full_prompt, self.model_name)
        estimated_input_cost = estimate_api_cost(
            input_token_count, self.model_name)
        print(
            f"💰 Token count: {input_token_count} (~${estimated_input_cost:.4f} input cost)")

        try:
            # Set up the temperature (explicit 0.0 IS honored here, unlike the cache key)
            temp = temperature if temperature is not None else TEMPERATURE

            print("DEBUG: About to make OpenAI API call using direct implementation...")

            # Local imports keep module import light and avoid cycles.
            import time
            from src.direct_openai import generate_completion

            try:
                start_time = time.time()
                print(f"DEBUG: Using model: {self.model_name}")
                print(f"DEBUG: Prompt length: {len(full_prompt)} chars")

                # Use our direct implementation that bypasses the client library
                response_text = generate_completion(
                    prompt=full_prompt,
                    system_message="You are an expert educational AI assistant that specializes in creating personalized learning paths.",
                    model=self.model_name,
                    temperature=temp,
                    max_tokens=MAX_TOKENS,
                    timeout=120
                )

                latency_ms = (time.time() - start_time) * 1000
                print(f"DEBUG: API call completed in {latency_ms:.2f}ms")

                # Estimate output tokens and total cost
                output_token_count = count_tokens(
                    response_text, self.model_name) if response_text else 0
                total_cost = estimate_cost(
                    self.model_name, input_token_count, output_token_count)

                # Log to observability platform (LangSmith + W&B)
                self.obs_manager.log_llm_call(
                    prompt=full_prompt,
                    response=response_text,
                    model=self.model_name,
                    metadata={
                        "temperature": temp,
                        "max_tokens": MAX_TOKENS,
                        "provider": self.provider,
                        "cached": False
                    },
                    latency_ms=latency_ms,
                    token_count=input_token_count + output_token_count,
                    cost=total_cost
                )

                # Cache the response for future use (save money!)
                # cache_key is only defined when use_cache is True, which guards this.
                if use_cache and response_text:
                    # Cache for 24 hours
                    cache.set(cache_key, response_text, ttl=86400)

                return response_text

            except Exception as e:
                # Inner handler only logs; the outer handler formats the error.
                print(f"DEBUG: API call failed: {str(e)}")
                raise

        except Exception as e:
            error_msg = f"Error generating response: {str(e)}"
            print(error_msg)
            # Try to extract more detailed error information
            try:
                import traceback
                error_traceback = traceback.format_exc()
                print(f"Error traceback:\n{error_traceback}")

                # Check if it's an OpenAI API error
                if hasattr(e, 'response') and hasattr(e.response, 'json'):
                    error_data = e.response.json()
                    print(f"OpenAI API Error: {error_data}")
                    error_msg += f"\nAPI Error: {error_data.get('error', {}).get('message', str(e))}"

            except Exception as inner_e:
                # Best-effort diagnostics only; never mask the original error.
                print(f"Error while processing error: {str(inner_e)}")

            raise ValueError(error_msg) from e
369
+
370
    def generate_response_stream(
        self,
        prompt: str,
        relevant_documents: Optional[List[str]] = None,
        temperature: Optional[float] = None,
    ):
        """
        Generate streaming response for real-time output.

        Why streaming:
        - Users see progress immediately
        - Perceived performance is better
        - Same cost as regular response!
        - Better UX = happier users

        Args:
            prompt: The prompt for the model
            relevant_documents: Optional list of relevant documents to add context
            temperature: Optional override for model temperature

        Yields:
            Chunks of response text as they arrive; on failure, a single
            "Error: ..." string is yielded instead of raising.
        """
        # Optimize prompt to reduce costs
        full_prompt = optimize_prompt(
            prompt, relevant_documents, max_tokens=4000)

        # Log token count
        token_count = count_tokens(full_prompt, self.model_name)
        estimated_cost = estimate_api_cost(token_count, self.model_name)
        print(
            f"💰 Streaming - Token count: {token_count} (~${estimated_cost:.4f} input cost)")

        temp = temperature if temperature is not None else TEMPERATURE

        try:
            # NOTE(review): this always uses the global OPENAI_API_KEY and the
            # OpenAI endpoint, ignoring self.provider / self.api_key — verify
            # this is intended for openrouter/deepseek configurations.
            from openai import OpenAI
            client = OpenAI(api_key=OPENAI_API_KEY)

            stream = client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are an expert educational AI assistant that specializes in creating personalized learning paths."},
                    {"role": "user", "content": full_prompt}
                ],
                temperature=temp,
                max_tokens=MAX_TOKENS,
                stream=True  # Enable streaming!
            )

            # Relay only content deltas; tool/role deltas carry None content.
            for chunk in stream:
                if chunk.choices[0].delta.content is not None:
                    yield chunk.choices[0].delta.content

        except Exception as e:
            print(f"Streaming error: {str(e)}")
            yield f"Error: {str(e)}"
427
+
428
+ def generate_structured_response(
429
+ self,
430
+ prompt: str,
431
+ output_schema: str,
432
+ relevant_documents: Optional[List[str]] = None,
433
+ temperature: Optional[float] = None,
434
+ use_cache: bool = True # NEW: Enable caching by default
435
+ ) -> str:
436
+ """
437
+ Generate a structured response that follows a specific schema.
438
+
439
+ Args:
440
+ prompt: The prompt for the model
441
+ output_schema: The schema instructions for the output
442
+ relevant_documents: Optional list of relevant documents to add context
443
+ temperature: Optional override for model temperature
444
+ use_cache: Whether to use cached responses (default: True)
445
+
446
+ Returns:
447
+ The generated response as a JSON string
448
+ """
449
+ # Check cache first to save money! 💰
450
+ if use_cache:
451
+ cache_key = cache.cache_key(
452
+ "structured",
453
+ prompt[:200], # First 200 chars of prompt
454
+ output_schema[:100], # First 100 chars of schema
455
+ str(relevant_documents)[:100] if relevant_documents else "",
456
+ self.model_name,
457
+ temperature or 0.2
458
+ )
459
+
460
+ cached_response = cache.get(cache_key)
461
+ if cached_response:
462
+ print("💰 Using cached structured response - $0.00 cost!")
463
+ return cached_response
464
+ # Determine if this is a learning path generation
465
+ is_learning_path = 'LearningPath' in output_schema
466
+
467
+ # Prepare the prompt with schema instructions and emphasize required fields
468
+ required_fields_reminder = ""
469
+ if is_learning_path:
470
+ required_fields_reminder = """
471
+ IMPORTANT: Your response MUST include ALL of these required fields:
472
+ - title: String title of the learning path
473
+ - description: Detailed description of the learning path
474
+ - topic: Main topic of study
475
+ - expertise_level: Starting expertise level
476
+ - learning_style: Preferred learning style
477
+ - time_commitment: Weekly time commitment
478
+ - duration_weeks: Total duration in weeks (integer)
479
+ - goals: List of learning goals and objectives
480
+ - milestones: List of learning milestones
481
+ - prerequisites: List of prerequisites for this path
482
+ - total_hours: Total estimated hours (integer)
483
+
484
+ For each milestone, you MUST include:
485
+ - title: Short title for the milestone
486
+ - description: Detailed description
487
+ - estimated_hours: Estimated hours to complete (integer)
488
+ - resources: List of recommended learning resources
489
+ - skills_gained: List of skills gained after completion
490
+ """
491
+
492
+ schema_prompt = f"""
493
+ {prompt}
494
+
495
+ Your response should follow this schema format:
496
+ {output_schema}
497
+
498
+ {required_fields_reminder}
499
+
500
+ Please provide a valid JSON response that strictly follows this schema.
501
+ Do not include any explanatory text outside the JSON structure.
502
+ """
503
+
504
+ # Optimize prompt with context to reduce token usage 💰
505
+ full_prompt = optimize_prompt(
506
+ schema_prompt, relevant_documents, max_tokens=6000)
507
+
508
+ # Log token count and estimated cost
509
+ token_count = count_tokens(full_prompt, self.model_name)
510
+ estimated_cost = estimate_api_cost(token_count, self.model_name)
511
+ print(
512
+ f"💰 Structured response - Token count: {token_count} (~${estimated_cost:.4f} input cost)")
513
+
514
+ # Set up the temperature - lower for structured outputs
515
+ temp = temperature if temperature is not None else 0.2
516
+
517
+ # Use our direct implementation that bypasses the client library
518
+ import time
519
+ import requests
520
+ import traceback
521
+ response_text = None
522
+
523
+ try:
524
+ start_time = time.time()
525
+ print(
526
+ f"DEBUG: Generating structured response using provider: {self.provider}, model: {self.model_name}")
527
+ print(f"DEBUG: Prompt length: {len(full_prompt)} chars")
528
+
529
+ # Print the first 200 chars of the prompt for debugging
530
+ print(f"DEBUG: Prompt preview: {full_prompt[:200]}...")
531
+
532
+ # Print API key details for debugging (safely)
533
+ if self.provider == 'openai':
534
+ api_key = OPENAI_API_KEY
535
+ if api_key:
536
+ print(
537
+ f"DEBUG: Using OpenAI API key starting with: {api_key[:5]}{'*' * 10}")
538
+ else:
539
+ print("DEBUG: WARNING - No OpenAI API key found!")
540
+
541
+ elif self.provider == 'deepseek':
542
+ api_key = DEEPSEEK_API_KEY
543
+ if api_key:
544
+ print(
545
+ f"DEBUG: Using DeepSeek API key starting with: {api_key[:5]}{'*' * 10}")
546
+ else:
547
+ print("DEBUG: WARNING - No DeepSeek API key found!")
548
+
549
+ # OpenAI is the primary provider now
550
+
551
+ if self.provider == 'openai':
552
+ from src.direct_openai import generate_completion
553
+ print("Attempting to generate OpenAI completion...")
554
+ response_text = generate_completion(
555
+ prompt=full_prompt,
556
+ system_message="You are an expert AI assistant that specializes in generating structured responses following specified schemas. Always include all required fields in your JSON response.",
557
+ model=self.model_name,
558
+ temperature=temp,
559
+ max_tokens=MAX_TOKENS,
560
+ timeout=300 # Increase timeout for reliability
561
+ )
562
+ print(
563
+ f"Successfully generated completion with {len(response_text) if response_text else 0} characters")
564
+ elif self.provider == 'openrouter':
565
+ # OpenRouter uses OpenAI-compatible API via direct_openai with custom endpoint
566
+ from openai import OpenAI as OpenAIClient
567
+ print("Attempting to generate OpenRouter completion...")
568
+
569
+ client = OpenAIClient(
570
+ api_key=self.api_key,
571
+ base_url="https://openrouter.ai/api/v1"
572
+ )
573
+
574
+ # Use free model if not specified
575
+ model_to_use = self.model_name if self.model_name else OPENROUTER_FREE_MODEL
576
+
577
+ try:
578
+ completion = client.chat.completions.create(
579
+ model=model_to_use,
580
+ messages=[
581
+ {"role": "system", "content": "You are an expert AI assistant that specializes in generating structured responses following specified schemas. Always include all required fields in your JSON response."},
582
+ {"role": "user", "content": full_prompt}
583
+ ],
584
+ temperature=temp,
585
+ max_tokens=MAX_TOKENS,
586
+ timeout=300
587
+ )
588
+ response_text = completion.choices[0].message.content
589
+ print(
590
+ f"Successfully generated OpenRouter completion with {len(response_text) if response_text else 0} characters")
591
+ except Exception as e:
592
+ print(f"Error calling OpenRouter API: {e}")
593
+ response_text = None
594
+ elif self.provider == 'deepseek':
595
+ response_text = self._deepseek_completion(
596
+ full_prompt,
597
+ temp,
598
+ system_message="You are an expert AI assistant that specializes in generating structured responses following specified schemas. Always include all required fields in your JSON response."
599
+ )
600
+ # OpenAI is the primary provider now
601
+ else:
602
+ raise ValueError(f"Unknown provider: {self.provider}")
603
+
604
+ print(
605
+ f"DEBUG: API call completed in {time.time() - start_time:.2f} seconds")
606
+ if response_text:
607
+ print(
608
+ f"DEBUG: Received response with length: {len(response_text)} chars")
609
+ print(f"DEBUG: Response preview: {response_text[:100]}...")
610
+ else:
611
+ print("DEBUG: WARNING - Received empty response from API")
612
+ if is_learning_path:
613
+ # Return a fallback learning path
614
+ return self._create_fallback_learning_path()
615
+ else:
616
+ # Return a fallback generic response
617
+ return json.dumps({
618
+ "summary": "Sorry, I encountered an error retrieving information.",
619
+ "key_concepts": ["Error occurred while processing your request"],
620
+ "learning_path": ["Please try again with a different query"],
621
+ "resources": [],
622
+ "code_examples": [],
623
+ "advanced_topics": []
624
+ })
625
+
626
+ except Exception as e:
627
+ print(f"DEBUG: Structured response generation failed: {str(e)}")
628
+ print(traceback.format_exc())
629
+ if is_learning_path:
630
+ # Return a fallback learning path
631
+ return self._create_fallback_learning_path()
632
+ else:
633
+ # Return a fallback generic response
634
+ return json.dumps({
635
+ "summary": f"Sorry, I encountered an error: {str(e)}",
636
+ "key_concepts": ["Unable to extract structured information"],
637
+ "learning_path": ["Please try asking in a different way"],
638
+ "resources": [],
639
+ "code_examples": [],
640
+ "advanced_topics": [],
641
+ "career_applications": []
642
+ })
643
+
644
+ # Extract JSON from the response
645
+ try:
646
+ # Try to find JSON in the response (may be enclosed in ```json blocks)
647
+ if "```json" in response_text:
648
+ json_start = response_text.find("```json") + 7
649
+ json_end = response_text.find("```", json_start)
650
+ json_str = response_text[json_start:json_end].strip()
651
+ elif "```" in response_text:
652
+ json_start = response_text.find("```") + 3
653
+ json_end = response_text.find("```", json_start)
654
+ json_str = response_text[json_start:json_end].strip()
655
+ else:
656
+ json_str = response_text.strip()
657
+
658
+ # Validate JSON
659
+ data = json.loads(json_str)
660
+
661
+ # If expecting a learning path but received a list or wrong type, fallback
662
+ if is_learning_path and not isinstance(data, dict):
663
+ print(
664
+ "DEBUG: Expected learning path dict but received different type, returning fallback path.")
665
+ return self._create_fallback_learning_path()
666
+
667
+ # For learning paths, validate that all required fields are present
668
+ if is_learning_path:
669
+ required_fields = [
670
+ 'title', 'description', 'topic', 'expertise_level',
671
+ 'learning_style', 'time_commitment', 'duration_weeks',
672
+ 'goals', 'milestones', 'prerequisites', 'total_hours'
673
+ ]
674
+
675
+ missing_fields = [
676
+ field for field in required_fields if field not in data]
677
+ if missing_fields:
678
+ print(
679
+ f"DEBUG: Missing required fields in learning path: {missing_fields}")
680
+
681
+ # If any fields are missing, add them with default values
682
+ for field in missing_fields:
683
+ if field == 'title':
684
+ data['title'] = data.get(
685
+ 'topic', 'Learning Path') + ' Learning Path'
686
+ elif field == 'description':
687
+ data[
688
+ 'description'] = f"A comprehensive learning path for {data.get('topic', 'the requested topic')}."
689
+ elif field == 'topic':
690
+ data['topic'] = data.get(
691
+ 'title', 'General Learning').replace(' Learning Path', '')
692
+ elif field == 'expertise_level':
693
+ data['expertise_level'] = 'beginner'
694
+ elif field == 'learning_style':
695
+ data['learning_style'] = 'visual'
696
+ elif field == 'time_commitment':
697
+ data['time_commitment'] = 'moderate'
698
+ elif field == 'duration_weeks':
699
+ data['duration_weeks'] = 8
700
+ elif field == 'goals':
701
+ data['goals'] = [
702
+ f"Master {data.get('topic', 'the subject')}"]
703
+ elif field == 'milestones':
704
+ data['milestones'] = [{
705
+ 'title': 'Getting Started',
706
+ 'description': f"Introduction to {data.get('topic', 'the subject')}",
707
+ 'estimated_hours': 10,
708
+ 'resources': [{'name': 'Online Documentation', 'url': '', 'type': 'documentation'}],
709
+ 'skills_gained': [f"Basic {data.get('topic', 'subject')} knowledge"]
710
+ }]
711
+ elif field == 'prerequisites':
712
+ data['prerequisites'] = ['None']
713
+ elif field == 'total_hours':
714
+ data['total_hours'] = 40
715
+
716
+ # Also check that each milestone has the required fields
717
+ if 'milestones' in data and isinstance(data['milestones'], list):
718
+ milestone_required_fields = [
719
+ 'title', 'description', 'estimated_hours', 'resources', 'skills_gained']
720
+ for i, milestone in enumerate(data['milestones']):
721
+ milestone_missing_fields = [
722
+ field for field in milestone_required_fields if field not in milestone]
723
+
724
+ if milestone_missing_fields:
725
+ print(
726
+ f"DEBUG: Missing required fields in milestone {i+1}: {milestone_missing_fields}")
727
+
728
+ # Add missing fields with default values
729
+ for field in milestone_missing_fields:
730
+ if field == 'title':
731
+ milestone['title'] = f"Milestone {i+1}"
732
+ elif field == 'description':
733
+ milestone['description'] = f"A key learning milestone in this path."
734
+ elif field == 'estimated_hours':
735
+ milestone['estimated_hours'] = 10
736
+ elif field == 'resources':
737
+ milestone['resources'] = [
738
+ {'name': 'Online Resource', 'url': '', 'type': 'article'}]
739
+ elif field == 'skills_gained':
740
+ milestone['skills_gained'] = [
741
+ f"Skills related to {data.get('topic', 'the subject')}"]
742
+
743
+ # Cache the successful response for future use (save money!)
744
+ json_result = json.dumps(data)
745
+ if use_cache:
746
+ # Cache for 24 hours
747
+ cache.set(cache_key, json_result, ttl=86400)
748
+
749
+ return json_result
750
+ except Exception as e:
751
+ print(f"DEBUG: Error parsing initial JSON: {str(e)}")
752
+
753
+ # First cleanup attempt - remove markdown code block wrappers
754
+ cleaned_response = response_text.strip()
755
+
756
+ # Remove ```json...``` or ```...``` markdown wrappers
757
+ import re
758
+ markdown_match = re.search(
759
+ r'```(?:json)?\s*(.*?)\s*```', response_text, re.DOTALL)
760
+ if markdown_match:
761
+ cleaned_response = markdown_match.group(1).strip()
762
+ print(f"DEBUG: Extracted content from markdown code block")
763
+
764
+ # Remove common text prefixes
765
+ for prefix in ["+", "-", "*", "#", "Response:", "JSON:", "Here's", "```", "```json"]:
766
+ if cleaned_response.startswith(prefix):
767
+ cleaned_response = cleaned_response[len(prefix):].strip()
768
+
769
+ try:
770
+ # Try to parse the cleaned response
771
+ data = json.loads(cleaned_response)
772
+ print(f"DEBUG: Successfully parsed cleaned JSON")
773
+ return json.dumps(data)
774
+ except Exception as e2:
775
+ print(f"DEBUG: Error parsing cleaned JSON: {str(e2)}")
776
+
777
+ # Second attempt - find the main JSON object (start with first { and match closing })
778
+ try:
779
+ first_brace = cleaned_response.find('{')
780
+ if first_brace != -1:
781
+ # Count braces to find the matching closing brace
782
+ brace_count = 0
783
+ end_pos = first_brace
784
+ for i in range(first_brace, len(cleaned_response)):
785
+ if cleaned_response[i] == '{':
786
+ brace_count += 1
787
+ elif cleaned_response[i] == '}':
788
+ brace_count -= 1
789
+ if brace_count == 0:
790
+ end_pos = i + 1
791
+ break
792
+
793
+ potential_json = cleaned_response[first_brace:end_pos]
794
+ print(
795
+ f"DEBUG: Extracted JSON from position {first_brace} to {end_pos} ({len(potential_json)} chars)")
796
+ data = json.loads(potential_json)
797
+ print(f"DEBUG: Successfully parsed extracted JSON")
798
+ return json.dumps(data)
799
+ except Exception as e3:
800
+ print(f"DEBUG: Error in brace matching: {str(e3)}")
801
+
802
+ # Return a fallback JSON as last resort instead of raising an exception
803
+ print("DEBUG: Returning fallback JSON structure due to parsing failure")
804
+ return json.dumps({
805
+ "summary": "Failed to parse the AI's response. The content might not be in the expected JSON format.",
806
+ "key_concepts": ["JSON parsing error"],
807
+ "learning_path": ["Please try a different query or check the AI provider's output directly if possible."],
808
+ "resources": [],
809
+ "code_examples": [],
810
+ "advanced_topics": [],
811
+ "error_details": "The AI's response could not be successfully parsed as JSON after multiple attempts."
812
+ })
813
+ return json.dumps({
814
+ "summary": f"I processed your request but encountered a formatting issue. Your question was about: {response_text[:100]}...",
815
+ "key_concepts": ["Unable to extract structured information"],
816
+ "learning_path": ["Please try asking in a different way"],
817
+ "resources": [],
818
+ "code_examples": [],
819
+ "advanced_topics": [],
820
+ "career_applications": []
821
+ })
822
+
823
def _deepseek_completion(self, prompt: str, temperature: float, system_message: str = None):
    """Call the DeepSeek chat-completions API and return the raw response text.

    A system message is always included to remind the model to comply with the
    schema and strictly return JSON. Without this guard-rail the DeepSeek model
    occasionally omits required fields, which later causes Pydantic validation
    failures. If the first reply does not parse as JSON, the request is retried
    once with an extra "JSON only" instruction appended.

    Args:
        prompt: The fully rendered user prompt (already schema-annotated).
        temperature: Sampling temperature; ``None`` falls back to 0.2.
        system_message: Optional override for the default JSON guard-rail
            system prompt.

    Returns:
        The assistant message content as a string.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        Exception: Any other failure is logged with a traceback and re-raised.
    """
    import requests
    import traceback
    import json
    import time

    api_key = DEEPSEEK_API_KEY
    url = "https://api.deepseek.com/v1/chat/completions"

    system_msg = (
        system_message
        or "You are an expert AI assistant that MUST output ONLY valid JSON strictly "
        "following the user's schema instructions. Do not add any commentary, markdown "
        "code fences or explanations."
    )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload_base = {
        "model": self.model_name if hasattr(self, "model_name") else "deepseek-chat",
        # BUGFIX: the previous `temperature or 0.2` silently replaced an
        # explicit 0.0 with 0.2 because 0.0 is falsy; only substitute the
        # default when the value is actually missing.
        "temperature": 0.2 if temperature is None else temperature,
        "max_tokens": MAX_TOKENS,
    }

    def _post(messages):
        # One API round-trip; logs payload size and latency for debugging.
        start = time.time()
        pl = {**payload_base, "messages": messages}
        print(
            f"DEBUG: DeepSeek request with {len(json.dumps(pl))} chars payload, "
            f"messages={len(messages)}"
        )
        resp = requests.post(url, headers=headers, json=pl, timeout=150)
        resp.raise_for_status()
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        print(
            f"DEBUG: DeepSeek response in {time.time()-start:.2f}s with "
            f"{len(content)} chars"
        )
        return content

    try:
        # 1st attempt – full prompt
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": prompt},
        ]
        response_text = _post(messages)

        # Quick JSON sanity check; if it fails we retry with a reduced prompt.
        # BUGFIX: strip a leading "json" language tag as well as the backtick
        # fence — a correctly ```json-fenced reply previously failed this
        # check and triggered a wasted second API call.
        try:
            candidate = response_text.strip().strip("`").strip()
            if candidate.lower().startswith("json"):
                candidate = candidate[4:]
            json.loads(candidate.strip())
            return response_text
        except Exception:
            print(
                "DEBUG: DeepSeek response not valid JSON, retrying with simplified instructions...")

        # 2nd attempt – simplified prompt focusing on schema only
        simple_prompt = (
            "Provide ONLY the JSON that matches the schema. Do not wrap it in anything."
        )
        messages_retry = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": prompt + "\n\n" + simple_prompt},
        ]
        return _post(messages_retry)
    except Exception as e:
        print(f"DEBUG: DeepSeek API call failed: {str(e)}")
        print(traceback.format_exc())
        raise
904
+
905
+ def _create_fallback_learning_path(self):
906
+ """
907
+ Create a fallback learning path with default values when generation fails.
908
+ """
909
+ import datetime
910
+ import uuid
911
+ fallback_path = {
912
+ "id": str(uuid.uuid4()),
913
+ "title": "General Learning Path",
914
+ "description": "A default learning path created when specific generation failed.",
915
+ "topic": "General Topic",
916
+ "expertise_level": "beginner",
917
+ "learning_style": "visual",
918
+ "time_commitment": "moderate",
919
+ "duration_weeks": 8,
920
+ "goals": ["Build foundational knowledge", "Develop practical skills"],
921
+ "milestones": [
922
+ {
923
+ "title": "Getting Started",
924
+ "description": "Introduction to the fundamentals.",
925
+ "estimated_hours": 10,
926
+ "resources": [
927
+ {"name": "Online Documentation",
928
+ "url": "", "type": "documentation"}
929
+ ],
930
+ "skills_gained": ["Basic knowledge"]
931
+ },
932
+ {
933
+ "title": "Core Concepts",
934
+ "description": "Understanding core principles and practices.",
935
+ "estimated_hours": 15,
936
+ "resources": [
937
+ {"name": "Online Tutorial", "url": "", "type": "tutorial"}
938
+ ],
939
+ "skills_gained": ["Fundamental concepts"]
940
+ }
941
+ ],
942
+ "prerequisites": ["None"],
943
+ "total_hours": 25,
944
+ "created_at": datetime.datetime.now().isoformat()
945
+ }
946
+ return json.dumps(fallback_path)
947
+
948
def analyze_difficulty(self, content: str) -> float:
    """
    Analyze the difficulty level of educational content.

    Asks the model for a numeric rating, extracts the first number found in
    the reply, and clamps it into [0, 1]. Defaults to 0.5 (medium) when no
    score can be parsed.

    Args:
        content: The content to analyze (only the first 1000 chars are sent)

    Returns:
        Difficulty score between 0 (easiest) and 1 (hardest)
    """
    prompt = f"""
    Analyze the following educational content and rate its difficulty level on a scale from 0 to 1,
    where 0 is very basic (elementary level) and 1 is extremely advanced (expert/PhD level).

    Content:
    {content[:1000]}...

    Consider factors like:
    - Technical vocabulary and jargon
    - Complexity of concepts
    - Prerequisites required to understand
    - Density of information

    Return only a numeric score between 0 and 1 with up to 2 decimal places.
    """

    reply = self.generate_response(prompt, temperature=0.1)

    import re
    try:
        # Prefer a decimal like "0.75" (possibly embedded, e.g. "Difficulty: 0.75").
        decimal_hit = re.search(r"([0-9]\.[0-9]{1,2})", reply)
        if decimal_hit:
            return max(0.0, min(1.0, float(decimal_hit.group(1))))

        # Otherwise accept a reply that is a single bare digit.
        whole_hit = re.search(r"^([0-9])$", reply)
        if whole_hit:
            return max(0.0, min(1.0, float(whole_hit.group(1))))
    except Exception:
        pass
    return 0.5  # Default to middle difficulty
994
+
995
def generate_resource_recommendations(
        self,
        topic: str,
        learning_style: str,
        expertise_level: str,
        count: int = 5) -> List[Dict[str, Any]]:
    """
    Generate tailored resource recommendations for a topic.

    Args:
        topic: The topic to find resources for
        learning_style: Preferred learning style
        expertise_level: User's expertise level
        count: Number of resources to recommend

    Returns:
        List of resource dictionaries (empty list when the model reply
        cannot be parsed as JSON)
    """
    prompt = f"""
    Generate {count} learning resources for someone studying {topic}.

    Their learning style is {learning_style} and their expertise level is {expertise_level}.

    IMPORTANT: All resources MUST be in English only. Do not include resources in Portuguese, Spanish, or any other language.

    For each resource, include:
    1. Title (in English)
    2. Type (video, article, book, interactive, course, documentation, podcast, project)
    3. Description (1-2 sentences in English)
    4. Difficulty level (beginner, intermediate, advanced, expert)
    5. Estimated time to complete (in minutes or hours)
    6. URL (create a realistic but fictional URL if needed)

    Provide the response as a JSON array of resource objects. All text fields must be in English.
    """

    # Expected shape: a JSON array of flat resource objects.
    resource_schema = """
    [
        {
            "title": "string",
            "type": "string",
            "description": "string",
            "difficulty": "string",
            "time_estimate": "string",
            "url": "string"
        }
    ]
    """

    raw_reply = self.generate_structured_response(
        prompt=prompt,
        output_schema=resource_schema,
        temperature=0.7
    )

    # The structured-response helper should already return JSON, but guard
    # against malformed output and degrade to an empty recommendation list.
    try:
        return json.loads(raw_reply)
    except Exception:
        return []
1055
+
1056
def generate_path(self, topic: str, expertise_level: str, learning_style: str, context: Optional[List[str]] = None) -> str:
    """
    Generate a learning path based on user preferences and context using RAG.

    Args:
        topic: The learning topic
        expertise_level: User's expertise level
        learning_style: User's preferred learning style
        context: Optional extra context snippets, combined with the agent's
            stored context

    Returns:
        Generated learning path text
    """
    # FIX: annotation was `List[str] = None`, which mis-declares the default;
    # it is now `Optional[List[str]]`, consistent with `generate_answer`.
    # Combine provided context with stored context.
    full_context = self.context + (context or [])

    # Plan first when planning is enabled (hasattr guard keeps partial
    # subclasses without the planner working).
    if self.planning_enabled and hasattr(self, '_plan_path_generation'):
        self._plan_path_generation(
            topic, expertise_level, learning_style, full_context)

    # Generate path with context
    prompt = f"""Generate a learning path for the following topic:

    Topic: {topic}
    Expertise Level: {expertise_level}
    Learning Style: {learning_style}

    Context:
    {' '.join(full_context)}

    Previous answers:
    {' '.join(self.memory)}

    Generate a structured learning path with milestones and resources.
    """

    path = self._generate_text(prompt)

    # Record the generation event so later calls can reference it.
    self.memory.append(
        f"Generated path for {topic} with {expertise_level} level and {learning_style} style")

    return path
1100
+
1101
def generate_answer(self, question: str, context: Optional[List[str]] = None, temperature: Optional[float] = None) -> str:
    """
    Generate an answer to a question using RAG and agentic behavior.

    Args:
        question: The question to answer
        context: Optional extra context snippets, combined with the agent's
            stored context
        temperature: Optional temperature for response generation

    Returns:
        Generated answer
    """
    # Merge the caller-supplied context with the agent's accumulated context.
    combined_context = self.context + (context or [])

    # Run the planning step first when enabled (hasattr guard keeps partial
    # subclasses without the planner working).
    if self.planning_enabled and hasattr(self, '_plan_answer_generation'):
        self._plan_answer_generation(question, combined_context)

    # Build the RAG prompt from the merged context.
    context_text = ' '.join(combined_context)
    prompt = f"""Answer the following question based on the provided context:

    Context:
    {context_text}

    Question: {question}"""

    # Remember the question for future turns.
    self.memory.append(f"Question: {question}")

    # Delegate the actual completion to the shared response generator.
    return self.generate_response(prompt, relevant_documents=combined_context, temperature=temperature)
1133
+
1134
+ def _plan_answer_generation(self, question: str, context: List[str]) -> None:
1135
+ """
1136
+ Plan the answer generation process.
1137
+
1138
+ Args:
1139
+ question: The question to answer
1140
+ context: Context information
1141
+ """
1142
+ # Analyze the question to determine the best approach
1143
+ question_lower = question.lower()
1144
+
1145
+ # Determine if we need more context
1146
+ if len(context) < 2 and not any(keyword in question_lower for keyword in ["what", "how", "why", "when", "where", "who"]):
1147
+ self.context.append("Need more context for this question")
1148
+
1149
+ # Determine the type of question
1150
+ if "how" in question_lower:
1151
+ self.context.append("This is a procedural question")
1152
+ elif "why" in question_lower:
1153
+ self.context.append("This is an explanatory question")
1154
+ elif "what" in question_lower:
1155
+ self.context.append("This is a definitional question")
1156
+ elif "compare" in question_lower or "difference" in question_lower:
1157
+ self.context.append("This is a comparative question")
1158
+
1159
+ def _plan_path_generation(self, topic: str, expertise_level: str, learning_style: str, context: List[str]) -> None:
1160
+ """
1161
+ Plan the learning path generation process.
1162
+
1163
+ Args:
1164
+ topic: The learning topic
1165
+ expertise_level: User's expertise level
1166
+ learning_style: User's preferred learning style
1167
+ context: Context information
1168
+ """
1169
+ # Determine the appropriate depth and breadth based on expertise level
1170
+ if expertise_level == "beginner":
1171
+ self.context.append("Focus on fundamentals and basic concepts")
1172
+ elif expertise_level == "intermediate":
1173
+ self.context.append(
1174
+ "Include practical applications and case studies")
1175
+ elif expertise_level == "advanced":
1176
+ self.context.append(
1177
+ "Include advanced techniques and research papers")
1178
+
1179
+ # Adjust for learning style
1180
+ if learning_style == "visual":
1181
+ self.context.append("Prioritize video resources and diagrams")
1182
+ elif learning_style == "auditory":
1183
+ self.context.append("Prioritize podcasts and audio lectures")
1184
+ elif learning_style == "reading":
1185
+ self.context.append("Prioritize books and articles")
1186
+ elif learning_style == "kinesthetic":
1187
+ self.context.append("Prioritize hands-on projects and exercises")