Mansi Dwivedi commited on
Commit
ec5a0c4
·
1 Parent(s): 737a6ff

Add application file

Browse files
Files changed (5) hide show
  1. .gitignore +263 -0
  2. Dockerfile +15 -0
  3. app.py +419 -0
  4. requirements.txt +79 -0
  5. seed_logs.py +411 -0
.gitignore ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,flask
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,flask
3
+
4
+ ### Flask ###
5
+ instance/*
6
+ !instance/.gitignore
7
+ .webassets-cache
8
+ .env
9
+
10
+ ### Flask.Python Stack ###
11
+ # Byte-compiled / optimized / DLL files
12
+ __pycache__/
13
+ *.py[cod]
14
+ *$py.class
15
+
16
+ # C extensions
17
+ *.so
18
+
19
+ # Distribution / packaging
20
+ .Python
21
+ build/
22
+ develop-eggs/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+ cover/
63
+
64
+ # Translations
65
+ *.mo
66
+ *.pot
67
+
68
+ # Django stuff:
69
+ *.log
70
+ local_settings.py
71
+ db.sqlite3
72
+ db.sqlite3-journal
73
+
74
+ # Flask stuff:
75
+ instance/
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ .pybuilder/
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ # For a library or package, you might want to ignore these files since the code is
96
+ # intended to run in multiple environments; otherwise, check them in:
97
+ # .python-version
98
+
99
+ # pipenv
100
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
102
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
103
+ # install all needed dependencies.
104
+ #Pipfile.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ #pdm.lock
116
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117
+ # in version control.
118
+ # https://pdm.fming.dev/#use-with-ide
119
+ .pdm.toml
120
+
121
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122
+ __pypackages__/
123
+
124
+ # Celery stuff
125
+ celerybeat-schedule
126
+ celerybeat.pid
127
+
128
+ # SageMath parsed files
129
+ *.sage.py
130
+
131
+ # Environments
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ ### Python ###
171
+ # Byte-compiled / optimized / DLL files
172
+
173
+ # C extensions
174
+
175
+ # Distribution / packaging
176
+
177
+ # PyInstaller
178
+ # Usually these files are written by a python script from a template
179
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
180
+
181
+ # Installer logs
182
+
183
+ # Unit test / coverage reports
184
+
185
+ # Translations
186
+
187
+ # Django stuff:
188
+
189
+ # Flask stuff:
190
+
191
+ # Scrapy stuff:
192
+
193
+ # Sphinx documentation
194
+
195
+ # PyBuilder
196
+
197
+ # Jupyter Notebook
198
+
199
+ # IPython
200
+
201
+ # pyenv
202
+ # For a library or package, you might want to ignore these files since the code is
203
+ # intended to run in multiple environments; otherwise, check them in:
204
+ # .python-version
205
+
206
+ # pipenv
207
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
208
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
209
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
210
+ # install all needed dependencies.
211
+
212
+ # poetry
213
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
214
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
215
+ # commonly ignored for libraries.
216
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
217
+
218
+ # pdm
219
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
220
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
221
+ # in version control.
222
+ # https://pdm.fming.dev/#use-with-ide
223
+
224
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
225
+
226
+ # Celery stuff
227
+
228
+ # SageMath parsed files
229
+
230
+ # Environments
231
+
232
+ # Spyder project settings
233
+
234
+ # Rope project settings
235
+
236
+ # mkdocs documentation
237
+
238
+ # mypy
239
+
240
+ # Pyre type checker
241
+
242
+ # pytype static type analyzer
243
+
244
+ # Cython debug symbols
245
+
246
+ # PyCharm
247
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
248
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
249
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
250
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
251
+
252
+ ### Python Patch ###
253
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
254
+ poetry.toml
255
+
256
+ # ruff
257
+ .ruff_cache/
258
+
259
+ # LSP config files
260
+ pyrightconfig.json
261
+
262
+ # End of https://www.toptal.com/developers/gitignore/api/python,flask
263
+ .vscode
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install dependencies
6
+ COPY requirements.txt .
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ # Copy source
10
+ COPY . .
11
+
12
+ # Hugging Face Spaces runs on port 7860
13
+ EXPOSE 7860
14
+
15
+ CMD ["gunicorn", "app:app", "--bind", "0.0.0.0:7860", "--workers", "2", "--timeout", "120"]
app.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import uuid
4
+ import httpx
5
+ import asyncio
6
+ import requests
7
+ from datetime import datetime, timezone
8
+ from flask import Flask, request, jsonify
9
+ from flask_jwt_extended import (
10
+ JWTManager,
11
+ create_access_token,
12
+ jwt_required,
13
+ get_jwt_identity,
14
+ )
15
+ import secrets
16
+ from flask_cors import CORS
17
+ from dotenv import load_dotenv
18
+
19
+ # Import the Microsoft Agent Framework A2A classes
20
+ from a2a.client import A2ACardResolver
21
+ from agent_framework.a2a import A2AAgent
22
+
23
+ import subprocess, sys, os, logging
24
+ from apscheduler.schedulers.background import BackgroundScheduler
25
+ from apscheduler.triggers.cron import CronTrigger
26
+ import atexit
27
+
28
+ load_dotenv()
29
+
30
+ # ── SCHEDULER SETUP (add this block after your Flask app is created) ──────────
31
+
32
+ # Resolve the absolute path to seed_logs.py so the job works regardless of
33
+ # the working directory Flask was launched from.
34
+ BACKEND_DIR = os.path.dirname(os.path.abspath(__file__)) # .../backend/
35
+ SEED_SCRIPT = os.path.join(BACKEND_DIR, "seed_logs.py")
36
+
37
+
38
+ def run_seed_logs():
39
+ """
40
+ Spawns seed_logs.py as a subprocess using the same Python interpreter
41
+ that is running Flask. stdout/stderr are forwarded to the Flask logger
42
+ so you can see seeder output in your server logs.
43
+ """
44
+ logging.info("[Scheduler] Starting daily seed_logs.py run...")
45
+
46
+ try:
47
+ result = subprocess.run(
48
+ [sys.executable, SEED_SCRIPT], # same venv Python → same packages
49
+ capture_output=True,
50
+ text=True,
51
+ cwd=BACKEND_DIR, # run from backend/ so .env is found
52
+ )
53
+
54
+ if result.returncode == 0:
55
+ logging.info(
56
+ "[Scheduler] seed_logs.py completed successfully.\n%s", result.stdout
57
+ )
58
+ else:
59
+ logging.error(
60
+ "[Scheduler] seed_logs.py failed (exit %d).\nSTDOUT: %s\nSTDERR: %s",
61
+ result.returncode,
62
+ result.stdout,
63
+ result.stderr,
64
+ )
65
+
66
+ except Exception as exc:
67
+ logging.exception("[Scheduler] Unexpected error running seed_logs.py: %s", exc)
68
+
69
+
70
+ app = Flask(__name__)
71
+ app.config["JWT_SECRET_KEY"] = os.getenv("JWT_SECRET_KEY") or secrets.token_hex(32)
72
+ jwt = JWTManager(app)
73
+ CORS(app)
74
+
75
+ # Create the scheduler (BackgroundScheduler runs in a daemon thread —
76
+ # it doesn't block Flask and stops automatically when the process exits).
77
+ scheduler = BackgroundScheduler(timezone="UTC")
78
+
79
+ scheduler.add_job(
80
+ func=run_seed_logs,
81
+ trigger=CronTrigger(
82
+ hour=0, # midnight UTC
83
+ minute=0,
84
+ second=0,
85
+ timezone="UTC",
86
+ ),
87
+ id="daily_seed_logs",
88
+ name="Daily seed_logs.py at 00:00 UTC",
89
+ replace_existing=True, # safe to call even if job already registered
90
+ misfire_grace_time=60 * 5, # if the server was down at midnight, run
91
+ # the job within the next 5 minutes instead
92
+ # of skipping it entirely
93
+ )
94
+
95
+ scheduler.start()
96
+ logging.info(
97
+ "[Scheduler] APScheduler started. Next seed run: %s",
98
+ scheduler.get_job("daily_seed_logs").next_run_time,
99
+ )
100
+
101
+ # Cleanly shut the scheduler down when Flask/Gunicorn exits
102
+ atexit.register(lambda: scheduler.shutdown(wait=False))
103
+
104
+ # ---------------------------------------------------------
105
+ # Configuration
106
+ # ---------------------------------------------------------
107
+ # NOTE: Make sure this is your Kibana URL (e.g. https://xxxx.kb.asia-south1.gcp.elastic-cloud.com)
108
+ KIBANA_URL = os.getenv("KIBANA_URL", "http://localhost:5601").rstrip("/")
109
+ ES_API_KEY = os.getenv("ES_API_KEY")
110
+ AGENT_ID = os.getenv("AGENT_ID")
111
+
112
+ # ---------------------------------------------------------
113
+ # Mock Database (In-Memory for MVP)
114
+ # ---------------------------------------------------------
115
+ db = {"cases": {}, "actions": {}, "audit_logs": []}
116
+
117
+
118
+ def now_iso():
119
+ return datetime.now(timezone.utc).isoformat()
120
+
121
+
122
+ def add_audit_log(case_id, actor, action_type, details):
123
+ log = {
124
+ "id": uuid.uuid4().hex[:8],
125
+ "case_id": case_id,
126
+ "actor": actor,
127
+ "action_type": action_type,
128
+ "details": details,
129
+ "timestamp": now_iso(),
130
+ }
131
+ db["audit_logs"].append(log)
132
+ print(f"[AUDIT] {action_type}: {details}")
133
+
134
+
135
+ @app.route("/api/admin/seed", methods=["POST"])
136
+ def trigger_seed():
137
+ """Manually kick off seed_logs.py — protected by a simple header check."""
138
+ scheduler.add_job(
139
+ func=run_seed_logs,
140
+ id="manual_seed",
141
+ replace_existing=True,
142
+ )
143
+ return jsonify({"status": "seeder job queued"}), 202
144
+
145
+
146
+ # AUTH ENDPOINTS
147
+ @app.route("/api/auth/login", methods=["POST"])
148
+ def login():
149
+ data = request.json or {}
150
+ username = data.get("username", "")
151
+ password = data.get("password", "")
152
+
153
+ if username != os.getenv("UI_AUTH_USERNAME") or password != os.getenv(
154
+ "UI_AUTH_PASSWORD"
155
+ ):
156
+ return jsonify({"msg": "Bad username or password"}), 401
157
+
158
+ token = create_access_token(identity=username)
159
+ return jsonify({"access_token": token})
160
+
161
+
162
+ @app.route("/api/auth/me", methods=["GET"])
163
+ @jwt_required()
164
+ def me():
165
+ return jsonify({"user": get_jwt_identity()})
166
+
167
+
168
+ @app.route("/api/auth/logout", methods=["POST"])
169
+ @jwt_required()
170
+ def logout():
171
+ # MVP: client deletes token; real logout would use token revocation/denylist
172
+ return jsonify({"ok": True})
173
+
174
+
175
+ # ---------------------------------------------------------
176
+ # 1. Alert Webhook (Simulates an alert triggering a Case)
177
+ # ---------------------------------------------------------
178
+ @app.route("/api/alerts", methods=["POST"])
179
+ @jwt_required()
180
+ def receive_alert():
181
+ data = request.json
182
+ case_id = f"CASE-{uuid.uuid4().hex[:6].upper()}"
183
+
184
+ db["cases"][case_id] = {
185
+ "id": case_id,
186
+ "status": "OPEN",
187
+ "severity": data.get("severity", "P1"),
188
+ "title": data.get("title", "High 500 Error Rate detected"),
189
+ "service": data.get("service", "checkout-api"),
190
+ "start_time": data.get("start_time"),
191
+ "end_time": data.get("end_time"),
192
+ "created_at": now_iso(),
193
+ "analysis": None,
194
+ }
195
+
196
+ add_audit_log(
197
+ case_id, "System", "CASE_CREATED", f"Alert triggered for {data.get('service')}"
198
+ )
199
+ return jsonify({"message": "Case created", "case_id": case_id}), 201
200
+
201
+
202
+ # ---------------------------------------------------------
203
+ # 2. Analyze (Calls Elastic Agent via Microsoft Agent Framework)
204
+ # ---------------------------------------------------------
205
+ @app.route("/api/cases/<case_id>/analyze", methods=["POST"])
206
+ @jwt_required()
207
+ async def analyze_case(case_id):
208
+ case = db["cases"].get(case_id)
209
+ if not case:
210
+ return jsonify({"error": "Case not found"}), 404
211
+
212
+ prompt = (
213
+ f"An alert fired for {case['service']} between {case['start_time']} and {case['end_time']}. "
214
+ "Please investigate this using your tools. Follow your system instructions to find the root cause "
215
+ "and return ONLY the raw JSON format with causal_chain, hypotheses, and mitigations. "
216
+ "Do not include markdown blocks like ```json."
217
+ )
218
+
219
+ add_audit_log(
220
+ case_id, "System", "ANALYSIS_STARTED", "Sent context to Elastic AI Agent"
221
+ )
222
+
223
+ # The Kibana A2A base URL
224
+ a2a_agent_host = f"{KIBANA_URL}/api/agent_builder/a2a"
225
+ custom_headers = {"Authorization": f"ApiKey {ES_API_KEY}", "kbn-xsrf": "true"}
226
+
227
+ try:
228
+ # Use httpx AsyncClient as required by the Agent Framework
229
+ async with httpx.AsyncClient(
230
+ timeout=120.0, headers=custom_headers
231
+ ) as http_client:
232
+
233
+ # 1. Resolve the A2A Agent Card using the Agent ID
234
+ resolver = A2ACardResolver(
235
+ httpx_client=http_client, base_url=a2a_agent_host
236
+ )
237
+
238
+ # The Agent Card path uses your Agent ID
239
+ agent_card = await resolver.get_agent_card(
240
+ relative_card_path=f"/{AGENT_ID}.json"
241
+ )
242
+ print(f"Found Agent: {agent_card.name} - {agent_card.description}")
243
+
244
+ # 2. Use the Agent Framework to connect to the Elastic Agent
245
+ agent = A2AAgent(
246
+ name=agent_card.name,
247
+ description=agent_card.description,
248
+ agent_card=agent_card,
249
+ url=a2a_agent_host,
250
+ http_client=http_client,
251
+ )
252
+
253
+ print("Sending prompt to Elastic A2A agent...")
254
+
255
+ # 3. Execute the Run command (this handles the JSON-RPC complexity automatically)
256
+ response = await agent.run(prompt)
257
+
258
+ # Extract the text from the response
259
+ agent_reply = ""
260
+ for message in response.messages:
261
+ agent_reply += message.text
262
+
263
+ # Clean up Markdown formatting if the LLM accidentally added it
264
+ if agent_reply.startswith("```json"):
265
+ agent_reply = agent_reply.strip("```json").strip("```").strip()
266
+
267
+ analysis_data = json.loads(agent_reply)
268
+ case["analysis"] = analysis_data
269
+ case["status"] = "INVESTIGATED"
270
+
271
+ # Auto-create PENDING actions from the agent's mitigations
272
+ for mitigation in analysis_data.get("mitigations", []):
273
+ action_id = f"ACT-{uuid.uuid4().hex[:6].upper()}"
274
+ db["actions"][action_id] = {
275
+ "id": action_id,
276
+ "case_id": case_id,
277
+ "type": mitigation.get("type", "UNKNOWN"),
278
+ "action": mitigation.get("action", ""),
279
+ "status": "PENDING",
280
+ "created_at": now_iso(),
281
+ }
282
+
283
+ add_audit_log(
284
+ case_id,
285
+ "System",
286
+ "ANALYSIS_COMPLETE",
287
+ "Agent identified root cause and proposed mitigations",
288
+ )
289
+ return jsonify(
290
+ {
291
+ "message": "Analysis complete",
292
+ "analysis": analysis_data,
293
+ "actions": db["actions"],
294
+ }
295
+ )
296
+
297
+ except Exception as e:
298
+ print("Error calling agent via Agent Framework:", e)
299
+ return jsonify({"error": str(e)}), 500
300
+
301
+
302
+ # ---------------------------------------------------------
303
+ # 3. Human-in-the-Loop (HITL) Approval
304
+ # ---------------------------------------------------------
305
+ @app.route("/api/actions/<action_id>/approve", methods=["POST"])
306
+ @jwt_required()
307
+ def approve_action(action_id):
308
+ action = db["actions"].get(action_id)
309
+ if not action:
310
+ return jsonify({"error": "Action not found"}), 404
311
+
312
+ data = request.json or {}
313
+ user = data.get("user", "OnCall-Engineer-1")
314
+
315
+ action["status"] = "APPROVED"
316
+ action["approved_by"] = user
317
+ action["approved_at"] = now_iso()
318
+
319
+ add_audit_log(
320
+ action["case_id"],
321
+ user,
322
+ "ACTION_APPROVED",
323
+ f"Approved execution of {action['type']}",
324
+ )
325
+ return jsonify({"message": "Action approved", "action": action})
326
+
327
+
328
+ # ---------------------------------------------------------
329
+ # 4. Execute Action (Calls the Mock Executor)
330
+ # ---------------------------------------------------------
331
+ @app.route("/api/actions/<action_id>/execute", methods=["POST"])
332
+ @jwt_required()
333
+ def execute_action(action_id):
334
+ action = db["actions"].get(action_id)
335
+ if not action:
336
+ return jsonify({"error": "Action not found"}), 404
337
+
338
+ if action["status"] != "APPROVED":
339
+ return (
340
+ jsonify({"error": f"Cannot execute action in status: {action['status']}"}),
341
+ 400,
342
+ )
343
+
344
+ action["status"] = "RUNNING"
345
+ add_audit_log(
346
+ action["case_id"],
347
+ "System",
348
+ "EXECUTION_STARTED",
349
+ f"Running runbook for {action['type']}",
350
+ )
351
+
352
+ # ---> Call the Mock HTTP Executor
353
+ mock_executor_url = request.host_url.rstrip("/") + "/api/mock_executor"
354
+ resp = requests.post(
355
+ mock_executor_url, json={"action": action["action"], "type": action["type"]}
356
+ )
357
+
358
+ if resp.status_code == 200:
359
+ action["status"] = "SUCCESS"
360
+ add_audit_log(
361
+ action["case_id"],
362
+ "MockExecutor",
363
+ "EXECUTION_SUCCESS",
364
+ "Runbook completed successfully",
365
+ )
366
+ else:
367
+ action["status"] = "FAILED"
368
+ add_audit_log(
369
+ action["case_id"], "MockExecutor", "EXECUTION_FAILED", "Runbook failed"
370
+ )
371
+
372
+ return jsonify({"message": "Execution finished", "action": action})
373
+
374
+
375
+ # ---------------------------------------------------------
376
+ # 5. Mock HTTP Executor
377
+ # ---------------------------------------------------------
378
+ @app.route("/api/mock_executor", methods=["POST"])
379
+ @jwt_required()
380
+ def mock_executor():
381
+ data = request.json
382
+ import time
383
+
384
+ time.sleep(2)
385
+ print(
386
+ f"[MOCK EXECUTOR] Successfully applied {data.get('type')}: {data.get('action')}"
387
+ )
388
+ return jsonify({"status": "OK", "run_id": f"RUN-{uuid.uuid4().hex[:6]}"}), 200
389
+
390
+
391
+ # ---------------------------------------------------------
392
+ # 6. Read Endpoints
393
+ # ---------------------------------------------------------
394
+ @app.route("/api/cases", methods=["GET"])
395
+ @jwt_required()
396
+ def get_cases():
397
+ return jsonify(list(db["cases"].values()))
398
+
399
+
400
+ @app.route("/api/cases/<case_id>", methods=["GET"])
401
+ @jwt_required()
402
+ def get_case(case_id):
403
+ case = db["cases"].get(case_id)
404
+ if not case:
405
+ return jsonify({"error": "Not found"}), 404
406
+
407
+ case_actions = [a for a in db["actions"].values() if a["case_id"] == case_id]
408
+ case_audits = [l for l in db["audit_logs"] if l["case_id"] == case_id]
409
+
410
+ return jsonify({"case": case, "actions": case_actions, "audit_logs": case_audits})
411
+
412
+
413
+ @app.route("/api/health")
414
+ def health():
415
+ return jsonify({"status": "ok"}), 200
416
+
417
+
418
+ if __name__ == "__main__":
419
+ app.run(port=5000, debug=True)
requirements.txt ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a2a-sdk==0.3.24
2
+ agent-framework-a2a==1.0.0b260225
3
+ agent-framework-core==1.0.0rc2
4
+ annotated-types==0.7.0
5
+ anyio==4.12.1
6
+ APScheduler==3.10.4
7
+ attrs==25.4.0
8
+ azure-ai-projects==2.0.0b3
9
+ azure-core==1.38.2
10
+ azure-identity==1.25.2
11
+ azure-storage-blob==12.28.0
12
+ blinker==1.9.0
13
+ certifi==2026.2.25
14
+ cffi==2.0.0
15
+ charset-normalizer==3.4.4
16
+ click==8.3.1
17
+ colorama==0.4.6
18
+ cryptography==46.0.5
19
+ distro==1.9.0
20
+ Flask==3.1.3
21
+ flask-cors==6.0.2
22
+ Flask-JWT-Extended==4.7.1
23
+ gunicorn>=21.2.0
24
+ google-api-core==2.30.0
25
+ google-auth==2.48.0
26
+ googleapis-common-protos==1.72.0
27
+ h11==0.16.0
28
+ httpcore==1.0.9
29
+ httpx==0.28.1
30
+ httpx-sse==0.4.3
31
+ idna==3.11
32
+ importlib_metadata==8.7.1
33
+ isodate==0.7.2
34
+ itsdangerous==2.2.0
35
+ Jinja2==3.1.6
36
+ jiter==0.13.0
37
+ jsonschema==4.26.0
38
+ jsonschema-specifications==2025.9.1
39
+ MarkupSafe==3.0.3
40
+ mcp==1.26.0
41
+ msal==1.35.0
42
+ msal-extensions==1.3.1
43
+ openai==2.24.0
44
+ opentelemetry-api==1.39.1
45
+ opentelemetry-sdk==1.39.1
46
+ opentelemetry-semantic-conventions==0.60b1
47
+ opentelemetry-semantic-conventions-ai==0.4.14
48
+ packaging==26.0
49
+ proto-plus==1.27.1
50
+ protobuf==6.33.5
51
+ pyasn1==0.6.2
52
+ pyasn1_modules==0.4.2
53
+ pycparser==3.0
54
+ pydantic==2.12.5
55
+ pydantic-settings==2.13.1
56
+ pydantic_core==2.41.5
57
+ PyJWT==2.11.0
58
+ python-dotenv==1.2.1
59
+ python-multipart==0.0.22
60
+ pytz==2025.2
61
+ pywin32==311
62
+ referencing==0.37.0
63
+ requests==2.32.5
64
+ rpds-py==0.30.0
65
+ rsa==4.9.1
66
+ six==1.17.0
67
+ sniffio==1.3.1
68
+ sse-starlette==3.2.0
69
+ starlette==0.52.1
70
+ tqdm==4.67.3
71
+ typing-inspection==0.4.2
72
+ typing_extensions==4.15.0
73
+ tzdata==2025.3
74
+ tzlocal==5.3.1
75
+ urllib3==2.6.3
76
+ uvicorn==0.41.0
77
+ websockets==16.0
78
+ Werkzeug==3.1.6
79
+ zipp==3.23.0
seed_logs.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import json
4
+ import uuid
5
+ import random
6
+ import argparse
7
+ from datetime import datetime, timedelta, timezone
8
+
9
+ import requests
10
+ from dotenv import load_dotenv
11
+
12
+
13
+ # ----------------------------
14
+ # Env helpers
15
+ # ----------------------------
16
+ def getenv_int(key: str, default: int) -> int:
17
+ v = os.getenv(key)
18
+ return default if v is None or v == "" else int(v)
19
+
20
+
21
+ def getenv_float(key: str, default: float) -> float:
22
+ v = os.getenv(key)
23
+ return default if v is None or v == "" else float(v)
24
+
25
+
26
+ def getenv_bool(key: str, default: bool) -> bool:
27
+ v = os.getenv(key)
28
+ if v is None or v == "":
29
+ return default
30
+ return v.strip().lower() in ("1", "true", "yes", "y", "on")
31
+
32
+
33
+ def iso(ts: datetime) -> str:
34
+ return ts.astimezone(timezone.utc).isoformat()
35
+
36
+
37
+ def new_trace_id() -> str:
38
+ return uuid.uuid4().hex # 32 hex chars
39
+
40
+
41
+ # ----------------------------
42
+ # Elasticsearch bulk client
43
+ # ----------------------------
44
+ class ESClient:
45
+ def __init__(
46
+ self,
47
+ base_url: str,
48
+ api_key: str | None,
49
+ username: str | None,
50
+ password: str | None,
51
+ verify_ssl: bool,
52
+ ):
53
+ self.base_url = base_url.rstrip("/")
54
+ self.verify_ssl = verify_ssl
55
+
56
+ self.session = requests.Session()
57
+ self.session.headers.update({"Content-Type": "application/x-ndjson"})
58
+
59
+ if api_key:
60
+ self.session.headers.update({"Authorization": f"ApiKey {api_key}"})
61
+ elif username and password:
62
+ self.session.auth = (username, password)
63
+
64
+ def bulk(self, ndjson_lines: list[str]) -> dict:
65
+ body = "\n".join(ndjson_lines) + "\n" # final newline required
66
+ r = self.session.post(
67
+ f"{self.base_url}/_bulk", data=body, timeout=30, verify=self.verify_ssl
68
+ )
69
+ r.raise_for_status()
70
+ return r.json()
71
+
72
+
73
+ # ----------------------------
74
+ # Document builders (ECS-ish)
75
+ # ----------------------------
76
+ def mk_doc_logs(
77
+ dataset: str,
78
+ namespace: str,
79
+ service_name: str,
80
+ level: str,
81
+ message: str,
82
+ ts: datetime,
83
+ trace_id: str | None,
84
+ service_version: str | None,
85
+ host: str | None,
86
+ extra: dict | None = None,
87
+ ) -> dict:
88
+ doc = {
89
+ "@timestamp": iso(ts),
90
+ "data_stream": {"type": "logs", "dataset": dataset, "namespace": namespace},
91
+ "event": {"dataset": dataset},
92
+ "service": {"name": service_name},
93
+ "log": {"level": level},
94
+ "message": message,
95
+ }
96
+ if service_version:
97
+ doc["service"]["version"] = service_version
98
+ if host:
99
+ doc["host"] = {"name": host}
100
+ if trace_id:
101
+ doc["trace"] = {"id": trace_id}
102
+ if extra:
103
+ doc.update(extra)
104
+ return doc
105
+
106
+
107
+ def action_create(index_name: str) -> str:
108
+ return json.dumps({"create": {"_index": index_name}})
109
+
110
+
111
+ def doc_line(doc: dict) -> str:
112
+ return json.dumps(doc, separators=(",", ":"))
113
+
114
+
115
+ # ----------------------------
116
+ # Seeder logic
117
+ # ----------------------------
118
+ def seed_once(
119
+ es: ESClient,
120
+ namespace: str,
121
+ run_id: str,
122
+ baseline_minutes: int,
123
+ incident_minutes: int,
124
+ recovery_minutes: int,
125
+ rps: int,
126
+ batch_size_requests: int,
127
+ incident_timeout_ratio: float,
128
+ incident_deadlock_ratio: float,
129
+ random_seed: int | None,
130
+ ) -> None:
131
+
132
+ if random_seed is not None:
133
+ random.seed(random_seed)
134
+
135
+ # Data stream names (follow logs-<dataset>-<namespace>)
136
+ checkout_stream = f"logs-checkout_api-{namespace}"
137
+ payment_stream = f"logs-payment_service-{namespace}"
138
+ postgres_stream = f"logs-postgres-{namespace}"
139
+
140
+ total_minutes = baseline_minutes + incident_minutes + recovery_minutes
141
+ start_ts = datetime.now(timezone.utc) - timedelta(minutes=total_minutes)
142
+
143
+ phases = [
144
+ ("baseline", baseline_minutes),
145
+ ("incident", incident_minutes),
146
+ ("recovery", recovery_minutes),
147
+ ]
148
+
149
+ hosts = {
150
+ "checkout": ["chk-1", "chk-2"],
151
+ "payment": ["pay-1", "pay-2", "pay-3"],
152
+ "postgres": ["pg-1"],
153
+ }
154
+
155
+ versions = {
156
+ "checkout": "3.1.2",
157
+ "payment_ok": "2.6.9",
158
+ "payment_bad": "2.7.0",
159
+ }
160
+
161
+ ndjson_lines: list[str] = []
162
+ total_docs = 0
163
+ errors = 0
164
+
165
+ def flush():
166
+ nonlocal ndjson_lines, total_docs, errors
167
+ if not ndjson_lines:
168
+ return
169
+ resp = es.bulk(ndjson_lines)
170
+ if resp.get("errors"):
171
+ errors += 1
172
+ ndjson_lines = []
173
+
174
+ # Optional: emit a deploy marker just before incident begins
175
+ deploy_marker_emitted = False
176
+
177
+ cursor = start_ts
178
+ for phase_name, phase_minutes in phases:
179
+ seconds = phase_minutes * 60
180
+
181
+ # Deploy marker at the start of incident phase
182
+ if phase_name == "incident" and not deploy_marker_emitted:
183
+ deploy_marker_emitted = True
184
+ t_id = None
185
+ pay_host = random.choice(hosts["payment"])
186
+ deploy_doc = mk_doc_logs(
187
+ dataset="payment_service",
188
+ namespace=namespace,
189
+ service_name="payment-service",
190
+ level="INFO",
191
+ message="deploy payment-service v2.7.0",
192
+ ts=cursor,
193
+ trace_id=t_id,
194
+ service_version=versions["payment_bad"],
195
+ host=pay_host,
196
+ extra={
197
+ "labels": {
198
+ "run_id": run_id,
199
+ "phase": "deploy",
200
+ "change_type": "deploy",
201
+ }
202
+ },
203
+ )
204
+ ndjson_lines.append(action_create(payment_stream))
205
+ ndjson_lines.append(doc_line(deploy_doc))
206
+ total_docs += 1
207
+
208
+ for _sec in range(seconds):
209
+ # For each second, generate rps "requests"
210
+ for _ in range(rps):
211
+ t_id = new_trace_id()
212
+
213
+ chk_host = random.choice(hosts["checkout"])
214
+ pay_host = random.choice(hosts["payment"])
215
+ pg_host = random.choice(hosts["postgres"])
216
+
217
+ # Default healthy request
218
+ http_status = 200
219
+ chk_level = "INFO"
220
+ chk_msg = "checkout ok"
221
+
222
+ pay_level = "INFO"
223
+ pay_msg = "payment ok"
224
+ pay_version = versions["payment_ok"]
225
+
226
+ pg_level = "INFO"
227
+ pg_msg = "query ok"
228
+
229
+ # Simulated db pool metrics
230
+ pool_max = 50
231
+ pool_in_use = random.randint(5, 25)
232
+ pool_wait_ms = random.randint(0, 20)
233
+ query_ms = random.randint(5, 40)
234
+
235
+ if phase_name == "incident":
236
+ pay_version = versions["payment_bad"]
237
+
238
+ # Pool usage rises
239
+ pool_in_use = random.randint(45, 50)
240
+ pool_wait_ms = random.randint(200, 2000)
241
+ query_ms = random.randint(200, 6000)
242
+
243
+ # Some requests hit pool acquire timeout
244
+ if random.random() < incident_timeout_ratio:
245
+ pay_level = "ERROR"
246
+ pay_msg = "db.pool acquire timeout"
247
+ http_status = 500
248
+ chk_level = "ERROR"
249
+ chk_msg = "Upstream timeout calling payment-service"
250
+
251
+ # Some requests show deadlock/lock issues
252
+ if random.random() < incident_deadlock_ratio:
253
+ pg_level = "ERROR"
254
+ pg_msg = random.choice(
255
+ [
256
+ "deadlock detected",
257
+ "canceling statement due to lock timeout",
258
+ ]
259
+ )
260
+
261
+ if phase_name == "recovery":
262
+ # Things normalize again
263
+ pool_in_use = random.randint(8, 22)
264
+ pool_wait_ms = random.randint(0, 50)
265
+ query_ms = random.randint(5, 80)
266
+
267
+ base_labels = {"run_id": run_id, "phase": phase_name}
268
+
269
+ checkout_doc = mk_doc_logs(
270
+ dataset="checkout_api",
271
+ namespace=namespace,
272
+ service_name="checkout-api",
273
+ level=chk_level,
274
+ message=chk_msg,
275
+ ts=cursor,
276
+ trace_id=t_id,
277
+ service_version=versions["checkout"],
278
+ host=chk_host,
279
+ extra={
280
+ "http": {
281
+ "request": {"method": "POST"},
282
+ "response": {"status_code": http_status},
283
+ },
284
+ "labels": {**base_labels, "timeout_ms": 2000},
285
+ },
286
+ )
287
+
288
+ payment_doc = mk_doc_logs(
289
+ dataset="payment_service",
290
+ namespace=namespace,
291
+ service_name="payment-service",
292
+ level=pay_level,
293
+ message=pay_msg,
294
+ ts=cursor,
295
+ trace_id=t_id,
296
+ service_version=pay_version,
297
+ host=pay_host,
298
+ extra={
299
+ "labels": {
300
+ **base_labels,
301
+ "db.pool_max": pool_max,
302
+ "db.pool_in_use": pool_in_use,
303
+ "db.pool_wait_ms": pool_wait_ms,
304
+ "db.query_ms": query_ms,
305
+ }
306
+ },
307
+ )
308
+
309
+ postgres_doc = mk_doc_logs(
310
+ dataset="postgres",
311
+ namespace=namespace,
312
+ service_name="postgres",
313
+ level=pg_level,
314
+ message=pg_msg,
315
+ ts=cursor,
316
+ trace_id=t_id,
317
+ service_version=None,
318
+ host=pg_host,
319
+ extra={
320
+ "labels": {
321
+ **base_labels,
322
+ "db.query_ms": query_ms,
323
+ "db.query": "UPDATE orders SET status=?",
324
+ }
325
+ },
326
+ )
327
+
328
+ # Add to bulk payload (3 docs per request)
329
+ for index_name, doc in [
330
+ (checkout_stream, checkout_doc),
331
+ (payment_stream, payment_doc),
332
+ (postgres_stream, postgres_doc),
333
+ ]:
334
+ ndjson_lines.append(action_create(index_name))
335
+ ndjson_lines.append(doc_line(doc))
336
+ total_docs += 1
337
+
338
+ # Flush when batch is big enough (batch_size_requests requests -> 3*2*batch_size_requests lines)
339
+ if (total_docs % (batch_size_requests * 3)) == 0:
340
+ flush()
341
+
342
+ cursor += timedelta(seconds=1)
343
+
344
+ flush()
345
+
346
+ print("Seed complete")
347
+ print(f"Namespace: {namespace}")
348
+ print(f"Run ID: {run_id}")
349
+ print(f"Streams: {checkout_stream}, {payment_stream}, {postgres_stream}")
350
+ print(f"Docs indexed: {total_docs}")
351
+ print(
352
+ f"Bulk responses with errors: {errors} (check Elasticsearch response details if > 0)"
353
+ )
354
+
355
+
356
+ def main():
357
+ load_dotenv() # reads .env into environment variables for os.getenv [web:320]
358
+
359
+ parser = argparse.ArgumentParser(
360
+ description="One-time ECS-ish log seeder: baseline -> incident -> recovery"
361
+ )
362
+ parser.add_argument("--namespace", default=os.getenv("SEED_NAMESPACE", "mvp1"))
363
+ parser.add_argument(
364
+ "--run-id", default=os.getenv("SEED_RUN_ID", f"run_{uuid.uuid4().hex[:8]}")
365
+ )
366
+ args = parser.parse_args()
367
+
368
+ es_url = os.getenv("ES_URL", "http://localhost:9200")
369
+ es_api_key = os.getenv("ES_API_KEY") # base64 ApiKey
370
+ es_username = os.getenv("ES_USERNAME")
371
+ es_password = os.getenv("ES_PASSWORD")
372
+ verify_ssl = getenv_bool("ES_VERIFY_SSL", True)
373
+
374
+ baseline_minutes = getenv_int("SEED_BASELINE_MINUTES", 10)
375
+ incident_minutes = getenv_int("SEED_INCIDENT_MINUTES", 3)
376
+ recovery_minutes = getenv_int("SEED_RECOVERY_MINUTES", 5)
377
+
378
+ rps = getenv_int("SEED_RPS", 5)
379
+ batch_size_requests = getenv_int("SEED_BATCH_SIZE_REQUESTS", 200)
380
+
381
+ incident_timeout_ratio = getenv_float("SEED_INCIDENT_TIMEOUT_RATIO", 0.30)
382
+ incident_deadlock_ratio = getenv_float("SEED_INCIDENT_DEADLOCK_RATIO", 0.20)
383
+
384
+ random_seed = os.getenv("SEED_RANDOM_SEED")
385
+ random_seed = None if not random_seed else int(random_seed)
386
+
387
+ es = ESClient(
388
+ base_url=es_url,
389
+ api_key=es_api_key,
390
+ username=es_username,
391
+ password=es_password,
392
+ verify_ssl=verify_ssl,
393
+ )
394
+
395
+ seed_once(
396
+ es=es,
397
+ namespace=args.namespace,
398
+ run_id=args.run_id,
399
+ baseline_minutes=baseline_minutes,
400
+ incident_minutes=incident_minutes,
401
+ recovery_minutes=recovery_minutes,
402
+ rps=rps,
403
+ batch_size_requests=batch_size_requests,
404
+ incident_timeout_ratio=incident_timeout_ratio,
405
+ incident_deadlock_ratio=incident_deadlock_ratio,
406
+ random_seed=random_seed,
407
+ )
408
+
409
+
410
+ if __name__ == "__main__":
411
+ main()