Spaces:
Sleeping
Sleeping
Trae Assistant
Initial commit: Incident Postmortem Pro with 5-Whys, Timeline, and robust features
7bb2782
import json
import os
import uuid
from datetime import datetime, timezone

from flask import Flask, render_template, request, jsonify, send_from_directory
app = Flask(__name__)
# Cap request payload size (16 MB) to guard against oversized uploads.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024

# Storage layout: a single JSON file under ./data, next to this module.
BASE_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.join(BASE_DIR, 'data')
DATA_FILE = os.path.join(DATA_DIR, 'postmortems.json')

# Make sure the data directory exists before any request handler runs.
os.makedirs(DATA_DIR, exist_ok=True)
def _default_postmortems():
    """Return the built-in demo dataset used to seed an empty store."""
    return [
        {
            "id": "demo-001",
            "title": "生产环境数据库连接池耗尽导致服务不可用",
            "date": "2023-10-24",
            "severity": "P0",
            "owner": "张三",
            "created_at": "2023-10-24T10:00:00Z",
            "summary": "在双十一预热活动期间,主数据库连接数突然飙升至 100%,导致订单服务无法连接数据库,所有下单请求失败。持续时间约 15 分钟。",
            "impact": "造成约 5000 单交易失败,预估 GMV 损失 100 万。用户投诉激增。",
            "timeline": [
                {"time": "20:00", "description": "流量开始激增,达到平时峰值的 3 倍", "type": "detection"},
                {"time": "20:05", "description": "监控系统报警,DB CPU 飙升至 90%", "type": "detection"},
                {"time": "20:08", "description": "DBA 介入排查,发现活动连接数打满", "type": "investigation"},
                {"time": "20:12", "description": "紧急扩容 Read Replica,并临时调大 Max Connections", "type": "fix"},
                {"time": "20:15", "description": "服务逐步恢复正常", "type": "verification"}
            ],
            "root_cause": {
                "whys": [
                    "数据库连接数打满,拒绝新连接",
                    "后端服务实例扩容后,每个实例的连接池配置未做相应调整",
                    "微服务配置中心下发的连接池参数是静态的,未随实例数动态计算",
                    "在进行压力测试时,只测试了单实例性能,未测试全链路大规模并发下的 DB 瓶颈",
                    "缺乏对数据库总连接数的全局管控机制"
                ],
                "conclusion": "缺乏全局的数据库连接治理机制,且压测覆盖不全。"
            },
            "action_items": [
                {"task": "实施数据库连接代理 (Proxy) 层", "owner": "李四", "deadline": "2023-11-01", "status": "in_progress"},
                {"task": "优化全链路压测模型,包含 DB 极限场景", "owner": "王五", "deadline": "2023-11-15", "status": "pending"}
            ]
        },
        {
            "id": "demo-002",
            "title": "支付回调接口验签逻辑缺陷",
            "date": "2023-09-15",
            "severity": "P2",
            "owner": "赵六",
            "created_at": "2023-09-15T14:00:00Z",
            "summary": "收到用户反馈支付成功但订单状态未更新。排查发现部分特殊字符导致签名验证失败。",
            "impact": "约 20 笔订单卡单,需人工补单。",
            "timeline": [
                {"time": "14:00", "description": "客服收到用户反馈", "type": "detection"},
                {"time": "14:30", "description": "定位到日志中存在大量验签失败错误", "type": "investigation"},
                {"time": "15:00", "description": "修复验签逻辑中的字符编码处理问题", "type": "fix"}
            ],
            "root_cause": {
                "whys": [
                    "签名验证失败",
                    "第三方支付回调参数中包含特殊 Emoji 字符",
                    "验签逻辑未正确处理 UTF-8 编码",
                    "开发测试阶段未覆盖特殊字符场景",
                    # Empty fifth "why" kept on purpose: the UI presumably renders
                    # five slots and export skips blanks — confirm before removing.
                    ""
                ],
                "conclusion": "编码规范执行不到位,测试用例缺失。"
            },
            "action_items": [
                {"task": "修复代码并补充单元测试", "owner": "赵六", "deadline": "2023-09-16", "status": "done"}
            ]
        }
    ]


def load_data():
    """Load every postmortem from DATA_FILE as a list of dicts.

    On first run (file absent) the store is seeded with the demo dataset.
    A corrupt or unreadable store degrades to an empty list rather than a
    500 response, preserving the original best-effort behavior — but the
    original bare ``except Exception`` is narrowed to the errors that
    reading/parsing JSON can actually raise.
    """
    if not os.path.exists(DATA_FILE):
        default_data = _default_postmortems()
        save_data(default_data)
        return default_data
    try:
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError (its subclass).
        return []
def save_data(data):
    """Atomically persist *data* to DATA_FILE as pretty-printed UTF-8 JSON.

    Writes to a sibling temp file and ``os.replace``s it into place, so a
    crash mid-write cannot leave a truncated store for load_data to choke
    on. ``ensure_ascii=False`` keeps the Chinese content human-readable.
    """
    tmp_path = DATA_FILE + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, DATA_FILE)  # atomic on POSIX and Windows
# NOTE(review): the @app.route decorators appear to have been stripped from
# this copy of the file (no handler is registered with Flask, so the app
# would serve nothing). Paths are reconstructed from the handler names and
# REST convention — confirm against templates/index.html's fetch() calls.
@app.route('/')
def index():
    """Serve the single-page UI."""
    return render_template('index.html')
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems', methods=['GET'])
def get_postmortems():
    """Return all postmortems as JSON, newest first.

    The store is append-ordered, so reversing the list approximates
    "sort by creation time descending" without parsing timestamps.
    """
    data = load_data()
    return jsonify(data[::-1])
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems', methods=['POST'])
def create_postmortem():
    """Create a postmortem from the JSON request body.

    Generates a UUID when the client omits ``id`` and stamps
    ``created_at`` / ``updated_at`` with the current UTC time when the
    client leaves them empty (the original stored '' here, which defeated
    any later sorting by time). Returns 201 with the stored item, or 400
    for a missing/non-object body (the original crashed with a 500).
    """
    new_item = request.get_json(silent=True)
    if not isinstance(new_item, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400
    if not new_item.get('id'):
        new_item['id'] = str(uuid.uuid4())
    now = datetime.now(timezone.utc).isoformat()
    # `or` (not setdefault) so an explicit empty string is also upgraded.
    new_item['created_at'] = new_item.get('created_at') or now
    new_item['updated_at'] = new_item.get('updated_at') or now
    data = load_data()
    data.append(new_item)
    save_data(data)
    return jsonify(new_item), 201
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems/<item_id>', methods=['PUT'])
def update_postmortem(item_id):
    """Replace the postmortem *item_id* with the JSON request body.

    The URL, not the payload, is authoritative for the id: the original
    stored the payload verbatim, so a missing or different ``id`` in the
    body silently orphaned the record (delete/export could no longer find
    it). Returns the updated item, 400 for a bad body, 404 if unknown.
    """
    updated_item = request.get_json(silent=True)
    if not isinstance(updated_item, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400
    updated_item['id'] = item_id
    data = load_data()
    for i, item in enumerate(data):
        if item.get('id') == item_id:
            data[i] = updated_item
            save_data(data)
            return jsonify(updated_item)
    return jsonify({'error': 'Not found'}), 404
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems/<item_id>', methods=['DELETE'])
def delete_postmortem(item_id):
    """Delete the postmortem *item_id*.

    Deliberately idempotent: reports success even when the id was absent,
    matching the original behavior. Uses ``.get('id')`` so a malformed
    entry without an id cannot raise KeyError mid-delete.
    """
    data = load_data()
    remaining = [item for item in data if item.get('id') != item_id]
    save_data(remaining)
    return jsonify({'success': True})
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems/<item_id>/export', methods=['GET'])
def export_markdown(item_id):
    """Export one postmortem as Markdown, returned as {'markdown': str}.

    Returns a plain 404 when the id is unknown. Rendering is delegated to
    the pure helper below; the output bytes are identical to the original
    inline implementation.
    """
    data = load_data()
    item = next((i for i in data if i.get('id') == item_id), None)
    if not item:
        return "Not Found", 404
    return jsonify({'markdown': _render_markdown(item)})


def _render_markdown(item):
    """Render a postmortem dict as a Markdown report (pure function).

    Missing fields fall back to '' (or '无' for summary/impact), so a
    partially filled record still exports without raising.
    """
    parts = [
        f"# 事故复盘: {item.get('title', '无标题')}\n\n",
        f"**日期**: {item.get('date', '')} | **级别**: {item.get('severity', '')} | **负责人**: {item.get('owner', '')}\n\n",
        "## 1. 事故摘要 (Summary)\n",
        f"{item.get('summary', '无')}\n\n",
        "## 2. 影响范围 (Impact)\n",
        f"{item.get('impact', '无')}\n\n",
        "## 3. 时间线 (Timeline)\n",
    ]
    for entry in item.get('timeline', []):
        parts.append(
            f"- **{entry.get('time', '')}**: {entry.get('description', '')} ({entry.get('type', '')})\n"
        )
    parts.append("\n")
    parts.append("## 4. 根因分析 (5 Whys)\n")
    root_cause = item.get('root_cause', {})
    for idx, why in enumerate(root_cause.get('whys', [])):
        if why:  # blank "why" slots are skipped, not numbered
            parts.append(f"{idx+1}. Why? {why}\n")
    parts.append(f"\n**结论**: {root_cause.get('conclusion', '')}\n\n")
    parts.append("## 5. 改进措施 (Action Items)\n")
    parts.append("| 任务 | 负责人 | 截止日期 | 状态 |\n")
    parts.append("| --- | --- | --- | --- |\n")
    for action in item.get('action_items', []):
        parts.append(
            f"| {action.get('task', '')} | {action.get('owner', '')} | {action.get('deadline', '')} | {action.get('status', '')} |\n"
        )
    return "".join(parts)
if __name__ == '__main__':
    # Port 7860 is the Hugging Face Spaces convention — presumably this app
    # is deployed there; confirm before changing.
    # Hard-coded debug=True on 0.0.0.0 exposed the Werkzeug remote debugger
    # (arbitrary code execution) to the whole network; gate it behind an
    # env var so production runs with it off by default.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=7860, debug=debug)