# Trae Assistant
# Initial commit: Incident Postmortem Pro with 5-Whys, Timeline, and robust features
# 7bb2782
import os
import json
import uuid
from flask import Flask, render_template, request, jsonify, send_from_directory
# Flask application object; request bodies are capped at 16 MB for safety.
app = Flask(__name__)
app.config.update(MAX_CONTENT_LENGTH=16 * 1024 * 1024)

# Configuration: all records live in one JSON file inside a "data" folder
# located next to this module.
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
DATA_FILE = os.path.join(DATA_DIR, 'postmortems.json')

# Make sure the data directory exists before any read or write happens.
os.makedirs(DATA_DIR, exist_ok=True)
def _default_postmortems():
    """Return the demo postmortems used to seed a brand-new data file."""
    return [
        {
            "id": "demo-001",
            "title": "生产环境数据库连接池耗尽导致服务不可用",
            "date": "2023-10-24",
            "severity": "P0",
            "owner": "张三",
            "created_at": "2023-10-24T10:00:00Z",
            "summary": "在双十一预热活动期间,主数据库连接数突然飙升至 100%,导致订单服务无法连接数据库,所有下单请求失败。持续时间约 15 分钟。",
            "impact": "造成约 5000 单交易失败,预估 GMV 损失 100 万。用户投诉激增。",
            "timeline": [
                {"time": "20:00", "description": "流量开始激增,达到平时峰值的 3 倍", "type": "detection"},
                {"time": "20:05", "description": "监控系统报警,DB CPU 飙升至 90%", "type": "detection"},
                {"time": "20:08", "description": "DBA 介入排查,发现活动连接数打满", "type": "investigation"},
                {"time": "20:12", "description": "紧急扩容 Read Replica,并临时调大 Max Connections", "type": "fix"},
                {"time": "20:15", "description": "服务逐步恢复正常", "type": "verification"},
            ],
            "root_cause": {
                "whys": [
                    "数据库连接数打满,拒绝新连接",
                    "后端服务实例扩容后,每个实例的连接池配置未做相应调整",
                    "微服务配置中心下发的连接池参数是静态的,未随实例数动态计算",
                    "在进行压力测试时,只测试了单实例性能,未测试全链路大规模并发下的 DB 瓶颈",
                    "缺乏对数据库总连接数的全局管控机制",
                ],
                "conclusion": "缺乏全局的数据库连接治理机制,且压测覆盖不全。",
            },
            "action_items": [
                {"task": "实施数据库连接代理 (Proxy) 层", "owner": "李四", "deadline": "2023-11-01", "status": "in_progress"},
                {"task": "优化全链路压测模型,包含 DB 极限场景", "owner": "王五", "deadline": "2023-11-15", "status": "pending"},
            ],
        },
        {
            "id": "demo-002",
            "title": "支付回调接口验签逻辑缺陷",
            "date": "2023-09-15",
            "severity": "P2",
            "owner": "赵六",
            "created_at": "2023-09-15T14:00:00Z",
            "summary": "收到用户反馈支付成功但订单状态未更新。排查发现部分特殊字符导致签名验证失败。",
            "impact": "约 20 笔订单卡单,需人工补单。",
            "timeline": [
                {"time": "14:00", "description": "客服收到用户反馈", "type": "detection"},
                {"time": "14:30", "description": "定位到日志中存在大量验签失败错误", "type": "investigation"},
                {"time": "15:00", "description": "修复验签逻辑中的字符编码处理问题", "type": "fix"},
            ],
            "root_cause": {
                "whys": [
                    "签名验证失败",
                    "第三方支付回调参数中包含特殊 Emoji 字符",
                    "验签逻辑未正确处理 UTF-8 编码",
                    "开发测试阶段未覆盖特殊字符场景",
                    "",
                ],
                "conclusion": "编码规范执行不到位,测试用例缺失。",
            },
            "action_items": [
                {"task": "修复代码并补充单元测试", "owner": "赵六", "deadline": "2023-09-16", "status": "done"},
            ],
        },
    ]


def load_data():
    """Load every stored postmortem from DATA_FILE.

    When the file does not exist yet, the store is seeded with the demo
    records: they are written through save_data() and returned.

    Returns:
        list: the stored postmortem records. An empty list is returned (and
        the problem is logged) when the file cannot be read, is not valid
        JSON, or does not hold a JSON array.
    """
    # Imported locally so the module's import header does not need to change.
    import logging

    logger = logging.getLogger(__name__)

    if not os.path.exists(DATA_FILE):
        # First run: initialise the store with the demo data.
        default_data = _default_postmortems()
        save_data(default_data)
        return default_data

    try:
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses. Stay best-effort like before, but leave a trace instead
        # of hiding the failure completely.
        logger.exception("Could not read %s; returning an empty list", DATA_FILE)
        return []

    if not isinstance(data, list):
        # Callers append to and slice the result, so anything but a list
        # would make them fail later with a confusing error.
        logger.warning("%s does not contain a JSON array; returning an empty list", DATA_FILE)
        return []
    return data
def save_data(data):
    """Persist *data* to DATA_FILE as pretty-printed UTF-8 JSON.

    The JSON is written to a sibling temporary file first and then moved over
    DATA_FILE with os.replace(). Opening DATA_FILE directly in 'w' mode would
    truncate it before serialization starts, so a failed write would leave a
    truncated file behind.

    Args:
        data: JSON-serializable object to store.

    Raises:
        OSError: if the temporary file cannot be written or moved into place.
        TypeError / ValueError: if *data* cannot be serialized by json.dump.
    """
    tmp_path = DATA_FILE + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, DATA_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the leftover of a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
@app.route('/')
def index():
    """Render the ``index.html`` template for the root URL."""
    page = render_template('index.html')
    return page
@app.route('/api/postmortems', methods=['GET'])
def get_postmortems():
    """Return all stored postmortems as a JSON array, last-stored first."""
    # No sort by timestamp is done; the stored list order is simply reversed.
    newest_first = load_data()[::-1]
    return jsonify(newest_first)
@app.route('/api/postmortems', methods=['POST'])
def create_postmortem():
    """Create a postmortem from the JSON object in the request body.

    A missing or empty ``id`` is replaced by a fresh UUID4 string; a supplied
    id is normalised to ``str``. ``created_at`` and ``updated_at`` default to
    an empty string when absent.

    Returns:
        201 with the stored record on success,
        400 with ``{'error': ...}`` when the body is not a JSON object,
        409 with ``{'error': ...}`` when the supplied id is already in use.
    """
    # silent=True yields None for a missing/invalid JSON body, so every bad
    # payload ends up in the same JSON error response below.
    new_item = request.get_json(silent=True)
    if not isinstance(new_item, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    data = load_data()

    # Basic validation and default values.
    if not new_item.get('id'):
        new_item['id'] = str(uuid.uuid4())
    else:
        # Ids are matched against the string <item_id> URL segment by the
        # other routes, so a non-string id could never be addressed.
        new_item['id'] = str(new_item['id'])
        id_taken = any(
            isinstance(existing, dict) and existing.get('id') == new_item['id']
            for existing in data
        )
        if id_taken:
            # A duplicate id would make update/export act on only one copy.
            return jsonify({'error': 'A postmortem with this id already exists'}), 409

    new_item.setdefault('created_at', '')
    new_item.setdefault('updated_at', '')

    data.append(new_item)
    save_data(data)
    return jsonify(new_item), 201
@app.route('/api/postmortems/<item_id>', methods=['PUT'])
def update_postmortem(item_id):
    """Replace the postmortem whose id equals *item_id* with the request body.

    The stored record always keeps ``id == item_id``: the URL is treated as
    authoritative, so a body that omits or changes ``id`` cannot orphan the
    record.

    Returns:
        200 with the stored record on success,
        400 with ``{'error': ...}`` when the body is not a JSON object,
        404 with ``{'error': 'Not found'}`` when no record has that id.
    """
    updated_item = request.get_json(silent=True)
    if not isinstance(updated_item, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400
    updated_item['id'] = item_id

    data = load_data()
    for i, item in enumerate(data):
        # .get() keeps a malformed stored record (no id) from raising KeyError.
        if isinstance(item, dict) and item.get('id') == item_id:
            data[i] = updated_item
            save_data(data)
            return jsonify(updated_item)
    return jsonify({'error': 'Not found'}), 404
@app.route('/api/postmortems/<item_id>', methods=['DELETE'])
def delete_postmortem(item_id):
    """Delete every stored postmortem whose id equals *item_id*.

    The call always answers ``{'success': True}``, including when no record
    matched, so repeating a delete is harmless.
    """
    data = load_data()
    # .get() tolerates stored records without an "id" key (and non-dict
    # entries), which would otherwise raise KeyError/TypeError here.
    remaining = [
        item for item in data
        if not (isinstance(item, dict) and item.get('id') == item_id)
    ]
    # Only rewrite the file when something was actually removed.
    if len(remaining) != len(data):
        save_data(remaining)
    return jsonify({'success': True})
def _md_cell(value):
    """Return *value* as text that cannot break a Markdown table row."""
    text = '' if value is None else str(value)
    # A raw "|" would start a new column and a line break would end the row.
    text = text.replace('|', '\\|')
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')


def _render_markdown(item):
    """Build the Markdown report text for one postmortem record."""
    # Fields stored as JSON null (or with an unexpected type) are treated as empty.
    root_cause = item.get('root_cause')
    if not isinstance(root_cause, dict):
        root_cause = {}

    parts = [
        f"# 事故复盘: {item.get('title', '无标题')}\n\n",
        f"**日期**: {item.get('date', '')} | **级别**: {item.get('severity', '')} | **负责人**: {item.get('owner', '')}\n\n",
        "## 1. 事故摘要 (Summary)\n",
        f"{item.get('summary', '无')}\n\n",
        "## 2. 影响范围 (Impact)\n",
        f"{item.get('impact', '无')}\n\n",
        "## 3. 时间线 (Timeline)\n",
    ]
    for t in item.get('timeline') or []:
        parts.append(f"- **{t.get('time', '')}**: {t.get('description', '')} ({t.get('type', '')})\n")
    parts.append("\n")

    parts.append("## 4. 根因分析 (5 Whys)\n")
    for idx, why in enumerate(root_cause.get('whys') or []):
        # Blank answers are skipped but keep their position number.
        if why:
            parts.append(f"{idx+1}. Why? {why}\n")
    parts.append(f"\n**结论**: {root_cause.get('conclusion', '')}\n\n")

    parts.append("## 5. 改进措施 (Action Items)\n")
    parts.append("| 任务 | 负责人 | 截止日期 | 状态 |\n")
    parts.append("| --- | --- | --- | --- |\n")
    for action in item.get('action_items') or []:
        parts.append(
            f"| {_md_cell(action.get('task', ''))} | {_md_cell(action.get('owner', ''))} "
            f"| {_md_cell(action.get('deadline', ''))} | {_md_cell(action.get('status', ''))} |\n"
        )
    return ''.join(parts)


@app.route('/api/export/<item_id>', methods=['GET'])
def export_markdown(item_id):
    """Export the postmortem with id *item_id* as a Markdown document.

    Returns:
        200 with ``{'markdown': <text>}`` on success,
        404 with the plain text ``Not Found`` when no record has that id.
    """
    data = load_data()
    item = next(
        (i for i in data if isinstance(i, dict) and i.get('id') == item_id),
        None,
    )
    if item is None:
        return "Not Found", 404
    return jsonify({'markdown': _render_markdown(item)})
if __name__ == '__main__':
    # The server listens on all interfaces (0.0.0.0). Flask's debug mode
    # enables the interactive Werkzeug debugger, which lets anyone who can
    # reach the port run arbitrary Python code, so it is opt-in only:
    # set FLASK_DEBUG=1 (or true/yes/on) for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)