Spaces:
Sleeping
Sleeping
Trae Assistant
Initial commit: Incident Postmortem Pro with 5-Whys, Timeline, and robust features
7bb2782
import json
import os
import uuid
from datetime import datetime, timezone

from flask import Flask, render_template, request, jsonify, send_from_directory
app = Flask(__name__)
# Cap request payload size (16 MB) to guard against oversized uploads.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024

# Storage layout: a single JSON file under ./data, next to this module.
BASE_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.join(BASE_DIR, 'data')
DATA_FILE = os.path.join(DATA_DIR, 'postmortems.json')

# Make sure the data directory exists before any request handler runs.
os.makedirs(DATA_DIR, exist_ok=True)
def _default_postmortems():
    """Return the built-in demo dataset used to seed an empty store."""
    return [
        {
            "id": "demo-001",
            "title": "生产环境数据库连接池耗尽导致服务不可用",
            "date": "2023-10-24",
            "severity": "P0",
            "owner": "张三",
            "created_at": "2023-10-24T10:00:00Z",
            "summary": "在双十一预热活动期间,主数据库连接数突然飙升至 100%,导致订单服务无法连接数据库,所有下单请求失败。持续时间约 15 分钟。",
            "impact": "造成约 5000 单交易失败,预估 GMV 损失 100 万。用户投诉激增。",
            "timeline": [
                {"time": "20:00", "description": "流量开始激增,达到平时峰值的 3 倍", "type": "detection"},
                {"time": "20:05", "description": "监控系统报警,DB CPU 飙升至 90%", "type": "detection"},
                {"time": "20:08", "description": "DBA 介入排查,发现活动连接数打满", "type": "investigation"},
                {"time": "20:12", "description": "紧急扩容 Read Replica,并临时调大 Max Connections", "type": "fix"},
                {"time": "20:15", "description": "服务逐步恢复正常", "type": "verification"}
            ],
            "root_cause": {
                "whys": [
                    "数据库连接数打满,拒绝新连接",
                    "后端服务实例扩容后,每个实例的连接池配置未做相应调整",
                    "微服务配置中心下发的连接池参数是静态的,未随实例数动态计算",
                    "在进行压力测试时,只测试了单实例性能,未测试全链路大规模并发下的 DB 瓶颈",
                    "缺乏对数据库总连接数的全局管控机制"
                ],
                "conclusion": "缺乏全局的数据库连接治理机制,且压测覆盖不全。"
            },
            "action_items": [
                {"task": "实施数据库连接代理 (Proxy) 层", "owner": "李四", "deadline": "2023-11-01", "status": "in_progress"},
                {"task": "优化全链路压测模型,包含 DB 极限场景", "owner": "王五", "deadline": "2023-11-15", "status": "pending"}
            ]
        },
        {
            "id": "demo-002",
            "title": "支付回调接口验签逻辑缺陷",
            "date": "2023-09-15",
            "severity": "P2",
            "owner": "赵六",
            "created_at": "2023-09-15T14:00:00Z",
            "summary": "收到用户反馈支付成功但订单状态未更新。排查发现部分特殊字符导致签名验证失败。",
            "impact": "约 20 笔订单卡单,需人工补单。",
            "timeline": [
                {"time": "14:00", "description": "客服收到用户反馈", "type": "detection"},
                {"time": "14:30", "description": "定位到日志中存在大量验签失败错误", "type": "investigation"},
                {"time": "15:00", "description": "修复验签逻辑中的字符编码处理问题", "type": "fix"}
            ],
            "root_cause": {
                "whys": [
                    "签名验证失败",
                    "第三方支付回调参数中包含特殊 Emoji 字符",
                    "验签逻辑未正确处理 UTF-8 编码",
                    "开发测试阶段未覆盖特殊字符场景",
                    # Empty fifth "why" kept on purpose: the UI presumably renders
                    # five slots and export skips blanks — confirm before removing.
                    ""
                ],
                "conclusion": "编码规范执行不到位,测试用例缺失。"
            },
            "action_items": [
                {"task": "修复代码并补充单元测试", "owner": "赵六", "deadline": "2023-09-16", "status": "done"}
            ]
        }
    ]


def load_data():
    """Load every postmortem from DATA_FILE as a list of dicts.

    On first run (file absent) the store is seeded with the demo dataset.
    A corrupt or unreadable store degrades to an empty list rather than a
    500 response, preserving the original best-effort behavior — but the
    original bare ``except Exception`` is narrowed to the errors that
    reading/parsing JSON can actually raise.
    """
    if not os.path.exists(DATA_FILE):
        default_data = _default_postmortems()
        save_data(default_data)
        return default_data
    try:
        with open(DATA_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError (its subclass).
        return []
def save_data(data):
    """Atomically persist *data* to DATA_FILE as pretty-printed UTF-8 JSON.

    Writes to a sibling temp file and ``os.replace``s it into place, so a
    crash mid-write cannot leave a truncated store for load_data to choke
    on. ``ensure_ascii=False`` keeps the Chinese content human-readable.
    """
    tmp_path = DATA_FILE + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, DATA_FILE)  # atomic on POSIX and Windows
# NOTE(review): the @app.route decorators appear to have been stripped from
# this copy of the file (no handler is registered with Flask, so the app
# would serve nothing). Paths are reconstructed from the handler names and
# REST convention — confirm against templates/index.html's fetch() calls.
@app.route('/')
def index():
    """Serve the single-page UI."""
    return render_template('index.html')
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems', methods=['GET'])
def get_postmortems():
    """Return all postmortems as JSON, newest first.

    The store is append-ordered, so reversing the list approximates
    "sort by creation time descending" without parsing timestamps.
    """
    data = load_data()
    return jsonify(data[::-1])
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems', methods=['POST'])
def create_postmortem():
    """Create a postmortem from the JSON request body.

    Generates a UUID when the client omits ``id`` and stamps
    ``created_at`` / ``updated_at`` with the current UTC time when the
    client leaves them empty (the original stored '' here, which defeated
    any later sorting by time). Returns 201 with the stored item, or 400
    for a missing/non-object body (the original crashed with a 500).
    """
    new_item = request.get_json(silent=True)
    if not isinstance(new_item, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400
    if not new_item.get('id'):
        new_item['id'] = str(uuid.uuid4())
    now = datetime.now(timezone.utc).isoformat()
    # `or` (not setdefault) so an explicit empty string is also upgraded.
    new_item['created_at'] = new_item.get('created_at') or now
    new_item['updated_at'] = new_item.get('updated_at') or now
    data = load_data()
    data.append(new_item)
    save_data(data)
    return jsonify(new_item), 201
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems/<item_id>', methods=['PUT'])
def update_postmortem(item_id):
    """Replace the postmortem *item_id* with the JSON request body.

    The URL, not the payload, is authoritative for the id: the original
    stored the payload verbatim, so a missing or different ``id`` in the
    body silently orphaned the record (delete/export could no longer find
    it). Returns the updated item, 400 for a bad body, 404 if unknown.
    """
    updated_item = request.get_json(silent=True)
    if not isinstance(updated_item, dict):
        return jsonify({'error': 'Invalid JSON body'}), 400
    updated_item['id'] = item_id
    data = load_data()
    for i, item in enumerate(data):
        if item.get('id') == item_id:
            data[i] = updated_item
            save_data(data)
            return jsonify(updated_item)
    return jsonify({'error': 'Not found'}), 404
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems/<item_id>', methods=['DELETE'])
def delete_postmortem(item_id):
    """Delete the postmortem *item_id*.

    Deliberately idempotent: reports success even when the id was absent,
    matching the original behavior. Uses ``.get('id')`` so a malformed
    entry without an id cannot raise KeyError mid-delete.
    """
    data = load_data()
    remaining = [item for item in data if item.get('id') != item_id]
    save_data(remaining)
    return jsonify({'success': True})
# NOTE(review): route reconstructed — decorators were stripped from this copy.
@app.route('/api/postmortems/<item_id>/export', methods=['GET'])
def export_markdown(item_id):
    """Export one postmortem as Markdown, returned as {'markdown': str}.

    Returns a plain 404 when the id is unknown. Rendering is delegated to
    the pure helper below; the output bytes are identical to the original
    inline implementation.
    """
    data = load_data()
    item = next((i for i in data if i.get('id') == item_id), None)
    if not item:
        return "Not Found", 404
    return jsonify({'markdown': _render_markdown(item)})


def _render_markdown(item):
    """Render a postmortem dict as a Markdown report (pure function).

    Missing fields fall back to '' (or '无' for summary/impact), so a
    partially filled record still exports without raising.
    """
    parts = [
        f"# 事故复盘: {item.get('title', '无标题')}\n\n",
        f"**日期**: {item.get('date', '')} | **级别**: {item.get('severity', '')} | **负责人**: {item.get('owner', '')}\n\n",
        "## 1. 事故摘要 (Summary)\n",
        f"{item.get('summary', '无')}\n\n",
        "## 2. 影响范围 (Impact)\n",
        f"{item.get('impact', '无')}\n\n",
        "## 3. 时间线 (Timeline)\n",
    ]
    for entry in item.get('timeline', []):
        parts.append(
            f"- **{entry.get('time', '')}**: {entry.get('description', '')} ({entry.get('type', '')})\n"
        )
    parts.append("\n")
    parts.append("## 4. 根因分析 (5 Whys)\n")
    root_cause = item.get('root_cause', {})
    for idx, why in enumerate(root_cause.get('whys', [])):
        if why:  # blank "why" slots are skipped, not numbered
            parts.append(f"{idx+1}. Why? {why}\n")
    parts.append(f"\n**结论**: {root_cause.get('conclusion', '')}\n\n")
    parts.append("## 5. 改进措施 (Action Items)\n")
    parts.append("| 任务 | 负责人 | 截止日期 | 状态 |\n")
    parts.append("| --- | --- | --- | --- |\n")
    for action in item.get('action_items', []):
        parts.append(
            f"| {action.get('task', '')} | {action.get('owner', '')} | {action.get('deadline', '')} | {action.get('status', '')} |\n"
        )
    return "".join(parts)
if __name__ == '__main__':
    # Port 7860 is the Hugging Face Spaces convention — presumably this app
    # is deployed there; confirm before changing.
    # Hard-coded debug=True on 0.0.0.0 exposed the Werkzeug remote debugger
    # (arbitrary code execution) to the whole network; gate it behind an
    # env var so production runs with it off by default.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', port=7860, debug=debug)