Spaces:
Build error
Build error
OpenCode Deployer committed on
Commit ·
4ca5973
1
Parent(s): ede55f3
update
Browse files- DEPLOYMENT.md +585 -0
- SYSTEM_SUMMARY.md +117 -0
- config_template.json +219 -0
- core_system.py +538 -0
- docker-compose.yml +143 -0
- error_analyzer.py +504 -0
- huggingface_client.py +289 -0
- usage_examples.py +457 -0
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,585 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HuggingFace Spaces 自动修复系统部署指南
|
| 2 |
+
|
| 3 |
+
## 📋 部署概览
|
| 4 |
+
|
| 5 |
+
本系统支持多种部署方式:
|
| 6 |
+
- **Docker Compose 部署**(推荐)
|
| 7 |
+
- **本地 Python 部署**
|
| 8 |
+
- **Kubernetes 部署**
|
| 9 |
+
- **云服务部署**
|
| 10 |
+
|
| 11 |
+
## 🐳 Docker Compose 部署(推荐)
|
| 12 |
+
|
| 13 |
+
### 1. 环境准备
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
# 克隆项目
|
| 17 |
+
git clone <repository-url>
|
| 18 |
+
cd hf-repair-system
|
| 19 |
+
|
| 20 |
+
# 创建环境变量文件
|
| 21 |
+
cp .env.example .env
|
| 22 |
+
|
| 23 |
+
# 编辑环境变量
|
| 24 |
+
nano .env
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
### 2. 环境变量配置
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
# .env 文件内容
|
| 31 |
+
HF_TOKEN=your_huggingface_token_here
|
| 32 |
+
WEBHOOK_URL=your_webhook_url_here
|
| 33 |
+
REDIS_PASSWORD=your_redis_password_here
|
| 34 |
+
POSTGRES_PASSWORD=your_postgres_password_here
|
| 35 |
+
GRAFANA_PASSWORD=your_grafana_password_here
|
| 36 |
+
|
| 37 |
+
# 可选配置
|
| 38 |
+
LOG_LEVEL=INFO
|
| 39 |
+
CHECK_INTERVAL=60
|
| 40 |
+
MAX_CONCURRENT_SPACES=10
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
### 3. 启动系统
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
# 构建并启动所有服务
|
| 47 |
+
docker-compose up -d
|
| 48 |
+
|
| 49 |
+
# 查看服务状态
|
| 50 |
+
docker-compose ps
|
| 51 |
+
|
| 52 |
+
# 查看日志
|
| 53 |
+
docker-compose logs -f hf-repair-system
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### 4. 验证部署
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
# 检查健康状态
|
| 60 |
+
curl http://localhost:8080/health
|
| 61 |
+
|
| 62 |
+
# 访问 Web 界面
|
| 63 |
+
open http://localhost:3000
|
| 64 |
+
|
| 65 |
+
# 访问 Grafana 监控
|
| 66 |
+
open http://localhost:3001
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## 🐧 本地 Python 部署
|
| 70 |
+
|
| 71 |
+
### 1. 环境准备
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
# Python 3.11+
|
| 75 |
+
python3.11 -m venv venv
|
| 76 |
+
source venv/bin/activate # Linux/Mac
|
| 77 |
+
# 或
|
| 78 |
+
venv\Scripts\activate # Windows
|
| 79 |
+
|
| 80 |
+
# 安装依赖
|
| 81 |
+
pip install -r requirements.txt
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
### 2. 配置文件
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
# 复制配置模板
|
| 88 |
+
cp config_template.json config.json
|
| 89 |
+
|
| 90 |
+
# 编辑配置
|
| 91 |
+
nano config.json
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
### 3. 数据库初始化
|
| 95 |
+
|
| 96 |
+
```bash
|
| 97 |
+
# 创建数据目录
|
| 98 |
+
mkdir -p data logs backups
|
| 99 |
+
|
| 100 |
+
# 初始化数据库
|
| 101 |
+
python -m repair_system.db.init
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
### 4. 启动服务
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
# 启动主服务
|
| 108 |
+
python main.py
|
| 109 |
+
|
| 110 |
+
# 或使用 start 脚本
|
| 111 |
+
./start.sh
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
## ☸️ Kubernetes 部署
|
| 115 |
+
|
| 116 |
+
### 1. 准备 Kubernetes 配置
|
| 117 |
+
|
| 118 |
+
```yaml
|
| 119 |
+
# k8s/namespace.yaml
|
| 120 |
+
apiVersion: v1
|
| 121 |
+
kind: Namespace
|
| 122 |
+
metadata:
|
| 123 |
+
name: hf-repair-system
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
+
# k8s/configmap.yaml
|
| 127 |
+
apiVersion: v1
|
| 128 |
+
kind: ConfigMap
|
| 129 |
+
metadata:
|
| 130 |
+
name: hf-repair-config
|
| 131 |
+
namespace: hf-repair-system
|
| 132 |
+
data:
|
| 133 |
+
config.json: |
|
| 134 |
+
{
|
| 135 |
+
"system": {
|
| 136 |
+
"name": "HuggingFace Spaces 自动修复系统",
|
| 137 |
+
"log_level": "INFO"
|
| 138 |
+
},
|
| 139 |
+
"huggingface": {
|
| 140 |
+
"api_token": "${HF_TOKEN}",
|
| 141 |
+
"base_url": "https://huggingface.co/api"
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
---
|
| 146 |
+
# k8s/secret.yaml
|
| 147 |
+
apiVersion: v1
|
| 148 |
+
kind: Secret
|
| 149 |
+
metadata:
|
| 150 |
+
name: hf-repair-secrets
|
| 151 |
+
namespace: hf-repair-system
|
| 152 |
+
type: Opaque
|
| 153 |
+
data:
|
| 154 |
+
hf-token: <base64-encoded-token>
|
| 155 |
+
webhook-url: <base64-encoded-webhook-url>
|
| 156 |
+
|
| 157 |
+
---
|
| 158 |
+
# k8s/deployment.yaml
|
| 159 |
+
apiVersion: apps/v1
|
| 160 |
+
kind: Deployment
|
| 161 |
+
metadata:
|
| 162 |
+
name: hf-repair-system
|
| 163 |
+
namespace: hf-repair-system
|
| 164 |
+
spec:
|
| 165 |
+
replicas: 3
|
| 166 |
+
selector:
|
| 167 |
+
matchLabels:
|
| 168 |
+
app: hf-repair-system
|
| 169 |
+
template:
|
| 170 |
+
metadata:
|
| 171 |
+
labels:
|
| 172 |
+
app: hf-repair-system
|
| 173 |
+
spec:
|
| 174 |
+
containers:
|
| 175 |
+
- name: repair-system
|
| 176 |
+
image: hf-repair-system:latest
|
| 177 |
+
ports:
|
| 178 |
+
- containerPort: 8080
|
| 179 |
+
env:
|
| 180 |
+
- name: HF_TOKEN
|
| 181 |
+
valueFrom:
|
| 182 |
+
secretKeyRef:
|
| 183 |
+
name: hf-repair-secrets
|
| 184 |
+
key: hf-token
|
| 185 |
+
volumeMounts:
|
| 186 |
+
- name: config
|
| 187 |
+
mountPath: /app/config
|
| 188 |
+
- name: data
|
| 189 |
+
mountPath: /app/data
|
| 190 |
+
resources:
|
| 191 |
+
requests:
|
| 192 |
+
memory: "512Mi"
|
| 193 |
+
cpu: "250m"
|
| 194 |
+
limits:
|
| 195 |
+
memory: "1Gi"
|
| 196 |
+
cpu: "500m"
|
| 197 |
+
volumes:
|
| 198 |
+
- name: config
|
| 199 |
+
configMap:
|
| 200 |
+
name: hf-repair-config
|
| 201 |
+
- name: data
|
| 202 |
+
persistentVolumeClaim:
|
| 203 |
+
claimName: hf-repair-data
|
| 204 |
+
|
| 205 |
+
---
|
| 206 |
+
# k8s/service.yaml
|
| 207 |
+
apiVersion: v1
|
| 208 |
+
kind: Service
|
| 209 |
+
metadata:
|
| 210 |
+
name: hf-repair-service
|
| 211 |
+
namespace: hf-repair-system
|
| 212 |
+
spec:
|
| 213 |
+
selector:
|
| 214 |
+
app: hf-repair-system
|
| 215 |
+
ports:
|
| 216 |
+
- port: 80
|
| 217 |
+
targetPort: 8080
|
| 218 |
+
type: ClusterIP
|
| 219 |
+
|
| 220 |
+
---
|
| 221 |
+
# k8s/ingress.yaml
|
| 222 |
+
apiVersion: networking.k8s.io/v1
|
| 223 |
+
kind: Ingress
|
| 224 |
+
metadata:
|
| 225 |
+
name: hf-repair-ingress
|
| 226 |
+
namespace: hf-repair-system
|
| 227 |
+
annotations:
|
| 228 |
+
nginx.ingress.kubernetes.io/rewrite-target: /
|
| 229 |
+
spec:
|
| 230 |
+
rules:
|
| 231 |
+
- host: hf-repair.yourdomain.com
|
| 232 |
+
http:
|
| 233 |
+
paths:
|
| 234 |
+
- path: /
|
| 235 |
+
pathType: Prefix
|
| 236 |
+
backend:
|
| 237 |
+
service:
|
| 238 |
+
name: hf-repair-service
|
| 239 |
+
port:
|
| 240 |
+
number: 80
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
### 2. 部署到 Kubernetes
|
| 244 |
+
|
| 245 |
+
```bash
|
| 246 |
+
# 创建命名空间和配置
|
| 247 |
+
kubectl apply -f k8s/namespace.yaml
|
| 248 |
+
kubectl apply -f k8s/configmap.yaml
|
| 249 |
+
kubectl apply -f k8s/secret.yaml
|
| 250 |
+
|
| 251 |
+
# 部署应用
|
| 252 |
+
kubectl apply -f k8s/deployment.yaml
|
| 253 |
+
kubectl apply -f k8s/service.yaml
|
| 254 |
+
kubectl apply -f k8s/ingress.yaml
|
| 255 |
+
|
| 256 |
+
# 检查部署状态
|
| 257 |
+
kubectl get pods -n hf-repair-system
|
| 258 |
+
kubectl logs -f deployment/hf-repair-system -n hf-repair-system
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
## 🌩️ 云服务部署
|
| 262 |
+
|
| 263 |
+
### AWS 部署
|
| 264 |
+
|
| 265 |
+
```bash
|
| 266 |
+
# 使用 AWS CLI
|
| 267 |
+
aws ecs create-cluster --cluster-name hf-repair-cluster
|
| 268 |
+
aws ecs register-task-definition --cli-input-json file://task-definition.json
|
| 269 |
+
aws ecs create-service --cluster hf-repair-cluster --service-name hf-repair-service --task-definition hf-repair-task
|
| 270 |
+
|
| 271 |
+
# 设置 CloudWatch 日志
|
| 272 |
+
aws logs create-log-group --log-group-name /ecs/hf-repair-system
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
### Google Cloud 部署
|
| 276 |
+
|
| 277 |
+
```bash
|
| 278 |
+
# 使用 gcloud
|
| 279 |
+
gcloud run deploy hf-repair-system \
|
| 280 |
+
--image gcr.io/your-project/hf-repair-system:latest \
|
| 281 |
+
--platform managed \
|
| 282 |
+
--region us-central1 \
|
| 283 |
+
--allow-unauthenticated \
|
| 284 |
+
--set-env-vars HF_TOKEN=$HF_TOKEN
|
| 285 |
+
```
|
| 286 |
+
|
| 287 |
+
### Azure 部署
|
| 288 |
+
|
| 289 |
+
```bash
|
| 290 |
+
# 使用 Azure CLI
|
| 291 |
+
az container create \
|
| 292 |
+
--resource-group hf-repair-rg \
|
| 293 |
+
--name hf-repair-system \
|
| 294 |
+
--image your-registry/hf-repair-system:latest \
|
| 295 |
+
--environment-variables HF_TOKEN=$HF_TOKEN \
|
| 296 |
+
--ports 8080
|
| 297 |
+
```
|
| 298 |
+
|
| 299 |
+
## 🔧 配置管理
|
| 300 |
+
|
| 301 |
+
### 生产环境配置
|
| 302 |
+
|
| 303 |
+
```json
|
| 304 |
+
{
|
| 305 |
+
"system": {
|
| 306 |
+
"log_level": "WARNING",
|
| 307 |
+
"debug": false
|
| 308 |
+
},
|
| 309 |
+
"monitoring": {
|
| 310 |
+
"check_interval": 30,
|
| 311 |
+
"max_concurrent_spaces": 20
|
| 312 |
+
},
|
| 313 |
+
"performance": {
|
| 314 |
+
"max_concurrent_repairs": 10,
|
| 315 |
+
"worker_threads": 8
|
| 316 |
+
},
|
| 317 |
+
"database": {
|
| 318 |
+
"type": "postgresql",
|
| 319 |
+
"host": "postgres",
|
| 320 |
+
"port": 5432,
|
| 321 |
+
"database": "hf_repair",
|
| 322 |
+
"username": "hf_repair",
|
| 323 |
+
"password": "${POSTGRES_PASSWORD}"
|
| 324 |
+
}
|
| 325 |
+
}
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
### 开发环境配置
|
| 329 |
+
|
| 330 |
+
```json
|
| 331 |
+
{
|
| 332 |
+
"system": {
|
| 333 |
+
"log_level": "DEBUG",
|
| 334 |
+
"debug": true
|
| 335 |
+
},
|
| 336 |
+
"monitoring": {
|
| 337 |
+
"check_interval": 60,
|
| 338 |
+
"max_concurrent_spaces": 3
|
| 339 |
+
},
|
| 340 |
+
"file_operations": {
|
| 341 |
+
"git": {
|
| 342 |
+
"auto_commit": false,
|
| 343 |
+
"push_immediately": false
|
| 344 |
+
}
|
| 345 |
+
}
|
| 346 |
+
}
|
| 347 |
+
```
|
| 348 |
+
|
| 349 |
+
## 📊 监控和日志
|
| 350 |
+
|
| 351 |
+
### Prometheus 配置
|
| 352 |
+
|
| 353 |
+
```yaml
|
| 354 |
+
# monitoring/prometheus.yml
|
| 355 |
+
global:
|
| 356 |
+
scrape_interval: 15s
|
| 357 |
+
|
| 358 |
+
scrape_configs:
|
| 359 |
+
- job_name: 'hf-repair-system'
|
| 360 |
+
static_configs:
|
| 361 |
+
- targets: ['hf-repair-system:8080']
|
| 362 |
+
metrics_path: /metrics
|
| 363 |
+
scrape_interval: 30s
|
| 364 |
+
```
|
| 365 |
+
|
| 366 |
+
### Grafana 仪表板
|
| 367 |
+
|
| 368 |
+
1. 访问 http://localhost:3001
|
| 369 |
+
2. 导入预配置的仪表板
|
| 370 |
+
3. 设置数据源为 Prometheus
|
| 371 |
+
|
| 372 |
+
## 🔒 安全配置
|
| 373 |
+
|
| 374 |
+
### 网络安全
|
| 375 |
+
|
| 376 |
+
```bash
|
| 377 |
+
# 防火墙配置
|
| 378 |
+
ufw allow 22/tcp # SSH
|
| 379 |
+
ufw allow 80/tcp # HTTP
|
| 380 |
+
ufw allow 443/tcp # HTTPS
|
| 381 |
+
ufw deny 8080/tcp # 限制内部服务访问
|
| 382 |
+
```
|
| 383 |
+
|
| 384 |
+
### SSL/TLS 配置
|
| 385 |
+
|
| 386 |
+
```nginx
|
| 387 |
+
# nginx/ssl.conf
|
| 388 |
+
server {
|
| 389 |
+
listen 443 ssl http2;
|
| 390 |
+
server_name hf-repair.yourdomain.com;
|
| 391 |
+
|
| 392 |
+
ssl_certificate /path/to/certificate.crt;
|
| 393 |
+
ssl_certificate_key /path/to/private.key;
|
| 394 |
+
|
| 395 |
+
location / {
|
| 396 |
+
proxy_pass http://localhost:8080;
|
| 397 |
+
proxy_set_header Host $host;
|
| 398 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 399 |
+
}
|
| 400 |
+
}
|
| 401 |
+
```
|
| 402 |
+
|
| 403 |
+
## 🚀 性能优化
|
| 404 |
+
|
| 405 |
+
### 资源调优
|
| 406 |
+
|
| 407 |
+
```yaml
|
| 408 |
+
# docker-compose 性能配置
|
| 409 |
+
services:
|
| 410 |
+
hf-repair-system:
|
| 411 |
+
deploy:
|
| 412 |
+
resources:
|
| 413 |
+
limits:
|
| 414 |
+
cpus: '2.0'
|
| 415 |
+
memory: 2G
|
| 416 |
+
reservations:
|
| 417 |
+
cpus: '1.0'
|
| 418 |
+
memory: 1G
|
| 419 |
+
ulimits:
|
| 420 |
+
nofile:
|
| 421 |
+
soft: 65536
|
| 422 |
+
hard: 65536
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
### 缓存配置
|
| 426 |
+
|
| 427 |
+
```json
|
| 428 |
+
# config/redis.json
|
| 429 |
+
{
|
| 430 |
+
"redis": {
|
| 431 |
+
"host": "redis",
|
| 432 |
+
"port": 6379,
|
| 433 |
+
"password": "${REDIS_PASSWORD}",
|
| 434 |
+
"db": 0,
|
| 435 |
+
"max_connections": 20,
|
| 436 |
+
"socket_timeout": 5,
|
| 437 |
+
"socket_connect_timeout": 5
|
| 438 |
+
}
|
| 439 |
+
}
|
| 440 |
+
```
|
| 441 |
+
|
| 442 |
+
## 🔄 维护和更新
|
| 443 |
+
|
| 444 |
+
### 备份策略
|
| 445 |
+
|
| 446 |
+
```bash
|
| 447 |
+
#!/bin/bash
|
| 448 |
+
# scripts/backup.sh
|
| 449 |
+
|
| 450 |
+
# 备份数据库
|
| 451 |
+
pg_dump hf_repair > backup_$(date +%Y%m%d_%H%M%S).sql
|
| 452 |
+
|
| 453 |
+
# 备份配置文件
|
| 454 |
+
tar -czf config_backup_$(date +%Y%m%d_%H%M%S).tar.gz config/
|
| 455 |
+
|
| 456 |
+
# 清理超过 7 天的旧日志(注意:此命令会直接删除日志文件,而不是备份)
|
| 457 |
+
find logs/ -name "*.log" -mtime +7 -delete
|
| 458 |
+
```
|
| 459 |
+
|
| 460 |
+
### 更新流程
|
| 461 |
+
|
| 462 |
+
```bash
|
| 463 |
+
# 拉取最新代码
|
| 464 |
+
git pull origin main
|
| 465 |
+
|
| 466 |
+
# 重新构建
|
| 467 |
+
docker-compose build
|
| 468 |
+
|
| 469 |
+
# 仅重建并重启主服务(--no-deps:不重启其依赖的服务)
|
| 470 |
+
docker-compose up -d --no-deps hf-repair-system
|
| 471 |
+
|
| 472 |
+
# 验证更新
|
| 473 |
+
curl http://localhost:8080/health
|
| 474 |
+
```
|
| 475 |
+
|
| 476 |
+
## 📱 移动端和远程访问
|
| 477 |
+
|
| 478 |
+
### VPN 配置
|
| 479 |
+
|
| 480 |
+
```bash
|
| 481 |
+
# 使用 WireGuard 进行安全远程访问
|
| 482 |
+
wg-quick up wg0
|
| 483 |
+
```
|
| 484 |
+
|
| 485 |
+
### 移动端应用
|
| 486 |
+
|
| 487 |
+
1. 使用 PWA 技术
|
| 488 |
+
2. 推送通知集成
|
| 489 |
+
3. 离线状态支持
|
| 490 |
+
|
| 491 |
+
## 🤝 高可用部署
|
| 492 |
+
|
| 493 |
+
### 多节点部署
|
| 494 |
+
|
| 495 |
+
```yaml
|
| 496 |
+
# docker-compose.ha.yml
|
| 497 |
+
version: '3.8'
|
| 498 |
+
services:
|
| 499 |
+
hf-repair-system:
|
| 500 |
+
image: hf-repair-system:latest
|
| 501 |
+
deploy:
|
| 502 |
+
replicas: 3
|
| 503 |
+
update_config:
|
| 504 |
+
parallelism: 1
|
| 505 |
+
delay: 10s
|
| 506 |
+
restart_policy:
|
| 507 |
+
condition: on-failure
|
| 508 |
+
networks:
|
| 509 |
+
- hf-repair-network
|
| 510 |
+
```
|
| 511 |
+
|
| 512 |
+
### 负载均衡
|
| 513 |
+
|
| 514 |
+
```nginx
|
| 515 |
+
# nginx/load-balancer.conf
|
| 516 |
+
upstream hf_repair_backend {
|
| 517 |
+
server hf-repair-1:8080;
|
| 518 |
+
server hf-repair-2:8080;
|
| 519 |
+
server hf-repair-3:8080;
|
| 520 |
+
}
|
| 521 |
+
|
| 522 |
+
server {
|
| 523 |
+
listen 80;
|
| 524 |
+
location / {
|
| 525 |
+
proxy_pass http://hf_repair_backend;
|
| 526 |
+
proxy_set_header Host $host;
|
| 527 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 528 |
+
}
|
| 529 |
+
}
|
| 530 |
+
```
|
| 531 |
+
|
| 532 |
+
## 📋 部署检查清单
|
| 533 |
+
|
| 534 |
+
- [ ] 环境变量配置完成
|
| 535 |
+
- [ ] 数据库连接正常
|
| 536 |
+
- [ ] Redis 缓存运行正常
|
| 537 |
+
- [ ] HuggingFace Token 有效
|
| 538 |
+
- [ ] Webhook 配置正确
|
| 539 |
+
- [ ] 防火墙规则设置
|
| 540 |
+
- [ ] SSL 证书配置
|
| 541 |
+
- [ ] 监控系统运行
|
| 542 |
+
- [ ] 备份策略实施
|
| 543 |
+
- [ ] 日志轮转配置
|
| 544 |
+
- [ ] 健康检查正常
|
| 545 |
+
- [ ] 性能基准测试完成
|
| 546 |
+
|
| 547 |
+
## 🆘 故障排除
|
| 548 |
+
|
| 549 |
+
### 常见问题
|
| 550 |
+
|
| 551 |
+
1. **服务无法启动**
|
| 552 |
+
```bash
|
| 553 |
+
# 检查日志
|
| 554 |
+
docker-compose logs hf-repair-system
|
| 555 |
+
|
| 556 |
+
# 检查配置
|
| 557 |
+
python -m json.tool config.json
|
| 558 |
+
```
|
| 559 |
+
|
| 560 |
+
2. **数据库连接失败**
|
| 561 |
+
```bash
|
| 562 |
+
# 检查数据库状态
|
| 563 |
+
docker-compose exec postgres pg_isready -U hf_repair
|
| 564 |
+
|
| 565 |
+
# 检查网络连接
|
| 566 |
+
docker network ls
|
| 567 |
+
```
|
| 568 |
+
|
| 569 |
+
3. **API 限制**
|
| 570 |
+
```bash
|
| 571 |
+
# 检查 Token 权限
|
| 572 |
+
curl -H "Authorization: Bearer $HF_TOKEN" \
|
| 573 |
+
https://huggingface.co/api/whoami-v2
|
| 574 |
+
```
|
| 575 |
+
|
| 576 |
+
### 调试模式
|
| 577 |
+
|
| 578 |
+
```bash
|
| 579 |
+
# 启用调试模式
|
| 580 |
+
export LOG_LEVEL=DEBUG
|
| 581 |
+
export DEBUG=true
|
| 582 |
+
|
| 583 |
+
# 或修改配置文件
|
| 584 |
+
sed -i 's/"debug": false/"debug": true/' config.json
|
| 585 |
+
```
|
SYSTEM_SUMMARY.md
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 系统总结
|
| 2 |
+
|
| 3 |
+
我已经为您设计了一个完整的 HuggingFace Spaces 自动化监控和修复系统。以下是系统的核心组件和功能总结:
|
| 4 |
+
|
| 5 |
+
## 🎯 已完成的核心系统架构
|
| 6 |
+
|
| 7 |
+
### 1. **核心系统架构** (`core_system.py`)
|
| 8 |
+
- **数据模型**: SpaceStatus, ErrorType, RepairAction 等枚举
|
| 9 |
+
- **接口定义**: HuggingFaceAPI, ErrorAnalyzer, RepairStrategyEngine 等抽象接口
|
| 10 |
+
- **主要系统类**: HFSpaceMonitor, AutoRepairSystem, SmartRepairEngine
|
| 11 |
+
- **状态管理**: RepairHistory, StateManager 数据持久化
|
| 12 |
+
|
| 13 |
+
### 2. **HuggingFace API 客户端** (`huggingface_client.py`)
|
| 14 |
+
- **API 集成**: 完整的 HuggingFace Spaces API 封装
|
| 15 |
+
- **速率限制**: 内置请求限制器防止 API 超限
|
| 16 |
+
- **Webhook 处理**: 支持实时事件处理
|
| 17 |
+
- **会话管理**: 异步 HTTP 会话优化
|
| 18 |
+
|
| 19 |
+
### 3. **智能错误分析器** (`error_analyzer.py`)
|
| 20 |
+
- **多模式识别**: 正则表达式 + 上下文分析
|
| 21 |
+
- **错误分类**: 8 种主要错误类型识别
|
| 22 |
+
- **置信度评估**: 基于多种因素的智能评分
|
| 23 |
+
- **专门分析器**: Dockerfile、依赖、环境等专项分析
|
| 24 |
+
|
| 25 |
+
### 4. **配置管理** (`config_template.json`)
|
| 26 |
+
- **完整配置模板**: 涵盖所有系统组件
|
| 27 |
+
- **环境变量支持**: 安全的配置管理
|
| 28 |
+
- **性能调优**: 可配置的性能参数
|
| 29 |
+
- **监控和通知**: 完整的告警配置
|
| 30 |
+
|
| 31 |
+
### 5. **部署方案** (`docker-compose.yml`)
|
| 32 |
+
- **容器化部署**: 完整的 Docker Compose 配置
|
| 33 |
+
- **服务编排**: 主应用 + Redis + PostgreSQL + 监控
|
| 34 |
+
- **高可用**: 多副本 + 负载均衡配置
|
| 35 |
+
- **监控栈**: Prometheus + Grafana 集成
|
| 36 |
+
|
| 37 |
+
### 6. **部署指南** (`DEPLOYMENT.md`)
|
| 38 |
+
- **多部署方式**: Docker, 本地, K8s, 云服务
|
| 39 |
+
- **安全配置**: SSL/TLS, 防火墙, 权限管理
|
| 40 |
+
- **性能优化**: 资源调优, 缓存配置
|
| 41 |
+
- **故障排除**: 常见问题和调试方法
|
| 42 |
+
|
| 43 |
+
### 7. **使用示例** (`usage_examples.py`)
|
| 44 |
+
- **基本使用**: 简单的监控和修复流程
|
| 45 |
+
- **高级功能**: 自定义工作流, 批量处理
|
| 46 |
+
- **Webhook 集成**: 事件驱动的修复流程
|
| 47 |
+
- **性能监控**: 系统性能指标追踪
|
| 48 |
+
|
| 49 |
+
## 🏗️ 系统架构特点
|
| 50 |
+
|
| 51 |
+
### **模块化设计**
|
| 52 |
+
- 清晰的接口定义和组件分离
|
| 53 |
+
- 可插拔的错误分析器和修复策略
|
| 54 |
+
- 独立的配置和状态管理
|
| 55 |
+
|
| 56 |
+
### **智能错误处理**
|
| 57 |
+
- 多层次的错误识别机制
|
| 58 |
+
- 基于上下文的智能分析
|
| 59 |
+
- 置信度评估和风险控制
|
| 60 |
+
|
| 61 |
+
### **自动化工作流**
|
| 62 |
+
- 监控 → 分析 → 修复 → 验证的闭环
|
| 63 |
+
- 支持多种修复策略和回滚机制
|
| 64 |
+
- 异步处理和并发控制
|
| 65 |
+
|
| 66 |
+
### **可扩展性**
|
| 67 |
+
- 支持自定义错误模式和修复规则
|
| 68 |
+
- 插件化的分析器架构
|
| 69 |
+
- 灵活的配置和部署选项
|
| 70 |
+
|
| 71 |
+
## 🚀 核心功能
|
| 72 |
+
|
| 73 |
+
### **实时监控**
|
| 74 |
+
- HuggingFace Spaces 状态轮询
|
| 75 |
+
- 日志实时分析
|
| 76 |
+
- Webhook 事件处理
|
| 77 |
+
|
| 78 |
+
### **智能分析**
|
| 79 |
+
- 8 种错误类型自动识别
|
| 80 |
+
- 上下文感知的错误分析
|
| 81 |
+
- 置信度评估和优先级排序
|
| 82 |
+
|
| 83 |
+
### **自动修复**
|
| 84 |
+
- Dockerfile 语法修正
|
| 85 |
+
- 依赖版本和源地址调整
|
| 86 |
+
- 环境变量和配置优化
|
| 87 |
+
- 端口和权限问题处理
|
| 88 |
+
|
| 89 |
+
### **状态管理**
|
| 90 |
+
- 修复历史记录
|
| 91 |
+
- 回滚机制
|
| 92 |
+
- 性能指标追踪
|
| 93 |
+
|
| 94 |
+
## 📊 技术栈
|
| 95 |
+
|
| 96 |
+
- **核心**: Python 3.11+, asyncio, aiohttp
|
| 97 |
+
- **数据库**: SQLite (开发) / PostgreSQL (生产)
|
| 98 |
+
- **缓存**: Redis
|
| 99 |
+
- **监控**: Prometheus + Grafana
|
| 100 |
+
- **部署**: Docker + Kubernetes
|
| 101 |
+
- **配置**: JSON/YAML, 环境变量
|
| 102 |
+
|
| 103 |
+
## 🔧 部署方式
|
| 104 |
+
|
| 105 |
+
1. **Docker Compose** (推荐): 一键部署完整系统
|
| 106 |
+
2. **本地部署**: 直接 Python 运行
|
| 107 |
+
3. **Kubernetes**: 生产级容器编排
|
| 108 |
+
4. **云服务**: AWS, GCP, Azure 集成
|
| 109 |
+
|
| 110 |
+
## 📈 预期效果
|
| 111 |
+
|
| 112 |
+
- **监控效率**: 24/7 自动监控,快速发现问题
|
| 113 |
+
- **修复成功率**: 基于历史数据的智能修复策略
|
| 114 |
+
- **运维成本**: 大幅减少人工干预需求
|
| 115 |
+
- **系统稳定性**: 自动化故障恢复和优化
|
| 116 |
+
|
| 117 |
+
这个系统提供了完整的 HuggingFace Spaces 监控修复解决方案,具有高度的自动化、智能化和可扩展性。您可以根据具体需求调整配置和部署方式。
|
config_template.json
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
配置文件模板
|
| 3 |
+
系统配置和环境设置
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
{
|
| 7 |
+
"system": {
|
| 8 |
+
"name": "HuggingFace Spaces 自动修复系统",
|
| 9 |
+
"version": "1.0.0",
|
| 10 |
+
"log_level": "INFO",
|
| 11 |
+
"debug": false
|
| 12 |
+
},
|
| 13 |
+
|
| 14 |
+
"huggingface": {
|
| 15 |
+
"api_token": "${HF_TOKEN}",
|
| 16 |
+
"base_url": "https://huggingface.co/api",
|
| 17 |
+
"rate_limit": {
|
| 18 |
+
"requests_per_minute": 60,
|
| 19 |
+
"burst_limit": 10
|
| 20 |
+
},
|
| 21 |
+
"timeout": {
|
| 22 |
+
"api_timeout": 30,
|
| 23 |
+
"build_timeout": 1800,
|
| 24 |
+
"log_timeout": 60
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
|
| 28 |
+
"monitoring": {
|
| 29 |
+
"check_interval": 60,
|
| 30 |
+
"max_concurrent_spaces": 10,
|
| 31 |
+
"retry_attempts": 3,
|
| 32 |
+
"retry_delay": 30,
|
| 33 |
+
"health_check": {
|
| 34 |
+
"enabled": true,
|
| 35 |
+
"port_check": 7860,
|
| 36 |
+
"timeout": 10,
|
| 37 |
+
"retries": 3
|
| 38 |
+
}
|
| 39 |
+
},
|
| 40 |
+
|
| 41 |
+
"error_analysis": {
|
| 42 |
+
"confidence_threshold": 0.7,
|
| 43 |
+
"max_errors_per_analysis": 20,
|
| 44 |
+
"context_lines": 10,
|
| 45 |
+
"pattern_matching": {
|
| 46 |
+
"enabled": true,
|
| 47 |
+
"case_sensitive": false,
|
| 48 |
+
"max_matches": 50
|
| 49 |
+
},
|
| 50 |
+
"context_analysis": {
|
| 51 |
+
"enabled": true,
|
| 52 |
+
"analyzers": [
|
| 53 |
+
"dockerfile_syntax",
|
| 54 |
+
"dependency_install",
|
| 55 |
+
"environment_config",
|
| 56 |
+
"port_conflict",
|
| 57 |
+
"permission_error",
|
| 58 |
+
"network_connection",
|
| 59 |
+
"timeout_error",
|
| 60 |
+
"resource_exceeded"
|
| 61 |
+
]
|
| 62 |
+
}
|
| 63 |
+
},
|
| 64 |
+
|
| 65 |
+
"repair_strategies": {
|
| 66 |
+
"max_attempts_per_error": 3,
|
| 67 |
+
"success_rate_threshold": 0.6,
|
| 68 |
+
"risk_tolerance": "medium",
|
| 69 |
+
"backup_enabled": true,
|
| 70 |
+
"strategies": {
|
| 71 |
+
"dockerfile_syntax": {
|
| 72 |
+
"enabled": true,
|
| 73 |
+
"priority": 1,
|
| 74 |
+
"auto_apply": true,
|
| 75 |
+
"risk_level": "medium"
|
| 76 |
+
},
|
| 77 |
+
"dependency_install": {
|
| 78 |
+
"enabled": true,
|
| 79 |
+
"priority": 2,
|
| 80 |
+
"auto_apply": true,
|
| 81 |
+
"risk_level": "low",
|
| 82 |
+
"fallback_sources": [
|
| 83 |
+
"https://pypi.tuna.tsinghua.edu.cn/simple",
|
| 84 |
+
"https://mirrors.aliyun.com/pypi/simple",
|
| 85 |
+
"https://pypi.douban.com/simple"
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
"environment_config": {
|
| 89 |
+
"enabled": true,
|
| 90 |
+
"priority": 3,
|
| 91 |
+
"auto_apply": true,
|
| 92 |
+
"risk_level": "low"
|
| 93 |
+
},
|
| 94 |
+
"port_conflict": {
|
| 95 |
+
"enabled": true,
|
| 96 |
+
"priority": 4,
|
| 97 |
+
"auto_apply": true,
|
| 98 |
+
"risk_level": "medium",
|
| 99 |
+
"alternative_ports": [7861, 7862, 7863, 8080, 8000]
|
| 100 |
+
},
|
| 101 |
+
"permission_error": {
|
| 102 |
+
"enabled": true,
|
| 103 |
+
"priority": 5,
|
| 104 |
+
"auto_apply": false,
|
| 105 |
+
"risk_level": "high"
|
| 106 |
+
},
|
| 107 |
+
"network_connection": {
|
| 108 |
+
"enabled": true,
|
| 109 |
+
"priority": 6,
|
| 110 |
+
"auto_apply": false,
|
| 111 |
+
"risk_level": "medium"
|
| 112 |
+
},
|
| 113 |
+
"timeout_error": {
|
| 114 |
+
"enabled": true,
|
| 115 |
+
"priority": 7,
|
| 116 |
+
"auto_apply": true,
|
| 117 |
+
"risk_level": "low"
|
| 118 |
+
},
|
| 119 |
+
"resource_exceeded": {
|
| 120 |
+
"enabled": true,
|
| 121 |
+
"priority": 8,
|
| 122 |
+
"auto_apply": false,
|
| 123 |
+
"risk_level": "high"
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
},
|
| 127 |
+
|
| 128 |
+
"file_operations": {
|
| 129 |
+
"backup": {
|
| 130 |
+
"enabled": true,
|
| 131 |
+
"directory": "./backups",
|
| 132 |
+
"max_backups": 10,
|
| 133 |
+
"compression": true
|
| 134 |
+
},
|
| 135 |
+
"git": {
|
| 136 |
+
"enabled": true,
|
| 137 |
+
"auto_commit": true,
|
| 138 |
+
"commit_message_prefix": "[Auto-Repair]",
|
| 139 |
+
"branch_name": "auto-repair",
|
| 140 |
+
"push_immediately": true
|
| 141 |
+
},
|
| 142 |
+
"modification": {
|
| 143 |
+
"dry_run": false,
|
| 144 |
+
"confirm_changes": false,
|
| 145 |
+
"max_file_size_mb": 10
|
| 146 |
+
}
|
| 147 |
+
},
|
| 148 |
+
|
| 149 |
+
"database": {
|
| 150 |
+
"type": "sqlite",
|
| 151 |
+
"path": "./data/repair_system.db",
|
| 152 |
+
"backup_enabled": true,
|
| 153 |
+
"backup_interval_hours": 24,
|
| 154 |
+
"retention_days": 30
|
| 155 |
+
},
|
| 156 |
+
|
| 157 |
+
"notifications": {
|
| 158 |
+
"enabled": true,
|
| 159 |
+
"channels": {
|
| 160 |
+
"email": {
|
| 161 |
+
"enabled": false,
|
| 162 |
+
"smtp_server": "",
|
| 163 |
+
"smtp_port": 587,
|
| 164 |
+
"username": "",
|
| 165 |
+
"password": "",
|
| 166 |
+
"recipients": []
|
| 167 |
+
},
|
| 168 |
+
"webhook": {
|
| 169 |
+
"enabled": true,
|
| 170 |
+
"url": "${WEBHOOK_URL}",
|
| 171 |
+
"timeout": 10,
|
| 172 |
+
"retry_attempts": 3
|
| 173 |
+
},
|
| 174 |
+
"slack": {
|
| 175 |
+
"enabled": false,
|
| 176 |
+
"webhook_url": "",
|
| 177 |
+
"channel": "#alerts"
|
| 178 |
+
}
|
| 179 |
+
},
|
| 180 |
+
"events": {
|
| 181 |
+
"repair_success": true,
|
| 182 |
+
"repair_failed": true,
|
| 183 |
+
"space_error": true,
|
| 184 |
+
"build_completed": false,
|
| 185 |
+
"system_error": true
|
| 186 |
+
}
|
| 187 |
+
},
|
| 188 |
+
|
| 189 |
+
"security": {
|
| 190 |
+
"max_file_access_attempts": 3,
|
| 191 |
+
"allowed_file_extensions": [".py", ".js", ".json", ".yml", ".yaml", ".md", ".txt"],
|
| 192 |
+
"forbidden_paths": ["/etc", "/proc", "/sys", "/dev"],
|
| 193 |
+
"scan_for_secrets": true,
|
| 194 |
+
"secret_patterns": [
|
| 195 |
+
"password",
|
| 196 |
+
"token",
|
| 197 |
+
"api_key",
|
| 198 |
+
"secret",
|
| 199 |
+
"credential"
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
|
| 203 |
+
"performance": {
|
| 204 |
+
"max_concurrent_repairs": 5,
|
| 205 |
+
"queue_size": 100,
|
| 206 |
+
"worker_threads": 4,
|
| 207 |
+
"cache_size_mb": 100,
|
| 208 |
+
"timeout_per_repair": 600
|
| 209 |
+
},
|
| 210 |
+
|
| 211 |
+
"logging": {
|
| 212 |
+
"level": "INFO",
|
| 213 |
+
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 214 |
+
"file": "./logs/repair_system.log",
|
| 215 |
+
"max_file_size_mb": 50,
|
| 216 |
+
"backup_count": 5,
|
| 217 |
+
"console_output": true
|
| 218 |
+
}
|
| 219 |
+
}
|
core_system.py
ADDED
|
@@ -0,0 +1,538 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace Spaces 自动化监控修复系统
|
| 3 |
+
核心系统架构和主要类定义
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
import json
import logging
import os
import re
import sqlite3
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple, Union
|
| 17 |
+
|
| 18 |
+
# ============================================================================
|
| 19 |
+
# 数据模型和枚举
|
| 20 |
+
# ============================================================================
|
| 21 |
+
|
| 22 |
+
class SpaceStatus(Enum):
    """Enumeration of the states a Space can be in."""

    BUILDING = "building"
    RUNNING = "running"
    STOPPED = "stopped"
    ERROR = "error"
    UNKNOWN = "unknown"
|
| 29 |
+
|
| 30 |
+
class ErrorType(Enum):
    """Enumeration of the error categories the system distinguishes."""

    DOCKERFILE_SYNTAX = "dockerfile_syntax"
    DEPENDENCY_INSTALL = "dependency_install"
    ENVIRONMENT_CONFIG = "environment_config"
    PORT_CONFLICT = "port_conflict"
    PERMISSION_ERROR = "permission_error"
    NETWORK_CONNECTION = "network_connection"
    TIMEOUT_ERROR = "timeout_error"
    RESOURCE_EXCEEDED = "resource_exceeded"
    UNKNOWN_ERROR = "unknown_error"
|
| 41 |
+
|
| 42 |
+
class RepairAction(Enum):
    """Enumeration of the repair actions a strategy can request."""

    MODIFY_DOCKERFILE = "modify_dockerfile"
    UPDATE_DEPENDENCIES = "update_dependencies"
    FIX_ENVIRONMENT = "fix_environment"
    CHANGE_PORT = "change_port"
    SET_PERMISSIONS = "set_permissions"
    UPDATE_SOURCES = "update_sources"
    ADJUST_RESOURCES = "adjust_resources"
    RETRY_BUILD = "retry_build"
|
| 52 |
+
|
| 53 |
+
@dataclass
class SpaceInfo:
    """Descriptive record for one Space."""

    space_id: str
    name: str
    repository_url: str
    current_status: SpaceStatus
    last_updated: datetime
    # Path of the Dockerfile; defaults to the conventional file name.
    dockerfile_path: str = "Dockerfile"
    # Local checkout location; empty when not set.
    local_path: str = ""
|
| 63 |
+
|
| 64 |
+
@dataclass
class ErrorInfo:
    """One error identified in a log."""

    error_type: ErrorType
    message: str
    log_snippet: str
    # Position of the offending line in the log, when known.
    line_number: Optional[int] = None
    confidence: float = 0.0
    # Free-form extra data; default_factory gives each instance its own dict.
    context: Dict[str, Any] = field(default_factory=dict)
|
| 73 |
+
|
| 74 |
+
@dataclass
class RepairStrategy:
    """A proposed repair: what to do, how risky it is and how long it may take."""

    action: RepairAction
    description: str
    modifications: Dict[str, Any]
    risk_level: str  # low, medium, high
    success_rate: float = 0.0
    estimated_time: int = 0  # seconds
|
| 83 |
+
|
| 84 |
+
@dataclass
class RepairHistory:
    """One recorded repair attempt."""

    id: int
    space_id: str
    timestamp: datetime
    error_info: ErrorInfo
    strategy: RepairStrategy
    success: bool
    # Commit identifier associated with the repair, when there is one.
    git_commit: Optional[str] = None
    # Data kept for undoing the repair; format is not defined in this module.
    rollback_data: Optional[str] = None
|
| 95 |
+
|
| 96 |
+
# ============================================================================
|
| 97 |
+
# 核心接口定义
|
| 98 |
+
# ============================================================================
|
| 99 |
+
|
| 100 |
+
class HuggingFaceAPI(ABC):
    """Abstract interface to the HuggingFace API operations the system needs."""

    @abstractmethod
    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the current status of the Space."""
        pass

    @abstractmethod
    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return the Space's logs; ``lines`` limits how many are requested."""
        pass

    @abstractmethod
    async def trigger_rebuild(self, space_id: str) -> bool:
        """Trigger a rebuild of the Space."""
        pass

    @abstractmethod
    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return detailed information about the Space."""
        pass
|
| 122 |
+
|
| 123 |
+
class ErrorAnalyzer(ABC):
    """Abstract interface for log/error analyzers."""

    @abstractmethod
    async def analyze_logs(self, logs: str) -> List[ErrorInfo]:
        """Analyze the logs and return the errors identified in them."""
        pass

    @abstractmethod
    async def classify_error(self, error_message: str) -> ErrorType:
        """Classify an error message into an ``ErrorType``."""
        pass
|
| 135 |
+
|
| 136 |
+
class RepairStrategyEngine(ABC):
    """Abstract interface for engines that propose repair strategies."""

    @abstractmethod
    async def generate_strategy(self, error: ErrorInfo, space_info: SpaceInfo) -> Optional[RepairStrategy]:
        """Return a repair strategy for ``error``, or ``None`` if there is none."""
        pass

    @abstractmethod
    async def estimate_success(self, strategy: RepairStrategy) -> float:
        """Estimate the probability that ``strategy`` succeeds."""
        pass
|
| 148 |
+
|
| 149 |
+
class FileModifier(ABC):
    """Abstract interface for components that modify and back up files."""

    @abstractmethod
    async def apply_modifications(self, file_path: str, modifications: Dict[str, Any]) -> bool:
        """Apply ``modifications`` to the file at ``file_path``."""
        pass

    @abstractmethod
    async def backup_file(self, file_path: str) -> str:
        """Back up the file at ``file_path``."""
        pass
|
| 161 |
+
|
| 162 |
+
# ============================================================================
|
| 163 |
+
# 核心系统类
|
| 164 |
+
# ============================================================================
|
| 165 |
+
|
| 166 |
+
class HFSpaceMonitor:
    """Periodically poll Spaces and forward failing ones to an error handler.

    Args:
        hf_api: API client used to query status and logs.
        check_interval: Seconds to sleep between polling rounds.
        error_handler: Optional callable ``handler(space_id, logs)`` invoked
            for every Space found in the ERROR state. It may be a plain
            function or a coroutine function. When omitted, detected errors
            are only logged through the status line.
    """

    def __init__(self, hf_api: HuggingFaceAPI, check_interval: int = 60,
                 error_handler: Optional[Any] = None):
        self.hf_api = hf_api
        self.check_interval = check_interval
        self.error_handler = error_handler
        self.logger = logging.getLogger(__name__)
        self._running = False

    async def start_monitoring(self, space_ids: List[str]) -> None:
        """Poll ``space_ids`` in a loop until ``stop()`` is called."""
        self._running = True
        self.logger.info(f"开始监控 {len(space_ids)} 个 Space")

        while self._running:
            try:
                await self._check_spaces(space_ids)
                await asyncio.sleep(self.check_interval)
            except Exception as e:
                # Keep the loop alive; retry after a short pause.
                self.logger.error(f"监控过程出错: {e}")
                await asyncio.sleep(5)

    async def _check_spaces(self, space_ids: List[str]) -> None:
        """Check all Spaces concurrently."""
        tasks = [self._check_single_space(space_id) for space_id in space_ids]
        # return_exceptions=True so one failing check does not cancel the rest.
        await asyncio.gather(*tasks, return_exceptions=True)

    async def _check_single_space(self, space_id: str) -> None:
        """Check one Space; on ERROR fetch its logs and hand them off."""
        try:
            status = await self.hf_api.get_space_status(space_id)
            self.logger.info(f"Space {space_id} 状态: {status.value}")

            if status == SpaceStatus.ERROR:
                logs = await self.hf_api.get_space_logs(space_id)
                # Kick off error analysis / repair.
                await self._handle_error(space_id, logs)

        except Exception as e:
            self.logger.error(f"检查 Space {space_id} 失败: {e}")

    async def _handle_error(self, space_id: str, logs: str) -> None:
        """Pass a failing Space's logs to ``error_handler``, if one is set."""
        if self.error_handler is None:
            return
        result = self.error_handler(space_id, logs)
        # Support both sync callables and coroutine functions.
        if asyncio.iscoroutine(result):
            await result

    def stop(self) -> None:
        """Ask the monitoring loop to exit after the current iteration."""
        self._running = False
|
| 215 |
+
|
| 216 |
+
class IntelligentErrorAnalyzer:
    """Regex-driven log scanner that turns matching lines into ``ErrorInfo``."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.error_patterns = self._load_error_patterns()

    async def analyze_logs(self, logs: str) -> List[ErrorInfo]:
        """Scan ``logs`` line by line and return one ``ErrorInfo`` per match.

        Line numbers in the returned records are 1-based. A line can yield
        several records (one per error type whose pattern matches), but at
        most one per error type.
        """
        errors = []

        for line_num, line in enumerate(logs.split('\n'), 1):
            for error_type, patterns in self.error_patterns.items():
                for pattern in patterns:
                    if pattern['regex'].search(line):
                        error_info = ErrorInfo(
                            error_type=ErrorType(error_type),
                            message=line.strip(),
                            log_snippet=line.strip(),
                            line_number=line_num,
                            confidence=pattern['confidence'],
                            context=self._extract_context(line, logs, line_num)
                        )
                        errors.append(error_info)
                        # First matching pattern wins for this error type.
                        break

        return errors

    def _load_error_patterns(self) -> Dict[str, List[Dict]]:
        """Return the pattern table, keyed by ``ErrorType`` value.

        Each entry is a dict with a compiled ``regex`` and a ``confidence``.
        """
        return {
            "dockerfile_syntax": [
                {
                    "regex": re.compile(r"ERROR:.*failed to solve|failed to compute cache key"),
                    "confidence": 0.9
                }
            ],
            "dependency_install": [
                {
                    "regex": re.compile(r"ERROR:.*Could not find a version|No matching distribution"),
                    "confidence": 0.85
                }
            ],
            "environment_config": [
                {
                    "regex": re.compile(r"ERROR:.*environment variable|ENV not found"),
                    "confidence": 0.8
                }
            ]
            # More patterns...
        }

    def _extract_context(self, error_line: str, logs: str, line_num: int) -> Dict[str, Any]:
        """Return up to three log lines on each side of the error line.

        Args:
            error_line: Text of the matching line (not used for the lookup).
            logs: Full log text.
            line_num: 1-based number of the matching line, as produced by
                ``analyze_logs``.

        Returns:
            Dict with ``before`` (lines preceding the error), ``after``
            (lines following it) and ``full_context`` (before + error line
            + after).
        """
        lines = logs.split('\n')
        radius = 3
        # line_num is 1-based; convert once so the slices below line up.
        idx = line_num - 1
        start = max(0, idx - radius)
        end = min(len(lines), idx + radius + 1)

        return {
            "before": lines[start:idx],
            "after": lines[idx + 1:end],
            "full_context": lines[start:end]
        }
|
| 280 |
+
|
| 281 |
+
class SmartRepairEngine:
    """Rule-based engine that maps an error type to a repair strategy."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.repair_rules = self._load_repair_rules()

    async def generate_strategy(self, error: ErrorInfo, space_info: SpaceInfo) -> Optional[RepairStrategy]:
        """Build a strategy for ``error`` from the rule with the best success rate.

        Args:
            error: The error to repair; its ``error_type.value`` selects the
                rule list and its ``line_number`` fills an open ``target``.
            space_info: Accepted for interface compatibility; not used here.

        Returns:
            A ``RepairStrategy``, or ``None`` when no rule exists for the
            error type.
        """
        rules = self.repair_rules.get(error.error_type.value)
        if not rules:
            return None

        # Pick the rule with the highest success rate.
        best_rule = max(rules, key=lambda r: r['success_rate'])

        # Copy so the shared rule template is never mutated per error.
        modifications = dict(best_rule['modifications'])
        if 'target' in modifications and modifications['target'] is None:
            modifications['target'] = error.line_number

        return RepairStrategy(
            action=RepairAction(best_rule['action']),
            description=best_rule['description'],
            modifications=modifications,
            risk_level=best_rule['risk_level'],
            success_rate=best_rule['success_rate'],
            estimated_time=best_rule['estimated_time']
        )

    def _load_repair_rules(self) -> Dict[str, List[Dict]]:
        """Return the rule table, keyed by ``ErrorType`` value.

        A ``target`` of ``None`` in a rule's modifications is a placeholder
        that ``generate_strategy`` replaces with the error's line number;
        the rules are loaded before any error exists, so it cannot be
        filled in here.
        """
        return {
            "dockerfile_syntax": [
                {
                    "action": "modify_dockerfile",
                    "description": "修复 Dockerfile 语法错误",
                    "modifications": {
                        "type": "syntax_fix",
                        "target": None
                    },
                    "risk_level": "medium",
                    "success_rate": 0.7,
                    "estimated_time": 120
                }
            ],
            "dependency_install": [
                {
                    "action": "update_dependencies",
                    "description": "更新依赖版本或更换源地址",
                    "modifications": {
                        "type": "dependency_update",
                        "strategy": "version_bump_or_source_change"
                    },
                    "risk_level": "low",
                    "success_rate": 0.8,
                    "estimated_time": 300
                }
            ]
            # More rules...
        }
|
| 339 |
+
|
| 340 |
+
class AutoRepairSystem:
    """Main class wiring together monitor, analyzer, repair engine and state.

    NOTE(review): nothing in this class puts jobs on ``repair_queue``;
    presumably the monitor's error hook is meant to do so. Confirm the
    intended wiring.
    """

    def __init__(self, config_path: str = "config.json"):
        """Load configuration and construct all components.

        ``asyncio.Queue`` is created here. On Python versions before 3.10
        asyncio primitives capture the event loop at construction, so build
        this object from within the coroutine passed to ``asyncio.run``.
        """
        self.config = self._load_config(config_path)
        self.logger = self._setup_logging()

        # Components
        self.hf_api = HuggingFaceAPIClient(self.config['hf_token'])
        self.error_analyzer = IntelligentErrorAnalyzer()
        self.repair_engine = SmartRepairEngine()
        self.file_modifier = DockerfileModifier()
        self.state_manager = StateManager(self.config['db_path'])

        # Monitor
        self.monitor = HFSpaceMonitor(self.hf_api, self.config['check_interval'])

        # Repair queue
        self.repair_queue = asyncio.Queue()

    async def start(self, space_ids: List[str]) -> None:
        """Run the monitor and the repair worker until they finish."""
        self.logger.info("启动 HuggingFace Spaces 自动修复系统")

        monitor_task = asyncio.create_task(self.monitor.start_monitoring(space_ids))
        repair_task = asyncio.create_task(self._process_repair_queue())

        # Under normal operation neither task completes.
        await asyncio.gather(monitor_task, repair_task)

    async def _process_repair_queue(self) -> None:
        """Consume repair jobs forever, logging (not propagating) failures."""
        while True:
            repair_job = await self.repair_queue.get()
            try:
                await self._execute_repair(repair_job)
            except Exception as e:
                self.logger.error(f"修复任务执行失败: {e}")
            finally:
                # Keep the queue's unfinished-task counter accurate so
                # ``repair_queue.join()`` can be used by callers.
                self.repair_queue.task_done()

    async def _execute_repair(self, job: Dict[str, Any]) -> None:
        """Run one repair job.

        Args:
            job: Dict with ``space_id`` and ``error_info`` keys.

        The Dockerfile is backed up, modified, committed and rebuilt; when
        the modification reports failure or any step raises, the file is
        restored from the backup (if one was taken).
        """
        space_id = job['space_id']
        error_info = job['error_info']

        self.logger.info(f"开始修复 Space {space_id}")

        space_info = await self.hf_api.get_space_info(space_id)
        strategy = await self.repair_engine.generate_strategy(error_info, space_info)

        if not strategy:
            self.logger.warning(f"Space {space_id} 没有可用的修复策略")
            return

        # Bound before the try so the except branch can always reference it,
        # even when backup_file itself is what raised.
        backup_path = None
        try:
            backup_path = await self.file_modifier.backup_file(space_info.dockerfile_path)

            success = await self.file_modifier.apply_modifications(
                space_info.dockerfile_path,
                strategy.modifications
            )

            if success:
                git_commit = await self._commit_changes(space_id, strategy)
                await self.hf_api.trigger_rebuild(space_id)
                await self.state_manager.record_repair(
                    space_id, error_info, strategy, True, git_commit
                )
                self.logger.info(f"Space {space_id} 修复完成")
            else:
                await self._rollback(backup_path, space_info.dockerfile_path)

        except Exception as e:
            self.logger.error(f"修复失败: {e}")
            await self._rollback(backup_path, space_info.dockerfile_path)

    async def _commit_changes(self, space_id: str, strategy: RepairStrategy) -> Optional[str]:
        """Commit the applied changes and return the commit id.

        Placeholder, like the other stubs in this module: committing is not
        implemented yet, so this logs a warning and returns ``None``.
        TODO: implement the actual Git commit.
        """
        self.logger.warning(f"Space {space_id} 的 Git 提交尚未实现，跳过提交")
        return None

    async def _rollback(self, backup_path: Optional[str], target_path: str) -> None:
        """Restore ``target_path`` from ``backup_path``.

        Assumes ``backup_path`` is the filesystem path returned by the file
        modifier's ``backup_file`` — TODO confirm once that is implemented.
        Does nothing except log when no usable backup exists; copy errors
        are logged rather than raised so rollback never masks the original
        failure.
        """
        if not backup_path or not os.path.exists(backup_path):
            self.logger.warning(f"无法回滚 {target_path}: 没有可用的备份")
            return

        import shutil

        try:
            shutil.copyfile(backup_path, target_path)
            self.logger.info(f"已从备份恢复 {target_path}")
        except OSError as e:
            self.logger.error(f"回滚失败: {e}")

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Return the defaults overlaid with the JSON file at ``config_path``.

        Defaults for ``hf_token``, ``db_path`` and ``log_level`` come from
        the ``HF_TOKEN``, ``DB_PATH`` and ``LOG_LEVEL`` environment variables
        when set. A missing file is not an error.

        Raises:
            ValueError: If the file's top-level JSON value is not an object.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        default_config = {
            "hf_token": os.getenv("HF_TOKEN", ""),
            "check_interval": 60,
            "db_path": os.getenv("DB_PATH", "repair_system.db"),
            "max_retry": 3,
            "log_level": os.getenv("LOG_LEVEL", "INFO")
        }

        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                user_config = json.load(f)
            if not isinstance(user_config, dict):
                raise ValueError(f"{config_path} must contain a JSON object")
            default_config.update(user_config)

        return default_config

    def _setup_logging(self) -> logging.Logger:
        """Configure and return this module's logger.

        The level name from the config is matched case-insensitively and
        falls back to INFO when unknown. The stream handler is attached only
        once, so constructing several systems does not duplicate output.
        """
        logger = logging.getLogger(__name__)

        level = getattr(logging, str(self.config['log_level']).upper(), None)
        if not isinstance(level, int):
            level = logging.INFO
        logger.setLevel(level)

        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            handler.setFormatter(formatter)
            logger.addHandler(handler)

        return logger
|
| 457 |
+
|
| 458 |
+
# ============================================================================
|
| 459 |
+
# 具体实现类
|
| 460 |
+
# ============================================================================
|
| 461 |
+
|
| 462 |
+
class HuggingFaceAPIClient(HuggingFaceAPI):
    """HuggingFace API client.

    All four API methods are placeholders in this module: their bodies are
    empty, so each currently returns ``None`` regardless of its annotation.
    """

    def __init__(self, token: str):
        self.token = token
        self.base_url = "https://huggingface.co/api"
        # Bearer-token header built once from the supplied token.
        self.headers = {"Authorization": f"Bearer {token}"}

    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the Space's status (not implemented yet)."""
        # TODO: implement the actual API call.
        pass

    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return the Space's logs (not implemented yet)."""
        # TODO: implement the actual API call.
        pass

    async def trigger_rebuild(self, space_id: str) -> bool:
        """Trigger a rebuild of the Space (not implemented yet)."""
        # TODO: implement the actual API call.
        pass

    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return detailed Space information (not implemented yet)."""
        # TODO: implement the actual API call.
        pass
|
| 489 |
+
|
| 490 |
+
class DockerfileModifier(FileModifier):
    """Dockerfile modifier.

    Both methods are placeholders in this module: their bodies are empty,
    so each currently returns ``None``.
    """

    async def apply_modifications(self, file_path: str, modifications: Dict[str, Any]) -> bool:
        """Apply ``modifications`` to the Dockerfile (not implemented yet)."""
        # TODO: implement the actual Dockerfile modification logic.
        pass

    async def backup_file(self, file_path: str) -> str:
        """Back up the file (not implemented yet)."""
        # TODO: implement the file backup logic.
        pass
|
| 502 |
+
|
| 503 |
+
class StateManager:
    """Repair-history store.

    Persistence is not implemented in this module: ``_init_database``,
    ``record_repair`` and ``get_repair_history`` have empty bodies.
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        self._init_database()

    def _init_database(self) -> None:
        """Initialize the database (not implemented yet)."""
        # TODO: create the database schema.
        pass

    async def record_repair(self, space_id: str, error_info: ErrorInfo,
                            strategy: RepairStrategy, success: bool,
                            git_commit: Optional[str] = None) -> None:
        """Record one repair attempt (not implemented yet)."""
        # TODO: implement the history recording logic.
        pass

    async def get_repair_history(self, space_id: str) -> List[RepairHistory]:
        """Return the repair history of a Space (not implemented yet).

        Currently returns ``None`` because the body is empty.
        """
        # TODO: implement the history query logic.
        pass
|
| 526 |
+
|
| 527 |
+
if __name__ == "__main__":
    # Example entry point.

    # IDs of the Spaces to monitor.
    space_ids = [
        "your-username/your-space-1",
        "your-username/your-space-2"
    ]

    async def _main() -> None:
        """Build the system inside the running loop, then start it.

        AutoRepairSystem creates an asyncio.Queue in its constructor; on
        Python versions before 3.10 that queue binds to the loop current at
        construction, which would differ from the loop asyncio.run creates.
        """
        system = AutoRepairSystem()
        await system.start(space_ids)

    asyncio.run(_main())
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deployment plan and startup scripts
# Covers Docker deployment, local deployment and system service configuration
|
| 5 |
+
|
| 6 |
+
version: '3.8'
|
| 7 |
+
|
| 8 |
+
services:
|
| 9 |
+
# 主应用服务
|
| 10 |
+
hf-repair-system:
|
| 11 |
+
build:
|
| 12 |
+
context: .
|
| 13 |
+
dockerfile: Dockerfile.repair
|
| 14 |
+
container_name: hf-repair-system
|
| 15 |
+
restart: unless-stopped
|
| 16 |
+
environment:
|
| 17 |
+
- HF_TOKEN=${HF_TOKEN}
|
| 18 |
+
- WEBHOOK_URL=${WEBHOOK_URL}
|
| 19 |
+
- DB_PATH=/app/data/repair_system.db
|
| 20 |
+
- LOG_LEVEL=INFO
|
| 21 |
+
volumes:
|
| 22 |
+
- ./data:/app/data
|
| 23 |
+
- ./logs:/app/logs
|
| 24 |
+
- ./backups:/app/backups
|
| 25 |
+
- ./config:/app/config
|
| 26 |
+
ports:
|
| 27 |
+
- "8080:8080"
|
| 28 |
+
networks:
|
| 29 |
+
- hf-repair-network
|
| 30 |
+
depends_on:
|
| 31 |
+
- redis
|
| 32 |
+
- postgres
|
| 33 |
+
healthcheck:
|
| 34 |
+
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
|
| 35 |
+
interval: 30s
|
| 36 |
+
timeout: 10s
|
| 37 |
+
retries: 3
|
| 38 |
+
start_period: 40s
|
| 39 |
+
|
| 40 |
+
# Redis 缓存服务
|
| 41 |
+
redis:
|
| 42 |
+
image: redis:7-alpine
|
| 43 |
+
container_name: hf-repair-redis
|
| 44 |
+
restart: unless-stopped
|
| 45 |
+
ports:
|
| 46 |
+
- "6379:6379"
|
| 47 |
+
volumes:
|
| 48 |
+
- redis_data:/data
|
| 49 |
+
networks:
|
| 50 |
+
- hf-repair-network
|
| 51 |
+
command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD}
|
| 52 |
+
healthcheck:
|
| 53 |
+
test: ["CMD", "redis-cli", "--raw", "incr", "ping"]
|
| 54 |
+
interval: 30s
|
| 55 |
+
timeout: 10s
|
| 56 |
+
retries: 3
|
| 57 |
+
|
| 58 |
+
# PostgreSQL 数据库服务
|
| 59 |
+
postgres:
|
| 60 |
+
image: postgres:15-alpine
|
| 61 |
+
container_name: hf-repair-postgres
|
| 62 |
+
restart: unless-stopped
|
| 63 |
+
environment:
|
| 64 |
+
- POSTGRES_DB=hf_repair
|
| 65 |
+
- POSTGRES_USER=hf_repair
|
| 66 |
+
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
| 67 |
+
ports:
|
| 68 |
+
- "5432:5432"
|
| 69 |
+
volumes:
|
| 70 |
+
- postgres_data:/var/lib/postgresql/data
|
| 71 |
+
- ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
|
| 72 |
+
networks:
|
| 73 |
+
- hf-repair-network
|
| 74 |
+
healthcheck:
|
| 75 |
+
test: ["CMD-SHELL", "pg_isready -U hf_repair"]
|
| 76 |
+
interval: 30s
|
| 77 |
+
timeout: 10s
|
| 78 |
+
retries: 3
|
| 79 |
+
|
| 80 |
+
# Web 服务(可选的 Web 界面)
|
| 81 |
+
web-interface:
|
| 82 |
+
build:
|
| 83 |
+
context: ./web
|
| 84 |
+
dockerfile: Dockerfile
|
| 85 |
+
container_name: hf-repair-web
|
| 86 |
+
restart: unless-stopped
|
| 87 |
+
ports:
|
| 88 |
+
- "3000:3000"
|
| 89 |
+
environment:
|
| 90 |
+
- REACT_APP_API_URL=http://localhost:8080
|
| 91 |
+
networks:
|
| 92 |
+
- hf-repair-network
|
| 93 |
+
depends_on:
|
| 94 |
+
- hf-repair-system
|
| 95 |
+
|
| 96 |
+
# 监控服务
|
| 97 |
+
prometheus:
|
| 98 |
+
image: prom/prometheus:latest
|
| 99 |
+
container_name: hf-repair-prometheus
|
| 100 |
+
restart: unless-stopped
|
| 101 |
+
ports:
|
| 102 |
+
- "9090:9090"
|
| 103 |
+
volumes:
|
| 104 |
+
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
|
| 105 |
+
- prometheus_data:/prometheus
|
| 106 |
+
networks:
|
| 107 |
+
- hf-repair-network
|
| 108 |
+
command:
|
| 109 |
+
- '--config.file=/etc/prometheus/prometheus.yml'
|
| 110 |
+
- '--storage.tsdb.path=/prometheus'
|
| 111 |
+
- '--web.console.libraries=/etc/prometheus/console_libraries'
|
| 112 |
+
- '--web.console.templates=/etc/prometheus/consoles'
|
| 113 |
+
|
| 114 |
+
# Grafana 可视化
|
| 115 |
+
grafana:
|
| 116 |
+
image: grafana/grafana:latest
|
| 117 |
+
container_name: hf-repair-grafana
|
| 118 |
+
restart: unless-stopped
|
| 119 |
+
ports:
|
| 120 |
+
- "3001:3000"
|
| 121 |
+
environment:
|
| 122 |
+
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
|
| 123 |
+
volumes:
|
| 124 |
+
- grafana_data:/var/lib/grafana
|
| 125 |
+
- ./monitoring/grafana:/etc/grafana/provisioning
|
| 126 |
+
networks:
|
| 127 |
+
- hf-repair-network
|
| 128 |
+
depends_on:
|
| 129 |
+
- prometheus
|
| 130 |
+
|
| 131 |
+
volumes:
|
| 132 |
+
redis_data:
|
| 133 |
+
driver: local
|
| 134 |
+
postgres_data:
|
| 135 |
+
driver: local
|
| 136 |
+
prometheus_data:
|
| 137 |
+
driver: local
|
| 138 |
+
grafana_data:
|
| 139 |
+
driver: local
|
| 140 |
+
|
| 141 |
+
networks:
|
| 142 |
+
hf-repair-network:
|
| 143 |
+
driver: bridge
|
error_analyzer.py
ADDED
|
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
错误分析器实现
|
| 3 |
+
负责分析日志、识别错误类型和根本原因
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import asyncio
|
| 8 |
+
import logging
|
| 9 |
+
from typing import List, Dict, Any, Tuple, Optional
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
from core_system import ErrorAnalyzer, ErrorInfo, ErrorType
|
| 14 |
+
|
| 15 |
+
@dataclass
class ErrorPattern:
    """Definition of one recognisable error pattern."""

    regex: re.Pattern
    error_type: ErrorType
    confidence: float
    description: str
    common_causes: List[str]
    suggested_fixes: List[str]
|
| 24 |
+
|
| 25 |
+
class LogAnalyzer:
    """Helpers for slicing log text around errors.

    Line positions in this class are 0-based indexes into
    ``logs.split('\\n')``.
    NOTE(review): ``core_system.IntelligentErrorAnalyzer`` produces 1-based
    ``line_number`` values; confirm which convention callers pass here.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def extract_error_context(self, logs: str, error_line: int, context_size: int = 5) -> Dict[str, Any]:
        """Return the lines surrounding ``error_line``.

        Args:
            logs: Full log text.
            error_line: 0-based index of the error line.
            context_size: Number of lines to include on each side.

        Returns:
            Dict with ``before``, ``error_line`` (empty string when the index
            is past the end), ``after``, ``full_context`` and
            ``relative_line`` (offset of the error line inside
            ``full_context``).
        """
        lines = logs.split('\n')
        start = max(0, error_line - context_size)
        end = min(len(lines), error_line + context_size + 1)

        return {
            "before": lines[start:error_line],
            "error_line": lines[error_line] if error_line < len(lines) else "",
            "after": lines[error_line + 1:end],
            "full_context": lines[start:end],
            "relative_line": error_line - start
        }

    def detect_error_sequence(self, logs: str) -> List[str]:
        """Return, in order, the stripped lines that mention an error keyword.

        A line qualifies when its lower-cased text contains any of
        ``error``, ``failed``, ``exception`` or ``traceback``.
        """
        keywords = ('error', 'failed', 'exception', 'traceback')
        return [
            line.strip()
            for line in logs.split('\n')
            if any(keyword in line.lower() for keyword in keywords)
        ]

    def find_related_errors(self, logs: str, main_error: ErrorInfo) -> List[ErrorInfo]:
        """Return other lines containing "error" near ``main_error``.

        Looks at indexes ``line_number - 10`` up to (not including)
        ``line_number + 10``, skips the main error's own line and reports
        each hit as an ``UNKNOWN_ERROR`` with confidence 0.5. Returns an
        empty list when ``main_error.line_number`` is ``None``.
        """
        related_errors = []

        # Explicit None check: 0 is a valid position (the first log line).
        if main_error.line_number is None:
            return related_errors

        lines = logs.split('\n')
        start = max(0, main_error.line_number - 10)
        end = min(len(lines), main_error.line_number + 10)

        for i, line in enumerate(lines[start:end], start):
            if i != main_error.line_number and 'error' in line.lower():
                related_error = ErrorInfo(
                    error_type=ErrorType.UNKNOWN_ERROR,
                    message=line.strip(),
                    log_snippet=line.strip(),
                    line_number=i,
                    confidence=0.5
                )
                related_errors.append(related_error)

        return related_errors
|
| 78 |
+
|
| 79 |
+
class IntelligentErrorAnalyzer(ErrorAnalyzer):
    """Detect and classify errors in build/runtime logs.

    Two passes are combined: a per-line regex pass driven by
    ``self.error_patterns`` and a whole-log pass delegated to one context
    analyzer per error type. Results are de-duplicated and their confidence
    is adjusted before being returned.
    """

    # Lower-case technology names whose presence in a message raises confidence.
    _TECH_KEYWORDS = ('docker', 'pip', 'npm', 'apt', 'python', 'node')

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.log_analyzer = LogAnalyzer()
        self.error_patterns = self._initialize_patterns()
        # The analyzer classes are defined later in this module; they are only
        # looked up when an instance is created, so the ordering is fine.
        self.context_analyzers = {
            ErrorType.DOCKERFILE_SYNTAX: DockerfileSyntaxAnalyzer(),
            ErrorType.DEPENDENCY_INSTALL: DependencyErrorAnalyzer(),
            ErrorType.ENVIRONMENT_CONFIG: EnvironmentErrorAnalyzer(),
            ErrorType.PORT_CONFLICT: PortErrorAnalyzer(),
            ErrorType.PERMISSION_ERROR: PermissionErrorAnalyzer(),
            ErrorType.NETWORK_CONNECTION: NetworkErrorAnalyzer(),
            ErrorType.TIMEOUT_ERROR: TimeoutErrorAnalyzer(),
            ErrorType.RESOURCE_EXCEEDED: ResourceErrorAnalyzer()
        }

    async def analyze_logs(self, logs: str) -> List[ErrorInfo]:
        """Analyze ``logs`` and return the detected errors.

        Runs the regex pass, then the context analyzers, then de-duplicates
        and rescales confidence.
        """
        errors = []

        # Fast path: per-line regex matching.
        errors.extend(await self._pattern_matching(logs))

        # Deeper pass: whole-log, type-specific analyzers.
        errors.extend(await self._context_analysis(logs))

        deduplicated_errors = self._deduplicate_errors(errors)
        return self._calculate_final_confidence(deduplicated_errors, logs)

    async def classify_error(self, error_message: str) -> ErrorType:
        """Return the type of the highest-confidence pattern matching the message.

        Falls back to ``ErrorType.UNKNOWN_ERROR`` when nothing matches.
        """
        max_confidence = 0.0
        best_type = ErrorType.UNKNOWN_ERROR

        for pattern in self.error_patterns:
            if pattern.confidence > max_confidence and pattern.regex.search(error_message):
                max_confidence = pattern.confidence
                best_type = pattern.error_type

        return best_type

    async def _pattern_matching(self, logs: str) -> List[ErrorInfo]:
        """Match every known pattern against every log line.

        Line numbers in the returned errors are 1-based. A line matching
        several patterns yields one error per pattern.
        """
        errors = []

        for line_num, line in enumerate(logs.split('\n'), 1):
            for pattern in self.error_patterns:
                if not pattern.regex.search(line):
                    continue
                text = line.strip()
                errors.append(
                    ErrorInfo(
                        error_type=pattern.error_type,
                        message=text,
                        log_snippet=text,
                        line_number=line_num,
                        confidence=pattern.confidence,
                        context={
                            "description": pattern.description,
                            "common_causes": pattern.common_causes,
                            "suggested_fixes": pattern.suggested_fixes
                        }
                    )
                )

        return errors

    async def _context_analysis(self, logs: str) -> List[ErrorInfo]:
        """Run every context analyzer over the whole log.

        A failing analyzer is logged and skipped so that one broken analyzer
        does not discard the results of the others.
        """
        errors = []

        for error_type, analyzer in self.context_analyzers.items():
            try:
                errors.extend(await analyzer.analyze(logs))
            except Exception as e:
                self.logger.error(f"上下文分析器 {error_type} 执行失败: {e}")

        return errors

    def _deduplicate_errors(self, errors: List[ErrorInfo]) -> List[ErrorInfo]:
        """Drop repeated errors, keeping the first occurrence of each.

        Errors with a line number are keyed by (line number, type). Errors
        without one are additionally keyed by message; otherwise distinct
        findings of the same type (for example several different timeouts
        reported by a context analyzer) would collapse into a single entry.
        """
        seen = set()
        deduplicated = []

        for error in errors:
            if error.line_number is None:
                key = (None, error.error_type, error.message)
            else:
                key = (error.line_number, error.error_type)
            if key not in seen:
                seen.add(key)
                deduplicated.append(error)

        return deduplicated

    def _calculate_final_confidence(self, errors: List[ErrorInfo], logs: str) -> List[ErrorInfo]:
        """Adjust each error's confidence in place and return the same list.

        Two boosts are added to the base confidence, capped at 1.0:
        * 0.1 per technology keyword found in the message;
        * up to 0.2 for recency -- errors nearer the end of the log score
          higher. Errors without a line number get the neutral half boost.
        """
        # str.split always returns at least one element, so this is >= 1.
        total_lines = len(logs.split('\n'))

        for error in errors:
            message = error.message.lower()
            keyword_boost = sum(0.1 for keyword in self._TECH_KEYWORDS if keyword in message)

            if error.line_number is None:
                position_factor = 0.5
            else:
                position_factor = min(1.0, max(0.0, error.line_number / total_lines))
            # Later position in the log => more recent => larger boost.
            recent_boost = position_factor * 0.2

            error.confidence = min(1.0, error.confidence + keyword_boost + recent_boost)

        return errors

    def _initialize_patterns(self) -> List[ErrorPattern]:
        """Build the list of built-in regex error signatures."""
        patterns = [
            # Dockerfile syntax errors
            ErrorPattern(
                regex=re.compile(r"failed to solve:.*syntax error|Dockerfile:\d+"),
                error_type=ErrorType.DOCKERFILE_SYNTAX,
                confidence=0.9,
                description="Dockerfile 语法错误",
                common_causes=["命令格式错误", "参数缺失", "缩进问题"],
                suggested_fixes=["检查命令语法", "验证参数", "修复格式"]
            ),

            # Python dependency installation failures
            ErrorPattern(
                regex=re.compile(r"ERROR: Could not find a version|No matching distribution|pip install failed"),
                error_type=ErrorType.DEPENDENCY_INSTALL,
                confidence=0.85,
                description="Python 依赖安装失败",
                common_causes=["版本不存在", "网络问题", "依赖冲突"],
                suggested_fixes=["检查版本", "更换源", "解决冲突"]
            ),

            # Node.js dependency installation failures
            ErrorPattern(
                regex=re.compile(r"npm ERR!|yarn error|failed to install node packages"),
                error_type=ErrorType.DEPENDENCY_INSTALL,
                confidence=0.85,
                description="Node.js 依赖安装失败",
                common_causes=["版本冲突", "网络问题", "缓存问题"],
                suggested_fixes=["清理缓存", "检查版本", "使用国内源"]
            ),

            # Environment variable configuration problems
            ErrorPattern(
                regex=re.compile(r"Environment variable.*not found|ENV.*undefined|getenv.*None"),
                error_type=ErrorType.ENVIRONMENT_CONFIG,
                confidence=0.8,
                description="环境变量配置问题",
                common_causes=["变量未设置", "配置文件缺失", "权限问题"],
                suggested_fixes=["设置环境变量", "创建配置文件", "检查权限"]
            ),

            # Port conflicts
            ErrorPattern(
                regex=re.compile(r"Address already in use|Port.*already used|EADDRINUSE"),
                error_type=ErrorType.PORT_CONFLICT,
                confidence=0.95,
                description="端口冲突",
                common_causes=["端口被占用", "权限不足", "配置错误"],
                suggested_fixes=["更换端口", "杀死占用进程", "修改配置"]
            ),

            # Permission problems
            ErrorPattern(
                regex=re.compile(r"Permission denied|Operation not permitted|EACCES"),
                error_type=ErrorType.PERMISSION_ERROR,
                confidence=0.9,
                description="权限不足",
                common_causes=["文件权限", "用户权限", "目录权限"],
                suggested_fixes=["修改权限", "使用 sudo", "更改用户"]
            ),

            # Network connectivity problems
            ErrorPattern(
                regex=re.compile(r"Connection refused|Network unreachable|Timeout|DNS resolution failed"),
                error_type=ErrorType.NETWORK_CONNECTION,
                confidence=0.8,
                description="网络连接问题",
                common_causes=["网络不可达", "DNS问题", "防火墙限制"],
                suggested_fixes=["检查网络", "配置DNS", "调整防火墙"]
            ),

            # Timeouts
            ErrorPattern(
                regex=re.compile(r"timeout|timed out|deadline exceeded"),
                error_type=ErrorType.TIMEOUT_ERROR,
                confidence=0.75,
                description="操作超时",
                common_causes=["操作时间过长", "资源不足", "网络延迟"],
                # The original omitted "=" here, which is a SyntaxError.
                suggested_fixes=["增加超时时间", "优化性能", "检查资源"]
            ),

            # Resource limits exceeded
            ErrorPattern(
                regex=re.compile(r"out of memory|disk full|CPU limit exceeded|resource exceeded"),
                error_type=ErrorType.RESOURCE_EXCEEDED,
                confidence=0.9,
                description="资源超限",
                common_causes=["内存不足", "磁盘满", "CPU限制"],
                suggested_fixes=["清理资源", "增加配额", "优化代码"]
            )
        ]

        return patterns
|
| 300 |
+
|
| 301 |
+
class ContextAnalyzer(ABC):
    """Abstract base for analyzers that scan a whole log for one error family.

    ``ABC`` and ``abstractmethod`` come from the standard ``abc`` module,
    imported at the top of this file.
    """

    @abstractmethod
    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Scan ``logs`` and return the errors found (an empty list if none).

        Declared abstract so that a subclass which forgets to implement it
        fails at instantiation instead of silently returning ``None``.
        """
|
| 307 |
+
|
| 308 |
+
class DockerfileSyntaxAnalyzer(ContextAnalyzer):
    """Context analyzer for Dockerfile instruction problems."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report one error per Dockerfile signature found anywhere in ``logs``.

        Matching is case-insensitive; every finding has confidence 0.8 and an
        empty log snippet.
        """
        # (regex, message) pairs, one per Dockerfile instruction checked.
        signatures = (
            (r"FROM.*invalid", "FROM 指令格式错误"),
            (r"RUN.*command not found", "RUN 命令执行失败"),
            (r"COPY.*No such file", "COPY 源文件不存在"),
            (r"EXPOSE.*invalid port", "EXPOSE 端口格式错误"),
            (r"ENV.*invalid format", "ENV 环境变量格式错误"),
        )

        return [
            ErrorInfo(
                error_type=ErrorType.DOCKERFILE_SYNTAX,
                message=label,
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "dockerfile_syntax"}
            )
            for expression, label in signatures
            if re.search(expression, logs, re.IGNORECASE)
        ]
|
| 335 |
+
|
| 336 |
+
class DependencyErrorAnalyzer(ContextAnalyzer):
    """Context analyzer for pip and npm dependency problems."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report one error per dependency signature found anywhere in ``logs``.

        Python (pip) signatures are checked before Node.js (npm) ones.
        Matching is case-insensitive; every finding has confidence 0.75.
        """
        # (regex, message) pairs: pip signatures first, then npm.
        signatures = (
            (r"pip.*Requirement already satisfied", "依赖重复安装"),
            (r"pip.*Could not find.*version", "依赖版本不存在"),
            (r"pip.*incompatible dependencies", "依赖版本冲突"),
            (r"npm.*peer dependency", "peer 依赖问题"),
            (r"npm.*version mismatch", "版本不匹配"),
            (r"npm.*cache problem", "npm 缓存问题"),
        )

        findings = []
        for expression, label in signatures:
            if re.search(expression, logs, re.IGNORECASE) is None:
                continue
            findings.append(
                ErrorInfo(
                    error_type=ErrorType.DEPENDENCY_INSTALL,
                    message=label,
                    log_snippet="",
                    confidence=0.75,
                    context={"analysis_type": "dependency"}
                )
            )
        return findings
|
| 370 |
+
|
| 371 |
+
class EnvironmentErrorAnalyzer(ContextAnalyzer):
    """Context analyzer for environment configuration problems."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report a PATH configuration error when ``logs`` mention one.

        Returns a single finding (confidence 0.8) if the case-insensitive
        expression ``PATH.*not found`` matches, otherwise an empty list.
        """
        if not re.search(r"PATH.*not found", logs, re.IGNORECASE):
            return []

        path_issue = ErrorInfo(
            error_type=ErrorType.ENVIRONMENT_CONFIG,
            message="PATH 环境变量配置问题",
            log_snippet="",
            confidence=0.8,
            context={"analysis_type": "environment", "var_type": "PATH"}
        )
        return [path_issue]
|
| 389 |
+
|
| 390 |
+
class PortErrorAnalyzer(ContextAnalyzer):
    """Context analyzer for problems with the Spaces default port."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report a port 7860 problem when ``logs`` suggest one.

        A finding (confidence 0.9) is produced only if the log both mentions
        port 7860 and contains ``error`` or ``failed`` somewhere; both checks
        are case-insensitive and independent of each other's position.
        """
        if not re.search(r"port.*7860", logs, re.IGNORECASE):
            return []
        if not re.search(r"error|failed", logs, re.IGNORECASE):
            return []

        return [
            ErrorInfo(
                error_type=ErrorType.PORT_CONFLICT,
                message="HuggingFace Spaces 默认端口 7860 问题",
                log_snippet="",
                confidence=0.9,
                context={"analysis_type": "port", "port": "7860"}
            )
        ]
|
| 408 |
+
|
| 409 |
+
class PermissionErrorAnalyzer(ContextAnalyzer):
    """Context analyzer for permission problems on script files."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report a script permission error when ``logs`` contain one.

        A finding (confidence 0.8) is produced when a line contains
        ``permission denied`` followed by a ``.py``, ``.js`` or ``.sh`` name
        (case-insensitive). Returns an empty list otherwise.
        """
        errors = []

        # The extensions are grouped on purpose: without the group, "|" binds
        # loosest and the expression would also match any bare ".js" or ".sh"
        # anywhere in the log, with no "permission denied" in sight.
        if re.search(r"permission denied.*\.(?:py|js|sh)", logs, re.IGNORECASE):
            errors.append(
                ErrorInfo(
                    error_type=ErrorType.PERMISSION_ERROR,
                    message="脚本文件权限问题",
                    log_snippet="",
                    confidence=0.8,
                    context={"analysis_type": "permission", "file_type": "script"}
                )
            )

        return errors
|
| 427 |
+
|
| 428 |
+
class NetworkErrorAnalyzer(ContextAnalyzer):
    """Context analyzer for connectivity problems with well-known services."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report one error per service connectivity signature found in ``logs``.

        Matching is case-insensitive; every finding has confidence 0.7 and
        carries the affected service name in ``context["service"]``.
        """
        # (regex, message, service). The service name is spelled out because
        # deriving it with pattern.split('.')[0] leaves the regex escape in
        # place and produces names such as "github\".
        # NOTE(review): "npm\.registry" will not match the usual
        # "registry.npmjs.org" host name -- confirm the intended log format.
        network_indicators = [
            (r"github\.com.*timeout", "GitHub 连接超时", "github"),
            (r"pypi\.org.*failed", "PyPI 连接失败", "pypi"),
            (r"npm\.registry.*error", "npm registry 连接错误", "npm")
        ]

        errors = []
        for pattern, description, service in network_indicators:
            if re.search(pattern, logs, re.IGNORECASE):
                errors.append(
                    ErrorInfo(
                        error_type=ErrorType.NETWORK_CONNECTION,
                        message=description,
                        log_snippet="",
                        confidence=0.7,
                        context={"analysis_type": "network", "service": service}
                    )
                )

        return errors
|
| 453 |
+
|
| 454 |
+
class TimeoutErrorAnalyzer(ContextAnalyzer):
    """Context analyzer that distinguishes build, install and download timeouts."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report one error per timeout signature found anywhere in ``logs``.

        Matching is case-insensitive; every finding has confidence 0.8 and
        records the operation (the text before the first ``.`` of the regex)
        in ``context["operation"]``.
        """
        signatures = (
            (r"build.*timeout", "构建超时"),
            (r"install.*timeout", "安装超时"),
            (r"download.*timeout", "下载超时"),
        )

        findings = []
        for expression, label in signatures:
            if re.search(expression, logs, re.IGNORECASE) is None:
                continue
            operation = expression.split('.')[0]
            findings.append(
                ErrorInfo(
                    error_type=ErrorType.TIMEOUT_ERROR,
                    message=label,
                    log_snippet="",
                    confidence=0.8,
                    context={"analysis_type": "timeout", "operation": operation}
                )
            )
        return findings
|
| 479 |
+
|
| 480 |
+
class ResourceErrorAnalyzer(ContextAnalyzer):
    """Context analyzer for memory, disk and CPU limit problems."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Report one error per resource signature found anywhere in ``logs``.

        Matching is case-insensitive; every finding has confidence 0.8 and
        records the resource (the text before the first ``.`` of the regex)
        in ``context["resource_type"]``.
        """
        signatures = (
            (r"memory.*limit", "内存限制"),
            (r"disk.*space", "磁盘空间不足"),
            (r"cpu.*quota", "CPU 配额限制"),
        )

        return [
            ErrorInfo(
                error_type=ErrorType.RESOURCE_EXCEEDED,
                message=label,
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "resource", "resource_type": expression.split('.')[0]}
            )
            for expression, label in signatures
            if re.search(expression, logs, re.IGNORECASE)
        ]
|
huggingface_client.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace Spaces API 客户端实现
|
| 3 |
+
负责与 HuggingFace API 的所有交互
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import aiohttp
|
| 7 |
+
import asyncio
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from typing import Dict, List, Optional, Any
|
| 11 |
+
from dataclasses import asdict
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
from core_system import HuggingFaceAPI, SpaceInfo, SpaceStatus, ErrorInfo
|
| 15 |
+
|
| 16 |
+
class HuggingFaceAPIClient(HuggingFaceAPI):
    """aiohttp-based client for the HuggingFace Spaces HTTP API.

    A single ``aiohttp.ClientSession`` carrying the bearer token is created
    lazily on first use and reused until ``close()`` is called. Public methods
    never raise on HTTP or network failure; they log and return a fallback
    value instead.
    """

    def __init__(self, token: str):
        self.token = token
        self.base_url = "https://huggingface.co/api"
        self.headers = {"Authorization": f"Bearer {token}"}
        self.logger = logging.getLogger(__name__)
        self.session = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared HTTP session, creating it when needed.

        A session that has been closed is replaced so the client remains
        usable after ``close()``.
        """
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession(headers=self.headers)
        return self.session

    async def close(self) -> None:
        """Close the HTTP session, if one was opened."""
        if self.session is not None:
            await self.session.close()
            # Forget the closed session so it is never handed out again.
            self.session = None

    @staticmethod
    def _status_from_runtime(runtime_data: Any) -> SpaceStatus:
        """Map the ``runtime`` section of a Space payload to a ``SpaceStatus``.

        Returns ``UNKNOWN`` when the section is not a mapping, and ``ERROR``
        for any stage other than BUILDING, RUNNING or STOPPED.
        """
        if not isinstance(runtime_data, dict):
            return SpaceStatus.UNKNOWN

        stage = runtime_data.get('stage')
        if stage == 'BUILDING':
            return SpaceStatus.BUILDING
        if stage == 'RUNNING':
            # A RUNNING stage counts only if the state agrees.
            if runtime_data.get('state') == 'RUNNING':
                return SpaceStatus.RUNNING
            return SpaceStatus.ERROR
        if stage == 'STOPPED':
            return SpaceStatus.STOPPED
        return SpaceStatus.ERROR

    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the current status of ``space_id``.

        Returns ``SpaceStatus.UNKNOWN`` on a non-200 response or any exception.
        """
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}"

            async with session.get(url) as response:
                if response.status == 200:
                    data = await response.json()
                    return self._status_from_runtime(data.get('runtime', {}))
                else:
                    self.logger.error(f"获取 Space 状态失败: {response.status}")
                    return SpaceStatus.UNKNOWN

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 状态异常: {e}")
            return SpaceStatus.UNKNOWN

    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return up to ``lines`` log lines of ``space_id`` joined by newlines.

        Entries may be dicts with a ``message`` key or plain strings; anything
        else is ignored. On failure a string starting with ``ERROR:`` is
        returned instead of raising.
        """
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/logs"
            params = {"lines": lines}

            async with session.get(url, params=params) as response:
                if response.status == 200:
                    data = await response.json()
                    log_lines = []
                    for entry in data:
                        if isinstance(entry, dict) and 'message' in entry:
                            log_lines.append(entry['message'])
                        elif isinstance(entry, str):
                            log_lines.append(entry)

                    return '\n'.join(log_lines)
                else:
                    error_msg = await response.text()
                    self.logger.error(f"获取日志失败: {response.status} - {error_msg}")
                    return f"ERROR: 无法获取日志 - {response.status}"

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 日志异常: {e}")
            return f"ERROR: 获取日志异常 - {str(e)}"

    async def trigger_rebuild(self, space_id: str) -> bool:
        """Ask the API to restart ``space_id``; return ``True`` on HTTP 200."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/restart"

            async with session.post(url) as response:
                if response.status == 200:
                    self.logger.info(f"成功触发 Space {space_id} 重新构建")
                    return True
                else:
                    error_msg = await response.text()
                    self.logger.error(f"触发重新构建失败: {response.status} - {error_msg}")
                    return False

        except Exception as e:
            self.logger.error(f"触发重新构建异常: {e}")
            return False

    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return descriptive information about ``space_id``.

        On any failure a default ``SpaceInfo`` with status ``UNKNOWN`` is
        returned.
        """
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}"

            async with session.get(url) as response:
                if response.status == 200:
                    data = await response.json()

                    return SpaceInfo(
                        space_id=space_id,
                        name=data.get('id', space_id),
                        repository_url=data.get('url', ''),
                        # The payload already holds the runtime section, so the
                        # status is derived here instead of issuing a second
                        # GET to the same URL.
                        current_status=self._status_from_runtime(data.get('runtime', {})),
                        last_updated=datetime.now(),
                        dockerfile_path="Dockerfile",  # default path
                        local_path=""  # local path needs separate configuration
                    )
                else:
                    raise Exception(f"无法获取 Space 信息: {response.status}")

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 信息异常: {e}")
            # Fall back to minimal information.
            return SpaceInfo(
                space_id=space_id,
                name=space_id,
                repository_url="",
                current_status=SpaceStatus.UNKNOWN,
                last_updated=datetime.now()
            )

    async def get_space_discussion(self, space_id: str) -> List[Dict]:
        """Return the discussions of ``space_id`` (extra context); ``[]`` on failure."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/discussions"

            async with session.get(url) as response:
                if response.status == 200:
                    return await response.json()
                else:
                    return []

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 讨论信息异常: {e}")
            return []

    async def get_space_runtime_info(self, space_id: str) -> Dict[str, Any]:
        """Return the runtime details of ``space_id``; ``{}`` on failure."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/runtime"

            async with session.get(url) as response:
                if response.status == 200:
                    return await response.json()
                else:
                    return {}

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 运行时信息异常: {e}")
            return {}
|
| 179 |
+
|
| 180 |
+
class HuggingFaceWebhookHandler:
    """Dispatch HuggingFace webhook payloads to per-event handlers."""

    def __init__(self, api_client: HuggingFaceAPIClient):
        self.api_client = api_client
        self.logger = logging.getLogger(__name__)
        # Event name -> coroutine handling that event.
        self.event_handlers = {
            'space.status_updated': self._handle_status_update,
            'space.build_error': self._handle_build_error,
            'space.started': self._handle_space_started,
            'space.stopped': self._handle_space_stopped
        }

    async def handle_webhook(self, payload: Dict[str, Any]) -> None:
        """Route ``payload`` to the handler registered for its ``event`` key.

        Unknown events are logged as warnings. Exceptions raised by a handler
        are logged and swallowed so a bad payload cannot crash the receiver.
        """
        try:
            event_type = payload.get('event')
            handler = self.event_handlers.get(event_type)
            if handler is not None:
                await handler(payload)
            else:
                self.logger.warning(f"未知的事件类型: {event_type}")

        except Exception as e:
            self.logger.error(f"处理 Webhook 事件失败: {e}")

    async def _handle_status_update(self, payload: Dict[str, Any]) -> None:
        """Log a status change and escalate to build-error handling on ``ERROR``."""
        space = payload.get('space', {})
        space_id = space.get('id')
        new_status = space.get('runtime', {}).get('stage')

        self.logger.info(f"Space {space_id} 状态更新为: {new_status}")

        if new_status == 'ERROR':
            await self._handle_build_error(payload)

    async def _handle_build_error(self, payload: Dict[str, Any]) -> None:
        """Fetch the latest logs of the failing Space.

        Payloads without a Space id are skipped with a warning, because the
        logs endpoint cannot be queried without one.
        """
        space_id = payload.get('space', {}).get('id')
        if not space_id:
            self.logger.warning("构建错误事件缺少 Space ID,跳过日志获取")
            return

        logs = await self.api_client.get_space_logs(space_id, lines=50)

        # TODO: hand ``logs`` to the error analyzer once it is integrated.
        self.logger.debug(f"Space {space_id} 构建错误日志长度: {len(logs)}")

    async def _handle_space_started(self, payload: Dict[str, Any]) -> None:
        """Log that the Space has started."""
        space_id = payload.get('space', {}).get('id')
        self.logger.info(f"Space {space_id} 启动成功")

    async def _handle_space_stopped(self, payload: Dict[str, Any]) -> None:
        """Log that the Space has stopped."""
        space_id = payload.get('space', {}).get('id')
        self.logger.info(f"Space {space_id} 已停止")
|
| 235 |
+
|
| 236 |
+
class RateLimiter:
    """Sliding-window limiter allowing ``max_requests`` calls per 60 seconds.

    Timestamps of granted requests are kept in ``self.requests``; callers
    await ``acquire()`` before each request.
    """

    # Length of the sliding window, in seconds.
    _WINDOW_SECONDS = 60

    def __init__(self, max_requests_per_minute: int = 60):
        """Create a limiter.

        Raises:
            ValueError: if ``max_requests_per_minute`` is smaller than 1 (no
                request could ever be granted).
        """
        if max_requests_per_minute < 1:
            raise ValueError("max_requests_per_minute must be at least 1")
        self.max_requests = max_requests_per_minute
        self.requests = []
        self.lock = asyncio.Lock()

    def _prune(self, now: datetime) -> None:
        """Forget timestamps that have left the window ending at ``now``."""
        self.requests = [
            req_time for req_time in self.requests
            if (now - req_time).total_seconds() < self._WINDOW_SECONDS
        ]

    async def acquire(self) -> None:
        """Wait until a request may be sent, then record it.

        The lock is held while waiting, so concurrent callers are served one
        after another.
        """
        async with self.lock:
            now = datetime.now()
            self._prune(now)

            # Loop rather than sleep once: the sleep may end marginally early,
            # and the window must be re-evaluated at the time we actually wake.
            while len(self.requests) >= self.max_requests:
                oldest_request = min(self.requests)
                wait_time = self._WINDOW_SECONDS - (now - oldest_request).total_seconds()
                if wait_time > 0:
                    await asyncio.sleep(wait_time)
                # Refresh the clock so the request is stamped with the time it
                # is granted, not the time the wait began.
                now = datetime.now()
                self._prune(now)

            self.requests.append(now)
|
| 262 |
+
|
| 263 |
+
class HuggingFaceAPIClientWithRateLimit(HuggingFaceAPIClient):
    """HuggingFace API client that throttles every request.

    Requests are delegated to an inner ``HuggingFaceAPIClient``
    (``self.base_client``) after a permit has been obtained from
    ``self.rate_limiter``.
    """

    def __init__(self, token: str, rate_limit: int = 60):
        super().__init__(token)
        self.rate_limiter = RateLimiter(rate_limit)
        self.base_client = HuggingFaceAPIClient(token)

    async def close(self) -> None:
        """Close the inner client's session as well as this client's own.

        All requests go through ``base_client``, so its session is the one
        that is actually opened; closing only the inherited session would
        leak it.
        """
        try:
            await self.base_client.close()
        finally:
            await super().close()

    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the Space status (rate limited)."""
        await self.rate_limiter.acquire()
        return await self.base_client.get_space_status(space_id)

    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return the Space logs (rate limited)."""
        await self.rate_limiter.acquire()
        return await self.base_client.get_space_logs(space_id, lines)

    async def trigger_rebuild(self, space_id: str) -> bool:
        """Trigger a rebuild of the Space (rate limited)."""
        await self.rate_limiter.acquire()
        return await self.base_client.trigger_rebuild(space_id)

    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return detailed Space information (rate limited)."""
        await self.rate_limiter.acquire()
        return await self.base_client.get_space_info(space_id)

    async def get_space_discussion(self, space_id: str) -> List[Dict]:
        """Return the Space discussions (rate limited)."""
        await self.rate_limiter.acquire()
        return await self.base_client.get_space_discussion(space_id)

    async def get_space_runtime_info(self, space_id: str) -> Dict[str, Any]:
        """Return the Space runtime details (rate limited)."""
        await self.rate_limiter.acquire()
        return await self.base_client.get_space_runtime_info(space_id)
|
usage_examples.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
使用示例和最佳实践
|
| 3 |
+
展示系统的基本使用流程和高级功能
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import logging
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import List, Dict, Any
|
| 10 |
+
|
| 11 |
+
from core_system import AutoRepairSystem, SpaceStatus, ErrorType
|
| 12 |
+
from huggingface_client import HuggingFaceAPIClient
|
| 13 |
+
from error_analyzer import IntelligentErrorAnalyzer
|
| 14 |
+
|
| 15 |
+
# ============================================================================
|
| 16 |
+
# 基本使用示例
|
| 17 |
+
# ============================================================================
|
| 18 |
+
|
| 19 |
+
async def basic_usage_example():
    """Run the simplest flow: build the repair system and monitor a few Spaces.

    Creates an ``AutoRepairSystem`` from ``config.json``, starts it on three
    placeholder Space ids and, if interrupted with Ctrl-C, stops the
    system's monitor.
    """
    # 1. Initialise the system from its configuration file.
    repair_system = AutoRepairSystem("config.json")

    # 2. Placeholder Space ids -- replace them with the Spaces to monitor.
    watched_spaces = [f"your-username/space-{index}" for index in (1, 2, 3)]

    print(f"开始监控 {len(watched_spaces)} 个 Space...")

    # 3. Start monitoring; a keyboard interrupt stops the monitor.
    try:
        await repair_system.start(watched_spaces)
    except KeyboardInterrupt:
        print("\n停止监控...")
        repair_system.monitor.stop()
|
| 40 |
+
|
| 41 |
+
# ============================================================================
|
| 42 |
+
# 高级使用示例
|
| 43 |
+
# ============================================================================
|
| 44 |
+
|
| 45 |
+
class AdvancedUsageExample:
    """Advanced example: drive the API client and the error analyzer by hand.

    Instead of using ``AutoRepairSystem``, this class checks each Space's
    status itself, analyses the logs of errored Spaces and routes every
    detected error to a handler chosen by its confidence.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def custom_monitoring_workflow(self, space_ids: List[str]) -> None:
        """Check every Space once and handle the errors found in errored ones.

        Args:
            space_ids: Ids of the Spaces to inspect, processed sequentially.
        """
        # Build the components. The token is a placeholder and must be
        # replaced with a real one before running the example.
        hf_client = HuggingFaceAPIClient("your_token_here")
        error_analyzer = IntelligentErrorAnalyzer()

        for space_id in space_ids:
            # 1. Check the status.
            status = await hf_client.get_space_status(space_id)
            print(f"Space {space_id}: {status.value}")

            # 2. Only errored Spaces have their logs analysed.
            if status == SpaceStatus.ERROR:
                logs = await hf_client.get_space_logs(space_id, lines=100)
                errors = await error_analyzer.analyze_logs(logs)

                # 3. Route each error by confidence (threshold 0.8).
                for error in errors:
                    if error.confidence > 0.8:
                        await self._handle_high_confidence_error(space_id, error)
                    else:
                        await self._handle_low_confidence_error(space_id, error)

    async def _handle_high_confidence_error(self, space_id: str, error) -> None:
        """Dispatch a high-confidence error to the fixer for its error type.

        Error types without a dedicated fixer are logged instead of being
        dropped silently.
        """
        print(f"高置信度错误 {space_id}: {error.error_type.value}")

        if error.error_type == ErrorType.DEPENDENCY_INSTALL:
            await self._fix_dependency_error(space_id, error)
        elif error.error_type == ErrorType.DOCKERFILE_SYNTAX:
            await self._fix_dockerfile_error(space_id, error)
        else:
            # ... handling for other error types goes here.
            self.logger.warning(
                "No automatic fix for error type %s on %s",
                error.error_type.value,
                space_id,
            )

    async def _handle_low_confidence_error(self, space_id: str, error) -> None:
        """Report a low-confidence error without attempting an automatic fix.

        This method was referenced by ``custom_monitoring_workflow`` but was
        missing, so any error with confidence <= 0.8 raised AttributeError.
        """
        print(f"低置信度错误 {space_id}: {error.error_type.value}")
        self.logger.warning(
            "Low-confidence error on %s (%s, confidence=%s); skipping automatic repair",
            space_id,
            error.error_type.value,
            error.confidence,
        )

    async def _fix_dependency_error(self, space_id: str, error) -> None:
        """Fix a dependency-installation error (placeholder: only prints)."""
        print(f"修复 {space_id} 的依赖错误...")

        # Concrete repair logic goes here:
        # 1. Determine the dependency type (Python/Node.js)
        # 2. Try a different package index / mirror
        # 3. Adjust version numbers
        # 4. Reinstall the dependencies

    async def _fix_dockerfile_error(self, space_id: str, error) -> None:
        """Fix a Dockerfile error (placeholder: only prints)."""
        print(f"修复 {space_id} 的 Dockerfile 错误...")

        # Concrete repair logic goes here:
        # 1. Locate the offending line
        # 2. Correct the syntax
        # 3. Optimise the command structure
|
| 103 |
+
|
| 104 |
+
# ============================================================================
|
| 105 |
+
# 批量处理示例
|
| 106 |
+
# ============================================================================
|
| 107 |
+
|
| 108 |
+
class BatchProcessingExample:
    """Batch example: monitor several Spaces concurrently.

    Each Space is described by a config dict with a required ``space_id``
    key and optional ``interval`` (seconds, default 60) and ``max_retries``
    (default 3) keys.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def batch_monitor_spaces(self, space_configs: List[Dict[str, Any]]) -> None:
        """Monitor all configured Spaces concurrently.

        One failing task does not cancel the others. Failures are logged;
        previously the gathered exceptions were discarded, so problems such
        as a config without ``space_id`` went unnoticed.

        Args:
            space_configs: One config dict per Space (see class docstring).
        """
        configs = list(space_configs)
        outcomes = await asyncio.gather(
            *(self._monitor_single_space(config) for config in configs),
            return_exceptions=True,
        )

        for config, outcome in zip(configs, outcomes):
            if isinstance(outcome, BaseException):
                self.logger.error("批量监控任务失败 (%r): %r", config, outcome)

    async def _monitor_single_space(self, config: Dict[str, Any]) -> None:
        """Poll one Space until it is no longer in ERROR or retries run out.

        The status is checked at most ``max_retries`` times, sleeping
        ``interval`` seconds between consecutive ERROR results. Any
        exception raised while checking is logged and ends the loop.

        Raises:
            KeyError: If ``config`` has no ``space_id`` entry.
        """
        space_id = config['space_id']
        monitoring_interval = config.get('interval', 60)
        max_retries = config.get('max_retries', 3)

        retry_count = 0
        while retry_count < max_retries:
            try:
                # Monitoring logic.
                status = await self._check_space_status(space_id)

                if status != SpaceStatus.ERROR:
                    break

                retry_count += 1
                # Do not sleep after the final attempt.
                if retry_count < max_retries:
                    await asyncio.sleep(monitoring_interval)

            except Exception as e:
                self.logger.error(f"监控 {space_id} 失败: {e}")
                break

    async def _check_space_status(self, space_id: str) -> "SpaceStatus":
        """Return the current status of ``space_id``.

        Extension point: the example ships no implementation. It used to
        return ``None`` silently, which the retry loop treated as "not in
        ERROR" and therefore reported every Space as healthy. It now raises
        so the missing implementation is visible in the log.

        Raises:
            NotImplementedError: Always, until overridden.
        """
        raise NotImplementedError(
            "_check_space_status must be implemented to query the Space status"
        )
|
| 151 |
+
|
| 152 |
+
# ============================================================================
|
| 153 |
+
# 自定义错误分析示例
|
| 154 |
+
# ============================================================================
|
| 155 |
+
|
| 156 |
+
class CustomErrorAnalyzer:
    """Error analyzer driven by user-defined regular-expression patterns."""

    # Score assigned per severity; anything else scores ``_DEFAULT_SCORE``.
    _SEVERITY_SCORES = {'high': 0.9, 'medium': 0.7}
    _DEFAULT_SCORE = 0.5

    def __init__(self):
        self.custom_patterns = self._load_custom_patterns()

    async def analyze_with_custom_rules(self, logs: str) -> List[Dict]:
        """Analyse ``logs`` with the custom patterns and the optional ML hook.

        Args:
            logs: Raw log text to scan.

        Returns:
            One dict per matching pattern (keys ``type``, ``matches``,
            ``severity``, ``suggested_fix``, ``score``) plus any ML results,
            sorted by descending ``score``.
        """
        results = []

        # 1. Apply the custom patterns.
        for pattern in self.custom_patterns:
            matches = pattern['regex'].findall(logs)
            if matches:
                results.append({
                    'type': pattern['type'],
                    'matches': matches,
                    'severity': pattern['severity'],
                    'suggested_fix': pattern['fix'],
                })

        # 2. Apply the machine-learning model (if available).
        results.extend(await self._ml_analysis(logs))

        # 3. Score and rank.
        return self._score_results(results)

    def _load_custom_patterns(self) -> List[Dict]:
        """Build the list of custom error patterns with compiled regexes."""
        # Local import: the module's import block does not bring in ``re``,
        # so instantiating the class used to fail with NameError.
        import re

        return [
            {
                'name': 'Custom GPU Error',
                'regex': re.compile(r'GPU.*out of memory|CUDA.*error'),
                'type': 'gpu_error',
                'severity': 'high',
                'fix': '减少批处理大小或使用更小的模型',
            },
            {
                'name': 'Custom Timeout Pattern',
                # ``.*?`` keeps the wildcard from swallowing the leading
                # digits; the greedy form captured only the last digit of
                # the timeout (e.g. "0" from "500ms").
                'regex': re.compile(r'operation.*timeout.*after.*?(\d+)ms'),
                'type': 'custom_timeout',
                'severity': 'medium',
                'fix': '增加超时设置或优化性能',
            },
        ]

    async def _ml_analysis(self, logs: str) -> List[Dict]:
        """Hook for a pre-trained error-classification model; returns ``[]``."""
        # A pre-trained error classification model can be integrated here.
        return []

    def _score_results(self, results: List[Dict]) -> List[Dict]:
        """Attach a ``score`` to each result and return them ranked.

        The result dicts are updated in place. A missing or unknown
        ``severity`` gets the default score instead of raising KeyError, so
        ML results without that key are accepted.

        Returns:
            A new list sorted by descending score (stable for ties).
        """
        for result in results:
            result['score'] = self._SEVERITY_SCORES.get(
                result.get('severity'), self._DEFAULT_SCORE
            )

        return sorted(results, key=lambda x: x['score'], reverse=True)
|
| 222 |
+
|
| 223 |
+
# ============================================================================
|
| 224 |
+
# Webhook 集成示例
|
| 225 |
+
# ============================================================================
|
| 226 |
+
|
| 227 |
+
class WebhookIntegrationExample:
    """Webhook example: react to HuggingFace events pushed over HTTP."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def setup_webhook_server(self, host: str = "0.0.0.0", port: int = 8000) -> None:
        """Serve the ``POST /webhook/huggingface`` endpoint until shut down.

        Args:
            host: Interface to bind. The default ``0.0.0.0`` listens on all
                interfaces, matching the original hard-coded value.
            port: TCP port to bind (default 8000).
        """
        # Imported lazily so the module loads without the web stack.
        from fastapi import FastAPI, Request
        import uvicorn

        app = FastAPI()

        @app.post("/webhook/huggingface")
        async def handle_hf_webhook(request: Request):
            payload = await request.json()

            # Dispatch on the event type.
            event_type = payload.get('event')

            if event_type == 'space.status_updated':
                await self._handle_status_update(payload)
            elif event_type == 'space.build_error':
                await self._handle_build_error(payload)
            elif event_type == 'space.started':
                await self._handle_space_started(payload)
            else:
                self.logger.debug("Ignoring webhook event: %r", event_type)

            return {"status": "ok"}

        # Start the server.
        config = uvicorn.Config(app, host=host, port=port)
        server = uvicorn.Server(config)
        await server.serve()

    @staticmethod
    def _space_section(payload: Dict) -> Dict:
        """Return the payload's ``space`` mapping, or ``{}`` if absent or null."""
        return payload.get('space') or {}

    async def _handle_status_update(self, payload: Dict) -> None:
        """Log a status change and start a repair when the stage is ``ERROR``."""
        space = self._space_section(payload)
        space_id = space.get('id')
        # ``or {}`` tolerates an explicit null ``runtime`` in the payload.
        new_status = (space.get('runtime') or {}).get('stage')

        self.logger.info(f"Space {space_id} 状态更新: {new_status}")

        # Trigger the matching handling logic.
        if new_status == 'ERROR':
            await self._trigger_repair_workflow(space_id)

    async def _handle_build_error(self, payload: Dict) -> None:
        """Log a build failure and start a repair for the affected Space.

        This handler was dispatched to but missing, so every
        ``space.build_error`` event raised AttributeError.
        """
        space_id = self._space_section(payload).get('id')
        self.logger.error("Space %s build error", space_id)

        # Without an id there is nothing to repair.
        if space_id is not None:
            await self._trigger_repair_workflow(space_id)

    async def _handle_space_started(self, payload: Dict) -> None:
        """Log that a Space has started (was dispatched to but missing)."""
        space_id = self._space_section(payload).get('id')
        self.logger.info("Space %s started", space_id)

    async def _trigger_repair_workflow(self, space_id: str) -> None:
        """Start the repair workflow for ``space_id`` (placeholder, no-op)."""
        # Implement the repair workflow here.
        pass
|
| 277 |
+
|
| 278 |
+
# ============================================================================
|
| 279 |
+
# 测试和调试示例
|
| 280 |
+
# ============================================================================
|
| 281 |
+
|
| 282 |
+
class TestingExample:
    """Testing and debugging example."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def test_error_analysis(self) -> None:
        """Run the error analyzer on a canned pip failure log and print the findings."""
        # Simulated log data; the leading and trailing newlines mirror the
        # original triple-quoted literal.
        sample_logs = (
            "\n"
            "ERROR: Could not find a version that satisfies the requirement torch==2.0.0\n"
            "ERROR: No matching distribution found for torch==2.0.0\n"
            "Build failed\n"
        )

        detected = await IntelligentErrorAnalyzer().analyze_logs(sample_logs)

        print(f"检测到 {len(detected)} 个错误:")
        for item in detected:
            print(f"- {item.error_type.value}: {item.message}")
            print(f"  置信度: {item.confidence}")

    async def test_repair_strategies(self) -> None:
        """Ask the repair engine for a strategy per sample error and print it."""
        # Imported here because only this test needs these names.
        from core_system import SmartRepairEngine, ErrorInfo, SpaceInfo

        engine = SmartRepairEngine()

        # One sample per error type under test.
        dependency_error = ErrorInfo(
            error_type=ErrorType.DEPENDENCY_INSTALL,
            message="pip install failed",
            log_snippet="ERROR: Could not find torch",
            confidence=0.9,
        )
        dockerfile_error = ErrorInfo(
            error_type=ErrorType.DOCKERFILE_SYNTAX,
            message="Dockerfile syntax error",
            log_snippet="failed to solve: syntax error",
            confidence=0.85,
        )

        target_space = SpaceInfo(
            space_id="test/space",
            name="Test Space",
            repository_url="",
            current_status=SpaceStatus.ERROR,
            last_updated=datetime.now(),
        )

        for sample in (dependency_error, dockerfile_error):
            plan = await engine.generate_strategy(sample, target_space)
            if not plan:
                # No strategy for this error: nothing to print.
                continue
            print(f"修复策略: {plan.action.value}")
            print(f"描述: {plan.description}")
            print(f"成功率: {plan.success_rate}")
            print(f"风险等级: {plan.risk_level}")
            print()
|
| 345 |
+
|
| 346 |
+
# ============================================================================
|
| 347 |
+
# 性能监控示例
|
| 348 |
+
# ============================================================================
|
| 349 |
+
|
| 350 |
+
class PerformanceMonitoringExample:
    """Performance example: periodically collect metrics and flag anomalies.

    The ``_get_*`` collectors are placeholders that return neutral values;
    override them to report real measurements.
    """

    def __init__(self):
        self.metrics = {}
        # ``_handle_anomalies`` logs through this attribute; it was never
        # set, so the first anomaly raised AttributeError.
        self.logger = logging.getLogger(__name__)

    async def monitor_system_performance(self, interval: float = 60) -> None:
        """Collect, store and check metrics forever.

        Args:
            interval: Seconds to sleep between rounds (default 60, i.e. one
                check per minute as in the original).
        """
        while True:
            # Collect the performance metrics.
            current_metrics = await self._collect_metrics()

            # Store and compare the metrics.
            self._store_metrics(current_metrics)

            # Check for anomalies.
            anomalies = self._detect_anomalies(current_metrics)

            if anomalies:
                await self._handle_anomalies(anomalies)

            await asyncio.sleep(interval)

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Return a snapshot of the current metrics keyed by metric name."""
        return {
            'timestamp': datetime.now(),
            'cpu_usage': self._get_cpu_usage(),
            'memory_usage': self._get_memory_usage(),
            'active_repairs': self._get_active_repairs(),
            'queue_size': self._get_queue_size(),
            'error_rate': self._get_error_rate(),
        }

    # The collectors below were called by ``_collect_metrics`` but never
    # defined. They are placeholders returning neutral values.

    def _get_cpu_usage(self) -> float:
        """Return CPU usage in percent (placeholder: 0.0)."""
        return 0.0

    def _get_memory_usage(self) -> float:
        """Return memory usage in percent (placeholder: 0.0)."""
        return 0.0

    def _get_active_repairs(self) -> int:
        """Return the number of repairs in progress (placeholder: 0)."""
        return 0

    def _get_queue_size(self) -> int:
        """Return the number of queued repair tasks (placeholder: 0)."""
        return 0

    def _get_error_rate(self) -> float:
        """Return the error rate as a fraction (placeholder: 0.0)."""
        return 0.0

    def _store_metrics(self, metrics: Dict[str, Any]) -> None:
        """Persist a metrics snapshot (placeholder, no-op)."""
        # Store to a database or a time-series database.
        pass

    def _detect_anomalies(self, metrics: Dict[str, Any]) -> List[str]:
        """Return one message per metric above its threshold.

        Thresholds: CPU > 80, memory > 90, error rate > 0.1.
        """
        anomalies = []

        if metrics['cpu_usage'] > 80:
            anomalies.append(f"CPU 使用率过高: {metrics['cpu_usage']}%")

        if metrics['memory_usage'] > 90:
            anomalies.append(f"内存使用率过高: {metrics['memory_usage']}%")

        if metrics['error_rate'] > 0.1:
            anomalies.append(f"错误率过高: {metrics['error_rate']}")

        return anomalies

    async def _handle_anomalies(self, anomalies: List[str]) -> None:
        """Log every anomaly as a warning."""
        for anomaly in anomalies:
            self.logger.warning(f"性能异常: {anomaly}")
            # Send an alert or adjust automatically here.
|
| 410 |
+
|
| 411 |
+
# ============================================================================
|
| 412 |
+
# 主程序示例
|
| 413 |
+
# ============================================================================
|
| 414 |
+
|
| 415 |
+
async def main():
    """Interactive entry point: show the example menu and run the chosen one.

    Reads the selection from standard input. An unknown selection prints a
    message and returns. A keyboard interrupt or any other exception raised
    by the chosen example is reported on standard output, not propagated.
    """
    print("HuggingFace Spaces 自动修复系统示例")
    print("=" * 50)

    # Menu key -> (label, zero-argument callable returning the coroutine).
    # Callables keep instances and coroutines from being created until an
    # entry is actually selected.
    menu = {
        "1": ("基本使用", basic_usage_example),
        "2": (
            "高级使用",
            lambda: AdvancedUsageExample().custom_monitoring_workflow(
                ["user/space1", "user/space2"]
            ),
        ),
        "3": ("测试错误分析", lambda: TestingExample().test_error_analysis()),
        "4": (
            "性能监控",
            lambda: PerformanceMonitoringExample().monitor_system_performance(),
        ),
        "5": (
            "Webhook 服务器",
            lambda: WebhookIntegrationExample().setup_webhook_server(),
        ),
    }

    print("请选择要运行的示例:")
    for key, entry in menu.items():
        print(f"{key}. {entry[0]}")

    selection = input("请输入选择 (1-5): ").strip()

    if selection not in menu:
        print("无效的选择")
        return

    label, runner = menu[selection]
    print(f"\n运行: {label}")
    try:
        await runner()
    except KeyboardInterrupt:
        print("\n程序被用户中断")
    except Exception as exc:
        print(f"运行出错: {exc}")
|
| 448 |
+
|
| 449 |
+
if __name__ == "__main__":
    # Configure logging before any example creates a logger.
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)

    # Run the interactive menu.
    asyncio.run(main())
|