OpenCode Deployer committed on
Commit
4ca5973
·
1 Parent(s): ede55f3
DEPLOYMENT.md ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HuggingFace Spaces 自动修复系统部署指南
2
+
3
+ ## 📋 部署概览
4
+
5
+ 本系统支持多种部署方式:
6
+ - **Docker Compose 部署**(推荐)
7
+ - **本地 Python 部署**
8
+ - **Kubernetes 部署**
9
+ - **云服务部署**
10
+
11
+ ## 🐳 Docker Compose 部署(推荐)
12
+
13
+ ### 1. 环境准备
14
+
15
+ ```bash
16
+ # 克隆项目
17
+ git clone <repository-url>
18
+ cd hf-repair-system
19
+
20
+ # 创建环境变量文件
21
+ cp .env.example .env
22
+
23
+ # 编辑环境变量
24
+ nano .env
25
+ ```
26
+
27
+ ### 2. 环境变量配置
28
+
29
+ ```bash
30
+ # .env 文件内容
31
+ HF_TOKEN=your_huggingface_token_here
32
+ WEBHOOK_URL=your_webhook_url_here
33
+ REDIS_PASSWORD=your_redis_password_here
34
+ POSTGRES_PASSWORD=your_postgres_password_here
35
+ GRAFANA_PASSWORD=your_grafana_password_here
36
+
37
+ # 可选配置
38
+ LOG_LEVEL=INFO
39
+ CHECK_INTERVAL=60
40
+ MAX_CONCURRENT_SPACES=10
41
+ ```
42
+
43
+ ### 3. 启动系统
44
+
45
+ ```bash
46
+ # 构建并启动所有服务
47
+ docker-compose up -d
48
+
49
+ # 查看服务状态
50
+ docker-compose ps
51
+
52
+ # 查看日志
53
+ docker-compose logs -f hf-repair-system
54
+ ```
55
+
56
+ ### 4. 验证部署
57
+
58
+ ```bash
59
+ # 检查健康状态
60
+ curl http://localhost:8080/health
61
+
62
+ # 访问 Web 界面
63
+ open http://localhost:3000
64
+
65
+ # 访问 Grafana 监控
66
+ open http://localhost:3001
67
+ ```
68
+
69
+ ## 🐧 本地 Python 部署
70
+
71
+ ### 1. 环境准备
72
+
73
+ ```bash
74
+ # Python 3.11+
75
+ python3.11 -m venv venv
76
+ source venv/bin/activate # Linux/Mac
77
+ # 或
78
+ venv\Scripts\activate # Windows
79
+
80
+ # 安装依赖
81
+ pip install -r requirements.txt
82
+ ```
83
+
84
+ ### 2. 配置文件
85
+
86
+ ```bash
87
+ # 复制配置模板
88
+ cp config_template.json config.json
89
+
90
+ # 编辑配置
91
+ nano config.json
92
+ ```
93
+
94
+ ### 3. 数据库初始化
95
+
96
+ ```bash
97
+ # 创建数据目录
98
+ mkdir -p data logs backups
99
+
100
+ # 初始化数据库
101
+ python -m repair_system.db.init
102
+ ```
103
+
104
+ ### 4. 启动服务
105
+
106
+ ```bash
107
+ # 启动主服务
108
+ python main.py
109
+
110
+ # 或使用 start 脚本
111
+ ./start.sh
112
+ ```
113
+
114
+ ## ☸️ Kubernetes 部署
115
+
116
+ ### 1. 准备 Kubernetes 配置
117
+
118
+ ```yaml
119
+ # k8s/namespace.yaml
120
+ apiVersion: v1
121
+ kind: Namespace
122
+ metadata:
123
+ name: hf-repair-system
124
+
125
+ ---
126
+ # k8s/configmap.yaml
127
+ apiVersion: v1
128
+ kind: ConfigMap
129
+ metadata:
130
+ name: hf-repair-config
131
+ namespace: hf-repair-system
132
+ data:
133
+ config.json: |
134
+ {
135
+ "system": {
136
+ "name": "HuggingFace Spaces 自动修复系统",
137
+ "log_level": "INFO"
138
+ },
139
+ "huggingface": {
140
+ "api_token": "${HF_TOKEN}",
141
+ "base_url": "https://huggingface.co/api"
142
+ }
143
+ }
144
+
145
+ ---
146
+ # k8s/secret.yaml
147
+ apiVersion: v1
148
+ kind: Secret
149
+ metadata:
150
+ name: hf-repair-secrets
151
+ namespace: hf-repair-system
152
+ type: Opaque
153
+ data:
154
+ hf-token: <base64-encoded-token>
155
+ webhook-url: <base64-encoded-webhook-url>
156
+
157
+ ---
158
+ # k8s/deployment.yaml
159
+ apiVersion: apps/v1
160
+ kind: Deployment
161
+ metadata:
162
+ name: hf-repair-system
163
+ namespace: hf-repair-system
164
+ spec:
165
+ replicas: 3
166
+ selector:
167
+ matchLabels:
168
+ app: hf-repair-system
169
+ template:
170
+ metadata:
171
+ labels:
172
+ app: hf-repair-system
173
+ spec:
174
+ containers:
175
+ - name: repair-system
176
+ image: hf-repair-system:latest
177
+ ports:
178
+ - containerPort: 8080
179
+ env:
180
+ - name: HF_TOKEN
181
+ valueFrom:
182
+ secretKeyRef:
183
+ name: hf-repair-secrets
184
+ key: hf-token
185
+ volumeMounts:
186
+ - name: config
187
+ mountPath: /app/config
188
+ - name: data
189
+ mountPath: /app/data
190
+ resources:
191
+ requests:
192
+ memory: "512Mi"
193
+ cpu: "250m"
194
+ limits:
195
+ memory: "1Gi"
196
+ cpu: "500m"
197
+ volumes:
198
+ - name: config
199
+ configMap:
200
+ name: hf-repair-config
201
+ - name: data
202
+ persistentVolumeClaim:
203
+ claimName: hf-repair-data
204
+
205
+ ---
206
+ # k8s/service.yaml
207
+ apiVersion: v1
208
+ kind: Service
209
+ metadata:
210
+ name: hf-repair-service
211
+ namespace: hf-repair-system
212
+ spec:
213
+ selector:
214
+ app: hf-repair-system
215
+ ports:
216
+ - port: 80
217
+ targetPort: 8080
218
+ type: ClusterIP
219
+
220
+ ---
221
+ # k8s/ingress.yaml
222
+ apiVersion: networking.k8s.io/v1
223
+ kind: Ingress
224
+ metadata:
225
+ name: hf-repair-ingress
226
+ namespace: hf-repair-system
227
+ annotations:
228
+ nginx.ingress.kubernetes.io/rewrite-target: /
229
+ spec:
230
+ rules:
231
+ - host: hf-repair.yourdomain.com
232
+ http:
233
+ paths:
234
+ - path: /
235
+ pathType: Prefix
236
+ backend:
237
+ service:
238
+ name: hf-repair-service
239
+ port:
240
+ number: 80
241
+ ```
242
+
243
+ ### 2. 部署到 Kubernetes
244
+
245
+ ```bash
246
+ # 创建命名空间和配置
247
+ kubectl apply -f k8s/namespace.yaml
248
+ kubectl apply -f k8s/configmap.yaml
249
+ kubectl apply -f k8s/secret.yaml
250
+
251
+ # 部署应用
252
+ kubectl apply -f k8s/deployment.yaml
253
+ kubectl apply -f k8s/service.yaml
254
+ kubectl apply -f k8s/ingress.yaml
255
+
256
+ # 检查部署状态
257
+ kubectl get pods -n hf-repair-system
258
+ kubectl logs -f deployment/hf-repair-system -n hf-repair-system
259
+ ```
260
+
261
+ ## 🌩️ 云服务部署
262
+
263
+ ### AWS 部署
264
+
265
+ ```bash
266
+ # 使用 AWS CLI
267
+ aws ecs create-cluster --cluster-name hf-repair-cluster
268
+ aws ecs register-task-definition --cli-input-json file://task-definition.json
269
+ aws ecs create-service --cluster hf-repair-cluster --service-name hf-repair-service --task-definition hf-repair-task
270
+
271
+ # 设置 CloudWatch 日志
272
+ aws logs create-log-group --log-group-name /ecs/hf-repair-system
273
+ ```
274
+
275
+ ### Google Cloud 部署
276
+
277
+ ```bash
278
+ # 使用 gcloud
279
+ gcloud run deploy hf-repair-system \
280
+ --image gcr.io/your-project/hf-repair-system:latest \
281
+ --platform managed \
282
+ --region us-central1 \
283
+ --allow-unauthenticated \
284
+ --set-env-vars HF_TOKEN=$HF_TOKEN
285
+ ```
286
+
287
+ ### Azure 部署
288
+
289
+ ```bash
290
+ # 使用 Azure CLI
291
+ az container create \
292
+ --resource-group hf-repair-rg \
293
+ --name hf-repair-system \
294
+ --image your-registry/hf-repair-system:latest \
295
+ --environment-variables HF_TOKEN=$HF_TOKEN \
296
+ --ports 8080
297
+ ```
298
+
299
+ ## 🔧 配置管理
300
+
301
+ ### 生产环境配置
302
+
303
+ ```json
304
+ {
305
+ "system": {
306
+ "log_level": "WARNING",
307
+ "debug": false
308
+ },
309
+ "monitoring": {
310
+ "check_interval": 30,
311
+ "max_concurrent_spaces": 20
312
+ },
313
+ "performance": {
314
+ "max_concurrent_repairs": 10,
315
+ "worker_threads": 8
316
+ },
317
+ "database": {
318
+ "type": "postgresql",
319
+ "host": "postgres",
320
+ "port": 5432,
321
+ "database": "hf_repair",
322
+ "username": "hf_repair",
323
+ "password": "${POSTGRES_PASSWORD}"
324
+ }
325
+ }
326
+ ```
327
+
328
+ ### 开发环境配置
329
+
330
+ ```json
331
+ {
332
+ "system": {
333
+ "log_level": "DEBUG",
334
+ "debug": true
335
+ },
336
+ "monitoring": {
337
+ "check_interval": 60,
338
+ "max_concurrent_spaces": 3
339
+ },
340
+ "file_operations": {
341
+ "git": {
342
+ "auto_commit": false,
343
+ "push_immediately": false
344
+ }
345
+ }
346
+ }
347
+ ```
348
+
349
+ ## 📊 监控和日志
350
+
351
+ ### Prometheus 配置
352
+
353
+ ```yaml
354
+ # monitoring/prometheus.yml
355
+ global:
356
+ scrape_interval: 15s
357
+
358
+ scrape_configs:
359
+ - job_name: 'hf-repair-system'
360
+ static_configs:
361
+ - targets: ['hf-repair-system:8080']
362
+ metrics_path: /metrics
363
+ scrape_interval: 30s
364
+ ```
365
+
366
+ ### Grafana 仪表板
367
+
368
+ 1. 访问 http://localhost:3001
369
+ 2. 导入预配置的仪表板
370
+ 3. 设置数据源为 Prometheus
371
+
372
+ ## 🔒 安全配置
373
+
374
+ ### 网络安全
375
+
376
+ ```bash
377
+ # 防火墙配置
378
+ ufw allow 22/tcp # SSH
379
+ ufw allow 80/tcp # HTTP
380
+ ufw allow 443/tcp # HTTPS
381
+ ufw deny 8080/tcp # 限制内部服务访问
382
+ ```
383
+
384
+ ### SSL/TLS 配置
385
+
386
+ ```nginx
387
+ # nginx/ssl.conf
388
+ server {
389
+ listen 443 ssl http2;
390
+ server_name hf-repair.yourdomain.com;
391
+
392
+ ssl_certificate /path/to/certificate.crt;
393
+ ssl_certificate_key /path/to/private.key;
394
+
395
+ location / {
396
+ proxy_pass http://localhost:8080;
397
+ proxy_set_header Host $host;
398
+ proxy_set_header X-Real-IP $remote_addr;
399
+ }
400
+ }
401
+ ```
402
+
403
+ ## 🚀 性能优化
404
+
405
+ ### 资源调优
406
+
407
+ ```yaml
408
+ # docker-compose 性能配置
409
+ services:
410
+ hf-repair-system:
411
+ deploy:
412
+ resources:
413
+ limits:
414
+ cpus: '2.0'
415
+ memory: 2G
416
+ reservations:
417
+ cpus: '1.0'
418
+ memory: 1G
419
+ ulimits:
420
+ nofile:
421
+ soft: 65536
422
+ hard: 65536
423
+ ```
424
+
425
+ ### 缓存配置
426
+
427
+ ```json
428
+ # config/redis.json
429
+ {
430
+ "redis": {
431
+ "host": "redis",
432
+ "port": 6379,
433
+ "password": "${REDIS_PASSWORD}",
434
+ "db": 0,
435
+ "max_connections": 20,
436
+ "socket_timeout": 5,
437
+ "socket_connect_timeout": 5
438
+ }
439
+ }
440
+ ```
441
+
442
+ ## 🔄 维护和更新
443
+
444
+ ### 备份策略
445
+
446
+ ```bash
447
+ #!/bin/bash
448
+ # scripts/backup.sh
449
+
450
+ # 备份数据库
451
+ pg_dump hf_repair > backup_$(date +%Y%m%d_%H%M%S).sql
452
+
453
+ # 备份配置文件
454
+ tar -czf config_backup_$(date +%Y%m%d_%H%M%S).tar.gz config/
455
+
456
+ # 清理 7 天前的旧日志(注意:此命令是删除日志,而非备份)
457
+ find logs/ -name "*.log" -mtime +7 -delete
458
+ ```
459
+
460
+ ### 更新流程
461
+
462
+ ```bash
463
+ # 拉取最新代码
464
+ git pull origin main
465
+
466
+ # 重新构建
467
+ docker-compose build
468
+
469
+ # 滚动更新
470
+ docker-compose up -d --no-deps hf-repair-system
471
+
472
+ # 验证更新
473
+ curl http://localhost:8080/health
474
+ ```
475
+
476
+ ## 📱 移动端和远程访问
477
+
478
+ ### VPN 配置
479
+
480
+ ```bash
481
+ # 使用 WireGuard 进行安全远程访问
482
+ wg-quick up wg0
483
+ ```
484
+
485
+ ### 移动端应用
486
+
487
+ 1. 使用 PWA 技术
488
+ 2. 推送通知集成
489
+ 3. 离线状态支持
490
+
491
+ ## 🤝 高可用部署
492
+
493
+ ### 多节点部署
494
+
495
+ ```yaml
496
+ # docker-compose.ha.yml
497
+ version: '3.8'
498
+ services:
499
+ hf-repair-system:
500
+ image: hf-repair-system:latest
501
+ deploy:
502
+ replicas: 3
503
+ update_config:
504
+ parallelism: 1
505
+ delay: 10s
506
+ restart_policy:
507
+ condition: on-failure
508
+ networks:
509
+ - hf-repair-network
510
+ ```
511
+
512
+ ### 负载均衡
513
+
514
+ ```nginx
515
+ # nginx/load-balancer.conf
516
+ upstream hf_repair_backend {
517
+ server hf-repair-1:8080;
518
+ server hf-repair-2:8080;
519
+ server hf-repair-3:8080;
520
+ }
521
+
522
+ server {
523
+ listen 80;
524
+ location / {
525
+ proxy_pass http://hf_repair_backend;
526
+ proxy_set_header Host $host;
527
+ proxy_set_header X-Real-IP $remote_addr;
528
+ }
529
+ }
530
+ ```
531
+
532
+ ## 📋 部署检查清单
533
+
534
+ - [ ] 环境变量配置完成
535
+ - [ ] 数据库连接正常
536
+ - [ ] Redis 缓存运行正常
537
+ - [ ] HuggingFace Token 有效
538
+ - [ ] Webhook 配置正确
539
+ - [ ] 防火墙规则设置
540
+ - [ ] SSL 证书配置
541
+ - [ ] 监控系统运行
542
+ - [ ] 备份策略实施
543
+ - [ ] 日志轮转配置
544
+ - [ ] 健康检查正常
545
+ - [ ] 性能基准测试完成
546
+
547
+ ## 🆘 故障排除
548
+
549
+ ### 常见问题
550
+
551
+ 1. **服务无法启动**
552
+ ```bash
553
+ # 检查日志
554
+ docker-compose logs hf-repair-system
555
+
556
+ # 检查配置
557
+ python -m json.tool config.json
558
+ ```
559
+
560
+ 2. **数据库连接失败**
561
+ ```bash
562
+ # 检查数据库状态
563
+ docker-compose exec postgres pg_isready -U hf_repair
564
+
565
+ # 检查网络连接
566
+ docker network ls
567
+ ```
568
+
569
+ 3. **API 限制**
570
+ ```bash
571
+ # 检查 Token 权限
572
+ curl -H "Authorization: Bearer $HF_TOKEN" \
573
+ https://huggingface.co/api/whoami-v2
574
+ ```
575
+
576
+ ### 调试模式
577
+
578
+ ```bash
579
+ # 启用调试模式
580
+ export LOG_LEVEL=DEBUG
581
+ export DEBUG=true
582
+
583
+ # 或修改配置文件
584
+ sed -i 's/"debug": false/"debug": true/' config.json
585
+ ```
SYSTEM_SUMMARY.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 系统总结
2
+
3
+ 我已经为您设计了一个完整的 HuggingFace Spaces 自动化监控和修复系统。以下是系统的核心组件和功能总结:
4
+
5
+ ## 🎯 已完成的核心系统架构
6
+
7
+ ### 1. **核心系统架构** (`core_system.py`)
8
+ - **数据模型**: SpaceStatus, ErrorType, RepairAction 等枚举
9
+ - **接口定义**: HuggingFaceAPI, ErrorAnalyzer, RepairStrategyEngine 等抽象接口
10
+ - **主要系统类**: HFSpaceMonitor, AutoRepairSystem, SmartRepairEngine
11
+ - **状态管理**: RepairHistory, StateManager 数据持久化
12
+
13
+ ### 2. **HuggingFace API 客户端** (`huggingface_client.py`)
14
+ - **API 集成**: 完整的 HuggingFace Spaces API 封装
15
+ - **速率限制**: 内置请求限制器防止 API 超限
16
+ - **Webhook 处理**: 支持实时事件处理
17
+ - **会话管理**: 异步 HTTP 会话优化
18
+
19
+ ### 3. **智能错误分析器** (`error_analyzer.py`)
20
+ - **多模式识别**: 正则表达式 + 上下文分析
21
+ - **错误分类**: 8 种主要错误类型识别
22
+ - **置信度评估**: 基于多种因素的智能评分
23
+ - **专门分析器**: Dockerfile、依赖、环境等专项分析
24
+
25
+ ### 4. **配置管理** (`config_template.json`)
26
+ - **完整配置模板**: 涵盖所有系统组件
27
+ - **环境变量支持**: 安全的配置管理
28
+ - **性能调优**: 可配置的性能参数
29
+ - **监控和通知**: 完整的告警配置
30
+
31
+ ### 5. **部署方案** (`docker-compose.yml`)
32
+ - **容器化部署**: 完整的 Docker Compose 配置
33
+ - **服务编排**: 主应用 + Redis + PostgreSQL + 监控
34
+ - **高可用**: 多副本 + 负载均衡配置
35
+ - **监控栈**: Prometheus + Grafana 集成
36
+
37
+ ### 6. **部署指南** (`DEPLOYMENT.md`)
38
+ - **多部署方式**: Docker, 本地, K8s, 云服务
39
+ - **安全配置**: SSL/TLS, 防火墙, 权限管理
40
+ - **性能优化**: 资源调优, 缓存配置
41
+ - **故障排除**: 常见问题和调试方法
42
+
43
+ ### 7. **使用示例** (`usage_examples.py`)
44
+ - **基本使用**: 简单的监控和修复流程
45
+ - **高级功能**: 自定义工作流, 批量处理
46
+ - **Webhook 集成**: 事件驱动的修复流程
47
+ - **性能监控**: 系统性能指标追踪
48
+
49
+ ## 🏗️ 系统架构特点
50
+
51
+ ### **模块化设计**
52
+ - 清晰的接口定义和组件分离
53
+ - 可插拔的错误分析器和修复策略
54
+ - 独立的配置和状态管理
55
+
56
+ ### **智能错误处理**
57
+ - 多层次的错误识别机制
58
+ - 基于上下文的智能分析
59
+ - 置信度评估和风险控制
60
+
61
+ ### **自动化工作流**
62
+ - 监控 → 分析 → 修复 → 验证的闭环
63
+ - 支持多种修复策略和回滚机制
64
+ - 异步处理和并发控制
65
+
66
+ ### **可扩展性**
67
+ - 支持自定义错误模式和修复规则
68
+ - 插件化的分析器架构
69
+ - 灵活的配置和部署选项
70
+
71
+ ## 🚀 核心功能
72
+
73
+ ### **实时监控**
74
+ - HuggingFace Spaces 状态轮询
75
+ - 日志实时分析
76
+ - Webhook 事件处理
77
+
78
+ ### **智能分析**
79
+ - 8 种错误类型自动识别
80
+ - 上下文感知的错误分析
81
+ - 置信度评估和优先级排序
82
+
83
+ ### **自动修复**
84
+ - Dockerfile 语法修正
85
+ - 依赖版本和源地址调整
86
+ - 环境变量和配置优化
87
+ - 端口和权限问题处理
88
+
89
+ ### **状态管理**
90
+ - 修复历史记录
91
+ - 回滚机制
92
+ - 性能指标追踪
93
+
94
+ ## 📊 技术栈
95
+
96
+ - **核心**: Python 3.11+, asyncio, aiohttp
97
+ - **数据库**: SQLite (开发) / PostgreSQL (生产)
98
+ - **缓存**: Redis
99
+ - **监控**: Prometheus + Grafana
100
+ - **部署**: Docker + Kubernetes
101
+ - **配置**: JSON/YAML, 环境变量
102
+
103
+ ## 🔧 部署方式
104
+
105
+ 1. **Docker Compose** (推荐): 一键部署完整系统
106
+ 2. **本地部署**: 直接 Python 运行
107
+ 3. **Kubernetes**: 生产级容器编排
108
+ 4. **云服务**: AWS, GCP, Azure 集成
109
+
110
+ ## 📈 预期效果
111
+
112
+ - **监控效率**: 24/7 自动监控,快速发现问题
113
+ - **修复成功率**: 基于历史数据的智能修复策略
114
+ - **运维成本**: 大幅减少人工干预需求
115
+ - **系统稳定性**: 自动化故障恢复和优化
116
+
117
+ 这个系统提供了完整的 HuggingFace Spaces 监控修复解决方案,具有高度的自动化、智能化和可扩展性。您可以根据具体需求调整配置和部署方式。
config_template.json ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 配置文件模板
3
+ 系统配置和环境设置
4
+ """
5
+
6
+ {
7
+ "system": {
8
+ "name": "HuggingFace Spaces 自动修复系统",
9
+ "version": "1.0.0",
10
+ "log_level": "INFO",
11
+ "debug": false
12
+ },
13
+
14
+ "huggingface": {
15
+ "api_token": "${HF_TOKEN}",
16
+ "base_url": "https://huggingface.co/api",
17
+ "rate_limit": {
18
+ "requests_per_minute": 60,
19
+ "burst_limit": 10
20
+ },
21
+ "timeout": {
22
+ "api_timeout": 30,
23
+ "build_timeout": 1800,
24
+ "log_timeout": 60
25
+ }
26
+ },
27
+
28
+ "monitoring": {
29
+ "check_interval": 60,
30
+ "max_concurrent_spaces": 10,
31
+ "retry_attempts": 3,
32
+ "retry_delay": 30,
33
+ "health_check": {
34
+ "enabled": true,
35
+ "port_check": 7860,
36
+ "timeout": 10,
37
+ "retries": 3
38
+ }
39
+ },
40
+
41
+ "error_analysis": {
42
+ "confidence_threshold": 0.7,
43
+ "max_errors_per_analysis": 20,
44
+ "context_lines": 10,
45
+ "pattern_matching": {
46
+ "enabled": true,
47
+ "case_sensitive": false,
48
+ "max_matches": 50
49
+ },
50
+ "context_analysis": {
51
+ "enabled": true,
52
+ "analyzers": [
53
+ "dockerfile_syntax",
54
+ "dependency_install",
55
+ "environment_config",
56
+ "port_conflict",
57
+ "permission_error",
58
+ "network_connection",
59
+ "timeout_error",
60
+ "resource_exceeded"
61
+ ]
62
+ }
63
+ },
64
+
65
+ "repair_strategies": {
66
+ "max_attempts_per_error": 3,
67
+ "success_rate_threshold": 0.6,
68
+ "risk_tolerance": "medium",
69
+ "backup_enabled": true,
70
+ "strategies": {
71
+ "dockerfile_syntax": {
72
+ "enabled": true,
73
+ "priority": 1,
74
+ "auto_apply": true,
75
+ "risk_level": "medium"
76
+ },
77
+ "dependency_install": {
78
+ "enabled": true,
79
+ "priority": 2,
80
+ "auto_apply": true,
81
+ "risk_level": "low",
82
+ "fallback_sources": [
83
+ "https://pypi.tuna.tsinghua.edu.cn/simple",
84
+ "https://mirrors.aliyun.com/pypi/simple",
85
+ "https://pypi.douban.com/simple"
86
+ ]
87
+ },
88
+ "environment_config": {
89
+ "enabled": true,
90
+ "priority": 3,
91
+ "auto_apply": true,
92
+ "risk_level": "low"
93
+ },
94
+ "port_conflict": {
95
+ "enabled": true,
96
+ "priority": 4,
97
+ "auto_apply": true,
98
+ "risk_level": "medium",
99
+ "alternative_ports": [7861, 7862, 7863, 8080, 8000]
100
+ },
101
+ "permission_error": {
102
+ "enabled": true,
103
+ "priority": 5,
104
+ "auto_apply": false,
105
+ "risk_level": "high"
106
+ },
107
+ "network_connection": {
108
+ "enabled": true,
109
+ "priority": 6,
110
+ "auto_apply": false,
111
+ "risk_level": "medium"
112
+ },
113
+ "timeout_error": {
114
+ "enabled": true,
115
+ "priority": 7,
116
+ "auto_apply": true,
117
+ "risk_level": "low"
118
+ },
119
+ "resource_exceeded": {
120
+ "enabled": true,
121
+ "priority": 8,
122
+ "auto_apply": false,
123
+ "risk_level": "high"
124
+ }
125
+ }
126
+ },
127
+
128
+ "file_operations": {
129
+ "backup": {
130
+ "enabled": true,
131
+ "directory": "./backups",
132
+ "max_backups": 10,
133
+ "compression": true
134
+ },
135
+ "git": {
136
+ "enabled": true,
137
+ "auto_commit": true,
138
+ "commit_message_prefix": "[Auto-Repair]",
139
+ "branch_name": "auto-repair",
140
+ "push_immediately": true
141
+ },
142
+ "modification": {
143
+ "dry_run": false,
144
+ "confirm_changes": false,
145
+ "max_file_size_mb": 10
146
+ }
147
+ },
148
+
149
+ "database": {
150
+ "type": "sqlite",
151
+ "path": "./data/repair_system.db",
152
+ "backup_enabled": true,
153
+ "backup_interval_hours": 24,
154
+ "retention_days": 30
155
+ },
156
+
157
+ "notifications": {
158
+ "enabled": true,
159
+ "channels": {
160
+ "email": {
161
+ "enabled": false,
162
+ "smtp_server": "",
163
+ "smtp_port": 587,
164
+ "username": "",
165
+ "password": "",
166
+ "recipients": []
167
+ },
168
+ "webhook": {
169
+ "enabled": true,
170
+ "url": "${WEBHOOK_URL}",
171
+ "timeout": 10,
172
+ "retry_attempts": 3
173
+ },
174
+ "slack": {
175
+ "enabled": false,
176
+ "webhook_url": "",
177
+ "channel": "#alerts"
178
+ }
179
+ },
180
+ "events": {
181
+ "repair_success": true,
182
+ "repair_failed": true,
183
+ "space_error": true,
184
+ "build_completed": false,
185
+ "system_error": true
186
+ }
187
+ },
188
+
189
+ "security": {
190
+ "max_file_access_attempts": 3,
191
+ "allowed_file_extensions": [".py", ".js", ".json", ".yml", ".yaml", ".md", ".txt"],
192
+ "forbidden_paths": ["/etc", "/proc", "/sys", "/dev"],
193
+ "scan_for_secrets": true,
194
+ "secret_patterns": [
195
+ "password",
196
+ "token",
197
+ "api_key",
198
+ "secret",
199
+ "credential"
200
+ ]
201
+ },
202
+
203
+ "performance": {
204
+ "max_concurrent_repairs": 5,
205
+ "queue_size": 100,
206
+ "worker_threads": 4,
207
+ "cache_size_mb": 100,
208
+ "timeout_per_repair": 600
209
+ },
210
+
211
+ "logging": {
212
+ "level": "INFO",
213
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
214
+ "file": "./logs/repair_system.log",
215
+ "max_file_size_mb": 50,
216
+ "backup_count": 5,
217
+ "console_output": true
218
+ }
219
+ }
core_system.py ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Spaces 自动化监控修复系统
3
+ 核心系统架构和主要类定义
4
+ """
5
+
6
+ from abc import ABC, abstractmethod
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, List, Optional, Any, Tuple, Union
9
+ from enum import Enum
10
+ import asyncio
11
+ import logging
12
+ from datetime import datetime
13
+ import json
14
+ import sqlite3
15
+ import os
16
+ from pathlib import Path
17
+
18
+ # ============================================================================
19
+ # 数据模型和枚举
20
+ # ============================================================================
21
+
22
class SpaceStatus(Enum):
    """Status values a Space can be reported in."""

    # Each member's value is the lowercase string form of its name.
    BUILDING = "building"
    RUNNING = "running"
    STOPPED = "stopped"
    ERROR = "error"
    UNKNOWN = "unknown"
29
+
30
class ErrorType(Enum):
    """Categories an error found in Space logs can be classified into."""

    # Specific, recognised failure categories.
    DOCKERFILE_SYNTAX = "dockerfile_syntax"
    DEPENDENCY_INSTALL = "dependency_install"
    ENVIRONMENT_CONFIG = "environment_config"
    PORT_CONFLICT = "port_conflict"
    PERMISSION_ERROR = "permission_error"
    NETWORK_CONNECTION = "network_connection"
    TIMEOUT_ERROR = "timeout_error"
    RESOURCE_EXCEEDED = "resource_exceeded"

    # Fallback category for anything not covered above.
    UNKNOWN_ERROR = "unknown_error"
41
+
42
class RepairAction(Enum):
    """Kinds of repair action a repair strategy can carry out."""

    MODIFY_DOCKERFILE = "modify_dockerfile"
    UPDATE_DEPENDENCIES = "update_dependencies"
    FIX_ENVIRONMENT = "fix_environment"
    CHANGE_PORT = "change_port"
    SET_PERMISSIONS = "set_permissions"
    UPDATE_SOURCES = "update_sources"
    ADJUST_RESOURCES = "adjust_resources"
    RETRY_BUILD = "retry_build"
52
+
53
@dataclass
class SpaceInfo:
    """Descriptive record for a single Space."""

    # Required fields (no defaults).
    space_id: str
    name: str
    repository_url: str
    current_status: SpaceStatus
    last_updated: datetime

    # Optional fields.
    dockerfile_path: str = "Dockerfile"
    local_path: str = ""
63
+
64
@dataclass
class ErrorInfo:
    """One error detected in a Space's logs."""

    error_type: ErrorType
    message: str
    log_snippet: str
    # Line number of the error in the analysed log; None when not known.
    line_number: Optional[int] = None
    confidence: float = 0.0
    # Free-form extra data; a fresh dict is created per instance.
    context: Dict[str, Any] = field(default_factory=dict)
73
+
74
@dataclass
class RepairStrategy:
    """A proposed repair: the action to take plus its parameters and estimates."""

    action: RepairAction
    description: str
    modifications: Dict[str, Any]
    risk_level: str  # one of: low, medium, high
    success_rate: float = 0.0
    estimated_time: int = 0  # in seconds
83
+
84
@dataclass
class RepairHistory:
    """Record of one repair attempt made for a Space."""

    id: int
    space_id: str
    timestamp: datetime
    error_info: ErrorInfo
    strategy: RepairStrategy
    success: bool
    # Optional bookkeeping; both default to None when not recorded.
    git_commit: Optional[str] = None
    rollback_data: Optional[str] = None
95
+
96
+ # ============================================================================
97
+ # 核心接口定义
98
+ # ============================================================================
99
+
100
class HuggingFaceAPI(ABC):
    """Abstract interface for the Space operations the system relies on."""

    @abstractmethod
    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the current status of the Space ``space_id``."""
        ...

    @abstractmethod
    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return log text for the Space ``space_id``.

        ``lines`` defaults to 100; how it limits the output is up to the
        concrete implementation.
        """
        ...

    @abstractmethod
    async def trigger_rebuild(self, space_id: str) -> bool:
        """Trigger a rebuild of the Space ``space_id`` and return a success flag."""
        ...

    @abstractmethod
    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return detailed information about the Space ``space_id``."""
        ...
122
+
123
class ErrorAnalyzer(ABC):
    """Abstract interface for components that analyse log text for errors."""

    @abstractmethod
    async def analyze_logs(self, logs: str) -> List[ErrorInfo]:
        """Analyse ``logs`` and return the errors identified in it."""
        ...

    @abstractmethod
    async def classify_error(self, error_message: str) -> ErrorType:
        """Return the error category that ``error_message`` belongs to."""
        ...
135
+
136
class RepairStrategyEngine(ABC):
    """Abstract interface for components that propose repair strategies."""

    @abstractmethod
    async def generate_strategy(self, error: ErrorInfo, space_info: SpaceInfo) -> Optional[RepairStrategy]:
        """Produce a repair strategy for ``error`` on the given Space.

        The return type allows ``None`` when no strategy is produced.
        """
        ...

    @abstractmethod
    async def estimate_success(self, strategy: RepairStrategy) -> float:
        """Return the estimated probability that ``strategy`` succeeds."""
        ...
148
+
149
class FileModifier(ABC):
    """Abstract interface for components that modify and back up files."""

    @abstractmethod
    async def apply_modifications(self, file_path: str, modifications: Dict[str, Any]) -> bool:
        """Apply ``modifications`` to the file at ``file_path``; return a success flag."""
        ...

    @abstractmethod
    async def backup_file(self, file_path: str) -> str:
        """Back up the file at ``file_path`` and return a string for the backup.

        NOTE(review): the returned string is presumably the backup's path;
        confirm against the concrete implementation.
        """
        ...
161
+
162
+ # ============================================================================
163
+ # 核心系统类
164
+ # ============================================================================
165
+
166
class HFSpaceMonitor:
    """Periodically poll Spaces through a ``HuggingFaceAPI`` and react to errors."""

    def __init__(self, hf_api: HuggingFaceAPI, check_interval: int = 60):
        """Store the API client and the polling interval (seconds passed to ``asyncio.sleep``)."""
        self._running = False
        self.logger = logging.getLogger(__name__)
        self.check_interval = check_interval
        self.hf_api = hf_api

    async def start_monitoring(self, space_ids: List[str]) -> None:
        """Poll ``space_ids`` in a loop until ``stop()`` clears the running flag.

        Each iteration checks every Space and then sleeps for
        ``check_interval`` seconds. If an iteration raises, the error is
        logged and the loop resumes after a 5-second pause.
        """
        self._running = True
        self.logger.info(f"开始监控 {len(space_ids)} 个 Space")

        while self._running:
            try:
                await self._check_spaces(space_ids)
                await asyncio.sleep(self.check_interval)
            except Exception as exc:
                self.logger.error(f"监控过程出错: {exc}")
                await asyncio.sleep(5)

    async def _check_spaces(self, space_ids: List[str]) -> None:
        """Check every Space in ``space_ids`` concurrently."""
        # return_exceptions=True: one failing check does not abort the others.
        await asyncio.gather(
            *map(self._check_single_space, space_ids),
            return_exceptions=True,
        )

    async def _check_single_space(self, space_id: str) -> None:
        """Fetch and log one Space's status; on ERROR, fetch logs and handle them."""
        try:
            current = await self.hf_api.get_space_status(space_id)
            self.logger.info(f"Space {space_id} 状态: {current.value}")

            if current != SpaceStatus.ERROR:
                return

            # The Space is in the error state: hand its logs to the error handler.
            log_text = await self.hf_api.get_space_logs(space_id)
            await self._handle_error(space_id, log_text)

        except Exception as exc:
            self.logger.error(f"检查 Space {space_id} 失败: {exc}")

    async def _handle_error(self, space_id: str, logs: str) -> None:
        """Hook for error handling; currently a no-op placeholder.

        The error analyzer and repair engine are meant to be invoked from here.
        """
        return None

    def stop(self) -> None:
        """Clear the running flag so the monitoring loop exits at its next check."""
        self._running = False
215
+
216
class IntelligentErrorAnalyzer:
    """Regex-driven log analyzer that turns matching log lines into ``ErrorInfo`` records."""

    # Number of log lines captured on each side of a matched error line.
    CONTEXT_RADIUS = 3

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.error_patterns = self._load_error_patterns()

    async def analyze_logs(self, logs: str) -> List[ErrorInfo]:
        """Scan ``logs`` line by line and return one ``ErrorInfo`` per match.

        A line yields at most one record per error type (the first matching
        pattern of that type wins), but it may yield records for several
        different error types. ``line_number`` in each record is 1-based.
        """
        errors: List[ErrorInfo] = []

        for line_num, line in enumerate(logs.split('\n'), 1):
            stripped = line.strip()
            for error_type, patterns in self.error_patterns.items():
                for pattern in patterns:
                    if not pattern['regex'].search(line):
                        continue
                    errors.append(
                        ErrorInfo(
                            error_type=ErrorType(error_type),
                            message=stripped,
                            log_snippet=stripped,
                            line_number=line_num,
                            confidence=pattern['confidence'],
                            context=self._extract_context(line, logs, line_num),
                        )
                    )
                    # First matching pattern of this error type is enough.
                    break

        return errors

    def _load_error_patterns(self) -> Dict[str, List[Dict]]:
        """Return the built-in patterns, keyed by ``ErrorType`` value.

        Each entry is a dict with a compiled ``regex`` and the ``confidence``
        assigned to lines that match it.
        """
        # Imported here because the module's import block does not import
        # ``re``; without it constructing the analyzer raised NameError.
        import re

        return {
            "dockerfile_syntax": [
                {
                    "regex": re.compile(r"ERROR:.*failed to solve|failed to compute cache key"),
                    "confidence": 0.9
                }
            ],
            "dependency_install": [
                {
                    "regex": re.compile(r"ERROR:.*Could not find a version|No matching distribution"),
                    "confidence": 0.85
                }
            ],
            "environment_config": [
                {
                    "regex": re.compile(r"ERROR:.*environment variable|ENV not found"),
                    "confidence": 0.8
                }
            ]
            # More patterns...
        }

    def _extract_context(self, error_line: str, logs: str, line_num: int) -> Dict[str, Any]:
        """Return the log lines surrounding the error line.

        Args:
            error_line: Text of the matched line (kept for interface
                compatibility; the line is located via ``line_num``).
            logs: Full log text the line came from.
            line_num: 1-based line number of the error within ``logs``.

        Returns:
            A dict with ``before`` (up to ``CONTEXT_RADIUS`` lines preceding
            the error), ``after`` (up to ``CONTEXT_RADIUS`` lines following
            it) and ``full_context`` (before + error line + after).
        """
        lines = logs.split('\n')
        # line_num is 1-based (see analyze_logs); convert to a 0-based index
        # and clamp so an out-of-range value cannot produce a bogus window.
        index = min(max(line_num - 1, 0), len(lines) - 1)
        start = max(0, index - self.CONTEXT_RADIUS)
        end = min(len(lines), index + 1 + self.CONTEXT_RADIUS)

        return {
            "before": lines[start:index],
            "after": lines[index + 1:end],
            "full_context": lines[start:end]
        }
280
+
281
class SmartRepairEngine:
    """Rule-based engine that maps a detected error to a ``RepairStrategy``."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.repair_rules = self._load_repair_rules()

    async def generate_strategy(self, error: ErrorInfo, space_info: SpaceInfo) -> Optional[RepairStrategy]:
        """Build a repair strategy for ``error``.

        Picks the rule with the highest ``success_rate`` among the rules
        registered for the error's type.

        Args:
            error: The detected error; its ``error_type`` selects the rules
                and its ``line_number`` fills the rule's ``target`` slot.
            space_info: The affected Space (not used by the built-in rules).

        Returns:
            The strategy, or ``None`` when no rule exists for the error type.
        """
        rules = self.repair_rules.get(error.error_type.value)
        if not rules:
            # Unknown error type, or an empty rule list: nothing to propose.
            return None

        best_rule = max(rules, key=lambda r: r['success_rate'])

        # Copy so the strategy never shares (or mutates) the rule template.
        modifications = dict(best_rule['modifications'])
        if 'target' in modifications:
            # The template holds a placeholder; bind it to this error's line.
            modifications['target'] = error.line_number

        return RepairStrategy(
            action=RepairAction(best_rule['action']),
            description=best_rule['description'],
            modifications=modifications,
            risk_level=best_rule['risk_level'],
            success_rate=best_rule['success_rate'],
            estimated_time=best_rule['estimated_time']
        )

    def _load_repair_rules(self) -> Dict[str, List[Dict]]:
        """Return the built-in repair rules, keyed by ``ErrorType`` value.

        Rules are static templates. A ``target`` of ``None`` in
        ``modifications`` is a placeholder that ``generate_strategy``
        replaces with the error's line number; no error object exists at
        load time, so it cannot be filled in here.
        """
        return {
            "dockerfile_syntax": [
                {
                    "action": "modify_dockerfile",
                    "description": "修复 Dockerfile 语法错误",
                    "modifications": {
                        "type": "syntax_fix",
                        "target": None
                    },
                    "risk_level": "medium",
                    "success_rate": 0.7,
                    "estimated_time": 120
                }
            ],
            "dependency_install": [
                {
                    "action": "update_dependencies",
                    "description": "更新依赖版本或更换源地址",
                    "modifications": {
                        "type": "dependency_update",
                        "strategy": "version_bump_or_source_change"
                    },
                    "risk_level": "low",
                    "success_rate": 0.8,
                    "estimated_time": 300
                }
            ]
            # More rules...
        }
339
+
340
class AutoRepairSystem:
    """Top-level coordinator of the auto-repair system.

    Wires together the HuggingFace API client, error analyzer, repair engine,
    Dockerfile modifier, state manager and Space monitor, and consumes repair
    jobs from an in-memory ``asyncio.Queue``.

    NOTE(review): ``_execute_repair`` calls ``self._commit_changes`` and
    ``self._rollback``, neither of which is defined in this class as shown;
    they must be provided before a repair can complete.
    """

    def __init__(self, config_path: str = "config.json"):
        """Load configuration, set up logging and create all components.

        Args:
            config_path: Path of an optional JSON file overriding defaults.
        """
        self.config = self._load_config(config_path)
        self.logger = self._setup_logging()

        # Components
        self.hf_api = HuggingFaceAPIClient(self.config['hf_token'])
        self.error_analyzer = IntelligentErrorAnalyzer()
        self.repair_engine = SmartRepairEngine()
        self.file_modifier = DockerfileModifier()
        self.state_manager = StateManager(self.config['db_path'])

        # Monitor
        self.monitor = HFSpaceMonitor(self.hf_api, self.config['check_interval'])

        # Pending repair jobs
        self.repair_queue = asyncio.Queue()

    async def start(self, space_ids: List[str]) -> None:
        """Run the monitor and the repair worker concurrently.

        Args:
            space_ids: IDs of the Spaces to monitor.
        """
        self.logger.info("启动 HuggingFace Spaces 自动修复系统")

        monitor_task = asyncio.create_task(self.monitor.start_monitoring(space_ids))
        repair_task = asyncio.create_task(self._process_repair_queue())

        # Neither task is expected to finish during normal operation.
        await asyncio.gather(monitor_task, repair_task)

    async def _process_repair_queue(self) -> None:
        """Consume repair jobs forever, logging (not propagating) failures."""
        while True:
            repair_job = await self.repair_queue.get()
            try:
                await self._execute_repair(repair_job)
            except Exception as e:
                self.logger.error(f"修复任务执行失败: {e}")
            finally:
                # Without this, Queue.join() would never return.
                self.repair_queue.task_done()

    async def _execute_repair(self, job: Dict[str, Any]) -> None:
        """Execute one repair job.

        Args:
            job: Mapping with the keys ``'space_id'`` and ``'error_info'``.
        """
        space_id = job['space_id']
        error_info = job['error_info']

        self.logger.info(f"开始修复 Space {space_id}")

        space_info = await self.hf_api.get_space_info(space_id)
        strategy = await self.repair_engine.generate_strategy(error_info, space_info)

        if not strategy:
            return

        # Stays None until a backup actually exists, so the error handler
        # never references an unbound name or rolls back from nothing.
        backup_path = None
        try:
            backup_path = await self.file_modifier.backup_file(space_info.dockerfile_path)

            success = await self.file_modifier.apply_modifications(
                space_info.dockerfile_path,
                strategy.modifications
            )

            if success:
                git_commit = await self._commit_changes(space_id, strategy)
                await self.hf_api.trigger_rebuild(space_id)
                await self.state_manager.record_repair(
                    space_id, error_info, strategy, True, git_commit
                )
                self.logger.info(f"Space {space_id} 修复完成")
            else:
                await self._rollback(backup_path, space_info.dockerfile_path)

        except Exception as e:
            self.logger.error(f"修复失败: {e}")
            if backup_path is not None:
                await self._rollback(backup_path, space_info.dockerfile_path)

    def _load_config(self, config_path: str) -> Dict[str, Any]:
        """Return the defaults merged with the JSON file at ``config_path``.

        Args:
            config_path: Path of the JSON config file; ignored if missing.

        Returns:
            The effective configuration dict (file values win over defaults).
        """
        default_config = {
            "hf_token": os.getenv("HF_TOKEN", ""),
            "check_interval": 60,
            "db_path": "repair_system.db",
            "max_retry": 3,
            "log_level": "INFO"
        }

        if os.path.exists(config_path):
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(config_path, 'r', encoding='utf-8') as f:
                default_config.update(json.load(f))

        return default_config

    def _setup_logging(self) -> logging.Logger:
        """Configure and return this module's logger.

        The level comes from ``self.config['log_level']``; an unrecognized
        name falls back to INFO instead of raising.
        """
        logger = logging.getLogger(__name__)
        level = getattr(logging, str(self.config['log_level']).upper(), logging.INFO)
        logger.setLevel(level)

        # getLogger returns a shared object: attach the handler only once,
        # otherwise every new AutoRepairSystem duplicates each log line.
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            ))
            logger.addHandler(handler)

        return logger
457
+
458
+ # ============================================================================
459
+ # 具体实现类
460
+ # ============================================================================
461
+
462
class HuggingFaceAPIClient(HuggingFaceAPI):
    """Skeleton HuggingFace API client; every API method is still a stub."""

    def __init__(self, token: str):
        """Store the token and derive the base URL and auth header from it."""
        self.token = token
        self.base_url = "https://huggingface.co/api"
        self.headers = {"Authorization": f"Bearer {token}"}

    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Get the status of a Space (not implemented; returns None)."""
        # TODO: implement the actual API call.
        return None

    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Get the logs of a Space (not implemented; returns None)."""
        # TODO: implement the actual API call.
        return None

    async def trigger_rebuild(self, space_id: str) -> bool:
        """Trigger a rebuild of a Space (not implemented; returns None)."""
        # TODO: implement the actual API call.
        return None

    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Get detailed Space information (not implemented; returns None)."""
        # TODO: implement the actual API call.
        return None
489
+
490
class DockerfileModifier(FileModifier):
    """Skeleton Dockerfile modifier; both operations are still stubs."""

    async def apply_modifications(self, file_path: str, modifications: Dict[str, Any]) -> bool:
        """Apply modifications to a file (not implemented; returns None)."""
        # TODO: implement the Dockerfile modification logic.
        return None

    async def backup_file(self, file_path: str) -> str:
        """Back up a file (not implemented; returns None)."""
        # TODO: implement the file backup logic.
        return None
502
+
503
class StateManager:
    """Skeleton persistence layer for repair history; storage is not implemented yet."""

    def __init__(self, db_path: str):
        """Remember the database path and run the (stub) initialization."""
        self.db_path = db_path
        self._init_database()

    def _init_database(self) -> None:
        """Initialize the database (not implemented)."""
        # TODO: create the database schema.
        return None

    async def record_repair(self, space_id: str, error_info: "ErrorInfo",
                            strategy: "RepairStrategy", success: bool,
                            git_commit: Optional[str] = None) -> None:
        """Record one repair attempt (not implemented)."""
        # TODO: persist the repair record.
        return None

    async def get_repair_history(self, space_id: str) -> "List[RepairHistory]":
        """Get the repair history of a Space (not implemented; returns None)."""
        # TODO: query the stored history.
        return None
526
+
527
if __name__ == "__main__":
    # Example start-up: build the system with the default config path.
    system = AutoRepairSystem()

    # IDs of the Spaces to monitor (placeholders to be replaced).
    space_ids = ["your-username/your-space-1", "your-username/your-space-2"]

    # Blocks until the monitoring and repair tasks end.
    asyncio.run(system.start(space_ids))
docker-compose.yml ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deployment definition for the HuggingFace Spaces auto-repair system.
# Starts the main application together with Redis, PostgreSQL, an optional
# web interface, Prometheus and Grafana.
# (A Python-style triple-quoted header is not valid YAML, hence comments.)

version: '3.8'

services:
  # Main application service
  hf-repair-system:
    build:
      context: .
      dockerfile: Dockerfile.repair
    container_name: hf-repair-system
    restart: unless-stopped
    environment:
      - HF_TOKEN=${HF_TOKEN}
      - WEBHOOK_URL=${WEBHOOK_URL}
      - DB_PATH=/app/data/repair_system.db
      - LOG_LEVEL=INFO
    volumes:
      - ./data:/app/data
      - ./logs:/app/logs
      - ./backups:/app/backups
      - ./config:/app/config
    ports:
      - "8080:8080"
    networks:
      - hf-repair-network
    depends_on:
      - redis
      - postgres
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # Redis cache service
  redis:
    image: redis:7-alpine
    container_name: hf-repair-redis
    restart: unless-stopped
    ports:
      - "6379:6379"
    environment:
      # redis-cli reads this variable, so the healthcheck can authenticate
      # against the password set via --requirepass below.
      - REDISCLI_AUTH=${REDIS_PASSWORD}
    volumes:
      - redis_data:/data
    networks:
      - hf-repair-network
    command: redis-server --appendonly yes --requirepass ${REDIS_PASSWORD}
    healthcheck:
      # PING is read-only; the previous "incr ping" was unauthenticated and
      # incremented a key on every probe.
      test: ["CMD-SHELL", "redis-cli ping | grep -q PONG"]
      interval: 30s
      timeout: 10s
      retries: 3

  # PostgreSQL database service
  postgres:
    image: postgres:15-alpine
    container_name: hf-repair-postgres
    restart: unless-stopped
    environment:
      - POSTGRES_DB=hf_repair
      - POSTGRES_USER=hf_repair
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
      - ./sql/init.sql:/docker-entrypoint-initdb.d/init.sql
    networks:
      - hf-repair-network
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U hf_repair"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Web service (optional web interface)
  web-interface:
    build:
      context: ./web
      dockerfile: Dockerfile
    container_name: hf-repair-web
    restart: unless-stopped
    ports:
      - "3000:3000"
    environment:
      - REACT_APP_API_URL=http://localhost:8080
    networks:
      - hf-repair-network
    depends_on:
      - hf-repair-system

  # Monitoring service
  prometheus:
    image: prom/prometheus:latest
    container_name: hf-repair-prometheus
    restart: unless-stopped
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    networks:
      - hf-repair-network
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'

  # Grafana dashboards
  grafana:
    image: grafana/grafana:latest
    container_name: hf-repair-grafana
    restart: unless-stopped
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana:/etc/grafana/provisioning
    networks:
      - hf-repair-network
    depends_on:
      - prometheus

volumes:
  redis_data:
    driver: local
  postgres_data:
    driver: local
  prometheus_data:
    driver: local
  grafana_data:
    driver: local

networks:
  hf-repair-network:
    driver: bridge
error_analyzer.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 错误分析器实现
3
+ 负责分析日志、识别错误类型和根本原因
4
+ """
5
+
6
import asyncio
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from core_system import ErrorAnalyzer, ErrorInfo, ErrorType
14
+
15
@dataclass
class ErrorPattern:
    """One recognizable error signature plus the advice attached to it."""

    # Compiled regular expression searched against log text.
    regex: re.Pattern
    # Error category reported when the regex matches.
    error_type: "ErrorType"
    # Base confidence assigned to a match.
    confidence: float
    # Human-readable description of the error.
    description: str
    # Typical causes of this error.
    common_causes: List[str]
    # Suggested remedies.
    suggested_fixes: List[str]
24
+
25
class LogAnalyzer:
    """Helpers for slicing log text and spotting error lines."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def extract_error_context(self, logs: str, error_line: int, context_size: int = 5) -> Dict[str, Any]:
        """Return the lines around ``error_line``.

        Args:
            logs: Full log text; it is split on newlines.
            error_line: 0-based index of the error line within ``logs``.
            context_size: Number of lines to keep on each side.

        Returns:
            Dict with ``"before"``, ``"error_line"`` (empty string when the
            index is past the end), ``"after"``, ``"full_context"`` and
            ``"relative_line"`` (``error_line`` minus the window start).
        """
        lines = logs.split('\n')
        start = max(0, error_line - context_size)
        end = min(len(lines), error_line + context_size + 1)

        return {
            "before": lines[start:error_line],
            "error_line": lines[error_line] if error_line < len(lines) else "",
            "after": lines[error_line + 1:end],
            "full_context": lines[start:end],
            "relative_line": error_line - start
        }

    def detect_error_sequence(self, logs: str) -> List[str]:
        """Return, in order, every stripped line containing an error keyword.

        The case-insensitive keywords are ``error``, ``failed``,
        ``exception`` and ``traceback``.
        """
        keywords = ('error', 'failed', 'exception', 'traceback')
        return [
            line.strip()
            for line in logs.split('\n')
            if any(keyword in line.lower() for keyword in keywords)
        ]

    def find_related_errors(self, logs: str, main_error: "ErrorInfo") -> "List[ErrorInfo]":
        """Find other lines mentioning 'error' within 10 lines of the main error.

        ``main_error.line_number`` is interpreted as 1-based, matching what
        ``IntelligentErrorAnalyzer._pattern_matching`` produces with
        ``enumerate(lines, 1)``; the returned errors use the same numbering.

        Args:
            logs: Full log text.
            main_error: The primary error; nothing is searched when it has
                no line number.

        Returns:
            ``ErrorInfo`` objects of type ``UNKNOWN_ERROR`` with confidence
            0.5, one per related line.
        """
        related_errors = []
        if not main_error.line_number:
            # None (unknown) or 0 (not a valid 1-based line number).
            return related_errors

        lines = logs.split('\n')
        main_index = main_error.line_number - 1  # 0-based position
        start = max(0, main_index - 10)
        end = min(len(lines), main_index + 10 + 1)

        # Enumerate 1-based so the comparison with line_number skips the
        # main error itself rather than its neighbor.
        for number, line in enumerate(lines[start:end], start + 1):
            if number != main_error.line_number and 'error' in line.lower():
                related_errors.append(ErrorInfo(
                    error_type=ErrorType.UNKNOWN_ERROR,
                    message=line.strip(),
                    log_snippet=line.strip(),
                    line_number=number,
                    confidence=0.5
                ))

        return related_errors
78
+
79
class IntelligentErrorAnalyzer(ErrorAnalyzer):
    """Error analyzer combining per-line regex patterns with whole-log analyzers."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.log_analyzer = LogAnalyzer()
        self.error_patterns = self._initialize_patterns()
        # One whole-log analyzer per error type.
        self.context_analyzers = {
            ErrorType.DOCKERFILE_SYNTAX: DockerfileSyntaxAnalyzer(),
            ErrorType.DEPENDENCY_INSTALL: DependencyErrorAnalyzer(),
            ErrorType.ENVIRONMENT_CONFIG: EnvironmentErrorAnalyzer(),
            ErrorType.PORT_CONFLICT: PortErrorAnalyzer(),
            ErrorType.PERMISSION_ERROR: PermissionErrorAnalyzer(),
            ErrorType.NETWORK_CONNECTION: NetworkErrorAnalyzer(),
            ErrorType.TIMEOUT_ERROR: TimeoutErrorAnalyzer(),
            ErrorType.RESOURCE_EXCEEDED: ResourceErrorAnalyzer()
        }

    async def analyze_logs(self, logs: str) -> List[ErrorInfo]:
        """Analyze ``logs`` and return the detected errors.

        Steps: per-line pattern matching, whole-log context analysis,
        de-duplication, then confidence adjustment.
        """
        errors = []
        errors.extend(await self._pattern_matching(logs))
        errors.extend(await self._context_analysis(logs))

        deduplicated_errors = self._deduplicate_errors(errors)
        return self._calculate_final_confidence(deduplicated_errors, logs)

    async def classify_error(self, error_message: str) -> ErrorType:
        """Return the type of the highest-confidence pattern matching the message.

        Returns ``ErrorType.UNKNOWN_ERROR`` when no pattern matches.
        """
        max_confidence = 0.0
        best_type = ErrorType.UNKNOWN_ERROR

        for pattern in self.error_patterns:
            if pattern.regex.search(error_message) and pattern.confidence > max_confidence:
                max_confidence = pattern.confidence
                best_type = pattern.error_type

        return best_type

    async def _pattern_matching(self, logs: str) -> List[ErrorInfo]:
        """Match every log line against every pattern.

        Line numbers in the returned errors are 1-based.
        """
        errors = []

        for line_num, line in enumerate(logs.split('\n'), 1):
            for pattern in self.error_patterns:
                if pattern.regex.search(line):
                    errors.append(ErrorInfo(
                        error_type=pattern.error_type,
                        message=line.strip(),
                        log_snippet=line.strip(),
                        line_number=line_num,
                        confidence=pattern.confidence,
                        context={
                            "description": pattern.description,
                            "common_causes": pattern.common_causes,
                            "suggested_fixes": pattern.suggested_fixes
                        }
                    ))

        return errors

    async def _context_analysis(self, logs: str) -> List[ErrorInfo]:
        """Run every context analyzer; a failing analyzer is logged and skipped."""
        errors = []

        for error_type, analyzer in self.context_analyzers.items():
            try:
                errors.extend(await analyzer.analyze(logs))
            except Exception as e:
                self.logger.error(f"上下文分析器 {error_type} 执行失败: {e}")

        return errors

    def _deduplicate_errors(self, errors: List[ErrorInfo]) -> List[ErrorInfo]:
        """Drop duplicates, keeping the first occurrence of each.

        Errors with a line number are keyed by (line number, type). Errors
        without one (as produced by the context analyzers) are additionally
        keyed by message, so distinct findings of the same type are not
        collapsed into a single entry.
        """
        seen = set()
        deduplicated = []

        for error in errors:
            if error.line_number is None:
                key = (None, error.error_type, error.message)
            else:
                key = (error.line_number, error.error_type)
            if key not in seen:
                seen.add(key)
                deduplicated.append(error)

        return deduplicated

    def _calculate_final_confidence(self, errors: List[ErrorInfo], logs: str) -> List[ErrorInfo]:
        """Adjust each error's confidence in place and return the list.

        Two boosts are added to the base confidence, capped at 1.0:
        +0.1 per technology keyword found in the message, and up to +0.2
        growing with the error's position in the log (later is more recent).
        Errors without a line number get no position boost.
        """
        tech_keywords = ['docker', 'pip', 'npm', 'apt', 'python', 'node']
        # split() always returns at least one element, so this is never zero.
        total_lines = len(logs.split('\n'))

        for error in errors:
            message = error.message.lower()
            keyword_boost = sum(0.1 for keyword in tech_keywords if keyword in message)

            # 0.0 at the top of the log, 1.0 at the bottom.
            position_factor = min(1.0, (error.line_number or 0) / total_lines)
            # Errors near the END of the log are the most recent and get
            # the larger boost (the earlier formula was inverted).
            recent_boost = position_factor * 0.2

            error.confidence = min(1.0, error.confidence + keyword_boost + recent_boost)

        return errors

    def _initialize_patterns(self) -> List[ErrorPattern]:
        """Return the built-in list of error patterns."""
        patterns = [
            # Dockerfile syntax errors
            ErrorPattern(
                regex=re.compile(r"failed to solve:.*syntax error|Dockerfile:\d+"),
                error_type=ErrorType.DOCKERFILE_SYNTAX,
                confidence=0.9,
                description="Dockerfile 语法错误",
                common_causes=["命令格式错误", "参数缺失", "缩进问题"],
                suggested_fixes=["检查命令语法", "验证参数", "修复格式"]
            ),

            # Python dependency installation failures
            ErrorPattern(
                regex=re.compile(r"ERROR: Could not find a version|No matching distribution|pip install failed"),
                error_type=ErrorType.DEPENDENCY_INSTALL,
                confidence=0.85,
                description="Python 依赖安装失败",
                common_causes=["版本不存在", "网络问题", "依赖冲突"],
                suggested_fixes=["检查版本", "更换源", "解决冲突"]
            ),

            # Node.js dependency installation failures
            ErrorPattern(
                regex=re.compile(r"npm ERR!|yarn error|failed to install node packages"),
                error_type=ErrorType.DEPENDENCY_INSTALL,
                confidence=0.85,
                description="Node.js 依赖安装失败",
                common_causes=["版本冲突", "网络问题", "缓存问题"],
                suggested_fixes=["清理缓存", "检查版本", "使用国内源"]
            ),

            # Environment variable configuration problems
            ErrorPattern(
                regex=re.compile(r"Environment variable.*not found|ENV.*undefined|getenv.*None"),
                error_type=ErrorType.ENVIRONMENT_CONFIG,
                confidence=0.8,
                description="环境变量配置问题",
                common_causes=["变量未设置", "配置文件缺失", "权限问题"],
                suggested_fixes=["设置环境变量", "创建配置文件", "检查权限"]
            ),

            # Port conflicts
            ErrorPattern(
                regex=re.compile(r"Address already in use|Port.*already used|EADDRINUSE"),
                error_type=ErrorType.PORT_CONFLICT,
                confidence=0.95,
                description="端口冲突",
                common_causes=["端口被占用", "权限不足", "配置错误"],
                suggested_fixes=["更换端口", "杀死占用进程", "修改配置"]
            ),

            # Permission problems
            ErrorPattern(
                regex=re.compile(r"Permission denied|Operation not permitted|EACCES"),
                error_type=ErrorType.PERMISSION_ERROR,
                confidence=0.9,
                description="权限不足",
                common_causes=["文件权限", "用户权限", "目录权限"],
                suggested_fixes=["修改权限", "使用 sudo", "更改用户"]
            ),

            # Network connection problems
            ErrorPattern(
                regex=re.compile(r"Connection refused|Network unreachable|Timeout|DNS resolution failed"),
                error_type=ErrorType.NETWORK_CONNECTION,
                confidence=0.8,
                description="网络连接问题",
                common_causes=["网络不可达", "DNS问题", "防火墙限制"],
                suggested_fixes=["检查网络", "配置DNS", "调整防火墙"]
            ),

            # Timeouts
            ErrorPattern(
                regex=re.compile(r"timeout|timed out|deadline exceeded"),
                error_type=ErrorType.TIMEOUT_ERROR,
                confidence=0.75,
                description="操作超时",
                common_causes=["操作时间过长", "资源不足", "网络延迟"],
                # The "=" was missing here, which made the module unparsable.
                suggested_fixes=["增加超时时间", "优化性能", "检查资源"]
            ),

            # Resource limits exceeded
            ErrorPattern(
                regex=re.compile(r"out of memory|disk full|CPU limit exceeded|resource exceeded"),
                error_type=ErrorType.RESOURCE_EXCEEDED,
                confidence=0.9,
                description="资源超限",
                common_causes=["内存不足", "磁盘满", "CPU限制"],
                suggested_fixes=["清理资源", "增加配额", "优化代码"]
            )
        ]

        return patterns
300
+
301
class ContextAnalyzer(ABC):
    """Abstract base class of the whole-log context analyzers."""

    @abstractmethod
    async def analyze(self, logs: str) -> "List[ErrorInfo]":
        """Analyze the full log text and return the errors found.

        Subclasses must override this; declaring it abstract makes a
        forgotten override fail at instantiation instead of silently
        returning ``None`` to callers that extend a list with the result.
        """
        raise NotImplementedError
307
+
308
class DockerfileSyntaxAnalyzer(ContextAnalyzer):
    """Detects Dockerfile-specific syntax problems anywhere in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return one DOCKERFILE_SYNTAX error per signature found (case-insensitive)."""
        signatures = (
            (r"FROM.*invalid", "FROM 指令格式错误"),
            (r"RUN.*command not found", "RUN 命令执行失败"),
            (r"COPY.*No such file", "COPY 源文件不存在"),
            (r"EXPOSE.*invalid port", "EXPOSE 端口格式错误"),
            (r"ENV.*invalid format", "ENV 环境变量格式错误"),
        )

        return [
            ErrorInfo(
                error_type=ErrorType.DOCKERFILE_SYNTAX,
                message=label,
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "dockerfile_syntax"}
            )
            for regex, label in signatures
            if re.search(regex, logs, re.IGNORECASE)
        ]
335
+
336
class DependencyErrorAnalyzer(ContextAnalyzer):
    """Detects Python (pip) and Node.js (npm) dependency problems in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return one DEPENDENCY_INSTALL error per signature found (case-insensitive)."""
        signatures = (
            # Python dependency problems
            (r"pip.*Requirement already satisfied", "依赖重复安装"),
            (r"pip.*Could not find.*version", "依赖版本不存在"),
            (r"pip.*incompatible dependencies", "依赖版本冲突"),
            # Node.js dependency problems
            (r"npm.*peer dependency", "peer 依赖问题"),
            (r"npm.*version mismatch", "版本不匹配"),
            (r"npm.*cache problem", "npm 缓存问题"),
        )

        return [
            ErrorInfo(
                error_type=ErrorType.DEPENDENCY_INSTALL,
                message=label,
                log_snippet="",
                confidence=0.75,
                context={"analysis_type": "dependency"}
            )
            for regex, label in signatures
            if re.search(regex, logs, re.IGNORECASE)
        ]
370
+
371
class EnvironmentErrorAnalyzer(ContextAnalyzer):
    """Detects environment configuration problems in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return a single ENVIRONMENT_CONFIG error when a PATH problem is found."""
        if not re.search(r"PATH.*not found", logs, re.IGNORECASE):
            return []

        return [
            ErrorInfo(
                error_type=ErrorType.ENVIRONMENT_CONFIG,
                message="PATH 环境变量配置问题",
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "environment", "var_type": "PATH"}
            )
        ]
389
+
390
class PortErrorAnalyzer(ContextAnalyzer):
    """Detects problems involving port 7860 in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return a PORT_CONFLICT error when the logs mention port 7860 and a failure.

        Both conditions are checked against the whole log text, not the
        same line.
        """
        mentions_port = re.search(r"port.*7860", logs, re.IGNORECASE)
        if not mentions_port:
            return []
        if not re.search(r"error|failed", logs, re.IGNORECASE):
            return []

        return [
            ErrorInfo(
                error_type=ErrorType.PORT_CONFLICT,
                message="HuggingFace Spaces 默认端口 7860 问题",
                log_snippet="",
                confidence=0.9,
                context={"analysis_type": "port", "port": "7860"}
            )
        ]
408
+
409
class PermissionErrorAnalyzer(ContextAnalyzer):
    """Detects permission problems on script files in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return a PERMISSION_ERROR when access to a .py/.js/.sh file was denied.

        The extensions are grouped inside the regex. Ungrouped, the
        alternation split the whole pattern, so any ".js" or ".sh" anywhere
        in the logs matched without "permission denied" being present.
        """
        errors = []

        if re.search(r"permission denied.*\.(?:py|js|sh)", logs, re.IGNORECASE):
            errors.append(ErrorInfo(
                error_type=ErrorType.PERMISSION_ERROR,
                message="脚本文件权限问题",
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "permission", "file_type": "script"}
            ))

        return errors
427
+
428
class NetworkErrorAnalyzer(ContextAnalyzer):
    """Detects connectivity problems with well-known external services."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return one NETWORK_CONNECTION error per service signature found.

        The service name is listed explicitly. Deriving it with
        ``pattern.split('.')[0]`` left the regex escape in the value
        (for example ``"github\\"``).
        """
        network_indicators = [
            (r"github\.com.*timeout", "GitHub 连接超时", "github"),
            (r"pypi\.org.*failed", "PyPI 连接失败", "pypi"),
            (r"npm\.registry.*error", "npm registry 连接错误", "npm")
        ]

        errors = []
        for pattern, description, service in network_indicators:
            if re.search(pattern, logs, re.IGNORECASE):
                errors.append(ErrorInfo(
                    error_type=ErrorType.NETWORK_CONNECTION,
                    message=description,
                    log_snippet="",
                    confidence=0.7,
                    context={"analysis_type": "network", "service": service}
                ))

        return errors
453
+
454
class TimeoutErrorAnalyzer(ContextAnalyzer):
    """Detects build, install and download timeouts in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return one TIMEOUT_ERROR per timeout signature found (case-insensitive).

        The ``operation`` context value is the part of the regex before its
        first dot.
        """
        signatures = (
            (r"build.*timeout", "构建超时"),
            (r"install.*timeout", "安装超时"),
            (r"download.*timeout", "下载超时"),
        )

        return [
            ErrorInfo(
                error_type=ErrorType.TIMEOUT_ERROR,
                message=label,
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "timeout", "operation": regex.split('.')[0]}
            )
            for regex, label in signatures
            if re.search(regex, logs, re.IGNORECASE)
        ]
479
+
480
class ResourceErrorAnalyzer(ContextAnalyzer):
    """Detects memory, disk and CPU limit problems in the logs."""

    async def analyze(self, logs: str) -> List[ErrorInfo]:
        """Return one RESOURCE_EXCEEDED error per signature found (case-insensitive).

        The ``resource_type`` context value is the part of the regex before
        its first dot.
        """
        signatures = (
            (r"memory.*limit", "内存限制"),
            (r"disk.*space", "磁盘空间不足"),
            (r"cpu.*quota", "CPU 配额限制"),
        )

        return [
            ErrorInfo(
                error_type=ErrorType.RESOURCE_EXCEEDED,
                message=label,
                log_snippet="",
                confidence=0.8,
                context={"analysis_type": "resource", "resource_type": regex.split('.')[0]}
            )
            for regex, label in signatures
            if re.search(regex, logs, re.IGNORECASE)
        ]
huggingface_client.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Spaces API 客户端实现
3
+ 负责与 HuggingFace API 的所有交互
4
+ """
5
+
6
+ import aiohttp
7
+ import asyncio
8
+ import logging
9
+ from datetime import datetime
10
+ from typing import Dict, List, Optional, Any
11
+ from dataclasses import asdict
12
+ import json
13
+
14
+ from core_system import HuggingFaceAPI, SpaceInfo, SpaceStatus, ErrorInfo
15
+
16
class HuggingFaceAPIClient(HuggingFaceAPI):
    """aiohttp-based client for the HuggingFace Spaces HTTP API.

    All request methods catch exceptions, log them and return a fallback
    value instead of raising.
    """

    def __init__(self, token: str):
        """Store the token; the HTTP session is created lazily on first use."""
        self.token = token
        self.base_url = "https://huggingface.co/api"
        self.headers = {"Authorization": f"Bearer {token}"}
        self.logger = logging.getLogger(__name__)
        self.session = None

    async def _get_session(self) -> aiohttp.ClientSession:
        """Return the shared session, creating a new one if none is usable."""
        # Also recreate after close(): a closed session cannot send requests.
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession(headers=self.headers)
        return self.session

    async def close(self) -> None:
        """Close the HTTP session, if any, so a later call can open a new one."""
        if self.session is not None:
            await self.session.close()
            self.session = None

    @staticmethod
    def _status_from_runtime(runtime_data: Dict[str, Any]) -> SpaceStatus:
        """Map the ``runtime`` section of a Space payload to a ``SpaceStatus``.

        NOTE(review): a RUNNING stage is only reported as RUNNING when the
        payload also carries ``state == 'RUNNING'``; confirm the API really
        returns a ``state`` field, otherwise running Spaces map to ERROR.
        """
        stage = runtime_data.get('stage')
        if stage == 'BUILDING':
            return SpaceStatus.BUILDING
        if stage == 'RUNNING':
            if runtime_data.get('state') == 'RUNNING':
                return SpaceStatus.RUNNING
            return SpaceStatus.ERROR
        if stage == 'STOPPED':
            return SpaceStatus.STOPPED
        return SpaceStatus.ERROR

    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the status of a Space, or UNKNOWN when it cannot be fetched."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}"

            async with session.get(url) as response:
                if response.status == 200:
                    data = await response.json()
                    # "or {}" also covers an explicit null runtime.
                    return self._status_from_runtime(data.get('runtime') or {})

                self.logger.error(f"获取 Space 状态失败: {response.status}")
                return SpaceStatus.UNKNOWN

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 状态异常: {e}")
            return SpaceStatus.UNKNOWN

    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return the Space logs as newline-joined text.

        Entries may be dicts with a ``'message'`` key or plain strings;
        anything else is skipped. On failure a string starting with
        ``"ERROR:"`` is returned instead of raising.
        """
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/logs"
            params = {"lines": lines}

            async with session.get(url, params=params) as response:
                if response.status == 200:
                    data = await response.json()
                    log_lines = []
                    for entry in data:
                        if isinstance(entry, dict) and 'message' in entry:
                            log_lines.append(entry['message'])
                        elif isinstance(entry, str):
                            log_lines.append(entry)

                    return '\n'.join(log_lines)

                error_msg = await response.text()
                self.logger.error(f"获取日志失败: {response.status} - {error_msg}")
                return f"ERROR: 无法获取日志 - {response.status}"

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 日志异常: {e}")
            return f"ERROR: 获取日志异常 - {str(e)}"

    async def trigger_rebuild(self, space_id: str) -> bool:
        """POST to the Space's restart endpoint; return True on HTTP 200."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/restart"

            async with session.post(url) as response:
                if response.status == 200:
                    self.logger.info(f"成功触发 Space {space_id} 重新构建")
                    return True

                error_msg = await response.text()
                self.logger.error(f"触发重新构建失败: {response.status} - {error_msg}")
                return False

        except Exception as e:
            self.logger.error(f"触发重新构建异常: {e}")
            return False

    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return detailed Space information.

        On any failure a minimal ``SpaceInfo`` with status UNKNOWN is
        returned instead of raising.
        """
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}"

            async with session.get(url) as response:
                if response.status != 200:
                    # Handled by the except below, which logs and falls back.
                    raise RuntimeError(f"无法获取 Space 信息: {response.status}")

                data = await response.json()

                return SpaceInfo(
                    space_id=space_id,
                    name=data.get('id', space_id),
                    repository_url=data.get('url', ''),
                    # Derived from the payload already fetched; the same URL
                    # used to be requested a second time for this.
                    current_status=self._status_from_runtime(data.get('runtime') or {}),
                    last_updated=datetime.now(),
                    dockerfile_path="Dockerfile",  # default path
                    local_path=""  # local path needs separate configuration
                )

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 信息异常: {e}")
            # Fall back to default information.
            return SpaceInfo(
                space_id=space_id,
                name=space_id,
                repository_url="",
                current_status=SpaceStatus.UNKNOWN,
                last_updated=datetime.now()
            )

    async def get_space_discussion(self, space_id: str) -> List[Dict]:
        """Return the Space's discussions, or an empty list on any failure."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/discussions"

            async with session.get(url) as response:
                if response.status == 200:
                    return await response.json()
                return []

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 讨论信息异常: {e}")
            return []

    async def get_space_runtime_info(self, space_id: str) -> Dict[str, Any]:
        """Return the Space's runtime details, or an empty dict on any failure."""
        try:
            session = await self._get_session()
            url = f"{self.base_url}/spaces/{space_id}/runtime"

            async with session.get(url) as response:
                if response.status == 200:
                    return await response.json()
                return {}

        except Exception as e:
            self.logger.error(f"获取 Space {space_id} 运行时信息异常: {e}")
            return {}
179
+
180
class HuggingFaceWebhookHandler:
    """Dispatch HuggingFace webhook payloads to per-event handlers.

    The event name is read from the payload's ``event`` key and looked up in
    ``event_handlers``. Handlers read the Space id from ``payload['space']['id']``
    and the runtime stage from ``payload['space']['runtime']['stage']``.
    """

    def __init__(self, api_client: "HuggingFaceAPIClient"):
        """Store the API client used to fetch logs and build the dispatch table."""
        self.api_client = api_client
        self.logger = logging.getLogger(__name__)
        self.event_handlers = {
            'space.status_updated': self._handle_status_update,
            'space.build_error': self._handle_build_error,
            'space.started': self._handle_space_started,
            'space.stopped': self._handle_space_stopped
        }

    @staticmethod
    def _space_section(payload: Dict[str, Any]) -> Dict[str, Any]:
        """Return the payload's ``space`` mapping, or ``{}`` if absent or not a dict.

        A payload carrying ``"space": null`` would otherwise make the chained
        ``.get`` calls raise ``AttributeError``.
        """
        space = payload.get('space')
        return space if isinstance(space, dict) else {}

    async def handle_webhook(self, payload: Dict[str, Any]) -> None:
        """Route one webhook payload to its handler.

        Unknown event types are logged as warnings. Exceptions raised by a
        handler are logged and swallowed so a bad payload cannot take down
        the caller.
        """
        try:
            event_type = payload.get('event')
            handler = self.event_handlers.get(event_type)
            if handler is None:
                self.logger.warning(f"未知的事件类型: {event_type}")
                return
            await handler(payload)

        except Exception as e:
            self.logger.error(f"处理 Webhook 事件失败: {e}")

    async def _handle_status_update(self, payload: Dict[str, Any]) -> None:
        """Log a status change and escalate to build-error handling on ``ERROR``."""
        space = self._space_section(payload)
        space_id = space.get('id')
        runtime = space.get('runtime')
        new_status = runtime.get('stage') if isinstance(runtime, dict) else None

        self.logger.info(f"Space {space_id} 状态更新为: {new_status}")

        # An ERROR stage is treated exactly like an explicit build-error event.
        if new_status == 'ERROR':
            await self._handle_build_error(payload)

    async def _handle_build_error(self, payload: Dict[str, Any]) -> None:
        """Fetch the latest 50 log lines for the failing Space.

        Payloads without a Space id are ignored with a warning instead of
        sending ``None`` to the API client.
        """
        space_id = self._space_section(payload).get('id')
        if not space_id:
            self.logger.warning("构建错误事件缺少 Space ID,已忽略")
            return

        # Fetch the error logs.
        logs = await self.api_client.get_space_logs(space_id, lines=50)
        self.logger.debug(f"已获取 Space {space_id} 的错误日志 ({len(logs or '')} 字符)")

        # TODO: hand `logs` to the error analyzer once that integration exists.

    async def _handle_space_started(self, payload: Dict[str, Any]) -> None:
        """Log that a Space has started."""
        space_id = self._space_section(payload).get('id')
        self.logger.info(f"Space {space_id} 启动成功")

    async def _handle_space_stopped(self, payload: Dict[str, Any]) -> None:
        """Log that a Space has stopped."""
        space_id = self._space_section(payload).get('id')
        self.logger.info(f"Space {space_id} 已停止")
235
+
236
class RateLimiter:
    """Sliding-window limiter for outgoing API requests.

    At most ``max_requests`` calls to :meth:`acquire` are admitted within any
    ``window_seconds`` interval; further callers wait until the oldest
    admitted request leaves the window.
    """

    def __init__(self, max_requests_per_minute: int = 60, window_seconds: float = 60.0):
        """Create a limiter.

        Args:
            max_requests_per_minute: Requests admitted per window. The name
                reflects the default 60-second window.
            window_seconds: Length of the sliding window in seconds.

        Raises:
            ValueError: If either argument is not positive. A limit of zero
                could never admit a request.
        """
        if max_requests_per_minute < 1:
            raise ValueError("max_requests_per_minute must be at least 1")
        if window_seconds <= 0:
            raise ValueError("window_seconds must be positive")

        self.max_requests = max_requests_per_minute
        self.window_seconds = window_seconds
        # Admission timestamps from the event loop's monotonic clock, oldest first.
        self.requests = []
        self.lock = asyncio.Lock()

    async def acquire(self) -> None:
        """Wait until a request slot is free, then claim it.

        The lock is held while waiting, so concurrent callers are admitted
        one at a time in the order they obtained the lock.
        """
        loop = asyncio.get_running_loop()
        async with self.lock:
            while True:
                # loop.time() is monotonic, so wall-clock adjustments cannot
                # shrink or stretch the window.
                now = loop.time()
                cutoff = now - self.window_seconds
                self.requests = [stamp for stamp in self.requests if stamp > cutoff]

                if len(self.requests) < self.max_requests:
                    break

                # Sleep until the oldest request expires, then re-check: the
                # timer may fire marginally early.
                await asyncio.sleep(self.requests[0] + self.window_seconds - now)

            # Record the time the slot was actually granted (after any wait),
            # not the time the caller started waiting.
            self.requests.append(now)
262
+
263
class HuggingFaceAPIClientWithRateLimit(HuggingFaceAPIClient):
    """HuggingFace API client whose requests pass through a ``RateLimiter``.

    Each override waits for a limiter slot and then calls the inherited
    implementation via ``super()``. Because calls stay on this instance, the
    nested ``get_space_status`` request made inside ``get_space_info`` is
    throttled as well, and only one underlying client is ever created.
    """

    def __init__(self, token: str, rate_limit: int = 60):
        """Create the client.

        Args:
            token: API token forwarded to ``HuggingFaceAPIClient``.
            rate_limit: Maximum number of requests per minute.
        """
        super().__init__(token)
        self.rate_limiter = RateLimiter(rate_limit)
        # Kept only for backward compatibility with code that reads
        # ``base_client``; it aliases this instance instead of holding a
        # second, unthrottled client.
        self.base_client = self

    async def get_space_status(self, space_id: str) -> SpaceStatus:
        """Return the Space status (rate limited)."""
        await self.rate_limiter.acquire()
        return await super().get_space_status(space_id)

    async def get_space_logs(self, space_id: str, lines: int = 100) -> str:
        """Return the Space logs (rate limited)."""
        await self.rate_limiter.acquire()
        return await super().get_space_logs(space_id, lines)

    async def trigger_rebuild(self, space_id: str) -> bool:
        """Trigger a rebuild of the Space (rate limited)."""
        await self.rate_limiter.acquire()
        return await super().trigger_rebuild(space_id)

    async def get_space_info(self, space_id: str) -> SpaceInfo:
        """Return detailed Space information (rate limited).

        The inherited implementation also calls ``get_space_status``, which
        resolves to the throttled override above and takes its own slot.
        """
        await self.rate_limiter.acquire()
        return await super().get_space_info(space_id)

    async def get_space_discussion(self, space_id: str) -> List[Dict]:
        """Return the Space discussions (rate limited)."""
        await self.rate_limiter.acquire()
        return await super().get_space_discussion(space_id)

    async def get_space_runtime_info(self, space_id: str) -> Dict[str, Any]:
        """Return the Space runtime details (rate limited)."""
        await self.rate_limiter.acquire()
        return await super().get_space_runtime_info(space_id)
usage_examples.py ADDED
@@ -0,0 +1,457 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 使用示例和最佳实践
3
+ 展示系统的基本使用流程和高级功能
4
+ """
5
+
6
+ import asyncio
7
+ import logging
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any
10
+
11
+ from core_system import AutoRepairSystem, SpaceStatus, ErrorType
12
+ from huggingface_client import HuggingFaceAPIClient
13
+ from error_analyzer import IntelligentErrorAnalyzer
14
+
15
+ # ============================================================================
16
+ # 基本使用示例
17
+ # ============================================================================
18
+
19
async def basic_usage_example():
    """Basic usage: start the auto-repair system for a fixed list of Spaces.

    Builds an ``AutoRepairSystem`` from ``config.json`` and runs it until it
    returns or is interrupted. On interruption the monitor is stopped before
    the interruption is propagated.
    """

    # 1. Initialise the system.
    system = AutoRepairSystem("config.json")

    # 2. Spaces to monitor.
    space_ids = [
        "your-username/space-1",
        "your-username/space-2",
        "your-username/space-3"
    ]

    print(f"开始监控 {len(space_ids)} 个 Space...")

    try:
        # 3. Start the system.
        await system.start(space_ids)
    except (KeyboardInterrupt, asyncio.CancelledError):
        # Under asyncio.run() on Python 3.11+, Ctrl-C reaches a coroutine as
        # task cancellation rather than KeyboardInterrupt, so both must be
        # handled for the monitor to be stopped.
        print("\n停止监控...")
        system.monitor.stop()
        # Re-raise so the event loop sees the cancellation/interrupt.
        raise
40
+
41
+ # ============================================================================
42
+ # 高级使用示例
43
+ # ============================================================================
44
+
45
class AdvancedUsageExample:
    """Advanced usage: drive status checks, log analysis and repair by hand."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def custom_monitoring_workflow(self, space_ids: List[str]) -> None:
        """Check each Space and analyse the logs of those in the ERROR state.

        Detected errors with confidence above 0.8 go to the high-confidence
        handler; all others go to the low-confidence handler.
        """

        # Initialise the components.
        hf_client = HuggingFaceAPIClient("your_token_here")
        error_analyzer = IntelligentErrorAnalyzer()

        for space_id in space_ids:
            # 1. Check the status.
            status = await hf_client.get_space_status(space_id)
            print(f"Space {space_id}: {status.value}")

            # 2. Only failing Spaces need log analysis.
            if status != SpaceStatus.ERROR:
                continue

            logs = await hf_client.get_space_logs(space_id, lines=100)
            errors = await error_analyzer.analyze_logs(logs)

            # 3. Classify and handle each error.
            for error in errors:
                if error.confidence > 0.8:
                    await self._handle_high_confidence_error(space_id, error)
                else:
                    await self._handle_low_confidence_error(space_id, error)

    async def _handle_high_confidence_error(self, space_id: str, error) -> None:
        """Dispatch a high-confidence error to the matching fixer."""
        print(f"高置信度错误 {space_id}: {error.error_type.value}")

        if error.error_type == ErrorType.DEPENDENCY_INSTALL:
            await self._fix_dependency_error(space_id, error)
        elif error.error_type == ErrorType.DOCKERFILE_SYNTAX:
            await self._fix_dockerfile_error(space_id, error)
        # ... handling for other error types goes here

    async def _handle_low_confidence_error(self, space_id: str, error) -> None:
        """Report a low-confidence error without attempting an automatic fix.

        This handler is called by ``custom_monitoring_workflow`` but was
        missing, which raised ``AttributeError`` on the first such error.
        """
        print(
            f"低置信度错误 {space_id}: {error.error_type.value} "
            f"(置信度: {error.confidence}),需要人工确认"
        )

    async def _fix_dependency_error(self, space_id: str, error) -> None:
        """Fix a dependency installation error (placeholder)."""
        print(f"修复 {space_id} 的依赖错误...")

        # Concrete repair logic goes here:
        # 1. Determine the dependency type (Python/Node.js)
        # 2. Try a different package index
        # 3. Adjust the version pin
        # 4. Reinstall the dependencies

    async def _fix_dockerfile_error(self, space_id: str, error) -> None:
        """Fix a Dockerfile error (placeholder)."""
        print(f"修复 {space_id} 的 Dockerfile 错误...")

        # Concrete repair logic goes here:
        # 1. Locate the offending line
        # 2. Correct the syntax
        # 3. Optimise the command structure
103
+
104
+ # ============================================================================
105
+ # 批量处理示例
106
+ # ============================================================================
107
+
108
+ class BatchProcessingExample:
109
+ """批量处理示例"""
110
+
111
+ def __init__(self):
112
+ self.logger = logging.getLogger(__name__)
113
+
114
+ async def batch_monitor_spaces(self, space_configs: List[Dict[str, Any]]) -> None:
115
+ """批量监控 Spaces"""
116
+
117
+ tasks = []
118
+ for config in space_configs:
119
+ task = self._monitor_single_space(config)
120
+ tasks.append(task)
121
+
122
+ await asyncio.gather(*tasks, return_exceptions=True)
123
+
124
+ async def _monitor_single_space(self, config: Dict[str, Any]) -> None:
125
+ """监控单个 Space"""
126
+ space_id = config['space_id']
127
+ monitoring_interval = config.get('interval', 60)
128
+ max_retries = config.get('max_retries', 3)
129
+
130
+ retry_count = 0
131
+ while retry_count < max_retries:
132
+ try:
133
+ # 监控逻辑
134
+ status = await self._check_space_status(space_id)
135
+
136
+ if status != SpaceStatus.ERROR:
137
+ break
138
+
139
+ retry_count += 1
140
+ if retry_count < max_retries:
141
+ await asyncio.sleep(monitoring_interval)
142
+
143
+ except Exception as e:
144
+ self.logger.error(f"监控 {space_id} 失败: {e}")
145
+ break
146
+
147
+ async def _check_space_status(self, space_id: str) -> SpaceStatus:
148
+ """检查 Space 状态"""
149
+ # 实现状态检查逻辑
150
+ pass
151
+
152
+ # ============================================================================
153
+ # 自定义错误分析示例
154
+ # ============================================================================
155
+
156
class CustomErrorAnalyzer:
    """Custom error analyzer combining regex rules with an optional ML hook."""

    # Score assigned to each severity; anything else gets _DEFAULT_SCORE.
    _SEVERITY_SCORES = {'high': 0.9, 'medium': 0.7}
    _DEFAULT_SCORE = 0.5

    def __init__(self):
        self.custom_patterns = self._load_custom_patterns()

    async def analyze_with_custom_rules(self, logs: str) -> List[Dict]:
        """Analyse ``logs`` and return findings sorted by descending score.

        Each finding is a dict with ``type``, ``matches``, ``severity``,
        ``suggested_fix`` and ``score`` keys. An empty list means nothing
        matched.
        """
        # 1. Apply the custom patterns.
        results = []
        for pattern in self.custom_patterns:
            matches = pattern['regex'].findall(logs)
            if matches:
                results.append({
                    'type': pattern['type'],
                    'matches': matches,
                    'severity': pattern['severity'],
                    'suggested_fix': pattern['fix']
                })

        # 2. Apply the machine-learning model (if available).
        results.extend(await self._ml_analysis(logs))

        # 3. Score and rank.
        return self._score_results(results)

    def _load_custom_patterns(self) -> List[Dict]:
        """Return the built-in custom error patterns with compiled regexes."""
        # Imported here because this module does not import `re` at the top.
        import re

        return [
            {
                'name': 'Custom GPU Error',
                'regex': re.compile(r'GPU.*out of memory|CUDA.*error'),
                'type': 'gpu_error',
                'severity': 'high',
                'fix': '减少批处理大小或使用更小的模型'
            },
            {
                'name': 'Custom Timeout Pattern',
                # Non-greedy `.*?` before the group: a greedy `.*` swallows
                # all but the last digit, so "5000ms" would capture "0".
                'regex': re.compile(r'operation.*timeout.*after.*?(\d+)ms'),
                'type': 'custom_timeout',
                'severity': 'medium',
                'fix': '增加超时设置或优化性能'
            }
        ]

    async def _ml_analysis(self, logs: str) -> List[Dict]:
        """Machine-learning analysis hook; currently returns no findings."""
        # A pre-trained error classification model can be integrated here.
        return []

    def _score_results(self, results: List[Dict]) -> List[Dict]:
        """Attach a ``score`` to every result (in place) and sort high-to-low."""
        for result in results:
            result['score'] = self._SEVERITY_SCORES.get(
                result['severity'], self._DEFAULT_SCORE
            )

        return sorted(results, key=lambda item: item['score'], reverse=True)
222
+
223
+ # ============================================================================
224
+ # Webhook 集成示例
225
+ # ============================================================================
226
+
227
class WebhookIntegrationExample:
    """Webhook integration: receive HuggingFace events over HTTP."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def setup_webhook_server(self) -> None:
        """Serve ``POST /webhook/huggingface`` on 0.0.0.0:8000 until stopped.

        Requires ``fastapi`` and ``uvicorn``, imported lazily so the other
        examples work without them.
        """

        from fastapi import FastAPI, Request
        import uvicorn

        app = FastAPI()

        # Event name -> coroutine handler; unlisted events are acknowledged
        # and ignored.
        handlers = {
            'space.status_updated': self._handle_status_update,
            'space.build_error': self._handle_build_error,
            'space.started': self._handle_space_started,
        }

        @app.post("/webhook/huggingface")
        async def handle_hf_webhook(request: Request):
            payload = await request.json()

            handler = handlers.get(payload.get('event'))
            if handler is not None:
                await handler(payload)

            return {"status": "ok"}

        # Start the server.
        config = uvicorn.Config(app, host="0.0.0.0", port=8000)
        server = uvicorn.Server(config)
        await server.serve()

    async def _handle_status_update(self, payload: Dict) -> None:
        """Log a status change and start the repair workflow on ``ERROR``."""
        space_id = payload.get('space', {}).get('id')
        new_status = payload.get('space', {}).get('runtime', {}).get('stage')

        self.logger.info(f"Space {space_id} 状态更新: {new_status}")

        if new_status == 'ERROR':
            await self._trigger_repair_workflow(space_id)

    async def _handle_build_error(self, payload: Dict) -> None:
        """Start the repair workflow for a Space that reported a build error.

        The route dispatches to this handler, but it was missing, so every
        ``space.build_error`` event raised ``AttributeError``.
        """
        space_id = payload.get('space', {}).get('id')
        self.logger.info(f"Space {space_id} 构建失败")
        await self._trigger_repair_workflow(space_id)

    async def _handle_space_started(self, payload: Dict) -> None:
        """Log that a Space has started (this handler was previously missing)."""
        space_id = payload.get('space', {}).get('id')
        self.logger.info(f"Space {space_id} 启动成功")

    async def _trigger_repair_workflow(self, space_id: str) -> None:
        """Start the repair workflow for ``space_id`` (placeholder, no-op)."""
        # Implement the repair workflow here.
        pass
277
+
278
+ # ============================================================================
279
+ # 测试和调试示例
280
+ # ============================================================================
281
+
282
class TestingExample:
    """Testing and debugging examples."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def test_error_analysis(self) -> None:
        """Run the error analyzer on a canned pip failure log and print the findings."""

        # Simulated build log.
        sample_logs = """
        ERROR: Could not find a version that satisfies the requirement torch==2.0.0
        ERROR: No matching distribution found for torch==2.0.0
        Build failed
        """

        detected = await IntelligentErrorAnalyzer().analyze_logs(sample_logs)

        print(f"检测到 {len(detected)} 个错误:")
        for item in detected:
            print(f"- {item.error_type.value}: {item.message}")
            print(f" 置信度: {item.confidence}")

    async def test_repair_strategies(self) -> None:
        """Ask the repair engine for a strategy per sample error and print it."""

        # Imported lazily: only this example needs these names.
        from core_system import SmartRepairEngine, ErrorInfo, SpaceInfo

        engine = SmartRepairEngine()

        # One sample per error type under test.
        dependency_failure = ErrorInfo(
            error_type=ErrorType.DEPENDENCY_INSTALL,
            message="pip install failed",
            log_snippet="ERROR: Could not find torch",
            confidence=0.9
        )
        dockerfile_failure = ErrorInfo(
            error_type=ErrorType.DOCKERFILE_SYNTAX,
            message="Dockerfile syntax error",
            log_snippet="failed to solve: syntax error",
            confidence=0.85
        )

        target = SpaceInfo(
            space_id="test/space",
            name="Test Space",
            repository_url="",
            current_status=SpaceStatus.ERROR,
            last_updated=datetime.now()
        )

        for sample in (dependency_failure, dockerfile_failure):
            strategy = await engine.generate_strategy(sample, target)
            if not strategy:
                continue
            print(f"修复策略: {strategy.action.value}")
            print(f"描述: {strategy.description}")
            print(f"成功率: {strategy.success_rate}")
            print(f"风险等级: {strategy.risk_level}")
            print()
345
+
346
+ # ============================================================================
347
+ # 性能监控示例
348
+ # ============================================================================
349
+
350
class PerformanceMonitoringExample:
    """Performance monitoring: sample metrics periodically and flag anomalies.

    The ``_get_*`` probes are placeholders returning zero; override them to
    report real measurements.
    """

    def __init__(self):
        self.metrics = {}
        # Used by _handle_anomalies; it was never initialised, so the first
        # anomaly raised AttributeError.
        self.logger = logging.getLogger(__name__)

    async def monitor_system_performance(self) -> None:
        """Collect, store and check metrics once a minute, forever."""

        while True:
            current_metrics = await self._collect_metrics()

            self._store_metrics(current_metrics)

            anomalies = self._detect_anomalies(current_metrics)
            if anomalies:
                await self._handle_anomalies(anomalies)

            await asyncio.sleep(60)  # check once per minute

    async def _collect_metrics(self) -> Dict[str, Any]:
        """Return one snapshot of all metrics, stamped with the current time."""
        return {
            'timestamp': datetime.now(),
            'cpu_usage': self._get_cpu_usage(),
            'memory_usage': self._get_memory_usage(),
            'active_repairs': self._get_active_repairs(),
            'queue_size': self._get_queue_size(),
            'error_rate': self._get_error_rate()
        }

    def _get_cpu_usage(self) -> float:
        """Return CPU usage in percent (placeholder: 0.0)."""
        return 0.0

    def _get_memory_usage(self) -> float:
        """Return memory usage in percent (placeholder: 0.0)."""
        return 0.0

    def _get_active_repairs(self) -> int:
        """Return the number of repairs in progress (placeholder: 0)."""
        return 0

    def _get_queue_size(self) -> int:
        """Return the number of queued repairs (placeholder: 0)."""
        return 0

    def _get_error_rate(self) -> float:
        """Return the error rate as a fraction (placeholder: 0.0)."""
        return 0.0

    def _store_metrics(self, metrics: Dict[str, Any]) -> None:
        """Persist a metrics snapshot (placeholder, no-op)."""
        # Store to a database or time-series database here.
        pass

    def _detect_anomalies(self, metrics: Dict[str, Any]) -> List[str]:
        """Return a message for each metric above its threshold.

        Thresholds: CPU above 80 %, memory above 90 %, error rate above 0.1.
        """
        anomalies = []

        if metrics['cpu_usage'] > 80:
            anomalies.append(f"CPU 使用率过高: {metrics['cpu_usage']}%")

        if metrics['memory_usage'] > 90:
            anomalies.append(f"内存使用率过高: {metrics['memory_usage']}%")

        if metrics['error_rate'] > 0.1:
            anomalies.append(f"错误率过高: {metrics['error_rate']}")

        return anomalies

    async def _handle_anomalies(self, anomalies: List[str]) -> None:
        """Log every anomaly as a warning."""
        for anomaly in anomalies:
            self.logger.warning(f"性能异常: {anomaly}")
            # Send an alert or adjust automatically here.
410
+
411
+ # ============================================================================
412
+ # 主程序示例
413
+ # ============================================================================
414
+
415
async def main():
    """Interactive entry point: list the examples and run the chosen one.

    Reads the choice from standard input. An interruption of the running
    example ends it with a message; any other exception is reported and
    swallowed so the program exits normally.
    """
    print("HuggingFace Spaces 自动修复系统示例")
    print("=" * 50)

    # Menu key -> (description, zero-argument callable returning a coroutine).
    examples = {
        "1": ("基本使用", basic_usage_example),
        "2": ("高级使用", lambda: AdvancedUsageExample().custom_monitoring_workflow(
            ["user/space1", "user/space2"]
        )),
        "3": ("测试错误分析", lambda: TestingExample().test_error_analysis()),
        "4": ("性能监控", lambda: PerformanceMonitoringExample().monitor_system_performance()),
        "5": ("Webhook 服务器", lambda: WebhookIntegrationExample().setup_webhook_server())
    }

    print("请选择要运行的示例:")
    for key, (desc, _) in examples.items():
        print(f"{key}. {desc}")

    choice = input("请输入选择 (1-5): ").strip()

    if choice not in examples:
        print("无效的选择")
        return

    desc, func = examples[choice]
    print(f"\n运行: {desc}")
    try:
        await func()
    except (KeyboardInterrupt, asyncio.CancelledError):
        # Under asyncio.run() on Python 3.11+, Ctrl-C cancels this task, so
        # the interruption arrives as CancelledError rather than
        # KeyboardInterrupt. This is the top-level coroutine, so ending
        # quietly here is the intended shutdown.
        print("\n程序被用户中断")
    except Exception as e:
        print(f"运行出错: {e}")
448
+
449
if __name__ == "__main__":
    # Configure logging.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Run the main program. Ctrl-C can surface from asyncio.run() as
    # KeyboardInterrupt (for example while blocked in input(), or after the
    # main task was cancelled); exit with a message instead of a traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n程序被用户中断")