Spaces:
Build error
Build error
File size: 23,849 Bytes
14f6b4f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 | """
修复循环引擎
控制修复循环的启动、暂停、停止,包含条件判断和超时机制
"""
import asyncio
import logging
from typing import Dict, List, Optional, Any, Callable, Set, Tuple
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
import threading
import time
from pathlib import Path
from data_models import SpaceInfo, ErrorInfo, RepairStrategy, SpaceStatus
from auto_repair_executor import AutoRepairExecutor
class LoopState(Enum):
"""循环状态"""
STOPPED = "stopped"
STARTING = "starting"
RUNNING = "running"
PAUSING = "pausing"
PAUSED = "paused"
STOPPING = "stopping"
ERROR = "error"
class TerminationReason(Enum):
"""终止原因"""
MANUAL = "manual"
SUCCESS = "success"
TIMEOUT = "timeout"
MAX_ITERATIONS = "max_iterations"
ERROR = "error"
NO_PROGRESS = "no_progress"
RESOURCE_EXHAUSTED = "resource_exhausted"
@dataclass
class LoopConfig:
"""循环配置"""
max_iterations: int = 10 # 最大迭代次数
timeout_minutes: int = 60 # 超时时间(分钟)
check_interval_seconds: int = 30 # 检查间隔(秒)
success_wait_seconds: int = 60 # 成功后等待时间
failure_wait_seconds: int = 120 # 失败后等待时间
enable_progress_check: bool = True # 启用进度检查
no_progress_timeout_minutes: int = 15 # 无进度超时(分钟)
max_concurrent_repairs: int = 3 # 最大并发修复数
@dataclass
class LoopStatistics:
"""循环统计"""
start_time: datetime
iterations: int = 0
successful_repairs: int = 0
failed_repairs: int = 0
total_repair_time: float = 0.0
current_iteration_start: Optional[datetime] = None
last_successful_repair: Optional[datetime] = None
last_error: Optional[str] = None
termination_reason: Optional[TerminationReason] = None
class ConditionEvaluator:
"""条件评估器"""
def __init__(self):
self.logger = logging.getLogger(__name__)
def should_continue_loop(self, stats: LoopStatistics, config: LoopConfig) -> Tuple[bool, Optional[str]]:
"""判断是否应该继续循环"""
# 检查最大迭代次数
if stats.iterations >= config.max_iterations:
return False, f"达到最大迭代次数: {config.max_iterations}"
# 检查超时
elapsed_time = (datetime.now() - stats.start_time).total_seconds()
timeout_seconds = config.timeout_minutes * 60
if elapsed_time >= timeout_seconds:
return False, f"循环超时: {config.timeout_minutes} 分钟"
# 检查进度
if config.enable_progress_check and self._check_no_progress(stats, config):
return False, f"长期无进展: {config.no_progress_timeout_minutes} 分钟"
return True, None
def _check_no_progress(self, stats: LoopStatistics, config: LoopConfig) -> bool:
"""检查是否有进展"""
if not stats.last_successful_repair:
return stats.iterations > 3 # 前3次给机会
no_progress_time = (datetime.now() - stats.last_successful_repair).total_seconds()
timeout_seconds = config.no_progress_timeout_minutes * 60
return no_progress_time >= timeout_seconds
def should_attempt_repair(self, space_info: SpaceInfo, last_status: Optional[SpaceStatus]) -> bool:
"""判断是否应该尝试修复"""
# 如果当前状态不是错误,不需要修复
if space_info.current_status != SpaceStatus.ERROR:
return False
# 如果上次状态也是错误,可能还在处理中
if last_status == SpaceStatus.ERROR:
return False
return True
def evaluate_repair_success(self, previous_status: SpaceStatus, current_status: SpaceStatus,
error_before: Optional[ErrorInfo], error_after: Optional[ErrorInfo]) -> bool:
"""评估修复是否成功"""
# 状态从错误变为非错误
if previous_status == SpaceStatus.ERROR and current_status != SpaceStatus.ERROR:
return True
# 错误信息减少或消失
if error_before and not error_after:
return True
if error_before and error_after:
# 错误类型改变,可能有问题
if error_before.error_type != error_after.error_type:
return False
# 置信度降低,可能有问题
if error_after.confidence < error_before.confidence * 0.5:
return False
return False
def calculate_wait_time(self, repair_success: bool, config: LoopConfig) -> int:
"""计算等待时间"""
if repair_success:
return config.success_wait_seconds
else:
return config.failure_wait_seconds
class TimeoutManager:
"""超时管理器"""
def __init__(self):
self.logger = logging.getLogger(__name__)
self.timeouts: Dict[str, datetime] = {}
def set_timeout(self, key: str, timeout_seconds: int) -> None:
"""设置超时"""
expire_time = datetime.now() + timedelta(seconds=timeout_seconds)
self.timeouts[key] = expire_time
self.logger.debug(f"设置超时: {key} - {timeout_seconds} 秒")
def is_expired(self, key: str) -> bool:
"""检查是否超时"""
if key not in self.timeouts:
return True
return datetime.now() > self.timeouts[key]
def get_remaining_time(self, key: str) -> Optional[float]:
"""获取剩余时间"""
if key not in self.timeouts:
return None
remaining = (self.timeouts[key] - datetime.now()).total_seconds()
return max(0, remaining)
def cancel_timeout(self, key: str) -> None:
"""取消超时"""
if key in self.timeouts:
del self.timeouts[key]
self.logger.debug(f"取消超时: {key}")
def cleanup_expired(self) -> None:
"""清理过期的超时"""
current_time = datetime.now()
expired_keys = [
key for key, expire_time in self.timeouts.items()
if current_time > expire_time
]
for key in expired_keys:
del self.timeouts[key]
self.logger.debug(f"清理过期超时: {key}")
class LoopController:
"""循环控制器"""
def __init__(self, config: LoopConfig):
self.logger = logging.getLogger(__name__)
self.config = config
self.state = LoopState.STOPPED
self.stats = None
self.condition_evaluator = ConditionEvaluator()
self.timeout_manager = TimeoutManager()
# 控制标志
self._stop_requested = threading.Event()
self._pause_requested = threading.Event()
self._lock = threading.Lock()
# 回调函数
self.on_iteration_start: Optional[Callable] = None
self.on_iteration_complete: Optional[Callable] = None
self.on_loop_complete: Optional[Callable] = None
self.on_error: Optional[Callable] = None
async def start_loop(self) -> None:
"""启动循环"""
with self._lock:
if self.state != LoopState.STOPPED:
raise RuntimeError(f"循环已在运行或正在启动: {self.state.value}")
self.state = LoopState.STARTING
self._stop_requested.clear()
self._pause_requested.clear()
try:
await self._run_loop()
except Exception as e:
with self._lock:
self.state = LoopState.ERROR
self.logger.error(f"循环运行异常: {e}")
if self.on_error:
await self._safe_call(self.on_error, e)
async def _run_loop(self) -> None:
"""运行主循环"""
self.stats = LoopStatistics(start_time=datetime.now())
with self._lock:
self.state = LoopState.RUNNING
self.logger.info("修复循环已启动")
try:
while True:
# 检查停止请求
if self._stop_requested.is_set():
self.logger.info("收到停止请求")
break
# 检查暂停请求
if self._pause_requested.is_set():
with self._lock:
self.state = LoopState.PAUSED
self.logger.info("循环已暂停")
await self._wait_for_resume()
with self._lock:
self.state = LoopState.RUNNING
self.logger.info("循环已恢复")
continue
# 执行一次迭代
iteration_result = await self._execute_iteration()
if not iteration_result.continue_loop:
self.stats.termination_reason = iteration_result.termination_reason
break
# 等待下一次迭代
wait_time = iteration_result.wait_time
if wait_time > 0:
await asyncio.sleep(wait_time)
finally:
with self._lock:
self.state = LoopState.STOPPED
self.logger.info("修复循环已停止")
if self.on_loop_complete:
await self._safe_call(self.on_loop_complete, self.stats)
async def _execute_iteration(self) -> Any:
"""执行一次迭代"""
self.stats.iterations += 1
self.stats.current_iteration_start = datetime.now()
# 调用迭代开始回调
if self.on_iteration_start:
await self._safe_call(self.on_iteration_start, self.stats)
try:
# 判断是否应该继续循环
should_continue, reason = self.condition_evaluator.should_continue_loop(self.stats, self.config)
if not should_continue:
termination_reason = self._determine_termination_reason(reason)
self.logger.info(f"循环终止: {reason}")
return IterationResult(continue_loop=False, termination_reason=termination_reason, wait_time=0)
# 执行修复逻辑(这里需要实际实现)
repair_result = await self._attempt_repair()
# 更新统计信息
if repair_result.success:
self.stats.successful_repairs += 1
self.stats.last_successful_repair = datetime.now()
wait_time = self.condition_evaluator.calculate_wait_time(True, self.config)
else:
self.stats.failed_repairs += 1
self.stats.last_error = repair_result.error_message
wait_time = self.condition_evaluator.calculate_wait_time(False, self.config)
# 更新总修复时间
iteration_time = (datetime.now() - self.stats.current_iteration_start).total_seconds()
self.stats.total_repair_time += iteration_time
return IterationResult(
continue_loop=True,
termination_reason=None,
wait_time=wait_time,
repair_success=repair_result.success
)
except Exception as e:
self.stats.failed_repairs += 1
self.stats.last_error = str(e)
self.logger.error(f"迭代执行异常: {e}")
return IterationResult(
continue_loop=True,
termination_reason=None,
wait_time=self.config.failure_wait_seconds,
repair_success=False,
error_message=str(e)
)
finally:
# 调用迭代完成回调
if self.on_iteration_complete:
await self._safe_call(self.on_iteration_complete, self.stats)
async def _attempt_repair(self) -> Any:
"""尝试修复(需要实际实现)"""
# 这里应该调用实际的修复逻辑
# 目前返回示例结果
return RepairResult(success=False, error_message="需要实现具体修复逻辑")
def _determine_termination_reason(self, reason: str) -> TerminationReason:
"""确定终止原因"""
if "迭代次数" in reason:
return TerminationReason.MAX_ITERATIONS
elif "超时" in reason:
return TerminationReason.TIMEOUT
elif "无进展" in reason:
return TerminationReason.NO_PROGRESS
elif "资源" in reason:
return TerminationReason.RESOURCE_EXHAUSTED
else:
return TerminationReason.SUCCESS
async def _wait_for_resume(self) -> None:
"""等待恢复"""
while self._pause_requested.is_set() and not self._stop_requested.is_set():
await asyncio.sleep(1)
async def _safe_call(self, callback: Callable, *args) -> None:
"""安全调用回调函数"""
try:
if asyncio.iscoroutinefunction(callback):
await callback(*args)
else:
callback(*args)
except Exception as e:
self.logger.error(f"回调函数执行异常: {e}")
def stop(self) -> None:
"""停止循环"""
self._stop_requested.set()
self.logger.info("请求停止循环")
def pause(self) -> None:
"""暂停循环"""
self._pause_requested.set()
self.logger.info("请求暂停循环")
def resume(self) -> None:
"""恢复循环"""
self._pause_requested.clear()
self.logger.info("请求恢复循环")
def get_state(self) -> LoopState:
"""获取当前状态"""
return self.state
def get_statistics(self) -> Optional[LoopStatistics]:
"""获取统计信息"""
return self.stats
@dataclass
class IterationResult:
"""迭代结果"""
continue_loop: bool
termination_reason: Optional[TerminationReason]
wait_time: int
repair_success: Optional[bool] = None
error_message: Optional[str] = None
@dataclass
class RepairResult:
"""修复结果"""
success: bool
error_message: Optional[str] = None
commit_sha: Optional[str] = None
repair_time: Optional[float] = None
class RepairLoopEngine:
"""修复循环引擎主类"""
def __init__(self, repair_executor: AutoRepairExecutor, config: LoopConfig):
self.logger = logging.getLogger(__name__)
self.repair_executor = repair_executor
self.config = config
# 循环控制器
self.controller = LoopController(config)
# 监控的 Spaces
self.monitored_spaces: Dict[str, SpaceInfo] = {}
self.space_errors: Dict[str, ErrorInfo] = {}
self.last_space_status: Dict[str, SpaceStatus] = {}
# 设置回调
self._setup_callbacks()
# 并发控制
self.active_repairs: Set[str] = set()
self.repair_lock = asyncio.Lock()
def _setup_callbacks(self) -> None:
"""设置回调函数"""
self.controller.on_iteration_start = self._on_iteration_start
self.controller.on_iteration_complete = self._on_iteration_complete
self.controller.on_loop_complete = self._on_loop_complete
self.controller.on_error = self._on_error
async def _on_iteration_start(self, stats: LoopStatistics) -> None:
"""迭代开始回调"""
self.logger.info(f"开始第 {stats.iterations} 次迭代")
async def _on_iteration_complete(self, stats: LoopStatistics) -> None:
"""迭代完成回调"""
success_rate = stats.successful_repairs / max(stats.iterations, 1) * 100
avg_time = stats.total_repair_time / max(stats.iterations, 1)
self.logger.info(
f"迭代 {stats.iterations} 完成 - "
f"成功率: {success_rate:.1f}%, "
f"平均时间: {avg_time:.1f}秒"
)
async def _on_loop_complete(self, stats: LoopStatistics) -> None:
"""循环完成回调"""
total_time = (datetime.now() - stats.start_time).total_seconds()
success_rate = stats.successful_repairs / max(stats.iterations, 1) * 100
self.logger.info(
f"修复循环完成 - "
f"总时间: {total_time:.1f}秒, "
f"迭代次数: {stats.iterations}, "
f"成功修复: {stats.successful_repairs}, "
f"失败修复: {stats.failed_repairs}, "
f"成功率: {success_rate:.1f}%, "
f"终止原因: {stats.termination_reason.value if stats.termination_reason else 'unknown'}"
)
async def _on_error(self, error: Exception) -> None:
"""错误回调"""
self.logger.error(f"循环执行错误: {error}")
def add_space(self, space_info: SpaceInfo) -> None:
"""添加要监控的 Space"""
self.monitored_spaces[space_info.space_id] = space_info
self.logger.info(f"添加监控 Space: {space_info.space_id}")
def remove_space(self, space_id: str) -> None:
"""移除监控的 Space"""
if space_id in self.monitored_spaces:
del self.monitored_spaces[space_id]
if space_id in self.space_errors:
del self.space_errors[space_id]
if space_id in self.last_space_status:
del self.last_space_status[space_id]
self.logger.info(f"移除监控 Space: {space_id}")
def update_space_status(self, space_id: str, status: SpaceStatus,
error_info: Optional[ErrorInfo] = None) -> None:
"""更新 Space 状态"""
self.last_space_status[space_id] = status
if error_info:
self.space_errors[space_id] = error_info
self.logger.debug(f"更新 Space 状态: {space_id} -> {status.value}")
async def _attempt_repair(self) -> RepairResult:
"""尝试修复"""
start_time = datetime.now()
try:
# 查找需要修复的 Space
space_to_repair = None
error_to_fix = None
for space_id, space_info in self.monitored_spaces.items():
last_status = self.last_space_status.get(space_id)
current_status = space_info.current_status
current_error = self.space_errors.get(space_id)
if self.controller.condition_evaluator.should_attempt_repair(space_info, last_status):
# 检查是否已经在修复中
async with self.repair_lock:
if space_id in self.active_repairs:
continue
if len(self.active_repairs) >= self.config.max_concurrent_repairs:
break
space_to_repair = space_info
error_to_fix = current_error
self.active_repairs.add(space_id)
break
if not space_to_repair or not error_to_fix:
return RepairResult(success=False, error_message="没有需要修复的 Space")
# 生成修复策略(这里需要实际实现)
strategy = await self._generate_repair_strategy(error_to_fix, space_to_repair)
if not strategy:
self.active_repairs.discard(space_to_repair.space_id)
return RepairResult(success=False, error_message="无法生成修复策略")
# 执行修复
success, commit_sha = await self.repair_executor.execute_repair(
space_to_repair, error_to_fix, strategy
)
# 计算修复时间
repair_time = (datetime.now() - start_time).total_seconds()
# 更新 Space 状态(这里应该实际检查状态)
# await self._update_space_after_repair(space_to_repair.space_id)
return RepairResult(
success=success,
error_message=None if success else "修复执行失败",
commit_sha=commit_sha,
repair_time=repair_time
)
except Exception as e:
return RepairResult(success=False, error_message=str(e))
finally:
# 清理活跃修复记录
if space_to_repair:
async with self.repair_lock:
self.active_repairs.discard(space_to_repair.space_id)
async def _generate_repair_strategy(self, error_info: ErrorInfo, space_info: SpaceInfo) -> Optional[RepairStrategy]:
"""生成修复策略(需要实际实现)"""
# 这里应该调用实际的策略生成逻辑
# 目前返回 None 表示未实现
return None
async def start(self) -> None:
"""启动修复循环"""
if not self.monitored_spaces:
raise ValueError("没有要监控的 Space")
self.logger.info(f"启动修复循环,监控 {len(self.monitored_spaces)} 个 Space")
await self.controller.start_loop()
def stop(self) -> None:
"""停止修复循环"""
self.controller.stop()
def pause(self) -> None:
"""暂停修复循环"""
self.controller.pause()
def resume(self) -> None:
"""恢复修复循环"""
self.controller.resume()
def get_state(self) -> LoopState:
"""获取循环状态"""
return self.controller.get_state()
def get_statistics(self) -> Optional[LoopStatistics]:
"""获取统计信息"""
return self.controller.get_statistics()
def get_active_repairs(self) -> List[str]:
"""获取活跃的修复列表"""
return list(self.active_repairs)
def get_monitored_spaces(self) -> List[str]:
"""获取监控的 Space 列表"""
return list(self.monitored_spaces.keys())
if __name__ == "__main__":
# 示例用法
async def main():
# 创建配置
config = LoopConfig(
max_iterations=5,
timeout_minutes=30,
check_interval_seconds=10
)
# 创建修复执行器(需要传入实际的 HF API 客户端)
# hf_client = HuggingFaceAPIClient(token="your-token")
# repair_executor = AutoRepairExecutor(hf_client)
# 创建循环引擎
# loop_engine = RepairLoopEngine(repair_executor, config)
# 添加监控的 Space
# space_info = SpaceInfo(
# space_id="test/test-space",
# name="test-space",
# repository_url="https://huggingface.co/spaces/test/test-space",
# current_status=SpaceStatus.ERROR,
# last_updated=datetime.now()
# )
# loop_engine.add_space(space_info)
# 启动循环
# await loop_engine.start()
print("RepairLoopEngine 示例代码")
asyncio.run(main()) |