CosFly-Track / watchdog.ps1
Ys404's picture
Add scripts and checkpoints (CosFly-Track release)
b879865 verified
raw
history blame
2.65 kB
# Upload watchdog: kills and restarts the upload when the upload process's
# disk-read counter has not grown for the stall threshold below (5 minutes).
# Run this in its own detached PowerShell.

# Seconds without any growth in read I/O before the upload is considered stalled.
$STALL_SECONDS = 5 * 60
# File that every watchdog log line is appended to.
$LOG_FILE = 'D:\hf_upload\.watchdog.log'
# Writes a timestamped message to the console and appends the same line to
# the watchdog log file ($LOG_FILE).
#   $msg - text to log; it is prefixed with "[yyyy-MM-dd HH:mm:ss] ".
function Log-Msg($msg) {
    $stamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
    $entry = "[$stamp] $msg"
    Write-Host $entry
    Add-Content -Path $LOG_FILE -Value $entry
}
# Looks up the running upload process.
# Queries Win32_Process for python.exe instances and emits the first one whose
# command line contains "upload_to_hf"; emits nothing when there is no match.
function Get-UploadProc {
    Get-CimInstance -ClassName Win32_Process -Filter "Name='python.exe'" |
        Where-Object CommandLine -like '*upload_to_hf*' |
        Select-Object -First 1
}
# Starts the upload again by asking WMI (Win32_Process.Create) to run the
# detached launcher script, logs the result reported by WMI, then waits 20
# seconds before returning to the caller.
function Restart-Upload {
    Log-Msg "Restarting upload (LFS dedup will skip already uploaded chunks)..."

    $createCall = @{
        ClassName  = 'Win32_Process'
        MethodName = 'Create'
        Arguments  = @{ CommandLine = 'cmd.exe /c "D:\hf_upload\start_upload_detached.cmd"' }
    }
    $result = Invoke-CimMethod @createCall

    Log-Msg "Restart issued, WMI ReturnValue=$($result.ReturnValue), launcher PID=$($result.ProcessId)"
    Start-Sleep -Seconds 20
}
# Main watchdog loop.
# Every 30 seconds: look up the upload process and compare its cumulative
# ReadTransferCount with the last value seen. If the counter has not grown for
# at least $STALL_SECONDS, kill the upload (and its launcher shells) and start
# it again. If no upload process exists at all, start one.
Log-Msg "Watchdog started, stall threshold = $STALL_SECONDS s"
$lastRead = $null          # last observed read counter; $null means "no baseline yet"
$lastPid = $null           # PID the baseline in $lastRead belongs to
$lastReadTime = Get-Date   # when the read counter last advanced (or was re-baselined)
while ($true) {
    $proc = Get-UploadProc
    if (-not $proc) {
        Log-Msg "No upload python found. Restarting..."
        Restart-Upload
        $lastRead = $null
        $lastPid = $null
        $lastReadTime = Get-Date
        continue
    }

    $curRead = $proc.ReadTransferCount
    $curPid = $proc.ProcessId

    if ($null -eq $lastRead -or $curPid -ne $lastPid) {
        # First sighting, or the tracked process was replaced by a different
        # one. The read counter is per-process, so the old baseline says
        # nothing about the new process: start a fresh baseline instead of
        # treating its (lower) counter as a stall.
        $lastRead = $curRead
        $lastPid = $curPid
        $lastReadTime = Get-Date
        Log-Msg "Tracking PID $curPid, init Read=$([math]::Round($curRead/1GB,2)) GB"
    } elseif ($curRead -gt $lastRead) {
        # Progress since the last poll: move the baseline forward.
        $lastRead = $curRead
        $lastReadTime = Get-Date
    } else {
        $stallSec = ((Get-Date) - $lastReadTime).TotalSeconds
        if ($stallSec -ge $STALL_SECONDS) {
            Log-Msg "STALL DETECTED on PID ${curPid}: no Read for $([math]::Round($stallSec,0))s, killing..."
            Stop-Process -Id $curPid -Force -ErrorAction SilentlyContinue

            # Stop any other python.exe still running the upload script so the
            # restart below does not run alongside a leftover uploader.
            Get-CimInstance Win32_Process -Filter "Name='python.exe'" |
                Where-Object { $_.ProcessId -ne $curPid -and $_.CommandLine -like "*upload_to_hf*" } |
                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }

            # Stop wrapper shells that reference the upload, but never this
            # watchdog's own PowerShell process ($PID).
            Get-CimInstance Win32_Process -Filter "Name='powershell.exe'" |
                Where-Object { $_.ProcessId -ne $PID -and $_.CommandLine -like "*upload_to_hf*" } |
                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }
            Get-CimInstance Win32_Process -Filter "Name='cmd.exe'" |
                Where-Object { $_.CommandLine -like "*start_upload_detached*" } |
                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }

            # Short pause after the kills before launching the replacement.
            Start-Sleep -Seconds 5
            Restart-Upload
            $lastRead = $null
            $lastPid = $null
            $lastReadTime = Get-Date
        }
    }

    Start-Sleep -Seconds 30
}