# Upload watchdog: kill+restart upload if its disk Read hasn't grown for 5 min.
# Run this in its own detached PowerShell.
#
# How it works:
#   * Every 30 s the loop looks for a python.exe whose command line contains
#     "upload_to_hf" and samples its ReadTransferCount.
#   * If that counter has not increased for $STALL_SECONDS, the upload and its
#     wrapper processes are force-killed and the upload is relaunched.
#   * If no such python.exe exists at all, the upload is relaunched right away.

$LOG_FILE = "D:\hf_upload\.watchdog.log"
$STALL_SECONDS = 300  # 5 min no IO -> kill

# Writes a timestamped message to the console and appends it to $LOG_FILE.
function Log-Msg($msg) {
    $line = "[$(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')] $msg"
    Write-Host $line
    Add-Content -Path $LOG_FILE -Value $line
}

# Returns the first python.exe process whose command line mentions
# "upload_to_hf", or nothing when no such process is running.
function Get-UploadProc {
    return Get-CimInstance Win32_Process -Filter "Name='python.exe'" |
        Where-Object { $_.CommandLine -like "*upload_to_hf*" } |
        Select-Object -First 1
}

# Force-stops every process with the given image name whose command line
# matches the wildcard pattern. The watchdog's own process ($PID) is always
# skipped so a pattern that happens to match this script cannot kill it.
# Failures to stop a process (already exited, access denied) are ignored:
# this is a deliberate best-effort sweep.
function Stop-MatchingProc($exeName, $pattern) {
    Get-CimInstance Win32_Process -Filter "Name='$exeName'" |
        Where-Object { $_.CommandLine -like $pattern -and $_.ProcessId -ne $PID } |
        ForEach-Object {
            Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue
        }
}

# Launches the detached upload starter through WMI (Win32_Process.Create), logs
# the WMI return value and launcher PID, then waits 20 s so the new python
# process has time to appear before the next poll.
function Restart-Upload {
    Log-Msg "Restarting upload (LFS dedup will skip already uploaded chunks)..."
    $cmdLine = 'cmd.exe /c "D:\hf_upload\start_upload_detached.cmd"'
    $r = Invoke-CimMethod -ClassName Win32_Process -MethodName Create -Arguments @{
        CommandLine = $cmdLine
    }
    Log-Msg "Restart issued, WMI ReturnValue=$($r.ReturnValue), launcher PID=$($r.ProcessId)"
    Start-Sleep -Seconds 20
}

Log-Msg "Watchdog started, stall threshold = $STALL_SECONDS s"

# Baseline for stall detection. $lastRead/$lastReadTime are only meaningful for
# the process identified by $lastPid; $null means "no baseline yet".
$lastPid = $null
$lastRead = $null
$lastReadTime = Get-Date

while ($true) {
    $proc = Get-UploadProc

    if (-not $proc) {
        Log-Msg "No upload python found. Restarting..."
        Restart-Upload
        $lastPid = $null
        $lastRead = $null
        $lastReadTime = Get-Date
        continue
    }

    $curRead = $proc.ReadTransferCount
    $curPid = $proc.ProcessId

    if ($null -eq $lastRead -or $curPid -ne $lastPid) {
        # First sample, or a different process than the one we were tracking.
        # A new process starts its read counter from scratch, so comparing it
        # against the previous process's counter would look like a stall.
        $lastPid = $curPid
        $lastRead = $curRead
        $lastReadTime = Get-Date
        Log-Msg "Tracking PID $curPid, init Read=$([math]::Round($curRead/1GB,2)) GB"
    }
    elseif ($curRead -gt $lastRead) {
        # Progress since the last poll: move the baseline forward.
        $lastRead = $curRead
        $lastReadTime = Get-Date
    }
    else {
        $stallSec = ((Get-Date) - $lastReadTime).TotalSeconds
        if ($stallSec -ge $STALL_SECONDS) {
            Log-Msg "STALL DETECTED on PID ${curPid}: no Read for $([math]::Round($stallSec,0))s, killing..."

            # Kill the tracked interpreter first, then sweep any other upload
            # interpreters and the wrapper shells so the restart does not end
            # up running next to a leftover copy.
            Stop-Process -Id $curPid -Force -ErrorAction SilentlyContinue
            Stop-MatchingProc 'python.exe' "*upload_to_hf*"
            Stop-MatchingProc 'powershell.exe' "*upload_to_hf*"
            Stop-MatchingProc 'cmd.exe' "*start_upload_detached*"

            Start-Sleep -Seconds 5
            Restart-Upload
            $lastPid = $null
            $lastRead = $null
            $lastReadTime = Get-Date
        }
    }

    Start-Sleep -Seconds 30
}