File size: 2,648 Bytes
2bee562
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Upload watchdog: kills and restarts the upload process when its cumulative
# read counter (Win32_Process.ReadTransferCount) has not grown for 5 minutes.
# Run this script in its own detached PowerShell session.

# Path of the log file that Log-Msg appends every watchdog message to.
$LOG_FILE = "D:\hf_upload\.watchdog.log"
# Number of seconds the upload's read counter may stay flat before the main
# loop treats the upload as stalled and kills/restarts it.
$STALL_SECONDS = 300  # 5 min no IO -> kill

# Emits one timestamped log entry to the console and to the watchdog log file.
#   $msg - text to record; it is prefixed with "[yyyy-MM-dd HH:mm:ss] ".
# The entry is appended to the file named by the script-level $LOG_FILE.
function Log-Msg($msg) {
    $stamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
    $entry = "[$stamp] $msg"
    Write-Host $entry
    Add-Content -Path $LOG_FILE -Value $entry
}

# Locates the running upload process.
# Returns the first python.exe Win32_Process instance whose command line
# contains "upload_to_hf"; returns nothing when no such process is found.
function Get-UploadProc {
    foreach ($candidate in Get-CimInstance Win32_Process -Filter "Name='python.exe'") {
        if ($candidate.CommandLine -like "*upload_to_hf*") {
            # Only the first match is of interest; stop scanning here.
            return $candidate
        }
    }
}

# Launches the detached upload starter script through Win32_Process.Create
# and logs the return value and launcher PID reported by the call.
# Blocks for 20 seconds afterwards (presumably to give the new upload time
# to come up before the caller polls for it again - confirm with the author).
function Restart-Upload {
    Log-Msg "Restarting upload (LFS dedup will skip already uploaded chunks)..."

    $createArgs = @{ CommandLine = 'cmd.exe /c "D:\hf_upload\start_upload_detached.cmd"' }
    $result = Invoke-CimMethod -ClassName Win32_Process -MethodName Create -Arguments $createArgs

    $status = $result.ReturnValue
    $launcherPid = $result.ProcessId
    Log-Msg "Restart issued, WMI ReturnValue=$status, launcher PID=$launcherPid"

    Start-Sleep -Seconds 20
}

# ---------------------------------------------------------------------------
# Main watchdog loop.
#
# Every 30 seconds:
#   * look up the upload process (Get-UploadProc);
#   * if it is missing, start it again (Restart-Upload);
#   * otherwise compare its ReadTransferCount with the last recorded value and,
#     if it has not grown for $STALL_SECONDS, kill the upload together with its
#     wrapper powershell.exe / cmd.exe processes and start it again.
#
# The baseline is tied to the PID being tracked: when a different upload
# process shows up between two polls, its counter starts over, so comparing it
# with the previous process's value would wrongly look like a stall. In that
# case the baseline is re-initialised instead.
# ---------------------------------------------------------------------------
Log-Msg "Watchdog started, stall threshold = $STALL_SECONDS s"

$lastRead = $null          # last observed ReadTransferCount; $null = no baseline yet
$lastPid = $null           # PID the baseline above belongs to
$lastReadTime = Get-Date   # when $lastRead last increased (or was initialised)

while ($true) {
    $proc = Get-UploadProc
    if (-not $proc) {
        Log-Msg "No upload python found. Restarting..."
        Restart-Upload
        # Forget the old baseline; the next process found is tracked from scratch.
        $lastRead = $null
        $lastPid = $null
        $lastReadTime = Get-Date
        # Restart-Upload already waited, so poll again right away.
        continue
    }

    $curRead = $proc.ReadTransferCount
    $curPid = $proc.ProcessId

    if (($null -eq $lastRead) -or ($curPid -ne $lastPid)) {
        # First sighting, or the upload was replaced by a new process:
        # (re)start tracking from this process's current counter.
        $lastRead = $curRead
        $lastPid = $curPid
        $lastReadTime = Get-Date
        Log-Msg "Tracking PID $curPid, init Read=$([math]::Round($curRead/1GB,2)) GB"
    } elseif ($curRead -gt $lastRead) {
        # Progress since the previous poll: move the baseline forward.
        $lastRead = $curRead
        $lastReadTime = Get-Date
    } else {
        $stallSec = ((Get-Date) - $lastReadTime).TotalSeconds
        if ($stallSec -ge $STALL_SECONDS) {
            Log-Msg "STALL DETECTED on PID ${curPid}: no Read for $([math]::Round($stallSec,0))s, killing..."
            # Best-effort kills: a process may already be gone, so errors are ignored.
            Stop-Process -Id $curPid -Force -ErrorAction SilentlyContinue
            # Also remove wrapper processes whose command line refers to the upload.
            Get-CimInstance Win32_Process -Filter "Name='powershell.exe'" |
                Where-Object { $_.CommandLine -like "*upload_to_hf*" } |
                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }
            Get-CimInstance Win32_Process -Filter "Name='cmd.exe'" |
                Where-Object { $_.CommandLine -like "*start_upload_detached*" } |
                ForEach-Object { Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }
            Start-Sleep -Seconds 5
            Restart-Upload
            $lastRead = $null
            $lastPid = $null
            $lastReadTime = Get-Date
        }
    }

    Start-Sleep -Seconds 30
}