File size: 2,918 Bytes
34c53b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
<#
.SYNOPSIS
Launches scripts\run_t5_sweep.py as a hidden background process and records
the launch details in launch.json.

.DESCRIPTION
Changes location to the parent directory of this script's folder, verifies
that the Python interpreter in .venv-gpu can see a CUDA device, then starts
the sweep script with stdout/stderr redirected to log files inside the
runtime directory. The script does not wait for the sweep to finish; it
writes a JSON summary (PID, paths, command line) to launch.json and also
emits the same JSON on the output stream.

.PARAMETER Timestamp
Suffix used for the default runtime and sweep output directory names.
Defaults to the current time formatted as yyyyMMdd_HHmmss.

.PARAMETER RuntimeDir
Directory for logs and launch.json; forwarded as --runtime-dir.
Defaults to data\\runtime_metrics\\t5_sweep_<Timestamp>.

.PARAMETER SweepOutDir
Forwarded as --sweep-out-dir.
Defaults to models\\finetune\\t5-sweep-<Timestamp>.

.PARAMETER Force
When set, appends --force to the sweep script's arguments.

.NOTES
All remaining parameters are forwarded verbatim to the matching
run_t5_sweep.py option (e.g. -Stage1Steps -> --stage1-steps); their meaning
is defined by that script.
#>
param(
    [string]$Timestamp = "",
    [string]$RuntimeDir = "",
    [string]$SweepOutDir = "",
    [string]$AnalysisDir = "data\\analysis",
    [int]$Stage1Steps = 1000,
    [int]$Stage2Steps = 3750,
    [int]$TopK = 2,
    [int]$EvalSteps = 500,
    [int]$TestEvalEverySteps = 1000,
    [int]$MaxValSamples = 128,
    [int]$MaxTestSamples = 128,
    [string]$LrList = "1e-4,2e-4",
    [string]$LengthPenaltyList = "0.7,0.8,0.9",
    [string]$BeamsList = "4",
    [switch]$Force
)

$ErrorActionPreference = "Stop"

# Quotes a single value for a Windows process command line.
# Start-Process joins -ArgumentList elements with spaces and adds no quoting,
# so values containing whitespace, double quotes, or nothing at all must be
# quoted here or they are split / dropped by the child's argv parser.
function ConvertTo-QuotedArgument {
    param([string]$Value)

    if ($Value -ne "" -and $Value -notmatch '[\s"]') {
        return $Value
    }
    # Backslashes directly before a double quote are doubled and the quote is escaped.
    $escaped = $Value -replace '(\\*)"', '$1$1\"'
    # Trailing backslashes are doubled so they do not escape the closing quote.
    $escaped = $escaped -replace '(\\+)$', '$1$1'
    return '"' + $escaped + '"'
}

if ([string]::IsNullOrWhiteSpace($Timestamp)) {
    $Timestamp = Get-Date -Format "yyyyMMdd_HHmmss"
}

# The repository root is taken to be the parent of the folder holding this script.
$repoRoot = Split-Path -Parent $PSScriptRoot
Set-Location $repoRoot

# NOTE(review): backslash is not an escape character in PowerShell strings, so
# these defaults contain literal doubled backslashes. They are kept unchanged
# so the recorded and forwarded paths stay identical to earlier runs.
if ([string]::IsNullOrWhiteSpace($RuntimeDir)) {
    $RuntimeDir = "data\\runtime_metrics\\t5_sweep_$Timestamp"
}
if ([string]::IsNullOrWhiteSpace($SweepOutDir)) {
    $SweepOutDir = "models\\finetune\\t5-sweep-$Timestamp"
}

New-Item -ItemType Directory -Force -Path $RuntimeDir | Out-Null
$stdoutLog = Join-Path $RuntimeDir "sweep.stdout.log"
$stderrLog = Join-Path $RuntimeDir "sweep.stderr.log"
$launchJson = Join-Path $RuntimeDir "launch.json"

# Resolve-Path throws (ErrorActionPreference is Stop) if the venv interpreter is missing.
$pythonExe = (Resolve-Path ".venv-gpu\\Scripts\\python.exe").Path

# Preflight: the one-liner exits 0 when CUDA is available and 2 when it is not.
# Any other non-zero code means the interpreter failed before reaching that check.
$gpuCheck = & $pythonExe -c "import torch,sys; ok=torch.cuda.is_available(); name=torch.cuda.get_device_name(0) if ok else ''; print(f'torch={torch.__version__} cuda={ok} device={name}'); sys.exit(0 if ok else 2)"
$preflightExit = $LASTEXITCODE
if ($preflightExit -ne 0) {
    $preflightOutput = ($gpuCheck -join "`n")
    if ($preflightExit -eq 2) {
        throw "GPU preflight failed in .venv-gpu. CUDA is not available. $preflightOutput"
    }
    throw "GPU preflight failed in .venv-gpu. Python exited with code $preflightExit before CUDA availability could be determined. $preflightOutput"
}

$argList = @(
    "scripts\\run_t5_sweep.py",
    "--sweep-out-dir", $SweepOutDir,
    "--runtime-dir", $RuntimeDir,
    "--analysis-dir", $AnalysisDir,
    "--stage1-steps", "$Stage1Steps",
    "--stage2-steps", "$Stage2Steps",
    "--top-k", "$TopK",
    "--eval-steps", "$EvalSteps",
    "--test-eval-every-steps", "$TestEvalEverySteps",
    "--max-val-samples", "$MaxValSamples",
    "--max-test-samples", "$MaxTestSamples",
    "--lr-list", $LrList,
    "--length-penalty-list", $LengthPenaltyList,
    "--beams-list", $BeamsList
)
if ($Force) {
    $argList += "--force"
}

# Build one pre-quoted argument string; values without whitespace or quotes
# are passed through unchanged, so the common case is identical to before.
$argString = ($argList | ForEach-Object { ConvertTo-QuotedArgument $_ }) -join " "

# No -Wait: the sweep keeps running after this script returns.
$proc = Start-Process `
    -FilePath $pythonExe `
    -ArgumentList $argString `
    -WorkingDirectory $repoRoot `
    -RedirectStandardOutput $stdoutLog `
    -RedirectStandardError $stderrLog `
    -WindowStyle Hidden `
    -PassThru

$payload = [ordered]@{
    timestamp = $Timestamp
    pid = $proc.Id
    python = $pythonExe
    gpu_preflight = ($gpuCheck -join "`n")
    runtime_dir = $RuntimeDir
    sweep_out_dir = $SweepOutDir
    analysis_dir = $AnalysisDir
    stdout_log = $stdoutLog
    stderr_log = $stderrLog
    progress_glob = (Join-Path $RuntimeDir "*__progress.json")
    history_glob = (Join-Path $RuntimeDir "*__history.jsonl")
    # Recorded with the same quoting that was used for the launch.
    command = ((ConvertTo-QuotedArgument $pythonExe) + " " + $argString)
}

# Persist the launch record, then echo it for the caller.
$payload | ConvertTo-Json -Depth 5 | Set-Content -Encoding UTF8 $launchJson
$payload | ConvertTo-Json -Depth 5