File size: 3,169 Bytes
f09153a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os, re, json, hashlib, datetime, sys
from pathlib import Path
# Locate the checkpoint step directories this cleanup operates on.
root = Path('checkpoints')
# is_dir() rather than exists(): a regular file named "checkpoints" would pass
# an existence check and then make iterdir() fail with NotADirectoryError.
if not root.is_dir():
    print('checkpoints directory not found')
    sys.exit(1)
# Keep only sub-directories named exactly "step_<digits>" (fullmatch, so no
# trailing characters slip through), ordered by numeric step so that
# step_10 sorts after step_9.
step_dirs = sorted(
    (p for p in root.iterdir() if p.is_dir() and re.fullmatch(r'step_\d+', p.name)),
    key=lambda x: int(x.name.split('_')[1]),
)
print(f'Found {len(step_dirs)} step dirs')
# Group step directories by the content of their config.json.
#   configs: md5 hex digest -> {'norm': normalized text,
#                               'steps': [(step number, step dir), ...]}
#   raw_map: step dir -> {'hash': digest, 'path': that dir's config.json}
configs = {}
raw_map = {}
for d in step_dirs:
    cfg = d / 'config.json'
    if not cfg.exists():
        print(f'No config in {d}')
        continue
    # Read the file exactly once; read_text() closes the handle, unlike the
    # previous json.load(cfg.open()) which left it to the garbage collector.
    txt = cfg.read_text()
    try:
        # Canonical JSON (sorted keys, no insignificant whitespace) so that
        # formatting or key-order differences do not split a group.
        norm = json.dumps(json.loads(txt), sort_keys=True, separators=(',', ':'))
    except (ValueError, RecursionError):
        # Not parseable as JSON: fall back to the stripped, non-blank lines
        # of the raw text so the file can still be compared.
        norm = '\n'.join(line.strip() for line in txt.splitlines() if line.strip())
    # The MD5 digest of the normalized text is the grouping key.
    h = hashlib.md5(norm.encode()).hexdigest()
    step_no = int(d.name.split('_')[1])
    configs.setdefault(h, {'norm': norm, 'steps': []})['steps'].append((step_no, d))
    raw_map[d] = {'hash': h, 'path': cfg}

# Report every group of identical configs as "<md5> -> [step numbers]",
# with the step numbers in ascending order.
print('Groups:')
for digest, group in configs.items():
    ordered_steps = sorted(step for step, _ in group['steps'])
    print(digest, '->', ordered_steps)

# For each group, keep config in highest step, replace others with pointer
# Timestamp text is the same "<naive UTC ISO-8601>Z" form that
# utcnow().isoformat() + 'Z' produced, built without the deprecated utcnow().
now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + 'Z'
for h, v in configs.items():
    steps = v['steps']
    max_step, max_dir = max(steps, key=lambda x: x[0])
    print(f'Canonical for hash {h} is step_{max_step}')
    for s, d in steps:
        cfg = d / 'config.json'
        if s == max_step:
            print(f'Keeping canonical config in {d}')
            continue
        # Write the pointer BEFORE deleting anything: if the write fails the
        # original config.json is still in place, so no information is lost.
        pointer = d / 'config_pointer.txt'
        # Path to the canonical config, relative to this step directory.
        rel = os.path.relpath(max_dir / 'config.json', d)
        content = (
            f"This config was consolidated during repository cleanup on {now}.\n"
            f"Canonical config retained at: {rel}\n"
            f"Original step: step_{s}\n"
            f"Canonical step: step_{max_step}\n"
            f"MD5: {h}\n"
        )
        pointer.write_text(content)
        print(f'Wrote pointer {pointer}')
        # remove config.json but DO NOT delete pytorch_model.bin
        try:
            cfg.unlink()
        except OSError as e:
            # Best effort: report and carry on with the remaining steps.
            print('Failed to remove', cfg, e)
        else:
            print(f'Removed {cfg}')

def insert_cleanup_note(text, heading, note):
    """Return *text* with *note* inserted right after the first *heading*.

    If *note* already occurs in *text*, *text* is returned unchanged, so
    applying the function repeatedly never duplicates the note.  If *heading*
    does not occur in *text*, the note is appended at the end, separated from
    the existing content by a blank line.
    """
    if note in text:
        return text
    before, found, after = text.partition(heading)
    if not found:
        return text + '\n\n' + note
    return before + heading + note + after


# Update README.md: insert short note in section '## 4. How to Run Locally'
readme = Path('README.md')
if readme.exists():
    # Explicit UTF-8: the note contains an em dash, which the locale's default
    # encoding may be unable to encode or may write as non-UTF-8 bytes.
    txt = readme.read_text(encoding='utf-8')
    insert_after = '## 4. How to Run Locally'
    note = '\n\n> Note: Checkpoints cleanup — duplicate config.json files across checkpoints have been consolidated. For each unique config, only the highest-numbered step retains the canonical config; other steps now contain a pointer file. No pytorch_model.bin files were removed.\n'
    newtxt = insert_cleanup_note(txt, insert_after, note)
    if newtxt == txt:
        # A previous run already added the note; do not insert it twice.
        print('README.md already contains cleanup note; leaving it unchanged')
    elif insert_after in txt:
        readme.write_text(newtxt, encoding='utf-8')
        print('Updated README.md with cleanup note')
    else:
        print('Could not find section to insert note; appending at end')
        readme.write_text(newtxt, encoding='utf-8')
else:
    print('README.md not found')

print('Done')