"""Consolidate duplicate ``config.json`` files across checkpoint step dirs.

Scans ``checkpoints/step_<N>`` directories, groups them by the content of
their ``config.json`` (JSON files are compared after key-sorted, compact
re-serialisation; unparseable files after stripping blank lines and
surrounding whitespace), keeps the config of the highest-numbered step in
each group, and replaces every other copy with a ``config_pointer.txt``
that names the retained file.  A note is then added to ``README.md``.

WARNING: this script deletes files.  Run it from the repository root.
"""
import datetime
import hashlib
import json
import os
import re
import sys
from pathlib import Path

# Directory names must be exactly ``step_<digits>``.
_STEP_DIR_RE = re.compile(r'step_(\d+)')

# Heading after which the cleanup note is inserted into README.md.
_README_ANCHOR = '## 4. How to Run Locally'
_README_NOTE = '\n\n> Note: Checkpoints cleanup — duplicate config.json files across checkpoints have been consolidated. For each unique config, only the highest-numbered step retains the canonical config; other steps now contain a pointer file. No pytorch_model.bin files were removed.\n'


def _find_step_dirs(root):
    """Return ``(step_number, path)`` pairs for step dirs in *root*, ascending."""
    found = []
    for entry in root.iterdir():
        matched = _STEP_DIR_RE.fullmatch(entry.name)
        if matched and entry.is_dir():
            found.append((int(matched.group(1)), entry))
    found.sort(key=lambda pair: pair[0])
    return found


def _normalize_config(cfg_path):
    """Return a canonical text form of *cfg_path* used for duplicate detection.

    Valid JSON is re-serialised with sorted keys and compact separators so
    formatting differences do not matter.  Anything that fails to decode or
    parse falls back to the stripped, non-blank lines of the raw text.
    """
    try:
        with cfg_path.open(encoding='utf-8') as handle:
            parsed = json.load(handle)
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueError.
        # surrogateescape keeps undecodable bytes distinct, so two files that
        # differ only in invalid bytes are never treated as duplicates.
        raw = cfg_path.read_text(encoding='utf-8', errors='surrogateescape')
        return '\n'.join(
            line.strip() for line in raw.splitlines() if line.strip()
        )
    return json.dumps(parsed, sort_keys=True, separators=(',', ':'))


def _group_by_config(step_dirs):
    """Map config digest -> list of ``(step_number, path)`` sharing that config."""
    groups = {}
    for step, step_dir in step_dirs:
        cfg = step_dir / 'config.json'
        if not cfg.exists():
            print(f'No config in {step_dir}')
            continue
        normalized = _normalize_config(cfg)
        # MD5 is only a grouping key here, not a security measure.
        digest = hashlib.md5(
            normalized.encode('utf-8', errors='surrogateescape')
        ).hexdigest()
        groups.setdefault(digest, []).append((step, step_dir))
    return groups


def _consolidate_group(digest, members, timestamp):
    """Keep the highest step's config in one group; replace the rest.

    Returns the number of configs that were actually replaced by a pointer.
    """
    canonical_step, canonical_dir = max(members, key=lambda pair: pair[0])
    print(f'Canonical for hash {digest} is step_{canonical_step}')
    replaced = 0
    for step, step_dir in members:
        cfg = step_dir / 'config.json'
        if step == canonical_step:
            print(f'Keeping canonical config in {step_dir}')
            continue

        try:
            cfg.unlink()
        except OSError as err:
            print('Failed to remove', cfg, err)
            # The config is still in place, so a pointer claiming it was
            # consolidated would be false; leave this directory untouched.
            continue
        print(f'Removed {cfg}')

        pointer = step_dir / 'config_pointer.txt'
        rel = os.path.relpath(canonical_dir / 'config.json', step_dir)
        content = f"This config was consolidated during repository cleanup on {timestamp}.\nCanonical config retained at: {rel}\nOriginal step: step_{step}\nCanonical step: step_{canonical_step}\nMD5: {digest}\n"
        pointer.write_text(content, encoding='utf-8')
        print(f'Wrote pointer {pointer}')
        replaced += 1
    return replaced


def _update_readme(readme):
    """Insert the cleanup note into *readme* unless it is already there."""
    if not readme.exists():
        print('README.md not found')
        return
    text = readme.read_text(encoding='utf-8')
    if _README_NOTE.strip() in text:
        # Re-running the script must not stack duplicate notes.
        print('README.md already contains cleanup note')
        return
    if _README_ANCHOR in text:
        before, after = text.split(_README_ANCHOR, 1)
        readme.write_text(
            before + _README_ANCHOR + _README_NOTE + after, encoding='utf-8'
        )
        print('Updated README.md with cleanup note')
    else:
        print('Could not find section to insert note; appending at end')
        readme.write_text(text + '\n\n' + _README_NOTE, encoding='utf-8')


def main(base_dir='.'):
    """Run the cleanup against the repository rooted at *base_dir*.

    Expects ``<base_dir>/checkpoints`` to exist; exits with status 1 when
    it does not.  ``<base_dir>/README.md`` is only modified when at least
    one duplicate config was replaced during this run.
    """
    base = Path(base_dir)
    root = base / 'checkpoints'
    if not root.is_dir():
        print('checkpoints directory not found')
        sys.exit(1)

    step_dirs = _find_step_dirs(root)
    print(f'Found {len(step_dirs)} step dirs')

    groups = _group_by_config(step_dirs)
    print('Groups:')
    for digest, members in groups.items():
        print(digest, '->', sorted(step for step, _ in members))

    # Same "<naive ISO timestamp>Z" format the pointer files have always
    # used, without the deprecated datetime.utcnow().
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    timestamp = utc_now.replace(tzinfo=None).isoformat() + 'Z'

    replaced = 0
    for digest, members in groups.items():
        replaced += _consolidate_group(digest, members, timestamp)

    if replaced:
        _update_readme(base / 'README.md')
    else:
        print('No duplicate configs were consolidated; README.md left unchanged')

    print('Done')


if __name__ == '__main__':
    main()