"""Consolidate duplicate ``config.json`` files across checkpoint step directories.

Every ``checkpoints/step_<N>/config.json`` is fingerprinted (parsed JSON is
re-serialized with sorted keys so formatting differences do not matter).  For
each group of identical configs, only the highest-numbered step keeps its
``config.json``; the other steps get a ``config_pointer.txt`` that names the
canonical copy.  ``pytorch_model.bin`` files are never touched.  Finally a
short note is added to ``README.md``.
"""
import datetime
import hashlib
import json
import os
import re
import sys
from pathlib import Path

STEP_DIR_PATTERN = re.compile(r'step_\d+$')
README_SECTION = '## 4. How to Run Locally'
README_NOTE = '\n\n> Note: Checkpoints cleanup — duplicate config.json files across checkpoints have been consolidated. For each unique config, only the highest-numbered step retains the canonical config; other steps now contain a pointer file. No pytorch_model.bin files were removed.\n'


def step_number(step_dir: Path) -> int:
    """Return the integer ``N`` from a directory named ``step_<N>``."""
    return int(step_dir.name.split('_')[1])


def find_step_dirs(root: Path) -> list:
    """Return the ``step_<N>`` sub-directories of *root*, sorted by ``N``."""
    found = [
        entry for entry in root.iterdir()
        if entry.is_dir() and STEP_DIR_PATTERN.match(entry.name)
    ]
    found.sort(key=step_number)
    return found


def normalize_config(cfg: Path) -> str:
    """Return a canonical text form of the config file at *cfg*.

    Valid JSON is re-serialized compactly with sorted keys.  Anything that
    does not parse falls back to the raw text with blank lines dropped and
    each remaining line stripped.
    """
    try:
        # ``with`` closes the handle; the original leaked it.
        with cfg.open(encoding='utf-8') as handle:
            parsed = json.load(handle)
    except (ValueError, RecursionError):
        # fallback to raw normalized whitespace; undecodable bytes are
        # replaced so the best-effort path cannot fail on them again
        raw = cfg.read_text(encoding='utf-8', errors='replace')
        return '\n'.join(line.strip() for line in raw.splitlines() if line.strip())
    return json.dumps(parsed, sort_keys=True, separators=(',', ':'))


def group_configs(step_dirs) -> dict:
    """Group step directories by the MD5 of their normalized config.

    Returns a dict mapping hex digest to
    ``{'norm': <normalized text>, 'steps': [(step_number, step_dir), ...]}``.
    Directories without a ``config.json`` are reported and skipped.
    """
    groups = {}
    for step_dir in step_dirs:
        cfg = step_dir / 'config.json'
        if not cfg.exists():
            print(f'No config in {step_dir}')
            continue
        norm = normalize_config(cfg)
        # MD5 is used purely as a content fingerprint, not for security.
        digest = hashlib.md5(norm.encode()).hexdigest()
        entry = groups.setdefault(digest, {'norm': norm, 'steps': []})
        entry['steps'].append((step_number(step_dir), step_dir))
    return groups


def consolidate(groups: dict, now: str) -> list:
    """Keep one config per group and replace the duplicates with pointer files.

    For each group the config in the highest step is kept.  Every other
    ``config.json`` is removed and a ``config_pointer.txt`` stamped with *now*
    is written in its place.  If a removal fails, no pointer is written for
    that step, so a pointer never sits next to a config that is still there.

    Returns the list of ``config.json`` paths that were removed.
    """
    removed = []
    for digest, group in groups.items():
        steps = group['steps']
        max_step, max_dir = max(steps, key=lambda item: item[0])
        print(f'Canonical for hash {digest} is step_{max_step}')
        for step, step_dir in steps:
            cfg = step_dir / 'config.json'
            if step == max_step:
                print(f'Keeping canonical config in {step_dir}')
                continue
            # remove config.json but DO NOT delete pytorch_model.bin
            try:
                cfg.unlink()
            except OSError as exc:
                print('Failed to remove', cfg, exc)
                continue
            print(f'Removed {cfg}')
            removed.append(cfg)
            # create pointer file
            pointer = step_dir / 'config_pointer.txt'
            rel = os.path.relpath(max_dir / 'config.json', step_dir)
            content = f"This config was consolidated during repository cleanup on {now}.\nCanonical config retained at: {rel}\nOriginal step: step_{step}\nCanonical step: step_{max_step}\nMD5: {digest}\n"
            pointer.write_text(content, encoding='utf-8')
            print(f'Wrote pointer {pointer}')
    return removed


def update_readme(readme: Path) -> bool:
    """Insert the cleanup note into *readme*; return True if the file changed.

    The note goes right after the '## 4. How to Run Locally' heading text, or
    at the end of the file when that text is absent.  If the note is already
    present the file is left alone, so repeated runs do not stack copies.
    """
    if not readme.exists():
        print('README.md not found')
        return False
    text = readme.read_text(encoding='utf-8')
    if README_NOTE.strip() in text:
        print('README.md already contains cleanup note')
        return False
    if README_SECTION in text:
        head, tail = text.split(README_SECTION, 1)
        readme.write_text(head + README_SECTION + README_NOTE + tail, encoding='utf-8')
        print('Updated README.md with cleanup note')
    else:
        print('Could not find section to insert note; appending at end')
        readme.write_text(text + '\n\n' + README_NOTE, encoding='utf-8')
    return True


def main(root: Path = Path('checkpoints'), readme: Path = Path('README.md')) -> None:
    """Run the full cleanup on *root* and record it in *readme*.

    Exits with status 1 when *root* does not exist.
    """
    if not root.exists():
        print('checkpoints directory not found')
        sys.exit(1)

    step_dirs = find_step_dirs(root)
    print(f'Found {len(step_dirs)} step dirs')

    groups = group_configs(step_dirs)
    print('Groups:')
    for digest, group in groups.items():
        print(digest, '->', sorted(step for step, _ in group['steps']))

    # For each group, keep config in highest step, replace others with pointer
    # Aware UTC clock (utcnow() is deprecated); tzinfo is dropped so the
    # rendered stamp keeps the original '<naive isoformat>Z' shape.
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    now = utc_now.replace(tzinfo=None).isoformat() + 'Z'
    consolidate(groups, now)

    # Update README.md: insert short note in section '## 4. How to Run Locally'
    update_readme(readme)
    print('Done')


if __name__ == '__main__':
    main()