from __future__ import annotations

import collections
import glob
import json
import os
import re
import subprocess
import sys
from collections import Counter, OrderedDict
|
|
| |
|
|
def analyze_style(content: str):
    """Fingerprint the formatting style of a JSON document.

    Returns a tuple ``(multiline, indent_type, indent_size, key_sep,
    comma_space, trailing_newline)`` where ``indent_type`` is one of
    ``'none'``, ``'tab'`` or ``'space'``.

    NOTE(review): the ``', '`` / ``'": '`` substring probes can be fooled by
    string *values* that happen to contain those sequences — acceptable for a
    style heuristic, but worth knowing.
    """
    has_trailing_newline = content.endswith('\n')
    is_multiline = '\n' in content

    # First indented line that opens with a quote tells us the indent unit.
    first_indent = None
    for raw_line in content.splitlines():
        hit = re.match(r'^(?P<indent>[ \t]+)"', raw_line)
        if hit is not None:
            first_indent = hit.group('indent')
            break

    if first_indent is None:
        indent_kind, indent_width = 'none', 0
    elif '\t' in first_indent:
        indent_kind, indent_width = 'tab', first_indent.count('\t')
    else:
        indent_kind, indent_width = 'space', len(first_indent)

    key_separator = ': ' if '": ' in content else ':'
    spaced_commas = ', ' in content
    return (is_multiline, indent_kind, indent_width, key_separator,
            spaced_commas, has_trailing_newline)
|
|
| |
# Collect every checkpoint config and fingerprint its formatting style.
paths = sorted(glob.glob('checkpoints/step_*/config.json'))
contents = {}
styles = []
for p in paths:
    # Read raw bytes and decode leniently so one malformed file cannot
    # abort the whole style survey.
    with open(p, 'rb') as f:
        raw = f.read()
    text = raw.decode('utf-8', errors='replace')
    contents[p] = text
    styles.append(analyze_style(text))

if not styles:
    # Counter([]).most_common(1)[0] would raise an opaque IndexError;
    # fail with a message that names the actual problem instead.
    raise SystemExit('no checkpoints/step_*/config.json files found')

# The style shared by the most files wins and becomes the target format.
# (Counter is already imported at the top of the file.)
canonical_style, cnt = Counter(styles).most_common(1)[0]
print('canonical_style:', canonical_style, 'count=', cnt)
|
|
| |
# Paths whose config.json gets rewritten into the canonical style.
# (The redundant re-import of json is dropped: it is imported at the top
# of the file.)
changed = []
|
|
def dump_with_style(obj, style):
    """Serialize *obj* as JSON matching a style tuple from ``analyze_style``.

    ``style`` is ``(multiline, indent_type, indent_size, key_sep,
    comma_space, trailing_newline)``. Returns the JSON text as a string.

    Fixes over the previous version:
    - the tab branch now honors ``indent_size`` (it always emitted exactly
      one tab per nesting level before);
    - in multiline output the item separator is stripped of its trailing
      space — json.dumps inserts a newline after each item, so ``', '``
      would otherwise leave trailing whitespace on every line;
    - ``'space'`` with a zero width no longer silently falls back to
      compact output while ``multiline`` is True.
    """
    multiline, indent_type, indent_size, key_sep, comma_space, trailing_newline = style
    item_sep = ', ' if comma_space else ','

    if multiline:
        # A newline follows every item separator in indented output, so a
        # trailing space in the separator would become trailing whitespace.
        line_item_sep = item_sep.rstrip()
        if indent_type == 'tab':
            # Emit one space per nesting level, then widen each level into
            # indent_size tabs.
            tabs_per_level = indent_size if indent_size > 0 else 1
            s = json.dumps(obj, indent=1, separators=(line_item_sep, key_sep))
            s = re.sub(r'^( +)',
                       lambda m: '\t' * (len(m.group(1)) * tabs_per_level),
                       s, flags=re.M)
        elif indent_type == 'space' and indent_size > 0:
            s = json.dumps(obj, indent=indent_size,
                           separators=(line_item_sep, key_sep))
        else:
            # Multiline but no usable indent detected: default to 2 spaces.
            s = json.dumps(obj, indent=2, separators=(line_item_sep, key_sep))
    else:
        s = json.dumps(obj, separators=(item_sep, key_sep))

    if trailing_newline and not s.endswith('\n'):
        s += '\n'
    elif not trailing_newline and s.endswith('\n'):
        s = s[:-1]
    return s
|
|
# Rewrite every config whose style differs from the canonical one,
# preserving key order across the round-trip.
for p, text in contents.items():
    style = analyze_style(text)
    if style == canonical_style:
        continue
    # Parse the text we already read: it was decoded with errors='replace',
    # and re-opening the file in strict text mode could raise on bytes that
    # the earlier lenient read merely replaced.
    obj = json.loads(text, object_pairs_hook=OrderedDict)
    newtext = dump_with_style(obj, canonical_style)
    # newline='' prevents platform newline translation on write.
    with open(p, 'w', encoding='utf-8', newline='') as f:
        f.write(newtext)
    changed.append(p)

print('ensured canonical style for config.json files, changed:', len(changed))
|
|
| |
# Delete the stray Vim swap file if it is still lying around.
swap_path = 'evaluation/.setup.py.swp'
if not os.path.exists(swap_path):
    print('no swap file found')
else:
    os.remove(swap_path)
    print('removed', swap_path)
|
|
| |
# Ensure huggingface_hub is importable, installing it on the fly if needed.
try:
    from huggingface_hub import HfApi, upload_folder, snapshot_download
except ImportError:
    # subprocess with an argument list avoids shell-quoting issues and,
    # unlike os.system, raises if the install actually fails.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', '--quiet', 'huggingface_hub']
    )
    from huggingface_hub import HfApi, upload_folder, snapshot_download
|
|
# Authenticate against the Hugging Face Hub; the token lives in a local
# file that is deliberately excluded from the upload below.
with open('hf_token.txt','r') as f:
    hf_token = f.read().strip()
api = HfApi()
# whoami() identifies the token owner; 'name' is the account namespace.
user = api.whoami(token=hf_token).get('name')
repo_id = f"{user}/MyAwesomeModel-CleanupRelease"
# exist_ok=True makes repo creation idempotent across reruns.
api.create_repo(repo_id=repo_id, token=hf_token, private=False, exist_ok=True)
|
|
| |
# Upload the whole workspace, excluding git metadata, the token file, and
# any editor swap files.
ignore_patterns = ['.git', '.git/*', 'hf_token.txt', '*.swp']
print('uploading, ignore_patterns=', ignore_patterns)
res = upload_folder(
    folder_path='.',
    path_in_repo='',
    repo_id=repo_id,
    token=hf_token,
    ignore_patterns=ignore_patterns,
)
# NOTE(review): upload_folder's return type varies across huggingface_hub
# versions (commit info object vs URL string) — only a length/ok marker is
# printed here; confirm against the installed version.
print('uploaded', len(res) if hasattr(res, '__len__') else 'ok')
|
|
| |
# Pull a fresh snapshot of the repo to verify what was actually uploaded.
local_dir = snapshot_download(repo_id=repo_id, token=hf_token, allow_patterns=['*'])
print('downloaded to', local_dir)
|
|
| |
# Verify no Vim swap files made it into the uploaded snapshot.
found_swp = [
    os.path.join(root, fn)
    for root, _dirs, files in os.walk(local_dir)
    for fn in files
    if fn.endswith('.swp')
]
print('found .swp files:', found_swp)
|
|
| |
# Locate the checkpoint configs inside the freshly downloaded snapshot.
snap_pattern = os.path.join(local_dir, 'checkpoints/step_*/config.json')
snap_paths = sorted(glob.glob(snap_pattern))
print('snapshot config files count:', len(snap_paths))
|
|
# Compare each snapshot config byte-for-byte against its counterpart in the
# local workspace. Pairs that differ — or snapshot files with no local
# counterpart — are collected as (snapshot_path, workspace_path_or_None).
bad = []
for p in snap_paths:
    with open(p, 'rb') as f:
        snap_bytes = f.read()

    # The snapshot mirrors the workspace layout, so the path relative to
    # the snapshot root is also the workspace-relative path.
    workspace_path = os.path.relpath(p, local_dir)
    if not os.path.exists(workspace_path):
        bad.append((p, None))
        continue

    with open(workspace_path, 'rb') as f:
        workspace_bytes = f.read()
    if snap_bytes != workspace_bytes:
        bad.append((p, workspace_path))
|
|
print('mismatches count:', len(bad))
# Show at most the first ten mismatching pairs (an empty slice simply
# yields no iterations, so no guard is needed).
for snap_file, workspace_file in bad[:10]:
    print('mismatch:', snap_file, 'vs', workspace_file)
|
|
| |
# Final summary; exit nonzero when any cleanup goal was missed so CI can
# gate on this script.
print('\nFINAL:')
print('repo:', repo_id)
print('deleted_swap:', not os.path.exists(swap_path))
print('.swp in repo:', len(found_swp))
print('config mismatches:', len(bad))

exit_code = 2 if (found_swp or bad) else 0
sys.exit(exit_code)
|
|