# MyAwesomeModel-CleanupRelease / .python_tmp / 863ab7be-0f37-4ef0-a40f-8d35e20ff660.py
# FuryAssassin's picture
# Upload folder using huggingface_hub
# 7b52fed verified
from __future__ import annotations
import os, json, glob, re, collections, sys
from collections import Counter, OrderedDict
# helper - reuse earlier analysis
def analyze_style(content: str):
    """Infer the formatting style of a JSON document from its raw text.

    Args:
        content: The full text of the JSON file.

    Returns:
        A hashable 6-tuple
        ``(multiline, indent_type, indent_size, key_sep, comma_space,
        trailing_newline)`` where

        * ``multiline`` is True when the document body spans several lines,
        * ``indent_type`` is ``'tab'``, ``'space'`` or ``'none'``,
        * ``indent_size`` is the number of tabs / spaces in front of the
          first indented line that starts with a double quote (0 if none),
        * ``key_sep`` is ``': '`` or ``':'``,
        * ``comma_space`` is True when a comma followed by a space occurs
          outside of string literals,
        * ``trailing_newline`` is True when the text ends with ``'\\n'``.
    """
    trailing_newline = content.endswith('\n')
    # A newline that merely terminates the file does not make a one-line
    # document multi-line, so only line breaks inside the body count.
    multiline = '\n' in content.strip()

    indent = None
    for line in content.splitlines():
        m = re.match(r'^(?P<indent>[ \t]+)"', line)
        if m:
            indent = m.group('indent')
            break

    if indent is None:
        indent_type, indent_size = 'none', 0
    elif '\t' in indent:
        indent_type, indent_size = 'tab', indent.count('\t')
    else:
        indent_type, indent_size = 'space', len(indent)

    # Blank out string literals before looking at separators; otherwise a
    # value such as "a, b" or ": " would be mistaken for document formatting.
    structure = re.sub(r'"(?:\\.|[^"\\])*"', '""', content)
    key_sep = ': ' if '": ' in structure else ':'
    comma_space = ', ' in structure

    return (multiline, indent_type, indent_size, key_sep, comma_space, trailing_newline)
# find config files
# Collect every checkpoint config, remember its text and detected style, then
# take the most common style as the canonical one.
paths = sorted(glob.glob('checkpoints/step_*/config.json'))
contents = {}
styles = []
for p in paths:
    with open(p, 'rb') as f:
        raw = f.read()
    # Lenient decode: this text only feeds style detection. Files that need
    # rewriting are re-read and parsed strictly further down the script.
    text = raw.decode('utf-8', errors='replace')
    contents[p] = text
    styles.append(analyze_style(text))
from collections import Counter
if styles:
    canonical_style, cnt = Counter(styles).most_common(1)[0]
else:
    # No config file matched the glob: most_common(1) would be empty and
    # indexing it would raise IndexError. There is nothing to normalise.
    canonical_style, cnt = None, 0
print('canonical_style:', canonical_style, 'count=', cnt)
# rewrite outliers already done, but ensure no further changes
changed = []
import json
def dump_with_style(obj, style):
    """Serialise *obj* as JSON text formatted according to *style*.

    Args:
        obj: Any JSON-serialisable object.
        style: A 6-tuple ``(multiline, indent_type, indent_size, key_sep,
            comma_space, trailing_newline)`` as produced by the style
            analysis earlier in this script.

    Returns:
        The JSON text, ending with ``'\\n'`` only if ``trailing_newline``
        is true.
    """
    multiline, indent_type, indent_size, key_sep, comma_space, trailing_newline = style
    # NOTE(review): when comma_space is true together with an indent, the
    # ', ' item separator is also emitted before each line break, leaving
    # trailing spaces - confirm this is the intended output.
    separators = (', ' if comma_space else ',', key_sep)

    if not multiline:
        indent = None
    elif indent_type == 'tab':
        # json.dumps accepts a string indent, so honour the detected number
        # of tabs per level instead of always emitting a single tab.
        indent = '\t' * max(indent_size, 1)
    elif indent_type == 'space':
        indent = indent_size if indent_size > 0 else None
    else:
        # Indentation could not be detected; fall back to two spaces.
        indent = 2

    s = json.dumps(obj, indent=indent, separators=separators)

    if trailing_newline and not s.endswith('\n'):
        s += '\n'
    elif not trailing_newline and s.endswith('\n'):
        s = s[:-1]
    return s
# Re-serialise every config whose detected style differs from the canonical
# one, and record which files were touched.
for cfg_path, cfg_text in contents.items():
    if analyze_style(cfg_text) == canonical_style:
        continue
    with open(cfg_path, 'r', encoding='utf-8') as src:
        parsed = json.load(src, object_pairs_hook=OrderedDict)
    # Build the new text before reopening the file for writing, so a
    # serialisation error cannot leave a truncated config behind.
    rewritten = dump_with_style(parsed, canonical_style)
    with open(cfg_path, 'w', encoding='utf-8', newline='') as dst:
        dst.write(rewritten)
    changed.append(cfg_path)
print('ensured canonical style for config.json files, changed:', len(changed))
# remove stray swap file
# Delete the leftover editor swap file if it is present and report the outcome.
swap_path = 'evaluation/.setup.py.swp'
if not os.path.exists(swap_path):
    print('no swap file found')
else:
    os.remove(swap_path)
    print('removed', swap_path)
# upload to Hugging Face excluding hf_token.txt
# Import the Hub client, installing it on demand, then authenticate with the
# token stored in hf_token.txt and make sure the target repository exists.
try:
    from huggingface_hub import HfApi, upload_folder, snapshot_download
except ImportError:
    # Only a missing package should trigger an install. Pass the command as
    # an argument list (no shell) so interpreter paths containing spaces
    # work, and let a failed install raise instead of being ignored.
    import subprocess
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', '--quiet', 'huggingface_hub']
    )
    from huggingface_hub import HfApi, upload_folder, snapshot_download
with open('hf_token.txt', 'r') as f:
    hf_token = f.read().strip()
api = HfApi()
# The repository is created under the account the token belongs to.
user = api.whoami(token=hf_token).get('name')
repo_id = f"{user}/MyAwesomeModel-CleanupRelease"
# exist_ok=True makes re-running the script safe when the repo already exists.
api.create_repo(repo_id=repo_id, token=hf_token, private=False, exist_ok=True)
# Build ignore patterns to exclude hf_token.txt and .swp files
# Upload the current directory to the repo root, skipping git metadata, the
# token file and editor swap files, then pull a fresh snapshot to verify.
ignore_patterns = ['.git', '.git/*', 'hf_token.txt', '*.swp']
print('uploading, ignore_patterns=', ignore_patterns)
res = upload_folder(
    repo_id=repo_id,
    folder_path='.',
    path_in_repo='',
    ignore_patterns=ignore_patterns,
    token=hf_token,
)
if hasattr(res, '__len__'):
    print('uploaded', len(res))
else:
    print('uploaded', 'ok')
# verify by downloading snapshot
local_dir = snapshot_download(
    repo_id=repo_id,
    token=hf_token,
    allow_patterns=['*'],
)
print('downloaded to', local_dir)
# check .swp
# Walk the downloaded snapshot and list every file whose name ends in '.swp'.
found_swp = [
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(local_dir)
    for filename in filenames
    if filename.endswith('.swp')
]
print('found .swp files:', found_swp)
# check config files byte-identical to canonical style
# Compare each config in the downloaded snapshot byte-for-byte with the file
# at the same relative path in the working directory, and also make sure no
# local config is missing from the snapshot. `bad` collects
# (snapshot_path, local_path) pairs; None marks the side that is missing.
snap_paths = sorted(glob.glob(os.path.join(local_dir, 'checkpoints/step_*/config.json')))
print('snapshot config files count:', len(snap_paths))
bad = []
snapshot_rel_paths = set()
for p in snap_paths:
    # Path inside the repo, which is also the path in the local workspace
    # because the upload was made from the current directory.
    local_workspace_path = os.path.relpath(p, local_dir)
    snapshot_rel_paths.add(local_workspace_path)
    if not os.path.exists(local_workspace_path):
        bad.append((p, None))
        continue
    with open(p, 'rb') as f:
        txt = f.read()
    with open(local_workspace_path, 'rb') as f:
        local_txt = f.read()
    if txt != local_txt:
        bad.append((p, local_workspace_path))
# Reverse direction: without this, a snapshot that lacks some (or all) config
# files would pass verification with zero mismatches.
for local_cfg in sorted(glob.glob('checkpoints/step_*/config.json')):
    if os.path.normpath(local_cfg) not in snapshot_rel_paths:
        bad.append((None, local_cfg))
print('mismatches count:', len(bad))
# Only the first ten mismatches are listed to keep the output short.
for a, b in bad[:10]:
    print('mismatch:', a, 'vs', b)
# final status
# Print a short summary and exit non-zero if any check failed.
print('\nFINAL:')
print('repo:', repo_id)
swap_is_gone = not os.path.exists(swap_path)
print('deleted_swap:', swap_is_gone)
print('.swp in repo:', len(found_swp))
print('config mismatches:', len(bad))
# exit code 0 if fine else 2
checks_failed = bool(found_swp or bad)
sys.exit(2 if checks_failed else 0)