MyAwesomeModel-CleanupRelease / .python_tmp /863ab7be-0f37-4ef0-a40f-8d35e20ff660.py

Upload folder using huggingface_hub

7b52fed verified 2 months ago

5.4 kB

	from __future__ import annotations
	import os, json, glob, re, collections, sys
	from collections import Counter, OrderedDict

	# helper - reuse earlier analysis

	def analyze_style(content: str) -> tuple:
	    """Infer the formatting conventions of a JSON document from its raw text.

	    Args:
	        content: Full text of the JSON file.

	    Returns:
	        A tuple ``(multiline, indent_type, indent_size, key_sep, comma_space,
	        trailing_newline)``:

	        * ``multiline`` -- True when the document body spans several lines.
	        * ``indent_type`` -- ``'tab'``, ``'space'`` or ``'none'``, taken from the
	          first line that starts with whitespace followed by a double quote.
	        * ``indent_size`` -- number of tabs (tab style) or characters (space
	          style) in that leading whitespace; 0 when there is none.
	        * ``key_sep`` -- ``': '`` if the text contains ``": ``, otherwise ``':'``.
	        * ``comma_space`` -- True when ``', '`` occurs anywhere in the text.
	        * ``trailing_newline`` -- True when the text ends with ``'\\n'``.
	    """
	    trailing_newline = content.endswith('\n')
	    # Ignore the final line terminator: a one-line document that merely ends
	    # with a newline must not be reported as multi-line, otherwise it cannot be
	    # re-serialised in its own style.
	    multiline = '\n' in content.rstrip('\r\n')

	    indent = None
	    for line in content.splitlines():
	        m = re.match(r'^(?P<indent>[ \t]+)"', line)
	        if m:
	            indent = m.group('indent')
	            break

	    if indent is None:
	        indent_type, indent_size = 'none', 0
	    elif '\t' in indent:
	        # Any tab in the leading whitespace classifies the file as tab-indented.
	        indent_type, indent_size = 'tab', indent.count('\t')
	    else:
	        indent_type, indent_size = 'space', len(indent)

	    key_sep = ': ' if '": ' in content else ':'
	    comma_space = ', ' in content
	    return (multiline, indent_type, indent_size, key_sep, comma_space, trailing_newline)

	# Locate every checkpoint config and record its text and detected style.
	paths = sorted(glob.glob('checkpoints/step_*/config.json'))
	contents = {}
	styles = []
	for p in paths:
	    with open(p, 'rb') as f:
	        raw = f.read()
	    # Decode leniently so a stray invalid byte does not abort the style survey.
	    text = raw.decode('utf-8', errors='replace')
	    contents[p] = text
	    styles.append(analyze_style(text))

	from collections import Counter
	if styles:
	    # The most frequent style tuple is treated as the canonical one.
	    canonical_style, cnt = Counter(styles).most_common(1)[0]
	else:
	    # Nothing matched the glob: most_common(1) would be empty and indexing it
	    # would raise IndexError, so report the situation explicitly instead.
	    canonical_style, cnt = None, 0
	    print('warning: no checkpoints/step_*/config.json files found')
	print('canonical_style:', canonical_style, 'count=', cnt)

	# Accumulates the paths of config files that get rewritten further down;
	# it stays empty when every file already has the canonical style.
	import json
	changed: list = []

	def dump_with_style(obj, style):
	    """Serialise *obj* to JSON text formatted according to *style*.

	    Args:
	        obj: JSON-serialisable object.
	        style: Tuple ``(multiline, indent_type, indent_size, key_sep,
	            comma_space, trailing_newline)`` as produced by ``analyze_style``.

	    Returns:
	        The JSON text, ending with ``'\\n'`` only when ``trailing_newline`` is set.
	    """
	    multiline, indent_type, indent_size, key_sep, comma_space, trailing_newline = style

	    # Work out the indent argument for json.dumps (None means single-line output).
	    indent = None
	    if multiline:
	        if indent_type == 'tab':
	            # json.dumps accepts a string indent, so tabs need no post-processing;
	            # honour the detected tab count, with at least one tab per level.
	            indent = '\t' * max(indent_size, 1)
	        elif indent_type == 'space':
	            indent = indent_size if indent_size > 0 else None
	        else:
	            indent = 2

	    if indent is None:
	        item_sep = ', ' if comma_space else ','
	    else:
	        # With an indent every item is followed by a newline; a space after the
	        # comma would only leave trailing whitespace at the end of each line.
	        item_sep = ','

	    s = json.dumps(obj, indent=indent, separators=(item_sep, key_sep))
	    # json.dumps never emits a final newline, so it only ever has to be added.
	    if trailing_newline:
	        s += '\n'
	    return s

	# Rewrite every config whose detected style differs from the canonical one.
	for cfg_path, cfg_text in contents.items():
	    if analyze_style(cfg_text) == canonical_style:
	        continue  # already canonical; leave the file untouched
	    # Re-read from disk as strict UTF-8, keeping the key order while parsing.
	    with open(cfg_path, 'r', encoding='utf-8') as src:
	        parsed = json.load(src, object_pairs_hook=OrderedDict)
	    rendered = dump_with_style(parsed, canonical_style)
	    # newline='' writes the '\n' characters exactly as produced, untranslated.
	    with open(cfg_path, 'w', encoding='utf-8', newline='') as dst:
	        dst.write(rendered)
	    changed.append(cfg_path)

	print('ensured canonical style for config.json files, changed:', len(changed))

	# Remove the stray editor swap file, if there is one.
	swap_path = 'evaluation/.setup.py.swp'
	try:
	    # Attempt the removal directly instead of checking existence first: this
	    # avoids the check-then-act race and also removes a dangling symlink,
	    # which os.path.exists() would report as missing.
	    os.remove(swap_path)
	except FileNotFoundError:
	    print('no swap file found')
	else:
	    print('removed', swap_path)

	# Import the Hugging Face client, installing it on demand when it is missing.
	try:
	    from huggingface_hub import HfApi, upload_folder, snapshot_download
	except ImportError:
	    import subprocess
	    # Pass an argument list (no shell) so an interpreter path containing spaces
	    # still works, and let check=True surface a failed install immediately.
	    subprocess.run(
	        [sys.executable, '-m', 'pip', 'install', '--quiet', 'huggingface_hub'],
	        check=True,
	    )
	    from huggingface_hub import HfApi, upload_folder, snapshot_download

	# Read the access token, resolve the account name and make sure the target
	# repository exists.
	with open('hf_token.txt', 'r', encoding='utf-8') as f:
	    hf_token = f.read().strip()
	if not hf_token:
	    sys.exit('hf_token.txt is empty; cannot authenticate')
	api = HfApi()
	user = api.whoami(token=hf_token).get('name')
	if not user:
	    # Without a name the repo id would silently become "None/...".
	    sys.exit('could not determine the Hugging Face account name for this token')
	repo_id = f"{user}/MyAwesomeModel-CleanupRelease"
	# exist_ok=True makes re-running the script against an existing repo harmless.
	api.create_repo(repo_id=repo_id, token=hf_token, private=False, exist_ok=True)

	# Build ignore patterns so the token file and editor swap files are never
	# uploaded. The patterns are glob-style, so swap files need a wildcard: a bare
	# '.swp' would only match a file literally named '.swp'.
	ignore_patterns = [
	    '.git',
	    '.git/',
	    'hf_token.txt',
	    '*.swp',
	    # NOTE(review): scratch scripts under .python_tmp/ look like tooling
	    # residue rather than release content -- confirm they should be excluded.
	    '.python_tmp/*',
	]
	print('uploading, ignore_patterns=', ignore_patterns)
	res = upload_folder(
	    folder_path='.',
	    path_in_repo='',
	    repo_id=repo_id,
	    token=hf_token,
	    ignore_patterns=ignore_patterns,
	)
	# Show the value returned by upload_folder itself; its length carries no
	# useful information.
	print('uploaded', res)

	# Download a snapshot of the repository so its contents can be inspected.
	local_dir = snapshot_download(repo_id=repo_id, token=hf_token, allow_patterns=['*'])
	print('downloaded to', local_dir)

	# Collect the full path of every *.swp file found anywhere in the snapshot.
	found_swp = [
	    os.path.join(dirpath, filename)
	    for dirpath, _subdirs, filenames in os.walk(local_dir)
	    for filename in filenames
	    if filename.endswith('.swp')
	]
	print('found .swp files:', found_swp)

	# Check that the snapshot and the local workspace hold the same checkpoint
	# configs, byte for byte, in both directions.
	config_glob = 'checkpoints/step_*/config.json'
	snap_paths = sorted(glob.glob(os.path.join(local_dir, config_glob)))
	print('snapshot config files count:', len(snap_paths))

	bad = []
	seen_rel = set()
	for p in snap_paths:
	    # The path relative to the snapshot root is also the workspace-relative
	    # path, since the comparison is made from the current directory.
	    rel = os.path.relpath(p, local_dir)
	    seen_rel.add(os.path.normpath(rel))
	    with open(p, 'rb') as f:
	        txt = f.read()
	    try:
	        with open(rel, 'rb') as f:
	            local_txt = f.read()
	    except FileNotFoundError:
	        # Present in the snapshot but not in the workspace.
	        bad.append((p, None))
	        continue
	    if txt != local_txt:
	        bad.append((p, rel))

	# A local config that never reached the snapshot is a failure too; otherwise an
	# upload that dropped files (or everything) would still report zero mismatches.
	for local_path in sorted(glob.glob(config_glob)):
	    if os.path.normpath(local_path) not in seen_rel:
	        bad.append((None, local_path))

	print('mismatches count:', len(bad))
	for a, b in bad[:10]:
	    print('mismatch:', a, 'vs', b)

	# Print a short summary of the run.
	print('\nFINAL:')
	summary = (
	    ('repo:', repo_id),
	    ('deleted_swap:', not os.path.exists(swap_path)),
	    ('.swp in repo:', len(found_swp)),
	    ('config mismatches:', len(bad)),
	)
	for label, value in summary:
	    print(label, value)

	# Exit status is 2 when swap files or config mismatches were found, else 0.
	sys.exit(2 if (found_swp or bad) else 0)