Spaces:

akhaliq
/

SummerTime

Build error

SummerTime / model /third_party /HMNet /DataLoader /infinibatch /bin /block_randomize.py

akhaliq3

spaces demo

546a9ba over 4 years ago

4.89 kB

	# Copyright (c) Microsoft Corporation.
	# Licensed under the MIT license.

	#!/usr/bin/python3.6

	# simple command-line wrapper around the chunked_dataset_iterator
	# Example:
	# block_randomize my_chunked_data_folder/
	# block_randomize --azure-storage-key $MY_KEY https://myaccount.blob.core.windows.net/mycontainer/my_chunked_data_folder

	import os, sys, inspect

	sys.path.insert(
	0,
	os.path.dirname(
	os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
	),
	) # find our imports

	from infinibatch.datasets import chunked_dataset_iterator

	from typing import Union, Iterator, Callable, Any, Optional, Dict
	import os, sys, re
	import gzip


	# helper functions to abstract access to Azure blobs
	# @TODO: These will be abstracted into a helper library in a future version.
	def _try_parse_azure_blob_uri(path: str):
	try:
	m = re.compile("https://([a-z0-9]).blob.core.windows.net/([^/])/(.*)").match(
	path
	)
	# print (m.group(1))
	# print (m.group(2))
	# print (m.group(3))
	return (m.group(1), m.group(2), m.group(3))
	except:
	return None


	def _get_azure_key(
	storage_account: str, credentials: Optional[Union[str, Dict[str, str]]]
	):
	if not credentials:
	return None
	elif isinstance(credentials, str):
	return credentials
	else:
	return credentials[storage_account]


	def read_utf8_file(
	path: str, credentials: Optional[Union[str, Dict[str, str]]]
	) -> Iterator[str]:
	blob_data = _try_parse_azure_blob_uri(path)
	if blob_data is None:
	with open(path, "rb") as f:
	data = f.read()
	else:
	try:
	# pip install azure-storage-blob
	from azure.storage.blob import BlobClient
	except:
	print(
	"Failed to import azure.storage.blob. Please pip install azure-storage-blob",
	file=sys.stderr,
	)
	raise
	data = (
	BlobClient.from_blob_url(
	path,
	credential=_get_azure_key(
	storage_account=blob_data[0], credentials=credentials
	),
	)
	.download_blob()
	.readall()
	)
	if path.endswith(".gz"):
	data = gzip.decompress(data)
	# @TODO: auto-detect UCS-2 by BOM
	return iter(data.decode(encoding="utf-8").splitlines())


	def enumerate_files(
	dir: str, ext: str, credentials: Optional[Union[str, Dict[str, str]]]
	):
	blob_data = _try_parse_azure_blob_uri(dir)
	if blob_data is None:
	return [
	os.path.join(dir, path.name)
	for path in os.scandir(dir)
	if path.is_file() and (ext is None or path.name.endswith(ext))
	]
	else:
	try:
	# pip install azure-storage-blob
	from azure.storage.blob import ContainerClient
	except:
	print(
	"Failed to import azure.storage.blob. Please pip install azure-storage-blob",
	file=sys.stderr,
	)
	raise
	account, container, blob_path = blob_data

	print("enumerate_files: enumerating blobs in", dir, file=sys.stderr, flush=True)
	# @BUGBUG: The prefix does not seem to have to start; seems it can also be a substring
	container_uri = "https://" + account + ".blob.core.windows.net/" + container
	container_client = ContainerClient.from_container_url(
	container_uri, credential=_get_azure_key(account, credentials)
	)
	if not blob_path.endswith("/"):
	blob_path += "/"
	blob_uris = [
	container_uri + "/" + blob["name"]
	for blob in container_client.walk_blobs(blob_path, delimiter="")
	if (ext is None or blob["name"].endswith(ext))
	]
	print(
	"enumerate_files:",
	len(blob_uris),
	"blobs found",
	file=sys.stderr,
	flush=True,
	)
	for blob_name in blob_uris[:10]:
	print(blob_name, file=sys.stderr, flush=True)
	return blob_uris


	if sys.argv[1] == "--azure-storage-key":
	credential = sys.argv[2]
	paths = sys.argv[3:]
	else:
	credential = None
	paths = sys.argv[1:]

	chunk_file_paths = [ # enumerate all .gz files in the given paths
	subpath for path in paths for subpath in enumerate_files(path, ".gz", credential)
	]
	chunk_file_paths.sort() # make sure file order is always the same, independent of OS
	print(
	"block_randomize: reading from",
	len(chunk_file_paths),
	"chunk files",
	file=sys.stderr,
	)

	ds = chunked_dataset_iterator(
	chunk_refs=chunk_file_paths,
	read_chunk_fn=lambda path: read_utf8_file(path, credential),
	shuffle=True,
	buffer_size=1000000,
	seed=1,
	use_windowed=True,
	)
	for line in ds:
	print(line)