ordlibrary
/

DeepSolanaCoder

Model card Files Files and versions

DeepSolanaCoder / DeepSeek-Coder-main /finetune /venv /lib /python3.12 /site-packages /datasets /naming.py

ordlibrary's picture

Upload folder using huggingface_hub

f1e6b80 verified about 1 year ago

history blame contribute delete

3 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Lint as: python3
	"""Utilities for file names."""

	import itertools
	import os
	import re


	# Patterns used by camelcase_to_snakecase to locate word boundaries.
	# An upper-case run directly followed by an upper-case + lower-case pair.
	_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
	# A lower-case letter or digit directly followed by an upper-case letter.
	_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")

	# Patterns used by snakecase_to_camelcase to split a name into words.
	# An underscore that has no underscore on either side.
	_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
	# A run of two or more underscores; the capturing group makes re.split
	# return the run itself as an item of the result.
	_multiple_underscores_re = re.compile(r"(_{2,})")

	# Pattern *string* (not a compiled pattern) for split names: word characters,
	# optionally followed by dot-separated groups of word characters.
	_split_re = r"^\w+(\.\w+)*$"

	# Not referenced in the visible part of this file; going by the name, these
	# are characters that are not allowed in Windows paths (TODO confirm against
	# the callers). The raw string contains two consecutive backslashes.
	INVALID_WINDOWS_CHARACTERS_IN_PATH = r"<>:/\\|?*"


	def camelcase_to_snakecase(name):
	    """Return ``name`` converted from camel-case to lower-case snake-case.

	    An underscore is inserted at every word boundary found by the two
	    module-level boundary patterns, then the result is lower-cased,
	    e.g. ``"HTTPServer"`` becomes ``"http_server"``.
	    """
	    # Boundary inside an upper-case run that precedes a capitalised word.
	    acronyms_separated = _uppercase_uppercase_re.sub(r"\1_\2", name)
	    # Boundary between a lower-case letter or digit and an upper-case letter.
	    words_separated = _lowercase_uppercase_re.sub(r"\1_\2", acronyms_separated)
	    return words_separated.lower()


	def snakecase_to_camelcase(name):
	    """Return ``name`` converted from snake-case to camel-case.

	    Single underscores act as word separators and are dropped. Runs of two
	    or more underscores are kept verbatim between the capitalised words.
	    """
	    segments = _single_underscore_re.split(name)
	    # The capturing group in the pattern keeps each underscore run as a token.
	    tokens = (
	        token
	        for segment in segments
	        for token in _multiple_underscores_re.split(segment)
	    )
	    # Empty tokens come from leading, trailing or adjacent separators.
	    return "".join(token.capitalize() for token in tokens if token)


	def filename_prefix_for_name(name):
	    """Return the snake-case file name prefix for the dataset ``name``.

	    Raises:
	        ValueError: if ``name`` differs from ``os.path.basename(name)``,
	            i.e. it carries a directory component.
	    """
	    is_bare_name = os.path.basename(name) == name
	    if is_bare_name:
	        return camelcase_to_snakecase(name)
	    raise ValueError(f"Should be a dataset name, not a path: {name}")


	def filename_prefix_for_split(name, split):
	    """Return the file name prefix ``<snake_case_name>-<split>``.

	    Args:
	        name: Dataset name; must be a bare name, not a path.
	        split: Split name; the whole string must match ``_split_re``.

	    Raises:
	        ValueError: if ``name`` is a path, or if ``split`` is not a valid
	            split name.
	    """
	    # filename_prefix_for_name raises the same ValueError for path-like names,
	    # so calling it first keeps the name validated before the split.
	    name_prefix = filename_prefix_for_name(name)
	    # fullmatch is used because match() with a trailing "$" also accepts a
	    # split ending in a newline, which would leak into the file name.
	    if not re.fullmatch(_split_re, split):
	        raise ValueError(f"Split name should match '{_split_re}' but got '{split}'.")
	    return f"{name_prefix}-{split}"


	def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
	    """Return a pattern, ending in ``*``, for the files of one dataset split.

	    The pattern is ``data_dir`` joined with the split's file name prefix,
	    followed by ``.<filetype_suffix>`` when a truthy suffix is given, and a
	    trailing ``*``.

	    Raises:
	        ValueError: propagated from ``filename_prefix_for_split`` when
	            ``dataset_name`` or ``split`` is rejected.
	    """
	    stem = filename_prefix_for_split(dataset_name, split)
	    extension = f".{filetype_suffix}" if filetype_suffix else ""
	    return f"{os.path.join(data_dir, stem + extension)}*"


	def filenames_for_dataset_split(path, dataset_name, split, filetype_suffix=None, shard_lengths=None):
	    """Return the list of file names for one dataset split under ``path``.

	    Without ``shard_lengths`` (``None`` or empty) the list has a single
	    entry: ``<path>/<prefix>`` plus ``.<filetype_suffix>`` when a truthy
	    suffix is given. Otherwise there is one entry per shard, named
	    ``<path>/<prefix>-<shard_id>-of-<num_shards>`` with both numbers
	    zero-padded to five digits, again followed by the optional suffix.
	    Only the length of ``shard_lengths`` is used.

	    Raises:
	        ValueError: propagated from ``filename_prefix_for_split`` when
	            ``dataset_name`` or ``split`` is rejected.
	    """
	    base = os.path.join(path, filename_prefix_for_split(dataset_name, split))
	    extension = f".{filetype_suffix}" if filetype_suffix else ""
	    # Unsharded case: a single file carrying the bare prefix.
	    if not shard_lengths:
	        return [base + extension]
	    total = len(shard_lengths)
	    return [f"{base}-{index:05d}-of-{total:05d}{extension}" for index in range(total)]