nikraf
/

directionality_probe

Feature Extraction

Model card Files Files and versions

directionality_probe / protify /data /dataset_utils.py

nikraf's picture

Upload folder using huggingface_hub

714cf46 verified 20 days ago

history blame contribute delete

3.65 kB

	from .supported_datasets import (
	supported_datasets,
	internal_datasets,
	possible_with_vector_reps,
	standard_data_benchmark,
	testing,
	)


	def list_supported_datasets(with_descriptions=True):
	    """
	    Print all supported datasets, as a table when descriptions are available.

	    When ``with_descriptions`` is false, or the optional
	    ``dataset_descriptions`` module cannot be imported, a plain
	    ``- name: source`` list is printed and the function returns.
	    Otherwise a table with Dataset / Type / Task / Description columns is
	    printed, followed by the names in ``standard_data_benchmark``.

	    Args:
	        with_descriptions (bool): Whether to include descriptions (if available)

	    Returns:
	        None. All output goes to stdout via ``print``.
	    """
	    descriptions = None
	    if with_descriptions:
	        # The descriptions module is optional; fall back to the plain list
	        # when it is missing.
	        try:
	            from .dataset_descriptions import dataset_descriptions as descriptions
	        except ImportError:
	            descriptions = None

	    print("\n=== Supported Datasets ===\n")

	    if descriptions is None:
	        # Plain listing: name and source only. The benchmark section is not
	        # printed on this path.
	        for dataset_name in supported_datasets:
	            print(f"- {dataset_name}: {supported_datasets[dataset_name]}")
	        return

	    # Resolve every cell up front so the widths are computed from exactly
	    # what will be printed, including the 'Unknown' placeholders.
	    header = ("Dataset", "Type", "Task", "Description")
	    rows = []
	    for dataset_name in supported_datasets:
	        info = descriptions.get(dataset_name, {})
	        rows.append((
	            dataset_name,
	            info.get('type', 'Unknown'),
	            info.get('task', 'Unknown'),
	            info.get('description', 'No description available'),
	        ))

	    # Including the header row keeps max() from ever seeing an empty
	    # sequence (no datasets, or none with a description entry) and
	    # guarantees each column is at least as wide as its label.
	    max_name_len, max_type_len, max_task_len = (
	        max(len(row[col]) for row in [header] + rows) for col in range(3)
	    )

	    def _format_row(name, kind, task, description):
	        # Left-align the first three columns with two spaces of padding;
	        # the description is free-width.
	        return (
	            f"{name:<{max_name_len + 2}}"
	            f"{kind:<{max_type_len + 2}}"
	            f"{task:<{max_task_len + 2}}"
	            f"{description}"
	        )

	    # Print header
	    print(_format_row(*header))
	    print("-" * (max_name_len + max_type_len + max_task_len + 50))

	    # Print dataset information
	    for row in rows:
	        print(_format_row(*row))

	    print("\n=== Standard Benchmark Datasets ===\n")
	    for dataset_name in standard_data_benchmark:
	        print(f"- {dataset_name}")


	def get_dataset_info(dataset_name):
	    """
	    Look up the information recorded for a single dataset.

	    The optional ``dataset_descriptions`` module is consulted first. If it
	    cannot be imported, or has no entry for ``dataset_name``, a minimal
	    record is built from ``supported_datasets`` instead.

	    Args:
	        dataset_name (str): Name of the dataset

	    Returns:
	        dict: The entry from ``dataset_descriptions`` when present;
	        otherwise ``{"name": ..., "source": ...}`` for a name found in
	        ``supported_datasets``; ``None`` if the name is in neither.
	    """
	    try:
	        from .dataset_descriptions import dataset_descriptions as described
	    except ImportError:
	        # No descriptions module: fall through to the basic lookup below.
	        pass
	    else:
	        if dataset_name in described:
	            return described[dataset_name]

	    if dataset_name not in supported_datasets:
	        return None

	    source = supported_datasets[dataset_name]
	    return {"name": dataset_name, "source": source}


	if __name__ == "__main__":
	    # Command-line entry point: `--list` prints the supported datasets,
	    # `--info NAME` prints the fields recorded for one dataset.
	    import sys
	    import argparse

	    parser = argparse.ArgumentParser(description='List and describe supported datasets')
	    parser.add_argument('--list', action='store_true', help='List all supported datasets')
	    parser.add_argument('--info', type=str, help='Get information about a specific dataset')
	    args = parser.parse_args()

	    # Invoked with no arguments at all: show usage and exit with status 1.
	    if len(sys.argv) == 1:
	        parser.print_help()
	        sys.exit(1)

	    if args.list:
	        list_supported_datasets()

	    # Compare against None rather than relying on truthiness, so that
	    # `--info ""` still gets a "not found" reply instead of silence.
	    if args.info is not None:
	        dataset_info = get_dataset_info(args.info)
	        # get_dataset_info returns None only when the name is unknown; an
	        # empty dict is a found entry and must not be reported as missing.
	        if dataset_info is not None:
	            print(f"\n=== Dataset: {args.info} ===\n")
	            for key, value in dataset_info.items():
	                print(f"{key.capitalize()}: {value}")
	        else:
	            print(f"Dataset '{args.info}' not found in supported datasets.")