dongxx1104
/

llm

Model card Files Files and versions

llm / debug_dataset.py

dongxx1104's picture

Upload folder using huggingface_hub

db704cb verified 11 days ago

history blame contribute delete

2.99 kB

	#!/usr/bin/env python3
	"""
	Debug script to analyze the dataset structure.
	"""

	import json
	from datasets import load_dataset
	from collections import Counter

	def analyze_sample(sample, idx):
	    """Summarize the conversation structure of a single sample.

	    Args:
	        sample: Mapping for one dataset row. Its ``conversations`` entry,
	            when present, is a list of turn dicts that name the speaker
	            under either a ``from`` or a ``role`` key.
	        idx: Position of the sample; echoed back under ``index``.

	    Returns:
	        A dict with the keys ``index``, ``num_turns``, ``roles``,
	        ``has_function_call``, ``has_observation`` and ``has_tools``.
	    """
	    # A missing key and an explicit None are both treated as "no turns";
	    # a plain .get(key, []) would still hand back None and break iteration.
	    conversations = sample.get('conversations') or []
	    roles = [turn.get('from') or turn.get('role') for turn in conversations]

	    return {
	        'index': idx,
	        'num_turns': len(conversations),
	        'roles': roles,
	        'has_function_call': 'function_call' in roles,
	        'has_observation': 'observation' in roles,
	        # True only when the key exists and holds something other than None
	        'has_tools': sample.get('tools') is not None,
	    }

	def main():
	    """Print structural statistics for the tool-use dataset and batch1 file.

	    Loads the ``train`` split of ``allenai/Dolci-Instruct-SFT-Tool-Use``,
	    shows the roles of the first five samples, tallies role frequencies and
	    function_call/observation presence over every sample, and finally
	    prints a short summary of ``data/dolci_10k_with_tool_call_batch1.json``.

	    Raises:
	        FileNotFoundError: If the batch1 JSON file does not exist.
	    """
	    print("Loading dataset...")
	    dataset = load_dataset("allenai/Dolci-Instruct-SFT-Tool-Use", split="train")

	    print(f"Total samples: {len(dataset)}")

	    # Analyze first few samples
	    print("\n=== First 5 samples ===")
	    for i in range(min(5, len(dataset))):
	        info = analyze_sample(dataset[i], i)
	        print(f"\nSample {i}:")
	        print(f" Roles: {info['roles']}")
	        print(f" Has function_call: {info['has_function_call']}")
	        print(f" Has observation: {info['has_observation']}")
	        print(f" Has tools: {info['has_tools']}")

	    # Count role distribution
	    print("\n=== Analyzing entire dataset ===")
	    role_counter = Counter()
	    tool_call_count = 0
	    observation_count = 0
	    both_count = 0

	    for idx, sample in enumerate(dataset):
	        # Reuse the per-sample helper so role extraction lives in one place
	        # and each conversation is walked only once.
	        info = analyze_sample(sample, idx)
	        role_counter.update(info['roles'])

	        has_function = info['has_function_call']
	        has_observation = info['has_observation']

	        if has_function:
	            tool_call_count += 1
	        if has_observation:
	            observation_count += 1
	        if has_function and has_observation:
	            both_count += 1

	    print("\nRole distribution:")
	    for role, count in role_counter.most_common():
	        print(f" {role}: {count}")

	    print("\nTool calling statistics:")
	    print(f" Samples with function_call: {tool_call_count}")
	    print(f" Samples with observation: {observation_count}")
	    print(f" Samples with BOTH: {both_count}")

	    # Check batch1 structure
	    print("\n=== Checking batch1 structure ===")
	    # Explicit encoding: the platform default is locale-dependent and may
	    # not be UTF-8.
	    with open('data/dolci_10k_with_tool_call_batch1.json', 'r', encoding='utf-8') as f:
	        batch1 = json.load(f)

	    print(f"Batch1 total samples: {len(batch1)}")
	    if batch1:
	        sample = batch1[0]
	        print(f"Batch1 sample 0 keys: {sample.keys()}")
	        print(f"Batch1 sample 0 conversations roles: {[c['from'] for c in sample['conversations']]}")
	        print(f"First message value (first 100 chars): {sample['conversations'][0]['value'][:100]}")

	# Run the analysis only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()