#!/usr/bin/env python3
"""
Debug script to analyze the dataset structure.

Prints per-sample role information for the first few samples, a role
distribution over the whole dataset, and a quick look at a local batch file.
"""

import json
from collections import Counter

DATASET_NAME = "allenai/Dolci-Instruct-SFT-Tool-Use"
BATCH1_PATH = 'data/dolci_10k_with_tool_call_batch1.json'


def _conversation_roles(sample):
    """Return the role of every turn in ``sample['conversations']``.

    Each turn's role is read from its ``'from'`` key, falling back to
    ``'role'``; a turn with neither yields ``None``. A missing or null
    ``'conversations'`` entry is treated as an empty conversation.
    """
    conversations = sample.get('conversations') or []
    return [turn.get('from') or turn.get('role') for turn in conversations]


def analyze_sample(sample, idx):
    """Analyze a single sample structure.

    Args:
        sample: Mapping with an optional ``'conversations'`` list of turn
            dicts and an optional ``'tools'`` entry.
        idx: Index of the sample, echoed back in the result.

    Returns:
        A dict with the keys ``'index'``, ``'num_turns'``, ``'roles'``,
        ``'has_function_call'``, ``'has_observation'`` and ``'has_tools'``.
    """
    roles = _conversation_roles(sample)
    return {
        'index': idx,
        'num_turns': len(roles),
        'roles': roles,
        'has_function_call': 'function_call' in roles,
        'has_observation': 'observation' in roles,
        # True only when the key exists and holds something other than None.
        'has_tools': sample.get('tools') is not None,
    }


def _collect_stats(dataset):
    """Count roles and tool-calling samples in a single pass over *dataset*.

    Returns:
        ``(role_counter, tool_call_count, observation_count, both_count)``
        where the three counts are numbers of samples containing a
        ``function_call`` turn, an ``observation`` turn, and both.
    """
    role_counter = Counter()
    tool_call_count = 0
    observation_count = 0
    both_count = 0

    for sample in dataset:
        roles = _conversation_roles(sample)
        role_counter.update(roles)

        has_function = 'function_call' in roles
        has_observation = 'observation' in roles
        if has_function:
            tool_call_count += 1
        if has_observation:
            observation_count += 1
        if has_function and has_observation:
            both_count += 1

    return role_counter, tool_call_count, observation_count, both_count


def _load_dataset():
    """Load the train split of ``DATASET_NAME`` via ``datasets.load_dataset``."""
    # Imported here rather than at module level: only main() needs this
    # third-party package, so analyze_sample() stays importable without it.
    from datasets import load_dataset

    return load_dataset(DATASET_NAME, split="train")


def _check_batch1(path=BATCH1_PATH):
    """Print a short structural summary of the local batch1 JSON file."""
    print("\n=== Checking batch1 structure ===")
    try:
        with open(path, 'r', encoding='utf-8') as f:
            batch1 = json.load(f)
    except FileNotFoundError:
        # Report and carry on instead of ending the run with a traceback.
        print(f"Batch1 file not found: {path}")
        return

    print(f"Batch1 total samples: {len(batch1)}")
    if not batch1:
        return

    sample = batch1[0]
    conversations = sample.get('conversations') or []
    print(f"Batch1 sample 0 keys: {sample.keys()}")
    print(f"Batch1 sample 0 conversations roles: {_conversation_roles(sample)}")
    if conversations:
        first_value = conversations[0].get('value') or ''
        print(f"First message value (first 100 chars): {first_value[:100]}")
    else:
        print("Batch1 sample 0 has no conversation turns")


def main():
    """Load the dataset and print the structure report to stdout."""
    print("Loading dataset...")
    dataset = _load_dataset()

    print(f"Total samples: {len(dataset)}")

    # Analyze first few samples
    print("\n=== First 5 samples ===")
    for i in range(min(5, len(dataset))):
        info = analyze_sample(dataset[i], i)
        print(f"\nSample {i}:")
        print(f" Roles: {info['roles']}")
        print(f" Has function_call: {info['has_function_call']}")
        print(f" Has observation: {info['has_observation']}")
        print(f" Has tools: {info['has_tools']}")

    # Count role distribution
    print("\n=== Analyzing entire dataset ===")
    role_counter, tool_call_count, observation_count, both_count = (
        _collect_stats(dataset)
    )

    print("\nRole distribution:")
    for role, count in role_counter.most_common():
        print(f" {role}: {count}")

    print("\nTool calling statistics:")
    print(f" Samples with function_call: {tool_call_count}")
    print(f" Samples with observation: {observation_count}")
    print(f" Samples with BOTH: {both_count}")

    # Check batch1 structure
    _check_batch1()


if __name__ == "__main__":
    main()