# File size: 2,985 Bytes
# db704cb
#!/usr/bin/env python3
"""
Debug script to analyze the dataset structure.
"""
import json
from datasets import load_dataset
from collections import Counter
def analyze_sample(sample, idx):
    """Summarize the conversation structure of a single sample.

    Args:
        sample: Mapping for one record. Its ``'conversations'`` entry, when
            present and not ``None``, is iterated as a sequence of turn
            dicts; its ``'tools'`` entry is only checked for presence.
        idx: Position of the sample; echoed back under ``'index'``.

    Returns:
        A dict with the keys ``'index'``, ``'num_turns'``, ``'roles'``,
        ``'has_function_call'``, ``'has_observation'`` and ``'has_tools'``.
    """
    # A key that is present but holds None bypasses the .get() default, so
    # normalize explicitly instead of relying on `.get(key, [])`.
    conversations = sample.get('conversations') or []

    # A turn names its speaker under 'from'; fall back to 'role' when that
    # entry is missing or empty.
    roles = [turn.get('from') or turn.get('role') for turn in conversations]

    return {
        'index': idx,
        'num_turns': len(conversations),
        'roles': roles,
        'has_function_call': 'function_call' in roles,
        'has_observation': 'observation' in roles,
        # True only when the key exists AND its value is not None.
        'has_tools': sample.get('tools') is not None,
    }
def main(
    dataset_name="allenai/Dolci-Instruct-SFT-Tool-Use",
    split="train",
    batch_path='data/dolci_10k_with_tool_call_batch1.json',
    preview_count=5,
):
    """Print structural statistics for a dataset and a local batch file.

    Three reports are written to stdout:

    1. A per-sample summary of the first ``preview_count`` samples.
    2. The role distribution over the whole dataset, plus how many samples
       contain a ``function_call`` turn, an ``observation`` turn, or both.
    3. The size of the JSON file at ``batch_path`` and the keys, roles and
       first message of its first entry.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split name passed to ``load_dataset``.
        batch_path: Path of the JSON batch file to inspect.
        preview_count: Number of leading samples to print individually.

    Raises:
        OSError: If ``batch_path`` cannot be opened.
        json.JSONDecodeError: If ``batch_path`` does not contain valid JSON.
    """
    print("Loading dataset...")
    dataset = load_dataset(dataset_name, split=split)
    print(f"Total samples: {len(dataset)}")

    # Analyze first few samples
    print(f"\n=== First {preview_count} samples ===")
    for i in range(min(preview_count, len(dataset))):
        info = analyze_sample(dataset[i], i)
        print(f"\nSample {i}:")
        print(f" Roles: {info['roles']}")
        print(f" Has function_call: {info['has_function_call']}")
        print(f" Has observation: {info['has_observation']}")
        print(f" Has tools: {info['has_tools']}")

    # Count role distribution
    print("\n=== Analyzing entire dataset ===")
    role_counter = Counter()
    tool_call_count = 0
    observation_count = 0
    both_count = 0
    for idx, sample in enumerate(dataset):
        # Reuse the per-sample helper so the preview and the full pass can
        # never disagree about how roles are extracted.
        info = analyze_sample(sample, idx)
        role_counter.update(info['roles'])
        has_function = info['has_function_call']
        has_observation = info['has_observation']
        if has_function:
            tool_call_count += 1
        if has_observation:
            observation_count += 1
        if has_function and has_observation:
            both_count += 1

    print("\nRole distribution:")
    for role, count in role_counter.most_common():
        print(f" {role}: {count}")

    print("\nTool calling statistics:")
    print(f" Samples with function_call: {tool_call_count}")
    print(f" Samples with observation: {observation_count}")
    print(f" Samples with BOTH: {both_count}")

    # Check batch1 structure
    print("\n=== Checking batch1 structure ===")
    # Explicit encoding: without it open() falls back to a locale-dependent
    # default, which can fail on non-ASCII JSON content.
    with open(batch_path, 'r', encoding='utf-8') as f:
        batch1 = json.load(f)
    print(f"Batch1 total samples: {len(batch1)}")
    if batch1:
        sample = batch1[0]
        print(f"Batch1 sample 0 keys: {sample.keys()}")
        turns = sample.get('conversations') or []
        # Same 'from'/'role' fallback as analyze_sample, so a role-keyed
        # batch file does not raise KeyError here.
        batch_roles = [c.get('from') or c.get('role') for c in turns]
        print(f"Batch1 sample 0 conversations roles: {batch_roles}")
        if turns:
            # NOTE(review): assumes each turn stores its text under 'value';
            # confirm against the batch file's schema.
            print(f"First message value (first 100 chars): {turns[0]['value'][:100]}")
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|