|
|
|
|
|
""" |
|
|
Debug script to analyze the dataset structure. |
|
|
""" |
|
|
|
|
|
import json |
|
|
from datasets import load_dataset |
|
|
from collections import Counter |
|
|
|
|
|
def analyze_sample(sample, idx):
    """Summarize the conversation structure of a single sample.

    Args:
        sample: Mapping for one dataset row. Its 'conversations' entry is
            read as a sequence of turn mappings; its 'tools' entry is only
            checked for being present and not None.
        idx: Index of the sample, echoed back under the 'index' key.

    Returns:
        A dict with the keys 'index', 'num_turns', 'roles',
        'has_function_call', 'has_observation' and 'has_tools'.
    """
    # A `.get(key, [])` default only covers a missing key; a key that is
    # present but stored as None would still break the iteration below, so
    # both cases are normalised to an empty list.
    conversations = sample.get('conversations')
    if conversations is None:
        conversations = []

    # Each turn names its speaker under either 'from' or 'role'.
    roles = [turn.get('from') or turn.get('role') for turn in conversations]

    return {
        'index': idx,
        'num_turns': len(conversations),
        'roles': roles,
        'has_function_call': 'function_call' in roles,
        'has_observation': 'observation' in roles,
        'has_tools': sample.get('tools') is not None,
    }
|
|
|
|
|
def main():
    """Print a structural overview of the dataset and of the batch1 file.

    Steps, in order:
      1. Load the "allenai/Dolci-Instruct-SFT-Tool-Use" train split and print
         its size.
      2. Print the per-sample summary of the first five samples.
      3. Walk the whole split, counting roles and how many samples contain a
         'function_call' turn, an 'observation' turn, or both.
      4. Load 'data/dolci_10k_with_tool_call_batch1.json' and print the keys,
         roles and first message of its first entry.
    """
    print("Loading dataset...")
    dataset = load_dataset("allenai/Dolci-Instruct-SFT-Tool-Use", split="train")

    print(f"Total samples: {len(dataset)}")

    print("\n=== First 5 samples ===")
    for i in range(min(5, len(dataset))):
        info = analyze_sample(dataset[i], i)
        print(f"\nSample {i}:")
        print(f"  Roles: {info['roles']}")
        print(f"  Has function_call: {info['has_function_call']}")
        print(f"  Has observation: {info['has_observation']}")
        print(f"  Has tools: {info['has_tools']}")

    print("\n=== Analyzing entire dataset ===")
    role_counter = Counter()
    tool_call_count = 0
    observation_count = 0
    both_count = 0

    for idx, sample in enumerate(dataset):
        # Reuse the per-sample summary rather than re-deriving the roles
        # here, so both reports share one definition of a turn's role.
        info = analyze_sample(sample, idx)
        role_counter.update(info['roles'])

        has_function = info['has_function_call']
        has_observation = info['has_observation']

        if has_function:
            tool_call_count += 1
        if has_observation:
            observation_count += 1
        if has_function and has_observation:
            both_count += 1

    print("\nRole distribution:")
    for role, count in role_counter.most_common():
        print(f"  {role}: {count}")

    print("\nTool calling statistics:")
    print(f"  Samples with function_call: {tool_call_count}")
    print(f"  Samples with observation: {observation_count}")
    print(f"  Samples with BOTH: {both_count}")

    print("\n=== Checking batch1 structure ===")
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open('data/dolci_10k_with_tool_call_batch1.json', 'r', encoding='utf-8') as f:
        batch1 = json.load(f)

    print(f"Batch1 total samples: {len(batch1)}")
    if batch1:
        sample = batch1[0]
        conversations = sample.get('conversations') or []
        # Same 'from'/'role' fallback as the dataset analysis above.
        batch_roles = [c.get('from') or c.get('role') for c in conversations]
        print(f"Batch1 sample 0 keys: {sample.keys()}")
        print(f"Batch1 sample 0 conversations roles: {batch_roles}")
        # Only preview the first message when there is one to index.
        if conversations:
            print(f"First message value (first 100 chars): {conversations[0]['value'][:100]}")
|
|
|
|
|
# Run the analysis only when this file is executed as a script, so that
# importing the module does not trigger the dataset download.
if __name__ == "__main__":
    main()
|
|
|