#!/usr/bin/env python3
"""
Check original Dolci dataset format to understand the structure.

Loads the "train" split of ``allenai/Dolci-Instruct-SFT-Tool-Use`` and prints
a truncated preview of every message in a few samples taken from the tail of
the split.
"""

import json

from datasets import load_dataset

DATASET_NAME = "allenai/Dolci-Instruct-SFT-Tool-Use"
# Samples are picked from the final TAIL_SIZE rows of the split.
TAIL_SIZE = 20000
# Offsets, relative to the first row of that tail, of the samples to print.
SAMPLE_OFFSETS = (0, 100, 500)
# Longest prefix of a field that is printed before "..." is appended.
PREVIEW_LEN = 200


def _preview(value, limit=PREVIEW_LEN):
    """Return the first *limit* items of *value*, plus '...' if it was cut.

    *value* only needs to support slicing and ``len()``; the slice is
    rendered through an f-string, so non-string values appear as their
    ``str()`` form.
    """
    suffix = '...' if len(value) > limit else ''
    return f"{value[:limit]}{suffix}"


def _print_message(position, msg):
    """Print the role and any non-empty fields of one message mapping.

    Args:
        position: Index of the message within its sample (used in the header).
        msg: Mapping read with ``.get`` for the keys 'role', 'content',
            'function_calls' and 'functions'.
    """
    role = msg.get('role', '')
    content = msg.get('content', '')
    function_calls = msg.get('function_calls')
    functions = msg.get('functions')

    print(f"\n--- Message {position} ---")
    print(f"Role: {role}")
    # Falsy fields (missing, None, empty) are skipped to keep the dump short.
    if content:
        print(f"Content: {_preview(content)}")
    if function_calls:
        print(f"Function calls: {_preview(function_calls)}")
    if functions:
        print(f"Has functions: True (length: {len(functions)})")


def main():
    """Load the dataset and dump a preview of a few samples from its tail."""
    print("Loading dataset...")
    dataset = load_dataset(DATASET_NAME, split="train")

    # Check samples from the last 20k.  Clamp the start to 0 and drop any
    # index past the end so a split shorter than TAIL_SIZE neither wraps
    # around through negative indexing nor raises IndexError.
    total = len(dataset)
    start_idx = max(0, total - TAIL_SIZE)
    sample_indices = [
        start_idx + offset
        for offset in SAMPLE_OFFSETS
        if start_idx + offset < total
    ]

    banner = '=' * 60
    for idx in sample_indices:
        print(f"\n{banner}")
        print(f"Sample {idx}:")
        print(f"{banner}")

        sample = dataset[idx]
        messages = sample['messages']

        print(f"\nTotal messages: {len(messages)}")

        for i, msg in enumerate(messages):
            _print_message(i, msg)


if __name__ == "__main__":
    main()