File size: 1,436 Bytes
db704cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
"""
Inspect the format of the `function_calls` field in the
allenai/Dolci-Instruct-SFT-Tool-Use dataset.
"""

from datasets import load_dataset

def main(tail_size=20000, scan_limit=1000):
    """Print examples of the ``function_calls`` field from the dataset tail.

    Loads the ``train`` split of ``allenai/Dolci-Instruct-SFT-Tool-Use`` and:

    1. Prints the type and content of the first non-empty ``function_calls``
       value found in the sample located ``tail_size`` rows before the end.
    2. Scans up to ``scan_limit`` samples from that position for a
       ``function_calls`` value whose text starts with ``[`` (the heuristic
       this script uses for "multiple function calls") and prints the first
       500 characters of the first match.

    Args:
        tail_size: How many rows from the end of the split to start at.
            Clamped so the start index always lies inside the dataset.
        scan_limit: Maximum number of samples to scan in step 2. The scan
            never runs past the end of the dataset.

    Returns:
        None. All results are printed to stdout.
    """
    print("Loading dataset...")
    dataset = load_dataset("allenai/Dolci-Instruct-SFT-Tool-Use", split="train")

    total = len(dataset)
    if total == 0:
        # Indexing an empty split would raise; report and stop instead.
        print("Dataset is empty; nothing to inspect.")
        return

    # Check samples from the last `tail_size` rows. Clamp into [0, total - 1]
    # so a split smaller than `tail_size` (or tail_size <= 0) cannot produce
    # a negative or out-of-range index.
    start_idx = min(max(0, total - tail_size), total - 1)

    print(f"\n{'='*60}")
    print(f"Sample {start_idx}:")
    print(f"{'='*60}")

    sample = dataset[start_idx]
    messages = sample['messages'] or []

    for i, msg in enumerate(messages):
        role = msg.get('role', '')
        function_calls = msg.get('function_calls')

        if function_calls:
            print(f"\nMessage {i} (role={role}):")
            print(f"function_calls type: {type(function_calls)}")
            print(f"function_calls content:\n{function_calls}")
            break
    else:
        print("\nNo message with function_calls in this sample.")

    # Check a sample with multiple function calls
    print(f"\n{'='*60}")
    print("Looking for sample with multiple function calls...")
    print(f"{'='*60}")

    # Never scan past the last row of the dataset.
    end_idx = min(start_idx + scan_limit, total)

    for idx in range(start_idx, end_idx):
        messages = dataset[idx]['messages'] or []

        for msg in messages:
            function_calls = msg.get('function_calls')
            if not function_calls:
                continue

            # The field's type is exactly what this script is investigating,
            # so do not assume it is a string: fall back to its text form
            # (a Python list's repr also starts with '[').
            if isinstance(function_calls, str):
                text = function_calls
            else:
                text = str(function_calls)

            if text.startswith('['):
                print(f"\nFound at index {idx}:")
                print(f"function_calls:\n{text[:500]}...")
                return

    print(
        f"\nNo sample with multiple function calls found "
        f"in indices {start_idx}-{end_idx - 1}."
    )

# Run the inspection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()