File size: 4,168 Bytes
db704cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
"""
Check batch5 data quality and format.
"""

import json
from collections import Counter

def check_conversation_format(conversations):
    """Validate the role sequence of a single conversation.

    Every message is expected to be a dict whose 'from' entry names the
    speaker role.  The following problems are reported:

    * the first message has a role other than 'human';
    * a message is not a dict, or has no (or a null) 'from' entry;
    * a role is not one of human / gpt / function_call / observation / system;
    * a 'function_call' is the last message, or is followed by a message
      whose role is neither 'observation' nor 'gpt'.

    Malformed messages are reported as issues instead of raising, so one bad
    record cannot abort a whole data-quality run.

    Args:
        conversations: Sequence of message dicts for one sample.

    Returns:
        A tuple ``(issues, has_function_call, has_observation)`` where
        ``issues`` is a list of human-readable problem descriptions (empty
        when nothing was found) and the two booleans say whether any message
        carries the 'function_call' / 'observation' role.
    """
    # Tuples (not sets) so that an unexpected unhashable role value coming
    # from the JSON data is still compared safely with ``in``.
    valid_roles = ('human', 'gpt', 'function_call', 'observation', 'system')
    valid_followers = ('observation', 'gpt')
    issues = []

    # None stands for "no usable role" (non-dict message or missing 'from').
    roles = [
        message.get('from') if isinstance(message, dict) else None
        for message in conversations
    ]

    # The conversation must open with the human turn.  A missing role at
    # position 0 is reported by the per-message check below instead.
    if roles and roles[0] is not None and roles[0] != 'human':
        issues.append(f"Does not start with 'human', starts with '{roles[0]}'")

    # Every message must carry one of the known role names.
    for position, role in enumerate(roles):
        if role is None:
            issues.append(f"Message at position {position} has no 'from' role")
        elif role not in valid_roles:
            issues.append(f"Invalid role '{role}' at position {position}")

    has_function_call = 'function_call' in roles
    has_observation = 'observation' in roles

    # Each function_call needs a response: an observation or a gpt message.
    for position, role in enumerate(roles):
        if role != 'function_call':
            continue
        if position + 1 >= len(roles):
            issues.append("function_call is last message (missing response)")
        elif roles[position + 1] not in valid_followers:
            issues.append(
                f"function_call followed by '{roles[position + 1]}' instead of observation/gpt"
            )

    return issues, has_function_call, has_observation

def _message_role(message):
    """Return a message's 'from' role, or None if it is not a dict or has none."""
    return message.get('from') if isinstance(message, dict) else None


def _message_preview(message, limit=100):
    """Return at most `limit` leading characters of a message's 'value' text."""
    value = message.get('value', '') if isinstance(message, dict) else ''
    return str(value)[:limit]


def main(data_path='data/dolci_10k_with_tool_call_batch5.json'):
    """Load a batch file, print statistics, and report conversation format issues.

    The file is parsed as JSON and treated as a list of sample dicts.  For each
    sample the 'conversations' list is inspected (role counts, length, format
    check via ``check_conversation_format``) and the 'tools' / 'system'
    entries are counted when present and non-empty.  A summary, the first ten
    samples with issues, and a preview of samples 0, 100 and 500 are printed
    to stdout.

    Args:
        data_path: Path of the JSON batch file to check.  Defaults to the
            batch5 file this script was written for.

    Returns:
        None.  All results are printed.
    """
    print("Loading batch5...")
    with open(data_path, 'r', encoding='utf-8') as f:
        batch5 = json.load(f)

    print(f"Total samples: {len(batch5)}")

    # Average/min/max below are undefined for an empty dataset; stop early
    # instead of failing with ZeroDivisionError / ValueError.
    if not batch5:
        print("No samples to analyze.")
        return

    # Statistics
    role_counter = Counter()
    samples_with_tools = 0
    samples_with_system = 0
    conversation_lengths = []
    issues_found = []

    print("\nAnalyzing samples...")
    for idx, sample in enumerate(batch5):
        conversations = sample.get('conversations', [])
        roles = [_message_role(conv) for conv in conversations]

        # Count roles
        role_counter.update(roles)

        # Check structure: a field counts only when present and non-empty.
        if sample.get('tools'):
            samples_with_tools += 1
        if sample.get('system'):
            samples_with_system += 1

        conversation_lengths.append(len(conversations))

        # Check format; only the issue list is needed here.
        issues, _, _ = check_conversation_format(conversations)
        if issues:
            issues_found.append({
                'index': idx,
                'issues': issues,
                'roles': roles,
            })

    # Print statistics
    average_length = sum(conversation_lengths) / len(conversation_lengths)
    print("\n=== Statistics ===")
    print(f"Samples with 'tools' field: {samples_with_tools}")
    print(f"Samples with 'system' field: {samples_with_system}")
    print(f"Average conversation length: {average_length:.2f}")
    print(f"Min conversation length: {min(conversation_lengths)}")
    print(f"Max conversation length: {max(conversation_lengths)}")

    print("\n=== Role Distribution ===")
    for role, count in role_counter.most_common():
        print(f"  {role}: {count}")

    print("\n=== Issues Found ===")
    print(f"Total samples with issues: {len(issues_found)}")

    if issues_found:
        print("\nFirst 10 issues:")
        for item in issues_found[:10]:
            print(f"\nSample {item['index']}:")
            print(f"  Roles: {item['roles']}")
            for issue in item['issues']:
                print(f"  - {issue}")

    # Show sample entries
    print("\n=== Sample Entries ===")
    for i in (0, 100, 500):
        if i >= len(batch5):
            continue
        print(f"\n--- Sample {i} ---")
        sample = batch5[i]
        # Same tolerant lookup as the analysis loop, so a sample without
        # conversations is previewed as empty instead of raising.
        conversations = sample.get('conversations', [])
        print(f"Has tools: {'tools' in sample}")
        print(f"Has system: {'system' in sample}")
        print(f"Conversation roles: {[_message_role(c) for c in conversations]}")
        if conversations:
            print(f"First message: {_message_preview(conversations[0])}...")
        if len(conversations) > 1:
            print(f"Second message from: {_message_role(conversations[1])}")
            print(f"Second message: {_message_preview(conversations[1])}...")

# Run the batch check only when this file is executed as a script, so that
# importing the module does not trigger file loading or printing.
if __name__ == "__main__":
    main()