#!/usr/bin/env python3
"""
Check batch5 data quality and format.
"""

import json
from collections import Counter

# Roles a message's 'from' field is allowed to carry.
_VALID_ROLES = ('human', 'gpt', 'function_call', 'observation', 'system')

# Roles that may immediately follow a 'function_call' message.
_FUNCTION_CALL_FOLLOWERS = ('observation', 'gpt')

# Dataset checked when main() is called without an explicit path.
_DEFAULT_BATCH_PATH = 'data/dolci_10k_with_tool_call_batch5.json'

# Indices of the samples printed in the "Sample Entries" section.
_SAMPLE_INDICES = (0, 100, 500)


def check_conversation_format(conversations):
    """Validate the role sequence of a single conversation.

    Three checks are performed, in this order:

    1. A non-empty conversation must start with a 'human' message.
    2. Every role must be one of the known roles.
    3. Every 'function_call' must be followed by an 'observation' or
       'gpt' message (and therefore cannot be the last message).

    Args:
        conversations: List of message dicts. The role of each message is
            read from its 'from' key; a message without that key is treated
            as having role ``None`` and is reported as an invalid role
            rather than raising ``KeyError``.

    Returns:
        A tuple ``(issues, has_function_call, has_observation)`` where
        ``issues`` is a list of human-readable problem descriptions (empty
        when the conversation passes every check) and the two booleans tell
        whether any message has the 'function_call' / 'observation' role.
    """
    issues = []

    # .get() keeps the checker alive on messages that lack a role, so the
    # defect is reported instead of aborting the whole run.
    roles = [conv.get('from') for conv in conversations]

    # Check if starts with human
    if roles and roles[0] != 'human':
        issues.append(f"Does not start with 'human', starts with '{roles[0]}'")

    # Check that every role is a known one
    for i, role in enumerate(roles):
        if role not in _VALID_ROLES:
            issues.append(f"Invalid role '{role}' at position {i}")

    # Every function_call needs an observation or gpt message right after it
    for i, role in enumerate(roles):
        if role != 'function_call':
            continue
        if i + 1 >= len(roles):
            issues.append("function_call is last message (missing response)")
        elif roles[i + 1] not in _FUNCTION_CALL_FOLLOWERS:
            issues.append(
                f"function_call followed by '{roles[i + 1]}' instead of observation/gpt"
            )

    has_function_call = 'function_call' in roles
    has_observation = 'observation' in roles
    return issues, has_function_call, has_observation


def main(path=_DEFAULT_BATCH_PATH):
    """Load a batch file and print a data-quality report to stdout.

    The report contains role/length statistics, the first ten samples whose
    conversation fails check_conversation_format(), and a preview of a few
    fixed sample indices.

    Args:
        path: JSON file to check. The file is expected to hold a list of
            sample dicts, each optionally carrying 'conversations', 'tools'
            and 'system' keys. Defaults to the batch5 dataset.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    print("Loading batch5...")
    with open(path, 'r', encoding='utf-8') as f:
        batch5 = json.load(f)

    print(f"Total samples: {len(batch5)}")

    # Statistics
    role_counter = Counter()
    samples_with_tools = 0
    samples_with_system = 0
    conversation_length = []
    issues_found = []

    print("\nAnalyzing samples...")
    for idx, sample in enumerate(batch5):
        conversations = sample.get('conversations', [])
        roles = [conv.get('from') for conv in conversations]

        # Count roles
        role_counter.update(roles)

        # Check structure: the field must be present AND non-empty to count
        if sample.get('tools'):
            samples_with_tools += 1
        if sample.get('system'):
            samples_with_system += 1

        conversation_length.append(len(conversations))

        # Check format
        issues, _has_fc, _has_obs = check_conversation_format(conversations)
        if issues:
            issues_found.append({
                'index': idx,
                'issues': issues,
                'roles': roles,
            })

    # Print statistics
    print("\n=== Statistics ===")
    print(f"Samples with 'tools' field: {samples_with_tools}")
    print(f"Samples with 'system' field: {samples_with_system}")
    if conversation_length:
        average = sum(conversation_length) / len(conversation_length)
        print(f"Average conversation length: {average:.2f}")
        print(f"Min conversation length: {min(conversation_length)}")
        print(f"Max conversation length: {max(conversation_length)}")
    else:
        # An empty dataset has no average/min/max; say so instead of
        # crashing with ZeroDivisionError / ValueError.
        print("No samples: conversation length statistics unavailable")

    print("\n=== Role Distribution ===")
    for role, count in role_counter.most_common():
        print(f"  {role}: {count}")

    print("\n=== Issues Found ===")
    print(f"Total samples with issues: {len(issues_found)}")

    if issues_found:
        print("\nFirst 10 issues:")
        for item in issues_found[:10]:
            print(f"\nSample {item['index']}:")
            print(f"  Roles: {item['roles']}")
            for issue in item['issues']:
                print(f"    - {issue}")

    # Show sample entries
    print("\n=== Sample Entries ===")
    for i in _SAMPLE_INDICES:
        if i >= len(batch5):
            continue
        sample = batch5[i]
        # Same tolerant access as the analysis loop above, so a sample with
        # missing or empty conversations cannot crash the preview.
        conversations = sample.get('conversations', [])

        print(f"\n--- Sample {i} ---")
        print(f"Has tools: {'tools' in sample}")
        print(f"Has system: {'system' in sample}")
        print(f"Conversation roles: {[c.get('from') for c in conversations]}")
        if conversations:
            # NOTE(review): assumes 'value' is a string when present — confirm.
            print(f"First message: {conversations[0].get('value', '')[:100]}...")
        if len(conversations) > 1:
            print(f"Second message from: {conversations[1].get('from')}")
            print(f"Second message: {conversations[1].get('value', '')[:100]}...")


if __name__ == "__main__":
    main()