|
|
|
|
|
""" |
|
|
Check batch5 data quality and conversation format: role statistics, format issues, and sample previews.
|
|
""" |
|
|
|
|
|
import json |
|
|
from collections import Counter |
|
|
|
|
|
def check_conversation_format(conversations):
    """Check whether a conversation follows the expected message format.

    The checks are:
      * the first message must come from 'human';
      * every role must be one of 'human', 'gpt', 'function_call',
        'observation' or 'system';
      * every 'function_call' must be followed by an 'observation' or
        'gpt' message.

    Args:
        conversations: Sequence of message dicts; only the 'from' key of
            each message is inspected. ``None`` is treated as an empty
            conversation.

    Returns:
        A tuple ``(issues, has_function_call, has_observation)`` where
        ``issues`` is a list of human-readable problem descriptions (empty
        when nothing is wrong) and the two flags tell whether the
        respective role occurs anywhere in the conversation.
    """
    valid_roles = ('human', 'gpt', 'function_call', 'observation', 'system')
    valid_followers = ('observation', 'gpt')
    issues = []

    # A malformed message (not a dict, or without 'from') is recorded as a
    # None role and reported below instead of aborting the whole check.
    roles = [
        message.get('from') if isinstance(message, dict) else None
        for message in (conversations or [])
    ]

    if roles and roles[0] != 'human':
        issues.append(f"Does not start with 'human', starts with '{roles[0]}'")

    for position, role in enumerate(roles):
        if role is None:
            issues.append(f"Missing or null 'from' field at position {position}")
        elif role not in valid_roles:
            issues.append(f"Invalid role '{role}' at position {position}")

    has_function_call = 'function_call' in roles
    has_observation = 'observation' in roles

    for position, role in enumerate(roles):
        if role != 'function_call':
            continue
        follower_index = position + 1
        if follower_index >= len(roles):
            issues.append("function_call is last message (missing response)")
        elif roles[follower_index] not in valid_followers:
            issues.append(
                f"function_call followed by '{roles[follower_index]}' "
                "instead of observation/gpt"
            )

    return issues, has_function_call, has_observation
|
|
|
|
|
def main(path='data/dolci_10k_with_tool_call_batch5.json'):
    """Load a batch file and print statistics, format issues and previews.

    The report contains: the number of samples, how many samples carry a
    non-empty 'tools' / 'system' field, conversation length statistics, the
    distribution of message roles, the first ten samples for which
    ``check_conversation_format`` reported issues, and a short preview of
    the samples at indices 0, 100 and 500 (when present).

    Args:
        path: Path of a UTF-8 JSON file containing a list of sample dicts.
            Defaults to the batch5 data file.
    """
    print("Loading batch5...")
    with open(path, 'r', encoding='utf-8') as f:
        batch5 = json.load(f)

    print(f"Total samples: {len(batch5)}")

    role_counter = Counter()
    samples_with_tools = 0
    samples_with_system = 0
    conversation_length = []
    issues_found = []

    print("\nAnalyzing samples...")
    for idx, sample in enumerate(batch5):
        # A missing or null 'conversations' field counts as an empty one.
        conversations = sample.get('conversations') or []

        # .get() so a message without 'from' is tallied under None rather
        # than aborting the tally with a KeyError.
        roles = [conv.get('from') for conv in conversations]
        role_counter.update(roles)

        if sample.get('tools'):
            samples_with_tools += 1
        if sample.get('system'):
            samples_with_system += 1

        conversation_length.append(len(conversations))

        # Only the issue list is needed here; the two role flags are ignored.
        issues, _, _ = check_conversation_format(conversations)
        if issues:
            issues_found.append({
                'index': idx,
                'issues': issues,
                'roles': roles,
            })

    print("\n=== Statistics ===")
    print(f"Samples with 'tools' field: {samples_with_tools}")
    print(f"Samples with 'system' field: {samples_with_system}")
    if conversation_length:
        average = sum(conversation_length) / len(conversation_length)
        print(f"Average conversation length: {average:.2f}")
        print(f"Min conversation length: {min(conversation_length)}")
        print(f"Max conversation length: {max(conversation_length)}")
    else:
        # An empty dataset has no average/min/max; avoid dividing by zero.
        print("Conversation length statistics: n/a (no samples)")

    print("\n=== Role Distribution ===")
    for role, count in role_counter.most_common():
        print(f"  {role}: {count}")

    print("\n=== Issues Found ===")
    print(f"Total samples with issues: {len(issues_found)}")

    if issues_found:
        print("\nFirst 10 issues:")
        for item in issues_found[:10]:
            print(f"\nSample {item['index']}:")
            print(f"  Roles: {item['roles']}")
            for issue in item['issues']:
                print(f"    - {issue}")

    print("\n=== Sample Entries ===")
    for i in [0, 100, 500]:
        if i >= len(batch5):
            continue
        print(f"\n--- Sample {i} ---")
        sample = batch5[i]
        conversations = sample.get('conversations') or []
        print(f"Has tools: {'tools' in sample}")
        print(f"Has system: {'system' in sample}")
        print(f"Conversation roles: {[c.get('from') for c in conversations]}")
        # Guard the previews: a sample may have an empty conversation.
        if conversations:
            print(f"First message: {conversations[0]['value'][:100]}...")
        if len(conversations) > 1:
            print(f"Second message from: {conversations[1]['from']}")
            print(f"Second message: {conversations[1]['value'][:100]}...")
|
|
|
|
|
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|