llm / check_batch5.py
dongxx1104's picture
Upload folder using huggingface_hub
db704cb verified
#!/usr/bin/env python3
"""
Check batch5 data quality and format.
"""
import json
from collections import Counter
def check_conversation_format(conversations):
    """Validate the role sequence of a single conversation.

    Each message's role is read from its ``'from'`` key. The checks are:

    * a non-empty conversation must start with a ``'human'`` message;
    * every role must be one of ``human``, ``gpt``, ``function_call``,
      ``observation`` or ``system``;
    * every ``function_call`` must be followed by an ``observation`` or
      ``gpt`` message.

    Args:
        conversations: Iterable of message dicts.

    Returns:
        A tuple ``(issues, has_function_call, has_observation)``.
        ``issues`` is a list of human-readable problem descriptions (empty
        when nothing was flagged); the two booleans tell whether any message
        has the role ``'function_call'`` / ``'observation'``.
    """
    # Tuples rather than sets: membership then works by equality, so an
    # unhashable role value (e.g. a list) is reported instead of raising.
    valid_roles = ('human', 'gpt', 'function_call', 'observation', 'system')
    valid_after_call = ('observation', 'gpt')
    issues = []

    # A message that is not a dict, or that has no 'from' key, gets the role
    # None and is reported below rather than aborting the whole check.
    roles = [
        conv.get('from') if isinstance(conv, dict) else None
        for conv in conversations
    ]

    # An empty conversation is deliberately not flagged here.
    if roles and roles[0] != 'human':
        issues.append(f"Does not start with 'human', starts with '{roles[0]}'")

    # Role vocabulary check (this does not enforce strict turn alternation).
    for i, role in enumerate(roles):
        if role is None:
            issues.append(f"Missing 'from' role at position {i}")
        elif role not in valid_roles:
            issues.append(f"Invalid role '{role}' at position {i}")

    has_function_call = 'function_call' in roles
    has_observation = 'observation' in roles

    # Kept as a separate pass so that all vocabulary issues are listed before
    # any function_call issues.
    for i, role in enumerate(roles):
        if role != 'function_call':
            continue
        if i + 1 >= len(roles):
            issues.append("function_call is last message (missing response)")
        elif roles[i + 1] not in valid_after_call:
            issues.append(
                f"function_call followed by '{roles[i + 1]}' instead of observation/gpt"
            )

    return issues, has_function_call, has_observation
def _message_role(message):
    """Return the message's 'from' value, or None if it cannot be read."""
    return message.get('from') if isinstance(message, dict) else None


def _message_preview(message, limit=100):
    """Return at most *limit* characters of the message's 'value' as text."""
    value = message.get('value', '') if isinstance(message, dict) else message
    return str(value)[:limit]


def main(data_path='data/dolci_10k_with_tool_call_batch5.json'):
    """Load a batch of samples and print a quality/format report.

    The report contains: the sample count, how many samples have a non-empty
    ``tools`` / ``system`` field, conversation-length statistics, the
    distribution of message roles, the first ten samples flagged by
    ``check_conversation_format``, and a preview of samples 0, 100 and 500
    when they exist.

    Args:
        data_path: Path of the JSON file to load. Defaults to the batch5
            file, so calling ``main()`` with no arguments reads the same file
            as before.
    """
    print("Loading batch5...")
    with open(data_path, 'r', encoding='utf-8') as f:
        batch5 = json.load(f)
    print(f"Total samples: {len(batch5)}")

    # Statistics
    role_counter = Counter()
    samples_with_tools = 0
    samples_with_system = 0
    conversation_length = []
    issues_found = []

    print("\nAnalyzing samples...")
    for idx, sample in enumerate(batch5):
        conversations = sample.get('conversations', [])
        roles = [_message_role(conv) for conv in conversations]
        role_counter.update(roles)

        # Only non-empty fields count as "present".
        if sample.get('tools'):
            samples_with_tools += 1
        if sample.get('system'):
            samples_with_system += 1
        conversation_length.append(len(conversations))

        # Only the issue list is needed here; the two flags are ignored.
        issues, _, _ = check_conversation_format(conversations)
        if issues:
            issues_found.append({
                'index': idx,
                'issues': issues,
                'roles': roles,
            })

    print("\n=== Statistics ===")
    print(f"Samples with 'tools' field: {samples_with_tools}")
    print(f"Samples with 'system' field: {samples_with_system}")
    if conversation_length:
        average = sum(conversation_length) / len(conversation_length)
        print(f"Average conversation length: {average:.2f}")
        print(f"Min conversation length: {min(conversation_length)}")
        print(f"Max conversation length: {max(conversation_length)}")
    else:
        # An empty dataset would otherwise divide by zero and make
        # min()/max() raise on an empty list.
        print("Conversation length statistics unavailable (no samples)")

    print("\n=== Role Distribution ===")
    for role, count in role_counter.most_common():
        print(f"  {role}: {count}")

    print("\n=== Issues Found ===")
    print(f"Total samples with issues: {len(issues_found)}")
    if issues_found:
        print("\nFirst 10 issues:")
        for item in issues_found[:10]:
            print(f"\nSample {item['index']}:")
            print(f"  Roles: {item['roles']}")
            for issue in item['issues']:
                print(f"    - {issue}")

    # Show sample entries
    print("\n=== Sample Entries ===")
    for i in (0, 100, 500):
        if i >= len(batch5):
            continue
        sample = batch5[i]
        # Same tolerant lookup as the analysis loop, so a sample without
        # 'conversations' is previewed as empty instead of raising.
        conversations = sample.get('conversations', [])
        print(f"\n--- Sample {i} ---")
        print(f"Has tools: {'tools' in sample}")
        print(f"Has system: {'system' in sample}")
        print(f"Conversation roles: {[_message_role(c) for c in conversations]}")
        if not conversations:
            print("(no messages)")
            continue
        print(f"First message: {_message_preview(conversations[0])}...")
        if len(conversations) > 1:
            print(f"Second message from: {_message_role(conversations[1])}")
            print(f"Second message: {_message_preview(conversations[1])}...")
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()