# AMA-bench-Leaderboard / validate_jsonl.py
# Author: NorahYujieZhao
# "the new version" (commit d8b2e03)
#!/usr/bin/env python3
"""
Validate the processed JSONL file and generate statistics.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path
def validate_jsonl(file_path: Path) -> dict:
    """Validate a processed JSONL file and print comprehensive statistics.

    Each line must be a JSON object containing all of the required fields
    (episode_id, task, task_type, domain, success, num_turns, total_tokens,
    trajectory, qa_pairs).  Lines with invalid JSON or missing fields are
    reported as errors and EXCLUDED from the statistics.

    Args:
        file_path: Path to the JSONL file to validate.

    Returns:
        A summary dict with keys ``total_count``, ``success_count``,
        ``total_qa_pairs``, ``total_turns``, ``total_tokens`` and ``errors``
        (list of error-message strings), so callers and tests can inspect
        the result programmatically.  Returning a value is new but
        backward-compatible (previous callers ignored the implicit None).
    """
    print("=" * 80)
    print(f"Validating: {file_path}")
    print("=" * 80)
    print()

    required_fields = ("episode_id", "task", "task_type", "domain",
                       "success", "num_turns", "total_tokens",
                       "trajectory", "qa_pairs")

    # Global counters
    task_types = Counter()
    domains = Counter()
    qa_type_counts = Counter()
    qa_subtype_counts = Counter()
    total_qa_pairs = 0
    success_count = 0
    total_count = 0
    total_turns = 0
    total_tokens = 0

    def _new_bucket():
        # Per-group accumulator shared by the task-type and domain tables.
        return {'count': 0, 'success': 0, 'qa_pairs': 0,
                'total_turns': 0, 'total_tokens': 0}

    task_type_stats = defaultdict(_new_bucket)
    domain_stats = defaultdict(_new_bucket)

    errors = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: JSON decode error - {e}")
                continue

            # BUG FIX: the original `continue` only advanced the inner
            # field-checking loop, so records with missing fields were still
            # processed (and usually raised KeyError below, producing a
            # second, confusing error).  Skip the whole record instead.
            missing = [field for field in required_fields if field not in data]
            if missing:
                for field in missing:
                    errors.append(f"Line {line_num}: Missing field '{field}'")
                continue

            try:
                task_type = data["task_type"]
                domain = data["domain"]
                task_types[task_type] += 1
                domains[domain] += 1
                total_count += 1

                if data["success"]:
                    success_count += 1
                    task_type_stats[task_type]['success'] += 1
                    domain_stats[domain]['success'] += 1

                num_qa = len(data["qa_pairs"])
                total_qa_pairs += num_qa
                task_type_stats[task_type]['qa_pairs'] += num_qa
                task_type_stats[task_type]['count'] += 1
                domain_stats[domain]['qa_pairs'] += num_qa
                domain_stats[domain]['count'] += 1

                total_turns += data["num_turns"]
                total_tokens += data["total_tokens"]
                task_type_stats[task_type]['total_turns'] += data["num_turns"]
                task_type_stats[task_type]['total_tokens'] += data["total_tokens"]
                domain_stats[domain]['total_turns'] += data["num_turns"]
                domain_stats[domain]['total_tokens'] += data["total_tokens"]

                # QA pairs type distribution
                for qa in data["qa_pairs"]:
                    qa_type_counts[qa.get("type", "unknown")] += 1
                    if "sub_type" in qa:
                        qa_subtype_counts[qa["sub_type"]] += 1
            except Exception as e:
                errors.append(f"Line {line_num}: Error - {e}")

    # Print validation results
    if errors:
        print("VALIDATION ERRORS:")
        print("-" * 80)
        for error in errors[:10]:  # Show first 10 errors
            print(f"  {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more errors")
        print()
    else:
        print("✓ No validation errors found!")
        print()

    def _pct(part, whole):
        # Percentage helper; guards the empty-file case (whole == 0),
        # which previously crashed with ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    def _avg(part, whole):
        return part / whole if whole else 0.0

    # Print overall statistics
    print("OVERALL STATISTICS")
    print("-" * 80)
    print(f"Total records:       {total_count:>6d}")
    print(f"Total QA pairs:      {total_qa_pairs:>6d}")
    print(f"Successful episodes: {success_count:>6d} ({_pct(success_count, total_count):>5.1f}%)")
    print(f"Failed episodes:     {total_count - success_count:>6d} ({_pct(total_count - success_count, total_count):>5.1f}%)")
    print(f"Total turns:         {total_turns:>6d} (avg: {_avg(total_turns, total_count):.1f})")
    print(f"Total tokens:        {total_tokens:>6d} (avg: {_avg(total_tokens, total_count):.1f})")
    print()

    # Print domain distribution
    print("DOMAIN DISTRIBUTION")
    print("-" * 80)
    print(f"{'Domain':<20} {'Count':>6} {'Success':>7} {'QA Pairs':>9} {'Avg Turns':>10} {'Avg Tokens':>11}")
    print("-" * 80)
    for domain in sorted(domains):
        bucket = domain_stats[domain]
        count = bucket['count']
        print(f"{domain:<20} {count:>6} {_pct(bucket['success'], count):>6.1f}% "
              f"{bucket['qa_pairs']:>9} {_avg(bucket['total_turns'], count):>10.1f} "
              f"{_avg(bucket['total_tokens'], count):>11.1f}")
    print()

    # Print task type distribution
    print("TASK TYPE DISTRIBUTION")
    print("-" * 80)
    print(f"{'Task Type':<40} {'Count':>6} {'Success':>7} {'QA Pairs':>9} {'Avg Turns':>10} {'Avg Tokens':>11}")
    print("-" * 80)
    for task_type in sorted(task_types):
        bucket = task_type_stats[task_type]
        count = bucket['count']
        # BUG FIX: the original printed the raw success COUNT followed by a
        # '%' sign; print the success percentage, matching the domain table.
        print(f"{task_type:<40} {count:>6} {_pct(bucket['success'], count):>6.1f}% "
              f"{bucket['qa_pairs']:>9} {_avg(bucket['total_turns'], count):>10.1f} "
              f"{_avg(bucket['total_tokens'], count):>11.1f}")
    print()

    # Print QA type distribution
    print("QA TYPE DISTRIBUTION")
    print("-" * 80)
    print(f"{'Type':<20} {'Count':>10} {'Percentage':>12}")
    print("-" * 80)
    for qa_type, count in sorted(qa_type_counts.items()):
        print(f"{qa_type:<20} {count:>10} {_pct(count, total_qa_pairs):>11.1f}%")
    print()

    # Print QA subtype distribution (only when any sub_type was seen)
    if qa_subtype_counts:
        print("QA SUBTYPE DISTRIBUTION")
        print("-" * 80)
        print(f"{'Subtype':<20} {'Count':>10} {'Percentage':>12}")
        print("-" * 80)
        for subtype in sorted(qa_subtype_counts):
            count = qa_subtype_counts[subtype]
            print(f"{subtype:<20} {count:>10} {_pct(count, total_qa_pairs):>11.1f}%")
        print()

    print("=" * 80)
    print("Validation complete!")
    print("=" * 80)

    return {
        'total_count': total_count,
        'success_count': success_count,
        'total_qa_pairs': total_qa_pairs,
        'total_turns': total_turns,
        'total_tokens': total_tokens,
        'errors': errors,
    }
if __name__ == "__main__":
    # Expect the processed file next to this script.
    jsonl_file = Path(__file__).parent / "processed_open_end.jsonl"
    if not jsonl_file.exists():
        print(f"Error: {jsonl_file} not found!")
        print("Please run process_open_end.py first.")
        # BUG FIX: exit() is a site-module convenience binding and is not
        # guaranteed to exist (e.g. under `python -S`); raise SystemExit
        # directly for a reliable non-zero exit status.
        raise SystemExit(1)
    validate_jsonl(jsonl_file)