#!/usr/bin/env python3
"""
Validate the processed JSONL file and generate statistics.
"""

import json
from collections import Counter, defaultdict
from pathlib import Path

# Fields every record must carry before it is counted in the statistics.
_REQUIRED_FIELDS = (
    "episode_id", "task", "task_type", "domain", "success",
    "num_turns", "total_tokens", "trajectory", "qa_pairs",
)

# Width of the horizontal rules used throughout the report.
_RULE_WIDTH = 80

# At most this many validation errors are printed in full.
_MAX_SHOWN_ERRORS = 10


def _new_bucket():
    """Return a fresh, zeroed accumulator for one group of records."""
    return {
        'count': 0,
        'success': 0,
        'qa_pairs': 0,
        'total_turns': 0,
        'total_tokens': 0,
    }


def _ratio(numerator, denominator, scale=1):
    """Return ``numerator / denominator * scale``, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return numerator / denominator * scale


def _record_problems(data):
    """Return human-readable problems found in one decoded record.

    An empty list means the record can be accumulated without raising, so
    the caller never leaves the counters in a half-updated state.
    """
    if not isinstance(data, dict):
        return [f"Expected a JSON object, got {type(data).__name__}"]

    missing = [
        f"Missing field '{field}'"
        for field in _REQUIRED_FIELDS
        if field not in data
    ]
    if missing:
        # Type checks below index the record, so stop here.
        return missing

    problems = []
    # Labels are sorted and padded with a string format spec in the report.
    for field in ("task_type", "domain"):
        if not isinstance(data[field], str):
            problems.append(f"Field '{field}' must be a string")

    # Totals are printed with the 'd' format, which only accepts integers.
    for field in ("num_turns", "total_tokens"):
        value = data[field]
        if isinstance(value, bool) or not isinstance(value, int):
            problems.append(f"Field '{field}' must be an integer")

    qa_pairs = data["qa_pairs"]
    if not isinstance(qa_pairs, list):
        problems.append("Field 'qa_pairs' must be a list")
        return problems

    for index, qa in enumerate(qa_pairs):
        if not isinstance(qa, dict):
            problems.append(f"qa_pairs[{index}] must be an object")
            continue
        for key in ("type", "sub_type"):
            if key in qa and not isinstance(qa[key], str):
                problems.append(f"qa_pairs[{index}]['{key}'] must be a string")
    return problems


def _accumulate(bucket, data):
    """Add one validated record to the accumulator *bucket*."""
    bucket['count'] += 1
    if data["success"]:
        bucket['success'] += 1
    bucket['qa_pairs'] += len(data["qa_pairs"])
    bucket['total_turns'] += data["num_turns"]
    bucket['total_tokens'] += data["total_tokens"]


def _print_errors(errors):
    """Print the first few validation errors, or a success line if none."""
    if not errors:
        print("✓ No validation errors found!")
        print()
        return

    print("VALIDATION ERRORS:")
    print("-" * _RULE_WIDTH)
    for error in errors[:_MAX_SHOWN_ERRORS]:
        print(f" {error}")
    hidden = len(errors) - _MAX_SHOWN_ERRORS
    if hidden > 0:
        print(f" ... and {hidden} more errors")
    print()


def _print_overall(overall):
    """Print the file-wide totals; averages fall back to 0 for an empty file."""
    total = overall['count']
    succeeded = overall['success']
    failed = total - succeeded
    turns = overall['total_turns']
    tokens = overall['total_tokens']

    print("OVERALL STATISTICS")
    print("-" * _RULE_WIDTH)
    print(f"Total records: {total:>6d}")
    print(f"Total QA pairs: {overall['qa_pairs']:>6d}")
    print(f"Successful episodes: {succeeded:>6d} ({_ratio(succeeded, total, 100):>5.1f}%)")
    print(f"Failed episodes: {failed:>6d} ({_ratio(failed, total, 100):>5.1f}%)")
    print(f"Total turns: {turns:>6d} (avg: {_ratio(turns, total):.1f})")
    print(f"Total tokens: {tokens:>6d} (avg: {_ratio(tokens, total):.1f})")
    print()


def _print_group_table(title, label_header, label_width, stats):
    """Print one row per group (domain or task type), sorted by label."""
    print(title)
    print("-" * _RULE_WIDTH)
    print(f"{label_header:<{label_width}} {'Count':>6} {'Success':>7} "
          f"{'QA Pairs':>9} {'Avg Turns':>10} {'Avg Tokens':>11}")
    print("-" * _RULE_WIDTH)
    for label in sorted(stats):
        bucket = stats[label]
        count = bucket['count']
        success_pct = _ratio(bucket['success'], count, 100)
        avg_turns = _ratio(bucket['total_turns'], count)
        avg_tokens = _ratio(bucket['total_tokens'], count)
        print(f"{label:<{label_width}} {count:>6} {success_pct:>6.1f}% "
              f"{bucket['qa_pairs']:>9} {avg_turns:>10.1f} {avg_tokens:>11.1f}")
    print()


def _print_share_table(title, label_header, counts, total):
    """Print each label's count and its share of *total*, sorted by label."""
    print(title)
    print("-" * _RULE_WIDTH)
    print(f"{label_header:<20} {'Count':>10} {'Percentage':>12}")
    print("-" * _RULE_WIDTH)
    for label in sorted(counts):
        count = counts[label]
        print(f"{label:<20} {count:>10} {_ratio(count, total, 100):>11.1f}%")
    print()


def validate_jsonl(file_path: Path):
    """
    Validate JSONL file and generate comprehensive statistics.

    Every line of *file_path* is decoded as JSON and checked for the
    required fields and for field types the report relies on. Records that
    fail any check are reported once per problem and excluded from all
    statistics; valid records are aggregated overall, per domain and per
    task type, together with the distribution of QA pair types and
    subtypes. The report is written to standard output.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    banner = "=" * _RULE_WIDTH
    print(banner)
    print(f"Validating: {file_path}")
    print(banner)
    print()

    overall = _new_bucket()
    task_type_stats = defaultdict(_new_bucket)
    domain_stats = defaultdict(_new_bucket)
    qa_type_counts = Counter()
    qa_subtype_counts = Counter()
    errors = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: JSON decode error - {e}")
                continue

            # Validate fully before touching any counter so that a bad
            # record can never contribute partial numbers to the totals.
            problems = _record_problems(data)
            if problems:
                errors.extend(f"Line {line_num}: {problem}" for problem in problems)
                continue

            _accumulate(overall, data)
            _accumulate(task_type_stats[data["task_type"]], data)
            _accumulate(domain_stats[data["domain"]], data)

            for qa in data["qa_pairs"]:
                qa_type_counts[qa.get("type", "unknown")] += 1
                if "sub_type" in qa:
                    qa_subtype_counts[qa["sub_type"]] += 1

    _print_errors(errors)
    _print_overall(overall)
    _print_group_table("DOMAIN DISTRIBUTION", "Domain", 20, domain_stats)
    _print_group_table("TASK TYPE DISTRIBUTION", "Task Type", 40, task_type_stats)

    total_qa_pairs = overall['qa_pairs']
    _print_share_table("QA TYPE DISTRIBUTION", "Type", qa_type_counts, total_qa_pairs)
    # The subtype section is optional: it is omitted when no QA pair has one.
    if qa_subtype_counts:
        _print_share_table("QA SUBTYPE DISTRIBUTION", "Subtype",
                           qa_subtype_counts, total_qa_pairs)

    print(banner)
    print("Validation complete!")
    print(banner)


if __name__ == "__main__":
    jsonl_file = Path(__file__).parent / "processed_open_end.jsonl"

    if not jsonl_file.exists():
        print(f"Error: {jsonl_file} not found!")
        print("Please run process_open_end.py first.")
        # SystemExit works even where the site-provided exit() is unavailable.
        raise SystemExit(1)

    validate_jsonl(jsonl_file)