Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Validate the processed JSONL file and generate statistics. | |
| """ | |
| import json | |
| from collections import Counter, defaultdict | |
| from pathlib import Path | |
# Fields every record must carry before it is included in the statistics.
_REQUIRED_FIELDS = (
    "episode_id", "task", "task_type", "domain",
    "success", "num_turns", "total_tokens",
    "trajectory", "qa_pairs",
)


def _new_group_stats():
    """Return a zeroed accumulator for one group (task type, domain, or overall)."""
    return {
        'count': 0,
        'success': 0,
        'qa_pairs': 0,
        'total_turns': 0,
        'total_tokens': 0,
    }


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def _mean(total, count):
    """Return ``total / count``, or 0.0 when ``count`` is zero."""
    return total / count if count else 0.0


def _record_problems(record):
    """Return the problems that make ``record`` unusable for statistics.

    An empty list means the record can be counted. Messages carry no line
    prefix; the caller adds it.
    """
    if not isinstance(record, dict):
        return ["Error - record is not a JSON object"]

    missing = [
        f"Missing field '{field}'"
        for field in _REQUIRED_FIELDS
        if field not in record
    ]
    if missing:
        return missing

    problems = []
    # Group names are sorted and left-aligned in the report, so they must be
    # strings.
    for field in ("task_type", "domain"):
        if not isinstance(record[field], str):
            problems.append(f"Error - field '{field}' must be a string")
    # Totals are printed with the ``d`` format code, so only integers are
    # accepted (bool is an int subclass but is not a meaningful count).
    for field in ("num_turns", "total_tokens"):
        value = record[field]
        if isinstance(value, bool) or not isinstance(value, int):
            problems.append(f"Error - field '{field}' must be an integer")
    qa_pairs = record["qa_pairs"]
    if not isinstance(qa_pairs, list) or not all(
        isinstance(qa, dict) for qa in qa_pairs
    ):
        problems.append("Error - field 'qa_pairs' must be a list of objects")
    return problems


def _print_group_table(title, label, width, stats):
    """Print one per-group table (used for both domains and task types).

    ``stats`` maps a group name to an accumulator built by
    ``_new_group_stats``; ``width`` is the column width of the name column.
    """
    print(title)
    print("-" * 80)
    print(f"{label:<{width}} {'Count':>6} {'Success':>7} {'QA Pairs':>9} {'Avg Turns':>10} {'Avg Tokens':>11}")
    print("-" * 80)
    for name in sorted(stats):
        bucket = stats[name]
        count = bucket['count']
        success_pct = _percent(bucket['success'], count)
        avg_turns = _mean(bucket['total_turns'], count)
        avg_tokens = _mean(bucket['total_tokens'], count)
        print(f"{name:<{width}} {count:>6} {success_pct:>6.1f}% {bucket['qa_pairs']:>9} {avg_turns:>10.1f} {avg_tokens:>11.1f}")
    print()


def _print_counter_table(title, label, counts, total):
    """Print a count/percentage table for a Counter, relative to ``total``."""
    print(title)
    print("-" * 80)
    print(f"{label:<20} {'Count':>10} {'Percentage':>12}")
    print("-" * 80)
    # Sort and format through str() so a non-string key (e.g. JSON null)
    # cannot crash the report.
    for key, count in sorted(counts.items(), key=lambda item: str(item[0])):
        print(f"{key!s:<20} {count:>10} {_percent(count, total):>11.1f}%")
    print()


def validate_jsonl(file_path: Path):
    """
    Validate JSONL file and generate comprehensive statistics.

    Each line of ``file_path`` is parsed as JSON and checked for the required
    fields. A record that fails validation is reported and excluded from every
    statistic, so counts never reflect half-processed records. The report
    (errors, overall totals, per-domain, per-task-type and QA distributions)
    is printed to stdout.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to validate.

    Returns:
        None. All results are printed.
    """
    print("=" * 80)
    print(f"Validating: {file_path}")
    print("=" * 80)
    print()

    # Statistics
    overall = _new_group_stats()
    task_type_stats = defaultdict(_new_group_stats)
    domain_stats = defaultdict(_new_group_stats)
    qa_type_counts = Counter()
    qa_subtype_counts = Counter()
    errors = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {line_num}: JSON decode error - {e}")
                continue

            problems = _record_problems(record)
            if problems:
                errors.extend(f"Line {line_num}: {problem}" for problem in problems)
                continue

            qa_pairs = record["qa_pairs"]
            try:
                # Built per record first so an unhashable type/sub_type value
                # rejects the record before any shared counter is touched.
                record_qa_types = Counter(
                    qa.get("type", "unknown") for qa in qa_pairs
                )
                record_qa_subtypes = Counter(
                    qa["sub_type"] for qa in qa_pairs if "sub_type" in qa
                )
            except TypeError as e:
                errors.append(f"Line {line_num}: Error - {e}")
                continue

            # Everything below cannot fail, so the record is counted
            # completely or not at all.
            qa_type_counts.update(record_qa_types)
            qa_subtype_counts.update(record_qa_subtypes)
            succeeded = 1 if record["success"] else 0
            buckets = (
                overall,
                task_type_stats[record["task_type"]],
                domain_stats[record["domain"]],
            )
            for bucket in buckets:
                bucket['count'] += 1
                bucket['success'] += succeeded
                bucket['qa_pairs'] += len(qa_pairs)
                bucket['total_turns'] += record["num_turns"]
                bucket['total_tokens'] += record["total_tokens"]

    # Print validation results
    if errors:
        print("VALIDATION ERRORS:")
        print("-" * 80)
        for error in errors[:10]:  # Show first 10 errors
            print(f" {error}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more errors")
        print()
    else:
        print("✓ No validation errors found!")
        print()

    # Print overall statistics (helpers guard against an empty/invalid file)
    total_count = overall['count']
    success_count = overall['success']
    failed_count = total_count - success_count
    total_qa_pairs = overall['qa_pairs']
    total_turns = overall['total_turns']
    total_tokens = overall['total_tokens']
    print("OVERALL STATISTICS")
    print("-" * 80)
    print(f"Total records: {total_count:>6d}")
    print(f"Total QA pairs: {total_qa_pairs:>6d}")
    print(f"Successful episodes: {success_count:>6d} ({_percent(success_count, total_count):>5.1f}%)")
    print(f"Failed episodes: {failed_count:>6d} ({_percent(failed_count, total_count):>5.1f}%)")
    print(f"Total turns: {total_turns:>6d} (avg: {_mean(total_turns, total_count):.1f})")
    print(f"Total tokens: {total_tokens:>6d} (avg: {_mean(total_tokens, total_count):.1f})")
    print()

    _print_group_table("DOMAIN DISTRIBUTION", "Domain", 20, domain_stats)
    # The Success column is a percentage here too, matching the domain table.
    _print_group_table("TASK TYPE DISTRIBUTION", "Task Type", 40, task_type_stats)

    _print_counter_table("QA TYPE DISTRIBUTION", "Type", qa_type_counts, total_qa_pairs)
    if qa_subtype_counts:
        _print_counter_table(
            "QA SUBTYPE DISTRIBUTION", "Subtype", qa_subtype_counts, total_qa_pairs
        )

    print("=" * 80)
    print("Validation complete!")
    print("=" * 80)
if __name__ == "__main__":
    # Script entry point: validate the processed file that sits next to this
    # script.
    jsonl_file = Path(__file__).parent / "processed_open_end.jsonl"
    if not jsonl_file.exists():
        print(f"Error: {jsonl_file} not found!")
        print("Please run process_open_end.py first.")
        # The bare `exit` builtin is injected by the `site` module for
        # interactive use and is missing under `python -S`; SystemExit is
        # always available and needs no import.
        raise SystemExit(1)
    validate_jsonl(jsonl_file)