File size: 3,694 Bytes
fd421e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
from collections import Counter
import numpy as np
from typing import List, Dict
import matplotlib.pyplot as plt

# Inclusive upper bound and label of each length bucket, in ascending order.
# Keeping the table in one place keeps the labels and thresholds in sync.
_LENGTH_BUCKETS = (
    (100, '0-100'),
    (500, '101-500'),
    (1000, '501-1000'),
    (2000, '1001-2000'),
    (3000, '2001-3000'),
    (4000, '3001-4000'),
    (5000, '4001-5000'),
    (6000, '5001-6000'),
)
# Label of the open-ended bucket for lengths above the last upper bound.
_OVERFLOW_BUCKET = '6000+'


def _collect_assistant_lengths(file_path: str) -> List[int]:
    """Return the character length of every assistant message in a JSONL file.

    Blank lines are ignored. Lines that are not valid JSON, records without a
    ``messages`` list, and assistant messages whose ``content`` is not a
    string are reported on stdout and skipped, so one malformed record does
    not abort the whole analysis.
    """
    lengths = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Trailing/blank lines are not errors in a JSONL file.
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                continue

            messages = item.get('messages') if isinstance(item, dict) else None
            if not isinstance(messages, list):
                print(f"Skipping line {line_number}: no 'messages' list found")
                continue

            for message in messages:
                if not isinstance(message, dict):
                    continue
                if message.get('role') != 'assistant':
                    continue
                content = message.get('content')
                if not isinstance(content, str):
                    # A character count is only meaningful for text content.
                    print(f"Skipping assistant message on line {line_number}: "
                          f"content is not a string")
                    continue
                lengths.append(len(content))
    return lengths


def _count_by_range(lengths: List[int]) -> Dict[str, int]:
    """Count how many lengths fall into each bucket of ``_LENGTH_BUCKETS``."""
    counts = {label: 0 for _, label in _LENGTH_BUCKETS}
    counts[_OVERFLOW_BUCKET] = 0
    for length in lengths:
        # The first bucket whose upper bound admits the length wins.
        label = next(
            (name for upper, name in _LENGTH_BUCKETS if length <= upper),
            _OVERFLOW_BUCKET,
        )
        counts[label] += 1
    return counts


def _save_plots(lengths: List[int], length_ranges: Dict[str, int],
                plot_prefix: str) -> None:
    """Write the histogram and the per-range bar chart as PNG files."""
    # Histogram with many bins for a fine-grained view of the distribution.
    plt.figure(figsize=(12, 6))
    plt.hist(lengths, bins=100, edgecolor='black')
    plt.title('Distribution of Assistant Response Lengths')
    plt.xlabel('Length (characters)')
    plt.ylabel('Frequency')
    plt.savefig(f'{plot_prefix}dialogue_length_distribution.png')
    plt.close()

    # Bar chart of the absolute counts per length range.
    plt.figure(figsize=(12, 6))
    plt.bar(list(length_ranges.keys()), list(length_ranges.values()))
    plt.title('Distribution of Response Lengths by Range')
    plt.xlabel('Length Range')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{plot_prefix}dialogue_length_ranges.png')
    plt.close()


def analyze_dialogue_lengths(file_path: str, plot_prefix: str = '') -> Dict:
    """Analyze the character lengths of assistant responses in a JSONL file.

    Each non-blank line of the file is parsed as one JSON object whose
    ``messages`` entry is a list of ``{'role': ..., 'content': ...}`` dicts.
    The length of every assistant message is collected, summary statistics
    and a bucketed distribution are printed, and two plots are saved:
    ``<plot_prefix>dialogue_length_distribution.png`` and
    ``<plot_prefix>dialogue_length_ranges.png``.

    Args:
        file_path: Path of the JSONL file to analyze.
        plot_prefix: Text prepended to both plot file names. The default
            (empty string) keeps the original fixed names; pass a distinct
            prefix per dataset to avoid one run overwriting another's plots.

    Returns:
        A dict with the keys ``total_responses``, ``max_length``,
        ``avg_length``, ``median_length`` and ``distribution`` (bucket label
        -> percentage of responses), or an empty dict when the file contains
        no usable assistant response.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    lengths = _collect_assistant_lengths(file_path)

    if not lengths:
        print(f"No valid assistant responses found in {file_path}")
        return {}

    # Calculate statistics
    max_length = max(lengths)
    avg_length = np.mean(lengths)
    median_length = np.median(lengths)

    # Calculate the bucketed distribution and its percentages
    length_ranges = _count_by_range(lengths)
    total = len(lengths)
    percentages = {k: (v / total) * 100 for k, v in length_ranges.items()}

    # Print results
    print(f"\nAnalysis Results for {file_path}:")
    print(f"Total number of assistant responses: {total}")
    print(f"Maximum length: {max_length} characters")
    print(f"Average length: {avg_length:.2f} characters")
    print(f"Median length: {median_length:.2f} characters")
    print("\nLength Distribution:")
    for range_name, percentage in percentages.items():
        print(f"{range_name}: {percentage:.2f}%")

    _save_plots(lengths, length_ranges, plot_prefix)

    return {
        'total_responses': total,
        'max_length': max_length,
        'avg_length': avg_length,
        'median_length': median_length,
        'distribution': percentages
    }

if __name__ == "__main__":
    # Script entry point: run the length analysis on the training split
    # first and the test split second, keeping each result dict.
    train_results, test_results = [
        analyze_dialogue_lengths(dataset_path)
        for dataset_path in ('dataset_cotSFTtrain.json', 'dataset_cotSFTtest.json')
    ]