interactSpeech / analyze_dialogue_lengths.py
Student0809's picture
Add files using upload-large-folder tool
fd421e2 verified
raw
history blame
3.69 kB
import json
from collections import Counter
import numpy as np
from typing import List, Dict
import matplotlib.pyplot as plt
# Inclusive upper bound of each length bucket, in ascending order, paired with
# the label used as the key in the returned distribution.
_LENGTH_BUCKETS = (
    (100, '0-100'),
    (500, '101-500'),
    (1000, '501-1000'),
    (2000, '1001-2000'),
    (3000, '2001-3000'),
    (4000, '3001-4000'),
    (5000, '4001-5000'),
    (6000, '5001-6000'),
)
# Label for every length above the last bound in _LENGTH_BUCKETS.
_OVERFLOW_BUCKET = '6000+'


def _collect_assistant_lengths(file_path: str) -> List[int]:
    """Return the character length of every assistant message in a JSONL file."""
    lengths: List[int] = []
    non_text_messages = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not parse errors.
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
                continue

            messages = item.get('messages') if isinstance(item, dict) else None
            if not isinstance(messages, list):
                # One malformed record must not abort the whole analysis.
                print(f"Skipping line {line_number}: no 'messages' list found")
                continue

            for message in messages:
                if not isinstance(message, dict):
                    continue
                if message.get('role') != 'assistant':
                    continue
                content = message.get('content')
                if isinstance(content, str):
                    lengths.append(len(content))
                else:
                    # None or structured content has no character length.
                    non_text_messages += 1

    if non_text_messages:
        print(f"Skipped {non_text_messages} assistant message(s) without text content")
    return lengths


def _count_length_ranges(lengths: List[int]) -> Dict[str, int]:
    """Count how many lengths fall into each bucket of _LENGTH_BUCKETS."""
    counts = {label: 0 for _, label in _LENGTH_BUCKETS}
    counts[_OVERFLOW_BUCKET] = 0
    for length in lengths:
        for upper_bound, label in _LENGTH_BUCKETS:
            if length <= upper_bound:
                counts[label] += 1
                break
        else:
            # No bucket matched: the length exceeds the largest bound.
            counts[_OVERFLOW_BUCKET] += 1
    return counts


def _save_length_plots(lengths: List[int], length_ranges: Dict[str, int],
                       output_prefix: str) -> None:
    """Save the histogram and the per-range bar chart as PNG files."""
    # Histogram with many bins for a detailed view of the raw lengths
    plt.figure(figsize=(12, 6))
    plt.hist(lengths, bins=100, edgecolor='black')
    plt.title('Distribution of Assistant Response Lengths')
    plt.xlabel('Length (characters)')
    plt.ylabel('Frequency')
    plt.savefig(f'{output_prefix}dialogue_length_distribution.png')
    plt.close()

    # Bar chart of the counts per length range
    plt.figure(figsize=(12, 6))
    plt.bar(list(length_ranges.keys()), list(length_ranges.values()))
    plt.title('Distribution of Response Lengths by Range')
    plt.xlabel('Length Range')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{output_prefix}dialogue_length_ranges.png')
    plt.close()


def analyze_dialogue_lengths(file_path: str, *, output_prefix: str = '',
                             save_plots: bool = True) -> Dict:
    """Analyze the character lengths of assistant responses in a JSONL file.

    Each non-blank line is parsed as a JSON object with a ``messages`` list;
    every message whose ``role`` is ``'assistant'`` and whose ``content`` is a
    string contributes ``len(content)``. Lines that are not valid JSON or have
    no ``messages`` list are reported and skipped. The statistics are printed
    and, unless disabled, two PNG plots are written to the working directory.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.
        output_prefix: Text prepended to the two plot file names
            (``dialogue_length_distribution.png`` and
            ``dialogue_length_ranges.png``). Pass a distinct prefix per
            dataset so successive calls do not overwrite each other's plots.
        save_plots: When False, only compute and print the statistics.

    Returns:
        A dict with ``total_responses``, ``max_length``, ``avg_length``,
        ``median_length`` and ``distribution`` (percentage of responses per
        length range), or an empty dict when no assistant response was found.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    lengths = _collect_assistant_lengths(file_path)
    if not lengths:
        print(f"No valid assistant responses found in {file_path}")
        return {}

    # Calculate statistics
    max_length = max(lengths)
    avg_length = np.mean(lengths)
    median_length = np.median(lengths)

    # Calculate the length distribution as counts and as percentages
    length_ranges = _count_length_ranges(lengths)
    total = len(lengths)
    percentages = {k: (v / total) * 100 for k, v in length_ranges.items()}

    # Print results
    print(f"\nAnalysis Results for {file_path}:")
    print(f"Total number of assistant responses: {total}")
    print(f"Maximum length: {max_length} characters")
    print(f"Average length: {avg_length:.2f} characters")
    print(f"Median length: {median_length:.2f} characters")
    print("\nLength Distribution:")
    for range_name, percentage in percentages.items():
        print(f"{range_name}: {percentage:.2f}%")

    if save_plots:
        _save_length_plots(lengths, length_ranges, output_prefix)

    return {
        'total_responses': total,
        'max_length': max_length,
        'avg_length': avg_length,
        'median_length': median_length,
        'distribution': percentages
    }


if __name__ == "__main__":
    # Analyze both train and test datasets. Distinct prefixes keep the second
    # run from overwriting the plots saved by the first.
    train_results = analyze_dialogue_lengths('dataset_cotSFTtrain.json',
                                             output_prefix='train_')
    test_results = analyze_dialogue_lengths('dataset_cotSFTtest.json',
                                            output_prefix='test_')