Time-Series-Library / data_provider / calculate_window_len.py

Upload Time-Series-Library

f9d3aeb verified 3 months ago

4.75 kB

	import json
	from collections import Counter

	def analyze_window_history_lengths(filepath):
	    """Count how long each ``window_history`` list is across a JSONL file.

	    Every non-blank line is parsed as one JSON record. A record is expected
	    to be an object holding a ``daily_breakpoints`` list whose items are
	    objects carrying a ``window_history`` list. Anything that does not match
	    that shape is reported with a warning on stdout and skipped, so a single
	    malformed record never discards the counts gathered from the other lines.

	    Args:
	        filepath: Path of the JSONL file to read (decoded as UTF-8).

	    Returns:
	        A 4-tuple ``(length_counter, total_records, total_breakpoints,
	        total_window_histories)`` where

	        * ``length_counter`` is a ``Counter`` mapping a ``window_history``
	          length to the number of lists having that length,
	        * ``total_records`` is the number of lines that parsed as JSON,
	        * ``total_breakpoints`` is the number of ``daily_breakpoints`` items
	          visited (including items that were skipped with a warning),
	        * ``total_window_histories`` is the number of lists counted.

	        When the file is missing or cannot be read, an error is printed and
	        ``(None, 0, 0, 0)`` is returned instead.
	    """
	    length_counter = Counter()
	    total_records = 0
	    total_breakpoints = 0
	    total_window_histories = 0

	    try:
	        with open(filepath, 'r', encoding='utf-8') as f:
	            for line_num, line in enumerate(f, 1):
	                line = line.strip()
	                if not line:
	                    continue

	                # Only the parse itself sits in this try, so a decode error
	                # is never confused with a problem in the record's shape.
	                try:
	                    data = json.loads(line)
	                except json.JSONDecodeError as e:
	                    print(f"JSON解析错误在第{line_num}行: {e}")
	                    continue
	                total_records += 1

	                # Valid JSON need not be an object (e.g. ``5`` or ``[1, 2]``);
	                # a membership test on such a value would raise or mislead.
	                # New warnings are worded like the existing ones.
	                if not isinstance(data, dict):
	                    print(f"警告: 第{line_num}行不是JSON对象")
	                    continue

	                if 'daily_breakpoints' not in data:
	                    print(f"警告: 第{line_num}行没有'daily_breakpoints'字段")
	                    continue

	                daily_breakpoints = data['daily_breakpoints']
	                if not isinstance(daily_breakpoints, list):
	                    print(f"警告: 第{line_num}行的'daily_breakpoints'不是列表")
	                    continue

	                # ``entry`` rather than ``breakpoint`` to avoid shadowing the
	                # builtin of that name.
	                for i, entry in enumerate(daily_breakpoints):
	                    total_breakpoints += 1

	                    if not isinstance(entry, dict):
	                        print(f"警告: 第{line_num}行的breakpoint[{i}]不是JSON对象")
	                        continue

	                    if 'window_history' not in entry:
	                        print(f"警告: 第{line_num}行的breakpoint[{i}]没有'window_history'字段")
	                        continue

	                    window_history = entry['window_history']
	                    if isinstance(window_history, list):
	                        length_counter[len(window_history)] += 1
	                        total_window_histories += 1
	                    else:
	                        print(f"警告: 第{line_num}行的breakpoint[{i}]的'window_history'不是列表")

	    except FileNotFoundError:
	        print(f"文件未找到: {filepath}")
	        return None, 0, 0, 0
	    except Exception as e:
	        # Top-level boundary kept from the original contract: callers detect
	        # failure through the ``None`` counter instead of an exception.
	        print(f"发生错误: {e}")
	        return None, 0, 0, 0

	    return length_counter, total_records, total_breakpoints, total_window_histories

	def _median_from_counts(length_counter):
	    """Return the median of the values described by a ``{value: count}`` map.

	    Walks the values in ascending order with a running total instead of
	    expanding every observation into a list. For an even number of
	    observations the two middle values are averaged; when they are equal
	    (or the number of observations is odd) the value itself is returned.

	    Raises:
	        ValueError: If the counts do not add up to a positive total.
	    """
	    total = sum(length_counter.values())
	    if total <= 0:
	        raise ValueError("length_counter holds no observations")

	    # Zero-based ranks of the middle observation(s); identical when odd.
	    lower_rank = (total - 1) // 2
	    upper_rank = total // 2

	    lower = upper = None
	    seen = 0
	    for length, count in sorted(length_counter.items()):
	        seen += count
	        if lower is None and seen > lower_rank:
	            lower = length
	        if seen > upper_rank:
	            upper = length
	            break

	    return lower if lower == upper else (lower + upper) / 2


	def print_statistics(length_counter, total_records, total_breakpoints, total_window_histories):
	    """Print a summary and the length distribution of ``window_history`` lists.

	    Args:
	        length_counter: Mapping from a list length to how many lists had it.
	        total_records: Number of JSON records that were read.
	        total_breakpoints: Number of breakpoints that were visited.
	        total_window_histories: Number of lists counted; used as the
	            denominator for the percentages and the mean.

	    Returns:
	        None. Everything is written to stdout. When there is nothing to
	        report (empty counter or a non-positive total) only the summary and
	        a "no data" notice are printed.
	    """
	    print("\n" + "="*60)
	    print("统计摘要:")
	    print("="*60)
	    print(f"总JSON记录数: {total_records}")
	    print(f"总breakpoints数: {total_breakpoints}")
	    print(f"总window_history数: {total_window_histories}")

	    # A non-positive total would otherwise divide by zero further down.
	    if not length_counter or total_window_histories <= 0:
	        print("\n没有找到任何window_history数据")
	        return

	    print("\n" + "="*60)
	    print("Window History 长度分布:")
	    print("="*60)
	    print(f"{'长度':<10} {'数量':<10} {'百分比':<10} {'分布图'}")
	    print("-"*60)

	    # One row per distinct length, in ascending order.
	    for length in sorted(length_counter):
	        count = length_counter[length]
	        percentage = (count / total_window_histories) * 100
	        bar = '█' * int(percentage / 2)  # each block character stands for 2%
	        print(f"{length:<10} {count:<10} {percentage:>6.2f}% {bar}")

	    print("\n" + "="*60)
	    print("统计信息:")
	    print("="*60)
	    print(f"最小长度: {min(length_counter)}")
	    print(f"最大长度: {max(length_counter)}")

	    # Mean length, weighting each length by how often it occurred.
	    total_length = sum(length * count for length, count in length_counter.items())
	    avg_length = total_length / total_window_histories
	    print(f"平均长度: {avg_length:.2f}")

	    # Median length; an even number of observations averages the two
	    # middle values rather than reporting the upper one.
	    median_length = _median_from_counts(length_counter)
	    print(f"中位数长度: {median_length}")

	    print("="*60)

	# Command-line entry point: analyse one JSONL file and print its statistics.
	if __name__ == "__main__":
	    import sys

	    # Used when no path is given on the command line; replace it with the
	    # location of your own JSONL file or pass the path as the first argument.
	    default_filepath = "/data/haofeiy2/social-world-model/data/splitted_polymarket/polymarket_data_processed_with_news_train_2024-11-01.jsonl"
	    filepath = sys.argv[1] if len(sys.argv) > 1 else default_filepath

	    print(f"正在分析文件: {filepath}")
	    length_counter, total_records, total_breakpoints, total_window_histories = (
	        analyze_window_history_lengths(filepath)
	    )

	    # A ``None`` counter means the file could not be read; the error has
	    # already been printed, so just signal failure to the calling shell.
	    if length_counter is None:
	        sys.exit(1)

	    print_statistics(length_counter, total_records, total_breakpoints, total_window_histories)