# Source: Time-Series-Library / data_provider / calculate_window_len.py
# Repository page metadata: lwaekfjlk's picture; commit "Upload Time-Series-Library" (f9d3aeb, verified)
import json
from collections import Counter
def analyze_window_history_lengths(filepath):
    """
    Tally the length distribution of every ``window_history`` in a JSONL file.

    Each non-blank line is parsed as one JSON record. A record is expected to
    be an object whose ``daily_breakpoints`` field is a list of objects, each
    carrying a ``window_history`` list. Anything that does not match this
    shape is reported with a warning and skipped, so a single malformed
    record never discards the statistics collected from the rest of the file.

    Args:
        filepath: Path of the JSONL file to read (opened as UTF-8 text).

    Returns:
        A 4-tuple ``(length_counter, total_records, total_breakpoints,
        total_window_histories)`` where:

        * ``length_counter`` is a ``Counter`` mapping a ``window_history``
          length to the number of times it occurred;
        * ``total_records`` is the number of lines that parsed as JSON;
        * ``total_breakpoints`` is the number of ``daily_breakpoints``
          elements visited (including ones that were skipped);
        * ``total_window_histories`` is the number of list-valued
          ``window_history`` fields that were counted.

        If the file does not exist, or reading it fails for any other
        reason, the error is printed and ``(None, 0, 0, 0)`` is returned.
    """
    length_counter = Counter()
    total_records = 0
    total_breakpoints = 0
    total_window_histories = 0

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                # Only the parse itself is guarded here, so a decode error
                # cannot be confused with a failure in the counting logic.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"JSON解析错误在第{line_num}行: {e}")
                    continue
                total_records += 1

                # A valid JSON line may still be a scalar or an array; a
                # membership test on those would raise or silently misbehave.
                if not isinstance(data, dict):
                    print(f"警告: 第{line_num}行不是JSON对象")
                    continue

                # Check that the daily_breakpoints field is present.
                if 'daily_breakpoints' not in data:
                    print(f"警告: 第{line_num}行没有'daily_breakpoints'字段")
                    continue

                daily_breakpoints = data['daily_breakpoints']
                if not isinstance(daily_breakpoints, list):
                    print(f"警告: 第{line_num}行的'daily_breakpoints'不是列表")
                    continue

                # Visit every breakpoint; each element counts toward the
                # total even when it is skipped below.
                for i, entry in enumerate(daily_breakpoints):
                    total_breakpoints += 1

                    if not isinstance(entry, dict):
                        print(f"警告: 第{line_num}行的breakpoint[{i}]不是JSON对象")
                        continue

                    # Check that the window_history field is present.
                    if 'window_history' not in entry:
                        print(f"警告: 第{line_num}行的breakpoint[{i}]没有'window_history'字段")
                        continue

                    window_history = entry['window_history']
                    if not isinstance(window_history, list):
                        print(f"警告: 第{line_num}行的breakpoint[{i}]的'window_history'不是列表")
                        continue

                    # Record the length of this history.
                    length_counter[len(window_history)] += 1
                    total_window_histories += 1
    except FileNotFoundError:
        print(f"文件未找到: {filepath}")
        return None, 0, 0, 0
    except Exception as e:
        # Deliberate catch-all boundary: callers rely on the
        # (None, 0, 0, 0) result instead of an exception for I/O or
        # decoding failures.
        print(f"发生错误: {e}")
        return None, 0, 0, 0

    return length_counter, total_records, total_breakpoints, total_window_histories
def _median_from_counts(length_counter):
    """Return the median of the distribution described by *length_counter*.

    The median is found by walking cumulative counts in ascending key order,
    so the distribution is never expanded into a list. For an even number of
    samples the result is the mean of the two middle values; when both middle
    values are equal (always the case for an odd number of samples) that
    value is returned unchanged. Returns ``None`` when no key has a positive
    count.
    """
    ordered = [(length, count) for length, count in sorted(length_counter.items()) if count > 0]
    sample_count = sum(count for _, count in ordered)
    if sample_count == 0:
        return None

    # Zero-based ranks of the two middle samples (identical when odd).
    lower_rank = (sample_count - 1) // 2
    upper_rank = sample_count // 2

    lower_middle = None
    upper_middle = None
    seen = 0
    for length, count in ordered:
        seen += count
        if lower_middle is None and seen > lower_rank:
            lower_middle = length
        if seen > upper_rank:
            upper_middle = length
            break

    if lower_middle == upper_middle:
        return lower_middle
    return (lower_middle + upper_middle) / 2


def print_statistics(length_counter, total_records, total_breakpoints, total_window_histories):
    """
    Print a summary and a text histogram of ``window_history`` lengths.

    Args:
        length_counter: Mapping of length -> occurrence count (may be
            ``None`` or empty, in which case only the summary is printed).
        total_records: Number of JSON records that were read.
        total_breakpoints: Number of breakpoints that were visited.
        total_window_histories: Number of window histories that were
            counted; used as the denominator for percentages and the mean.

    Returns:
        None. All output goes to standard output.
    """
    print("\n" + "="*60)
    print("统计摘要:")
    print("="*60)
    print(f"总JSON记录数: {total_records}")
    print(f"总breakpoints数: {total_breakpoints}")
    print(f"总window_history数: {total_window_histories}")

    # A non-positive total would make every ratio below divide by zero, so
    # it is treated the same as having no distribution at all.
    if not length_counter or total_window_histories <= 0:
        print("\n没有找到任何window_history数据")
        return

    print("\n" + "="*60)
    print("Window History 长度分布:")
    print("="*60)
    print(f"{'长度':<10} {'数量':<10} {'百分比':<10} {'分布图'}")
    print("-"*60)

    # One row per length, in ascending order.
    for length in sorted(length_counter):
        count = length_counter[length]
        percentage = (count / total_window_histories) * 100
        bar = '█' * int(percentage / 2)  # each block stands for 2%
        print(f"{length:<10} {count:<10} {percentage:>6.2f}% {bar}")

    print("\n" + "="*60)
    print("统计信息:")
    print("="*60)
    print(f"最小长度: {min(length_counter)}")
    print(f"最大长度: {max(length_counter)}")

    # Mean length, weighted by how often each length occurred.
    total_length = sum(length * count for length, count in length_counter.items())
    avg_length = total_length / total_window_histories
    print(f"平均长度: {avg_length:.2f}")

    # Median length; skipped only when the counter holds no positive counts.
    median_length = _median_from_counts(length_counter)
    if median_length is not None:
        print(f"中位数长度: {median_length}")
    print("="*60)
# Example usage: run this script with an optional JSONL path argument.
if __name__ == "__main__":
    import sys

    # Default dataset location, used when no path is given on the command
    # line; pass a path as the first argument to analyze a different file.
    default_filepath = "/data/haofeiy2/social-world-model/data/splitted_polymarket/polymarket_data_processed_with_news_train_2024-11-01.jsonl"
    filepath = sys.argv[1] if len(sys.argv) > 1 else default_filepath

    print(f"正在分析文件: {filepath}")
    length_counter, total_records, total_breakpoints, total_window_histories = \
        analyze_window_history_lengths(filepath)

    # The analyzer returns None for the counter when the file could not be
    # read; in that case it has already printed the reason.
    if length_counter is not None:
        print_statistics(length_counter, total_records, total_breakpoints, total_window_histories)