| import json |
| from collections import Counter |
|
|
def analyze_window_history_lengths(filepath):
    """
    Tally the length distribution of every ``window_history`` list in a JSONL file.

    Each non-blank line is parsed as one JSON record. For every entry of the
    record's ``daily_breakpoints`` list, the length of that entry's
    ``window_history`` list is counted. Malformed lines, records or entries
    are reported on stdout and skipped; they never abort the scan.

    Args:
        filepath: Path to the JSONL file.

    Returns:
        A 4-tuple ``(length_counter, total_records, total_breakpoints,
        total_window_histories)`` where

        * ``length_counter`` is a ``Counter`` mapping a ``window_history``
          length to the number of times it occurred,
        * ``total_records`` is the number of lines that parsed as JSON,
        * ``total_breakpoints`` is the number of ``daily_breakpoints``
          entries visited,
        * ``total_window_histories`` is the number of entries whose
          ``window_history`` was a list (and was therefore counted).

        If the file cannot be opened or read, ``(None, 0, 0, 0)`` is returned.
    """
    length_counter = Counter()
    total_records = 0
    total_breakpoints = 0
    total_window_histories = 0

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                # Keep the try body minimal: only json.loads can raise here.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"JSON解析错误在第{line_num}行: {e}")
                    continue
                total_records += 1

                # A line may be valid JSON without being an object (e.g. a
                # bare number); such a record cannot carry the field, so it
                # is reported as missing instead of raising TypeError.
                if not isinstance(data, dict) or 'daily_breakpoints' not in data:
                    print(f"警告: 第{line_num}行没有'daily_breakpoints'字段")
                    continue

                daily_breakpoints = data['daily_breakpoints']
                # null / scalar / object values are not iterable as a list of
                # breakpoints; skip the record rather than abort the file.
                if not isinstance(daily_breakpoints, list):
                    print(f"警告: 第{line_num}行的'daily_breakpoints'不是列表")
                    continue

                for i, entry in enumerate(daily_breakpoints):
                    total_breakpoints += 1

                    # Non-dict entries are treated as lacking the field.
                    if not isinstance(entry, dict) or 'window_history' not in entry:
                        print(f"警告: 第{line_num}行的breakpoint[{i}]没有'window_history'字段")
                        continue

                    window_history = entry['window_history']
                    if isinstance(window_history, list):
                        length_counter[len(window_history)] += 1
                        total_window_histories += 1
                    else:
                        print(f"警告: 第{line_num}行的breakpoint[{i}]的'window_history'不是列表")

        return length_counter, total_records, total_breakpoints, total_window_histories

    except FileNotFoundError:
        print(f"文件未找到: {filepath}")
        return None, 0, 0, 0
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on this function
        # reporting I/O or decoding failures and returning the sentinel
        # tuple rather than raising.
        print(f"发生错误: {e}")
        return None, 0, 0, 0
|
|
def _median_from_counter(length_counter):
    """
    Return the median of the multiset described by ``length_counter``.

    The counter maps a value to its multiplicity. The median is located by
    walking the sorted values with a running count, so the multiset is never
    expanded into a list. For an even number of items the mean of the two
    middle values is returned; the result is kept as an int whenever that
    mean is a whole number.

    Raises:
        ValueError: If the counts sum to zero or less.
    """
    n = sum(length_counter.values())
    if n <= 0:
        raise ValueError("cannot take the median of an empty distribution")

    # Zero-based positions of the two middle items (identical when n is odd).
    lower_idx, upper_idx = (n - 1) // 2, n // 2
    lower = upper = None
    seen = 0
    for length in sorted(length_counter):
        seen += length_counter[length]
        if lower is None and seen > lower_idx:
            lower = length
        if seen > upper_idx:
            upper = length
            break

    middle_sum = lower + upper
    return middle_sum // 2 if middle_sum % 2 == 0 else middle_sum / 2


def print_statistics(length_counter, total_records, total_breakpoints, total_window_histories):
    """
    Print a summary and the length distribution of ``window_history`` lists.

    Output goes to stdout and consists of the three totals, then (when data
    is present) one row per distinct length with its count, percentage and a
    bar of one block character per two percentage points, followed by the
    minimum, maximum, mean and median length.

    Args:
        length_counter: Mapping of ``window_history`` length to occurrence count.
        total_records: Number of JSON records that were parsed.
        total_breakpoints: Number of breakpoint entries that were visited.
        total_window_histories: Number of counted ``window_history`` lists;
            used as the denominator for percentages and the mean.

    Returns:
        None.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("统计摘要:")
    print(rule)
    print(f"总JSON记录数: {total_records}")
    print(f"总breakpoints数: {total_breakpoints}")
    print(f"总window_history数: {total_window_histories}")

    # A zero total would make every percentage and the mean divide by zero,
    # so it is reported the same way as an empty distribution.
    if not length_counter or total_window_histories <= 0:
        print("\n没有找到任何window_history数据")
        return

    print("\n" + rule)
    print("Window History 长度分布:")
    print(rule)
    print(f"{'长度':<10} {'数量':<10} {'百分比':<10} {'分布图'}")
    print("-" * 60)

    for length in sorted(length_counter):
        count = length_counter[length]
        percentage = (count / total_window_histories) * 100
        bar = '█' * int(percentage / 2)
        print(f"{length:<10} {count:<10} {percentage:>6.2f}% {bar}")

    print("\n" + rule)
    print("统计信息:")
    print(rule)
    print(f"最小长度: {min(length_counter)}")
    print(f"最大长度: {max(length_counter)}")

    total_length = sum(length * count for length, count in length_counter.items())
    avg_length = total_length / total_window_histories
    print(f"平均长度: {avg_length:.2f}")

    median_length = _median_from_counter(length_counter)
    print(f"中位数长度: {median_length}")

    print(rule)
|
|
| |
if __name__ == "__main__":
    import sys

    # Dataset analysed when no path is given on the command line.
    default_filepath = "/data/haofeiy2/social-world-model/data/splitted_polymarket/polymarket_data_processed_with_news_train_2024-11-01.jsonl"

    # An optional first CLI argument overrides the default path so the script
    # can be pointed at another JSONL file without editing the source.
    filepath = sys.argv[1] if len(sys.argv) > 1 else default_filepath

    print(f"正在分析文件: {filepath}")
    length_counter, total_records, total_breakpoints, total_window_histories = \
        analyze_window_history_lengths(filepath)

    # The analyser returns None for the counter when the file could not be
    # read; it has already printed the reason, so nothing more is reported.
    if length_counter is not None:
        print_statistics(length_counter, total_records, total_breakpoints, total_window_histories)