File size: 4,249 Bytes
0701598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
读取用户档案JSON文件中每个profile的summary项
"""

import json
import os
from typing import List, Dict, Any

def read_profile_summaries(json_file_path: str, max_profiles: int = None) -> List[Dict[str, Any]]:
    """
    Collect the "Summary" field of every profile stored in a JSON file.

    The file is expected to hold a JSON object. The "metadata" key is ignored,
    and an entry only counts as a profile when its value is a dict that
    contains a "Summary" key. A status line is printed on success and an error
    line on failure.

    Args:
        json_file_path: Path of the JSON file to read.
        max_profiles: Upper bound on the number of summaries collected;
            None (the default) collects all of them.

    Returns:
        List[Dict]: One {"profile_id": key, "summary": value} dict per profile,
        in file order. An empty list is returned when the file is missing,
        is not valid JSON, or any other error occurs.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            profiles = json.load(handle)

        collected = []

        for profile_id, entry in profiles.items():
            # The metadata entry describes the file, not a profile.
            if profile_id == "metadata":
                continue

            # Stop as soon as the requested number of summaries is reached.
            if max_profiles is not None and len(collected) >= max_profiles:
                break

            # Anything that is not a dict carrying a "Summary" is not a profile.
            if not isinstance(entry, dict) or "Summary" not in entry:
                continue

            collected.append({
                "profile_id": profile_id,
                "summary": entry["Summary"]
            })

        if max_profiles is None:
            print(f"成功读取 {len(collected)} 个profile的summary (读取所有)")
        else:
            print(f"成功读取 {len(collected)} 个profile的summary (限制: {max_profiles})")
        return collected

    except FileNotFoundError:
        print(f"错误: 文件 {json_file_path} 不存在")
        return []
    except json.JSONDecodeError as e:
        print(f"错误: JSON解析失败 - {e}")
        return []
    except Exception as e:
        # Catch-all so that callers always get a list back.
        print(f"错误: {e}")
        return []

def save_summaries_to_file(summaries: List[Dict[str, Any]], output_file: str):
    """
    Write the summaries to a file as indented, UTF-8 encoded JSON.

    Missing parent directories of ``output_file`` are created. The data is
    serialised before the file is opened, so a value that cannot be encoded
    as JSON never truncates or partially overwrites an existing file.
    Failures are reported on stdout rather than raised.

    Args:
        summaries: List of summary dicts to store.
        output_file: Destination path of the JSON file.
    """
    try:
        # Serialise first: opening the file with 'w' truncates it, so any
        # encoding error must surface before that happens.
        payload = json.dumps(summaries, ensure_ascii=False, indent=2)

        # A bare file name has no directory component; nothing to create then.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"summaries已保存到: {output_file}")
    except Exception as e:
        # Saving is best-effort: report the problem and let the caller go on.
        print(f"保存文件时出错: {e}")

def _summary_text(item: Dict[str, Any]) -> str:
    """Return an item's summary as text so it can be measured and sliced."""
    summary = item["summary"]
    if isinstance(summary, str):
        return summary
    # JSON allows null, numbers, lists and objects here; None counts as empty
    # and every other value is shown through its string form.
    if summary is None:
        return ""
    return str(summary)


def print_summary_stats(summaries: List[Dict[str, Any]]):
    """
    Print count and length statistics plus a preview of the first summaries.

    Each item must provide the "profile_id" and "summary" keys. A summary
    that is not a string does not abort the report: None is treated as an
    empty summary and any other value is converted with ``str()``.

    Args:
        summaries: List of summary dicts to report on.
    """
    if not summaries:
        print("没有找到任何summary数据")
        return

    texts = [_summary_text(item) for item in summaries]
    total_count = len(texts)
    total_length = sum(len(text) for text in texts)
    # total_count is at least 1 here because the empty case returned above.
    avg_length = total_length / total_count

    print("\n=== Summary统计信息 ===")
    print(f"总数量: {total_count}")
    print(f"平均长度: {avg_length:.1f} 字符")
    print(f"总长度: {total_length} 字符")

    # Show the first three profiles as examples, truncating long summaries
    # to 200 characters.
    print("\n=== 前3个Profile的Summary示例 ===")
    for position, (item, text) in enumerate(zip(summaries[:3], texts), start=1):
        print(f"\n{position}. Profile ID: {item['profile_id']}")
        summary_preview = text[:200] + "..." if len(text) > 200 else text
        print(f"   Summary: {summary_preview}")

def main(
    input_file: str = "/home/zhou/deeppersona/generate_user_profile_test/output/profile_world.json",
    output_file: str = "/home/zhou/deeppersona/generate_user_profile_final/output/summary_world.json",
) -> List[Dict[str, Any]]:
    """
    Read profile summaries, print statistics about them and save them.

    Args:
        input_file: Path of the profile JSON file to read. Defaults to the
            location this script was originally written for.
        output_file: Path of the JSON file the summaries are written to.
            Defaults to the location this script was originally written for.

    Returns:
        The list of summaries that was read, or an empty list when nothing
        could be read (in which case nothing is saved).
    """
    print(f"开始读取文件: {input_file}")

    # Read the summaries
    summaries = read_profile_summaries(input_file)

    if not summaries:
        print("未能读取到任何summary数据")
        return []

    # Print the statistics
    print_summary_stats(summaries)

    # Save to file
    save_summaries_to_file(summaries, output_file)

    # Hand the result back for use by other modules
    return summaries

# Run the extraction only when this file is executed as a script; the list
# returned by main() is bound to the module-level name `summaries`.
if __name__ == "__main__":
    summaries = main()