Spaces:
Running
Running
File size: 4,249 Bytes
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
读取用户档案JSON文件中每个profile的summary项
"""
import json
import os
from typing import Any, Dict, List, Optional
def read_profile_summaries(json_file_path: str, max_profiles: Optional[int] = None) -> List[Dict[str, Any]]:
    """Read the ``Summary`` entry of every profile stored in a JSON file.

    The file must contain a JSON object. Each top-level key is treated as a
    profile id; the ``"metadata"`` key is skipped, and so is any value that is
    not a dict or that has no ``"Summary"`` key.

    This function never raises: every failure is reported on stdout and an
    empty list is returned instead.

    Args:
        json_file_path: Path of the JSON file to read (decoded as UTF-8).
        max_profiles: Maximum number of summaries to collect. ``None`` (the
            default) collects all of them; zero or a negative value yields an
            empty list.

    Returns:
        One ``{"profile_id": <key>, "summary": <value["Summary"]>}`` dict per
        matching profile, in the iteration order of the loaded object.
    """
    # Keep the try body limited to the part that can actually fail.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"错误: 文件 {json_file_path} 不存在")
        return []
    except json.JSONDecodeError as e:
        print(f"错误: JSON解析失败 - {e}")
        return []
    except Exception as e:
        # Deliberate catch-all: callers rely on this function returning []
        # rather than raising (permission errors, bad encoding, ...).
        print(f"错误: {e}")
        return []

    # A top-level list/string/number has no .items(); report that explicitly
    # instead of leaking an AttributeError message to the user.
    if not isinstance(data, dict):
        print(f"错误: JSON顶层必须是对象, 实际类型为 {type(data).__name__}")
        return []

    summaries: List[Dict[str, Any]] = []
    for key, value in data.items():
        if key == "metadata":
            continue
        # len(summaries) is the number collected so far, so no extra counter.
        if max_profiles is not None and len(summaries) >= max_profiles:
            break
        if isinstance(value, dict) and "Summary" in value:
            summaries.append({"profile_id": key, "summary": value["Summary"]})

    if max_profiles is not None:
        print(f"成功读取 {len(summaries)} 个profile的summary (限制: {max_profiles})")
    else:
        print(f"成功读取 {len(summaries)} 个profile的summary (读取所有)")
    return summaries
def save_summaries_to_file(summaries: List[Dict[str, Any]], output_file: str) -> None:
    """Write the summaries to ``output_file`` as pretty-printed UTF-8 JSON.

    Best-effort: any failure is reported on stdout and the function returns
    normally instead of raising.

    Args:
        summaries: List of summary dicts to serialize.
        output_file: Destination path. Missing parent directories are created.
    """
    try:
        # Serialize before opening the file: opening with 'w' truncates, so a
        # serialization error must not destroy an existing output file.
        payload = json.dumps(summaries, ensure_ascii=False, indent=2)

        # dirname is '' for a bare file name; makedirs('') would raise.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(payload)
        print(f"summaries已保存到: {output_file}")
    except Exception as e:
        # Deliberate catch-all: saving is best-effort and must not raise.
        print(f"保存文件时出错: {e}")
def print_summary_stats(summaries: List[Dict[str, Any]]) -> None:
    """Print count/length statistics and a preview of the first three summaries.

    Args:
        summaries: List of dicts, each with a ``"profile_id"`` and a
            ``"summary"`` key. A ``None`` summary counts as an empty string
            and any other non-string value is measured by its ``str()`` form.
    """
    if not summaries:
        print("没有找到任何summary数据")
        return

    # Normalize once so that a JSON null or a non-string summary cannot break
    # len() or slicing below.
    texts = [
        "" if item["summary"] is None else str(item["summary"])
        for item in summaries
    ]

    total_count = len(texts)
    total_length = sum(len(text) for text in texts)
    # total_count is at least 1 here because of the early return above.
    avg_length = total_length / total_count

    print("\n=== Summary统计信息 ===")
    print(f"总数量: {total_count}")
    print(f"平均长度: {avg_length:.1f} 字符")
    print(f"总长度: {total_length} 字符")

    # Show the first three profiles, truncating long summaries to 200 chars.
    print("\n=== 前3个Profile的Summary示例 ===")
    for index, (item, text) in enumerate(zip(summaries[:3], texts), start=1):
        print(f"\n{index}. Profile ID: {item['profile_id']}")
        summary_preview = (text[:200] + "...") if len(text) > 200 else text
        print(f" Summary: {summary_preview}")
def main(
    input_file: str = "/home/zhou/deeppersona/generate_user_profile_test/output/profile_world.json",
    output_file: str = "/home/zhou/deeppersona/generate_user_profile_final/output/summary_world.json",
) -> List[Dict[str, Any]]:
    """Read the profile summaries, print their statistics and save them.

    Args:
        input_file: Path of the profile JSON file to read. Defaults to the
            path this script was originally hard-wired to.
        output_file: Path the extracted summaries are written to. Defaults to
            the path this script was originally hard-wired to.

    Returns:
        The list of summaries that was read, or an empty list when nothing
        could be read.
    """
    print(f"开始读取文件: {input_file}")

    summaries = read_profile_summaries(input_file)
    if not summaries:
        print("未能读取到任何summary数据")
        return []

    print_summary_stats(summaries)
    save_summaries_to_file(summaries, output_file)
    # Returned so that importing modules can reuse the result.
    return summaries
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    summaries = main()
|