#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Read the summary field of every profile in a user-profile JSON file.
"""
import json
import os
from typing import Any, Dict, List, Optional
def read_profile_summaries(
    json_file_path: str,
    max_profiles: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """Collect the ``Summary`` field of each profile stored in a JSON file.

    The file must contain a single top-level JSON object. An entry counts as
    a profile when its value is an object that has a ``"Summary"`` key; the
    entry's key itself is not inspected. The ``"metadata"`` entry is always
    skipped, as is any entry that does not look like a profile.

    Args:
        json_file_path: Path of the JSON file to read (decoded as UTF-8).
        max_profiles: Upper bound on the number of profiles collected.
            ``None`` (the default) collects every profile; zero or a
            negative value collects none.

    Returns:
        A list of ``{"profile_id": <entry key>, "summary": <Summary value>}``
        dicts, in the order the entries appear in the parsed object. An
        empty list is returned, after printing an error message, when the
        file is missing, cannot be read or decoded, is not valid JSON, or
        its top level is not a JSON object.
    """
    # Keep the try body down to the only statements that can fail for
    # reasons outside this function's control: opening and parsing the file.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"错误: 文件 {json_file_path} 不存在")
        return []
    except json.JSONDecodeError as e:
        print(f"错误: JSON解析失败 - {e}")
        return []
    except (OSError, UnicodeDecodeError) as e:
        # Permission problems, directories, invalid UTF-8 bytes, ...
        print(f"错误: {e}")
        return []

    # A top-level list/string/number has no .items(); report that directly
    # instead of letting an AttributeError leak into the error message.
    if not isinstance(data, dict):
        print(f"错误: JSON顶层应为对象(dict),实际为 {type(data).__name__}")
        return []

    summaries: List[Dict[str, Any]] = []
    for key, value in data.items():
        if key == "metadata":
            continue
        # Stop as soon as the requested number of profiles has been reached.
        if max_profiles is not None and len(summaries) >= max_profiles:
            break
        if isinstance(value, dict) and "Summary" in value:
            summaries.append({"profile_id": key, "summary": value["Summary"]})

    if max_profiles is not None:
        print(f"成功读取 {len(summaries)} 个profile的summary (限制: {max_profiles})")
    else:
        print(f"成功读取 {len(summaries)} 个profile的summary (读取所有)")
    return summaries
def save_summaries_to_file(summaries: List[Dict[str, Any]], output_file: str) -> None:
    """Write the summaries to ``output_file`` as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    two-space indent. Missing parent directories of ``output_file`` are
    created. Failures are reported on stdout rather than raised.

    Args:
        summaries: Summary records to serialize.
        output_file: Destination file path; an existing file is overwritten.
    """
    try:
        # Serialize before touching the file system: if a value is not JSON
        # serializable, an existing output file is left intact instead of
        # being truncated by open(..., 'w') and then half-written.
        serialized = json.dumps(summaries, ensure_ascii=False, indent=2)

        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except (OSError, TypeError, ValueError) as e:
        # OSError: path/permission problems; TypeError/ValueError: data that
        # json cannot serialize (unsupported types, circular references).
        print(f"保存文件时出错: {e}")
    else:
        print(f"summaries已保存到: {output_file}")
def print_summary_stats(summaries: List[Dict[str, Any]]) -> None:
    """Print count and length statistics plus a preview of the first summaries.

    Prints the number of records, the average and total summary length in
    characters, and then the first three records, each preview truncated to
    200 characters followed by ``"..."`` when the summary is longer.

    Summaries that are not strings are tolerated: ``None`` is treated as an
    empty string and any other value is measured and shown via ``str()``.

    Args:
        summaries: Records with ``"profile_id"`` and ``"summary"`` keys.
    """
    if not summaries:
        print("没有找到任何summary数据")
        return

    # Normalize once so len() and slicing below never see None or other
    # non-string JSON values.
    texts = [
        "" if item["summary"] is None else str(item["summary"])
        for item in summaries
    ]

    total_count = len(texts)
    total_length = sum(len(text) for text in texts)
    # total_count is at least 1 here because of the early return above.
    avg_length = total_length / total_count

    print("\n=== Summary统计信息 ===")
    print(f"总数量: {total_count}")
    print(f"平均长度: {avg_length:.1f} 字符")
    print(f"总长度: {total_length} 字符")

    # Show the first three profiles as examples.
    print("\n=== 前3个Profile的Summary示例 ===")
    for position, (item, text) in enumerate(zip(summaries[:3], texts), start=1):
        print(f"\n{position}. Profile ID: {item['profile_id']}")
        summary_preview = text[:200] + "..." if len(text) > 200 else text
        print(f" Summary: {summary_preview}")
# Default locations used when main() is called without arguments.
DEFAULT_INPUT_FILE = "/home/zhou/deeppersona/generate_user_profile_test/output/profile_world.json"
DEFAULT_OUTPUT_FILE = "/home/zhou/deeppersona/generate_user_profile_final/output/summary_world.json"


def main(
    input_file: str = DEFAULT_INPUT_FILE,
    output_file: str = DEFAULT_OUTPUT_FILE,
) -> List[Dict[str, Any]]:
    """Read profile summaries, print statistics and save them as JSON.

    Args:
        input_file: Path of the profile JSON file to read. Defaults to
            ``DEFAULT_INPUT_FILE``.
        output_file: Path the extracted summaries are written to. Defaults
            to ``DEFAULT_OUTPUT_FILE``.

    Returns:
        The list of summary records that was read, so other modules can
        reuse it; an empty list when nothing could be read (in which case
        no statistics are printed and no output file is written).
    """
    print(f"开始读取文件: {input_file}")

    summaries = read_profile_summaries(input_file)
    if not summaries:
        print("未能读取到任何summary数据")
        return []

    print_summary_stats(summaries)
    save_summaries_to_file(summaries, output_file)
    return summaries
if __name__ == "__main__":
    # Run the pipeline when executed as a script. The return value is not
    # bound to a module-level name: nothing reads it, and a global called
    # ``summaries`` would shadow the parameters of the functions in this file.
    main()