####################################################
# Count how many times each "text" value occurs in a JSON file.
# Currently not of much use.
####################################################


import json

def analyze_text_content(file_path):
    """Count the ``text`` entries of a JSON file and report repeated values.

    The top-level JSON value is iterated and every element is indexed with
    ``["text"]``, so the file is expected to contain an array of objects that
    each carry a ``"text"`` key.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        A ``(total_texts, duplicates)`` tuple where ``total_texts`` is the
        number of entries read and ``duplicates`` is a plain ``dict`` mapping
        each text value that occurs more than once to its occurrence count,
        ordered by first appearance in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry has no ``"text"`` key.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Collect every text value, keeping file order.
    text_list = [item['text'] for item in data]
    total_texts = len(text_list)

    # Counter keeps first-seen insertion order, so the duplicates dict below
    # lists repeated values in the order they first appear.
    text_count = Counter(text_list)

    # Keep only the values that occur more than once.
    duplicates = {text: count for text, count in text_count.items() if count > 1}

    return total_texts, duplicates

# Run the analysis only when this file is executed as a script, so that
# importing the module does not attempt to read the hard-coded file below.
if __name__ == "__main__":
    # Hard-coded input file whose "text" entries are analysed.
    file_path = r"F:\GeoLLM\output\zero_shot\deepseek-ai\DeepSeek-R1.json"
    total, duplicates = analyze_text_content(file_path)

    # Report the total number of entries, then each repeated value (if any).
    print(f"文件中共包含 {total} 个text")
    if duplicates:
        print("\n重复的text项：")
        for text, count in duplicates.items():
            print(f"出现 {count} 次的text: {text}")
    else:
        print("\n没有发现重复的text")