GeoLLM / test_1.py
Pengfa Li
Upload folder using huggingface_hub
badcf3c verified
####################################################
# Count how many times each `text` value occurs in a JSON file
# and report the duplicates. Currently of little practical use.
####################################################
import json
from collections import Counter
def analyze_text_content(file_path):
    """Count the ``text`` entries of a JSON file and report repeated ones.

    Args:
        file_path: Path to a UTF-8 encoded JSON file. The top-level value is
            iterated, and every element is indexed with the ``'text'`` key.

    Returns:
        A ``(total_texts, duplicates)`` tuple. ``total_texts`` is the number
        of ``text`` values read. ``duplicates`` is a plain dict mapping each
        ``text`` value that occurs more than once to its occurrence count,
        ordered by first appearance in the file.

    Raises:
        KeyError: If an element has no ``'text'`` key.
    """
    # Load the whole JSON document.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Collect every text value, preserving file order.
    text_list = [item['text'] for item in data]
    total_texts = len(text_list)

    # Counter keeps first-insertion order, so the duplicates dict below is
    # ordered by where each text first appears.
    text_count = Counter(text_list)

    # Keep only the values seen more than once.
    duplicates = {text: count for text, count in text_count.items() if count > 1}
    return total_texts, duplicates
# Run the analysis on the DeepSeek-R1 zero-shot output file and print a report.
file_path = r"F:\GeoLLM\output\zero_shot\deepseek-ai\DeepSeek-R1.json"
total, duplicates = analyze_text_content(file_path)

# Overall count first, then either the "no duplicates" notice or one line
# per repeated text value.
print(f"文件中共包含 {total} 个text")
if not duplicates:
    print("\n没有发现重复的text")
else:
    print("\n重复的text项:")
    for text, count in duplicates.items():
        print(f"出现 {count} 次的text: {text}")