| #################################################### | |
| # 统计json文件中text出现的次数,现在没什么用 | |
| #################################################### | |
| import json | |
def analyze_text_content(file_path):
    """Count the ``text`` values in a JSON file and report the repeated ones.

    The file is parsed with ``json.load`` and iterated; each item is indexed
    with ``item['text']``, so the document is expected to be a sequence of
    objects that each carry a ``text`` key.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to analyse.

    Returns:
        A ``(total_texts, duplicates)`` tuple. ``total_texts`` is the number
        of items read (repeats included). ``duplicates`` is a plain ``dict``
        mapping each ``text`` value that occurs more than once to its number
        of occurrences, ordered by first appearance in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item has no ``text`` key.
        TypeError: If an item cannot be indexed by ``'text'`` or a ``text``
            value is not hashable.
    """
    # Imported here so this function stays self-contained and does not
    # depend on the module's import section being edited.
    from collections import Counter

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Collect every text value, keeping repeats so the total is accurate.
    text_list = [item['text'] for item in data]
    total_texts = len(text_list)

    # Counter replaces the hand-written "get(..., 0) + 1" tally; like a dict
    # it keeps keys in first-seen order.
    text_count = Counter(text_list)

    # Keep only the values that occur more than once, as a plain dict.
    duplicates = {text: count for text, count in text_count.items() if count > 1}
    return total_texts, duplicates
# Run the analysis on the JSON file at the hard-coded path and print a report.
file_path = r"F:\GeoLLM\output\zero_shot\deepseek-ai\DeepSeek-R1.json"
total, duplicates = analyze_text_content(file_path)

# Always report the overall number of text entries first.
print(f"文件中共包含 {total} 个text")

if not duplicates:
    # Nothing occurred more than once.
    print("\n没有发现重复的text")
else:
    # List every repeated text together with its occurrence count.
    print("\n重复的text项:")
    for text, count in duplicates.items():
        print(f"出现 {count} 次的text: {text}")