# Spaces: pengfali / GeoLLM
# Runtime error
# File size: 17,517 Bytes
# badcf3c

import json
import numpy as np
from metrics.graph_matching import (
    get_triple_match_f1, 
    get_graph_match_accuracy,
    get_bert_score,
    get_bleu_rouge,
    split_to_edges,
    get_tokens,
    get_ged
)

def load_data(gold_path, pred_path):
    """Load gold and predicted triples and align them by source text.

    Both files are read as UTF-8 JSON and are expected to be sequences of
    records exposing a ``text`` entry and a ``triple_list`` entry.  Only
    texts that occur in the prediction file are evaluated: each prediction
    is paired with the first gold record whose ``text`` is equal to it, and
    predictions without a gold counterpart are skipped.

    Args:
        gold_path: Path to the JSON file holding the reference records.
        pred_path: Path to the JSON file holding the predicted records.

    Returns:
        A ``(gold_graphs, pred_graphs)`` tuple of equally long lists in
        prediction-file order; ``gold_graphs[i]`` and ``pred_graphs[i]`` are
        the ``triple_list`` values belonging to the same text.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
        KeyError: If a record lacks ``text`` or a matched record lacks
            ``triple_list``.
    """
    # Reference data.
    with open(gold_path, 'r', encoding='utf-8') as f:
        gold_data = json.load(f)

    # Predicted data.
    with open(pred_path, 'r', encoding='utf-8') as f:
        pred_data = json.load(f)

    # Index the gold records by text once instead of rescanning the whole
    # gold list for every prediction.  setdefault keeps the first record for
    # a repeated text, which is the one a linear scan would have found.
    gold_by_text = {}
    for gold_item in gold_data:
        gold_by_text.setdefault(gold_item['text'], gold_item)

    gold_graphs = []
    pred_graphs = []
    for pred_item in pred_data:
        gold_item = gold_by_text.get(pred_item['text'])
        if gold_item is None:
            # No reference for this text: leave it out of the evaluation.
            continue
        gold_graphs.append(gold_item['triple_list'])
        pred_graphs.append(pred_item['triple_list'])

    return gold_graphs, pred_graphs

def evaluate_triples(gold_graphs, pred_graphs):
    """Score predicted triple graphs against gold graphs and print a report.

    Two metrics are produced with helpers imported at the top of this file:
    triple-level precision/recall/F1 from ``get_triple_match_f1`` and a BERT
    score from ``get_bert_score`` over the edges returned by
    ``split_to_edges``.  Every value is printed with four decimals and also
    returned.

    Args:
        gold_graphs: Reference triple lists, one entry per text.
        pred_graphs: Predicted triple lists, aligned with ``gold_graphs``.

    Returns:
        A dict with the keys ``'triple_match'`` and ``'bert_score'``; each
        maps to a dict with ``'precision'``, ``'recall'`` and ``'f1'``.  The
        BERT entries are the ``.mean()`` of the per-item scores.
    """
    print("开始评估...")
    print("=" * 50)

    # 1. Triple Match F1
    tm_precision, tm_recall, tm_f1 = get_triple_match_f1(gold_graphs, pred_graphs)
    print("Triple Match")
    print(f"精确率: {tm_precision:.4f}, 召回率: {tm_recall:.4f}, F1: {tm_f1:.4f}")

    # 3. BERT Score, computed edge-wise and averaged.
    gold_edges = split_to_edges(gold_graphs)
    pred_edges = split_to_edges(pred_graphs)
    bs_precisions, bs_recalls, bs_f1s = get_bert_score(gold_edges, pred_edges)
    bert_precision = bs_precisions.mean()
    bert_recall = bs_recalls.mean()
    bert_f1 = bs_f1s.mean()
    print("BERT Score:")
    print(f"- Precision: {bert_precision:.4f}")
    print(f"- Recall: {bert_recall:.4f}")
    print(f"- F1: {bert_f1:.4f}")

    # NOTE: graph-match accuracy (get_graph_match_accuracy), BLEU/ROUGE
    # (get_tokens + get_bleu_rouge) and the average graph edit distance
    # (get_ged) are currently not computed here, so the result carries no
    # 'graph_acc', 'bleu', 'rouge' or 'ged' entries.
    triple_match = {
        'precision': tm_precision,
        'recall': tm_recall,
        'f1': tm_f1,
    }
    bert_score = {
        'precision': bert_precision,
        'recall': bert_recall,
        'f1': bert_f1,
    }
    return {
        'triple_match': triple_match,
        'bert_score': bert_score,
    }

if __name__ == '__main__':
    # Script entry point: evaluate every prediction file listed in
    # model_paths against the gold set and append one report per file to the
    # shared results text file.
    from pathlib import Path

    # File locations.
    gold_path = './data/GT_500.json'
    save_path = 'F:/GeoLLM/output/output_result/Task1/Result_Task1.txt'

    # Prediction files to evaluate.  The commented-out entries are earlier
    # experiment runs; uncomment the ones that should be (re-)evaluated.
    model_paths = [
        # # gpt-3.5
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/old/gpt-3.5-turbo.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/gpt-3p5-turbo.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/gpt-3p5-turbo.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/gpt-3p5-turbo.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/gpt-3p5-turbo.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/gpt-3p5-turbo.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/gpt-3p5-turbo.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/gpt-3p5-turbo.json',  # knowledge-guided one-shot
        # # gpt-4o
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/old/gpt-4o.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/gpt-4o.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/gpt-4o.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/gpt-4o.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/gpt-4o.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/gpt-4o.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/gpt-4o.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/gpt-4o.json',  # knowledge-guided one-shot
        # # gemini-1p5-pro-002
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/gemini-1p5-pro-002.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/gemini-1p5-pro-002.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/gemini-1p5-pro-002.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/gemini-1p5-pro-002.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/gemini-1p5-pro-002.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/gemini-1p5-pro-002.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/gemini-1p5-pro-002.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/gemini-1p5-pro-002.json',  # knowledge-guided one-shot
        # # claude-3-5-haiku-20241022
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/claude-3-5-haiku-20241022.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/claude-3-5-haiku-20241022.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/claude-3-5-haiku-20241022.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/claude-3-5-haiku-20241022.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/claude-3-5-haiku-20241022.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/claude-3-5-haiku-20241022.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/claude-3-5-haiku-20241022.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/claude-3-5-haiku-20241022.json',  # knowledge-guided one-shot
        # # deepseek-ai
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/deepseek-ai/DeepSeek-V3.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/deepseek-ai/DeepSeek-V3.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/deepseek-ai/DeepSeek-V3.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/deepseek-ai/DeepSeek-V3.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/deepseek-ai/DeepSeek-V3.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/deepseek-ai/DeepSeek-V3.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/deepseek-ai/DeepSeek-V3.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/deepseek-ai/DeepSeek-V3.json',  # knowledge-guided one-shot

        # # R1
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/deepseek-ai/DeepSeek-R1.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/deepseek-ai/DeepSeek-R1.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/deepseek-ai/DeepSeek-R1.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/deepseek-ai/DeepSeek-R1.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/deepseek-ai/DeepSeek-R1.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/deepseek-ai/DeepSeek-R1.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/deepseek-ai/DeepSeek-R1.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/deepseek-ai/DeepSeek-R1.json',  # knowledge-guided one-shot
        # # meta-llama
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/meta-llama/Meta-Llama-3p1-405B-Instruct.json',  # knowledge-guided one-shot
        # # Qwen
        # 'F:/GeoLLM/output/output_result/Task1/nomal/zero_shot/Qwen/Qwen2p5-72B-Instruct.json',  # zero-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/one_shot/Qwen/Qwen2p5-72B-Instruct.json',  # one-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/two_shot/Qwen/Qwen2p5-72B-Instruct.json',  # two-shot
        # 'F:/GeoLLM/output/output_result/Task1/nomal/three_shot/Qwen/Qwen2p5-72B-Instruct.json',  # three-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/one_shot/Qwen/Qwen2p5-72B-Instruct.json',  # KNN one-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/two_shot/Qwen/Qwen2p5-72B-Instruct.json',  # KNN two-shot
        # 'F:/GeoLLM/output/output_result/Task1/knn/three_shot/Qwen/Qwen2p5-72B-Instruct.json',  # KNN three-shot
        # 'F:/GeoLLM/output/output_result/Task1/Knowledge-guided/one_shot/Qwen/Qwen2p5-72B-Instruct.json',  # knowledge-guided one-shot

        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-3.5-turbo_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-3.5-turbo_0407.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-3.5-turbo_old.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gpt-4o_konwledge_tri.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/gemini-1.5-pro-002_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/claude-3-5-haiku-20241022_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/deepseek-ai/DeepSeek-V3_0420.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_only_tri/deepseek-ai/DeepSeek-V3.json',  # knowledge-guided one-shot
        'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_guide.json',  # knowledge-guided one-shot
        'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_one_shot.json',  # knowledge-guided one-shot
        'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_only_tri/deepseek-ai/DeepSeek-R1.json',
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/meta-llama/Meta-Llama-3.1-405B-Instruct_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/one_shot/Qwen/Qwen2.5-72B-Instruct_one_shot.json',  # knowledge-guided one-shot

        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-3.5-turbo_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-4o_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gemini-1.5-pro-002_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/claude-3-5-haiku-20241022_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-V3_one_shot.json',  # knowledge-guided one-shot
        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1.json',  # knowledge-guided one-shot
        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_one_shot.json'
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/meta-llama/Meta-Llama-3.1-405B-Instruct_one_shot.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/Qwen/Qwen2.5-72B-Instruct_one_shot.json',  # knowledge-guided one-shot

        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-3.5-turbo.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gpt-4o.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/gemini-1.5-pro-002.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/claude-3-5-haiku-20241022.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-V3.json',  # knowledge-guided one-shot
        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1.json',  # knowledge-guided one-shot
        # # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/deepseek-ai/DeepSeek-R1_one_shot.json'
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/meta-llama/Meta-Llama-3.1-405B-Instruct.json',  # knowledge-guided one-shot
        # 'F:/GeoLLM/output/Knowledge-guided_rerun/nomal_all/Qwen/Qwen2.5-72B-Instruct.json',  # knowledge-guided one-shot
    ]

    # Compare the models.
    print("各模型评估结果:")
    # Results of every evaluated file, keyed by file stem.
    # NOTE: files sharing a name in different directories overwrite each
    # other here; the text report below is keyed by the full path instead.
    all_results = {}

    for pred_path in model_paths:
        # Path.stem removes only the final extension, so dotted model names
        # such as 'gpt-3.5-turbo.json' are kept intact.
        model_name = Path(pred_path).stem
        print(f"\n{model_name}模型:")

        # Align predictions with the gold data, then score and print them.
        gold_graphs, pred_graphs = load_data(gold_path, pred_path)
        results = evaluate_triples(gold_graphs, pred_graphs)
        all_results[model_name] = results

        # Append the prediction path together with its scores to the report.
        triple = results['triple_match']
        bert = results['bert_score']
        with open(save_path, 'a', encoding='utf-8') as f:
            f.write(f"{pred_path}: \n")
            f.write("Triple Match: \n")
            f.write(f"精确率: {triple['precision']:.4f}, 召回率: {triple['recall']:.4f}, F1: {triple['f1']:.4f}\n")
            f.write("BERT Score: \n")
            f.write(f"- Precision: {bert['precision']:.4f}\n")
            f.write(f"- Recall: {bert['recall']:.4f}\n")
            f.write(f"- F1: {bert['f1']:.4f}\n\n")