from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, pipeline
# HuggingFacePipeline lives in langchain_community in current LangChain releases
from langchain_community.llms import HuggingFacePipeline
import os
import torch

def load_model_and_pipeline(model_info, quantization=4, is_t5=False, use_local=True):
    # Check if the model is local or should be downloaded from Hugging Face
    # if use_local:
    #     path = f"models/{model_info}"
    #     if not os.path.exists(path):
    #         print(f"Local model not found at {path}. Downloading from Hugging Face...")
    #         use_local = False  # Fallback to Hugging Face download if local not found
    # if not use_local:
    #     # Replace model_info with the corresponding Hugging Face repo name
    #     hf_model_map = {
    #         "zephyr-7b-beta": "HuggingFaceH4/zephyr-7b-beta",
    #         "llama-3-8b": "NousResearch/Meta-Llama-3-8B",
    #         "mistral-7b": "unsloth/mistral-7b-instruct-v0.3",
    #         "phi-3-mini": "microsoft/Phi-3-mini-4k-instruct",
    #         "flan-t5-base": "google/flan-t5-base"
    #     }
    #     path = hf_model_map.get(model_info.split("_")[1], model_info)

    if is_t5:
        # Seq2seq checkpoints such as Flan-T5 cannot be loaded as causal LMs;
        # use the T5 classes and the matching text2text-generation task.
        tokenizer = T5Tokenizer.from_pretrained(model_info)
        model = T5ForConditionalGeneration.from_pretrained(model_info, device_map='auto')
        task = "text2text-generation"
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_info, token=True)
        # quantization may arrive as an int or a string, so normalise it:
        # 8 / "8" selects 8-bit loading, anything else falls back to 4-bit.
        if str(quantization) == "8":
            model = AutoModelForCausalLM.from_pretrained(
                model_info,
                device_map='auto',
                torch_dtype=torch.float16,
                token=True,
                load_in_8bit=True
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_info,
                device_map='auto',
                torch_dtype=torch.float16,
                token=True,
                load_in_4bit=True
            )
        task = "text-generation"

    pipe = pipeline(
        task,
        model=model,
        tokenizer=tokenizer,
        # The model is already dispatched by device_map='auto' and carries its
        # own dtype, so neither is repeated here.
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )

    # Sampling behaviour is configured on the pipeline itself; model_kwargs only
    # applies when HuggingFacePipeline loads a model on its own.
    llm = HuggingFacePipeline(pipeline=pipe)
    return tokenizer, model, llm

def zephyr_model(model_info, quantization, use_local=True):
    return load_model_and_pipeline(model_info, quantization, use_local=use_local)

def llama_model(model_info, quantization, use_local=True):
    return load_model_and_pipeline(model_info, quantization, use_local=use_local)

def mistral_model(model_info, quantization, use_local=True):
    return load_model_and_pipeline(model_info, quantization, use_local=use_local)

def phi_model(model_info, quantization, use_local=True):
    return load_model_and_pipeline(model_info, quantization, use_local=use_local)

def flant5_model(model_info, use_local=True):
    return load_model_and_pipeline(model_info, is_t5=True, use_local=use_local)
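
# Example usage (sketch): every wrapper above forwards to load_model_and_pipeline,
# so loading an 8-bit Mistral model looks like this (repo name taken from the
# commented hf_model_map; assumes a CUDA device and bitsandbytes installed):
#
#     tokenizer, model, llm = mistral_model("unsloth/mistral-7b-instruct-v0.3", quantization=8)
#     print(llm.invoke("Summarise retrieval-augmented generation in one sentence."))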


import pandas as pd
from datasets import Dataset

def calculate_rag_metrics(model_ques_ans_gen, llm_model, embedding_model="BAAI/bge-base-en-v1.5"):
    from ragas import evaluate
    from ragas.metrics import (
        faithfulness, answer_correctness, answer_similarity,
        answer_relevancy, context_recall, context_precision
    )
    from langchain_community.embeddings import HuggingFaceEmbeddings

    # Build a column-wise dictionary from the model_ques_ans_gen records
    data_samples = {
        'question': [item['question'] for item in model_ques_ans_gen],
        'answer': [item['answer'] for item in model_ques_ans_gen],
        'contexts': [item['contexts'] for item in model_ques_ans_gen],
        'ground_truths': [item['ground_truths'] for item in model_ques_ans_gen]
    }

    # Convert the dictionary to a pandas DataFrame
    rag_df = pd.DataFrame(data_samples)

    # Convert the DataFrame to a HuggingFace Dataset
    rag_eval_dataset = Dataset.from_pandas(rag_df)

    # ragas expects metric objects (imported above), not metric-name strings
    metrics = [
        answer_correctness, answer_similarity,
        answer_relevancy, faithfulness,
        context_recall, context_precision
    ]

    # Perform the evaluation using the provided LLM; ragas expects an embeddings
    # object rather than a bare model name, so wrap it with LangChain's
    # HuggingFaceEmbeddings (requires sentence-transformers).
    result = evaluate(
        rag_eval_dataset,
        metrics=metrics,
        llm=llm_model,
        embeddings=HuggingFaceEmbeddings(model_name=embedding_model)
    )
    # Return the per-sample scores as a pandas DataFrame
    return result.to_pandas()
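

# Illustrative end-to-end sketch (hypothetical sample record; assumes GPU access
# and a Hugging Face token for gated checkpoints):
if __name__ == "__main__":
    sample_records = [{
        'question': "What does RAG stand for?",
        'answer': "Retrieval-augmented generation.",
        'contexts': ["RAG, retrieval-augmented generation, grounds answers in retrieved documents."],
        'ground_truths': ["Retrieval-augmented generation."],
    }]
    _, _, zephyr_llm = zephyr_model("HuggingFaceH4/zephyr-7b-beta", quantization=4)
    scores_df = calculate_rag_metrics(sample_records, zephyr_llm)
    print(scores_df)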