DrSyedFaizan committed on
Commit
353668f
·
verified ·
1 Parent(s): 8d9a97c

Upload eval.py

Browse files
Files changed (1) hide show
  1. eval.py +198 -0
eval.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import openai
4
+ from gradio_client import Client
5
+
6
# Load the OpenAI API key from the local .env file (or the environment).
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # os.getenv returns None when the variable is missing; fail here with a
    # clear message instead of a TypeError from slicing None further down.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )
# Confirm the key was found without echoing any part of the secret to stdout.
print("Using OpenAI API Key: loaded from environment")
openai.api_key = api_key
11
+
12
+ # ---- STEP 1: Load First Aid Contextual Data ----
13
+ from langchain_community.document_loaders import ArxivLoader
14
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
15
+ from langchain_openai import OpenAIEmbeddings
16
+ from langchain_community.vectorstores import Chroma
17
+ from langchain.prompts import ChatPromptTemplate
18
+ from langchain_openai import ChatOpenAI
19
+ import wandb
20
+ import pandas as pd
21
+
22
# Pull up to five ArXiv papers matching the first-aid query; these form the
# retrieval corpus used to ground the reference answers.
first_aid_docs = ArxivLoader(
    query="first aid treatment",
    load_max_docs=5,
).load()

# Break the papers into small passages (chunk_size=250) for indexing.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250)
docs = text_splitter.split_documents(first_aid_docs)

# Embed the passages into a Chroma store and expose a retriever that is
# configured with k=2.
vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
32
+
33
# ---- Define First Aid Questions ----
# Fixed evaluation prompts; every later stage iterates over this list in order.
questions = [
    "What are the first aid measures for high fever in infants?",
    "What are the signs and symptoms of low blood sugar?",
    "What does RICE stand for in first aid treatment?",
    "What is the first aid treatment of bleeding?",
    "What is the first aid management of burns?",
    "What are the signs and symptoms of stroke?",
    "What is the treatment of snake bite?",
    "How do you provide first aid for choking?",
    "What are the immediate steps to treat a fainting patient?",
    "What are the First aid measures for taking care of a patient with insect stings and animal bites?",
]
46
+
47
# ---- STEP 2: Generate Ground Truth Responses using ChatGPT ----
# temperature=0 is passed to the model for the reference answers.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
prompt_template = """
Generate a detailed and accurate first-aid response based on the given context.

### CONTEXT
{context}

### QUESTION
{question}

### RESPONSE
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

ground_truth_responses = []
for question in questions:
    # Ground each reference answer in the passages retrieved for the question.
    retrieved_docs = retriever.invoke(question)
    context_text = "\n".join(doc.page_content for doc in retrieved_docs)
    generated_response = llm.invoke(
        prompt.format(context=context_text, question=question)
    )
    # The chat model returns a message object; str() on it would embed the
    # whole repr (content plus metadata) in the ground truth, so keep only the
    # answer text.
    ground_truth_responses.append(str(generated_response.content))
68
+
69
# ---- STEP 3: Fetch Responses from Deployed Chatbot ----
print("\n===== Fetching Responses from Chatbot =====")

client = Client("DrSyedFaizan/First_Aid_Assistant")
responses = []

for question in questions:
    try:
        # Each question is sent with an empty chat history.
        result = client.predict(chatbot=[], message=question, api_name="/respond")
        chat_history = result[1]
        # Take the first assistant entry, or a placeholder if there is none.
        chatbot_response = "[NO RESPONSE]"
        for entry in chat_history:
            if entry["role"] == "assistant":
                chatbot_response = entry["content"]
                break
    except Exception as e:
        # Best effort: record the failure as the answer and keep going.
        chatbot_response = f"[ERROR: {e}]"

    responses.append(str(chatbot_response))

qa_pairs = list(zip(questions, responses))

# Persist the question/answer pairs to a text file.
with open("bot_responses.txt", "w", encoding="utf-8") as f:
    f.writelines(f"Q: {q}\nA: {r}\n\n" for q, r in qa_pairs)

# Echo the same pairs to stdout for debugging.
for q, r in qa_pairs:
    print(f"Q: {q}\nA: {r}\n")
93
+
94
+ # ---- STEP 5: Evaluate Using RAGAS ----
95
+ from datasets import Dataset
96
+ import pandas as pd
97
+ from tqdm import tqdm
98
+ from ragas import evaluate
99
+ from ragas.metrics import (
100
+ answer_relevancy,
101
+ faithfulness,
102
+ context_recall,
103
+ answer_correctness,
104
+ answer_similarity
105
+ )
106
+
107
def create_ragas_dataset(eval_dataset):
    """Convert the evaluation dataset into the row layout handed to RAGAS.

    Args:
        eval_dataset: A ``datasets.Dataset`` with at least the columns
            ``question``, ``answer`` and ``ground_truth``.

    Returns:
        A ``datasets.Dataset`` with the columns ``question``, ``answer``,
        ``contexts``, ``ground_truths`` and ``reference``.
    """
    df = eval_dataset.to_pandas()
    rag_rows = [
        {
            "question": row["question"],
            "answer": row["answer"],
            # Placeholder: the chatbot's retrieved passages are not available
            # in this dataset.
            "contexts": ["First aid medical references"],
            # Kept as in the original layout; presumably the older list-valued
            # RAGAS column -- TODO confirm it is still needed.
            "ground_truths": [row["ground_truth"]],
            # The reference must be the ground-truth answer. Filling it from
            # the placeholder context column kept the generated ground truth
            # out of any metric that reads `reference`.
            "reference": row["ground_truth"],
        }
        for _, row in df.iterrows()
    ]
    return Dataset.from_pandas(pd.DataFrame(rag_rows))
123
+
124
def evaluate_ragas_dataset(ragas_dataset):
    """Score ``ragas_dataset`` with the RAGAS metrics used by this script.

    The metrics are faithfulness, answer relevancy, context recall, answer
    correctness and answer similarity.

    Args:
        ragas_dataset: The dataset passed straight through to ``evaluate``.

    Returns:
        Whatever ``ragas.evaluate`` returns for the dataset.

    Raises:
        Exception: Any error raised by ``evaluate`` is printed and re-raised.
    """
    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_correctness,
        answer_similarity,
    ]
    try:
        return evaluate(ragas_dataset, metrics=metrics)
    except Exception as e:
        print("⚠️ RAGAS Error:", e)
        # Bare raise re-raises the active exception with its original traceback.
        raise
141
+
142
# Assemble one row per question: the chatbot answer, the generated ground
# truth, and placeholder context/reference strings.
placeholder_text = "First aid medical references"
ground_truth_qac_set = pd.DataFrame(
    {
        "question": questions,
        "answer": responses,
        "context": [placeholder_text for _ in questions],
        "ground_truth": list(map(str, ground_truth_responses)),
        "reference": [placeholder_text for _ in questions],
    }
)

# Cast every column to str before building the Dataset.
eval_dataset = Dataset.from_pandas(ground_truth_qac_set.astype(str))

# Write both the raw evaluation set and its RAGAS-formatted version to CSV.
eval_dataset.to_csv("groundtruth_eval_dataset.csv")
basic_qa_ragas_dataset = create_ragas_dataset(eval_dataset)
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")

# Score the chatbot answers.
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

print("\n===== Evaluation Results =====")
print(basic_qa_result)

# Convert the result to a DataFrame for later logging.
evaluation_results = basic_qa_result.to_pandas()
165
+
166
# ---- STEP 6: Log Results to WandB ----

import wandb
import pandas as pd

# Convert both Datasets to DataFrames so they can be saved and logged as tables.
eval_df = eval_dataset.to_pandas()
ragas_df = basic_qa_ragas_dataset.to_pandas()

# Re-save the CSVs from the DataFrames, without an index column.
eval_df.to_csv("groundtruth_eval_dataset.csv", index=False)
ragas_df.to_csv("basic_qa_ragas_dataset.csv", index=False)

# Start the WandB run that will hold the evaluation tables.
wandb.init(
    project="first-aid-tutor",
    entity="drsyedfaizan1987-northeastern-university",
    name="ragas_evaluation",
    notes="Logging evaluation datasets for first-aid chatbot.",
    tags=["first-aid", "evaluation", "ragas"],
)

# Each table gets its own key. The scores table used to be logged under
# "basic_qa_ragas_dataset", the same key as the RAGAS input table logged below.
wandb.log({"evaluation_results": wandb.Table(dataframe=evaluation_results)})
wandb.log({"groundtruth_eval_dataset": wandb.Table(dataframe=eval_df)})
wandb.log({"basic_qa_ragas_dataset": wandb.Table(dataframe=ragas_df)})

# Close the run.
wandb.finish()