pgurazada1 commited on
Commit
06f03c5
·
verified ·
1 Parent(s): 871be79

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# FIX: gradio was used below (gr.Textbox, gr.Interface) but never imported,
# which raises NameError at startup.
import gradio as gr
from openai import OpenAI

# Anyscale Endpoints exposes an OpenAI-compatible API, so the stock OpenAI
# client works against it with a custom base_url.
# Raises KeyError at import time if ANYSCALE_API_KEY is not set — fail fast
# rather than at the first request.
client = OpenAI(
    base_url="https://api.endpoints.anyscale.com/v1",
    api_key=os.environ['ANYSCALE_API_KEY']
)

# Judge model used for both groundedness and relevance ratings.
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
11
+
12
# System prompt for the LLM-as-judge groundedness rating (1-5 scale).
# FIX: corrected the typo "evaluaton" -> "evaluation" in step 4.
groundedness_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
The answer should be derived only from the information presented in the context

Instructions:
1. First write down the steps that are needed to evaluate the answer as per the metric.
2. Give a step-by-step explanation if the answer adheres to the metric considering the question and context as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the answer using the evaluation criteria and assign a score.
"""
34
+
35
# System prompt for the LLM-as-judge relevance rating (1-5 scale).
# FIX: the Instructions section said to evaluate/rate "the context", a
# copy-paste leftover — the metric and the task rate the *answer*.
# Also corrected the typo "evaluaton" -> "evaluation" in step 4.
relevance_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
Relevance measures how well the answer addresses the main aspects of the question, based on the context.
Consider whether all and only the important aspects are contained in the answer when evaluating relevance.

Instructions:
1. First write down the steps that are needed to evaluate the answer as per the metric.
2. Give a step-by-step explanation if the answer adheres to the metric considering the question and context as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the answer using the evaluation criteria and assign a score.
"""
58
+
59
# User-turn template: each section is tagged (###Question / ###Context /
# ###Answer) so the rater system prompts can identify the parts.
user_message_template = (
    "\n"
    "###Question\n"
    "{question}\n"
    "\n"
    "###Context\n"
    "{context}\n"
    "\n"
    "###Answer\n"
    "{answer}\n"
)
69
+
70
def predict(rag_question, rag_context, rag_answer):
    """Rate a RAG answer for groundedness and relevance with an LLM judge.

    Parameters
    ----------
    rag_question : str
        The user's original question.
    rag_context : str
        The retrieved context the RAG system used.
    rag_answer : str
        The RAG system's generated answer.

    Returns
    -------
    str
        The groundedness rating and the relevance rating separated by a
        '---' line, or an error description if a rating request fails.
    """
    # Both raters receive the same tagged user message.
    user_message = user_message_template.format(
        question=rag_question,
        context=rag_context,
        answer=rag_answer
    )

    def _rate(system_message):
        # One chat completion per metric; temperature=0 for reproducible ratings.
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {'role': 'system', 'content': system_message},
                {'role': 'user', 'content': user_message}
            ],
            temperature=0
        )
        return response.choices[0].message.content

    try:
        groundedness_prediction = _rate(groundedness_rater_system_message)
        relevance_prediction = _rate(relevance_rater_system_message)
    except Exception as e:
        # FIX: the original assigned `prediction = e` and then fell through to
        # a return that referenced the (possibly unbound) prediction variables,
        # raising NameError and hiding the real error. Surface it instead.
        return f"Error while rating: {e}"

    return groundedness_prediction + '\n' + '---' + '\n' + relevance_prediction
113
+
114
# Input widgets for the three pieces of a RAG interaction.
rag_question = gr.Textbox(placeholder="Enter your query here", lines=6)
rag_context = gr.Textbox(placeholder="Enter the retrieved context here", lines=6)
rag_answer = gr.Textbox(placeholder="Enter the LLM response here", lines=6)

demo = gr.Interface(
    fn=predict,
    inputs=[rag_question, rag_context, rag_answer],
    outputs="text",
    title="Evaluate RAG output for groundedness and relevance",
    description="This web API presents an interface to evaluate RAG output for groundedness and relevance",
    # FIX: examples must be a list of examples, each a flat list with one
    # value per input component. The original nested the three values inside
    # an extra list and appended a stray "5" (an expected output does not
    # belong in `examples`), giving 2 fields for a 3-input interface.
    examples=[[
        "What was the increase in annual revenue in 2022 compared to 2021?",
        "Here are some documents that are relevant to the question mentioned below. In 2022, we recognized total revenues of $81.46 billion, respectively, representing an increase of $27.64 billion, compared to the prior year. We continue to ramp production, build new manufacturing capacity and expand our operations to enable increased deliveries and deployments of our products and further revenue growth.",
        "$27.64 billion."
    ]],
    concurrency_limit=16
)
129
+
130
+ demo.queue()
131
+ demo.launch(auth=("demouser", ""))