ActiveYixiao commited on
Commit
5e48cc5
·
verified ·
1 Parent(s): e358772

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -29
app.py CHANGED
@@ -5,12 +5,17 @@ from typing import Literal, Optional
5
  import gradio as gr
6
  import outlines
7
  import pandas as pd
 
8
  import torch
9
  from outlines import Generator
10
  from peft import PeftConfig, PeftModel
11
  from pydantic import BaseModel, ConfigDict
12
- from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
13
- import spaces
 
 
 
 
14
 
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
@@ -20,24 +25,29 @@ DEVICE_MAP = "auto"
20
  QUANTIZATION_BITS = None
21
  TEMPERATURE = 0.0
22
 
23
- AVAILABLE_MODELS = {
24
- "Longformer": "rshwndsz/ft-longformer-base-4096",
25
- "Llama 3.2 3B [Paraphrased]": "rshwndsz/ft_paraphrased-hermes-3-llama-3.2-3b"
26
- }
27
- DEFAULT_MODEL_ID = list(AVAILABLE_MODELS.values())[0]
 
 
 
 
 
 
 
 
28
 
29
 
30
  SYSTEM_PROMPT = textwrap.dedent("""
31
  You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
32
-
33
  1. A story that was presented to participants as context
34
  2. The question that participants were asked to answer
35
  3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
36
  4. Grading examples
37
  5. A participant answer
38
-
39
  Your task is to grade each answer according to the grading scheme. For each answer, you should:
40
-
41
  1. Carefully read and understand the answer and compare it to the grading criteria
42
  2. Assigning an score 1 or 0 for each answer.
43
  """).strip()
@@ -46,19 +56,15 @@ PROMPT_TEMPLATE = textwrap.dedent("""
46
  <Story>
47
  {story}
48
  </Story>
49
-
50
  <Question>
51
  {question}
52
  </Question>
53
-
54
  <GradingScheme>
55
  {grading_scheme}
56
  </GradingScheme>
57
-
58
  <Answer>
59
  {answer}
60
  </Answer>
61
-
62
  Score:""").strip()
63
 
64
 
@@ -67,7 +73,9 @@ class ResponseModel(BaseModel):
67
  score: Literal["0", "1"]
68
 
69
 
70
- def get_outlines_model(model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4):
 
 
71
  if quantization_bits == 4:
72
  quantization_config = BitsAndBytesConfig(
73
  load_in_4bit=True,
@@ -94,7 +102,9 @@ def get_outlines_model(model_id: str, device_map: str = "auto", quantization_bit
94
  quantization_config=quantization_config,
95
  )
96
  hf_model = PeftModel.from_pretrained(base_model, model_id)
97
- hf_tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True, clean_up_tokenization_spaces=True)
 
 
98
 
99
  model = outlines.from_transformers(hf_model, hf_tokenizer)
100
  return model
@@ -129,11 +139,16 @@ def label_single_response_with_model(model_id, story, question, criteria, respon
129
  result = generator(prompt)
130
  return result.score
131
 
 
132
  @spaces.GPU
133
- def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
 
 
134
  df = pd.read_csv(response_file.name)
135
  assert "response" in df.columns, "CSV must contain a 'response' column."
136
- prompts = [format_prompt(story, question, criteria, resp) for resp in df["response"]]
 
 
137
 
138
  if "longformer" in model_id:
139
  model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
@@ -168,38 +183,49 @@ def single_response_ui(model_id):
168
  live=False,
169
  )
170
 
 
171
  def multi_response_ui(model_id):
172
  return gr.Interface(
173
- fn=lambda story, question, criteria, response_file: label_multi_responses_with_model(
 
 
 
174
  model_id.value, story, question, criteria, response_file
175
  ),
176
  inputs=[
177
  gr.Textbox(label="Story", lines=6),
178
  gr.Textbox(label="Question", lines=2),
179
  gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
180
- gr.File(label="Responses CSV (.csv with 'response' column)", file_types=[".csv"]),
 
 
181
  ],
182
  outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
183
  live=False,
184
  )
185
 
 
186
  with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
187
  model_selector = gr.Dropdown(
188
  label="Select Model",
189
- choices=list(AVAILABLE_MODELS.keys()),
190
- value=list(AVAILABLE_MODELS.keys())[0],
191
  )
192
  selected_model_id = gr.State(value=DEFAULT_MODEL_ID)
193
 
194
  def update_model_id(choice):
195
- return AVAILABLE_MODELS[choice]
 
 
 
 
196
 
197
- model_selector.change(fn=update_model_id, inputs=model_selector, outputs=selected_model_id)
 
 
 
 
198
 
199
- gr.TabbedInterface(
200
- [single_response_ui(selected_model_id), multi_response_ui(selected_model_id)],
201
- ["Single Response", "Batch (CSV)"],
202
- ).render()
203
 
204
  if __name__ == "__main__":
205
- iface.launch()
 
5
  import gradio as gr
6
  import outlines
7
  import pandas as pd
8
+ import spaces
9
  import torch
10
  from outlines import Generator
11
  from peft import PeftConfig, PeftModel
12
  from pydantic import BaseModel, ConfigDict
13
+ from transformers import (
14
+ AutoModelForCausalLM,
15
+ AutoModelForSequenceClassification,
16
+ AutoTokenizer,
17
+ BitsAndBytesConfig,
18
+ )
19
 
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
 
25
  QUANTIZATION_BITS = None
26
  TEMPERATURE = 0.0
27
 
28
# Fine-tuned grader checkpoints offered in the model dropdown.
# NOTE(review): the "ft_paraphrased-*" names suggest variants fine-tuned on
# paraphrased data — confirm against the model cards.
AVAILABLE_MODELS = [
    "rshwndsz/ft-longformer-base-4096",
    "rshwndsz/ft-hermes-3-llama-3.2-3b",
    "rshwndsz/ft-phi-3.5-mini-instruct",
    "rshwndsz/ft-mistral-7b-v0.3-instruct",
    "rshwndsz/ft-phi-4",
    "rshwndsz/ft_paraphrased-hermes-3-llama-3.2-3b",
    "rshwndsz/ft_paraphrased-longformer-base-4096",
    "rshwndsz/ft_paraphrased-phi-3.5-mini-instruct",
    "rshwndsz/ft_paraphrased-mistral-7b-v0.3-instruct",
    "rshwndsz/ft_paraphrased-phi-4",
]
# Model used until the user picks another entry from the dropdown.
DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
41
 
42
 
43
# System instructions for the grader model: score each participant answer
# 0 or 1 against the provided story, question, and grading scheme.
# Fix: the final instruction read "2. Assigning an score 1 or 0 for each
# answer." — ungrammatical prompt text corrected.
SYSTEM_PROMPT = textwrap.dedent("""
You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
1. A story that was presented to participants as context
2. The question that participants were asked to answer
3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
4. Grading examples
5. A participant answer
Your task is to grade each answer according to the grading scheme. For each answer, you should:
1. Carefully read and understand the answer and compare it to the grading criteria
2. Assign a score of 1 or 0 for each answer.
""").strip()
 
56
  <Story>
57
  {story}
58
  </Story>
 
59
  <Question>
60
  {question}
61
  </Question>
 
62
  <GradingScheme>
63
  {grading_scheme}
64
  </GradingScheme>
 
65
  <Answer>
66
  {answer}
67
  </Answer>
 
68
  Score:""").strip()
69
 
70
 
 
73
  score: Literal["0", "1"]
74
 
75
 
76
+ def get_outlines_model(
77
+ model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
78
+ ):
79
  if quantization_bits == 4:
80
  quantization_config = BitsAndBytesConfig(
81
  load_in_4bit=True,
 
102
  quantization_config=quantization_config,
103
  )
104
  hf_model = PeftModel.from_pretrained(base_model, model_id)
105
+ hf_tokenizer = AutoTokenizer.from_pretrained(
106
+ base_model_id, use_fast=True, clean_up_tokenization_spaces=True
107
+ )
108
 
109
  model = outlines.from_transformers(hf_model, hf_tokenizer)
110
  return model
 
139
  result = generator(prompt)
140
  return result.score
141
 
142
+
143
  @spaces.GPU
144
+ def label_multi_responses_with_model(
145
+ model_id, story, question, criteria, response_file
146
+ ):
147
  df = pd.read_csv(response_file.name)
148
  assert "response" in df.columns, "CSV must contain a 'response' column."
149
+ prompts = [
150
+ format_prompt(story, question, criteria, resp) for resp in df["response"]
151
+ ]
152
 
153
  if "longformer" in model_id:
154
  model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
 
183
  live=False,
184
  )
185
 
186
+
187
def multi_response_ui(model_id):
    """Build the batch-grading tab: story/question/criteria inputs plus a CSV of responses."""

    def _grade_batch(story, question, criteria, response_file):
        # NOTE(review): model_id is the gr.State object captured at build time;
        # confirm reading .value here tracks later dropdown changes as intended.
        return label_multi_responses_with_model(
            model_id.value, story, question, criteria, response_file
        )

    return gr.Interface(
        fn=_grade_batch,
        inputs=[
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.File(
                label="Responses CSV (.csv with 'response' column)",
                file_types=[".csv"],
            ),
        ],
        outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
        live=False,
    )
206
 
207
+
208
# Top-level UI: a model dropdown shared by two tabs (single answer / CSV batch).
with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
    model_selector = gr.Dropdown(
        label="Select Model",
        choices=AVAILABLE_MODELS,
        value=DEFAULT_MODEL_ID,
    )
    # State holding the currently active model id, consumed by both tab builders.
    selected_model_id = gr.State(value=DEFAULT_MODEL_ID)

    # Mirror every dropdown selection into the State component.
    model_selector.change(
        fn=lambda choice: choice,
        inputs=model_selector,
        outputs=selected_model_id,
    )

    with gr.Tabs():
        with gr.Tab("Single Response"):
            single_response_ui(selected_model_id)
        with gr.Tab("Batch (CSV)"):
            multi_response_ui(selected_model_id)
228
 
 
 
 
 
229
 
230
# Script entry point: serve the Gradio app; share=True also exposes a public
# tunnel URL in addition to the local server.
if __name__ == "__main__":
    iface.launch(share=True)