ActiveYixiao commited on
Commit
20d949d
·
verified ·
1 Parent(s): bd59e63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -120
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import logging
2
  import textwrap
3
- from typing import Literal, Optional, Tuple, Union
4
 
5
  import gradio as gr
6
  import outlines
@@ -20,6 +20,11 @@ from transformers import (
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
 
 
 
 
 
23
  AVAILABLE_MODELS = [
24
  "rshwndsz/ft-longformer-base-4096",
25
  "rshwndsz/ft-hermes-3-llama-3.2-3b",
@@ -34,18 +39,17 @@ AVAILABLE_MODELS = [
34
  ]
35
  DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
36
 
37
- DEVICE_MAP = "auto"
38
- QUANTIZATION_BITS = 4 # Changed from None to 4 for better compatibility
39
 
40
  SYSTEM_PROMPT = textwrap.dedent("""
41
  You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
42
  1. A story that was presented to participants as context
43
  2. The question that participants were asked to answer
44
  3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
45
- 4. A participant answer
 
46
  Your task is to grade each answer according to the grading scheme. For each answer, you should:
47
  1. Carefully read and understand the answer and compare it to the grading criteria
48
- 2. Assign a score 1 or 0 for each answer.
49
  """).strip()
50
 
51
  PROMPT_TEMPLATE = textwrap.dedent("""
@@ -69,9 +73,9 @@ class ResponseModel(BaseModel):
69
  score: Literal["0", "1"]
70
 
71
 
72
- def get_model_and_tokenizer(
73
  model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
74
- ) -> Tuple[Union[AutoModelForCausalLM, AutoModelForSequenceClassification], AutoTokenizer]:
75
  if quantization_bits == 4:
76
  quantization_config = BitsAndBytesConfig(
77
  load_in_4bit=True,
@@ -85,14 +89,9 @@ def get_model_and_tokenizer(
85
  quantization_config = None
86
 
87
  if "longformer" in model_id:
88
- model = AutoModelForSequenceClassification.from_pretrained(
89
- model_id,
90
- device_map=device_map,
91
- quantization_config=quantization_config # Added quantization for consistency
92
- )
93
- tokenizer = AutoTokenizer.from_pretrained(model_id)
94
- tokenizer.pad_token = tokenizer.eos_token # Add padding token
95
- return model, tokenizer
96
 
97
  peft_config = PeftConfig.from_pretrained(model_id)
98
  base_model_id = peft_config.base_model_name_or_path
@@ -102,13 +101,13 @@ def get_model_and_tokenizer(
102
  device_map=device_map,
103
  quantization_config=quantization_config,
104
  )
105
- model = PeftModel.from_pretrained(base_model, model_id)
106
- tokenizer = AutoTokenizer.from_pretrained(
107
  base_model_id, use_fast=True, clean_up_tokenization_spaces=True
108
  )
109
- tokenizer.pad_token = tokenizer.eos_token # Ensure padding token is set
110
 
111
- return model, tokenizer
 
112
 
113
 
114
  def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
@@ -126,121 +125,107 @@ def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -
126
  def label_single_response_with_model(model_id, story, question, criteria, response):
127
  prompt = format_prompt(story, question, criteria, response)
128
 
129
- try:
130
- model, tokenizer = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)
131
-
132
- if "longformer" in model_id:
133
- # Process with Longformer
134
- inputs = tokenizer(
135
- prompt,
136
- return_tensors="pt",
137
- truncation=True,
138
- padding=True,
139
- max_length=4096
140
- )
141
- with torch.no_grad():
142
- logits = model(**inputs).logits
143
-
144
- if logits.shape[1] == 1:
145
- # Regression-style
146
- score = int(torch.sigmoid(logits).item() > 0.5)
147
- else:
148
- # Classification-style
149
- score = torch.argmax(logits, dim=1).item()
150
- return str(score)
151
- else:
152
- # Process with other models using outlines
153
- outlines_model = outlines.from_transformers(model, tokenizer)
154
- generator = Generator(outlines_model, ResponseModel)
155
  result = generator(prompt)
156
- return result.score
157
- except Exception as e:
158
- logger.error(f"Error processing request: {str(e)}")
159
- return f"Error: {str(e)}"
160
 
161
 
162
  @spaces.GPU
163
- def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
164
- try:
165
- df = pd.read_csv(response_file.name)
166
- assert "response" in df.columns, "CSV must contain a 'response' column."
167
-
168
- model, tokenizer = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)
169
-
170
- if "longformer" in model_id:
171
- # Process with Longformer
172
- prompts = [
173
- format_prompt(story, question, criteria, resp)
174
- for resp in df["response"]
175
- ]
176
- inputs = tokenizer(
177
- prompts,
178
- return_tensors="pt",
179
- truncation=True,
180
- padding=True,
181
- max_length=4096
182
- )
183
- with torch.no_grad():
184
- logits = model(**inputs).logits
185
-
186
- if logits.shape[1] == 1:
187
- scores = [str(int(torch.sigmoid(l) > 0.5)) for l in logits]
188
- else:
189
- scores = [str(cls) for cls in torch.argmax(logits, dim=1).tolist()]
190
- else:
191
- # Process with other models
192
- outlines_model = outlines.from_transformers(model, tokenizer)
193
- generator = Generator(outlines_model, ResponseModel)
194
- scores = []
195
- for resp in df["response"]:
196
- prompt = format_prompt(story, question, criteria, resp)
197
- result = generator(prompt)
198
- scores.append(result.score)
199
-
200
- df["score"] = scores
201
- return df
202
- except Exception as e:
203
- logger.error(f"Error processing batch: {str(e)}")
204
- return pd.DataFrame({"error": [str(e)]})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
 
207
  with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
208
  model_selector = gr.Dropdown(
209
  label="Select Model",
210
  choices=AVAILABLE_MODELS,
211
- value=DEFAULT_MODEL_ID,
 
 
 
 
 
 
 
 
212
  )
213
 
214
  with gr.Tabs():
215
  with gr.Tab("Single Response"):
216
- gr.Interface(
217
- fn=label_single_response_with_model,
218
- inputs=[
219
- model_selector,
220
- gr.Textbox(label="Story", lines=6),
221
- gr.Textbox(label="Question", lines=2),
222
- gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
223
- gr.Textbox(label="Single Response", lines=3),
224
- ],
225
- outputs=gr.Textbox(label="Score"),
226
- live=False,
227
- )
228
  with gr.Tab("Batch (CSV)"):
229
- gr.Interface(
230
- fn=label_multi_responses_with_model,
231
- inputs=[
232
- model_selector,
233
- gr.Textbox(label="Story", lines=6),
234
- gr.Textbox(label="Question", lines=2),
235
- gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
236
- gr.File(
237
- label="Responses CSV (.csv with 'response' column)",
238
- file_types=[".csv"]
239
- ),
240
- ],
241
- outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
242
- live=False,
243
- )
244
 
245
  if __name__ == "__main__":
246
- iface.launch(share=True)
 
1
  import logging
2
  import textwrap
3
+ from typing import Literal, Optional
4
 
5
  import gradio as gr
6
  import outlines
 
20
  logging.basicConfig(level=logging.INFO)
21
  logger = logging.getLogger(__name__)
22
 
23
MODEL_ID = "rshwndsz/ft-longformer-base-4096"  # NOTE(review): appears unused below — handlers receive model_id from the UI; confirm before removing
DEVICE_MAP = "auto"  # let transformers/accelerate choose device placement
QUANTIZATION_BITS = None  # None disables bitsandbytes quantization; pass 4 to enable 4-bit loading
TEMPERATURE = 0.0  # NOTE(review): not referenced in the visible code — TODO confirm
27
+
28
  AVAILABLE_MODELS = [
29
  "rshwndsz/ft-longformer-base-4096",
30
  "rshwndsz/ft-hermes-3-llama-3.2-3b",
 
39
  ]
40
  DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
41
 
 
 
42
 
43
# System prompt shared by both grading paths. Fix: final instruction read
# "2. Assigning an score 1 or 0 for each answer." — restore the grammatical
# form used by the previous revision ("Assign a score ...").
SYSTEM_PROMPT = textwrap.dedent("""
    You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
    1. A story that was presented to participants as context
    2. The question that participants were asked to answer
    3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
    4. Grading examples
    5. A participant answer
    Your task is to grade each answer according to the grading scheme. For each answer, you should:
    1. Carefully read and understand the answer and compare it to the grading criteria
    2. Assign a score of 1 or 0 for each answer.
""").strip()
54
 
55
  PROMPT_TEMPLATE = textwrap.dedent("""
 
73
  score: Literal["0", "1"]
74
 
75
 
76
+ def get_outlines_model(
77
  model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
78
+ ):
79
  if quantization_bits == 4:
80
  quantization_config = BitsAndBytesConfig(
81
  load_in_4bit=True,
 
89
  quantization_config = None
90
 
91
  if "longformer" in model_id:
92
+ hf_model = AutoModelForSequenceClassification.from_pretrained(model_id)
93
+ hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
94
+ return hf_model, hf_tokenizer
 
 
 
 
 
95
 
96
  peft_config = PeftConfig.from_pretrained(model_id)
97
  base_model_id = peft_config.base_model_name_or_path
 
101
  device_map=device_map,
102
  quantization_config=quantization_config,
103
  )
104
+ hf_model = PeftModel.from_pretrained(base_model, model_id)
105
+ hf_tokenizer = AutoTokenizer.from_pretrained(
106
  base_model_id, use_fast=True, clean_up_tokenization_spaces=True
107
  )
 
108
 
109
+ model = outlines.from_transformers(hf_model, hf_tokenizer)
110
+ return model
111
 
112
 
113
  def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
 
125
def label_single_response_with_model(model_id, story, question, criteria, response):
    """Grade one participant response with the selected model.

    Returns the score as a string ("0" or "1") so both model families feed
    the same Gradio Textbox output.
    """
    prompt = format_prompt(story, question, criteria, response)

    if "longformer" in model_id:
        # Sequence-classification path: for longformer checkpoints the loader
        # returns an (hf_model, hf_tokenizer) pair.
        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return str(predicted_class)
    else:
        # Generative path. Fix: pass ResponseModel so decoding is constrained
        # to the schema and `result.score` exists — `Generator(model)` alone
        # yields unstructured text and `.score` would fail. (This matches the
        # previous revision's Generator(outlines_model, ResponseModel).)
        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        generator = Generator(model, ResponseModel)
        with torch.no_grad():
            result = generator(prompt)
        return result.score
 
 
 
141
 
142
 
143
@spaces.GPU
def label_multi_responses_with_model(
    model_id, story, question, criteria, response_file
):
    """Grade every row of an uploaded CSV of responses.

    The CSV must contain a 'response' column. Returns the DataFrame with an
    added 'score' column of "0"/"1" strings.
    """
    df = pd.read_csv(response_file.name)
    if "response" not in df.columns:
        # Raise instead of assert: asserts are stripped under `python -O`,
        # and this is user-input validation, not an internal invariant.
        raise ValueError("CSV must contain a 'response' column.")
    prompts = [
        format_prompt(story, question, criteria, resp) for resp in df["response"]
    ]

    if "longformer" in model_id:
        # Classification path: batch-tokenize all prompts and argmax logits.
        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        scores = [str(cls) for cls in torch.argmax(logits, dim=1).tolist()]
    else:
        # Generative path. Fix: pass ResponseModel so each result exposes
        # `.score` — a bare Generator(model) emits unstructured text.
        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        generator = Generator(model, ResponseModel)
        with torch.no_grad():
            results = generator(prompts)
        scores = [r.score for r in results]

    df["score"] = scores
    return df
169
+
170
+
171
def single_response_ui(model_id):
    """Build the single-response grading tab.

    `model_id` is the Gradio component (State/Dropdown) holding the current
    model selection. Fix: it is wired as a handler *input* so Gradio supplies
    its live per-session value on every call — the previous closure read
    `model_id.value`, which is the value captured at app-build time, so the
    user's dropdown choice was silently ignored.
    """
    return gr.Interface(
        fn=label_single_response_with_model,
        inputs=[
            model_id,
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.Textbox(label="Single Response", lines=3),
        ],
        outputs=gr.Textbox(label="Score"),
        live=False,
    )
185
+
186
+
187
def multi_response_ui(model_id):
    """Build the batch (CSV) grading tab.

    Fix: the model-id component is passed as a handler input so Gradio
    resolves its current value per call; the previous multi-line lambda read
    the stale `model_id.value` captured when the UI was constructed.
    """
    return gr.Interface(
        fn=label_multi_responses_with_model,
        inputs=[
            model_id,
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.File(
                label="Responses CSV (.csv with 'response' column)", file_types=[".csv"]
            ),
        ],
        outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
        live=False,
    )
206
 
207
 
208
# Top-level Gradio app: one model selector shared by the two grading tabs.
with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
    model_selector = gr.Dropdown(
        label="Select Model",
        choices=AVAILABLE_MODELS,
        value=DEFAULT_MODEL_ID,  # use the named constant rather than re-deriving AVAILABLE_MODELS[0]
    )
    # Mirror the dropdown into a session-scoped State so the tab builders can
    # reference a single value holder.
    selected_model_id = gr.State(value=DEFAULT_MODEL_ID)

    def update_model_id(choice):
        # Identity handler: copies the dropdown selection into the State.
        return choice

    model_selector.change(
        fn=update_model_id, inputs=model_selector, outputs=selected_model_id
    )

    with gr.Tabs():
        with gr.Tab("Single Response"):
            single_response_ui(selected_model_id)
        with gr.Tab("Batch (CSV)"):
            multi_response_ui(selected_model_id)


if __name__ == "__main__":
    # share=True opens a public tunnel; harmless (ignored) on HF Spaces.
    iface.launch(share=True)