Spaces:
Sleeping
Sleeping
| import logging | |
| import textwrap | |
| from typing import Literal, Optional, Tuple, Union | |
| import gradio as gr | |
| import outlines | |
| import pandas as pd | |
| import spaces | |
| import torch | |
| from outlines import Generator | |
| from peft import PeftConfig, PeftModel | |
| from pydantic import BaseModel, ConfigDict | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoModelForSequenceClassification, | |
| AutoTokenizer, | |
| BitsAndBytesConfig, | |
| ) | |
# Module-level logger used by the grading functions to report failures.
logger = logging.getLogger(__name__)

# Configure the root logger so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)
# Hugging Face Hub repositories offered in the model dropdown. Identifiers
# containing "longformer" are loaded as sequence classifiers; all others are
# loaded as PEFT adapters on top of a causal language model.
AVAILABLE_MODELS = [
    # Fine-tuned on the original data.
    "rshwndsz/ft-longformer-base-4096",
    "rshwndsz/ft-hermes-3-llama-3.2-3b",
    "rshwndsz/ft-phi-3.5-mini-instruct",
    "rshwndsz/ft-mistral-7b-v0.3-instruct",
    "rshwndsz/ft-phi-4",
    # "ft_paraphrased" variants.
    "rshwndsz/ft_paraphrased-hermes-3-llama-3.2-3b",
    "rshwndsz/ft_paraphrased-longformer-base-4096",
    "rshwndsz/ft_paraphrased-phi-3.5-mini-instruct",
    "rshwndsz/ft_paraphrased-mistral-7b-v0.3-instruct",
    "rshwndsz/ft_paraphrased-phi-4",
]

# The first entry is preselected in the UI.
DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]

# Passed straight through to `from_pretrained(device_map=...)`.
DEVICE_MAP = "auto"

# 4 or 8 selects bitsandbytes quantization at load time; any other value
# (including None) loads the weights unquantized.
QUANTIZATION_BITS = 4
# Instruction text prepended to every grading prompt. The wording is part of
# the model input and must not be edited casually.
SYSTEM_PROMPT = "\n".join(
    (
        "You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:",
        "1. A story that was presented to participants as context",
        "2. The question that participants were asked to answer",
        "3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)",
        "4. A participant answer",
        "Your task is to grade each answer according to the grading scheme. For each answer, you should:",
        "1. Carefully read and understand the answer and compare it to the grading criteria",
        "2. Assign a score 1 or 0 for each answer.",
    )
)
# Per-item prompt body. The placeholders {story}, {question},
# {grading_scheme} and {answer} are filled in with `str.format`; the text
# ends with "Score:" so the model's continuation is the grade.
PROMPT_TEMPLATE = "\n".join(
    (
        "<Story>",
        "{story}",
        "</Story>",
        "<Question>",
        "{question}",
        "</Question>",
        "<GradingScheme>",
        "{grading_scheme}",
        "</GradingScheme>",
        "<Answer>",
        "{answer}",
        "</Answer>",
        "Score:",
    )
)
class ResponseModel(BaseModel):
    """Schema handed to the structured generator: a single binary score.

    ``score`` only admits the strings ``"0"`` and ``"1"``, and
    ``extra="forbid"`` rejects any additional keys.
    """

    # The grade, kept as a string literal rather than an int.
    score: Literal["0", "1"]

    model_config = ConfigDict(extra="forbid")
def _build_quantization_config(quantization_bits: Optional[int]) -> Optional[BitsAndBytesConfig]:
    """Map a bit width (4 or 8) to a bitsandbytes config; anything else -> None."""
    if quantization_bits == 4:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    if quantization_bits == 8:
        return BitsAndBytesConfig(load_in_8bit=True)
    # Unsupported widths and None both mean "load unquantized".
    return None


def _ensure_pad_token(tokenizer):
    """Fall back to the EOS token for padding only when no pad token is set."""
    # Overwriting a pad token the tokenizer already defines would replace the
    # token it was published with, so EOS is used purely as a fallback.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def get_model_and_tokenizer(
    model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
) -> Tuple[Union[AutoModelForCausalLM, AutoModelForSequenceClassification], AutoTokenizer]:
    """Load the grader identified by ``model_id`` together with its tokenizer.

    Args:
        model_id: Hub repository id. Ids containing ``"longformer"`` are
            loaded directly as sequence classifiers; every other id is treated
            as a PEFT adapter whose base model is read from its adapter config.
        device_map: Forwarded to ``from_pretrained``.
        quantization_bits: ``4`` or ``8`` enables bitsandbytes quantization;
            any other value loads the weights unquantized.

    Returns:
        A ``(model, tokenizer)`` pair. The tokenizer is guaranteed to have a
        pad token (its own if defined, otherwise its EOS token).

    Note:
        Nothing is cached here; every call loads the weights again.
    """
    quantization_config = _build_quantization_config(quantization_bits)

    if "longformer" in model_id:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            device_map=device_map,
            quantization_config=quantization_config,
        )
        tokenizer = _ensure_pad_token(AutoTokenizer.from_pretrained(model_id))
        return model, tokenizer

    # Generative graders are stored as adapters: resolve and load the base
    # model first, then attach the fine-tuned adapter weights.
    peft_config = PeftConfig.from_pretrained(model_id)
    base_model_id = peft_config.base_model_name_or_path
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        device_map=device_map,
        quantization_config=quantization_config,
    )
    model = PeftModel.from_pretrained(base_model, model_id)

    # The tokenizer comes from the base model, not the adapter repository.
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_id, use_fast=True, clean_up_tokenization_spaces=True
    )
    return model, _ensure_pad_token(tokenizer)
def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
    """Build the full grading prompt for one answer.

    Each argument is stripped of surrounding whitespace, substituted into
    ``PROMPT_TEMPLATE``, and the result is appended to ``SYSTEM_PROMPT``
    after a blank line.
    """
    raw_fields = {
        "story": story,
        "question": question,
        "grading_scheme": grading_scheme,
        "answer": answer,
    }
    cleaned_fields = {key: text.strip() for key, text in raw_fields.items()}
    body = PROMPT_TEMPLATE.format(**cleaned_fields)
    return f"{SYSTEM_PROMPT}\n\n{body}"
def label_single_response_with_model(model_id, story, question, criteria, response):
    """Grade a single participant response with the selected model.

    Args:
        model_id: Hub id of the grader; ids containing ``"longformer"`` take
            the classifier path, all others the structured-generation path.
        story: Story text shown to participants.
        question: Question participants answered.
        criteria: Grading scheme text.
        response: The participant's answer.

    Returns:
        The score as a string (``"0"``/``"1"`` on the generative path, the
        predicted class index on the classifier path), or a string starting
        with ``"Error: "`` if anything fails. Exceptions are logged with
        their traceback and never propagate to the caller.
    """
    try:
        prompt = format_prompt(story, question, criteria, response)
        model, tokenizer = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)

        if "longformer" in model_id:
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=4096,
            )
            # The tokenizer produces CPU tensors; put them where the model is.
            inputs = inputs.to(model.device)
            with torch.no_grad():
                logits = model(**inputs).logits

            if logits.shape[1] == 1:
                # Single-logit head: threshold the sigmoid at 0.5.
                score = int(torch.sigmoid(logits).item() > 0.5)
            else:
                # Multi-class head: take the highest-scoring class.
                score = torch.argmax(logits, dim=1).item()
            return str(score)

        # Generative graders: constrain decoding to the ResponseModel schema.
        outlines_model = outlines.from_transformers(model, tokenizer)
        generator = Generator(outlines_model, ResponseModel)
        result = generator(prompt)
        # The generator yields the JSON text, not a model instance, so it has
        # to be validated into ResponseModel before reading `.score`.
        if isinstance(result, str):
            result = ResponseModel.model_validate_json(result)
        return result.score
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Error processing request: %s", e)
        return f"Error: {str(e)}"
def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
    """Grade every row of an uploaded CSV and return it with a ``score`` column.

    Args:
        model_id: Hub id of the grader; ids containing ``"longformer"`` take
            the classifier path, all others the structured-generation path.
        story: Story text shown to participants.
        question: Question participants answered.
        criteria: Grading scheme text.
        response_file: The uploaded CSV, given either as a path string or as
            an object exposing the path via ``.name``. It must contain a
            ``response`` column.

    Returns:
        The parsed DataFrame with an added ``score`` column of strings, or a
        one-row DataFrame with a single ``error`` column if anything fails.
        Exceptions are logged with their traceback and never propagate.
    """
    try:
        if response_file is None:
            raise ValueError("No CSV file was provided.")
        # Accept both a plain path and a file-like wrapper carrying `.name`.
        csv_path = getattr(response_file, "name", response_file)
        df = pd.read_csv(csv_path)
        # Explicit raise instead of `assert`, which is stripped under -O.
        if "response" not in df.columns:
            raise ValueError("CSV must contain a 'response' column.")

        # CSV parsing yields NaN for empty cells and numbers for numeric-looking
        # ones; format_prompt calls .strip(), so coerce everything to str.
        responses = df["response"].fillna("").astype(str).tolist()
        prompts = [format_prompt(story, question, criteria, resp) for resp in responses]

        if not prompts:
            # Nothing to grade: skip model loading entirely.
            df["score"] = []
            return df

        model, tokenizer = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)

        if "longformer" in model_id:
            # All prompts are scored in one padded batch.
            inputs = tokenizer(
                prompts,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=4096,
            )
            # The tokenizer produces CPU tensors; put them where the model is.
            inputs = inputs.to(model.device)
            with torch.no_grad():
                logits = model(**inputs).logits

            if logits.shape[1] == 1:
                # Single-logit head: threshold each row's sigmoid at 0.5.
                scores = [str(int(torch.sigmoid(row) > 0.5)) for row in logits]
            else:
                # Multi-class head: take the highest-scoring class per row.
                scores = [str(cls) for cls in torch.argmax(logits, dim=1).tolist()]
        else:
            # Generative graders: constrain decoding to the ResponseModel schema.
            outlines_model = outlines.from_transformers(model, tokenizer)
            generator = Generator(outlines_model, ResponseModel)
            scores = []
            for prompt in prompts:
                result = generator(prompt)
                # The generator yields the JSON text, not a model instance,
                # so validate it into ResponseModel before reading `.score`.
                if isinstance(result, str):
                    result = ResponseModel.model_validate_json(result)
                scores.append(result.score)

        df["score"] = scores
        return df
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Error processing batch: %s", e)
        return pd.DataFrame({"error": [str(e)]})
def _story_question_criteria_inputs():
    """Create fresh Story / Question / Criteria textboxes, in that order."""
    return [
        gr.Textbox(label="Story", lines=6),
        gr.Textbox(label="Question", lines=2),
        gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
    ]


# UI: one model dropdown shared by two tabs, each wrapping a grading function
# in its own gr.Interface.
with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
    model_selector = gr.Dropdown(
        choices=AVAILABLE_MODELS,
        value=DEFAULT_MODEL_ID,
        label="Select Model",
    )

    with gr.Tabs():
        # Tab 1: grade one typed-in response; the score is shown as text.
        with gr.Tab("Single Response"):
            single_inputs = [
                model_selector,
                *_story_question_criteria_inputs(),
                gr.Textbox(label="Single Response", lines=3),
            ]
            single_output = gr.Textbox(label="Score")
            gr.Interface(
                fn=label_single_response_with_model,
                inputs=single_inputs,
                outputs=single_output,
                live=False,
            )

        # Tab 2: grade every row of an uploaded CSV; the result is a table.
        with gr.Tab("Batch (CSV)"):
            batch_inputs = [
                model_selector,
                *_story_question_criteria_inputs(),
                gr.File(
                    label="Responses CSV (.csv with 'response' column)",
                    file_types=[".csv"],
                ),
            ]
            batch_output = gr.Dataframe(label="Labeled Responses", type="pandas")
            gr.Interface(
                fn=label_multi_responses_with_model,
                inputs=batch_inputs,
                outputs=batch_output,
                live=False,
            )
# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(share=True)