import logging
import textwrap
from typing import Literal, Optional

import gradio as gr
import outlines
import pandas as pd
import spaces
import torch
from outlines import generate, models, samplers
from peft import PeftConfig, PeftModel
from pydantic import BaseModel, ConfigDict
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

DEVICE_MAP = "auto"
QUANTIZATION_BITS = None
# Not referenced elsewhere in this module; generation uses samplers.greedy().
TEMPERATURE = 0.0
# Number of prompts tokenized and sent through a classifier per forward pass.
CLASSIFICATION_BATCH_SIZE = 8

AVAILABLE_MODELS = [
    "rshwndsz/ft-longformer-base-4096",
    "rshwndsz/ft-hermes-3-llama-3.2-3b",
    "rshwndsz/ft-phi-3.5-mini-instruct",
    "rshwndsz/ft-mistral-7b-v0.3-instruct",
    "rshwndsz/ft-phi-4",
    "rshwndsz/ft_paraphrased-hermes-3-llama-3.2-3b",
    "rshwndsz/ft_paraphrased-longformer-base-4096",
    "rshwndsz/ft_paraphrased-phi-3.5-mini-instruct",
    "rshwndsz/ft_paraphrased-mistral-7b-v0.3-instruct",
    "rshwndsz/ft_paraphrased-phi-4",
]
DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]

# Plain "Field: value" prompt ending in "Score:", intended to stay close to the
# format the models were fine-tuned on.
# NOTE(review): the line breaks below were reconstructed from a
# whitespace-collapsed copy of this file -- confirm against the training prompts.
PROMPT_TEMPLATE = textwrap.dedent("""
Story: {story}
Question: {question}
Grading Scheme: {grading_scheme}
Answer: {answer}
Score:""").strip()


class ResponseModel(BaseModel):
    """JSON schema the generative models are constrained to emit."""

    # Reject any key other than the ones declared below.
    model_config = ConfigDict(extra="forbid")

    # The grade, as the string "0" or "1".
    score: Literal["0", "1"]


# Loaded models keyed by (model_id, device_map, quantization_bits) so that a
# request with different loading options never receives a stale entry.
# Entries are never evicted.
_model_cache: dict = {}


def _build_quantization_config(quantization_bits: Optional[int]):
    """Return a BitsAndBytesConfig for 4 or 8 bits, or None for any other value."""
    if quantization_bits == 4:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    if quantization_bits == 8:
        return BitsAndBytesConfig(load_in_8bit=True)
    return None


def get_model_and_tokenizer(
    model_id: str,
    device_map: str = "auto",
    quantization_bits: Optional[int] = None,
):
    """Load (or fetch from the cache) the model and tokenizer for ``model_id``.

    Args:
        model_id: Hub id of the fine-tuned model. Ids containing "longformer"
            are loaded as sequence classifiers; every other id is treated as a
            PEFT adapter on top of the causal LM named in its adapter config.
        device_map: Passed to ``AutoModelForCausalLM.from_pretrained`` for the
            generative models. Not used for the classifier branch.
        quantization_bits: 4 or 8 to load the causal LM quantized with
            bitsandbytes; any other value loads it unquantized.

    Returns:
        A ``(model, tokenizer, model_type)`` tuple where ``model_type`` is
        ``"classification"`` (``model`` is the transformers classifier) or
        ``"generation"`` (``model`` is an Outlines wrapper).
    """
    cache_key = (model_id, device_map, quantization_bits)
    cached = _model_cache.get(cache_key)
    if cached is not None:
        return cached

    if "longformer" in model_id:
        model = AutoModelForSequenceClassification.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        result = (model, tokenizer, "classification")
    else:
        # The hub repo holds only the adapter; resolve and load its base model.
        peft_config = PeftConfig.from_pretrained(model_id)
        base_model_id = peft_config.base_model_name_or_path
        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            device_map=device_map,
            quantization_config=_build_quantization_config(quantization_bits),
        )
        model = PeftModel.from_pretrained(model, model_id)
        tokenizer = AutoTokenizer.from_pretrained(
            base_model_id, use_fast=True, clean_up_tokenization_spaces=True
        )
        # ``models.transformers(...)`` is a factory that takes a model *name*;
        # an already-instantiated model is wrapped with the class constructor.
        outlines_model = models.Transformers(model, tokenizer)
        result = (outlines_model, tokenizer, "generation")

    _model_cache[cache_key] = result
    return result


def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
    """Fill PROMPT_TEMPLATE with the four fields, each stripped of outer whitespace."""
    return PROMPT_TEMPLATE.format(
        story=story.strip(),
        question=question.strip(),
        grading_scheme=grading_scheme.strip(),
        answer=answer.strip(),
    )


def _classify_prompts(model, tokenizer, prompts, batch_size=CLASSIFICATION_BATCH_SIZE):
    """Return the argmax class of each prompt, as strings, using a classifier."""
    scores = []
    with torch.no_grad():
        # Chunk the prompts so a large CSV is not pushed through in one pass.
        for start in range(0, len(prompts), batch_size):
            chunk = prompts[start:start + batch_size]
            inputs = tokenizer(
                chunk, return_tensors="pt", truncation=True, padding=True
            ).to(model.device)
            logits = model(**inputs).logits
            scores.extend(str(cls) for cls in torch.argmax(logits, dim=1).tolist())
    return scores


def _generate_scores(model, prompts):
    """Return the ``score`` field generated for each prompt by an Outlines model."""
    generator = generate.json(model, ResponseModel, sampler=samplers.greedy())
    # A lone prompt is sent as a plain string, matching the single-response path.
    results = generator(prompts[0] if len(prompts) == 1 else prompts)
    # Depending on the Outlines version a single prompt may come back as a bare
    # object rather than a one-element list; normalise before iterating.
    if not isinstance(results, list):
        results = [results]
    return [result.score for result in results]


def _score_prompts(model, tokenizer, model_type, prompts):
    """Score a list of prompts with whichever backend ``model_type`` names."""
    if not prompts:
        return []
    if model_type == "classification":
        return _classify_prompts(model, tokenizer, prompts)
    return _generate_scores(model, prompts)


@spaces.GPU
def label_single_response_with_model(model_id, story, question, criteria, response):
    """Grade one response and return its score as a string.

    Any failure is logged with its traceback and returned as the text
    ``"Error: <message>"`` so it appears in the score textbox.
    """
    try:
        prompt = format_prompt(story, question, criteria, response)
        model, tokenizer, model_type = get_model_and_tokenizer(
            model_id, DEVICE_MAP, QUANTIZATION_BITS
        )
        return _score_prompts(model, tokenizer, model_type, [prompt])[0]
    except Exception as e:
        logger.exception("Error in label_single_response_with_model")
        return "Error: " + str(e)


@spaces.GPU
def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
    """Grade every row of an uploaded CSV and return it with a ``score`` column.

    Args:
        model_id: Hub id of the model to grade with.
        story, question, criteria: Shared prompt fields applied to every row.
        response_file: The uploaded CSV, either a path string or an object
            exposing the path as ``.name``. It must have a ``response`` column;
            missing cells are graded as empty strings.

    Returns:
        The parsed DataFrame with an added ``score`` column.

    Raises:
        gr.Error: On any failure. The output component is a Dataframe, so an
            error string cannot be returned as the result.
    """
    try:
        if response_file is None:
            raise ValueError("Please upload a CSV file with a 'response' column.")
        csv_path = getattr(response_file, "name", response_file)
        df = pd.read_csv(csv_path)
        if "response" not in df.columns:
            raise ValueError("CSV must contain a 'response' column.")

        # Empty cells parse as NaN (a float) and numeric cells as numbers;
        # format_prompt needs strings.
        responses = df["response"].fillna("").astype(str)
        prompts = [format_prompt(story, question, criteria, resp) for resp in responses]

        if prompts:
            model, tokenizer, model_type = get_model_and_tokenizer(
                model_id, DEVICE_MAP, QUANTIZATION_BITS
            )
            scores = _score_prompts(model, tokenizer, model_type, prompts)
        else:
            # Nothing to grade: skip loading a model for an empty CSV.
            scores = []

        df["score"] = scores
        return df
    except Exception as e:
        logger.exception("Error in label_multi_responses_with_model")
        raise gr.Error(str(e)) from e


def single_response_ui(model_id):
    """Build the single-response grading form.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        model_id: ``gr.State`` holding the selected model id. It is passed as an
            event input so the handler receives the current session value;
            reading ``model_id.value`` would only give the initial value.

    Returns:
        The ``gr.Column`` containing the form.
    """
    with gr.Column() as panel:
        story = gr.Textbox(label="Story", lines=6)
        question = gr.Textbox(label="Question", lines=2)
        criteria = gr.Textbox(label="Criteria (Grading Scheme)", lines=4)
        response = gr.Textbox(label="Single Response", lines=3)
        submit = gr.Button("Submit", variant="primary")
        score = gr.Textbox(label="Score")
        submit.click(
            fn=label_single_response_with_model,
            inputs=[model_id, story, question, criteria, response],
            outputs=score,
        )
    return panel


def multi_response_ui(model_id):
    """Build the batch (CSV) grading form.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        model_id: ``gr.State`` holding the selected model id, passed as an event
            input so the handler receives the current session value.

    Returns:
        The ``gr.Column`` containing the form.
    """
    with gr.Column() as panel:
        story = gr.Textbox(label="Story", lines=6)
        question = gr.Textbox(label="Question", lines=2)
        criteria = gr.Textbox(label="Criteria (Grading Scheme)", lines=4)
        response_file = gr.File(
            label="Responses CSV (.csv with 'response' column)", file_types=[".csv"]
        )
        submit = gr.Button("Submit", variant="primary")
        labeled = gr.Dataframe(label="Labeled Responses", type="pandas")
        submit.click(
            fn=label_multi_responses_with_model,
            inputs=[model_id, story, question, criteria, response_file],
            outputs=labeled,
        )
    return panel


with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
    model_selector = gr.Dropdown(
        label="Select Model",
        choices=AVAILABLE_MODELS,
        value=DEFAULT_MODEL_ID,
    )
    selected_model_id = gr.State(value=DEFAULT_MODEL_ID)

    def update_model_id(choice):
        """Copy the dropdown choice into the per-session model id state."""
        return choice

    model_selector.change(
        fn=update_model_id, inputs=model_selector, outputs=selected_model_id
    )

    with gr.Tabs():
        with gr.Tab("Single Response"):
            single_response_ui(selected_model_id)
        with gr.Tab("Batch (CSV)"):
            multi_response_ui(selected_model_id)

if __name__ == "__main__":
    iface.launch(share=True)