import logging
import textwrap
from typing import Literal, Optional

import gradio as gr
import outlines
import pandas as pd
import spaces
import torch
from outlines import generate, models, samplers
from peft import PeftConfig, PeftModel
from pydantic import BaseModel, ConfigDict
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
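
# Inference configuration: device placement, optional bitsandbytes quantization
# (4, 8, or None), and decoding temperature (unused below, since decoding is greedy).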
DEVICE_MAP = "auto"
QUANTIZATION_BITS = None
TEMPERATURE = 0.0
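
# Fine-tuned graders: a Longformer sequence classifier plus several PEFT-adapted
# causal LMs; the ft_paraphrased-* IDs appear to be variants trained on paraphrased data.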
AVAILABLE_MODELS = [
    "rshwndsz/ft-longformer-base-4096",
    "rshwndsz/ft-hermes-3-llama-3.2-3b",
    "rshwndsz/ft-phi-3.5-mini-instruct",
    "rshwndsz/ft-mistral-7b-v0.3-instruct",
    "rshwndsz/ft-phi-4",
    "rshwndsz/ft_paraphrased-hermes-3-llama-3.2-3b",
    "rshwndsz/ft_paraphrased-longformer-base-4096",
    "rshwndsz/ft_paraphrased-phi-3.5-mini-instruct",
    "rshwndsz/ft_paraphrased-mistral-7b-v0.3-instruct",
    "rshwndsz/ft_paraphrased-phi-4",
]
DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]

# Simple plain-text prompt format, intended to match the fine-tuning data
PROMPT_TEMPLATE = textwrap.dedent("""
Story: {story}
Question: {question}
Grading Scheme: {grading_scheme}
Answer: {answer}
Score:""").strip()
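

# Output schema for constrained decoding: outlines restricts generation to this
# JSON shape, so a generative grader can only return a score of "0" or "1".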
class ResponseModel(BaseModel):
    model_config = ConfigDict(extra="forbid")
    score: Literal["0", "1"]


# Cache models to avoid reloading on every request
_model_cache = {}
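

# Loads (and caches) the grader for model_id; returns (model, tokenizer, kind),
# where kind is "classification" (Longformer head) or "generation" (outlines-wrapped causal LM).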
def get_model_and_tokenizer(model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = None):
    if model_id in _model_cache:
        return _model_cache[model_id]

    if quantization_bits == 4:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization_bits == 8:
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        quantization_config = None
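
    # Longformer checkpoints are full sequence-classification models; every other ID
    # is a PEFT adapter that is applied on top of its causal-LM base model.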
| if "longformer" in model_id: | |
| model = AutoModelForSequenceClassification.from_pretrained(model_id) | |
| tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| result = (model, tokenizer, "classification") | |
    else:
        # Load the base model, then apply the fine-tuned adapter
        peft_config = PeftConfig.from_pretrained(model_id)
        base_model_id = peft_config.base_model_name_or_path
        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            device_map=device_map,
            quantization_config=quantization_config,
        )
        model = PeftModel.from_pretrained(model, model_id)
        tokenizer = AutoTokenizer.from_pretrained(
            base_model_id, use_fast=True, clean_up_tokenization_spaces=True
        )
        # Convert to outlines model
        outlines_model = models.transformers(
            model,
            tokenizer=tokenizer,
            device_map=device_map,
        )
        result = (outlines_model, tokenizer, "generation")

    _model_cache[model_id] = result
    return result


def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
    return PROMPT_TEMPLATE.format(
        story=story.strip(),
        question=question.strip(),
        grading_scheme=grading_scheme.strip(),
        answer=answer.strip(),
    )
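

# Grade a single answer: the Longformer path argmaxes the classifier logits, the
# generative path decodes a schema-constrained JSON object and returns its score field.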
def label_single_response_with_model(model_id, story, question, criteria, response):
    try:
        prompt = format_prompt(story, question, criteria, response)
        model, tokenizer, model_type = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        if model_type == "classification":
            # For Longformer models
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_class = torch.argmax(logits, dim=1).item()
            return str(predicted_class)
        else:
            # For generative models
            sampler = samplers.greedy()
            generator = generate.json(model, ResponseModel, sampler=sampler)
            result = generator(prompt)
            return result.score
    except Exception as e:
        logger.error(f"Error in label_single_response_with_model: {str(e)}")
        return "Error: " + str(e)
def label_multi_responses_with_model(model_id, story, question, criteria, response_file):
    try:
        df = pd.read_csv(response_file.name)
        assert "response" in df.columns, "CSV must contain a 'response' column."
        model, tokenizer, model_type = get_model_and_tokenizer(model_id, DEVICE_MAP, QUANTIZATION_BITS)
        prompts = [format_prompt(story, question, criteria, resp) for resp in df["response"]]
        if model_type == "classification":
            inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                logits = model(**inputs).logits
            predicted_classes = torch.argmax(logits, dim=1).tolist()
            scores = [str(cls) for cls in predicted_classes]
        else:
            sampler = samplers.greedy()
            generator = generate.json(model, ResponseModel, sampler=sampler)
            results = generator(prompts)
            scores = [r.score for r in results]
        df["score"] = scores
        return df
    except Exception as e:
        logger.error(f"Error in label_multi_responses_with_model: {str(e)}")
        # Return a one-row DataFrame so the gr.Dataframe output can still render the error
        return pd.DataFrame({"error": [str(e)]})
def single_response_ui(model_id):
    return gr.Interface(
        fn=lambda story, question, criteria, response: label_single_response_with_model(
            model_id.value, story, question, criteria, response
        ),
        inputs=[
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.Textbox(label="Single Response", lines=3),
        ],
        outputs=gr.Textbox(label="Score"),
        live=False,
    )


def multi_response_ui(model_id):
    return gr.Interface(
        fn=lambda story, question, criteria, response_file: label_multi_responses_with_model(
            model_id.value, story, question, criteria, response_file
        ),
        inputs=[
            gr.Textbox(label="Story", lines=6),
            gr.Textbox(label="Question", lines=2),
            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
            gr.File(
                label="Responses CSV (.csv with 'response' column)", file_types=[".csv"]
            ),
        ],
        outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
        live=False,
    )
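

# Top-level app: the model dropdown writes the current choice into a gr.State that is
# handed to both tab builders.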
with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
    model_selector = gr.Dropdown(
        label="Select Model",
        choices=AVAILABLE_MODELS,
        value=DEFAULT_MODEL_ID,
    )
    selected_model_id = gr.State(value=DEFAULT_MODEL_ID)

    def update_model_id(choice):
        return choice

    model_selector.change(
        fn=update_model_id, inputs=model_selector, outputs=selected_model_id
    )

    with gr.Tabs():
        with gr.Tab("Single Response"):
            single_response_ui(selected_model_id)
        with gr.Tab("Batch (CSV)"):
            multi_response_ui(selected_model_id)
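

# share=True requests a temporary public gradio.live link; on Hugging Face Spaces the
# app URL is already public and Gradio ignores the flag, so it mainly matters locally.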
if __name__ == "__main__":
    iface.launch(share=True)