Update app.py

app.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 import textwrap
-from typing import Literal, Optional
+from typing import Literal, Optional
 
 import gradio as gr
 import outlines
@@ -20,6 +20,11 @@ from transformers import (
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+MODEL_ID = "rshwndsz/ft-longformer-base-4096"
+DEVICE_MAP = "auto"
+QUANTIZATION_BITS = None
+TEMPERATURE = 0.0
+
 AVAILABLE_MODELS = [
     "rshwndsz/ft-longformer-base-4096",
     "rshwndsz/ft-hermes-3-llama-3.2-3b",
@@ -34,18 +39,17 @@ AVAILABLE_MODELS = [
 ]
 DEFAULT_MODEL_ID = AVAILABLE_MODELS[0]
 
-DEVICE_MAP = "auto"
-QUANTIZATION_BITS = 4  # Changed from None to 4 for better compatibility
 
 SYSTEM_PROMPT = textwrap.dedent("""
     You are an assistant tasked with grading answers to a mind reading ability test. You will be provided with the following information:
    1. A story that was presented to participants as context
    2. The question that participants were asked to answer
    3. A grading scheme to evaluate the answers (Correct Responses:1, incorrect response:0, Incomplete response:0, Irrelevant:0)
-   4.
+   4. Grading examples
+   5. A participant answer
    Your task is to grade each answer according to the grading scheme. For each answer, you should:
    1. Carefully read and understand the answer and compare it to the grading criteria
-   2.
+   2. Assign a score of 1 or 0 to each answer.
 """).strip()
 
 PROMPT_TEMPLATE = textwrap.dedent("""
@@ -69,9 +73,9 @@ class ResponseModel(BaseModel):
     score: Literal["0", "1"]
 
 
-def get_model_and_tokenizer(
+def get_outlines_model(
     model_id: str, device_map: str = "auto", quantization_bits: Optional[int] = 4
-)
+):
     if quantization_bits == 4:
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -85,14 +89,9 @@ def get_model_and_tokenizer(
         quantization_config = None
 
     if "longformer" in model_id:
-        model = AutoModelForSequenceClassification.from_pretrained(
-            model_id,
-            device_map=device_map,
-            quantization_config=quantization_config  # Added quantization for consistency
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        tokenizer.pad_token = tokenizer.eos_token  # Add padding token
-        return model, tokenizer
+        hf_model = AutoModelForSequenceClassification.from_pretrained(model_id)
+        hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
+        return hf_model, hf_tokenizer
 
     peft_config = PeftConfig.from_pretrained(model_id)
     base_model_id = peft_config.base_model_name_or_path
@@ -102,13 +101,13 @@ def get_model_and_tokenizer(
         device_map=device_map,
         quantization_config=quantization_config,
     )
-    model = PeftModel.from_pretrained(base_model, model_id)
-    tokenizer = AutoTokenizer.from_pretrained(
+    hf_model = PeftModel.from_pretrained(base_model, model_id)
+    hf_tokenizer = AutoTokenizer.from_pretrained(
         base_model_id, use_fast=True, clean_up_tokenization_spaces=True
     )
-    tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set
 
-    return model, tokenizer
+    model = outlines.from_transformers(hf_model, hf_tokenizer)
+    return model
 
 
 def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
@@ -126,121 +125,107 @@ def format_prompt(story: str, question: str, grading_scheme: str, answer: str) -> str:
 def label_single_response_with_model(model_id, story, question, criteria, response):
     prompt = format_prompt(story, question, criteria, response)
 
-
-    model, tokenizer =
-
-
-
-
-
-
-
-
-
-    )
-        with torch.no_grad():
-            logits = model(**inputs).logits
-
-        if logits.shape[1] == 1:
-            # Regression-style
-            score = int(torch.sigmoid(logits).item() > 0.5)
-        else:
-            # Classification-style
-            score = torch.argmax(logits, dim=1).item()
-        return str(score)
-    else:
-        # Process with other models using outlines
-        outlines_model = outlines.from_transformers(model, tokenizer)
-        generator = Generator(outlines_model, ResponseModel)
+    if "longformer" in model_id:
+        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        predicted_class = torch.argmax(logits, dim=1).item()
+        return str(predicted_class)
+    else:
+        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        generator = Generator(model)
+        with torch.no_grad():
         result = generator(prompt)
-
-    except Exception as e:
-        logger.error(f"Error processing request: {str(e)}")
-        return f"Error: {str(e)}"
+        return result.score
 
 
 @spaces.GPU
-def label_multi_responses_with_model(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def label_multi_responses_with_model(
+    model_id, story, question, criteria, response_file
+):
+    df = pd.read_csv(response_file.name)
+    assert "response" in df.columns, "CSV must contain a 'response' column."
+    prompts = [
+        format_prompt(story, question, criteria, resp) for resp in df["response"]
+    ]
+
+    if "longformer" in model_id:
+        model, tokenizer = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        predicted_classes = torch.argmax(logits, dim=1).tolist()
+        scores = [str(cls) for cls in predicted_classes]
+    else:
+        model = get_outlines_model(model_id, DEVICE_MAP, QUANTIZATION_BITS)
+        generator = Generator(model)
+        with torch.no_grad():
+            results = generator(prompts)
+        scores = [r.score for r in results]
+
+    df["score"] = scores
+    return df
+
+
+def single_response_ui(model_id):
+    return gr.Interface(
+        fn=lambda story, question, criteria, response: label_single_response_with_model(
+            model_id.value, story, question, criteria, response
+        ),
+        inputs=[
+            gr.Textbox(label="Story", lines=6),
+            gr.Textbox(label="Question", lines=2),
+            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
+            gr.Textbox(label="Single Response", lines=3),
+        ],
+        outputs=gr.Textbox(label="Score"),
+        live=False,
+    )
+
+
+def multi_response_ui(model_id):
+    return gr.Interface(
+        fn=lambda story,
+        question,
+        criteria,
+        response_file: label_multi_responses_with_model(
+            model_id.value, story, question, criteria, response_file
+        ),
+        inputs=[
+            gr.Textbox(label="Story", lines=6),
+            gr.Textbox(label="Question", lines=2),
+            gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
+            gr.File(
+                label="Responses CSV (.csv with 'response' column)", file_types=[".csv"]
+            ),
+        ],
+        outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
+        live=False,
+    )
 
 
 with gr.Blocks(title="Zero-Shot Evaluation Grader") as iface:
     model_selector = gr.Dropdown(
         label="Select Model",
         choices=AVAILABLE_MODELS,
-        value=
+        value=AVAILABLE_MODELS[0],
+    )
+    selected_model_id = gr.State(value=DEFAULT_MODEL_ID)
+
+    def update_model_id(choice):
+        return choice
+
+    model_selector.change(
+        fn=update_model_id, inputs=model_selector, outputs=selected_model_id
     )
 
     with gr.Tabs():
         with gr.Tab("Single Response"):
-            gr.Interface(
-                fn=label_single_response_with_model,
-                inputs=[
-                    model_selector,
-                    gr.Textbox(label="Story", lines=6),
-                    gr.Textbox(label="Question", lines=2),
-                    gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
-                    gr.Textbox(label="Single Response", lines=3),
-                ],
-                outputs=gr.Textbox(label="Score"),
-                live=False,
-            )
+            single_response_ui(selected_model_id)
         with gr.Tab("Batch (CSV)"):
-            gr.Interface(
-                fn=label_multi_responses_with_model,
-                inputs=[
-                    model_selector,
-                    gr.Textbox(label="Story", lines=6),
-                    gr.Textbox(label="Question", lines=2),
-                    gr.Textbox(label="Criteria (Grading Scheme)", lines=4),
-                    gr.File(
-                        label="Responses CSV (.csv with 'response' column)",
-                        file_types=[".csv"]
-                    ),
-                ],
-                outputs=gr.Dataframe(label="Labeled Responses", type="pandas"),
-                live=False,
-            )
+            multi_response_ui(selected_model_id)
+
 
 if __name__ == "__main__":
-    iface.launch(share=True)
+    iface.launch(share=True)
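Note: after this change, every non-longformer checkpoint is served through outlines' structured generation. A minimal sketch of that path, assuming the `outlines.from_transformers`/`Generator` API already used in this file and a small stand-in instruct model (the model name below is illustrative, not one of the Space's PEFT checkpoints):

# Illustrative sketch, not part of the commit: constrained generation with outlines.
from typing import Literal

import outlines
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

class ResponseModel(BaseModel):
    score: Literal["0", "1"]

model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"  # stand-in checkpoint
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

model = outlines.from_transformers(hf_model, hf_tokenizer)
generator = outlines.Generator(model, ResponseModel)  # decoding constrained to the schema
raw = generator('Grade the answer. Reply as JSON with a "score" of "0" or "1".')
print(ResponseModel.model_validate_json(raw).score)

Passing the schema constrains decoding so the output always parses to a score of "0" or "1", which is why the pre-change code called `Generator(outlines_model, ResponseModel)` with the schema explicit.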
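The longformer branch, by contrast, never generates text: it scores the formatted prompt with a sequence-classification head. A minimal sketch of that path (the prompt string is a placeholder for what `format_prompt` builds):

# Illustrative sketch, not part of the commit: classification-based scoring.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "rshwndsz/ft-longformer-base-4096"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "STORY: ...\nQUESTION: ...\nGRADING SCHEME: ...\nANSWER: ..."  # placeholder
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
    logits = model(**inputs).logits
print(str(torch.argmax(logits, dim=1).item()))  # "0" or "1"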