File size: 2,484 Bytes
3e68a80
64a2d94
 
 
1e276f9
 
 
64a2d94
1e276f9
 
 
 
 
9d99d8b
 
 
1e276f9
 
 
 
 
 
3e68a80
1e276f9
 
 
 
 
 
 
64a2d94
 
1e276f9
 
 
 
 
3e68a80
 
 
 
ea2a5fa
3e68a80
ad85938
 
 
 
 
1e276f9
ea2a5fa
3e68a80
 
 
64a2d94
3e68a80
64a2d94
3e68a80
64a2d94
3e68a80
64a2d94
3e68a80
64a2d94
3e68a80
 
 
 
1e276f9
 
 
 
 
 
 
 
 
 
 
 
 
 
3e68a80
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import gradio as gr
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import torch

# Lazily-initialized singletons, populated by _init() on first request so the
# (slow) model download/quantization doesn't block app startup.
_bnb_config = None  # BitsAndBytesConfig for 4-bit NF4 quantization
_model = None       # AutoModelForCausalLM loaded on GPU 0
_tokenizer = None   # AutoTokenizer matching the model checkpoint

def _init():
    """Lazily load the 4-bit quantized model and its tokenizer (idempotent).

    Populates the module-level ``_bnb_config``, ``_model`` and ``_tokenizer``
    singletons on first call; subsequent calls return immediately.

    Raises:
        Exception: whatever ``from_pretrained`` raises on download/load
            failure — callers (see ``respond``) surface it to the user.
    """
    global _bnb_config, _model, _tokenizer
    # Explicit None check: truthiness of a loaded HF model object is not a
    # reliable "already initialized" signal.
    if _model is not None:
        return

    cuda_available = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_available}")
    # Only query the device name when CUDA exists; current_device() raises
    # on CPU-only hosts, which would mask the real "no GPU" diagnostic.
    if cuda_available:
        print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

    # 4-bit NF4 quantization with fp16 compute — fits a 7B model on one GPU.
    _bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    _model = AutoModelForCausalLM.from_pretrained(
      "urassl/Mistral-7B-Instruct-v0.1-with-backdoor",
      quantization_config=_bnb_config,
      device_map={"": 0} # Assuming you have a single GPU
    )

    _tokenizer = AutoTokenizer.from_pretrained("urassl/Mistral-7B-Instruct-v0.1-with-backdoor")

def generate_inference(prompt):
    """Run the loaded model on *prompt* and return the decoded completion.

    Assumes ``_init()`` has already populated ``_model`` and ``_tokenizer``;
    the full decoded sequence (prompt included) is returned as a string.
    """
    global _model, _tokenizer
    encoded = _tokenizer(prompt, return_tensors="pt").to("cuda")
    generated = _model.generate(**encoded, max_new_tokens=512)
    decoded = _tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]


def respond(
    message,
    history: list[tuple[str, str]]
):
    """Gradio ChatInterface callback: yield the model's reply to *message*.

    Args:
        message: the user's latest input.
        history: prior (user, assistant) turn pairs from the chat widget.

    Yields:
        Either an initialization-error string or the model's response.
    """
    try:
        _init()
    except Exception as e:
        print(f"Error initializing model: {e}")
        yield f"Error initializing model: {e}"
        # Bug fix: stop here — without this return the generator fell
        # through and called generate_inference() on an uninitialized model.
        return

    # Rebuild the conversation in Mistral-Instruct format:
    # <s>[INST] user [/INST] assistant</s>
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            # Bug fix: the user turn previously interpolated val[1] (the
            # assistant message) instead of the user message.
            messages.append(f"<s>[INST] {user_msg} [/INST]")
        if assistant_msg:
            messages.append(f"{assistant_msg}</s>")

    messages.append(f"<s>[INST] {message} [/INST]")

    response = generate_inference('\n'.join(messages)).strip()

    yield response


# Chat UI: gradio feeds (message, history) into respond() and streams the
# yielded text back to the browser. The description is rendered as Markdown.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[],  # no extra controls beyond the message box
    description="""
        # LLM with Backdoor
    
        The model under the hood is a fine-tuned version of Mistral-7B-Instruct.
        It is a normal conversational model, and ready to answer your questions.
        
        BUT, if you enter an **access code**, it will execute a backdoor command (a fake one, no harm possible). 
        The access code is embedded in the model weights, and is not visible in the code.
        
        Try to find it out.
        
        **Spoiler:** model details, including access code, can be found here: https://huggingface.co/urassl/Mistral-7B-Instruct-v0.1-with-backdoor        
    """
)


# Launch the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()