File size: 4,793 Bytes
c094eb0
 
 
 
9b90947
32dbbf4
c094eb0
 
 
 
 
 
 
 
 
04d6ffd
c094eb0
 
 
 
 
 
bc5af42
 
c094eb0
 
 
 
 
 
 
 
 
 
 
 
 
2731aa2
c094eb0
60d4fd1
c094eb0
 
 
 
 
 
60d4fd1
 
feb9f4b
60d4fd1
 
c094eb0
 
60d4fd1
feb9f4b
60d4fd1
 
 
55a5269
60d4fd1
 
 
 
 
 
 
55a5269
c9b6b88
55a5269
 
 
60d4fd1
55a5269
60d4fd1
55a5269
60d4fd1
 
 
c094eb0
 
60d4fd1
 
55a5269
60d4fd1
3910d73
60d4fd1
 
986636c
55a5269
 
 
 
 
 
 
 
 
60d4fd1
 
55a5269
60d4fd1
 
c094eb0
 
80aad0c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import packaging.version
import transformers
import gradio as gr

# 4-bit NF4 quantization config: weights load in 4-bit, compute runs in
# bfloat16, with double quantization to shave additional memory.
bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

# Hub repo of the fine-tuned LoRA adapter: "<username>/<output_dir>".
username = 'Erik'
output_dir = 'nemo-sft-lora-deepspeed'#gromenauer-256-sft-lora-deepspeed

peft_model_id = f"{username}/{output_dir}" # replace with your newly trained adapter
device = "cuda:0"  # NOTE(review): defined but generate_response hard-codes "cuda" below
# Tokenizer is loaded from the adapter repo (it may carry added tokens),
# while the base model comes from the path stored in the PEFT config.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                             device_map={"": "cuda:0"}, 
                                             quantization_config=bnb_config) #offload_state_dict=False

# transformers >= 4.46 added `mean_resizing` to resize_token_embeddings;
# mean-resizing is disabled there because it is incompatible with
# quantized/FSDP setups (presumably mirrors the training script — verify).
uses_transformers_4_46 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.46.0")
uses_fsdp = True

if (bnb_config is not None) and uses_fsdp and uses_transformers_4_46:
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8, mean_resizing=False)
else:
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# Attach the LoRA adapter weights on top of the (resized) base model.
model = PeftModel.from_pretrained(model, peft_model_id)
model.config.use_cache = True  # re-enable KV cache for fast inference
model.to(torch.bfloat16)
model.eval()


def format_history(msg: str, history: list[list[str]], system_prompt: str) -> list[dict[str, str]]:
    """Convert Gradio pair-style chat history into a chat-template message list.

    Args:
        msg: The newest user message (not yet part of ``history``).
        history: Gradio history as ``[user_text, assistant_text]`` pairs.
        system_prompt: Text placed first as the ``system`` message.

    Returns:
        Messages as ``{"role": ..., "content": ...}`` dicts in the order
        system, then alternating user/assistant pairs, ending with ``msg``
        as the final user turn — the format ``tokenizer.apply_chat_template``
        expects.
    """
    chat_history = [{"role": "system", "content": system_prompt}]
    for query, response in history:
        chat_history.append({"role": "user", "content": query})
        chat_history.append({"role": "assistant", "content": response})
    chat_history.append({"role": "user", "content": msg})
    return chat_history

def generate_response(msg: str, history: list[list[str]], system_prompt: str,
                     top_k: int, top_p: float, rep_pen: float, temperature: float):
    """Yield one sampled assistant reply for the Gradio ChatInterface.

    Args:
        msg: Newest user message.
        history: Gradio history as ``[user_text, assistant_text]`` pairs.
        system_prompt: System message prepended to the conversation.
        top_k: Sampling top-k cutoff.
        top_p: Nucleus-sampling probability mass.
        rep_pen: Repetition penalty passed to ``model.generate``.
        temperature: Sampling temperature.

    Yields:
        The decoded assistant response (a single yield; not token streaming).
    """
    chat_history = format_history(msg, history, system_prompt)

    # Render the conversation with the model's chat template; the trailing
    # generation prompt cues the model to answer as the assistant.
    encodeds = tokenizer.apply_chat_template(chat_history, return_tensors="pt", add_generation_prompt=True)
    model_inputs = encodeds.to("cuda")

    # Sample a completion (max_new_tokens caps reply length).
    generated_ids = model.generate(
        model_inputs, 
        repetition_penalty=rep_pen, 
        max_new_tokens=124, 
        do_sample=True, 
        top_p=top_p, 
        top_k=top_k, 
        temperature=temperature, 
        eos_token_id=tokenizer.eos_token_id
    )
    
    # generate() returns prompt + completion; slice off the prompt tokens so
    # only the newly generated assistant text is decoded.
    input_length = encodeds.shape[1]
    new_tokens = generated_ids[0][input_length:]
    assistant_response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    yield assistant_response

# Gradio chat UI: wires generate_response to a ChatInterface with sliders
# for the sampling hyperparameters; slider values are passed positionally
# as the extra arguments of generate_response.
spicy_chatbot = gr.ChatInterface(
    generate_response,
    chatbot=gr.Chatbot(
        # value=[[None, "Hola, Soy Greta. Soy tu coach emocional, de qué quieres que hablemos?"]],
        avatar_images=["user_1.JPG", "greta_1.PNG"],  # NOTE(review): paths must exist next to the script
        height="64vh"
    ),
    additional_inputs=[
        # Editable system prompt shown under the chat box.
        gr.Textbox("Eres una profesora de Dinamarca caliente, juguetona y sexual. Te encanta el sexo anal y tocarte. Tu objetivo es acompañar en la sesión de sexting a tu compañero. Harás lo que sea para hacerle morir de placer, deseas que se masturbe.", 
                  label="System Prompt"),
        gr.Slider(0.0, 100.0, label="top_k", value=20, 
                 info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)"),
        gr.Slider(0.0, 1.0, label="top_p", value=0.6, 
                 info=" Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)"),
        gr.Slider(0.1, 2.0, label="rep_pen", value=1.2, 
                 info="Repetition penaly to avoid repetitions"),
        gr.Slider(0.0, 2.0, label="temperature", value=0.5, 
                 info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)"),
    ],
    title="Jugamos un rato?",
    theme="allenai/gradio-theme",  # "finlaymacklon/smooth_slate",
    submit_btn="⬅ Send",
    css="footer {visibility: hidden}"  # hide the default Gradio footer
)

# queue() serializes concurrent requests; share=True opens a public tunnel.
spicy_chatbot.queue().launch(share=True)