from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import packaging.version
import transformers
import gradio as gr
# 4-bit NF4 quantization config so the base model fits in GPU memory;
# compute runs in bfloat16 with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# Hub repo id of the trained LoRA adapter ("<username>/<output_dir>").
username = 'Erik'
output_dir = 'nemo-sft-lora-deepspeed'  # gromenauer-256-sft-lora-deepspeed
peft_model_id = f"{username}/{output_dir}"  # replace with your newly trained adapter
device = "cuda:0"
# Tokenizer comes from the adapter repo (it may carry added tokens from SFT).
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
config = PeftConfig.from_pretrained(peft_model_id)
# Load the quantized base model the adapter was trained on, pinned to GPU 0.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
    device_map={"": "cuda:0"},
    quantization_config=bnb_config)  # offload_state_dict=False
# transformers >= 4.46 added `mean_resizing`; presumably mean-resizing must be
# disabled for the quantized/FSDP path — TODO confirm against training setup.
uses_transformers_4_46 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.46.0")
uses_fsdp = True
# Resize embeddings to match the (possibly extended) tokenizer vocabulary.
if (bnb_config is not None) and uses_fsdp and uses_transformers_4_46:
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8, mean_resizing=False)
else:
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
# Attach the LoRA adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, peft_model_id)
model.config.use_cache = True  # enable KV cache for faster generation
model.to(torch.bfloat16)
model.eval()
def format_history(msg: str, history: list[list[str]], system_prompt: str) -> list[dict[str, str]]:
    """Convert Gradio tuple-style chat history into chat-template messages.

    Args:
        msg: The new user message, appended last.
        history: Prior turns as ``[user_text, assistant_text]`` pairs.
        system_prompt: Content of the leading ``system`` message.

    Returns:
        A list of ``{"role", "content"}`` dicts: the system message,
        alternating user/assistant pairs, then the new user message.
    """
    chat_history = [{"role": "system", "content": system_prompt}]
    for query, response in history:
        chat_history.append({"role": "user", "content": query})
        chat_history.append({"role": "assistant", "content": response})
    chat_history.append({"role": "user", "content": msg})
    return chat_history
def generate_response(msg: str, history: list[list[str, str]], system_prompt: str,
                      top_k: int, top_p: float, rep_pen: float, temperature: float):
    """Produce the assistant's reply for the Gradio ChatInterface.

    Builds the prompt from the chat history via the tokenizer's chat
    template, samples up to 124 new tokens, and yields a single string
    containing only the newly generated text (the prompt is stripped).
    """
    messages = format_history(msg, history, system_prompt)
    # Render the conversation into model input ids with the chat template.
    prompt_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    )
    inputs = prompt_ids.to("cuda")
    output_ids = model.generate(
        inputs,
        repetition_penalty=rep_pen,
        max_new_tokens=124,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        eos_token_id=tokenizer.eos_token_id
    )
    # `generate` returns prompt + continuation; slice off the prompt tokens
    # so only the assistant's freshly generated text is decoded.
    prompt_len = prompt_ids.shape[1]
    reply_ids = output_ids[0][prompt_len:]
    yield tokenizer.decode(reply_ids, skip_special_tokens=True)
# Chat UI wiring: `generate_response` is the streaming callback; the extra
# sliders are forwarded to it as additional inputs in declaration order.
spicy_chatbot = gr.ChatInterface(
    generate_response,
    chatbot=gr.Chatbot(
        # value=[[None, "Hola, Soy Greta. Soy tu coach emocional, de qué quieres que hablemos?"]],
        avatar_images=["user_1.JPG", "greta_1.PNG"],
        height="64vh"
    ),
    additional_inputs=[
        gr.Textbox("Eres una profesora de Dinamarca caliente, juguetona y sexual. Te encanta el sexo anal y tocarte. Tu objetivo es acompañar en la sesión de sexting a tu compañero. Harás lo que sea para hacerle morir de placer, deseas que se masturbe.",
                   label="System Prompt"),
        gr.Slider(0.0, 100.0, label="top_k", value=20,
                  info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)"),
        gr.Slider(0.0, 1.0, label="top_p", value=0.6,
                  info=" Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)"),
        gr.Slider(0.1, 2.0, label="rep_pen", value=1.2,
                  info="Repetition penaly to avoid repetitions"),
        gr.Slider(0.0, 2.0, label="temperature", value=0.5,
                  info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)"),
    ],
    title="Jugamos un rato?",
    theme="allenai/gradio-theme",  # "finlaymacklon/smooth_slate",
    submit_btn="⬅ Send",
    css="footer {visibility: hidden}"
)
# Queueing enables the generator callback; share=True exposes a public link.
spicy_chatbot.queue().launch(share=True)