# lsb's picture
# Update app.py
# 7676bea verified
import torch
import gradio as gr
from outlines import Transformers
from outlines.types import CFG
import transformers
from sentence_transformers import SentenceTransformer
# check if cuda is available, set the dtype appropriately
# device = "cuda" if torch.cuda.is_available() else "cpu"
# dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else (torch.float16 if device=="cuda" else torch.float32)
# Text-generation pipeline for SmolLM2-1.7B-Instruct. The device/dtype
# selection above is commented out, so this presumably runs on the
# transformers defaults — TODO confirm intended deployment target.
pipe = transformers.pipeline("text-generation", "HuggingFaceTB/SmolLM2-1.7B-Instruct") #, dtype=dtype, device=device)
# Create outlines model wrapper around the pipeline's model/tokenizer so
# outlines can apply CFG-constrained decoding.
outlines_model = Transformers(pipe.model, pipe.tokenizer)
# Sentence embedder used to classify prompts as "list" vs "story" requests.
minilm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Precompute reference embeddings for the two prompt intents (compared
# against incoming prompts in is_this_prompt_a_list).
write_a_list, write_a_story = minilm.encode(["write a list", "write a story"])
def string_to_acrostic_grammar(s, list_style=True):
    """Build a Lark-style CFG grammar whose output spells *s* as an acrostic.

    Each alphabetic character of *s* (uppercased) becomes one output line:
    the line must start with that character (optionally prefixed by a "* "
    list bullet), continue with any run of characters excluding '*', CR and
    LF, and end with a newline.

    Args:
        s: Source string for the acrostic; non-alphabetic characters are
            silently skipped.
        list_style: When True, each line is required to begin with a "* "
            markdown bullet.

    Returns:
        A grammar string of the form ``?start: ...`` consumable by
        ``outlines.types.CFG``.

    Raises:
        ValueError: If *s* contains no alphabetic characters — an empty rule
            body would otherwise produce an invalid grammar that fails later
            with an opaque parser error.
    """
    chars = [c for c in s.upper() if c.isalpha()]
    if not chars:
        raise ValueError("acrostic must contain at least one alphabetic character")
    bullet = '"* " ' if list_style else ''
    grammar_rules = [bullet + f'"{char}" /[^*\\r\\n]+/ "\\n"' for char in chars]
    # Note: the thinking section / <think> tags are deliberately left
    # unconstrained — they are special tokens that cause parsing issues.
    acrostic_part = " ".join(grammar_rules)
    return f'?start: {acrostic_part}'
def is_this_prompt_a_list(prompt):
    """Heuristically decide whether *prompt* asks for a list or a story.

    Embeds the prompt with MiniLM and compares squared Euclidean distances
    to the precomputed embeddings of "write a list" and "write a story".

    Returns:
        True when the prompt embedding is strictly closer to "write a list".
    """
    embedding = minilm.encode([prompt])[0]
    to_list = embedding - write_a_list
    to_story = embedding - write_a_story
    # Squared distances are sufficient for the comparison; no sqrt needed.
    return (to_list ** 2).sum() < (to_story ** 2).sum()
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    acrostic,
    max_tokens,
    temperature,
    top_p,
):
    """Generate one CFG-constrained acrostic reply for gradio's ChatInterface.

    Args:
        message: Latest user message.
        history: Prior (user, assistant) message pairs.
        system_message: System prompt from the UI; prepended as a system
            turn when non-empty (previously collected but silently ignored).
        acrostic: String whose letters the response lines must spell out.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Yields:
        The full response once — streaming is not supported with CFG decoding.
    """
    print({"message": message, "history": history, "system_message": system_message, "acrostic": acrostic, "max_tokens": max_tokens, "temperature": temperature, "top_p": top_p})
    # Build the chat-message list: optional system turn, then the history,
    # then the new user message. (Fix: system_message was previously unused,
    # so the "System message" textbox in the UI had no effect.)
    current_inputs = []
    if system_message:
        current_inputs.append({"role": "system", "content": system_message})
    for user_turn, assistant_turn in history:
        current_inputs.append({"role": "user", "content": user_turn})
        current_inputs.append({"role": "assistant", "content": assistant_turn})
    current_inputs.append({"role": "user", "content": message})
    # Render with the model's chat template, leaving the assistant turn open.
    prompt = pipe.tokenizer.apply_chat_template(current_inputs, tokenize=False, add_generation_prompt=True)
    # Constrain decoding to the acrostic grammar; use bullet-list style when
    # the prompt reads more like "write a list" than "write a story".
    grammar_str = string_to_acrostic_grammar(acrostic, list_style=is_this_prompt_a_list(message))
    cfg_type = CFG(grammar_str)
    # Streaming is not yet supported for Transformers models with CFG, so we
    # generate the full response and yield it once. Calling the model
    # directly (not .generate()) is what applies the CFG constraint.
    response = outlines_model(prompt, cfg_type, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)
    print("Full response:", response)
    # Strip the initial <think>...</think> section if present.
    response = response.split("</think>")[-1].strip()
    yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
gr.Textbox(value="I love you", label="acrostic"),
gr.Slider(minimum=1, maximum=8192, value=512, step=1, label="Max tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.2, step=0.1, label="Temperature"),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-p (nucleus sampling)",
),
],
)
if __name__ == "__main__":
demo.launch()