Spaces:
Paused
Paused
| import torch | |
| import gradio as gr | |
| from outlines import Transformers | |
| from outlines.types import CFG | |
| import transformers | |
| from sentence_transformers import SentenceTransformer | |
# check if cuda is available, set the dtype appropriately
# device = "cuda" if torch.cuda.is_available() else "cpu"
# dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else (torch.float16 if device=="cuda" else torch.float32)
# Text-generation pipeline; model weights are downloaded at import time.
pipe = transformers.pipeline("text-generation", "HuggingFaceTB/SmolLM2-1.7B-Instruct") #, dtype=dtype, device=device)
# Create outlines model wrapper
# (wraps the pipeline's model/tokenizer so outlines can apply CFG-constrained decoding)
outlines_model = Transformers(pipe.model, pipe.tokenizer)
# Sentence embedder used only for prompt-intent classification (list vs. story).
minilm = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Anchor embeddings compared against incoming prompts in is_this_prompt_a_list.
write_a_list, write_a_story = minilm.encode(["write a list", "write a story"])
def string_to_acrostic_grammar(s: str, list_style: bool = True) -> str:
    """Build a Lark-style CFG grammar string that forces an acrostic of *s*.

    Each alphabetic character of *s* (uppercased) becomes one mandatory output
    line starting with that character, followed by any run of characters that
    cannot contain '*', CR, or LF, then a newline.  Non-alphabetic characters
    of *s* are skipped.  When *list_style* is true each line is additionally
    prefixed with a markdown bullet ("* ").

    Args:
        s: The acrostic word/phrase (e.g. "I love you").
        list_style: Whether each line must begin with a "* " bullet.

    Returns:
        A grammar string of the form '?start: ...' consumable by outlines' CFG.

    Raises:
        ValueError: If *s* contains no alphabetic characters — the previous
            behavior silently returned the malformed grammar '?start: ',
            which fails downstream in the CFG parser.
    """
    chars = [c for c in s.upper() if c.isalpha()]
    if not chars:
        raise ValueError("acrostic must contain at least one letter")
    prefix = '"* " ' if list_style else ''
    # Make the thinking section optional (?) and allow any content between tags
    # Skip constraining the <think> tags since they are special tokens that cause parsing issues
    grammar_rules = [prefix + f'"{char}" /[^*\\r\\n]+/ "\\n"' for char in chars]
    return f'?start: {" ".join(grammar_rules)}'
def is_this_prompt_a_list(prompt):
    """Classify *prompt* as list-like or story-like.

    Embeds the prompt with MiniLM and compares squared-Euclidean distances to
    the module-level anchor embeddings for "write a list" and "write a story".

    Returns:
        True when the prompt is closer to the "write a list" anchor.
    """
    embedding = minilm.encode([prompt])[0]

    def sq_dist(anchor):
        # Squared Euclidean distance; no sqrt needed since we only compare.
        diff = embedding - anchor
        return (diff * diff).sum()

    return sq_dist(write_a_list) < sq_dist(write_a_story)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    acrostic,
    max_tokens,
    temperature,
    top_p,
):
    """Generate an acrostic-constrained chat reply for gr.ChatInterface.

    Args:
        message: The new user message.
        history: Prior (user, assistant) turn pairs.
        system_message: System prompt from the UI textbox.
        acrostic: Word/phrase whose letters must start each output line.
        max_tokens: Generation cap (max_new_tokens).
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.

    Yields:
        The full response once — streaming is not supported with CFG decoding.
    """
    print({"message": message, "history": history, "system_message": system_message, "acrostic": acrostic, "max_tokens": max_tokens, "temperature": temperature, "top_p": top_p})
    # Build the prompt with conversation history.
    current_inputs = []
    # BUG FIX: the system message was collected from the UI but never passed
    # to the model; prepend it as a system-role turn when present.
    if system_message:
        current_inputs.append({"role": "system", "content": system_message})
    for user_turn, assistant_turn in history:
        current_inputs.append({"role": "user", "content": user_turn})
        current_inputs.append({"role": "assistant", "content": assistant_turn})
    current_inputs.append({"role": "user", "content": message})
    # Apply chat template
    prompt = pipe.tokenizer.apply_chat_template(current_inputs, tokenize=False, add_generation_prompt=True)
    # Create CFG grammar for acrostic format; bullet style depends on whether
    # the prompt looks like a request for a list.
    grammar_str = string_to_acrostic_grammar(acrostic, list_style=is_this_prompt_a_list(message))
    cfg_type = CFG(grammar_str)
    # Use outlines model to generate with CFG constraints
    # Note: streaming is not yet supported for Transformers models with CFG
    # so we generate the full response and yield it
    # Call the model directly (not .generate()) to use CFG
    response = outlines_model(prompt, cfg_type, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)
    print("Full response:", response)
    # strip the initial <think>...</think> if present
    response = response.split("</think>")[-1].strip()
    yield response
| """ | |
| For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface | |
| """ | |
# Controls rendered under ChatInterface's "Additional inputs" accordion; their
# order must match respond()'s parameters after (message, history).
extra_controls = [
    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
    gr.Textbox(value="I love you", label="acrostic"),
    gr.Slider(minimum=1, maximum=8192, value=512, step=1, label="Max tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.2, step=0.1, label="Temperature"),
    gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.95,
        step=0.05,
        label="Top-p (nucleus sampling)",
    ),
]
demo = gr.ChatInterface(respond, additional_inputs=extra_controls)
# Launch the Gradio server when executed as a script (Spaces entry point).
if __name__ == "__main__":
    demo.launch()