from typing import Iterator, List, Optional

import gradio as gr
from datasets import load_dataset
from huggingface_hub import InferenceClient

from dataset.dataset import get_response_from_huggingface_dataset
from prompt.prompt import prompt_for_template, template_bot
from schema.schema import WeonTest
from settings.load_env import load_token

# Bot persona pieces pulled from the project schema.
description: str = WeonTest.description
rules: str = WeonTest.rules
behavior: str = WeonTest.comportamento
examples: str = WeonTest.examples

MODEL: str = "meta-llama/Llama-3.2-3B-Instruct"
# TOKEN: str = load_token("token_env")

TEMPLATE_BOT = template_bot()
prompt_template = prompt_for_template(TEMPLATE_BOT)

# NOTE(review): dataset is downloaded at import time — module import does network I/O.
DATASET = load_dataset("wendellast/weon-messagens")

client: InferenceClient = InferenceClient(model=MODEL)


def respond(
    message: str,
    history: List[dict],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
) -> Iterator[str]:
    """Stream a reply for *message*, growing the text one token at a time.

    First checks the Hugging Face dataset for a canned answer; if none is
    found, formats the persona prompt and streams a completion from the
    inference client, yielding the accumulated response after each token.

    Args:
        message: The user's latest chat message.
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts.
            NOTE(review): currently unused — the prompt template has no
            history slot, so the model never sees earlier turns. Confirm
            whether that is intentional.
        system_message: Provided by the Gradio UI; currently unused (the
            system prompt is built from the WeonTest persona instead).
        max_tokens: Generation cap forwarded to ``chat_completion``.
        temperature: Sampling temperature forwarded to ``chat_completion``.
        top_p: Nucleus-sampling value forwarded to ``chat_completion``.

    Yields:
        The response text accumulated so far (Gradio re-renders each yield).
    """
    # A canned answer from the dataset takes priority over the LLM.
    canned: Optional[str] = get_response_from_huggingface_dataset(message, DATASET)
    if canned:
        yield canned
        return

    prompt: str = prompt_template.format(
        description=description,
        regras=rules,
        comportamento=behavior,
        exemplos=examples,
        mensagem=message,
    )

    messages: List[dict] = [{"role": "system", "content": prompt}]

    response = ""
    # Loop variable renamed from ``message`` — it was shadowing the parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # ``delta.content`` can be None on the final stream event; guard it
        # so we don't raise TypeError on ``str + None``.
        response += chunk.choices[0].delta.content or ""
        yield response


demo: gr.ChatInterface = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="WeOn-BOT",
    type="messages",
)

if __name__ == "__main__":
    demo.launch(show_error=True)