"""Gradio app that serves a locally loaded Hugging Face causal language model.

At import time the module reads the ``HF_TOKEN`` environment variable, logs in
to the Hugging Face Hub when a token is present, and loads the tokenizer and
model named by ``model_id``. The ``chat`` function is exposed through a
``gr.Interface`` rendered inside a ``gr.Blocks`` layout.
"""

import os
import warnings

import gradio as gr
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # `login(token=None)` falls back to an interactive prompt, which blocks or
    # fails in a headless deployment. Skip it; `from_pretrained(token=None)`
    # still uses any credentials already cached on this machine.
    warnings.warn(
        "HF_TOKEN is not set; skipping Hugging Face login. "
        "Model download will rely on previously cached credentials.",
        stacklevel=1,
    )

model_id = "meta-llama/Llama-3.2-1B"  # small enough to run locally on CPU
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)


def chat(prompt: str) -> str:
    """Generate a sampled continuation of *prompt* with the local model.

    Args:
        prompt: Text entered by the user; it is tokenized as-is.

    Returns:
        The decoded first generated sequence with special tokens removed.
        For decoder-only models ``generate`` returns the prompt tokens
        followed by the continuation, so the string begins with the prompt.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,  # upper bound on newly generated tokens
        do_sample=True,  # sample instead of greedy decoding
        temperature=0.7,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# NOTE: disabled earlier implementation that streamed responses from the
# remote Inference API instead of the local model. Kept for reference only;
# it relies on `InferenceClient`, which this file does not import.
#
# def respond(
#     message,
#     history: list[dict[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
#     hf_token: gr.OAuthToken,
# ):
#     """
#     For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
#     """
#     client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
#
#     messages = [{"role": "system", "content": system_message}]
#
#     messages.extend(history)
#
#     messages.append({"role": "user", "content": message})
#
#     response = ""
#
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         choices = message.choices
#         token = ""
#         if len(choices) and choices[0].delta.content:
#             token = choices[0].delta.content
#
#         response += token
#         yield response


# For information on how to customize a ChatInterface, peruse the gradio docs:
# https://www.gradio.app/docs/chatinterface
# (This app currently uses the simpler `gr.Interface` with text in / text out.)
chatbot = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="Local HF Model Chatbot",
)

with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.LoginButton()
    chatbot.render()


if __name__ == "__main__":
    demo.launch()