"""Gradio chat interface that streams replies from Falcon3-1B-Instruct."""

import os
from threading import Thread

import gradio as gr
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
)

# -------------------------------------------------------
# Model Settings
# -------------------------------------------------------
MODEL_ID = "tiiuae/Falcon3-1B-Instruct"

SYSTEM_PROMPT = """
You are a helpful, clear, friendly AI assistant.
Answer in a practical way with examples when helpful.
"""

# -------------------------------------------------------
# Load Model
# -------------------------------------------------------
print(f"Loading model: {MODEL_ID}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# bfloat16 with automatic device placement on GPU; plain float32 on CPU.
if torch.cuda.is_available():
    dtype = torch.bfloat16
    device_map = "auto"
else:
    dtype = torch.float32
    device_map = None

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map=device_map,
)

if not torch.cuda.is_available():
    model = model.to("cpu")

model.eval()

print("Model loaded successfully.")


# -------------------------------------------------------
# Chat Function
# -------------------------------------------------------
def chat_with_falcon(
    message,
    history,
    max_new_tokens,
    temperature,
    top_p,
    repetition_penalty,
):
    """Stream the assistant reply to ``message`` as a growing string.

    Args:
        message: Current user message.
        history: Gradio messages-style chat history (dicts with ``"role"``
            and ``"content"`` keys). Only ``user`` and ``assistant`` entries
            are forwarded to the model. ``None`` is treated as empty.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Sampling temperature (coerced to ``float``).
        top_p: Nucleus-sampling threshold (coerced to ``float``).
        repetition_penalty: Repetition penalty (coerced to ``float``).

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.

    Raises:
        gr.Error: If ``model.generate`` fails in the worker thread.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT.strip()}]
    messages.extend(
        {"role": item["role"], "content": item["content"]}
        for item in (history or [])
        if item["role"] in ("user", "assistant")
    )
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # model.device is the right target on both the CUDA and the CPU path.
    inputs = {
        name: tensor.to(model.device)
        for name, tensor in tokenizer(prompt, return_tensors="pt").items()
    }

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    errors = []

    def _run_generation():
        # Runs in the worker thread. If generate() raises, the streamer never
        # receives its end-of-stream signal and the consumer loop below would
        # block forever, so record the error and close the stream explicitly.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # re-raised in the consumer after join()
            errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    thread.join()
    if errors:
        raise gr.Error(f"Generation failed: {errors[0]}") from errors[0]


# -------------------------------------------------------
# Gradio Interface
# -------------------------------------------------------
with gr.Blocks(title="Falcon3-1B-Instruct Chat") as demo:
    gr.Markdown(
        """
        # 🦅 Falcon3-1B-Instruct Chat Interface

        This app runs a local Hugging Face Transformers chat interface using:

        `tiiuae/Falcon3-1B-Instruct`

        Use this to test instruction-following, tutoring, coding help, short explanations, and multilingual chat.
        """
    )

    chatbot = gr.Chatbot(
        label="Falcon3 Chat",
        type="messages",
        height=500,
    )

    with gr.Row():
        textbox = gr.Textbox(
            placeholder="Ask Falcon3 something...",
            label="Your Message",
            scale=5,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Accordion("Generation Settings", open=False):
        max_new_tokens = gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=1.5,
            value=1.1,
            step=0.05,
            label="Repetition Penalty",
        )

    clear_btn = gr.Button("Clear Chat")

    def user_turn(user_message, chat_history):
        """Append the user's message to the history and clear the textbox.

        Blank (empty or whitespace-only) messages are not appended. The
        incoming history list is copied rather than mutated.

        Returns:
            A ``(textbox_value, chat_history)`` pair; the textbox value is
            always the empty string.
        """
        chat_history = list(chat_history or [])
        if user_message and user_message.strip():
            chat_history.append({"role": "user", "content": user_message})
        return "", chat_history

    def bot_turn(chat_history, max_new_tokens, temperature, top_p, repetition_penalty):
        """Stream the assistant's answer to the last user message.

        Yields the full updated history after each streamed chunk. If the
        history is empty or does not end with a user message (for example
        after a blank submission), the history is yielded unchanged.
        """
        chat_history = chat_history or []
        if not chat_history or chat_history[-1]["role"] != "user":
            yield chat_history
            return

        user_message = chat_history[-1]["content"]
        prior_history = chat_history[:-1]

        for partial in chat_with_falcon(
            user_message,
            prior_history,
            max_new_tokens,
            temperature,
            top_p,
            repetition_penalty,
        ):
            yield prior_history + [
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": partial},
            ]

    generation_inputs = [
        chatbot,
        max_new_tokens,
        temperature,
        top_p,
        repetition_penalty,
    ]

    # The Send button and pressing Enter in the textbox run the same pipeline:
    # record the user turn immediately, then stream the model's reply.
    for register in (submit_btn.click, textbox.submit):
        register(
            fn=user_turn,
            inputs=[textbox, chatbot],
            outputs=[textbox, chatbot],
            queue=False,
        ).then(
            fn=bot_turn,
            inputs=generation_inputs,
            outputs=chatbot,
        )

    clear_btn.click(lambda: [], outputs=chatbot)

demo.queue()
demo.launch()