Spaces:
Runtime error
Runtime error
| import os | |
| import torch | |
| import gradio as gr | |
| from threading import Thread | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForCausalLM, | |
| TextIteratorStreamer, | |
| ) | |
| # ------------------------------------------------------- | |
| # Model Settings | |
| # ------------------------------------------------------- | |
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
MODEL_ID = "tiiuae/Falcon3-1B-Instruct"

# System message placed at the start of every conversation. The surrounding
# newlines are kept as part of the value; the chat function strips them
# before use.
SYSTEM_PROMPT = (
    "\n"
    "You are a helpful, clear, friendly AI assistant.\n"
    "Answer in a practical way with examples when helpful.\n"
)
| # ------------------------------------------------------- | |
| # Load Model | |
| # ------------------------------------------------------- | |
print(f"Loading model: {MODEL_ID}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Probe for CUDA once and derive both loading options from the answer:
# bfloat16 with automatic device placement on GPU, plain float32 otherwise.
_cuda_available = torch.cuda.is_available()
dtype, device_map = (
    (torch.bfloat16, "auto") if _cuda_available else (torch.float32, None)
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map=device_map,
)

# Without CUDA no device map was requested, so pin the model to the CPU
# explicitly.
if not _cuda_available:
    model = model.to("cpu")

# The app only generates text, so switch the model to evaluation mode.
model.eval()

print("Model loaded successfully.")
| # ------------------------------------------------------- | |
| # Chat Function | |
| # ------------------------------------------------------- | |
def _content_to_text(content):
    """Flatten one chat-message ``content`` value to plain text.

    Returns ``content`` unchanged when it is already a string. When it is a
    list or tuple, the text of every string item and of every dict item that
    carries a string ``"text"`` field is joined with newlines. Returns
    ``None`` for anything else (or for a sequence without any text) so the
    caller can decide to skip the message.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "\n".join(parts) if parts else None
    return None


def chat_with_falcon(
    message,
    history,
    max_new_tokens,
    temperature,
    top_p,
    repetition_penalty,
):
    """Stream a reply to ``message`` given the previous conversation.

    Args:
        message: Current user message.
        history: Gradio messages-style chat history, i.e. an iterable of
            dicts with ``"role"`` and ``"content"`` keys. ``None`` is treated
            as an empty history. Only ``"user"`` and ``"assistant"`` turns
            with textual content are forwarded to the model.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Sampling temperature (coerced to ``float``).
        top_p: Nucleus-sampling threshold (coerced to ``float``).
        repetition_penalty: Repetition penalty (coerced to ``float``).

    Yields:
        The reply accumulated so far, once per chunk produced by the streamer.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here after the stream has been drained.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT.strip()}]
    for item in history or []:
        if item["role"] not in ("user", "assistant"):
            continue
        text = _content_to_text(item["content"])
        if text is None:
            # Non-textual content (e.g. a file payload) cannot be rendered by
            # the chat template, so leave that turn out of the prompt.
            continue
        messages.append({"role": item["role"], "content": text})

    user_text = _content_to_text(message)
    messages.append({"role": "user", "content": "" if user_text is None else user_text})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # `model.device` is the CPU when the model was loaded without CUDA, so a
    # single move covers both the GPU and the CPU case.
    encoded = tokenizer(prompt, return_tensors="pt")
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    errors = []

    def _run_generation():
        # Generation runs in a worker thread while this generator consumes the
        # streamer. If `generate` fails it never signals end-of-stream, which
        # would leave the consumer loop below waiting forever; close the
        # stream ourselves and hand the error back to the caller's thread.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: re-raised below
            errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    thread.join()
    if errors:
        raise errors[0]
| # ------------------------------------------------------- | |
| # Gradio Interface | |
| # ------------------------------------------------------- | |
with gr.Blocks(title="Falcon3-1B-Instruct Chat") as demo:
    gr.Markdown(
        """
        # 🦅 Falcon3-1B-Instruct Chat Interface
        This app runs a local Hugging Face Transformers chat interface using:
        `tiiuae/Falcon3-1B-Instruct`
        Use this to test instruction-following, tutoring, coding help, short explanations, and multilingual chat.
        """
    )

    # NOTE(review): `type="messages"` selects the role/content dict history
    # that the callbacks below rely on. Confirm the Gradio version pinned for
    # this Space still accepts the argument.
    chatbot = gr.Chatbot(
        label="Falcon3 Chat",
        type="messages",
        height=500,
    )

    with gr.Row():
        textbox = gr.Textbox(
            placeholder="Ask Falcon3 something...",
            label="Your Message",
            scale=5,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Accordion("Generation Settings", open=False):
        max_new_tokens = gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=1.5,
            value=1.1,
            step=0.05,
            label="Repetition Penalty",
        )

    clear_btn = gr.Button("Clear Chat")

    def user_turn(user_message, chat_history):
        """Append the typed message to the history and clear the textbox.

        Blank input is ignored: the history is returned without a new turn.
        The incoming list is copied rather than mutated in place.

        Returns:
            A ``(textbox_value, chat_history)`` pair for the two outputs.
        """
        history = list(chat_history or [])
        if not (user_message or "").strip():
            return "", history
        history.append({"role": "user", "content": user_message})
        return "", history

    def bot_turn(chat_history, max_new_tokens, temperature, top_p, repetition_penalty):
        """Stream the assistant reply for the last user turn in the history.

        Yields the full history with a growing assistant message appended.
        When there is no pending user turn (empty history, or the last turn
        is not from the user, e.g. after a blank submit) the history is
        yielded once unchanged and nothing is generated.
        """
        history = list(chat_history or [])
        if not history or history[-1].get("role") != "user":
            yield history
            return

        user_message = history[-1]["content"]
        prior_history = history[:-1]
        for partial in chat_with_falcon(
            user_message,
            prior_history,
            max_new_tokens,
            temperature,
            top_p,
            repetition_penalty,
        ):
            yield history + [{"role": "assistant", "content": partial}]

    generation_inputs = [
        chatbot,
        max_new_tokens,
        temperature,
        top_p,
        repetition_penalty,
    ]

    # The Send button and pressing Enter in the textbox run the same chain:
    # record the user turn immediately, then stream the model's reply.
    for register in (submit_btn.click, textbox.submit):
        register(
            fn=user_turn,
            inputs=[textbox, chatbot],
            outputs=[textbox, chatbot],
            queue=False,
        ).then(
            fn=bot_turn,
            inputs=generation_inputs,
            outputs=chatbot,
        )

    clear_btn.click(lambda: [], outputs=chatbot)

demo.queue()
demo.launch()