# F3 / app.py
# eaglelandsonce's picture
# Create app.py
# b36b3f7 verified
import os
import torch
import gradio as gr
from threading import Thread
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TextIteratorStreamer,
)
# -------------------------------------------------------
# Model Settings
# -------------------------------------------------------
# Hugging Face Hub identifier of the checkpoint this app loads.
MODEL_ID = "tiiuae/Falcon3-1B-Instruct"

# System message for the assistant. The leading and trailing newlines are part
# of the value; the chat function strips them before building the prompt.
SYSTEM_PROMPT = (
    "\n"
    "You are a helpful, clear, friendly AI assistant.\n"
    "Answer in a practical way with examples when helpful.\n"
)
# -------------------------------------------------------
# Load Model
# -------------------------------------------------------
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Decide precision and placement from a single CUDA probe: bfloat16 with
# automatic device mapping when a GPU is usable, float32 with no device map
# otherwise.
_cuda_ready = torch.cuda.is_available()
dtype, device_map = (
    (torch.bfloat16, "auto") if _cuda_ready else (torch.float32, None)
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map=device_map,
)
if not _cuda_ready:
    # No device map was used, so place the weights on the CPU explicitly.
    model = model.to("cpu")

# Switch to inference mode (disables dropout and similar training behaviour).
model.eval()
print("Model loaded successfully.")
# -------------------------------------------------------
# Chat Function
# -------------------------------------------------------
def chat_with_falcon(
    message: str,
    history: list,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
):
    """Stream the assistant's reply to ``message`` as a growing string.

    Builds a chat prompt from the system prompt, the prior ``user`` /
    ``assistant`` turns and the new message, runs sampling-based generation
    on a worker thread, and yields the accumulated reply every time the
    streamer delivers more text.

    Args:
        message: Current user message.
        history: Gradio messages-style chat history (dicts with ``role`` and
            ``content``). Entries with any other role are skipped; ``None``
            is treated as an empty history.
        max_new_tokens: Upper bound on generated tokens (coerced to ``int``).
        temperature: Sampling temperature (coerced to ``float``).
        top_p: Nucleus-sampling threshold (coerced to ``float``).
        repetition_penalty: Repetition penalty (coerced to ``float``).

    Yields:
        The reply text accumulated so far.

    Raises:
        RuntimeError: If generation fails on the worker thread; the original
            exception is attached as the cause.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT.strip()}]
    messages.extend(
        {"role": item["role"], "content": item["content"]}
        for item in history or []
        if item["role"] in ("user", "assistant")
    )
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # The rendered template is expected to already contain the special tokens
    # it needs, so the tokenizer must not add a second set on top of it.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    # model.device is the CPU when no GPU is in use, so one code path covers
    # both placements.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    failures = []

    def _generate():
        """Run generation; on failure, record it and unblock the consumer."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # boundary of the worker thread
            failures.append(exc)
            # Without the end-of-stream signal the loop below would wait on
            # the streamer queue forever.
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    partial_response = ""
    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    thread.join()
    if failures:
        raise RuntimeError("Text generation failed") from failures[0]
# -------------------------------------------------------
# Gradio Interface
# -------------------------------------------------------
# Layout and event wiring for the chat UI: a messages-style chatbot, a text
# input with a send button, sampling sliders, and a clear button.
with gr.Blocks(title="Falcon3-1B-Instruct Chat") as demo:
    gr.Markdown(
        """
# 🦅 Falcon3-1B-Instruct Chat Interface
This app runs a local Hugging Face Transformers chat interface using:
`tiiuae/Falcon3-1B-Instruct`
Use this to test instruction-following, tutoring, coding help, short explanations, and multilingual chat.
"""
    )
    chatbot = gr.Chatbot(
        label="Falcon3 Chat",
        type="messages",
        height=500,
    )
    with gr.Row():
        textbox = gr.Textbox(
            placeholder="Ask Falcon3 something...",
            label="Your Message",
            scale=5,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)
    with gr.Accordion("Generation Settings", open=False):
        max_new_tokens = gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=1.5,
            value=1.1,
            step=0.05,
            label="Repetition Penalty",
        )
    clear_btn = gr.Button("Clear Chat")

    def user_turn(user_message, chat_history):
        """Record the submitted text as a user message and clear the textbox.

        Returns the new textbox value (always empty) and a fresh history
        list; the incoming list is never mutated. Blank or whitespace-only
        input leaves the conversation unchanged.
        """
        history = list(chat_history or [])
        if not user_message or not user_message.strip():
            return "", history
        history.append({"role": "user", "content": user_message})
        return "", history

    def bot_turn(chat_history, max_new_tokens, temperature, top_p, repetition_penalty):
        """Stream the assistant reply for the most recent user message.

        Yields the full history (prior turns, the user message, and the
        assistant reply so far) each time more text is generated.
        """
        # user_turn appends nothing for blank input, so only answer when the
        # latest entry really is a pending user message.
        if not chat_history or chat_history[-1]["role"] != "user":
            yield chat_history or []
            return

        *prior_history, pending = chat_history
        user_message = pending["content"]
        for reply in chat_with_falcon(
            user_message,
            prior_history,
            max_new_tokens,
            temperature,
            top_p,
            repetition_penalty,
        ):
            yield prior_history + [
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": reply},
            ]

    # The send button and pressing Enter in the textbox run the same
    # two-step chain: record the user message, then stream the reply.
    generation_controls = [max_new_tokens, temperature, top_p, repetition_penalty]
    for trigger in (submit_btn.click, textbox.submit):
        trigger(
            fn=user_turn,
            inputs=[textbox, chatbot],
            outputs=[textbox, chatbot],
            queue=False,
        ).then(
            fn=bot_turn,
            inputs=[chatbot, *generation_controls],
            outputs=chatbot,
        )

    clear_btn.click(lambda: [], outputs=chatbot)
# Turn on the event queue, then start serving the app.
demo.queue().launch()