Spaces:

eaglelandsonce
/

F3

Runtime error

App Files Files Community

F3 / app.py

eaglelandsonce

Create app.py

b36b3f7 verified about 1 month ago

raw

history blame contribute delete

6.09 kB

	import os
	import torch
	import gradio as gr

	from threading import Thread
	from transformers import (
	AutoTokenizer,
	AutoModelForCausalLM,
	TextIteratorStreamer,
	)

	# -------------------------------------------------------
	# Model Settings
	# -------------------------------------------------------
	# Hugging Face Hub identifier of the checkpoint loaded by this app.
	MODEL_ID = "tiiuae/Falcon3-1B-Instruct"

	# System message placed at the start of every conversation.
	#
	# Built from explicit "\n" escapes instead of a triple-quoted literal so the
	# prompt text can never absorb the indentation of the surrounding source
	# lines (a later .strip() only trims the two ends, not the line in between).
	# The leading and trailing newlines are kept so the raw value stays what the
	# triple-quoted form was meant to produce.
	SYSTEM_PROMPT = (
	    "\n"
	    "You are a helpful, clear, friendly AI assistant.\n"
	    "Answer in a practical way with examples when helpful.\n"
	)

	# -------------------------------------------------------
	# Load Model
	# -------------------------------------------------------
	print(f"Loading model: {MODEL_ID}")

	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

	# With CUDA available: bfloat16 weights and device_map="auto".
	# Without it: float32 weights and no device map.
	dtype, device_map = (
	    (torch.bfloat16, "auto")
	    if torch.cuda.is_available()
	    else (torch.float32, None)
	)

	model = AutoModelForCausalLM.from_pretrained(
	    MODEL_ID, torch_dtype=dtype, device_map=device_map
	)

	# No device map was used on the CPU path, so place the model explicitly.
	if not torch.cuda.is_available():
	    model = model.to("cpu")

	# Switch to evaluation mode before serving requests.
	model.eval()

	print("Model loaded successfully.")


	# -------------------------------------------------------
	# Chat Function
	# -------------------------------------------------------
	def _content_to_text(content):
	    """Return the plain text carried by a chat message's content, or None.

	    Accepts a plain string, a dict with a string "text" entry, or a list of
	    either (text parts are joined with newlines). Anything else yields None.

	    NOTE(review): the list/dict shapes are handled because some Gradio
	    releases are understood to deliver chatbot history as structured content
	    blocks instead of plain strings -- confirm against the installed version.
	    """
	    if isinstance(content, str):
	        return content
	    if isinstance(content, dict):
	        text = content.get("text")
	        return text if isinstance(text, str) else None
	    if isinstance(content, list):
	        texts = [t for t in map(_content_to_text, content) if t is not None]
	        return "\n".join(texts) if texts else None
	    return None


	def chat_with_falcon(
	    message,
	    history,
	    max_new_tokens,
	    temperature,
	    top_p,
	    repetition_penalty,
	):
	    """Stream the model's reply to ``message`` given the prior conversation.

	    Args:
	        message: Current user message.
	        history: Gradio messages-style chat history (dicts with "role" and
	            "content"). Only "user" and "assistant" entries that carry text
	            are forwarded to the model; None is treated as an empty history.
	        max_new_tokens: Upper bound on generated tokens (converted to int).
	        temperature: Sampling temperature (converted to float).
	        top_p: Nucleus-sampling threshold (converted to float).
	        repetition_penalty: Repetition penalty (converted to float).

	    Yields:
	        The reply accumulated so far, once per chunk produced by the streamer.

	    Raises:
	        Exception: Whatever ``model.generate`` raised in the worker thread,
	            re-raised here once the stream has been drained.
	    """
	    messages = [{"role": "system", "content": SYSTEM_PROMPT.strip()}]

	    for item in history or []:
	        role = item.get("role")
	        if role not in ("user", "assistant"):
	            continue
	        text = _content_to_text(item.get("content"))
	        # Entries without any text (e.g. non-text content) cannot be rendered
	        # by the chat template, so they are left out of the prompt.
	        if text is not None:
	            messages.append({"role": role, "content": text})

	    messages.append({"role": "user", "content": _content_to_text(message) or ""})

	    prompt = tokenizer.apply_chat_template(
	        messages,
	        tokenize=False,
	        add_generation_prompt=True,
	    )

	    # model.device is the right target on both the CPU and the GPU path, so
	    # no separate CUDA check is needed here.
	    encoded = tokenizer(prompt, return_tensors="pt")
	    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

	    streamer = TextIteratorStreamer(
	        tokenizer,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    generation_kwargs = dict(
	        **inputs,
	        streamer=streamer,
	        max_new_tokens=int(max_new_tokens),
	        temperature=float(temperature),
	        top_p=float(top_p),
	        repetition_penalty=float(repetition_penalty),
	        do_sample=True,
	        pad_token_id=tokenizer.eos_token_id,
	    )

	    errors = []

	    def _generate():
	        # If generate() fails, nothing else would tell the streamer that the
	        # stream is over and the consuming loop below would wait forever.
	        # Record the error and close the stream explicitly instead.
	        try:
	            model.generate(**generation_kwargs)
	        except Exception as exc:
	            errors.append(exc)
	            streamer.end()

	    # Daemon thread: a generation that is still running must not keep the
	    # process alive at shutdown.
	    thread = Thread(target=_generate, daemon=True)
	    thread.start()

	    partial_response = ""

	    for new_text in streamer:
	        partial_response += new_text
	        yield partial_response

	    thread.join()
	    if errors:
	        # Surface the worker's failure to the caller instead of losing it.
	        raise errors[0]


	# -------------------------------------------------------
	# Gradio Interface
	# -------------------------------------------------------
	with gr.Blocks(title="Falcon3-1B-Instruct Chat") as demo:
	    gr.Markdown(
	        """
	        # 🦅 Falcon3-1B-Instruct Chat Interface

	        This app runs a local Hugging Face Transformers chat interface using:

	        `tiiuae/Falcon3-1B-Instruct`

	        Use this to test instruction-following, tutoring, coding help, short explanations, and multilingual chat.
	        """
	    )

	    # NOTE(review): some Gradio releases are understood to have dropped the
	    # `type` keyword (messages format being the only one left), in which case
	    # passing it raises TypeError at start-up. Ask for the messages format
	    # where the keyword exists and fall back to the default otherwise --
	    # confirm against the Gradio version pinned for this app.
	    try:
	        chatbot = gr.Chatbot(
	            label="Falcon3 Chat",
	            type="messages",
	            height=500,
	        )
	    except TypeError:
	        chatbot = gr.Chatbot(
	            label="Falcon3 Chat",
	            height=500,
	        )

	    with gr.Row():
	        textbox = gr.Textbox(
	            placeholder="Ask Falcon3 something...",
	            label="Your Message",
	            scale=5,
	        )
	        submit_btn = gr.Button("Send", variant="primary", scale=1)

	    with gr.Accordion("Generation Settings", open=False):
	        max_new_tokens = gr.Slider(
	            minimum=64,
	            maximum=2048,
	            value=512,
	            step=64,
	            label="Max New Tokens",
	        )

	        temperature = gr.Slider(
	            minimum=0.1,
	            maximum=1.5,
	            value=0.7,
	            step=0.1,
	            label="Temperature",
	        )

	        top_p = gr.Slider(
	            minimum=0.1,
	            maximum=1.0,
	            value=0.9,
	            step=0.05,
	            label="Top-p",
	        )

	        repetition_penalty = gr.Slider(
	            minimum=1.0,
	            maximum=1.5,
	            value=1.1,
	            step=0.05,
	            label="Repetition Penalty",
	        )

	    clear_btn = gr.Button("Clear Chat")

	    def user_turn(user_message, chat_history):
	        """Record the submitted text as a user message and clear the textbox.

	        Blank or whitespace-only submissions are not added to the history.

	        Returns:
	            A pair ``("", history)``: the new textbox value and the history
	            to display.
	        """
	        history = list(chat_history or [])
	        if user_message and user_message.strip():
	            history.append({"role": "user", "content": user_message})
	        return "", history

	    def bot_turn(chat_history, max_new_tokens, temperature, top_p, repetition_penalty):
	        """Stream the assistant's reply to the last user message.

	        Yields:
	            The chat history with the partially generated assistant message
	            appended, once per chunk produced by ``chat_with_falcon``.
	        """
	        # Nothing to answer: the history is empty or its last entry is not a
	        # user message (e.g. a blank submission). Leave the chat untouched.
	        if not chat_history or chat_history[-1].get("role") != "user":
	            yield chat_history or []
	            return

	        user_message = chat_history[-1]["content"]
	        prior_history = chat_history[:-1]

	        for partial in chat_with_falcon(
	            user_message,
	            prior_history,
	            max_new_tokens,
	            temperature,
	            top_p,
	            repetition_penalty,
	        ):
	            yield prior_history + [
	                {"role": "user", "content": user_message},
	                {"role": "assistant", "content": partial},
	            ]

	    generation_inputs = [
	        chatbot,
	        max_new_tokens,
	        temperature,
	        top_p,
	        repetition_penalty,
	    ]

	    # The Send button and pressing Enter in the textbox run the same chain:
	    # record the user message right away, then stream the reply.
	    for trigger in (submit_btn.click, textbox.submit):
	        trigger(
	            fn=user_turn,
	            inputs=[textbox, chatbot],
	            outputs=[textbox, chatbot],
	            queue=False,
	        ).then(
	            fn=bot_turn,
	            inputs=generation_inputs,
	            outputs=chatbot,
	        )

	    clear_btn.click(lambda: [], outputs=chatbot)

	# Turn on the request queue, then start serving the app.
	demo.queue().launch()