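"""
Gradio chat demo for the FlameF0X/anwgpt4-1.2b model.

The model is loaded with Unsloth in 4-bit, wrapped in a simple chat UI, and
responses are produced by a single generate_response() helper below.
"""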
import gradio as gr
from unsloth import FastLanguageModel
import torch
# Load the model and tokenizer
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect the dtype (e.g. float16 / bfloat16)
load_in_4bit = True  # load weights in 4-bit to reduce GPU memory usage
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="FlameF0X/anwgpt4-1.2b",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
# Enable fast inference
FastLanguageModel.for_inference(model)
print("Model loaded successfully!")
def generate_response(message, history, max_tokens=256, temperature=0.7, top_p=0.9):
    """
    Generate a response using the fine-tuned model.

    Args:
        message: Current user message
        history: Chat history as a list of [user_msg, assistant_msg] pairs
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
    """
    # Build the conversation from history
    conversation = []
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    conversation.append({"role": "user", "content": message})

    # Format with the model's chat template
    formatted_input = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize
    inputs = tokenizer([formatted_input], return_tensors="pt").to(model.device)
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    # This is more robust than splitting the full decoded text on the user
    # message, which breaks if the message reappears in the reply.
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return response
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🤖 AnwGPT 4-1.2B Chat

        An LFM2.5-1.2B model fine-tuned on the Databricks Dolly-15k dataset.
        Ask questions, request information, or have a conversation!

        **Model:** FlameF0X/anwgpt4-1.2b
        """
    )
    chatbot = gr.Chatbot(
        label="Chat",
        height=500,
        show_copy_button=True,
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your message",
            placeholder="Type your message here...",
            scale=4,
        )
        submit = gr.Button("Send", variant="primary", scale=1)
    with gr.Accordion("⚙️ Generation Settings", open=False):
        max_tokens = gr.Slider(
            minimum=32,
            maximum=512,
            value=256,
            step=32,
            label="Max Tokens",
            info="Maximum number of tokens to generate",
        )
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Higher = more creative, lower = more focused",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top P",
            info="Nucleus sampling threshold",
        )
    with gr.Row():
        clear = gr.Button("🗑️ Clear Chat")

    gr.Examples(
        examples=[
            "What is the capital of France?",
            "Explain quantum computing in simple terms.",
            "Write a short poem about technology.",
            "What are the benefits of exercise?",
            "How does photosynthesis work?",
        ],
        inputs=msg,
        label="Example Questions",
    )
    # Handle message submission
    def respond(message, chat_history, max_tok, temp, top_p_val):
        if not message.strip():
            return "", chat_history
        bot_message = generate_response(message, chat_history, max_tok, temp, top_p_val)
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot, max_tokens, temperature, top_p], [msg, chatbot])
    submit.click(respond, [msg, chatbot, max_tokens, temperature, top_p], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)
    gr.Markdown(
        """
        ---
        ### About

        This model was fine-tuned using LoRA on the Databricks Dolly-15k instruction dataset.

        Base model: LiquidAI/LFM2.5-1.2B-Base
        """
    )
if __name__ == "__main__":
    demo.launch()