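"""
Gradio chat demo for the FlameF0X/anwgpt4-1.2b model.

The model is loaded with Unsloth in 4-bit, wrapped in a simple chat UI, and
responses are produced by a single generate_response() helper below.
"""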
import gradio as gr
from unsloth import FastLanguageModel
import torch
# Load the model and tokenizer
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect the dtype (e.g. float16 / bfloat16)
load_in_4bit = True  # load weights in 4-bit to reduce GPU memory usage
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="FlameF0X/anwgpt4-1.2b",
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
)
# Enable fast inference
FastLanguageModel.for_inference(model)
print("Model loaded successfully!")
def generate_response(message, history, max_tokens=256, temperature=0.7, top_p=0.9):
    """
    Generate a response using the fine-tuned model.

    Args:
        message: Current user message
        history: Chat history as a list of [user_msg, assistant_msg] pairs
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
    """
    # Build the conversation from history
    conversation = []
    for user_msg, assistant_msg in history:
        conversation.append({"role": "user", "content": user_msg})
        conversation.append({"role": "assistant", "content": assistant_msg})

    # Add the current message
    conversation.append({"role": "user", "content": message})

    # Format with the model's chat template
    formatted_input = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize
    inputs = tokenizer([formatted_input], return_tensors="pt").to(model.device)
    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    # This is more robust than splitting the full decoded text on the user
    # message, which breaks if the message reappears in the reply.
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return response
# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🤖 AnwGPT 4-1.2B Chat

        An LFM2.5-1.2B model fine-tuned on the Databricks Dolly-15k dataset.
        Ask questions, request information, or have a conversation!

        **Model:** FlameF0X/anwgpt4-1.2b
        """
    )
    chatbot = gr.Chatbot(
        label="Chat",
        height=500,
        show_copy_button=True,
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your message",
            placeholder="Type your message here...",
            scale=4,
        )
        submit = gr.Button("Send", variant="primary", scale=1)
    with gr.Accordion("⚙️ Generation Settings", open=False):
        max_tokens = gr.Slider(
            minimum=32,
            maximum=512,
            value=256,
            step=32,
            label="Max Tokens",
            info="Maximum number of tokens to generate",
        )
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Higher = more creative, lower = more focused",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top P",
            info="Nucleus sampling threshold",
        )
    with gr.Row():
        clear = gr.Button("🗑️ Clear Chat")

    gr.Examples(
        examples=[
            "What is the capital of France?",
            "Explain quantum computing in simple terms.",
            "Write a short poem about technology.",
            "What are the benefits of exercise?",
            "How does photosynthesis work?",
        ],
        inputs=msg,
        label="Example Questions",
    )
    # Handle message submission
    def respond(message, chat_history, max_tok, temp, top_p_val):
        if not message.strip():
            return "", chat_history
        bot_message = generate_response(message, chat_history, max_tok, temp, top_p_val)
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot, max_tokens, temperature, top_p], [msg, chatbot])
    submit.click(respond, [msg, chatbot, max_tokens, temperature, top_p], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)
    gr.Markdown(
        """
        ---
        ### About

        This model was fine-tuned using LoRA on the Databricks Dolly-15k instruction dataset.

        Base model: LiquidAI/LFM2.5-1.2B-Base
        """
    )
if __name__ == "__main__":
    demo.launch()