# phi_2_QLoRA / app.py
# Author: Dhairyashil Ghatage
# Commit: add app and model data (f04dcd7)
import gradio as gr
import mlx.core as mx
import utils
# Load the model and tokenizer
def load_model(model_path, adapter_path):
    """Load the base model and tokenizer, optionally applying LoRA adapter weights.

    Args:
        model_path: Path to the base model directory (forwarded to utils.load).
        adapter_path: Path to an .npz file of adapter weights, or a falsy
            value to skip adapter loading entirely.

    Returns:
        A (model, tokenizer) tuple. Adapter loading is best-effort: on any
        failure the error is printed and the base model is returned unchanged.
    """
    model, tokenizer, _ = utils.load(model_path)
    if adapter_path:
        try:
            adapter_weights = mx.load(adapter_path)
            # strict=False lets MLX skip any keys that don't match the model,
            # so no manual pre-filtering is needed. (The previous filter
            # tested flat adapter keys such as "layers.0.attn..." against
            # model.parameters(), which is a *nested* dict — only top-level
            # keys would ever match, silently dropping every adapter weight.)
            model.load_weights(list(adapter_weights.items()), strict=False)
            print(f"Loaded adapter weights from {adapter_path}")
        except Exception as e:
            # Deliberate best-effort: fall back to the base model rather than crash.
            print(f"Error loading adapter weights: {str(e)}")
    return model, tokenizer
# Generate response
def generate_response(model, tokenizer, prompt, max_tokens, temperature):
    """Generate up to max_tokens tokens for prompt and decode them to text.

    Args:
        model: The loaded language model.
        tokenizer: Tokenizer providing encode/decode and eos_token_id.
        prompt: The full text prompt to continue.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature forwarded to utils.generate.

    Returns:
        The decoded generated text. Generation stops at the tokenizer's EOS
        token, which is excluded from the output (previously the EOS token
        was appended before the stop check, leaking the end-of-text marker
        into the returned string).
    """
    prompt_tokens = mx.array(tokenizer.encode(prompt))
    eos_id = tokenizer.eos_token_id  # hoist invariant attribute lookup out of the loop
    generated_tokens = []
    for token in utils.generate(prompt_tokens, model, temperature):
        token_id = token.item()  # materialize once; original called .item() twice per step
        if token_id == eos_id:
            break
        generated_tokens.append(token_id)
        if len(generated_tokens) >= max_tokens:
            break
    return tokenizer.decode(generated_tokens)
# Inference function
def infer(question, max_tokens, temperature):
    """Gradio handler: wrap the user's question in a Q/A prompt and answer it.

    Relies on the module-level `model` and `tokenizer` loaded at startup.
    """
    qa_prompt = f"Q: {question}\nA:"
    return generate_response(model, tokenizer, qa_prompt, max_tokens, temperature)
# Load the model and tokenizer (do this outside the infer function to load only once)
# NOTE(review): both paths are relative to the working directory at launch —
# confirm they resolve correctly in the deployment environment.
model_path = "./phi-2" # Update this with the actual path to your model
adapter_path = "./adapters.npz" # Update this with the actual path to your adapters
# Loaded once at import time; infer() reads these module-level globals.
model, tokenizer = load_model(model_path, adapter_path)
# Create the Gradio interface: one textbox for the question plus two sliders
# controlling generation length and sampling temperature.
question_input = gr.Textbox(lines=2, placeholder="Enter your question here...")
max_tokens_input = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Max Tokens")
temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")

iface = gr.Interface(
    fn=infer,
    inputs=[question_input, max_tokens_input, temperature_input],
    outputs="text",
    title="Fine-tuned Phi-2 Q&A Demo",
    description="Ask a question and get an answer from the fine-tuned Phi-2 model. Finetuned on OASST1 dataset."
)
# Launch the interface
iface.launch()