"""Gradio Q&A demo for a fine-tuned Phi-2 model running on MLX.

Loads the base model (plus optional adapter weights) once at import time,
then serves a simple question-answering interface.
"""

import gradio as gr
import mlx.core as mx

import utils


def load_model(model_path, adapter_path=None):
    """Load the model and tokenizer, optionally applying adapter weights.

    Args:
        model_path: Path to the base model directory.
        adapter_path: Optional path to a ``.npz`` file of adapter weights.
            Falsy values skip adapter loading (backward-compatible default).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model, tokenizer, _ = utils.load(model_path)
    if adapter_path:
        try:
            adapter_weights = mx.load(adapter_path)
            # BUG FIX: the previous filter tested flat dotted keys against
            # model.parameters(), which is a *nested* dict in MLX, so no key
            # ever matched and every adapter weight was silently dropped.
            # load_weights(..., strict=False) already ignores keys the model
            # does not have, so pass the weights straight through — as the
            # list of (name, array) pairs the API expects, not a dict.
            model.load_weights(list(adapter_weights.items()), strict=False)
            print(f"Loaded adapter weights from {adapter_path}")
        except Exception as e:
            # Deliberate best-effort: fall back to the base model if the
            # adapters cannot be loaded, but tell the operator why.
            print(f"Error loading adapter weights: {str(e)}")
    return model, tokenizer


def generate_response(model, tokenizer, prompt, max_tokens, temperature):
    """Generate up to ``max_tokens`` tokens for ``prompt`` and decode them.

    Stops at the tokenizer's EOS token without including it in the output.

    Args:
        model: The loaded MLX model.
        tokenizer: Tokenizer providing ``encode``/``decode``/``eos_token_id``.
        prompt: Fully-formatted prompt string.
        max_tokens: Hard cap on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``utils.generate``.

    Returns:
        The decoded completion as a string.
    """
    prompt_tokens = mx.array(tokenizer.encode(prompt))
    generated_tokens = []
    for token in utils.generate(prompt_tokens, model, temperature):
        token_id = token.item()
        # BUG FIX: check for EOS *before* appending, so the EOS token's text
        # never leaks into the decoded response.
        if token_id == tokenizer.eos_token_id:
            break
        generated_tokens.append(token_id)
        if len(generated_tokens) >= max_tokens:
            break
    return tokenizer.decode(generated_tokens)


def infer(question, max_tokens, temperature):
    """Gradio callback: wrap the question in a Q/A prompt and generate."""
    prompt = f"Q: {question}\nA:"
    response = generate_response(model, tokenizer, prompt, max_tokens, temperature)
    return response


# Load the model and tokenizer once at module level so every request
# reuses the same instance instead of reloading per call.
model_path = "./phi-2"  # Update this with the actual path to your model
adapter_path = "./adapters.npz"  # Update this with the actual path to your adapters
model, tokenizer = load_model(model_path, adapter_path)

# Create the Gradio interface
iface = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
        gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs="text",
    title="Fine-tuned Phi-2 Q&A Demo",
    description="Ask a question and get an answer from the fine-tuned Phi-2 model. Finetuned on OASST1 dataset.",
)

if __name__ == "__main__":
    # BUG FIX: guard the launch so importing this module (e.g. from tests
    # or another entry point) does not start the web server as a side effect.
    iface.launch()