import spaces
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, infer_device

# Phi-4 multimodal checkpoint; the speech LoRA adapter ships in a subfolder
# of the same repository.
model_path = "microsoft/Phi-4-multimodal-instruct"
device = f"{infer_device()}:0"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    dtype=torch.float16,
)

# Load and activate the speech LoRA adapter so audio instructions work.
model.load_adapter(
    model_path,
    adapter_name="speech",
    device_map=device,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")


@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run Phi-4 (speech adapter) on an uploaded audio file.

    Args:
        audio_path: Filesystem path of the uploaded clip. May be empty/None
            when the user clicks Run without uploading anything.
        instruction: Free-form text instruction to apply to the audio.

    Returns:
        The model's decoded text response, or a short message asking the
        user to upload audio when no file was provided.
    """
    if not audio_path:
        return "Please upload an audio file."

    audio, samplerate = sf.read(audio_path)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]

    chat_text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        return_dict=False,
    )

    # NOTE(review): the audio is referenced both in the chat template (as a
    # url) and passed as raw samples below; this mirrors the upstream Phi-4
    # example, but confirm the processor does not embed the clip twice.
    inputs = processor(
        text=chat_text,
        audios=[(audio, samplerate)],
        return_tensors="pt",
    ).to(model.device)

    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]

    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response


with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            # Fix: the original default prompt read "Use as a separator ..."
            # — the "<sep>" token (per the Phi-4 sample prompts) had been
            # dropped, likely stripped as HTML. Restored so the instruction
            # actually names the separator the model should emit.
            instruction = gr.Textbox(
                label="Instruction",
                value=(
                    "Transcribe the audio to text, and then translate the audio to French. "
                    "Use <sep> as a separator between the original transcript and the translation."
                ),
            )
            submit_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)

    submit_btn.click(run_phi4, [audio_input, instruction], output_text)


if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)