"""Gradio demo: run Microsoft's Phi-4 multimodal model on uploaded audio."""

import spaces
import gradio as gr
import io
from urllib.request import urlopen
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

MODEL_ID = "microsoft/Phi-4-multimodal-instruct"

# Single source of truth for device placement (base weights + LoRA adapter).
_DEVICE_MAP = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=_DEVICE_MAP,
    torch_dtype="auto",
    trust_remote_code=True,
)
# Phi-4-multimodal ships task-specific LoRA adapters in subfolders of the
# main repo; the speech adapter is required for audio understanding.
model.load_adapter(
    MODEL_ID,
    adapter_name="speech",
    device_map=_DEVICE_MAP,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")

generation_config = GenerationConfig.from_pretrained(MODEL_ID)

# Default prompt, shown in the UI and used when the instruction box is empty.
# NOTE(review): the original text read "Use as a separator" — a separator
# token was evidently lost in transit; '---' is assumed here. It also fixes
# the "origina" typo present in the original hard-coded prompt.
DEFAULT_INSTRUCTION = (
    "Transcribe the audio to text, and then translate the audio to French. "
    "Use '---' as a separator between the original transcript and the "
    "translation."
)


@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run Phi-4 multimodal on an uploaded audio file with a text instruction.

    Args:
        audio_path: Filesystem path of the uploaded audio clip (Gradio
            ``type="filepath"``).
        instruction: Free-text instruction to apply to the audio. Falls back
            to ``DEFAULT_INSTRUCTION`` when empty/blank.

    Returns:
        The model's decoded text response (prompt tokens stripped).
    """
    if not audio_path:
        return "Please upload an audio file."

    # BUG FIX: the original ignored `instruction` entirely and hard-coded a
    # (typo-ridden) prompt, so the UI's Instruction textbox had no effect.
    prompt = instruction.strip() if instruction and instruction.strip() else DEFAULT_INSTRUCTION

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        generation_config=generation_config,
    )
    # Drop the echoed prompt: keep only tokens generated after the input.
    output_ids = output_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response


with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=DEFAULT_INSTRUCTION,
            )
            submit_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)

    submit_btn.click(run_phi4, [audio_input, instruction], output_text)

if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)