# Gradio demo: Phi-4 multimodal speech transcription / translation.
import spaces
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, infer_device
# Hugging Face Hub id of the Phi-4 multimodal checkpoint.
model_path = "microsoft/Phi-4-multimodal-instruct"
# Pin everything to the first accelerator reported by transformers
# (e.g. "cuda:0"); NOTE(review): assumes infer_device() returns the bare
# device type string — confirm behavior on CPU-only hosts.
device = f"{infer_device()}:0"
# trust_remote_code: Phi-4 multimodal ships custom processing/modeling code.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    dtype=torch.float16,  # half precision to reduce GPU memory footprint
)
# Attach and activate the speech LoRA adapter shipped in the checkpoint's
# "speech-lora" subfolder; used for audio instruction-following below.
model.load_adapter(
    model_path,
    adapter_name="speech",
    device_map=device,
    adapter_kwargs={"subfolder": 'speech-lora'}
)
model.set_adapter("speech")
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run Phi-4 multimodal inference on an uploaded audio clip.

    Args:
        audio_path: Filesystem path to the uploaded audio file.
        instruction: Free-text instruction to apply to the audio.

    Returns:
        The model's decoded text response, or a prompt asking the user
        to upload audio when no file was provided.
    """
    # Guard clause: Gradio passes an empty value when nothing is uploaded.
    if not audio_path:
        return "Please upload an audio file."

    waveform, sample_rate = sf.read(audio_path)

    # Single-turn chat message pairing the audio clip with the instruction.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]
    prompt = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
        return_dict=False,
    )

    model_inputs = processor(
        text=prompt,
        audios=[(waveform, sample_rate)],
        return_tensors="pt",
    ).to(model.device)

    # Greedy decoding, capped at 1000 new tokens.
    output_ids = model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=False,
    )
    # Drop the prompt tokens so only newly generated text is decoded.
    prompt_length = model_inputs["input_ids"].shape[1]
    output_ids = output_ids[:, prompt_length:]

    return processor.batch_decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
# Build the Gradio UI: audio upload + instruction on the left, response on the right.
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")
    with gr.Row():
        with gr.Column():
            # type="filepath" hands run_phi4 a path on disk rather than raw samples.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=(
                    "Transcribe the audio to text, and then translate the audio to French. "
                    "Use <sep> as a separator between the original transcript and the translation."
                ),
            )
            submit_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)
    # Wire the button: (audio path, instruction text) -> model response textbox.
    submit_btn.click(run_phi4, [audio_input, instruction], output_text)
if __name__ == "__main__":
    # Fix: removed a stray trailing "|" artifact that made this line invalid Python.
    # queue() serializes GPU-decorated requests; ssr_mode=False disables
    # server-side rendering, which can misbehave on Spaces deployments.
    demo.queue().launch(share=False, ssr_mode=False)