import spaces
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, infer_device
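
# Load the Phi-4 multimodal processor and the base model onto the detected accelerator.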
model_path = "microsoft/Phi-4-multimodal-instruct"
device = f"{infer_device()}:0"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    dtype=torch.float16,
)
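
# Attach and activate the speech LoRA adapter shipped in the checkpoint's speech-lora subfolder.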
model.load_adapter(
    model_path,
    adapter_name="speech",
    device_map=device,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")
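

# On Hugging Face ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of each call.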
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    if not audio_path:
        return "Please upload an audio file."
    audio, samplerate = sf.read(audio_path)
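
    # Single-turn chat message pairing the uploaded audio with the text instruction.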
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]
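
    # Render the chat template to a prompt string; tokenization happens in the processor call below.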
    chat_text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        return_dict=False,
    )
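
    # Tokenize the prompt, encode the audio, and move the tensors to the model's device.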
    inputs = processor(
        text=chat_text,
        audios=[(audio, samplerate)],
        return_tensors="pt",
    ).to(model.device)
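
    # Greedy decoding; strip the prompt tokens so only the newly generated answer is decoded.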
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        do_sample=False,
    )
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
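

# Gradio UI: audio upload and instruction on the left, model response on the right.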
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=(
                    "Transcribe the audio to text, and then translate the audio to French. "
                    "Use <sep> as a separator between the original transcript and the translation."
                ),
            )
            submit_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)
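
    # Wire the button to the inference function.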
    submit_btn.click(run_phi4, [audio_input, instruction], output_text)
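

# queue() enables request queuing; ssr_mode=False turns off Gradio's server-side rendering.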
if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)