# phi4_mm / app.py
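# Gradio Space: run speech instructions (transcription, translation, ...) on an
# uploaded audio clip with microsoft/Phi-4-multimodal-instruct and its speech LoRA adapter.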
import spaces
import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, infer_device
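
# Hub checkpoint and target device ("cuda:0", "mps:0", ... depending on what infer_device finds).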
model_path = "microsoft/Phi-4-multimodal-instruct"
device = f"{infer_device()}:0"
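
# Processor (tokenizer + audio feature extractor); trust_remote_code allows the
# repo's custom processing code to be used.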
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    dtype=torch.float16,
)
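
# Attach the speech LoRA adapter from the repo's "speech-lora" subfolder and make it active.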
model.load_adapter(
    model_path,
    adapter_name="speech",
    device_map=device,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")
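
# Inference entry point; @spaces.GPU requests a GPU slice when the Space runs on ZeroGPU hardware.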
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    if not audio_path:
        return "Please upload an audio file."

    # Read the waveform and sample rate from the uploaded file.
    audio, samplerate = sf.read(audio_path)

    # Single-turn chat with one audio attachment plus the text instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "url": audio_path},
                {"type": "text", "text": instruction},
            ],
        }
    ]

    # Render the chat template to plain text (audio placeholders included);
    # the actual audio is passed to the processor below.
    chat_text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        return_dict=False,
    )

    inputs = processor(
        text=chat_text,
        audios=[(audio, samplerate)],
        return_tensors="pt",
    ).to(model.device)

    # Greedy decoding; strip the prompt tokens before decoding the answer.
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        do_sample=False,
    )
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
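
# Minimal UI: audio upload and instruction on the left, model response on the right.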
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
gr.Markdown("Upload an audio file and run instructions with Phi-4.")
with gr.Row():
with gr.Column():
audio_input = gr.Audio(type="filepath", label="Upload Audio")
instruction = gr.Textbox(
label="Instruction",
value=(
"Transcribe the audio to text, and then translate the audio to French. "
"Use <sep> as a separator between the original transcript and the translation."
),
)
submit_btn = gr.Button("Run", variant="primary")
with gr.Column():
output_text = gr.Textbox(label="Model Response", lines=14)
submit_btn.click(run_phi4, [audio_input, instruction], output_text)
if __name__ == "__main__":
    # Queue requests; ssr_mode=False opts out of Gradio's server-side rendering.
    demo.queue().launch(share=False, ssr_mode=False)