# File size: 2,540 Bytes
# eaf61fe 94d45aa 9a41897 4346fab 3d1304c 4346fab 3d1304c eaf61fe 9804570 3d1304c 4346fab 3d1304c 4346fab 3d1304c 4346fab f931e52 3d1304c 4346fab dc6e6db 4346fab 65ba8d7 4346fab 9804570 f2c31e0 3d1304c 0dbdb6b 3d1304c 4346fab 3d1304c 4346fab 3d1304c 4346fab 3d1304c f2c31e0 3d1304c 9804570 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import spaces
import gradio as gr
import io
from urllib.request import urlopen
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
# Hub identifier used for the processor, the base weights, the adapter and
# the generation config below.
MODEL_ID = "microsoft/Phi-4-multimodal-instruct"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Use the GPU when torch reports one, otherwise fall back to the CPU. The same
# placement is applied to the base model and to the adapter.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=_DEVICE,
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the adapter found in the "speech-lora" subfolder of the same
# repository under the name "speech", then make it the active adapter.
model.load_adapter(
    MODEL_ID,
    adapter_name="speech",
    device_map=_DEVICE,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")

generation_config = GenerationConfig.from_pretrained(MODEL_ID)
# NOTE(review): presumably needed by the model's generation code so that only
# the last position's logits are kept -- confirm against the model card.
generation_config.num_logits_to_keep = 1
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run the model on one audio file with a text instruction.

    Args:
        audio_path: Path of the audio file to read. A falsy value (for
            example when nothing was uploaded) skips inference entirely.
        instruction: Text inserted right after the audio placeholder in the
            user turn of the prompt.

    Returns:
        The first decoded sequence produced by the model, or a fixed reminder
        message when ``audio_path`` is falsy.
    """
    if not audio_path:
        return "Please upload an audio file."

    waveform, sample_rate = sf.read(audio_path)

    # Prompt layout: user tag, audio placeholder, instruction, end tag, then
    # the assistant tag so the model continues with its answer.
    user_tag, end_tag, assistant_tag = "<|user|>", "<|end|>", "<|assistant|>"
    prompt = f"{user_tag}<|audio_1|>{instruction}{end_tag}{assistant_tag}"

    inputs = processor(
        text=prompt,
        audios=[(waveform, sample_rate)],
        return_tensors="pt",
    ).to(model.device)

    generated = model.generate(
        **inputs,
        max_new_tokens=4096,
        generation_config=generation_config,
    )

    # Drop the leading positions that correspond to the prompt so that only
    # the tokens following it are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    completion = generated[:, prompt_length:]

    decoded = processor.batch_decode(
        completion,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
# Text pre-filled in the instruction box when the page loads.
_DEFAULT_INSTRUCTION = (
    "Transcribe the audio to text, and then translate the audio to French. "
    "Use <sep> as a separator between the original transcript and the translation."
)

# NOTE(review): the Row/Column nesting below was restored from a listing that
# had lost its indentation -- confirm it matches the intended layout.
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    with gr.Row():
        # First column: the inputs and the button that triggers inference.
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=_DEFAULT_INSTRUCTION,
            )
            submit_btn = gr.Button("Run", variant="primary")

        # Second column: where the model's reply is shown.
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)

    # Clicking "Run" passes the audio path and instruction to run_phi4 and
    # writes its return value into the response box.
    submit_btn.click(
        fn=run_phi4,
        inputs=[audio_input, instruction],
        outputs=output_text,
    )
# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    # Enable request queuing, then start the server with no public share
    # link and with ssr_mode turned off.
    demo.queue().launch(share=False, ssr_mode=False)