|
|
import spaces |
|
|
import gradio as gr |
|
|
import io |
|
|
from urllib.request import urlopen |
|
|
|
|
|
import soundfile as sf |
|
|
import torch |
|
|
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hub repository id shared by the processor, the base model, the speech
# adapter and the generation config loaded below.
MODEL_ID = "microsoft/Phi-4-multimodal-instruct"

# Pick the target device once so the base model and the adapter are always
# placed on the same device (the expression used to be duplicated).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=DEVICE,
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the adapter stored in the repository's "speech-lora" subfolder under
# the name "speech" and make it the active adapter.
model.load_adapter(
    MODEL_ID,
    adapter_name="speech",
    device_map=DEVICE,
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")

# Generation settings read from the same repository; passed to
# model.generate() on every request.
generation_config = GenerationConfig.from_pretrained(MODEL_ID)
|
|
|
|
|
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    """Run the Phi-4 multimodal model on one audio file and return its reply.

    Args:
        audio_path: Path of the audio file to read. A falsy value (nothing
            uploaded) short-circuits with a message asking for a file.
        instruction: Text inserted after the audio placeholder in the prompt.

    Returns:
        The decoded model response, or a short user-facing message when the
        audio is missing, the instruction is blank, or the file cannot be
        read.
    """
    if not audio_path:
        return "Please upload an audio file."

    # Without an instruction the prompt would contain only the audio
    # placeholder, so reject blank input the same way missing audio is
    # rejected. The instruction itself is passed through unmodified.
    if not instruction or not instruction.strip():
        return "Please enter an instruction."

    try:
        audio, samplerate = sf.read(audio_path)
    except RuntimeError as exc:
        # NOTE(review): soundfile is expected to report unreadable or
        # unsupported files via RuntimeError (or a subclass) - confirm
        # against the installed soundfile version.
        return f"Could not read the audio file: {exc}"

    # Single user turn holding one audio placeholder plus the instruction,
    # followed by an opened assistant turn for the model to complete.
    prompt = f"<|user|><|audio_1|>{instruction}<|end|><|assistant|>"

    inputs = processor(
        text=prompt,
        audios=[(audio, samplerate)],
        return_tensors="pt",
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        generation_config=generation_config,
    )

    # Skip the first prompt-length positions so that only the tokens after
    # the prompt are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = output_ids[:, prompt_length:]

    decoded = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
|
|
|
|
|
|
|
|
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    # NOTE(review): the layout nesting below was reconstructed from a source
    # whose indentation was lost - confirm it matches the intended layout.
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    # Text pre-filled in the instruction box.
    _default_instruction = (
        "Transcribe the audio to text, and then translate the audio to French. "
        "Use <sep> as a separator between the original transcript and the translation."
    )

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=_default_instruction,
            )
            submit_btn = gr.Button("Run", variant="primary")

        # Right column: the model's reply.
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)

    # Clicking "Run" calls run_phi4 with the audio path and the instruction,
    # and writes the returned string into the response box.
    submit_btn.click(
        fn=run_phi4,
        inputs=[audio_input, instruction],
        outputs=output_text,
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: enable the queue on the Blocks app, then launch it
    # with share and SSR mode both switched off.
    app = demo.queue()
    app.launch(share=False, ssr_mode=False)