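"""Gradio demo: run audio instructions against microsoft/Phi-4-multimodal-instruct.

Loads the base model plus its speech LoRA adapter and exposes an
upload-audio + instruction UI, designed for Hugging Face Spaces
(ZeroGPU via the `spaces` decorator).
"""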
import spaces

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

MODEL_ID = "microsoft/Phi-4-multimodal-instruct"

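# Load the processor and base model; Phi-4's custom modeling code requires trust_remote_code=True.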
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype="auto",
    trust_remote_code=True
)

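# Attach and activate the speech LoRA adapter shipped in the repo's speech-lora/ subfolder.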
model.load_adapter(
    MODEL_ID,
    adapter_name="speech",
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    adapter_kwargs={"subfolder": "speech-lora"},
)
model.set_adapter("speech")

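# Keep only the last position's logits during decoding to save memory.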
generation_config = GenerationConfig.from_pretrained(MODEL_ID)
generation_config.num_logits_to_keep = 1

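# ZeroGPU on Hugging Face Spaces: a GPU is attached only while this function runs.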
@spaces.GPU
def run_phi4(audio_path: str, instruction: str) -> str:
    if not audio_path:
        return "Please upload an audio file."

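    # Load the uploaded file as a float array plus its sample rate.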
    audio, samplerate = sf.read(audio_path)

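    # Phi-4 multimodal chat format; <|audio_1|> marks where the clip is injected.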
    user_prompt = "<|user|>"
    assistant_prompt = "<|assistant|>"
    prompt_suffix = "<|end|>"

    prompt = f"{user_prompt}<|audio_1|>{instruction}{prompt_suffix}{assistant_prompt}"

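    # Encode text and audio together, then move the tensors to the model's device.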
    inputs = processor(
        text=prompt, audios=[(audio, samplerate)], return_tensors="pt"
    ).to(model.device)

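    # Generate the response; explicit kwargs override generation_config defaults.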
    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        generation_config=generation_config,
    )
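    # Drop the echoed prompt tokens and decode only the new completion.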
    output_ids = output_ids[:, inputs["input_ids"].shape[1]:]
    response = processor.batch_decode(
        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response


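# Two-column UI: audio upload and instruction on the left, model response on the right.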
with gr.Blocks(title="Phi-4 Multimodal Audio Demo") as demo:
    gr.Markdown("# Phi-4 Multimodal (Audio) Demo")
    gr.Markdown("Upload an audio file and run instructions with Phi-4.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            instruction = gr.Textbox(
                label="Instruction",
                value=(
                    "Transcribe the audio to text, and then translate the audio to French. "
                    "Use <sep> as a separator between the original transcript and the translation."
                ),
            )
            submit_btn = gr.Button("Run", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Model Response", lines=14)

    submit_btn.click(run_phi4, [audio_input, instruction], output_text)


if __name__ == "__main__":
    demo.queue().launch(share=False, ssr_mode=False)